Exemplo n.º 1
0
def main():
    """Tag the short sentences of a corpus with a bigram POS model and print them.

    Command-line arguments:
        train: path to the training corpus.
        corpus: path to the corpus whose sentences are tagged.
        n: only sentences with fewer than ``n`` tokens are processed.

    For each processed sentence, every token whose id is not 0 is written to
    standard output as a tab-separated line ``id, form, lemma, plemma, pos,
    predicted``; an empty line is printed before the token with id 1.

    Raises:
        ValueError: if ``n`` is not an integer, or if an unknown word is met
            while the training corpus produced no POS frequencies.
    """
    # Probability used for (word, tag) pairs and tag bigrams unseen in training.
    unseen_probability = 0.00001

    parser = argparse.ArgumentParser()
    parser.add_argument("train", help="Path to training corpus.")
    parser.add_argument("corpus", help="Path to corpus.")
    parser.add_argument("n", help="Tag sentences shorter than this length.")
    args = parser.parse_args()

    train_corpus = Corpus(args.train)
    corpus = Corpus(args.corpus)
    n = int(args.n)

    pos_frequencies = processing.pos_frequencies(train_corpus)
    poses_for_word_from_train, _total_pos_count = processing.calculate_poses_for_word(train_corpus)
    pos_bigram_probabilities_train = processing.calculate_pos_bigram_probabilities(train_corpus)
    word_pos_probabilities_train = processing.calculate_word_pos_probabilities(train_corpus)

    sentences = [sentence for sentence in corpus.get_sentences() if len(sentence) < n]

    # Most frequent training tag, used as the only candidate for unknown words.
    # Computed on first use so it is not recomputed for every unknown word.
    most_common_pos = None

    for sentence in sentences:
        prev_pos = "<s>"
        # token id -> {candidate tag: (P(word | tag), P(tag | previous tag))}
        columns = {}
        current_sentence = []
        for token_id, form, lemma, plemma, pos, _ppos in sentence:
            current_sentence.append([token_id, form, lemma, plemma, pos])

            if form in poses_for_word_from_train:
                candidate_tags = list(poses_for_word_from_train[form])
            else:
                if most_common_pos is None:
                    most_common_pos = max(pos_frequencies.items(), key=lambda item: item[1])[0]
                candidate_tags = [most_common_pos]

            candidates = {}
            for tag in candidate_tags:
                # Both tables are keyed by "left right" strings; anything unseen
                # in training gets the same small fallback probability instead
                # of raising KeyError.
                emission = word_pos_probabilities_train.get(
                    "{0} {1}".format(form, tag), unseen_probability
                )
                transition = pos_bigram_probabilities_train.get(
                    "{0} {1}".format(prev_pos, tag), unseen_probability
                )
                candidates[tag] = (emission, transition)
            columns[token_id] = candidates

            # NOTE(review): the transition is conditioned on the `pos` field read
            # from the corpus for the previous token, not on the tag predicted
            # for it -- confirm this is intended.
            prev_pos = pos

        path = {}
        trellis = {}
        for raw_id, candidates in sorted(columns.items(), key=lambda item: int(item[0])):
            column_id = int(raw_id)
            trellis[column_id] = {}
            if column_id == 0:
                # Token 0 is never scored or printed (presumably the
                # sentence-start marker).
                continue

            for tag, (emission, transition) in candidates.items():
                if column_id == 1:
                    score = emission * transition
                else:
                    best_previous = max(
                        previous * transition for previous in trellis[column_id - 1].values()
                    )
                    score = best_previous * emission
                trellis[column_id][tag] = score

            # The prediction for a column is its highest-scoring tag.
            path[column_id] = max(trellis[column_id].items(), key=lambda item: item[1])[0]

        for column_id in sorted(path):
            if column_id == 1:
                print()
            token_id, form, lemma, plemma, pos = current_sentence[column_id]
            print(
                "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
                    token_id, form, lemma, plemma, pos, path[column_id]
                )
            )
Exemplo n.º 2
0
def main():
    """Tag every sentence of a corpus with a bigram Viterbi POS tagger and print it.

    Command-line arguments:
        train: path to the training corpus.
        corpus: path to the corpus file to tag.

    For each sentence the best tag sequence is found by keeping, for every
    candidate tag of a word, the most probable parent tag, and then following
    the parent links back from the best final tag. The predicted tag is
    written into the last field of each token. Tokens are then printed as
    tab-separated lines ``id, form, lemma, plemma, pos, ppos``; the token with
    id 0 is printed as an empty line.

    Raises:
        ValueError: if an unknown word is met while no POS frequencies are
            available.
    """
    # Probability used for unknown words and unseen tag bigrams
    # (gives the highest value on the development set).
    unseen_probability = 0.0001
    start_token = [0, '<s>', '<s>', '<s>', '<s>', '<s>']

    parser = argparse.ArgumentParser()
    parser.add_argument('train', help='Path to training corpus.')
    parser.add_argument('corpus', help='Path to corpus file.')
    args = parser.parse_args()

    train_corpus = Corpus(args.train)
    corpus = Corpus(args.corpus)

    # NOTE(review): tag frequencies are taken from the corpus being tagged, not
    # from the training corpus like the other tables -- confirm this is intended.
    pos_frequencies = processing.pos_frequencies(corpus)
    word_pos_probabilities = processing.calculate_word_pos_probabilities(train_corpus)
    bigram_probabilities = processing.calculate_pos_bigram_probabilities(train_corpus)
    poses_for_words, _total_pos_count = processing.calculate_poses_for_word(train_corpus)

    def best_parent(parent_trellis, tag):
        """Return the (previous tag, score) pair maximising parent probability * P(tag | previous tag)."""
        scores = {}
        for prev_pos, node in parent_trellis.items():
            bigram = '{0} {1}'.format(prev_pos, tag)
            # P(T_i|T_i-1), with a small fallback for unseen bigrams.
            scores[prev_pos] = node['probability'] * bigram_probabilities.get(bigram, unseen_probability)
        return max(scores.items(), key=lambda item: item[1])

    # Most frequent tag, the only candidate for unknown words; computed on
    # first use so it is not recomputed for every unknown word.
    most_common_pos = None

    for sentence in corpus.get_sentences():
        parent_trellis = {'<s>': {'probability': 1, 'parent': None}}
        # Column of the last tagged word; stays None when nothing was tagged.
        last_trellis = None

        for word in sentence:
            if word == start_token:
                continue
            form = word[1]

            trellis = {}
            if form not in poses_for_words:
                if most_common_pos is None:
                    most_common_pos = max(pos_frequencies.items(), key=lambda item: item[1])[0]
                # The parent must be chosen for the fallback tag itself, not for
                # whatever tag was left over from a previously seen word.
                prev_pos, _score = best_parent(parent_trellis, most_common_pos)
                trellis[most_common_pos] = {
                    'probability': unseen_probability,
                    'parent': {prev_pos: parent_trellis[prev_pos]},
                }
            else:
                for pos_for_word in poses_for_words[form]:
                    # P(W|T)
                    probability_word_given_pos = word_pos_probabilities['{0} {1}'.format(form, pos_for_word)]
                    prev_pos, score = best_parent(parent_trellis, pos_for_word)
                    trellis[pos_for_word] = {
                        'probability': probability_word_given_pos * score,
                        'parent': {prev_pos: parent_trellis[prev_pos]},
                    }

            parent_trellis = trellis
            last_trellis = trellis

        # Backtrack only when at least one word was tagged; an empty sentence or
        # one holding just the start token has no path to follow.
        if last_trellis:
            best_pos, best_node = max(last_trellis.items(), key=lambda item: item[1]['probability'])
            # Each link is a single-entry dict {tag: node}; the start node's parent is None.
            link = {best_pos: best_node}
            current_id = int(sentence[-1][0])
            while link is not None and current_id != 0:
                predicted, node = next(iter(link.items()))
                sentence[current_id][-1] = predicted
                link = node['parent']
                current_id -= 1

        for token_id, form, lemma, plemma, pos, ppos in sentence:
            if token_id == 0:
                print()
            else:
                print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(token_id, form, lemma, plemma, pos, ppos))