def main(): parser = argparse.ArgumentParser() parser.add_argument("train", help="Path to training corpus.") parser.add_argument("corpus", help="Path to corpus.") parser.add_argument("n", help="Tag sentences shorter than this length.") args = parser.parse_args() train_corpus = Corpus(args.train) corpus = Corpus(args.corpus) n = int(args.n) pos_frequencies = processing.pos_frequencies(train_corpus) poses_for_word_from_train, total_pos_count = processing.calculate_poses_for_word(train_corpus) pos_bigram_probabilities_train = processing.calculate_pos_bigram_probabilities(train_corpus) word_pos_probabilities_train = processing.calculate_word_pos_probabilities(train_corpus) sentences = [sentence for sentence in corpus.get_sentences() if len(sentence) < n] WORD_GIVEN_POS = 0 POS_GIVEN_PREVPOS = 1 for sentence in sentences: prev_pos = "<s>" columns = {} current_sentence = [] for word in sentence: id, form, lemma, plemma, pos, ppos = word current_sentence.append([id, form, lemma, plemma, pos]) columns[id] = {} if form in poses_for_word_from_train: for (pos_for_word, pos_for_word_count) in poses_for_word_from_train[form].items(): p_word_given_pos = word_pos_probabilities_train["{0} {1}".format(form, pos_for_word)] pos_bigram = "{0} {1}".format(prev_pos, pos_for_word) if pos_bigram in pos_bigram_probabilities_train: p_pos_given_prevpos = pos_bigram_probabilities_train[pos_bigram] else: p_pos_given_prevpos = 0.00001 # Low chance that this is what we want columns[id][pos_for_word] = {} columns[id][pos_for_word][WORD_GIVEN_POS] = p_word_given_pos columns[id][pos_for_word][POS_GIVEN_PREVPOS] = p_pos_given_prevpos else: most_common_pos = max(pos_frequencies.items(), key=lambda x: x[1]) if form in word_pos_probabilities_train: p_word_given_pos = word_pos_probabilities_train["{0} {1}".format(form, most_common_pos[0])] else: p_word_given_pos = 0.00001 # Low chance that this is what we want p_pos_given_prevpos = pos_bigram_probabilities_train["{0} {1}".format(prev_pos, most_common_pos[0])] columns[id][most_common_pos[0]] = {} columns[id][most_common_pos[0]][WORD_GIVEN_POS] = p_word_given_pos columns[id][most_common_pos[0]][POS_GIVEN_PREVPOS] = p_pos_given_prevpos prev_pos = pos path = {} trellis = {} for (column_id, poses) in sorted(columns.items(), key=lambda x: int(x[0])): column_id = int(column_id) trellis[column_id] = {} for (current_pos, data) in poses.items(): current_word_given_pos = data[WORD_GIVEN_POS] current_pos_given_prevpos = data[POS_GIVEN_PREVPOS] if column_id == 0: break elif column_id == 1: trellis[column_id][current_pos] = current_word_given_pos * current_pos_given_prevpos else: max_prev_column = max( [(id, data * current_pos_given_prevpos) for id, data in trellis[column_id - 1].items()], key=lambda x: x[1], ) p = max_prev_column[1] * current_word_given_pos trellis[column_id][current_pos] = p if column_id == 0: continue else: path[column_id] = max(trellis[column_id].items(), key=lambda x: x[1])[0] for (id, predicted) in sorted(path.items(), key=lambda x: x[0]): if id == 1: print() id, form, lemma, plemma, pos = current_sentence[id] print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(id, form, lemma, plemma, pos, predicted))
# A second variant of the tagger that keeps backpointers in the trellis and
# recovers the best tag sequence by backtracking. It redefines main(), so it
# is presumably a separate script needing the same imports (argparse,
# processing, Corpus) as the one above.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train', help='Path to training corpus.')
    parser.add_argument('corpus', help='Path to corpus file.')
    args = parser.parse_args()

    train_corpus = Corpus(args.train)
    corpus = Corpus(args.corpus)

    pos_frequencies = processing.pos_frequencies(corpus)
    word_pos_probabilities = processing.calculate_word_pos_probabilities(train_corpus)
    bigram_probabilities = processing.calculate_pos_bigram_probabilities(train_corpus)
    poses_for_words, total_pos_count = processing.calculate_poses_for_word(train_corpus)

    for sentence in corpus.get_sentences():
        # Start every sentence from the pseudo-tag <s> with probability 1.
        parent_trellis = {'<s>': {'probability': 1, 'parent': None}}

        for word in sentence:
            id, form, lemma, plemma, current_word_pos, ppos = word
            if word == [0, '<s>', '<s>', '<s>', '<s>', '<s>']:
                continue  # Skip the sentence-initial root token.

            trellis = {}

            # P(W|T)
            if form not in poses_for_words:
                # Unknown word: fall back to the most frequent tag overall.
                most_common_pos = max(pos_frequencies.items(), key=lambda x: x[1])
                probability_pos_given_prevpos = {}
                for prev_pos in parent_trellis:
                    probability = parent_trellis[prev_pos]['probability']
                    bigram = '{0} {1}'.format(prev_pos, most_common_pos[0])
                    if bigram not in bigram_probabilities:
                        probability_bigram = 0.0001  # Gives highest value on the development set for unknown bigrams
                    else:
                        # P(T_i|T_i-1)
                        probability_bigram = bigram_probabilities[bigram]
                    probability *= probability_bigram
                    probability_pos_given_prevpos[prev_pos] = probability
                max_probability = max(probability_pos_given_prevpos.items(), key=lambda x: x[1])
                trellis[most_common_pos[0]] = {}
                trellis[most_common_pos[0]]['probability'] = 0.0001  # Gives highest value on the development set for unknown poses
                trellis[most_common_pos[0]]['parent'] = {max_probability[0]: parent_trellis[max_probability[0]]}
            else:
                # Known word: consider every tag seen with this form in training.
                for (pos_for_word, count) in poses_for_words[form].items():
                    probability_word_given_pos = word_pos_probabilities['{0} {1}'.format(form, pos_for_word)]
                    probability_pos_given_prevpos = {}
                    for prev_pos in parent_trellis:
                        probability = parent_trellis[prev_pos]['probability']
                        bigram = '{0} {1}'.format(prev_pos, pos_for_word)
                        if bigram not in bigram_probabilities:
                            probability_bigram = 0.0001  # Gives highest value on the development set for unknown bigrams
                        else:
                            # P(T_i|T_i-1)
                            probability_bigram = bigram_probabilities[bigram]
                        probability *= probability_bigram
                        probability_pos_given_prevpos[prev_pos] = probability
                    max_probability = max(probability_pos_given_prevpos.items(), key=lambda x: x[1])
                    trellis[pos_for_word] = {}
                    trellis[pos_for_word]['probability'] = probability_word_given_pos * max_probability[1]
                    trellis[pos_for_word]['parent'] = {max_probability[0]: parent_trellis[max_probability[0]]}

            parent_trellis = trellis

        # Backtrack from the best tag in the last column, writing the
        # prediction into the ppos slot of each token.
        optimal_path = max(trellis.items(), key=lambda x: x[1]['probability'])
        prev_path = {optimal_path[0]: optimal_path[1]}
        current_id = int(sentence[-1][0])
        while prev_path is not None:
            predicted = next(iter(prev_path))
            if current_id == 0:
                break
            sentence[current_id][-1] = predicted
            prev_path = prev_path[predicted]['parent']
            current_id -= 1

        for word in sentence:
            id, form, lemma, plemma, pos, ppos = word
            if id == 0:
                print()  # Blank line before each sentence's output.
            else:
                print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(id, form, lemma, plemma, pos, ppos))
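# Typical entry point if each variant lives in its own script (file names
# below are hypothetical), e.g.:
#
#   python tagger.py train.conll dev.conll 10        # first variant (train, corpus, n)
#   python viterbi_tagger.py train.conll dev.conll   # second variant (train, corpus)
#
if __name__ == "__main__":
    main()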