import codecs

# Sentence, Token, and untagged_tag_str are assumed to be defined elsewhere in
# this module; untagged_tag_str is the placeholder POS tag given to target
# tokens before projection.


def map_tags(tagged_src, untagged_tgt, alignment_list):
    """Project POS tags from the tagged source sentences onto the target side
    using word alignments."""
    tagged_tgt = []
    print(alignment_list[0])  # debug: alignment pairs for the first sentence
    # Initialise every target token with the placeholder tag.
    for sentence in untagged_tgt:
        sent_token_list = Sentence()
        for word in sentence:
            sent_token_list.add_token(Token(word, untagged_tag_str, 0, 0))
        tagged_tgt.append(sent_token_list)
    # Copy the source tag across each (source_index, target_index) alignment pair.
    for sent_num, pairings in enumerate(alignment_list):
        for src_tag_idx, tgt_tag_idx in pairings:
            src_tok = tagged_src[sent_num].get_token_at(src_tag_idx)
            tagged_tgt[sent_num].get_token_at(tgt_tag_idx).pos_tag = src_tok.pos_tag
    return tagged_tgt
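
# A minimal sketch of the inputs map_tags expects; the words and index pairs
# below are made up for illustration only. untagged_tgt is a list of target
# sentences (each an iterable of word strings), and alignment_list holds, for
# each sentence, a list of (source_index, target_index) pairs into the two
# token sequences.
def _example_map_tags_inputs():
    untagged_tgt = [["la", "casa", "azul"]]       # hypothetical target words
    alignment_list = [[(0, 0), (2, 1), (1, 2)]]   # e.g. src token 2 aligns to tgt token 1
    return untagged_tgt, alignment_list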
def get_test_corpus(file_name):
    """Read a CoNLL-style test file (one tab-separated token per line,
    sentences separated by blank lines) into a list of Sentence objects."""
    test_correct_tags = []
    sentence_tags = Sentence()
    on_sentence = False
    for line in codecs.open(file_name, 'r', encoding="utf-8"):
        vals = line.split('\t')
        if len(vals) > 1:
            # Token line: column 1 is the word form, 3 the POS tag, 6 the head index.
            on_sentence = True
            tok = Token(vals[1], vals[3], vals[6], 0)
            sentence_tags.add_token(tok)
        elif on_sentence:
            # Blank line ends the current sentence.
            on_sentence = False
            test_correct_tags.append(sentence_tags)
            sentence_tags = Sentence()
    if on_sentence:
        # Keep the final sentence if the file does not end with a blank line.
        test_correct_tags.append(sentence_tags)
    print("{0} sentences in test corpus".format(len(test_correct_tags)))
    return test_correct_tags
def load_tagged_sentences(file_name):
    """Load tagged sentences from a CoNLL-style file.

    Returns a list of Sentence objects: [Sentence_obj, Sentence_obj, ...]
    """
    sentences_w_tags = []
    sentence_obj = Sentence()
    on_sentence = False
    for line in codecs.open(file_name, 'r', encoding="utf-8"):
        vals = line.split('\t')
        if len(vals) > 1:
            # Token line: word form, POS tag, and dependency head index.
            on_sentence = True
            tok = Token()
            tok.orig = vals[1]
            tok.pos_tag = vals[3]
            tok.head = int(vals[6])
            sentence_obj.add_token(tok)
        elif on_sentence:
            # Blank line ends the current sentence.
            on_sentence = False
            sentences_w_tags.append(sentence_obj)
            sentence_obj = Sentence()
    if on_sentence:
        # Keep the final sentence if the file does not end with a blank line.
        sentences_w_tags.append(sentence_obj)
    return sentences_w_tags
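
# A rough usage sketch of the projection pipeline above, assuming CoNLL-style
# input files and a pre-computed alignment list. The file name, target words,
# and alignments below are hypothetical placeholders, not values used
# elsewhere in this project.
if __name__ == "__main__":
    src_sentences = load_tagged_sentences("source_tagged.conll")  # tagged source side
    # Target words and word alignments would normally come from the parallel
    # corpus and aligner output; shown here only as stand-ins.
    target_words = [["w1", "w2"]]
    alignments = [[(0, 0), (1, 1)]]
    projected = map_tags(src_sentences, target_words, alignments)
    print("projected tags onto {0} target sentences".format(len(projected)))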