def read_sequence_list_conll(self, train_file, train_file_parsed=None, train_files_parsed_path=None,
                             max_sent_len=100000, max_nr_sent=100000):
    """
    Read a conll2002 or conll2003 file into a sequence list.

    When ``self.wordrep_dict`` is set, every instance additionally carries a
    normalized sequence (a chain, or a tree when ``self.use_wordrep_tree`` is
    truthy) with *unk* for decoding in wordrep; it is stored next to the
    words and tags.

    All arguments are forwarded unchanged, in order, to
    ``self.read_conll_instances``.

    Returns the populated ``SequenceListLabel``.
    """
    instances = self.read_conll_instances(
        train_file,
        train_file_parsed,
        train_files_parsed_path,
        max_sent_len,
        max_nr_sent,
    )

    # Plain case: instances are (words, tags) pairs.
    if self.wordrep_dict is None:
        sequences = SequenceListLabel(self.word_dict, self.tag_dict)  # for indices
        for words, tags in instances:
            sequences.add_sequence(words, tags)
        return sequences

    # Word-representation case: instances are (words, tags, normalized) triples.
    sequences = SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)  # for indices
    for words, tags, normalized in instances:
        if not self.use_wordrep_tree:
            # normalized is a chain: it goes in the third slot
            sequences.add_sequence(words, tags, normalized)
        else:
            # normalized is a tree: the chain slot is left as None
            sequences.add_sequence(words, tags, None, normalized)
    return sequences
def read_sequence_list_conll(infile, word_dict, tag_dict, max_sent_len=100000, max_nr_sent=100000):
    """
    Read an output file into a sequence list of (words, gold, predicted).

    The arguments are forwarded unchanged, in order, to
    ``read_output_instances``, which returns the instances together with the
    word and tag dictionaries to use from then on.

    Returns a tuple ``(seq_list, word_dict, tag_dict)`` where the two
    dictionaries are the ones handed back by ``read_output_instances``.
    """
    loaded = read_output_instances(
        infile, word_dict, tag_dict, max_sent_len, max_nr_sent
    )
    instances, words_index, tags_index = loaded

    # The tag dictionary is supplied twice: once for the gold labels and once
    # for the third (predicted) sequence -- presumably both share one tag set.
    sequences = SequenceListLabel(words_index, tags_index, tags_index)
    for tokens, gold, predicted in instances:
        sequences.add_sequence(tokens, gold, predicted)

    return sequences, words_index, tags_index