def load_nltk_conll2000(self):
    """Load the NLTK CoNLL-2000 tagged corpus into a single Document.

    Each tagged sentence from ``nltk.corpus.conll2000`` becomes a Sentence of
    Word objects (each carrying its POS tag); the one resulting Document is
    appended to ``self.documents``.

    Requires the CoNLL-2000 corpus to be available to NLTK
    (``nltk.download('conll2000')``).
    """
    # Local import keeps nltk optional for callers that use other loaders.
    import nltk

    document = Document(0)
    for sentence in nltk.corpus.conll2000.tagged_sents():
        sentence_obj = Sentence()
        for word, tag in sentence:
            # 'word_id' rather than 'id' to avoid shadowing the builtin.
            word_id = self.alloc_global_word_id(word)
            word_obj = Word(word_id, word)
            word_obj.tag = tag
            sentence_obj.add_word(word_obj)
        document.add_sentence(sentence_obj)
    self.documents.append(document)
def load(self, file_path, type=1):
    """Load corpora from a CoNLL-05 data file.

    :param file_path: path of the CoNLL-05 file to parse
    :param type: 1, the data we found in Github; 2, append with
                 conll2005/synt.upc
    :return: None; each parsed Sentence is appended to ``self.__sentences``
    """
    for sentence_info in self.__get_sentence_block(file_path):
        sentence = Sentence()
        columns = np.array(sentence_info)

        # Per-token pass: words, POS tags, and SRL predicate structures.
        for token_idx, token in enumerate(columns):
            word_name, pos = token[:2]
            cur_word = word_repo.get_word(word_name)
            word_property = WordProperty()
            word_property.pos = pos
            if token[4] == "-":
                sentence.add_word(cur_word, word_property)
            else:
                # Column 4 carries the verb sense, column 5 its infinitive.
                srl = SRLStructure(cur_word, token_idx)
                srl.verb_sense = token[4]
                srl.verb_infinitive = word_repo.get_word(token[5])
                sentence.add_srl_struct(srl)
                sentence.add_word(srl.verb_infinitive, word_property)

        # Named entities are encoded in column 3.
        for ne_type, (begin, end) in parse_start_end_components(columns[:, 3]):
            sentence.add_ne(Ner(ne_type, begin, end))

        # Chunk annotations exist only in the type-2 layout.
        if type == 2:
            for chunk_type, (begin, end) in parse_start_end_components(columns[:, -2]):
                sentence.add_chunk(Chunk(chunk_type, begin, end))

        # One proposition column per predicate, in predicate order.
        props = columns[:, 6:] if type == 1 else columns[:, 6:-3]
        for verb_idx, srl in enumerate(sentence.srl_structs()):
            for role_type, (begin, end) in parse_start_end_components(props[:, verb_idx]):
                srl.add_role(Role(role_type, begin, end))

        self.__sentences.append(sentence)