Example no. 1
0
    def load_nltk_conll2000(self):
        """Load the NLTK CoNLL-2000 tagged corpus into a single Document.

        Each tagged sentence from ``nltk.corpus.conll2000`` becomes a
        Sentence of Word objects (POS tag attached to each word); all
        sentences are collected into one Document with id 0, which is
        appended to ``self.documents``.
        """
        # Local import keeps the heavy nltk dependency optional for
        # callers that never load this corpus.
        import nltk

        document = Document(0)

        for sentence in nltk.corpus.conll2000.tagged_sents():

            sentence_obj = Sentence()
            for word, tag in sentence:
                # ``word_id`` (not ``id``) — avoid shadowing the builtin.
                word_id = self.alloc_global_word_id(word)

                word_obj = Word(word_id, word)
                word_obj.tag = tag

                sentence_obj.add_word(word_obj)

            document.add_sentence(sentence_obj)

        self.documents.append(document)
Example no. 2
0
    def load(self, file_path, type=1):
        '''
        Load a corpus from a CoNLL-2005 data file.

        :param file_path: path of the data file; consumed one sentence
            block at a time via ``self.__get_sentence_block``.
        :param type: 1, the data we found in Github; 2, append with
            conll2005/synt.upc (extra trailing columns, so chunks sit at
            column -2 and the prop columns end at -3).
        :return: None -- each parsed Sentence is appended to
            ``self.__sentences``.
        '''


        for sentence_info in self.__get_sentence_block(file_path):


            sentence = Sentence()

            # One row per token; columns follow the CoNLL-05 layout.
            sentence_array = np.array(sentence_info)


            for loc, word_info in enumerate(sentence_array):
                # Columns 0-1: surface form and POS tag.
                word_name, pos = word_info[:2]

                cur_word = word_repo.get_word(word_name)

                word_property = WordProperty()
                word_property.pos = pos


                # Column 4 marks predicate rows ("-" for non-predicates);
                # column 5 then carries the verb form.
                # NOTE(review): column 5 is assumed to be the lemma /
                # infinitive -- confirm against the actual data files.
                if word_info[4] != "-":

                    srl = SRLStructure(cur_word, loc)
                    srl.verb_sense = word_info[4]
                    srl.verb_infinitive = word_repo.get_word(word_info[5])
                    sentence.add_srl_struct(srl)
                    # For predicate rows the column-5 form, not the
                    # inflected surface form, is stored in the sentence.
                    sentence.add_word(srl.verb_infinitive, word_property)
                else:
                    sentence.add_word(cur_word, word_property)

            # parse ne: column 3 holds the bracketed named-entity spans.
            for ne_type, (start_pos, end_pos) in parse_start_end_components(sentence_array[:,3]):
                ner = Ner(ne_type, start_pos, end_pos)
                sentence.add_ne(ner)
            if type == 2:
                # parse chunk (column -2 only exists in the synt.upc layout)
                for chunk_type, (start_pos, end_pos) in parse_start_end_components(sentence_array[:,-2]):
                    chunk = Chunk(chunk_type, start_pos, end_pos)
                    sentence.add_chunk(chunk)
            # Argument (prop) columns start at column 6 -- one column per
            # predicate; type-2 files carry 3 extra trailing columns.
            if type == 1:
                props = sentence_array[:,6:]
            else:
                props = sentence_array[:,6:-3]


            # Predicates were appended in token order above, so the i-th
            # SRL struct pairs with the i-th prop column.
            for verb_idx, srl in enumerate(sentence.srl_structs()):
                cur_prop = props[:,verb_idx]

                for role_type, (start_pos, end_pos) in parse_start_end_components(cur_prop):
                    role = Role(role_type, start_pos, end_pos)
                    srl.add_role(role)




            self.__sentences.append(sentence)