Пример #1
0
class UDPipeTokenizer:
    """Produce (POS, dependency relation) tag pairs for text using a UDPipe model."""

    def __init__(self, udpipe_model_path):
        """
        Load the UDPipe model.

        :param udpipe_model_path: path handed to ``Model``
        """
        self.udpipe_model = Model(udpipe_model_path)

    def tokenize(self, sentence: str) -> List[Tuple[str, str]]:
        """
        Tag every token of ``sentence``.

        All sentences returned by the model are used, so input that the model
        splits into several sentences is tagged completely, and input for
        which the model returns no sentence gives an empty list instead of
        raising ``IndexError``.

        :param sentence: raw text to process
        :return: list of pairs of tags (POS, DEP_REL) for each token, in order
        """
        tags = []
        for parsed in self.udpipe_model.process(sentence):
            for item in parsed.words:
                # Entries tagged '<root>' are placeholders, not real tokens.
                if item.upostag != '<root>':
                    tags.append((item.upostag, item.deprel))
        return tags
Пример #2
0
class UdpipeTagger:
    """Look up the ``xpostag`` of a single word with a UDPipe model."""

    def __init__(self, file=None, **kwarg):
        """
        Load the model stored at ``file``.

        :param file: path handed to ``Model``; a missing or empty value is an error
        :param kwarg: accepted but not used
        :raises Exception: when no model file is given
        """
        if not file:
            raise Exception("You should pass the model")
        self.model = Model(file)

    def get_pos_tag(self, word):
        """
        Return the ``xpostag`` of the entry at index 1 of the first sentence
        the model produces for ``word``.

        :param word: text to process (expected to be a single word)
        :return: ``xpostag`` of ``words[1]`` in the first sentence
        """
        sentences = list(self.model.process(word))
        first_sentence = sentences[0]

        # Exactly two entries are expected (presumably a root placeholder plus
        # the word itself - TODO confirm); any other count is printed for
        # inspection but processing continues.
        if len(first_sentence.words) != 2:
            print(word, first_sentence.words)

        tagged_word = first_sentence.words[1]
        return tagged_word.xpostag
Пример #3
0
 def do_train(self) -> List[TResult]:
     """
     Run a pre-trained UDPipe model over the corpus and store the results.

     The pre-trained UDPipe models can be downloaded here:
     https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131

     Each line from ``self.load_data()`` is cleaned with ``self.clean_data``
     and processed by the model; for every sentence the model returns, the
     extracted results are written with ``self.store_data.insert_data``.
     Progress is reported with ``print``.

     :return: nothing is returned explicitly, despite the annotation
     """
     model = Model(self.pre_model_name)
     # enumerate() numbers every corpus line, including the ones skipped
     # below, so the progress message always shows the real line number.
     for line_no, sen in enumerate(self.load_data(), start=1):
         sen_clean = self.clean_data(sen)
         if not sen_clean:
             continue
         # Tag the cleaned line; the model may split it into several sentences.
         for i, one_sentence in enumerate(model.process(sen_clean)):
             sentence_text = self.extract_one_sentence(one_sentence)
             results = self.extract_one_word(one_sentence, sentence_text)
             self.store_data.insert_data(self.cursor, results, self.language_name)
             print('line %d, batch %d for %s written succeed' % (line_no, i, self.language_name))
     print(' all written succeed for corpus of %s' % self.our_corpus_name)
Пример #4
0
def _udpipe_parse_line(m, line):
    """Process one line with the model and return its (pos, head, dep, tok) lists."""
    conllu = "".join(m.process(line, out_format="conllu"))

    pos, head, dep, tok = [], [], [], []
    # A line may contain several sentences; their words are flattened into
    # one list per column.
    for sentence in parse(conllu):
        for word in sentence:
            pos.append(word['upostag'])
            head.append(word['head'])
            dep.append(word['deprel'])
            tok.append(word['form'])
    return pos, head, dep, tok


def _udpipe_parse_lines(m, lines):
    """Parse each line and return four lists holding one per-line list each."""
    pos, head, dep, tok = [], [], [], []
    for line in lines:
        line_pos, line_head, line_dep, line_tok = _udpipe_parse_line(m, line)
        pos.append(line_pos)
        head.append(line_head)
        dep.append(line_dep)
        tok.append(line_tok)
    return pos, head, dep, tok


def corpy_udpipe(text,
                 sent_level=True,
                 model='english-lines-ud-2.5-191206.udpipe'):
    """
    Extract UPOS tags, heads, dependency relations and word forms with UDPipe.

    The model file is loaded from ``'../udpipe_model/' + model``. Each line is
    processed to CoNLL-U, parsed with ``parse``, and the words of all
    sentences found in that line are collected into one list per column.

    :param text: when ``sent_level`` is true, an iterable of lines; otherwise
        an iterable of documents, each an iterable of lines
    :param sent_level: choose between line-level and document-level nesting
    :param model: file name of the UDPipe model inside ``../udpipe_model/``
    :return: tuple ``(all_pos, all_head, all_dep, all_tok)``; each element has
        one list per line (``sent_level``) or one list of per-line lists per
        document
    """
    m = Model('../udpipe_model/' + model)
    print(model, "loaded successfully!")

    if sent_level:
        all_pos, all_head, all_dep, all_tok = _udpipe_parse_lines(m, text)
    else:
        # Document level: keep one extra nesting layer per document.
        all_pos, all_head, all_dep, all_tok = [], [], [], []
        for doc in text:
            doc_pos, doc_head, doc_dep, doc_tok = _udpipe_parse_lines(m, doc)
            all_pos.append(doc_pos)
            all_head.append(doc_head)
            all_dep.append(doc_dep)
            all_tok.append(doc_tok)

    return all_pos, all_head, all_dep, all_tok
Пример #5
0
class UdpipeTrain(ITrain):
    """Tag a corpus with a pre-trained UDPipe model and store the results."""

    def __init__(self, language_name, pre_model_name, our_corpus_name):
        """
        Connect to the database and load the pre-trained UDPipe model.

        The language of pre_model_name and our_corpus_name should be identical!

        :param language_name: language label passed on when results are stored
        :param pre_model_name: pre-trained UDPipe model handed to ``Model``
        :param our_corpus_name: path of our own corpus file
        """
        self.language_name = language_name
        self.pre_model_name = pre_model_name
        self.our_corpus_name = our_corpus_name
        # NOTE(review): any failure here (database or model loading) is only
        # printed, which leaves store_data / cursor / model unset on the
        # instance - confirm this best-effort behaviour is intended.
        try:
            self.store_data = StoreData(db_config['user'],
                                        db_config['password'],
                                        db_host=db_config['db_host'],
                                        db_name=db_config['db_name'])
            self.cursor = self.store_data.db_connect().cursor()

            # Second step: load the pre-trained UDPipe model.
            self.model = Model(self.pre_model_name)

        except Exception as ex:
            print('logging in database error %s' % ex)

    def load_data(self) -> str:
        """
        Yield the corpus file line by line, printing each line as it is read.

        :return: generator over the lines of ``self.our_corpus_name``
        """
        with open(self.our_corpus_name, 'r') as f:
            for sen in f:
                print('loading one sentence: %s' % (sen,))
                yield sen

        print('loading done for our corpus')

    def clean_data(self, data: str) -> str:
        """
        Remove every newline and tab character from ``data``.

        ``data`` is expected to hold one or several sentences; a line made up
        only of newlines/tabs becomes the empty string.

        :param data: raw data
        :return: data after cleaning
        """
        cleaned_data = re.sub('[\n\t]+', '', data)
        return cleaned_data

    def do_train(self) -> List[TResult]:
        """
        Run the pre-trained UDPipe model over the corpus and store the results.

        The pre-trained UDPipe models can be downloaded here:
        https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131

        Each corpus line is cleaned and processed by ``self.model``; for every
        sentence the model returns, the extracted results are written with
        ``self.store_data.insert_data``. Progress is reported with ``print``.

        :return: nothing is returned explicitly, despite the annotation
        """
        # enumerate() numbers every corpus line, including the ones skipped
        # below, so the progress message always shows the real line number.
        for line_no, sen in enumerate(self.load_data(), start=1):
            sen_clean = self.clean_data(sen)
            if not sen_clean:
                continue
            for i, one_sentence in enumerate(self.model.process(sen_clean)):
                sentence_text = self.extract_one_sentence(one_sentence)
                results = self.extract_one_word(one_sentence, sentence_text)
                self.store_data.insert_data(self.cursor, results, self.language_name)
                print('line %d, batch %d for %s written succeed' % (line_no, i, self.language_name))
        print(' all written succeed for corpus of %s' % self.our_corpus_name)

    def extract_one_sentence(self, sentence) -> str:
        """
        Extract the sentence text from the comments of a UDPipe sentence.

        An instance of a UDPipe Sentence looks like this (shortened):

            Sentence(
                comments=[
                  '# sent_id = 3',
                  '# text = 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。'],
                words=[
                  Word(id=0, <root>),
                  Word(id=1,
                       form='黄土',
                       lemma='黄土',
                       xpostag='NNP',
                       upostag='PROPN',
                       head=3,
                       deprel='nmod',
                       misc='SpaceAfter=No'),
                  ...])

        The comments are concatenated and everything following the first
        ``text = `` is returned.

        :param sentence: UDPipe Sentence
        :return: the sentence text, or '' when no ``text = `` comment exists
        """
        comment = ''.join(sentence.comments)
        try:
            return re.findall(r'text = (.*)', comment)[0]
        except IndexError as e:
            # No match: findall() returned an empty list.
            # TODO: need to write warning log
            print('error: not find a sentence', e)
            return ''

    def extract_one_word(self, sentence, sentence_text: str) -> List[TResult]:
        """
        Build one ``TResult`` per usable word of a UDPipe sentence.

        A word is used when it has a lemma that is not in
        ``ITrain.FILTER_WORD`` and has a ``upostag``. Nothing is produced when
        ``sentence_text`` is empty.

        :param sentence: UDPipe Sentence
        :param sentence_text: text of the sentence the words belong to
        :return: list of ``TResult(lemma, upostag, sentence_text)``
        """
        # The sentence text is the same for every word, so check it once.
        if not sentence_text:
            return []
        return [
            TResult(word.lemma, word.upostag, sentence_text)
            for word in sentence.words
            if word.lemma and word.lemma not in ITrain.FILTER_WORD and word.upostag
        ]

    def word_segmentation(self, sentence) -> List[str]:
        """
        Split ``sentence`` into words using the UDPipe model.

        :param sentence: raw text
        :return: word list - the ``word`` attribute of every extracted result
        """
        sen_clean = self.clean_data(sentence)
        if not sen_clean:
            return []
        words = []
        for one_sentence in self.model.process(sen_clean):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            words.extend(res.word for res in results)
        return words
          'rb') as f:
    for line in f.readlines():
        data.append(str(line, 'utf-8'))

model = 'german-hdt-ud-2.5-191206.udpipe'

m = Model('../udpipe_model/' + model)
print(model, "loaded successfully!")

# One list of UPOS tags per input line.
all_pos = []

for line in data:
    conllu = "".join(m.process(line, out_format="conllu"))
    parse_con = parse(conllu)

    # Collect the tags of every sentence found in the line. Indexing only
    # parse_con[0] would raise IndexError for a line that yields no sentence
    # and would drop the tags of any further sentences.
    sent_pos = [word['upostag'] for sentence in parse_con for word in sentence]

    # Append the line's tags to the document-level list.
    all_pos.append(sent_pos)

# write pos list into a file
with open('de_pos', 'wb') as f:
Пример #7
0
from corpy.udpipe import Model
from corpy.udpipe import pprint


# Absolute path of the classical_chinese-kyoto UDPipe 2.5 model file.
model_path = "/home/zglg/SLU/psd/pre-model/classical_chinese-kyoto-ud-2.5-191206.udpipe"
m = Model(model_path)

# Process a short sample text and pretty-print the resulting sentences.
sample_text = "我爱北京天安门. 天安门上好风景"
sents = list(m.process(sample_text))
pprint(sents)