Примеры использования Model.process в Python

Язык программирования: Python

Пространство имен/Пакет: corpy.udpipe

Класс/Тип: Model

Метод/Функция: process

Примеров на hotexamples.com: 7

Python Model.process — найдено 7 примеров. Это лучшие примеры Python-кода для corpy.udpipe.Model.process, полученные из проектов с открытым исходным кодом. Вы можете оценивать каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

Model(11)

process(7)

Основные методы

Model (11)

process (7)

Пример #1

Показать файл

class UDPipeTokenizer:
    """Tag text with a UDPipe model.

    Wraps the ``Model`` loaded from ``udpipe_model_path`` and exposes the
    (upostag, deprel) pair of every token the model produces.
    """

    def __init__(self, udpipe_model_path):
        self.udpipe_model = Model(udpipe_model_path)

    def tokenize(self, sentence: str) -> List[Tuple[str, str]]:
        """
        return: list of pairs of tags (POS, DEP_REL) for each token in the sentence

        The model may yield no sentence at all (for example on empty input) or
        split the input into several sentences.  Tokens of every yielded
        sentence are collected, so the result is ``[]`` instead of an
        IndexError when nothing was parsed, and no token is dropped when the
        input is split.
        """
        tags = []
        for parsed in self.udpipe_model.process(sentence):
            tags.extend(
                (word.upostag, word.deprel)
                for word in parsed.words
                # entries tagged '<root>' are not real tokens of the input
                if word.upostag != '<root>'
            )
        return tags

Пример #2

Показать файл

Файл: pos_filter.py Проект: AdilGM/poetry_generation

class UdpipeTagger:
    """Look up the tag of a single word with a UDPipe model."""

    def __init__(self, file = None, **kwarg):
        # A model file is mandatory; extra keyword arguments are accepted
        # but not used.
        if not file:
            raise Exception("You should pass the model")
        self.model = Model(file)

    def get_pos_tag(self, word):
        """Return the ``xpostag`` of the entry at index 1 of the first sentence.

        The word is run through the model and only the first resulting
        sentence is inspected.  When that sentence does not hold exactly two
        entries, the input and the entries are printed before the tag is
        returned.
        """
        sentences = list(self.model.process(word))
        first = sentences[0]

        entry_count = len(first.words)
        if entry_count != 2:
            print(word, first.words)

        return first.words[1].xpostag

Пример #3

Показать файл

 def do_train(self) -> List[TResult]:
     """
     Tag the corpus with a pre-trained udpipe model and store the results.

     Pre-trained udpipe models can be downloaded here:
     https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131

     Every parsed sentence is handed to ``self.store_data.insert_data`` and
     progress is printed; the method has no explicit return statement.
     :return:
     """
     udpipe_model = Model(self.pre_model_name)
     # counts only the corpus entries that are non-empty after cleaning
     processed = 0
     for raw_entry in self.load_data():
         cleaned = self.clean_data(raw_entry)
         if not cleaned:
             continue
         processed += 1
         # parse the whole entry before anything is written
         parsed_sentences = list(udpipe_model.process(cleaned))
         for batch_no, parsed in enumerate(parsed_sentences):
             text = self.extract_one_sentence(parsed)
             rows = self.extract_one_word(parsed, text)
             self.store_data.insert_data(self.cursor, rows, self.language_name)
             print('line %d, batch %d for %s written succeed' % (processed, batch_no, self.language_name))
     print(' all written succeed for corpus of %s' % self.our_corpus_name)

Пример #4

Показать файл

Файл: utils.py Проект: tianfrank/translation_detection

def _tag_line(udpipe_model, line):
    """Parse one line and return its (pos, head, dep, tok) lists.

    All sentences found in the line are flattened into the same four lists.
    """
    conllu = "".join(udpipe_model.process(line, out_format="conllu"))
    pos, head, dep, tok = [], [], [], []
    for sentence in parse(conllu):
        for word in sentence:
            pos.append(word['upostag'])
            head.append(word['head'])
            dep.append(word['deprel'])
            tok.append(word['form'])
    return pos, head, dep, tok


def _tag_lines(udpipe_model, lines):
    """Tag every line and return four parallel lists with one entry per line."""
    all_pos, all_head, all_dep, all_tok = [], [], [], []
    for line in lines:
        pos, head, dep, tok = _tag_line(udpipe_model, line)
        all_pos.append(pos)
        all_head.append(head)
        all_dep.append(dep)
        all_tok.append(tok)
    return all_pos, all_head, all_dep, all_tok


def corpy_udpipe(text,
                 sent_level=True,
                 model='english-lines-ud-2.5-191206.udpipe'):
    """Extract POS tags, heads, dependency relations and forms with UDPipe.

    :param text: with ``sent_level=True`` an iterable of lines; otherwise an
        iterable of documents, each of which is an iterable of lines
    :param sent_level: selects between the flat (per line) and the nested
        (per document, per line) input/output layout
    :param model: file name of the UDPipe model inside ``../udpipe_model/``
    :return: tuple ``(all_pos, all_head, all_dep, all_tok)``; each element
        holds one list per line, grouped into one list per document when
        ``sent_level`` is false
    """
    m = Model('../udpipe_model/' + model)
    print(model, "loaded successfully!")

    if sent_level:
        return _tag_lines(m, text)

    # for doc-level: same per-line work, grouped one level deeper
    all_pos, all_head, all_dep, all_tok = [], [], [], []
    for doc in text:
        pos_per_doc, head_per_doc, dep_per_doc, tok_per_doc = _tag_lines(m, doc)
        all_pos.append(pos_per_doc)
        all_head.append(head_per_doc)
        all_dep.append(dep_per_doc)
        all_tok.append(tok_per_doc)

    return all_pos, all_head, all_dep, all_tok

Пример #5

Показать файл

Файл: train_model.py Проект: harisalam/wordfinder

class UdpipeTrain(ITrain):
    """Tag a corpus file with a udpipe model and store lemma/POS results."""

    def __init__(self, language_name, pre_model_name, our_corpus_name):
        """
        The language of pre_model_name and our_corpus_name should be identical!

        :param language_name: language label passed along when results are stored
        :param pre_model_name: udpipe model handed to ``Model``
        :param our_corpus_name: path of the corpus file that ``load_data`` opens
        """
        self.language_name = language_name
        self.pre_model_name = pre_model_name
        self.our_corpus_name = our_corpus_name
        try:
            self.store_data = StoreData(db_config['user'],
                                        db_config['password'],
                                        db_host=db_config['db_host'],
                                        db_name=db_config['db_name'])
            self.cursor = self.store_data.db_connect().cursor()
        except Exception as ex:
            print('logging in database error %s' % ex)
        else:
            # The model is still loaded only after the database connection
            # succeeded, but a failure here is now reported as a model problem
            # instead of being mislabelled as a database error.
            try:
                self.model = Model(self.pre_model_name)
            except Exception as ex:
                print('loading udpipe model error %s' % ex)

    def load_data(self) -> str:
        """
        Yield the corpus file line by line, printing each line as it is read.

        The file is decoded as UTF-8 explicitly so that reading does not
        depend on the platform's default encoding.

        NOTE: this is a generator; the ``str`` annotation describes the
        yielded items.
        """
        with open(self.our_corpus_name, 'r', encoding='utf-8') as f:
            for sen in f:
                print('loading one sentence: %s' % (sen,))
                yield sen

        print('loading done for our corpus')

    def clean_data(self, data: str) -> str:
        """
        Remove every newline and tab character from the raw data.

        data is one or several sentence(s) we expect; input consisting only
        of newlines/tabs becomes an empty string.

        :param data: raw data
        :return: data after cleaning
        """
        cleaned_data = re.sub('[\n\t]+', '', data)
        return cleaned_data

    def do_train(self) -> List[TResult]:
        """
        Use a pre-trained udpipe model to get the results for our corpus.

        These udpipe models can be downloaded here:
        https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131

        Every parsed sentence is written through ``self.store_data`` and
        progress is printed.  NOTE: the method has no return statement; the
        annotation is left unchanged for interface stability.
        :return:
        """
        # line_no counts only corpus lines that are non-empty after cleaning
        line_no = 1
        for sen in self.load_data():
            sen_clean = self.clean_data(sen)
            if not sen_clean:
                continue
            # parse the whole line before anything is written
            word_pos = list(self.model.process(sen_clean))
            for i, one_sentence in enumerate(word_pos):
                sentence_text = self.extract_one_sentence(one_sentence)
                results = self.extract_one_word(one_sentence, sentence_text)
                self.store_data.insert_data(self.cursor, results, self.language_name)
                print('line %d, batch %d for %s written succeed' % (line_no, i, self.language_name))
            line_no += 1
        print(' all written succeed for corpus of %s' % self.our_corpus_name)

    def extract_one_sentence(self, sentence) -> str:
        """
        Extract the sentence text from the comments of a udpipe Sentence.

        An instance of udpipe Sentence looks like:
        Sentence(
            comments=[
              '# sent_id = 3',
              '# text = 黄土高原严寒而漫长的冬天看来就要过去，但那真正温暖的春天还远远地没有到来。'],
            words=[
              Word(id=0, <root>),
              Word(id=1,
                   form='黄土',
                   lemma='黄土',
                   xpostag='NNP',
                   upostag='PROPN',
                   head=3,
                   deprel='nmod',
                   misc='SpaceAfter=No'),
              Word(id=2,
                   form='高原',
                   lemma='高原',
                   xpostag='NN',
                   upostag='NOUN',
                   head=3,
                   deprel='nmod',
                   misc='SpaceAfter=No'),
              Word(id=3,
                   form='严寒',
                   lemma='严寒',
                   xpostag='NN',
                   upostag='NOUN',
                   head=22,
                   deprel='nsubj',
                   misc='SpaceAfter=No'),

              (remaining words omitted) ])

        :param sentence: udpipe Sentence
        :return: the text after ``text = `` (for the example above:
            黄土高原严寒而漫长的冬天看来就要过去，但那真正温暖的春天还远远地没有到来。),
            or '' when no such comment is found
        """
        # Join with newlines: '.' does not match a newline, so the capture
        # stops at the end of the `text = ` comment even when another comment
        # follows it.
        comment = '\n'.join(sentence.comments)
        try:
            cs = re.findall(r'text = (.*)', comment)[0]
            return cs
        except Exception as e:
            # TODO: need to write warning log
            print('error: not find a sentence', e)
            return ''

    def extract_one_word(self, sentence, sentence_text: str) -> List[TResult]:
        """
        Build one TResult per usable word of the sentence.

        A word is used when it has a lemma that is not in
        ``ITrain.FILTER_WORD`` and a upostag.  Without a sentence text
        nothing is produced.

        :param sentence_text: text of the sentence, stored with every result
        :param sentence: udpipe Sentence
        :return: list of TResult(lemma, upostag, sentence_text)
        """
        # the text is the same for every word, so check it once up front
        if not sentence_text:
            return []
        return [
            TResult(word.lemma, word.upostag, sentence_text)
            for word in sentence.words
            if word.lemma and word.lemma not in ITrain.FILTER_WORD and word.upostag
        ]

    def word_segmentation(self, sentence) -> List[str]:
        """
        Clean the input, parse it with the model and collect the ``word``
        attribute of every extracted result.

        :param sentence: raw text to segment
        :return: word list (empty when nothing is left after cleaning)
        """
        sen_clean = self.clean_data(sentence)
        if not sen_clean:
            return []
        words = []
        for one_sentence in self.model.process(sen_clean):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            words.extend(res.word for res in results)
        return words

Пример #6

Показать файл

Файл: create_pos_corpus.py Проект: tianfrank/translation_detection

          'rb') as f:
    for line in f.readlines():
        data.append(str(line, 'utf-8'))

# File name of the UDPipe model; it is looked up inside ../udpipe_model/.
model = 'german-hdt-ud-2.5-191206.udpipe'

m = Model('../udpipe_model/' + model)
print(model, "loaded successfully!")

# One list of upostag values per input line.
all_pos = []

for line in data:
    #print(line)
    sent_pos = []

    # Ask the model for CoNLL-U output and glue the pieces into one string
    # so that it can be handed to parse().
    sents = list(m.process(line, out_format="conllu"))

    conllu = "".join(sents)
    parse_con = parse(conllu)

    # Iterate over each word of the first parsed sentence and collect its POS.
    # NOTE(review): only parse_con[0] is read, so any further sentence found in
    # the same line is ignored, and an empty parse result would raise
    # IndexError -- confirm that every line holds exactly one sentence.

    for i in parse_con[0]:
        #print(i)
        sent_pos.append(i['upostag'])

    # append this line's POS list to the overall result
    all_pos.append(sent_pos)

# write pos list into a file
with open('de_pos', 'wb') as f:

Пример #7

Показать файл

from corpy.udpipe import Model
from corpy.udpipe import pprint


# Load the UDPipe model file from an absolute local path.
m = Model("/home/zglg/SLU/psd/pre-model/classical_chinese-kyoto-ud-2.5-191206.udpipe")

# Run the model on a sample text and materialise everything it yields.
sents = list(m.process("我爱北京天安门. 天安门上好风景"))
# Print the result with the pprint helper imported from corpy.udpipe.
pprint(sents)