import os
import pickle

from tqdm import tqdm

# Token and Doc are project classes; see the sketch after this function.


def read_data_from_corpus_folder(self, corpus):
    documents = list()
    # The corpus file is a pickled list of {'path': ..., 'data': ...} records.
    with open(corpus, 'rb') as corpus_file:
        data = pickle.load(corpus_file)
    for rule_data in data:
        pairs_counter = 0
        rule = rule_data['path']
        rule_name = os.path.basename(rule).replace('.pk', '')
        print('start reading {}'.format(rule))
        for topic in tqdm(rule_data['data']):
            for tweet in topic['tweets']:
                doc_text = ''
                # Document id: tweet id, running topic counter, rule name.
                doc_id = '{}_{}{}'.format(tweet['id'], pairs_counter,
                                          rule_name)
                tokens = list()
                for sent_id, sent in enumerate(tweet['tokens']):
                    #  TODO: maybe change the tok_id (raise only for valid tokens)
                    for token_id, token in enumerate(sent):
                        tok_text = token
                        tokens.append(
                            Token(sent_id + 1, token_id, tok_text))
                        # Rebuild the raw text: glue punctuation and
                        # contractions to the previous token, otherwise
                        # join with a space.
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in [
                                '.', ',', '?', '!', '\'re', '\'s', 'n\'t',
                                '\'ve', '\'m', '\'ll'
                        ]:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text

                documents.append(Doc(doc_id, doc_text, tokens))
            pairs_counter += 1
    return documents
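All of the readers in this listing construct Token and Doc objects whose definitions are not shown. A minimal sketch consistent with the call sites above (field names are assumptions inferred from usage, not the project's actual classes):

from dataclasses import dataclass, field
from typing import List


@dataclass
class Token:
    sent_id: int  # sentence index within the document
    tok_id: int   # token index within the sentence
    text: str     # surface form


@dataclass
class Doc:
    doc_id: str
    text: str                # reconstructed raw text ('' when not rebuilt)
    tokens: List[Token] = field(default_factory=list)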
Example #2
# SRLSentence, SRLVerb, IDataLoader, and Doc are project classes.
from allennlp.predictors import Predictor  # allennlp 0.x


def run_srl(ecb_path: str, data_loader: IDataLoader):
    documents = data_loader.read_data_from_corpus_folder(ecb_path)
    # Pre-trained AllenNLP semantic-role-labeling model.
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
    )

    sentences = Doc.to_sentences(documents)
    all_sentence_verbs = list()
    for sentence in sentences:
        srl_sentence = SRLSentence(sentence.doc_id, sentence.sent_id)
        sentence_words = sentence.get_sentence_words()
        prediction = predictor.predict_tokenized(
            tokenized_sentence=sentence_words)
        verbs = prediction['verbs']
        words = prediction['words']
        # One SRLVerb per predicate the model found in the sentence.
        for verb in verbs:
            srl_verb = SRLVerb()
            tags = verb['tags']
            srl_verb.add_var(tags, words)
            srl_sentence.add_srl_vrb(srl_verb)

        all_sentence_verbs.append(srl_sentence)
        print('Done with sentence from doc-' + sentence.doc_id +
              ', with id-' + str(sentence.sent_id))

    return all_sentence_verbs
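The add_var call above presumably maps the BIO tag sequence returned by the predictor onto argument spans. A standalone sketch of that grouping (hypothetical helper, not the project's SRLVerb.add_var), assuming standard AllenNLP BIO output:

def group_srl_arguments(tags, words):
    # Collect words under their argument label, dropping the B-/I- prefix.
    spans = {}
    for tag, word in zip(tags, words):
        if tag == 'O':
            continue
        label = tag.split('-', 1)[1]
        spans.setdefault(label, []).append(word)
    return spans


# group_srl_arguments(['B-ARG0', 'B-V', 'B-ARG1'],
#                     ['Police', 'arrested', 'suspects'])
# -> {'ARG0': ['Police'], 'V': ['arrested'], 'ARG1': ['suspects']}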
Example #3
import json
import pickle
from os import walk
from os.path import join
from xml.etree import ElementTree

import spacy
from tqdm import tqdm


def read_data_from_corpus_folder(self, corpus):
    ret_docs = list()
    with open(corpus) as json_file:
        data = json.load(json_file)
        last_doc_id = None
        tok_inx = 0
        for doc_id, doc in data.items():
            tokens = list()
            for tok in doc:
                # Each token record is [sent_id, <unused>, text, <unused>].
                sent_id, _, tok_text, _ = tok
                # Restart the running token index for every new document.
                if last_doc_id != doc_id:
                    tok_inx = 0
                    last_doc_id = doc_id
                tokens.append(Token(sent_id, int(tok_inx), tok_text))
                tok_inx += 1
            ret_docs.append(Doc(doc_id, "", tokens))
    return ret_docs
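This reader expects a JSON mapping from document id to a list of token records, each unpacked as (sent_id, _, token_text, _). A hand-built file in that shape (contents invented for illustration):

import json

sample = {
    'doc_1': [
        [0, 0, 'Police', True],
        [0, 1, 'arrested', True],
        [1, 0, 'They', True],
    ]
}
with open('sample_corpus.json', 'w') as out:
    json.dump(sample, out)
# read_data_from_corpus_folder('sample_corpus.json') now returns one
# Doc('doc_1', '', [...]) holding three Tokens.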
def read_data_from_corpus_folder(self, corpus):
    documents = list()
    for (dirpath, folders, files) in walk(corpus):
        for file in files:
            is_ecb_plus = False
            if file.endswith('.xml'):
                print('processing file-', file)

                if 'ecbplus' in file:
                    is_ecb_plus = True

                tree = ElementTree.parse(join(dirpath, file))
                root = tree.getroot()
                doc_id = root.attrib['doc_name']
                tokens = list()
                doc_text = ''
                for elem in root:
                    if elem.tag == 'token':
                        sent_id = int(elem.attrib['sentence'])
                        tok_id = elem.attrib['number']
                        tok_text = elem.text
                        # ECB+ files carry a metadata sentence with id 0;
                        # skip it and shift the remaining ids down by one.
                        if is_ecb_plus and sent_id == 0:
                            continue
                        if is_ecb_plus:
                            sent_id = sent_id - 1

                        tokens.append(Token(sent_id, int(tok_id),
                                            tok_text))
                        # Rebuild the raw text: glue punctuation and
                        # contractions to the previous token.
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in [
                                '.', ',', '?', '!', '\'re', '\'s', 'n\'t',
                                '\'ve', '\'m', '\'ll'
                        ]:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text

                documents.append(Doc(doc_id, doc_text, tokens))

    return documents
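The XML reader relies only on a root doc_name attribute and <token> children carrying sentence and number attributes. A minimal document in that shape (tag name and contents invented for illustration, modeled on the ECB+ layout):

from xml.etree import ElementTree

sample_xml = '''<Document doc_name="1_1ecb.xml">
    <token sentence="0" number="0">Police</token>
    <token sentence="0" number="1">arrested</token>
</Document>'''

root = ElementTree.fromstring(sample_xml)
print(root.attrib['doc_name'])  # 1_1ecb.xml
for elem in root:
    if elem.tag == 'token':
        print(elem.attrib['sentence'], elem.attrib['number'], elem.text)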
def read_data_from_corpus_folder_old(self, corpus):
    nlp = spacy.load('en_core_web_sm')
    documents = list()
    with open(corpus, 'rb') as corpus_file:
        data = pickle.load(corpus_file)
    pairs_counter = 0
    # Note: only the first two (rule, pairs) entries are read.
    for rule, pairs in data[:2]:
        print('start reading {}'.format(rule))
        for topic in tqdm(pairs):
            for tweet in topic:
                doc_id = '{}_{}'.format(tweet[0], pairs_counter)
                text = tweet[1]
                tokens = list()
                doc_text = ''
                doc = nlp(text)
                for sent_id, sent in enumerate(doc.sents):
                    #  TODO: maybe change the tok_id (raise only for valid tokens)
                    for token_id, token in enumerate(sent):
                        tok_text = str(token)

                        #  ignore hashtag/mention markers and URL tokens
                        if tok_text in ['#', '@'] or self.is_url(tok_text):
                            continue
                        #  strip a leading '@' from mention tokens
                        if len(tok_text) > 1 and tok_text.startswith('@'):
                            tok_text = tok_text.replace('@', '', 1)

                        tokens.append(Token(sent_id, token_id, tok_text))
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in [
                                '.', ',', '?', '!', '\'re', '\'s', 'n\'t',
                                '\'ve', '\'m', '\'ll'
                        ]:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text
                documents.append(Doc(doc_id, doc_text, tokens))
            pairs_counter += 1
    return documents
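The method above calls a self.is_url helper that is not shown in this listing; a minimal sketch of such a check (the project's actual implementation may differ):

def is_url(self, text):
    # Sketch: treat anything starting with a URL scheme or 'www.' as a URL.
    return text.startswith(('http://', 'https://', 'www.'))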