import os
import pickle

from tqdm import tqdm

# Token and Doc are project classes; see the sketch after this function.


def read_data_from_corpus_folder(self, corpus):
    documents = list()
    # The corpus file is a pickled list of {'path': ..., 'data': ...} records.
    with open(corpus, 'rb') as corpus_file:
        data = pickle.load(corpus_file)
    for rule_data in data:
        pairs_counter = 0
        rule = rule_data['path']
        rule_name = os.path.basename(rule).replace('.pk', '')
        print('start reading {}'.format(rule))
        for topic in tqdm(rule_data['data']):
            for tweet in topic['tweets']:
                doc_text = ''
                # Document id: tweet id, running topic counter, rule name.
                doc_id = '{}_{}{}'.format(tweet['id'], pairs_counter,
                                          rule_name)
                tokens = list()
                for sent_id, sent in enumerate(tweet['tokens']):
                    #  TODO: maybe change the tok_id (raise only for valid tokens)
                    for token_id, token in enumerate(sent):
                        tok_text = token
                        tokens.append(
                            Token(sent_id + 1, token_id, tok_text))
                        # Rebuild the raw text: glue punctuation and
                        # contractions to the previous token, otherwise
                        # join with a space.
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in [
                                '.', ',', '?', '!', '\'re', '\'s', 'n\'t',
                                '\'ve', '\'m', '\'ll'
                        ]:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text

                documents.append(Doc(doc_id, doc_text, tokens))
            pairs_counter += 1
    return documents
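All of the readers in this listing construct Token and Doc objects whose definitions are not shown. A minimal sketch consistent with the call sites above (field names are assumptions inferred from usage, not the project's actual classes):

from dataclasses import dataclass, field
from typing import List


@dataclass
class Token:
    sent_id: int  # sentence index within the document
    tok_id: int   # token index within the sentence
    text: str     # surface form


@dataclass
class Doc:
    doc_id: str
    text: str                # reconstructed raw text ('' when not rebuilt)
    tokens: List[Token] = field(default_factory=list)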
Example #2
# SRLSentence, SRLVerb, IDataLoader, and Doc are project classes.
from allennlp.predictors import Predictor  # allennlp 0.x


def run_srl(ecb_path: str, data_loader: IDataLoader):
    documents = data_loader.read_data_from_corpus_folder(ecb_path)
    # Pre-trained AllenNLP semantic-role-labeling model.
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
    )

    sentences = Doc.to_sentences(documents)
    all_sentence_verbs = list()
    for sentence in sentences:
        srl_sentence = SRLSentence(sentence.doc_id, sentence.sent_id)
        sentence_words = sentence.get_sentence_words()
        prediction = predictor.predict_tokenized(
            tokenized_sentence=sentence_words)
        verbs = prediction['verbs']
        words = prediction['words']
        # One SRLVerb per predicate the model found in the sentence.
        for verb in verbs:
            srl_verb = SRLVerb()
            tags = verb['tags']
            srl_verb.add_var(tags, words)
            srl_sentence.add_srl_vrb(srl_verb)

        all_sentence_verbs.append(srl_sentence)
        print('Done with sentence from doc-' + sentence.doc_id +
              ', with id-' + str(sentence.sent_id))

    return all_sentence_verbs
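The add_var call above presumably maps the BIO tag sequence returned by the predictor onto argument spans. A standalone sketch of that grouping (hypothetical helper, not the project's SRLVerb.add_var), assuming standard AllenNLP BIO output:

def group_srl_arguments(tags, words):
    # Collect words under their argument label, dropping the B-/I- prefix.
    spans = {}
    for tag, word in zip(tags, words):
        if tag == 'O':
            continue
        label = tag.split('-', 1)[1]
        spans.setdefault(label, []).append(word)
    return spans


# group_srl_arguments(['B-ARG0', 'B-V', 'B-ARG1'],
#                     ['Police', 'arrested', 'suspects'])
# -> {'ARG0': ['Police'], 'V': ['arrested'], 'ARG1': ['suspects']}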
Example #3
import json
import pickle
from os import walk
from os.path import join
from xml.etree import ElementTree

import spacy
from tqdm import tqdm


def read_data_from_corpus_folder(self, corpus):
    ret_docs = list()
    with open(corpus) as json_file:
        data = json.load(json_file)
        last_doc_id = None
        tok_inx = 0
        for doc_id, doc in data.items():
            tokens = list()
            for tok in doc:
                # Each token record is [sent_id, <unused>, text, <unused>].
                sent_id, _, tok_text, _ = tok
                # Restart the running token index for every new document.
                if last_doc_id != doc_id:
                    tok_inx = 0
                    last_doc_id = doc_id
                tokens.append(Token(sent_id, int(tok_inx), tok_text))
                tok_inx += 1
            ret_docs.append(Doc(doc_id, "", tokens))
    return ret_docs
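This reader expects a JSON mapping from document id to a list of token records, each unpacked as (sent_id, _, token_text, _). A hand-built file in that shape (contents invented for illustration):

import json

sample = {
    'doc_1': [
        [0, 0, 'Police', True],
        [0, 1, 'arrested', True],
        [1, 0, 'They', True],
    ]
}
with open('sample_corpus.json', 'w') as out:
    json.dump(sample, out)
# read_data_from_corpus_folder('sample_corpus.json') now returns one
# Doc('doc_1', '', [...]) holding three Tokens.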
def read_data_from_corpus_folder(self, corpus):
    documents = list()
    for (dirpath, folders, files) in walk(corpus):
        for file in files:
            is_ecb_plus = False
            if file.endswith('.xml'):
                print('processing file-', file)

                if 'ecbplus' in file:
                    is_ecb_plus = True

                tree = ElementTree.parse(join(dirpath, file))
                root = tree.getroot()
                doc_id = root.attrib['doc_name']
                tokens = list()
                doc_text = ''
                for elem in root:
                    if elem.tag == 'token':
                        sent_id = int(elem.attrib['sentence'])
                        tok_id = elem.attrib['number']
                        tok_text = elem.text
                        # ECB+ files carry a metadata sentence with id 0;
                        # skip it and shift the remaining ids down by one.
                        if is_ecb_plus and sent_id == 0:
                            continue
                        if is_ecb_plus:
                            sent_id = sent_id - 1

                        tokens.append(Token(sent_id, int(tok_id),
                                            tok_text))
                        # Rebuild the raw text: glue punctuation and
                        # contractions to the previous token.
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in [
                                '.', ',', '?', '!', '\'re', '\'s', 'n\'t',
                                '\'ve', '\'m', '\'ll'
                        ]:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text

                documents.append(Doc(doc_id, doc_text, tokens))

    return documents
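The XML reader relies only on a root doc_name attribute and <token> children carrying sentence and number attributes. A minimal document in that shape (tag name and contents invented for illustration, modeled on the ECB+ layout):

from xml.etree import ElementTree

sample_xml = '''<Document doc_name="1_1ecb.xml">
    <token sentence="0" number="0">Police</token>
    <token sentence="0" number="1">arrested</token>
</Document>'''

root = ElementTree.fromstring(sample_xml)
print(root.attrib['doc_name'])  # 1_1ecb.xml
for elem in root:
    if elem.tag == 'token':
        print(elem.attrib['sentence'], elem.attrib['number'], elem.text)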
def read_data_from_corpus_folder_old(self, corpus):
    nlp = spacy.load('en_core_web_sm')
    documents = list()
    with open(corpus, 'rb') as corpus_file:
        data = pickle.load(corpus_file)
    pairs_counter = 0
    # Note: only the first two (rule, pairs) entries are read.
    for rule, pairs in data[:2]:
        print('start reading {}'.format(rule))
        for topic in tqdm(pairs):
            for tweet in topic:
                doc_id = '{}_{}'.format(tweet[0], pairs_counter)
                text = tweet[1]
                tokens = list()
                doc_text = ''
                doc = nlp(text)
                for sent_id, sent in enumerate(doc.sents):
                    #  TODO: maybe change the tok_id (raise only for valid tokens)
                    for token_id, token in enumerate(sent):
                        tok_text = str(token)

                        #  ignore hashtag/mention markers and URL tokens
                        if tok_text in ['#', '@'] or self.is_url(tok_text):
                            continue
                        #  strip a leading '@' from mention tokens
                        if len(tok_text) > 1 and tok_text.startswith('@'):
                            tok_text = tok_text.replace('@', '', 1)

                        tokens.append(Token(sent_id, token_id, tok_text))
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in [
                                '.', ',', '?', '!', '\'re', '\'s', 'n\'t',
                                '\'ve', '\'m', '\'ll'
                        ]:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text
                documents.append(Doc(doc_id, doc_text, tokens))
            pairs_counter += 1
    return documents
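The method above calls a self.is_url helper that is not shown in this listing; a minimal sketch of such a check (the project's actual implementation may differ):

def is_url(self, text):
    # Sketch: treat anything starting with a URL scheme or 'www.' as a URL.
    return text.startswith(('http://', 'https://', 'www.'))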