def read_data_from_corpus_folder(self, corpus):
    """Read a pickled tweet corpus and build one Doc (with its Token list) per tweet."""
    documents = list()
    data = pickle.load(open(corpus, 'rb'))
    for rule_data in data:
        pairs_counter = 0
        rule = rule_data['path']
        rule_name = os.path.basename(rule).replace('.pk', '')
        print('start reading {}'.format(rule))
        for topic in tqdm(rule_data['data']):
            for tweet in topic['tweets']:
                doc_text = ''
                doc_id = '{}_{}{}'.format(tweet['id'], pairs_counter, rule_name)
                tokens = list()
                for sent_id, sent in enumerate(tweet['tokens']):
                    # TODO: maybe change the tok_id (raise only for valid tokens)
                    for token_id, token in enumerate(sent):
                        tok_text = token
                        tokens.append(Token(sent_id + 1, token_id, tok_text))
                        # Rebuild the document text, attaching punctuation and
                        # contraction suffixes to the previous token.
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in ['.', ',', '?', '!', '\'re', '\'s',
                                          'n\'t', '\'ve', '\'m', '\'ll']:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text
                documents.append(Doc(doc_id, doc_text, tokens))
                pairs_counter += 1
    return documents
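# The pickle schema consumed above is not documented in this section; the sketch below is
# an assumption inferred from the fields the loader accesses ('path', 'data', 'tweets',
# 'id', 'tokens'). The file name and values are illustrative only.
EXAMPLE_TWEET_CORPUS = [
    {
        'path': 'rules/example_rule.pk',   # basename minus '.pk' becomes rule_name
        'data': [                          # one entry per topic
            {
                'tweets': [
                    {
                        'id': 1234567890,
                        # one list of token strings per sentence
                        'tokens': [['This', 'is', 'an', 'example', 'tweet', '.']],
                    },
                ],
            },
        ],
    },
]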
def run_srl(ecb_path: str, data_loader: IDataLoader):
    """Run the AllenNLP SRL predictor over every sentence of the loaded documents."""
    documents = data_loader.read_data_from_corpus_folder(ecb_path)
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
    sentences = Doc.to_sentences(documents)
    all_sentence_verbs = list()
    for sentence in sentences:
        srl_sentence = SRLSentence(sentence.doc_id, sentence.sent_id)
        sentence_word = sentence.get_sentence_words()
        prediction = predictor.predict_tokenized(tokenized_sentence=sentence_word)
        verbs = prediction['verbs']
        words = prediction['words']
        for verb in verbs:
            srl_verb = SRLVerb()
            tags = verb['tags']
            srl_verb.add_var(tags, words)
            srl_sentence.add_srl_vrb(srl_verb)
        all_sentence_verbs.append(srl_sentence)
        print('Done with sentence from doc-{}, with id-{}'.format(
            sentence.doc_id, sentence.sent_id))
    return all_sentence_verbs
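# For reference, predictor.predict_tokenized(...) returns a dict shaped roughly like the
# sketch below: one entry in 'verbs' per predicate, with BIO-style SRL tags aligned to
# 'words'. run_srl only consumes the 'verbs' and 'words' fields; values are illustrative.
EXAMPLE_SRL_PREDICTION = {
    'words': ['The', 'police', 'arrested', 'the', 'suspect', '.'],
    'verbs': [
        {
            'verb': 'arrested',
            'tags': ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'O'],
        },
    ],
}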
def read_data_from_corpus_folder(self, corpus):
    """Read a JSON corpus (doc_id -> token records) and build Doc objects without raw text."""
    ret_docs = list()
    with open(corpus) as json_file:
        data = json.load(json_file)
        last_doc_id = None
        tok_inx = 0
        for doc_id, doc in data.items():
            tokens = list()
            for tok in doc:
                # Only the sentence id and the token text of each record are used here.
                sent_id, _, tok_text, _ = tok
                if last_doc_id != doc_id:
                    tok_inx = 0
                    last_doc_id = doc_id
                tokens.append(Token(sent_id, int(tok_inx), tok_text))
                tok_inx += 1
            ret_docs.append(Doc(doc_id, "", tokens))
    return ret_docs
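# The JSON corpus consumed above maps each document id to a list of 4-element token
# records; only positions 0 (sentence id) and 2 (token text) are read by this loader,
# the other two positions are ignored. Document id and values below are illustrative.
EXAMPLE_JSON_CORPUS = {
    'example_doc_1': [
        [0, 0, 'Police', True],
        [0, 1, 'arrested', True],
        [0, 2, 'a', True],
        [0, 3, 'suspect', True],
        [0, 4, '.', True],
    ],
}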
def read_data_from_corpus_folder(self, corpus):
    """Walk an ECB/ECB+ corpus folder, parse every XML file and build Doc objects."""
    documents = list()
    for (dirpath, folders, files) in walk(corpus):
        for file in files:
            is_ecb_plus = False
            if file.endswith('.xml'):
                print('processing file-', file)
                if 'ecbplus' in file:
                    is_ecb_plus = True
                tree = ElementTree.parse(join(dirpath, file))
                root = tree.getroot()
                doc_id = root.attrib['doc_name']
                tokens = list()
                doc_text = ''
                for elem in root:
                    if elem.tag == 'token':
                        sent_id = int(elem.attrib['sentence'])
                        tok_id = elem.attrib['number']
                        tok_text = elem.text
                        # ecbplus files number their first (metadata) sentence as 0; skip
                        # it and shift the remaining ids to match plain ecb numbering.
                        if is_ecb_plus and sent_id == 0:
                            continue
                        if is_ecb_plus:
                            sent_id = sent_id - 1
                        tokens.append(Token(sent_id, int(tok_id), tok_text))
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in ['.', ',', '?', '!', '\'re', '\'s',
                                          'n\'t', '\'ve', '\'m', '\'ll']:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text
                documents.append(Doc(doc_id, doc_text, tokens))
    return documents
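# A sketch of the XML layout the parser above assumes, reduced to the attributes it
# actually reads: 'doc_name' on the root element, and 'sentence' / 'number' plus the
# element text on each <token>. Tag names, ids and words here are illustrative; real
# ECB/ECB+ files carry additional attributes and annotation markup.
EXAMPLE_ECB_XML = """<Document doc_name="example_ecb.xml">
  <token sentence="0" number="0">Police</token>
  <token sentence="0" number="1">arrested</token>
  <token sentence="0" number="2">a</token>
  <token sentence="0" number="3">suspect</token>
  <token sentence="0" number="4">.</token>
</Document>"""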
def read_data_from_corpus_folder_old(self, corpus):
    """Older tweet-corpus reader that re-tokenizes the raw tweet text with spaCy."""
    nlp = spacy.load('en_core_web_sm')
    documents = list()
    data = pickle.load(open(corpus, 'rb'))
    pairs_counter = 0
    for rule, pairs in data[:2]:
        print('start reading {}'.format(rule))
        for topic in tqdm(pairs):
            for tweet in topic:
                doc_id = '{}_{}'.format(tweet[0], pairs_counter)
                text = tweet[1]
                tokens = list()
                doc_text = ''
                doc = nlp(text)
                for sent_id, sent in enumerate(doc.sents):
                    # TODO: maybe change the tok_id (raise only for valid tokens)
                    for token_id, token in enumerate(sent):
                        tok_text = str(token)
                        # ignore hashtag/mention markers and URL tokens
                        if tok_text in ['#', '@'] or self.is_url(tok_text):
                            continue
                        # remove a leading @ from mention tokens
                        if len(tok_text) > 1 and tok_text.startswith('@'):
                            tok_text = tok_text.replace('@', '', 1)
                        tokens.append(Token(sent_id, token_id, tok_text))
                        if doc_text == '':
                            doc_text = tok_text
                        elif tok_text in ['.', ',', '?', '!', '\'re', '\'s',
                                          'n\'t', '\'ve', '\'m', '\'ll']:
                            doc_text += tok_text
                        else:
                            doc_text += ' ' + tok_text
                documents.append(Doc(doc_id, doc_text, tokens))
                pairs_counter += 1
    return documents
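# self.is_url is referenced above but not defined in this section; the function below is
# a minimal sketch of such a check (name and behaviour assumed, not the repository's
# actual implementation).
def is_url_sketch(text):
    """Return True when a token looks like a URL (http(s), ftp or www prefix)."""
    lowered = text.lower()
    return lowered.startswith(('http://', 'https://', 'ftp://', 'www.'))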