def get_data_df(self):
    """Build a DataFrame with one row per (predicate, agent, patient) triple.

    Token/sentence indices are offset by ``self.first_i`` so they refer to
    positions in the full document rather than this chunk.  Each row carries
    the predicate's lemma/text/negation plus indicator columns (``L_*``) for
    its lexical categories and the head text of its agent/patient spans.

    Returns:
        pandas.DataFrame sorted by token index ``i``; ``L_*`` columns are
        1.0 where the category applies and 0 elsewhere.  When the document
        has no predicates, an empty frame with the fixed (non-``L_``)
        columns is returned instead of raising.
    """
    doc = self.doc
    i_0 = self.first_i
    predicates = doc._.lex_matches
    n = len(predicates)
    t = Timer()
    t.start()
    # Cartesian product over agents x patients; the `or [None]` fallback
    # keeps predicates with no agent and/or no patient in the output.
    data = [{
        'i': i_0 + tok.i,
        'sent_i': i_0 + tok.sent.start,
        't': i_0 + tok._.subsent_root.i,
        'neg': tok._.negated,
        'lemma': tok.lemma_,
        'text': tok.text,
        'R_agent': agent.root.text if agent else None,
        'R_patient': patient.root.text if patient else None,
        # One 1.0-valued indicator column per lexical category of this token.
        **{('L_' + doc.vocab[cat].text): 1.0 for cat in tok._.lex},
    }
        for tok in predicates
        for agent in (tok._.agents or [None])
        for patient in (tok._.patients or [None])]
    table = pd.DataFrame(data)
    if table.empty:
        # No predicates: pd.DataFrame([]) has no columns at all, so the
        # sort_values('i') below would raise KeyError.  Return an empty
        # frame with the fixed schema so callers can rely on the columns.
        table = pd.DataFrame(columns=['i', 'sent_i', 't', 'neg', 'lemma',
                                      'text', 'R_agent', 'R_patient'])
    else:
        table = table.sort_values('i')
        # L_* columns are sparse across rows; absent entries become 0.
        predicate_cols = [c for c in table.columns if c.startswith('L_')]
        table[predicate_cols] = table[predicate_cols].fillna(0)
    t.stop()
    logger.debug('%d predicates (%d distinct) [%s]', len(table.index), n, t)
    return table
def get_entities_df(self):
    """Classify the entities of ``self.doc`` and return them as a DataFrame.

    Builds an entity classifier bound to the document's vocab via
    ``proc_ent.entity_classifier`` and tabulates its output, logging the
    row count and elapsed time at debug level.
    """
    timer = Timer()
    timer.start()
    classify = proc_ent.entity_classifier(self.doc.vocab)
    entities = pd.DataFrame(classify(self.doc))
    timer.stop()
    logger.debug('%d entities [%s]', len(entities.index), timer)
    return entities