def parse(self, doc_text, show_tok=True, show_doc=True): """Parse a raw text document. Args: doc_text (str) show_tok (bool, optional): Specifies whether to include token text in output. show_doc (bool, optional): Specifies whether to include document text in output. Returns: CoreNLPDoc: The annotated document. """ validate((doc_text, str), (show_tok, bool), (show_doc, bool)) doc_conll = self.to_conll(doc_text) parsed_doc = CoreNLPDoc() if show_doc: parsed_doc.doc_text = doc_text for sent_conll in self.bist_parser.predict_conll(doc_conll): parsed_sent = [] conj_governors = {"and": set(), "or": set()} for tok in sent_conll: gov_id = int(tok.pred_parent_id) rel = tok.pred_relation if tok.form != "*root*": if tok.form.lower() == "and": conj_governors["and"].add(gov_id) if tok.form.lower() == "or": conj_governors["or"].add(gov_id) if rel == "conj": if gov_id in conj_governors["and"]: rel += "_and" if gov_id in conj_governors["or"]: rel += "_or" parsed_tok = { "start": tok.misc, "len": len(tok.form), "pos": tok.pos, "ner": tok.feats, "lemma": tok.lemma, "gov": gov_id - 1, "rel": rel, } if show_tok: parsed_tok["text"] = tok.form parsed_sent.append(parsed_tok) if parsed_sent: parsed_doc.sentences.append(parsed_sent) return parsed_doc
def parse(self, doc_text, show_tok=True, show_doc=True): """Parse a raw text document. Args: doc_text (str) show_tok (bool, optional): Specifies whether to include token text in output. show_doc (bool, optional): Specifies whether to include document text in output. Returns: CoreNLPDoc: The annotated document. """ validate((doc_text, str), (show_tok, bool), (show_doc, bool)) doc_conll = self.to_conll(doc_text) parsed_doc = CoreNLPDoc() if show_doc: parsed_doc.doc_text = doc_text for sent_conll in self.bist_parser.predict_conll(doc_conll): parsed_sent = [] conj_governors = {'and': set(), 'or': set()} for tok in sent_conll: gov_id = int(tok.pred_parent_id) rel = tok.pred_relation if tok.form != '*root*': if tok.form.lower() == 'and': conj_governors['and'].add(gov_id) if tok.form.lower() == 'or': conj_governors['or'].add(gov_id) if rel == 'conj': if gov_id in conj_governors['and']: rel += '_and' if gov_id in conj_governors['or']: rel += '_or' parsed_tok = { 'start': tok.misc, 'len': len(tok.form), 'pos': tok.pos, 'ner': tok.feats, 'lemma': tok.lemma, 'gov': gov_id - 1, 'rel': rel } if show_tok: parsed_tok['text'] = tok.form parsed_sent.append(parsed_tok) if parsed_sent: parsed_doc.sentences.append(parsed_sent) return parsed_doc
def parse(self, doc_text, show_tok=True, show_doc=True): """Parse a raw text document. Args: doc_text (str) show_tok (bool, optional): Specifies whether to include token text in output. show_doc (bool, optional): Specifies whether to include document text in output. Returns: CoreNLPDoc: The annotated document. """ validate((doc_text, str), (show_tok, bool), (show_doc, bool)) doc_conll = self.to_conll(doc_text) parsed_doc = CoreNLPDoc() if show_doc: parsed_doc.doc_text = doc_text for sent_conll in self.bist_parser.predict_conll(doc_conll): parsed_sent = [] conj_governors = {'and': set(), 'or': set()} for tok in sent_conll: gov_id = int(tok.pred_parent_id) rel = tok.pred_relation if tok.form != '*root*': if tok.form.lower() == 'and': conj_governors['and'].add(gov_id) if tok.form.lower() == 'or': conj_governors['or'].add(gov_id) if rel == 'conj': if gov_id in conj_governors['and']: rel += '_and' if gov_id in conj_governors['or']: rel += '_or' parsed_tok = {'start': tok.misc, 'len': len(tok.form), 'pos': tok.pos, 'ner': tok.feats, 'lemma': tok.lemma, 'gov': gov_id - 1, 'rel': rel} if show_tok: parsed_tok['text'] = tok.form parsed_sent.append(parsed_tok) if parsed_sent: parsed_doc.sentences.append(parsed_sent) return parsed_doc
def process_batch(self, texts, output_dir=None, batch_id=0):
    """Parse a batch of texts through the underlying pipeline.

    Args:
        texts: Iterable of raw text documents to parse.
        output_dir (optional): Directory in which to write one pretty-printed
            JSON file per parsed document; no files are written when falsy.
        batch_id (int, optional): Prefix used in the output file names
            ("<batch_id>.<doc_index>.json").

    Returns:
        list: The parsed documents (raw spaCy docs when ``self.spacy_doc``
        is set, otherwise ``CoreNLPDoc`` conversions).
    """
    results = []
    for index, spacy_doc in enumerate(self.parser.pipe(texts)):
        if self.spacy_doc:
            parsed = spacy_doc
        else:
            parsed = CoreNLPDoc.from_spacy(
                spacy_doc, self.show_tok, self.show_doc, self.ptb_pos
            )
        results.append(parsed)
        if output_dir:
            target = Path(output_dir) / ("{}.{}.json".format(batch_id, index))
            with open(target, "w", encoding="utf8") as handle:
                handle.write(parsed.pretty_json())
    return results