def predict_doc(self, text, raw_entities, need_entities, need_relations):
    """Run entity and/or relation extraction over raw text.

    :param text: document text to segment and classify
    :param raw_entities: list of {"id","start","end","type"} dicts, or None
        to let the server's own entity classifier predict them
    :param need_entities: whether entities must appear in the result
    :param need_relations: whether relations must appear in the result
    :return: (raw_entities, raw_relations) where:
        raw_entities is list of {"id","start","end","type"} dicts or None
        raw_relations is list of {"first","second","type"} dicts or None
    :raises BadRequest: when the request needs a classifier this server
        was not loaded with
    """
    # Fail fast when the request requires a model that is unavailable.
    if self.ent_clf is None and raw_entities is None and (need_entities or need_relations):
        raise BadRequest("Server doesn't support entities recognition")
    if self.rel_clf is None and need_relations:
        raise BadRequest("Server doesn't support relation extraction")

    tokens, sentences, raw_tokens = self.segmenter.segment(text)
    doc = self.transformer.transform(
        Document("_", tokens, sentences, [Paragraph(0, len(sentences))]))

    entities = None
    if raw_entities is None:
        # No entities supplied: predict them whenever anything downstream
        # (the response itself, or relation extraction) needs them.
        if need_entities or need_relations:
            entities = self.ent_clf.predict_doc(doc)
            if need_entities:
                raw_entities = self._to_raw_entities(entities, raw_tokens)
    elif need_relations:
        # Caller supplied entities: align them onto our tokenization only
        # when relation extraction actually consumes them.
        entities = align_raw_entities(raw_entities, raw_tokens)
        if not need_entities:
            raw_entities = None

    raw_relations = None
    if need_relations:
        relations = self.rel_clf.predict_doc(doc.with_entities(entities))
        raw_relations = self._to_raw_relations(relations)

    return raw_entities, raw_relations
def make_document_from_json_file(file_path):
    """Build a Document from a JSON annotation file.

    The JSON dict may contain "tokens", "sentences", "paragraphs",
    "entities", "relations", plus a number of optional per-token feature
    arrays; missing collections default to empty.

    :param file_path: path to the JSON file to load
    :return: Document, with entities/relations attached only when the
        corresponding keys are present in the file
    """
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])
    entities = d.get('entities', [])
    sentences = d.get('sentences', [])
    paragraphs = d.get('paragraphs', [])
    relations = d.get('relations', [])

    # Copy over only the per-token feature arrays actually present.
    token_features = {
        feature: d[feature]
        for feature in (
            'pos', 'entities_types', 'entities_depths', 'borders',
            'dt_labels', 'dt_head_distances', 'dt_depths',
            'dt_deltas_forward', 'dt_deltas_backward',
            'dt_breakups_forward', 'dt_breakups_backward',
        )
        if feature in d
    }

    doc_entities = [Entity(id_, start_token, end_token, ent_type)
                    for id_, start_token, end_token, ent_type in entities]
    doc_sentences = [Sentence(start_token, end_token)
                     for start_token, end_token in sentences]
    doc_paragraphs = [Paragraph(start_sentence, end_sentence)
                      for start_sentence, end_sentence in paragraphs]

    # Index entities by id once instead of re-scanning the whole entity
    # list for every relation: O(E + R) instead of O(E * R).
    # setdefault keeps the first occurrence on duplicate ids, matching
    # the original linear scan's behavior.
    entity_by_id = {}
    for entity in doc_entities:
        entity_by_id.setdefault(entity.id, entity)

    doc_relations = []
    for rel in relations:
        e1_id, e2_id, rel_type = tuple(rel)
        # .get() preserves the original behavior of passing None for an
        # unknown entity id.
        doc_relations.append(
            Relation(entity_by_id.get(e1_id), entity_by_id.get(e2_id), rel_type))

    doc = Document("", tokens, doc_sentences, doc_paragraphs,
                   token_features=token_features)

    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)

    return doc