# Example #1
    def predict_doc(self, text, raw_entities, need_entities, need_relations):
        """
        Run entity recognition and/or relation extraction over a raw text.

        :param text: raw document text to segment and process
        :param raw_entities: list of {"id","start","end","type"} dicts
            supplied by the caller, or None to let the server detect entities
        :param need_entities: whether entities should be in the response
        :param need_relations: whether relations should be in the response
        :return: (raw_entities, raw_relations) where:
          raw_entities is list of {"id","start","end","type"} dicts or None
          raw_relations is list of {"first","second","type"} dicts or None
        :raises BadRequest: when a required model is not available
        """
        entities_required = need_entities or need_relations

        # Entities must come either from the caller or from a loaded model.
        if self.ent_clf is None and raw_entities is None and entities_required:
            raise BadRequest("Server doesn't support entities recognition")

        if self.rel_clf is None and need_relations:
            raise BadRequest("Server doesn't support relation extraction")

        tokens, sentences, raw_tokens = self.segmenter.segment(text)
        doc = self.transformer.transform(
            Document("_", tokens, sentences, [Paragraph(0, len(sentences))]))

        entities = None
        if raw_entities is None:
            # No caller-provided entities: predict them if anything needs them.
            if entities_required:
                entities = self.ent_clf.predict_doc(doc)
            raw_entities = (self._to_raw_entities(entities, raw_tokens)
                            if need_entities else None)
        else:
            # Caller-provided entities: align them to tokens only when the
            # relation extractor will consume them.
            if need_relations:
                entities = align_raw_entities(raw_entities, raw_tokens)
            if not need_entities:
                raw_entities = None

        raw_relations = None
        if need_relations:
            relations = self.rel_clf.predict_doc(doc.with_entities(entities))
            raw_relations = self._to_raw_relations(relations)

        return raw_entities, raw_relations
# Example #2
def make_document_from_json_file(file_path):
    """
    Build a Document from a JSON file.

    Expected (all optional) keys in the JSON dict:
      - "tokens": list of tokens
      - "entities": list of [id, start_token, end_token, type]
      - "sentences": list of [start_token, end_token]
      - "paragraphs": list of [start_sentence, end_sentence]
      - "relations": list of [first_entity_id, second_entity_id, type]
      - per-token feature lists ("pos", "dt_labels", ...) copied verbatim
        into ``token_features``

    :param file_path: path to the JSON file
    :return: Document; entities/relations are attached only when the
        corresponding keys are present in the JSON dict
    """
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])

    # Copy only the per-token feature lists that are actually present.
    token_features = {
        feature: d[feature]
        for feature in (
            'pos', 'entities_types', 'entities_depths', 'borders', 'dt_labels',
            'dt_head_distances', 'dt_depths', 'dt_deltas_forward',
            'dt_deltas_backward', 'dt_breakups_forward', 'dt_breakups_backward'
        )
        if feature in d
    }

    doc_entities = [
        Entity(id_, start_token, end_token, ent_type)
        for id_, start_token, end_token, ent_type in d.get('entities', [])
    ]

    doc_sentences = [
        Sentence(start_token, end_token)
        for start_token, end_token in d.get('sentences', [])
    ]

    doc_paragraphs = [
        Paragraph(start_sentence, end_sentence)
        for start_sentence, end_sentence in d.get('paragraphs', [])
    ]

    # Index entities by id once instead of rescanning the entity list for
    # every relation (was O(entities * relations)).
    entity_by_id = {entity.id: entity for entity in doc_entities}
    doc_relations = [
        # .get preserves the original behavior: an unknown entity id yields
        # a None endpoint rather than raising.
        Relation(entity_by_id.get(e1_id), entity_by_id.get(e2_id), rel_type)
        for e1_id, e2_id, rel_type in d.get('relations', [])
    ]

    doc = Document("",
                   tokens,
                   doc_sentences,
                   doc_paragraphs,
                   token_features=token_features)
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)
    return doc