Example #1
def _flatten_document(self, document):
    ec = ElasticCore()
    source = document.get("_source")
    # Skip the annotator meta when flattening and then attach it back.
    annotator_meta = source.pop(TEXTA_ANNOTATOR_KEY)
    flattened_source = ec.flatten(source)
    flattened_source[TEXTA_ANNOTATOR_KEY] = annotator_meta
    document["_source"] = flattened_source
    return document
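
The excerpt above leans on ec.flatten to collapse a nested _source into dot-notation keys while the annotator metadata rides along untouched. A minimal sketch of such a flattener, assuming dot-separated key paths (flatten_source and the sample document are illustrative, not part of the original codebase):

from typing import Any, Dict

def flatten_source(source: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
    # Recursively collapse nested dicts into dot-notation keys,
    # e.g. {"comment": {"text": "hi"}} -> {"comment.text": "hi"}.
    flat = {}
    for key, value in source.items():
        path = f"{prefix}.{key}" if prefix else key
        if isinstance(value, dict):
            flat.update(flatten_source(value, path))
        else:
            flat[path] = value
    return flat

print(flatten_source({"comment": {"text": "hi"}, "author": {"name": "Jo"}}))
# {'comment.text': 'hi', 'author.name': 'Jo'}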
Example #2
def update_generator(generator: ElasticSearcher,
                     ec: ElasticCore,
                     fields: List[str],
                     fact_name: str,
                     fact_value: str,
                     tagger_object: BertTaggerObject,
                     tagger: BertTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Appyling BERT Tagger with ID {tagger_object.id} to batch {i + 1}..."
        )
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):

                    result = tagger_object.apply_loaded_tagger(
                        tagger, text, input_type="text", feedback=False)

                    # If tagger is binary and fact value is not specified by the user, use tagger description as fact value
                    if result["result"] in ["true", "false"]:
                        if not fact_value:
                            fact_value = tagger_object.description

                    # For multitag, use the prediction as fact value
                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(result, field, fact_name,
                                               fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
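
This generator (and the ones that follow) yields ready-made bulk update actions. A minimal sketch of draining such a generator with the official elasticsearch-py helpers; the es client and connection details are assumptions, not part of the original:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def push_updates(es: Elasticsearch, actions):
    # bulk() consumes the generator lazily, groups the actions into
    # batched _bulk requests, and returns (success_count, errors).
    # _op_type="update" in each action makes these partial-document
    # updates rather than full re-indexing.
    return bulk(es, actions, chunk_size=100, raise_on_error=False)

es = Elasticsearch("http://localhost:9200")  # assumed connection details
# success_count, errors = push_updates(es, update_generator(...))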
Example #3
def update_search_query_generator(generator: ElasticSearcher, ec: ElasticCore,
                                  fields: List[str], fact_name: str,
                                  fact_value: str,
                                  tagger_object: SearchQueryTagger):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Appyling Search Query Tagger with ID {tagger_object.id}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):

                    # A search query tagger runs no model; its "result" is
                    # simply the configured fact name.
                    result = {
                        'tagger_id': tagger_object.id,
                        'result': tagger_object.fact_name
                    }

                    # Fall back to the tagger description when the user did
                    # not specify a fact value.
                    if result["result"]:
                        if not fact_value:
                            fact_value = tagger_object.description
                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(field, fact_name, fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                },
                "retry_on_conflict": 3
            }
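
Unlike the other examples, this generator sets retry_on_conflict on each action: an update is a get-then-reindex inside Elasticsearch, so a concurrent writer can bump the document version in between and trigger a version conflict, and retry_on_conflict tells Elasticsearch to re-fetch and re-apply the partial document up to that many times. The same parameter on a single elasticsearch-py 7.x update call, with made-up index and id values:

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed connection details

# Retry the partial update up to 3 times if another writer wins the race.
es.update(
    index="my-index",  # hypothetical index name
    id="doc-1",        # hypothetical document id
    body={"doc": {"texta_facts": []}},
    retry_on_conflict=3,
)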
Example #4
def update_search_fields_generator(
        generator: ElasticSearcher, ec: ElasticCore, fields: List[str],
        fact_name: str, search_field_tagger_object: SearchFieldsTagger,
        use_breakup: bool, breakup_character: str):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Applying Search Fields Tagger with ID {search_field_tagger_object.id}..."
        )
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                field_content = flat_hit.get(field, None)
                processed_content = handle_field_content(
                    field_content, breakup_character, use_breakup)

                for content in processed_content:
                    new_facts = to_texta_facts(field,
                                               fact_name,
                                               fact_value=content)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
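
handle_field_content is not shown in the original. A plausible sketch, assuming it normalizes the field content to a list of non-empty strings and, when use_breakup is set, splits each string on the breakup character so every piece becomes its own fact (the body below is an illustration, not the project's implementation):

from typing import Any, List

def handle_field_content(field_content: Any, breakup_character: str,
                         use_breakup: bool) -> List[str]:
    # Normalize scalars and lists alike to a list of non-empty strings.
    if field_content is None:
        return []
    values = field_content if isinstance(field_content, list) else [field_content]
    texts = [str(value) for value in values if value]
    # Optionally break long fields apart so each piece is tagged separately.
    if use_breakup:
        texts = [part.strip() for text in texts
                 for part in text.split(breakup_character) if part.strip()]
    return texts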
Example #5
def update_generator(keyword_detector: RakunDetectorWrapper,
                     generator: ElasticSearcher, ec: ElasticCore,
                     fields: List[str], rakun_extractor_object: RakunExtractor,
                     fact_name: str, fact_value: str, add_spans: bool):
    for scroll_batch in generator:
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    results = rakun_extractor_object.get_rakun_keywords(
                        keyword_detector=keyword_detector,
                        texts=[text],
                        field_path=field,
                        fact_name=fact_name,
                        fact_value=fact_value,
                        add_spans=add_spans)
                    existing_facts.extend(results)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
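
All of these generators ultimately append entries to the document's texta_facts array. The helper signatures differ between the modules above, but the fact entries share one shape; a hedged sketch of building one, assuming the usual TEXTA fact fields (fact, str_val, doc_path, spans):

from typing import Dict

def make_texta_fact(doc_path: str, fact_name: str, fact_value: str) -> Dict:
    # One fact: the field it came from, its name, its string value, and
    # character spans ("[[0, 0]]" when no concrete span is known).
    return {
        "doc_path": doc_path,
        "fact": fact_name,
        "str_val": fact_value,
        "spans": "[[0, 0]]",
    }

print(make_texta_fact("comment.text", "TOPIC", "sports"))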
Example #6
def update_generator(generator: ElasticSearcher,
                     ec: ElasticCore,
                     fields: List[str],
                     fact_name: str,
                     fact_value: str,
                     max_tags: int,
                     object_id: int,
                     object_type: str,
                     tagger_object: Union[Tagger, TaggerGroup],
                     object_args: Dict,
                     tagger: TextTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Appyling {object_type} with ID {object_id} to batch {i + 1}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    if object_type == "tagger":
                        result = tagger_object.apply_loaded_tagger(
                            tagger, text, input_type="text", feedback=None)
                        if result:
                            tags = [result]
                        else:
                            tags = []
                    else:
                        # update text and tags with MLP
                        combined_texts, ner_tags = get_mlp(
                            object_id, [text],
                            lemmatize=object_args["lemmatize"],
                            use_ner=object_args["use_ner"])
                        # retrieve tag candidates
                        tag_candidates = get_tag_candidates(
                            object_id, [text],
                            ignore_tags=ner_tags,
                            n_similar_docs=object_args["n_similar_docs"],
                            max_candidates=object_args["n_candidate_tags"])
                        # get tags (sorted by probability in descending order)
                        tagger_group_tags = apply_tagger_group(
                            object_id,
                            text,
                            tag_candidates,
                            request=None,
                            input_type='text',
                            lemmatize=object_args["lemmatize"],
                            feedback=False,
                            use_async=False)
                        # take only `max_tags` first tags
                        tags = ner_tags + tagger_group_tags[:max_tags]

                    new_facts = to_texta_fact(tags, field, fact_name,
                                              fact_value)
                    if new_facts:
                        existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
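
Every generator deduplicates with ElasticDocument.remove_duplicate_facts before yielding, so re-running a tagger over the same index does not stack identical facts. A minimal order-preserving sketch of such a deduplication (the real implementation is not shown here):

import json
from typing import Dict, List

def remove_duplicate_facts(facts: List[Dict]) -> List[Dict]:
    # Serialize each fact with sorted keys so equal dicts compare equal,
    # then keep only the first occurrence of each.
    seen = set()
    unique = []
    for fact in facts:
        key = json.dumps(fact, sort_keys=True, ensure_ascii=False)
        if key not in seen:
            seen.add(key)
            unique.append(fact)
    return unique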