def _flatten_document(self, document):
    """Flatten the nested ``_source`` of an Elasticsearch document in place.

    The annotator metadata stored under ``TEXTA_ANNOTATOR_KEY`` is detached
    before flattening so its nested structure survives intact, then attached
    back onto the flattened source.

    :param document: Raw Elasticsearch hit containing a ``_source`` dict.
    :return: The same document object with its ``_source`` flattened.
    """
    core = ElasticCore()
    source = document.get("_source")
    # Skip the annotator meta when flattening and then attach it back.
    annotator_meta = source.pop(TEXTA_ANNOTATOR_KEY)
    flattened = core.flatten(source)
    flattened[TEXTA_ANNOTATOR_KEY] = annotator_meta
    document["_source"] = flattened
    return document
def update_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, fact_value: str, tagger_object: BertTaggerObject, tagger: BertTagger = None):
    """Apply a loaded BERT tagger to every scrolled document and yield bulk-update actions.

    For each document, each configured field is tagged; the resulting facts are
    appended to the document's existing ``texta_facts`` (deduplicated) and an
    Elasticsearch ``update`` action is yielded for bulk indexing.

    :param generator: Scroller yielding batches of raw Elasticsearch hits.
    :param ec: ElasticCore used to flatten nested sources for field lookup.
    :param fields: Dot-notation field paths whose text should be tagged.
    :param fact_name: Name to assign to the produced facts.
    :param fact_value: User-specified fact value; empty string means "derive it".
    :param tagger_object: Django model object wrapping the BERT tagger.
    :param tagger: The loaded tagger instance applied to the text.
    """
    info_logger = logging.getLogger(INFO_LOGGER)  # hoisted: logger lookup is loop-invariant
    for i, scroll_batch in enumerate(generator):
        # Fixed typo in the log message ("Appyling" -> "Applying").
        info_logger.info(f"Applying BERT Tagger with ID {tagger_object.id} to batch {i + 1}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])
            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    result = tagger_object.apply_loaded_tagger(tagger, text, input_type="text", feedback=False)
                    # Choose the fact value per prediction in a LOCAL variable.
                    # The original reassigned the `fact_value` parameter, so a
                    # value derived for one prediction (e.g. a multitag result)
                    # leaked into every subsequent document and field.
                    if result["result"] in ["true", "false"]:
                        # Binary tagger: fall back to the tagger description
                        # unless the user specified an explicit fact value.
                        value = fact_value if fact_value else tagger_object.description
                    else:
                        # Multitag: the prediction itself is the fact value.
                        value = result["result"]
                    new_facts = to_texta_facts(result, field, fact_name, value)
                    existing_facts.extend(new_facts)
            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)
            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }
def update_search_query_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, fact_value: str, tagger_object: SearchQueryTagger):
    """Attach search-query-tagger facts to every scrolled document and yield bulk-update actions.

    Every matching document gets a fact named ``fact_name`` for each configured
    field; facts are merged with the existing ``texta_facts`` (deduplicated)
    and yielded as Elasticsearch ``update`` actions with conflict retries.

    :param generator: Scroller yielding batches of raw Elasticsearch hits.
    :param ec: ElasticCore used to flatten nested sources for field lookup.
    :param fields: Dot-notation field paths to tag.
    :param fact_name: Name to assign to the produced facts.
    :param fact_value: User-specified fact value; empty string means "derive it".
    :param tagger_object: The SearchQueryTagger model object being applied.
    """
    info_logger = logging.getLogger(INFO_LOGGER)  # hoisted: logger lookup is loop-invariant
    for i, scroll_batch in enumerate(generator):
        # Fixed typo in the log message ("Appyling" -> "Applying").
        info_logger.info(f"Applying Search Query Tagger with ID {tagger_object.id}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])
            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    result = {
                        'tagger_id': tagger_object.id,
                        'result': tagger_object.fact_name
                    }
                    # Choose the fact value in a LOCAL variable: the original
                    # reassigned the `fact_value` parameter, so a derived value
                    # leaked into every subsequent document and field.
                    if result["result"]:
                        value = fact_value if fact_value else tagger_object.description
                    else:
                        value = result["result"]
                    new_facts = to_texta_facts(field, fact_name, value)
                    existing_facts.extend(new_facts)
            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)
            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}},
                # Retry the partial update on version conflicts instead of failing the bulk.
                "retry_on_conflict": 3
            }
def update_search_fields_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, search_field_tagger_object: SearchFieldsTagger, use_breakup: bool, breakup_character: str):
    """Turn the content of the configured fields into facts and yield bulk-update actions.

    Each field's content is optionally broken up on ``breakup_character``;
    every resulting piece becomes the value of a fact named ``fact_name``.
    Facts are merged with the document's existing ``texta_facts``
    (deduplicated) and yielded as Elasticsearch ``update`` actions.

    :param generator: Scroller yielding batches of raw Elasticsearch hits.
    :param ec: ElasticCore used to flatten nested sources for field lookup.
    :param fields: Dot-notation field paths whose content becomes fact values.
    :param fact_name: Name to assign to the produced facts.
    :param search_field_tagger_object: The SearchFieldsTagger model object being applied.
    :param use_breakup: Whether to split field content before creating facts.
    :param breakup_character: Separator used when splitting field content.
    """
    for batch_index, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Applying Search Fields Tagger with ID {search_field_tagger_object.id}..."
        )
        for raw_doc in scroll_batch:
            source = raw_doc["_source"]
            flattened = ec.flatten(source)
            facts = source.get("texta_facts", [])
            for field_path in fields:
                raw_content = flattened.get(field_path, None)
                # Optionally split the content into pieces; each piece becomes a fact value.
                for piece in handle_field_content(raw_content, breakup_character, use_breakup):
                    facts.extend(to_texta_facts(field_path, fact_name, fact_value=piece))
            if facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                facts = ElasticDocument.remove_duplicate_facts(facts)
            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": facts}}
            }
def update_generator(keyword_detector: RakunDetectorWrapper, generator: ElasticSearcher, ec: ElasticCore, fields: List[str], rakun_extractor_object: RakunExtractor, fact_name: str, fact_value: str, add_spans: bool):
    """Extract Rakun keywords from the configured fields and yield bulk-update actions.

    For each document, every string field in ``fields`` is run through the
    Rakun keyword detector; the resulting facts are merged with the document's
    existing ``texta_facts`` (deduplicated) and yielded as Elasticsearch
    ``update`` actions.

    :param keyword_detector: The loaded Rakun detector wrapper.
    :param generator: Scroller yielding batches of raw Elasticsearch hits.
    :param ec: ElasticCore used to flatten nested sources for field lookup.
    :param fields: Dot-notation field paths whose text is processed.
    :param rakun_extractor_object: Model object that runs keyword extraction.
    :param fact_name: Name to assign to the produced facts.
    :param fact_value: Fact value forwarded to the extractor.
    :param add_spans: Whether character spans are attached to the facts.
    """
    for scroll_batch in generator:
        for raw_doc in scroll_batch:
            source = raw_doc["_source"]
            flattened = ec.flatten(source)
            facts = source.get("texta_facts", [])
            for field_path in fields:
                text = flattened.get(field_path, None)
                # Only non-empty string content can be fed to the keyword detector.
                if not (text and isinstance(text, str)):
                    continue
                keyword_facts = rakun_extractor_object.get_rakun_keywords(
                    keyword_detector=keyword_detector,
                    texts=[text],
                    field_path=field_path,
                    fact_name=fact_name,
                    fact_value=fact_value,
                    add_spans=add_spans
                )
                facts.extend(keyword_facts)
            if facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                facts = ElasticDocument.remove_duplicate_facts(facts)
            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": facts}}
            }
def update_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, fact_value: str, max_tags: int, object_id: int, object_type: str, tagger_object: Union[Tagger, TaggerGroup], object_args: Dict, tagger: TextTagger = None):
    """Apply a Tagger or TaggerGroup to every scrolled document and yield bulk-update actions.

    When ``object_type`` is ``"tagger"`` a single loaded tagger is applied to
    the text; otherwise the TaggerGroup pipeline runs (MLP preprocessing, tag
    candidate retrieval, then the group taggers) and at most ``max_tags`` group
    tags are kept on top of the NER tags. Resulting facts are merged with the
    document's existing ``texta_facts`` (deduplicated) and yielded as
    Elasticsearch ``update`` actions.

    :param generator: Scroller yielding batches of raw Elasticsearch hits.
    :param ec: ElasticCore used to flatten nested sources for field lookup.
    :param fields: Dot-notation field paths whose text should be tagged.
    :param fact_name: Name to assign to the produced facts.
    :param fact_value: Fact value forwarded to ``to_texta_fact``.
    :param max_tags: Maximum number of tagger-group tags kept per text.
    :param object_id: ID of the Tagger/TaggerGroup (used for logging and lookups).
    :param object_type: Either ``"tagger"`` or a tagger-group identifier.
    :param tagger_object: The Tagger or TaggerGroup model object.
    :param object_args: Extra options (``lemmatize``, ``use_ner``,
        ``n_similar_docs``, ``n_candidate_tags``) for the group pipeline.
    :param tagger: The loaded tagger instance (single-tagger mode only).
    """
    info_logger = logging.getLogger(INFO_LOGGER)  # hoisted: logger lookup is loop-invariant
    for i, scroll_batch in enumerate(generator):
        # Fixed typo in the log message ("Appyling" -> "Applying").
        info_logger.info(f"Applying {object_type} with ID {object_id} to batch {i + 1}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])
            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    if object_type == "tagger":
                        result = tagger_object.apply_loaded_tagger(tagger, text, input_type="text", feedback=None)
                        tags = [result] if result else []
                    else:
                        # Update text and tags with MLP (lemmatization / NER).
                        combined_texts, ner_tags = get_mlp(object_id, [text], lemmatize=object_args["lemmatize"], use_ner=object_args["use_ner"])
                        # Retrieve tag candidates, ignoring tags NER already found.
                        tag_candidates = get_tag_candidates(object_id, [text], ignore_tags=ner_tags, n_similar_docs=object_args["n_similar_docs"], max_candidates=object_args["n_candidate_tags"])
                        # Get tags (sorted by probability in descending order).
                        tagger_group_tags = apply_tagger_group(object_id, text, tag_candidates, request=None, input_type='text', lemmatize=object_args["lemmatize"], feedback=False, use_async=False)
                        # Take only `max_tags` first tags.
                        tags = ner_tags + tagger_group_tags[:max_tags]
                    new_facts = to_texta_fact(tags, field, fact_name, fact_value)
                    if new_facts:
                        existing_facts.extend(new_facts)
            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)
            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }