def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m"
        )

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e


def update_documents_in_es(documents: List[dict]):
    """
    Updates the documents inside Elasticsearch, either with the MLP results or the error messages.

    :param documents: Full Elasticsearch documents.
    """
    ed = ElasticDocument(index=None)
    ed.bulk_update(actions=documents)


def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task

    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply the analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=fields,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=scroll_timeout
        )

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(
            generator=searcher,
            worker=worker_object,
            detect_lang=detect_lang,
            snowball_language=snowball_language,
            fields_to_parse=fields,
            analyzers=analyzers,
            tokenizer=tokenizer,
            strip_html=strip_html
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()

        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e


def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task

    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size
        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)

        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e


def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str], fields: List[str], query: dict, es_timeout: int, bulk_size: int, fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!")
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)

    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Retrieve the fields to process.
        field_data = fields

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size
        )

        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()

        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)


def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task

    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field
        scroll_size = 100

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[field],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="15m"
        )

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher, field=field, worker_id=apply_worker_id, mlp_class=mlp)

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e


def apply_summarizer_on_index(self, summarizer_id: int):
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task

    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm
        scroll_size = 100

        searcher = ElasticSearcher(
            query=json.loads(summarizer_object.query),
            indices=indices,
            field_data=field_data,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="30m"
        )

        actions = process_actions(searcher, field_data, ratio_data, algorithm=algorithm_data, summarizer_class=sumy, summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)

        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e