Example #1
def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step(
            'Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m")

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
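The task itself only receives the primary key of a DeleteFactsByQueryTask row, so the caller creates the model instance first and then queues the task by id. A minimal dispatch sketch, assuming the function above is registered as a bound Celery task (hence the self parameter) and using the same model fields the task reads (query, facts, scroll_size, es_timeout); the fact dictionary contents are illustrative only:

# Hypothetical call site, not part of the example above.
worker = DeleteFactsByQueryTask.objects.create(
    query=json.dumps({"query": {"match_all": {}}}),
    facts=json.dumps([{"fact": "PER", "str_val": "John Smith"}]),  # assumed fact format
    scroll_size=500,
    es_timeout=10,
)
# Indices are resolved through worker.get_indices(), so they are attached to the
# model separately (e.g. via its index relation) before the task is queued.
fact_delete_query_task.apply_async(args=(worker.pk,))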
Example #2
def update_documents_in_es(documents: List[dict]):
    """
    Updates the documents inside Elasticsearch, either with the MLP results or the
    error messages.

    :param documents: Full Elasticsearch documents.
    """
    ed = ElasticDocument(index=None)
    ed.bulk_update(actions=documents)
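ElasticDocument.bulk_update itself is not shown, so the exact action format is an assumption; a plausible shape for the documents argument, modelled on the partial-update actions used by elasticsearch-py's bulk helpers, would be:

# Illustrative only: one partial-update action per document.
documents = [
    {
        "_index": "my_index",                      # assumed: index the document lives in
        "_id": "doc-1",                            # assumed: Elasticsearch document id
        "doc": {"texta_mlp": {"language": "en"}},  # assumed: fields to merge into _source
    },
]
update_documents_in_es(documents)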
Example #3
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=fields,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout=scroll_timeout)

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(generator=searcher,
                                           worker=worker_object,
                                           detect_lang=detect_lang,
                                           snowball_language=snowball_language,
                                           fields_to_parse=fields,
                                           analyzers=analyzers,
                                           tokenizer=tokenizer,
                                           strip_html=strip_html)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()

        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
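process_analyzer_actions is only referenced here, but every example follows the same pattern: iterate over the raw hits coming out of ElasticSearcher and yield one partial-update action per document, which bulk_update then ships in chunks. A rough sketch of that generator shape, assuming OUT_RAW yields hit dicts with _index, _id and _source, and with a hypothetical apply_analyzer() standing in for the real analyzer logic:

def analyzer_actions_sketch(generator, fields_to_parse):
    # Illustrative generator only; the real process_analyzer_actions applies the
    # configured analyzers, language detection and stemming.
    for hit in generator:
        source = hit["_source"]
        doc = {field: apply_analyzer(source.get(field, "")) for field in fields_to_parse}
        yield {
            "_op_type": "update",   # assumed: elasticsearch-py bulk helper convention
            "_index": hit["_index"],
            "_id": hit["_id"],
            "doc": doc,
        }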
Example #4
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
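add_texta_facts_mapping and the fact_name parameter both revolve around the texta_facts field these taggers write to. The entries produced for that field commonly look like the dictionary below; treat the exact keys as an assumption, since update_search_fields_generator is not shown here:

# Assumed shape of a single entry in a document's texta_facts array.
fact = {
    "fact": "FIELD_TAG",      # the fact_name configured on the tagger
    "str_val": "some value",  # the tagged value, e.g. one piece of a field split on breakup_character
    "doc_path": "text",       # the field the value came from
    "spans": "[[0, 0]]",      # character offsets, often left empty for field-level facts
}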
Example #5
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # retrieve fields
        field_data = fields

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size)
        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
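Unlike the other tasks, this one receives its whole configuration as explicit arguments rather than reading it from the model, so the caller passes the search parameters along with the object id. A dispatch sketch under the same assumption that the function is registered as a bound Celery task; the values simply mirror the parameters in the signature above:

# Hypothetical call site for the task above.
apply_rakun_extractor_to_index.apply_async(kwargs={
    "object_id": rakun.pk,
    "indices": ["my_index"],
    "fields": ["text"],
    "query": {"query": {"match_all": {}}},
    "es_timeout": 10,
    "bulk_size": 100,
    "fact_name": "RAKUN_KEYWORD",
    "add_spans": True,
})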
Example #6
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=[field],
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="15m")

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher,
                                       field=field,
                                       worker_id=apply_worker_id,
                                       mlp_class=mlp)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
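load_mlp() and the module-level mlp handed to process_lang_actions are not shown. A common pattern, and an assumption about how this module does it, is to initialise the heavy MLP instance lazily into a module-level global so each Celery worker loads it only once; the constructor arguments below are placeholders:

# Sketch of a lazy, per-worker loader; MLP stands for the texta-mlp class the
# module presumably imports, and its arguments here are assumptions.
mlp = None

def load_mlp():
    global mlp
    if mlp is None:
        mlp = MLP(language_codes=["en", "et"], default_language_code="en")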
Example #7
def apply_summarizer_on_index(self, summarizer_id: int):
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task
    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(summarizer_object.query),
                                   indices=indices,
                                   field_data=field_data,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="30m")

        actions = process_actions(searcher,
                                  field_data,
                                  ratio_data,
                                  algorithm=algorithm_data,
                                  summarizer_class=sumy,
                                  summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e