Пример #1
0
 def add_facts_mapping(self, request, pk=None, project_pk=None):
     """Add the Texta Facts mapping to the index identified by ``pk``.

     Returns a confirmation message when the index is open. When the index
     is closed, nothing is changed and an HTTP 400 response with an
     explanatory message is returned instead.
     """
     core = ElasticCore()
     target = Index.objects.get(pk=pk)

     # Guard clause: a closed index is rejected before touching the mapping.
     if not target.is_open:
         payload = {
             "message": f"Index {target.name} is closed, could not add the mapping!"
         }
         return Response(payload, status=status.HTTP_400_BAD_REQUEST)

     core.add_texta_facts_mapping(target.name)
     return Response(
         {"message": f"Added the Texta Facts mapping for: {target.name}"}
     )
Пример #2
0
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index.

    Scrolls the documents matching ``query`` in ``indices``, feeds them through
    ``update_generator`` together with the loaded tagger and streams the
    resulting actions back to Elasticsearch in bulk.

    :param object_id: Primary key of the TorchTaggerObject to apply.
    :param indices: Names of the indices to process; the Texta Facts mapping is added to each.
    :param fields: Document fields handed to the searcher and the update generator.
    :param fact_name: Fact name forwarded to the update generator.
    :param fact_value: Fact value forwarded to the update generator.
    :param query: Elasticsearch query limiting the processed documents.
    :param bulk_size: Used both as the scroll size and as the bulk chunk size.
    :param max_chunk_bytes: Upper bound in bytes for a single bulk request.
    :param es_timeout: Passed to the searcher as ``"<es_timeout>m"`` (presumably minutes).
    :return: True on success. On failure the error is logged, the task is
        marked as failed and None is returned (the exception is not re-raised).
    :raises: Whatever the TorchTaggerObject lookup raises when ``object_id``
        does not resolve, since no task exists to record the failure on.
    """
    # Resolved before the ``try`` so the failure handler below can always
    # reach ``tagger_object.task`` instead of tripping over an unbound name.
    tagger_object = TorchTaggerObject.objects.get(pk=object_id)
    try:
        tagger = tagger_object.load_tagger()

        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                # No exception is active here, so log a plain error instead of
                # ``exception`` (which would attach an empty traceback).
                logging.getLogger(ERROR_LOGGER).error(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
Пример #3
0
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index.

    Loads the SearchFieldsTagger with the given primary key, scrolls the
    documents matching its stored query and sends the actions produced by
    ``update_search_fields_generator`` to Elasticsearch in bulk.

    :param object_id: Primary key of the SearchFieldsTagger to apply.
    :return: ``object_id`` on success. The task is not marked complete here;
        presumably a follow-up step does that - verify against the caller.
    :raises Exception: Any error raised while processing is logged, recorded
        on the task (which is marked as failed) and then re-raised.
    """
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ElasticDocument("_all").bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        # Bare ``raise`` re-raises the active exception with its traceback intact.
        raise
Пример #4
0
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Run the Rakun keyword extractor over the documents of the given indices.

    The documents matching ``query`` are scrolled, passed through
    ``update_generator`` with the loaded keyword detector and the resulting
    actions are sent to Elasticsearch in bulk.

    Returns True on success. On failure the error is logged, a truncated
    message is stored on the task, the task is marked as failed and None is
    returned (the exception is not re-raised).
    """
    info_log = logging.getLogger(INFO_LOGGER)
    info_log.info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    extractor_record = RakunExtractor.objects.get(id=object_id)
    try:
        tracker = ShowProgress(extractor_record.task)

        # Make sure every target index can hold facts before writing to it.
        core = ElasticCore()
        for index_name in indices:
            core.add_texta_facts_mapping(index_name)

        # Existing facts are fetched too so new ones are added on top of them.
        wanted_fields = fields + ["texta_facts"]
        document_stream = ElasticSearcher(
            indices=indices,
            field_data=wanted_fields,
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=tracker,
            scroll_size=bulk_size,
        )

        detector = extractor_record.load_rakun_keyword_detector()
        update_actions = update_generator(
            keyword_detector=detector,
            generator=document_stream,
            ec=core,
            fields=fields,
            rakun_extractor_object=extractor_record,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans,
        )

        # Push the generated updates to Elasticsearch.
        bulk_writer = ElasticDocument("_all")
        bulk_writer.bulk_update(actions=update_actions)

        extractor_record.task.complete()
        return True

    except Exception as error:
        logging.getLogger(ERROR_LOGGER).exception(error)
        # Only the first 100 characters are stored in case the message is massive.
        short_message = f"{str(error)[:100]}..."
        extractor_record.task.add_error(short_message)
        extractor_record.task.update_status(Task.STATUS_FAILED)
Пример #5
0
def reindex_test_dataset(query: dict = None, from_index: Optional[str] = None, hex_size=20) -> str:
    """
    Copy documents of the master test dataset into a freshly created, uniquely named index.

    :param query: Query limiting which documents are copied; everything is copied when None.
    :param from_index: Source index; falls back to TEST_INDEX when empty or None.
    :param hex_size: Number of random hex characters appended to the new index name.
    :return: Name of the newly generated index.
    """
    from texta_elastic.core import ElasticCore
    from toolkit.test_settings import TEST_INDEX

    source_index = from_index or TEST_INDEX

    core = ElasticCore()
    target_index = f"ttk_test_{uuid.uuid4().hex[:hex_size]}"
    core.create_index(index=target_index)
    core.add_texta_facts_mapping(target_index)

    if query is None:
        search = elasticsearch_dsl.Search()
    else:
        search = elasticsearch_dsl.Search.from_dict(query)
    hits = search.index(source_index).using(core.es).scan()

    # Lazily turn every scanned document into a bulk action aimed at the new index.
    actions = (
        {
            "_index": target_index,
            "_type": "_doc",
            "_source": hit.to_dict(),
            "retry_on_conflict": 3,
        }
        for hit in hits
    )

    from elasticsearch.helpers import bulk
    bulk(actions=actions, client=core.es, refresh="wait_for")
    return target_index
Пример #6
0
def apply_crf_extractor_to_index(object_id: int, indices: List[str],
                                 mlp_fields: List[str], label_suffix: str,
                                 query: dict, bulk_size: int,
                                 max_chunk_bytes: int, es_timeout: int):
    """
    Applies Extractor to ES index.

    Scrolls the documents matching ``query`` in ``indices``, feeds them through
    ``update_generator`` together with the loaded extractor and sends the
    resulting actions to Elasticsearch in bulk.

    :param object_id: Primary key of the CRFExtractorObject to apply.
    :param indices: Names of the indices to process; the Texta Facts mapping is added to each.
    :param mlp_fields: Document fields handed to the searcher and the update generator.
    :param label_suffix: Forwarded to the update generator.
    :param query: Elasticsearch query limiting the processed documents.
    :param bulk_size: Scroll size used by the searcher.
    :param max_chunk_bytes: Accepted for interface compatibility; not used in this function.
    :param es_timeout: Passed to the searcher as ``"<es_timeout>m"`` (presumably minutes).
    :return: True once the task has been marked complete.
    :raises Exception: Errors outside the bulk update are handed to the task's
        ``handle_failed_task`` and re-raised. A failing CRFExtractorObject
        lookup propagates directly, since no task exists to record it on.
    """
    # Resolved before the ``try`` so the failure handler below can always
    # reach ``crf_object.task`` instead of tripping over an unbound name.
    crf_object = CRFExtractorObject.objects.get(pk=object_id)
    try:
        # load model
        extractor = crf_object.load_extractor()
        # progress
        progress = ShowProgress(crf_object.task)
        # add fact field if missing
        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)
        # search
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size)
        # create update actions
        actions = update_generator(generator=searcher,
                                   ec=ec,
                                   mlp_fields=mlp_fields,
                                   label_suffix=label_suffix,
                                   object_id=object_id,
                                   extractor=extractor)
        # perform updates
        try:
            # as we have defined indices in actions there is no need to do it again (None)
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            # NOTE(review): bulk failures are only logged, so the task is still
            # marked complete below - confirm this best-effort behaviour is intended.
            logging.getLogger(ERROR_LOGGER).exception(e)
        # all done
        crf_object.task.complete()
        return True

    except Exception as e:
        crf_object.task.handle_failed_task(e)
        # Bare ``raise`` re-raises the active exception with its traceback intact.
        raise