Example #1
def start_rakun_task(self, object_id: int):
    rakun = RakunExtractor.objects.get(pk=object_id)
    task_object = rakun.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting rakun')
    show_progress.update_view(0)
    return object_id
Example #2
def start_fact_delete_query_task(self, worker_id: int):
    """
    Scrolls the documents matching the query and sets the task total for the fact deletion step.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Celery: Starting task for deleting facts by query for project with ID: {worker_object.pk}"
        )

        # init progress
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=worker_object.get_indices(),
                                   output=ElasticSearcher.OUT_DOC,
                                   callback_progress=show_progress,
                                   scroll_size=worker_object.scroll_size,
                                   field_data=["texta_facts"])

        count = searcher.count()

        show_progress.update_step(f'Deleting facts from {count} documents')
        show_progress.update_view(0)
        worker_object.task.set_total(count)
        return True

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
Example #3
def start_tagger_task(tagger_id: int):
    tagger = Tagger.objects.get(pk=tagger_id)
    task_object = tagger.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting tagging')
    show_progress.update_view(0)
    return tagger_id
Example #4
def start_clustering_task(clustering_id: int):
    clustering_obj = ClusteringResult.objects.get(pk=clustering_id)
    task_object = clustering_obj.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting clustering')
    show_progress.update_view(0)

    return clustering_id
Example #5
def start_search_fields_tagger_worker(self, object_id: int):
    logging.getLogger(INFO_LOGGER).info(
        f"Starting applying search fields tagger on the index for model ID: {object_id}"
    )
    searchfieldstagger_object = SearchFieldsTagger.objects.get(pk=object_id)
    show_progress = ShowProgress(searchfieldstagger_object.task, multiplier=1)
    show_progress.update_step('running search fields tagger')
    show_progress.update_view(0)
    return object_id
Example #6
def start_summarizer_worker(self, summarizer_id: int):
    logging.getLogger(INFO_LOGGER).info(
        f"Starting applying summarizer on the index for model ID: {summarizer_id}"
    )
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    show_progress = ShowProgress(summarizer_object.task, multiplier=1)
    show_progress.update_step('running summarizer')
    show_progress.update_view(0)
    return summarizer_id
Example #7
def start_crf_task(crf_id: int):
    """
    Starts the training process for the CRF extractor.
    """
    extractor = CRFExtractorObject.objects.get(pk=crf_id)
    task_object = extractor.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting tagging')
    show_progress.update_view(0)
    return crf_id
Example #8
def start_mlp_worker(self, mlp_id: int):
    """
    Scrolls the document IDs and passes them to the MLP workers.
    """
    mlp_object = MLPWorker.objects.get(pk=mlp_id)

    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Applying mlp on the index for MLP Task ID: {mlp_id}")
        # init progress
        show_progress = ShowProgress(mlp_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)
        # Get the necessary fields.
        indices: List[str] = mlp_object.get_indices()
        es_scroll_size = mlp_object.es_scroll_size
        es_timeout = mlp_object.es_timeout

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(query=json.loads(mlp_object.query),
                                   indices=indices,
                                   output=ElasticSearcher.OUT_META,
                                   callback_progress=show_progress,
                                   scroll_size=es_scroll_size,
                                   scroll_timeout=f"{es_timeout}m")
        # add texta facts mappings to the indices if needed
        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        doc_chunks = list(chunks_iter(searcher, MLP_BATCH_SIZE))

        # update progress
        show_progress.update_step(
            f'Applying MLP to {len(doc_chunks)} documents')
        show_progress.update_view(0)

        mlp_object.task.set_total(searcher.count())
        mlp_object.task.update_status(Task.STATUS_RUNNING)

        # pass document IDs to the next task
        chain = group(
            apply_mlp_on_es_docs.s([doc["_id"] for doc in meta_chunk], mlp_id)
            for meta_chunk in doc_chunks) | end_mlp_task.si(mlp_id)
        chain.delay()
        return True

    except Exception as e:
        mlp_object.task.handle_failed_task(e)
        raise
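
The dispatch at the end of this task uses Celery's canvas primitives: a group of per-batch signatures is chained to an immutable callback with |, so end_mlp_task runs exactly once after all batches have finished. A minimal, self-contained sketch of that pattern, assuming a configured Celery app and using hypothetical task names, looks like this:

# Hedged sketch of the group-then-callback canvas pattern used above.
# process_batch and finalize are hypothetical stand-ins, not project tasks.
from celery import group, shared_task


@shared_task
def process_batch(doc_ids, job_id):
    ...  # apply work to one batch of document IDs


@shared_task
def finalize(job_id):
    ...  # mark the parent job as finished


def dispatch(doc_id_batches, job_id):
    # .si() makes the callback immutable, so the group's results are not passed into it.
    workflow = group(process_batch.s(batch, job_id) for batch in doc_id_batches) | finalize.si(job_id)
    workflow.delay()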
Example #9
def save_tagger_results(result_data: dict):
    tagger_id = result_data['id']
    # Fetch the objects outside the try block so the except clause can safely
    # reference task_object even if the lookup itself fails.
    tagger_object = Tagger.objects.get(pk=tagger_id)
    task_object = tagger_object.task
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Saving task results for tagger with ID: {tagger_id}!")

        # Handle previous tagger models that exist in case of retrains.
        model_path = pathlib.Path(
            tagger_object.model.path) if tagger_object.model else None

        show_progress = ShowProgress(task_object, multiplier=1)
        # update status to saving
        show_progress.update_step('saving')
        show_progress.update_view(0)
        tagger_object.model.name = result_data["tagger_path"]
        tagger_object.precision = result_data["precision"]
        tagger_object.recall = result_data["recall"]
        tagger_object.f1_score = result_data["f1_score"]
        tagger_object.num_features = result_data["num_features"]
        tagger_object.num_examples = json.dumps(result_data["num_examples"])
        tagger_object.model_size = result_data["model_size"]
        tagger_object.plot.name = result_data["plot"]
        tagger_object.confusion_matrix = result_data["confusion_matrix"]
        tagger_object.classes = json.dumps(result_data["classes"],
                                           ensure_ascii=False)
        tagger_object.save()
        task_object.complete()

        # Clean up after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
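
A note on the cleanup ordering above: the path of any pre-existing model is captured before the new results are written, and the old file is unlinked only after the database row has been updated, so a failed save never leaves the record pointing at a deleted file. A stand-alone sketch of that ordering, with a hypothetical persistence step, is given below.

# Hedged sketch of the capture-then-update-then-unlink ordering used above.
import pathlib


def persist_new_model(relative_path: str) -> None:
    ...  # hypothetical placeholder for updating the database record


def swap_model_file(old_path, new_relative_path: str) -> None:
    previous = pathlib.Path(old_path) if old_path else None
    persist_new_model(new_relative_path)
    # Delete the superseded file only after the record points at the new one.
    if previous and previous.exists():
        previous.unlink(missing_ok=True)  # missing_ok requires Python 3.8+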
Example #10
def save_crf_results(result_data: dict):
    """
    Saves task results to the database.
    """
    crf_id = result_data['id']
    # Fetch the objects outside the try block so the except clause can safely
    # reference task_object even if the lookup itself fails.
    crf_object = CRFExtractorObject.objects.get(pk=crf_id)
    task_object = crf_object.task
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Saving task results for CRFExtractor with ID: {crf_id}!")

        model_path = pathlib.Path(
            crf_object.model.path) if crf_object.model else None

        show_progress = ShowProgress(task_object, multiplier=1)
        # update status to saving
        show_progress.update_step('saving')
        show_progress.update_view(0)
        crf_object.best_c1 = result_data["best_c_values"][0]
        crf_object.best_c2 = result_data["best_c_values"][1]
        crf_object.model.name = result_data["extractor_path"]
        crf_object.precision = result_data["precision"]
        crf_object.recall = result_data["recall"]
        crf_object.f1_score = result_data["f1_score"]
        crf_object.model_size = result_data["model_size"]
        crf_object.confusion_matrix = result_data["confusion_matrix"]
        crf_object.plot.name = result_data["plot"]
        crf_object.save()
        task_object.complete()

        # Clean up after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True
    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example #11
def reindex_task(reindexer_task_id: int):
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'reindex' with ID {reindexer_task_id}.")
    # Fetch the objects outside the try block so the except clause can safely
    # reference task_object even if the lookup itself fails.
    reindexer_obj = Reindexer.objects.get(pk=reindexer_task_id)
    task_object = reindexer_obj.task
    try:
        indices = json.loads(reindexer_obj.indices)
        fields = json.loads(reindexer_obj.fields)
        random_size = reindexer_obj.random_size
        field_type = json.loads(reindexer_obj.field_type)
        scroll_size = reindexer_obj.scroll_size
        new_index = reindexer_obj.new_index
        query = json.loads(reindexer_obj.query)

        # if no fields, let's use all fields from all selected indices
        if not fields:
            fields = ElasticCore().get_fields(indices)
            fields = [field["path"] for field in fields]

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        elastic_search = ElasticSearcher(indices=indices,
                                         field_data=fields,
                                         callback_progress=show_progress,
                                         query=query,
                                         scroll_size=scroll_size)
        task_object.set_total(elastic_search.count())
        elastic_doc = ElasticDocument(new_index)

        if random_size > 0:
            elastic_search = elastic_search.random_documents(size=random_size)

        logging.getLogger(INFO_LOGGER).info("Updating index schema.")
        # The operations that don't require a mapping update have been completed.
        schema_input = update_field_types(indices,
                                          fields,
                                          field_type,
                                          flatten_doc=FLATTEN_DOC)
        updated_schema = update_mapping(schema_input,
                                        new_index,
                                        reindexer_obj.add_facts_mapping,
                                        add_texta_meta_mapping=False)

        logging.getLogger(INFO_LOGGER).info("Creating new index.")
        # create new_index
        create_index_res = ElasticCore().create_index(new_index,
                                                      updated_schema)
        Index.objects.get_or_create(name=new_index)

        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
        # set new_index name as mapping name, perhaps make it customizable in the future
        bulk_add_documents(elastic_search,
                           elastic_doc,
                           index=new_index,
                           chunk_size=scroll_size,
                           flatten_doc=FLATTEN_DOC,
                           field_data=field_type)

        # declare the job done
        task_object.complete()

        logging.getLogger(INFO_LOGGER).info(
            "Reindexing succesfully completed.")
        return True

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
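
The reindexing flow above scrolls the source indices with ElasticSearcher, builds the target mapping, and bulk-indexes into the new index with ElasticDocument. For orientation only, the same scroll-and-bulk pattern expressed with the official elasticsearch-py client (not the project's wrappers) would look roughly like this:

# Hedged sketch with the official Elasticsearch client; illustrative only,
# not the project's ElasticSearcher/ElasticDocument implementation.
from elasticsearch import Elasticsearch, helpers


def copy_documents(es: Elasticsearch, source: str, target: str, query: dict, fields: list):
    hits = helpers.scan(es, index=source, query={"query": query, "_source": fields})
    actions = ({"_index": target, "_source": hit["_source"]} for hit in hits)
    # bulk() consumes the generator lazily, so memory use stays flat regardless of index size.
    helpers.bulk(es, actions)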
Example #12
def train_crf_task(crf_id: int):
    """
    Trains a CRF model.
    """
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'train_crf' for CRFExtractor with ID: {crf_id}!")
    # Fetch the objects outside the try block so the except clause can safely
    # reference task_object even if the lookup itself fails.
    crf_object = CRFExtractorObject.objects.get(id=crf_id)
    task_object = crf_object.task
    try:
        # create progress object
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling documents')
        show_progress.update_view(0)
        # retrieve indices & field data
        indices = get_indices_from_object(crf_object)
        mlp_field = crf_object.mlp_field

        # load embedding if any
        if crf_object.embedding:
            embedding = crf_object.embedding.get_embedding()
            embedding.load_django(crf_object.embedding)
        else:
            embedding = None

        # scroll docs
        logging.getLogger(INFO_LOGGER).info(
            f"Scrolling data for CRFExtractor with ID: {crf_id}!")
        documents = ElasticSearcher(query=crf_object.get_query(),
                                    indices=indices,
                                    callback_progress=show_progress,
                                    text_processor=None,
                                    field_data=[mlp_field, "texta_facts"],
                                    output=ElasticSearcher.OUT_DOC,
                                    flatten=False)

        # create config
        config = crf_object.get_crf_config()
        # start training
        logging.getLogger(INFO_LOGGER).info(
            f"Training the model for CRFExtractor with ID: {crf_id}!")
        # create extractor
        extractor = CRFExtractor(config=config, embedding=embedding)
        # train the CRF model
        model_full_path, relative_model_path = crf_object.generate_name("crf")
        report, _ = extractor.train(documents,
                                    save_path=model_full_path,
                                    mlp_field=mlp_field)
        # Save the plot image first, then record its path in the results.
        image_name = f'{secrets.token_hex(15)}.png'
        crf_object.plot.save(image_name,
                             create_tagger_plot(report.to_dict()),
                             save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        # pass results to next task
        return {
            "id": crf_id,
            "best_c_values": extractor.best_c_values,
            "extractor_path": relative_model_path,
            "precision": float(report.precision),
            "recall": float(report.recall),
            "f1_score": float(report.f1_score),
            "confusion_matrix": report.confusion.tolist(),
            "model_size": round(float(os.path.getsize(model_full_path)) / 1000000, 1),  # bytes to MB
            "plot": str(image_path),
        }
    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example #13
def perform_data_clustering(clustering_id):
    clustering_model = ClusteringResult.objects.get(id=clustering_id)

    try:

        num_clusters = clustering_model.num_cluster
        clustering_algorithm = clustering_model.clustering_algorithm
        stop_words = json.loads(clustering_model.stop_words)
        indices = clustering_model.get_indices()
        query = json.loads(clustering_model.query)
        ignored_ids = json.loads(clustering_model.ignored_ids)
        fields = json.loads(clustering_model.fields)
        display_fields = json.loads(clustering_model.display_fields)
        document_limit = clustering_model.document_limit
        vectorizer = clustering_model.vectorizer
        num_dims = clustering_model.num_dims
        use_lsi = clustering_model.use_lsi
        num_topics = clustering_model.num_topics
        significant_words_filter = clustering_model.significant_words_filter

        # Remove stop words and ignored IDs while fetching the documents.
        show_progress = ShowProgress(clustering_model.task, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        # load phraser from embedding
        if clustering_model.embedding:
            embedding = clustering_model.embedding.get_embedding()
            embedding.load_django(clustering_model.embedding)
            phraser = embedding.phraser
        else:
            phraser = None

        # Can't give the phraser to TextProcessor as some processing is also done in the Clustering class
        text_processor = TextProcessor(remove_stop_words=True,
                                       custom_stop_words=stop_words)

        elastic_search = ElasticSearcher(
            indices=indices,
            query=query,
            callback_progress=show_progress,
            text_processor=text_processor,
            ignore_ids=set(ignored_ids),
            output=ElasticSearcher.OUT_TEXT_WITH_ID,
            field_data=fields,
            scroll_limit=document_limit)

        docs = [{
            "id": doc_id,
            "document": document
        } for doc_id, document in elastic_search]

        # Group em up!
        clusters = Clustering(docs=docs,
                              num_clusters=num_clusters,
                              stop_words=stop_words,
                              clustering_algorithm=clustering_algorithm,
                              vectorizer=vectorizer,
                              num_dims=num_dims,
                              use_lsi=use_lsi,
                              num_topics=num_topics,
                              phraser=phraser)
        clusters.cluster()

        # Save the vector path.
        full_vector_path, relative_vector_path = clustering_model.generate_name()
        clusters.save_transformation(full_vector_path)

        clustering_info = {
            "pk": clustering_model.pk,
            "results": list(clusters.clustering_result.items()),
            "fields": fields,
            "indices": indices,
            "display_fields": display_fields,
            "vectors_filepath": relative_vector_path,
            "stop_words": stop_words,
            "significant_words_filter": significant_words_filter
        }

        return clustering_info

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        clustering_model.task.add_error(str(e))
        clustering_model.task.update_status(status=Task.STATUS_FAILED)
        clustering_model.save()
        raise e
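
The Clustering class used above is a project wrapper whose internals are not shown here. Purely as an illustration of the vectorize-then-cluster flow over the same {"id", "document"} structure, a generic scikit-learn version might look as follows; it is not the project's implementation.

# Hedged sketch: generic TF-IDF + k-means clustering, illustrative only.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def cluster_documents(docs, num_clusters, stop_words=None):
    texts = [doc["document"] for doc in docs]
    matrix = TfidfVectorizer(stop_words=stop_words).fit_transform(texts)
    labels = KMeans(n_clusters=num_clusters, n_init=10).fit_predict(matrix)
    # Group document IDs by the cluster label they were assigned to.
    clusters = {}
    for doc, label in zip(docs, labels):
        clusters.setdefault(int(label), []).append(doc["id"])
    return clusters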
Example #14
def train_torchtagger(tagger_id, testing=False):
    # Retrieve the tagger & task objects outside the try block so the except
    # clause can safely reference task_object even if the lookup itself fails.
    tagger_object = TorchTaggerObject.objects.get(pk=tagger_id)
    task_object = tagger_object.task
    try:
        # Handle previous tagger models that exist in case of retrains.
        model_path = pathlib.Path(tagger_object.model.path) if tagger_object.model else None

        model_type = TorchTaggerObject.MODEL_TYPE
        show_progress = ShowProgress(task_object, multiplier=1)
        # get fields & indices
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)
        # load embedding
        embedding = W2VEmbedding()
        embedding.load_django(tagger_object.embedding)
        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit
        )
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # get num examples and save to model
        num_examples = {k: len(v) for k, v in data_sample.data.items()}
        tagger_object.num_examples = json.dumps(num_examples)

        tagger_object.save()

        # create TorchTagger
        tagger = TorchTagger(
            embedding,
            model_arch=tagger_object.model_architecture
        )
        # train tagger and get result statistics
        report = tagger.train(data_sample.data, num_epochs=int(tagger_object.num_epochs), pos_label=tagger_object.pos_label)
        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()
        # save tagger to disk
        tagger_path = os.path.join(RELATIVE_MODELS_PATH, model_type, f'{model_type}_{tagger_id}_{secrets.token_hex(10)}')
        tagger.save(tagger_path)


        # set tagger location
        tagger_object.model.name = tagger_path
        # save tagger plot
        report_dict = report.to_dict()
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png', create_tagger_plot(report_dict), save=False)
        # save label index
        tagger_object.label_index = json.dumps(tagger.label_reverse_index)
        # stats to model object
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.epoch_reports = json.dumps([a.to_dict() for a in tagger.epoch_reports])
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        # save tagger object
        tagger_object.save()
        # declare the job done
        task_object.complete()

        # Clean up after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True


    except Exception as e:
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
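
This task, like the embedding and BERT tasks below, closes database connections after the training step: connections opened before a long CPU/GPU-bound run may have gone stale or outlived CONN_MAX_AGE, and close_if_unusable_or_obsolete() drops them so the next ORM query reconnects cleanly. Factored out on its own, the step is just:

# The connection-refresh step used after long training runs, relying only on
# Django's public connections API.
from django.db import connections


def refresh_db_connections():
    for conn in connections.all():
        # Close the connection if it has errored out or exceeded CONN_MAX_AGE;
        # the next query will transparently open a fresh one.
        conn.close_if_unusable_or_obsolete()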
Example #15
def train_embedding(embedding_id):
    # retrieve embedding & task objects
    embedding_object = Embedding.objects.get(pk=embedding_id)
    task_object = embedding_object.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('training')
    show_progress.update_view(0)
    try:
        # retrieve indices from project
        indices = get_indices_from_object(embedding_object)
        field_data = json.loads(embedding_object.fields)
        max_documents = embedding_object.max_documents
        use_phraser = embedding_object.use_phraser
        snowball_language = embedding_object.snowball_language
        # add stemmer if asked
        if snowball_language:
            snowball_lemmatizer = ElasticAnalyzer(language=snowball_language)
        else:
            snowball_lemmatizer = None
        # iterator for texts
        sentences = ElasticSearcher(query=json.loads(embedding_object.query),
                                    indices=indices,
                                    field_data=field_data,
                                    callback_progress=show_progress,
                                    scroll_limit=max_documents,
                                    text_processor=TextProcessor(
                                        sentences=True,
                                        remove_stop_words=True,
                                        words_as_list=True,
                                        lemmatizer=snowball_lemmatizer),
                                    output=ElasticSearcher.OUT_TEXT)
        # create embedding object & train
        embedding = embedding_object.get_embedding()
        embedding.train(sentences, use_phraser=use_phraser)

        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # save model
        show_progress.update_step('saving')
        full_model_path, relative_model_path = embedding_object.generate_name(
            "embedding")
        embedding.save(full_model_path)

        # save gensim model
        if embedding_object.embedding_type == "FastTextEmbedding":
            fast_text_embedding_model = joblib.load(
                full_model_path)["embedding"]
            gensim_full_model_path = full_model_path + "_" + FACEBOOK_MODEL_SUFFIX
            gensim.models.fasttext.save_facebook_model(
                fast_text_embedding_model,
                gensim_full_model_path,
                encoding='utf-8')

        # save model path
        embedding_object.embedding_model.name = relative_model_path
        embedding_object.vocab_size = embedding.model.wv.vectors.shape[0]
        embedding_object.save()
        # declare the job done
        task_object.complete()
        return True
    except Exception as e:
        # declare the job failed
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
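
The embedding.train() call above is a project wrapper; for orientation only, training word vectors over an iterable of tokenized sentences directly with gensim looks roughly like the sketch below (parameter values are illustrative, not the project's defaults).

# Hedged sketch: plain gensim Word2Vec training over tokenized sentences.
from gensim.models import Word2Vec


def train_word2vec(sentences, vector_size=100, min_count=5, workers=4):
    # `sentences` must be re-iterable (a list of token lists or a streaming corpus class),
    # because gensim passes over it once to build the vocabulary and again to train.
    return Word2Vec(sentences=sentences, vector_size=vector_size,
                    min_count=min_count, workers=workers)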
Example #16
def train_bert_tagger(tagger_id, testing=False):
    # retrieve the BERT tagger & task objects
    tagger_object = BertTaggerObject.objects.get(pk=tagger_id)

    # Handle previous tagger models that exist in case of retrains.
    model_path = pathlib.Path(
        tagger_object.model.path) if tagger_object.model else None

    task_object = tagger_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        # get fields & indices
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)

        # set loading model from a checkpoint False by default
        from_checkpoint = False
        checkpoint_model = tagger_object.checkpoint_model

        pos_label = tagger_object.pos_label

        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit)
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # select sklearn average function based on the number of classes
        if data_sample.is_binary:
            sklearn_avg_function = choices.DEFAULT_SKLEARN_AVG_BINARY
        else:
            sklearn_avg_function = choices.DEFAULT_SKLEARN_AVG_MULTICLASS

        # if checkpoint model is detected, load it and use it for further training
        if checkpoint_model:
            logging.getLogger(INFO_LOGGER).info(
                f"Loading model from a checkpoint stored in '{tagger_object}'..."
            )

            # use the same pre-trained bert model as the checkpoint model
            tagger_object.bert_model = checkpoint_model.bert_model
            tagger = checkpoint_model.load_tagger()

            # set sklearn avg function in case the number of classes has changed
            tagger.sklearn_avg_function = sklearn_avg_function

            # set loading model from a checkpoint True
            from_checkpoint = True

        # if no checkpoint model is given, train a new model
        else:
            logging.getLogger(INFO_LOGGER).info(
                "No checkpoint model detected, training a new model...")
            # NB! saving pretrained models must be disabled!
            tagger = BertTagger(
                allow_standard_output=choices.DEFAULT_ALLOW_STANDARD_OUTPUT,
                autoadjust_batch_size=choices.DEFAULT_AUTOADJUST_BATCH_SIZE,
                sklearn_avg_function=sklearn_avg_function,
                use_gpu=tagger_object.use_gpu,
                save_pretrained=False,
                pretrained_models_dir=BERT_PRETRAINED_MODEL_DIRECTORY,
                logger=logging.getLogger(INFO_LOGGER),
                cache_dir=BERT_CACHE_DIR)

        # use state dict for binary taggers
        if data_sample.is_binary:
            tagger.config.use_state_dict = True
        else:
            tagger.config.use_state_dict = False
            pos_label = ""

        # train tagger and get result statistics
        report = tagger.train(data_sample.data,
                              from_checkpoint=from_checkpoint,
                              pos_label=pos_label,
                              n_epochs=tagger_object.num_epochs,
                              max_length=tagger_object.max_length,
                              batch_size=tagger_object.batch_size,
                              lr=tagger_object.learning_rate,
                              eps=tagger_object.eps,
                              split_ratio=tagger_object.split_ratio,
                              bert_model=tagger_object.bert_model)
        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # save tagger to disk
        tagger_path = os.path.join(
            BERT_FINETUNED_MODEL_DIRECTORY,
            f'{tagger_object.MODEL_TYPE}_{tagger_id}_{secrets.token_hex(10)}')
        tagger.save(tagger_path)

        # set tagger location
        tagger_object.model.name = tagger_path

        report_dict = report.to_dict()

        # save tagger plot
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png',
                                create_tagger_plot(report_dict),
                                save=False)
        # save label index
        tagger_object.label_index = json.dumps(
            tagger.config.label_reverse_index)
        # stats to model object
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.validation_loss = report.validation_loss
        tagger_object.epoch_reports = json.dumps(
            [a.to_dict() for a in tagger.epoch_reports])
        tagger_object.num_examples = json.dumps(
            {k: len(v)
             for k, v in list(data_sample.data.items())})
        tagger_object.adjusted_batch_size = tagger.config.batch_size
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)
        # save tagger object
        tagger_object.save()
        # declare the job done
        task_object.complete()

        # Clean up after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
        raise
Example #17
def train_tagger_task(tagger_id: int):
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'train_tagger' for tagger with ID: {tagger_id}!")
    tagger_object = Tagger.objects.get(id=tagger_id)
    task_object = tagger_object.task
    try:
        # create progress object
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling positives')
        show_progress.update_view(0)

        # retrieve indices & field data
        indices = get_indices_from_object(tagger_object)
        field_data = json.loads(tagger_object.fields)
        # split stop words by space or newline and remove empties

        stop_words = load_stop_words(tagger_object.stop_words)
        ignore_numbers = tagger_object.ignore_numbers

        # get scoring function
        if tagger_object.scoring_function != "default":
            scoring_function = tagger_object.scoring_function
        else:
            scoring_function = None

        logging.getLogger(INFO_LOGGER).info(
            f"Using scoring function: {scoring_function}.")

        # load embedding if any
        if tagger_object.embedding:
            embedding = W2VEmbedding()
            embedding.load_django(tagger_object.embedding)
        else:
            embedding = None
        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices=indices,
            field_data=field_data,
            show_progress=show_progress,
            snowball_language=tagger_object.snowball_language,
            detect_lang=tagger_object.detect_lang,
            balance=tagger_object.balance,
            balance_to_max_limit=tagger_object.balance_to_max_limit)
        # update status to training
        show_progress.update_step("training")
        show_progress.update_view(0)
        # train model
        tagger = TextTagger(embedding=embedding,
                            custom_stop_words=stop_words,
                            ignore_numbers=ignore_numbers,
                            classifier=tagger_object.classifier,
                            vectorizer=tagger_object.vectorizer,
                            analyzer=tagger_object.analyzer)
        tagger.train(data_sample.data,
                     pos_label=tagger_object.pos_label,
                     field_list=field_data,
                     scoring=scoring_function)

        # save tagger to disk
        tagger_full_path, relative_tagger_path = tagger_object.generate_name(
            "tagger")
        tagger.save(tagger_full_path)

        # Save the plot image first, then record its path in the results.
        image_name = f'{secrets.token_hex(15)}.png'
        tagger_object.plot.save(image_name,
                                create_tagger_plot(tagger.report.to_dict()),
                                save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name

        # get num examples
        num_examples = {k: len(v) for k, v in data_sample.data.items()}

        return {
            "id": tagger_id,
            "tagger_path": relative_tagger_path,
            "precision": float(tagger.report.precision),
            "recall": float(tagger.report.recall),
            "f1_score": float(tagger.report.f1_score),
            "num_features": tagger.report.num_features,
            "num_examples": num_examples,
            "confusion_matrix": tagger.report.confusion.tolist(),
            "model_size": round(float(os.path.getsize(tagger_full_path)) / 1000000, 1),  # bytes to MB
            "plot": str(image_path),
            "classes": tagger.report.classes
        }

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
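
TextTagger is a project class, but the step it performs here (vectorize the sampled texts, fit a classifier, report precision/recall/F1) corresponds to a standard scikit-learn pipeline. A generic, illustrative counterpart is sketched below; it is not the project's implementation.

# Hedged sketch: a generic vectorize-and-classify pipeline standing in for the
# TextTagger training step; names and parameters are illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline


def train_text_classifier(train_texts, train_labels, test_texts, test_labels):
    pipeline = Pipeline([
        ("vectorizer", TfidfVectorizer()),
        ("classifier", LogisticRegression(max_iter=1000)),
    ])
    pipeline.fit(train_texts, train_labels)
    # Per-class precision, recall and F1, analogous to the report saved above.
    print(classification_report(test_labels, pipeline.predict(test_texts)))
    return pipeline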
Example #18
def annotator_task(self, annotator_task_id):
    annotator_obj = Annotator.objects.get(pk=annotator_task_id)
    annotator_group_children = []

    indices = annotator_obj.get_indices()
    users = [user.pk for user in annotator_obj.annotator_users.all()]

    task_object = annotator_obj.task
    annotator_fields = json.loads(annotator_obj.fields)
    all_fields = annotator_fields
    all_fields.append("texta_meta.document_uuid")

    if annotator_obj.annotation_type == 'entity':
        all_fields.append("texta_facts")
        all_fields.append(texta_mlp.settings.META_KEY)  # Include MLP Meta key here so it would be pulled from Elasticsearch.

    project_obj = Project.objects.get(id=annotator_obj.project_id)
    new_field_type = get_selected_fields(indices, annotator_fields)
    field_type = add_field_type(new_field_type)
    add_facts_mapping = annotator_obj.add_facts_mapping
    scroll_size = 100

    new_indices = []
    new_annotators = []

    for user in users:
        annotating_user = User.objects.get(pk=user)
        new_annotators.append(annotating_user.pk)
        for index in indices:
            new_indices.append(f"{index}_{user}_{annotator_obj.task_id}")

    query = annotator_obj.query

    logging.getLogger(INFO_LOGGER).info(f"Starting task annotator with Task ID {annotator_obj.task_id}.")

    try:
        ec = ElasticCore()
        index_fields = ec.get_fields(indices)
        index_fields = [index_field["path"] for index_field in index_fields]

        # ElasticSearcher seems to be broken when handling scrolls with only the main field in its field_data instead of all of them in dot notation.
        # Hence this ugly hack is needed if I want to include the MLP meta field inside the output.
        for annotator_field in json.loads(annotator_obj.fields):
            for index_field in index_fields:
                stripped_mlp_field = annotator_field.split("_mlp.")[0] if "_mlp." in annotator_field else annotator_field
                if texta_mlp.settings.META_KEY in index_field and stripped_mlp_field in index_field:
                    all_fields.append(index_field)

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        __add_meta_to_original_index(indices, index_fields, show_progress, query, scroll_size, ec)

        for new_annotator in new_annotators:
            new_annotator_obj = Annotator.objects.create(
                annotator_uid=f"{annotator_obj.description}_{new_annotator}_{annotator_obj.task_id}",
                description=f"{annotator_obj.description}",
                author=annotator_obj.author,
                project=annotator_obj.project,
                total=annotator_obj.total,
                fields=annotator_obj.fields,
                add_facts_mapping=add_facts_mapping,
                annotation_type=annotator_obj.annotation_type,
                binary_configuration=annotator_obj.binary_configuration,
                multilabel_configuration=annotator_obj.multilabel_configuration,
                entity_configuration=annotator_obj.entity_configuration,
            )
            new_annotator_obj.annotator_users.add(new_annotator)
            for new_index in new_indices:
                logging.getLogger(INFO_LOGGER).info(f"New Index check {new_index} for user {new_annotator}")
                logging.getLogger(INFO_LOGGER).info(f"Index object {indices}")

                for index in indices:
                    if new_index == f"{index}_{new_annotator}_{annotator_obj.task_id}":

                        elastic_search = ElasticSearcher(indices=indices, field_data=all_fields, callback_progress=show_progress, query=query, scroll_size=scroll_size)
                        elastic_doc = ElasticDocument(new_index)

                        logging.getLogger(INFO_LOGGER).info(f"Updating index schema for index {new_index}")
                        # The operations that don't require a mapping update have been completed.
                        schema_input = update_field_types(indices, all_fields, field_type, flatten_doc=False)
                        updated_schema = update_mapping(schema_input, new_index, add_facts_mapping, add_texta_meta_mapping=True)

                        logging.getLogger(INFO_LOGGER).info(f"Creating new index {new_index} for user {new_annotator}")
                        # create new_index
                        create_index_res = ElasticCore().create_index(new_index, updated_schema)

                        index_model, is_created = Index.objects.get_or_create(name=new_index)
                        project_obj.indices.add(index_model)
                        index_user = index_model.name.rsplit('_', 2)[1]
                        if str(index_user) == str(new_annotator):
                            new_annotator_obj.indices.add(index_model)

                        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
                        # set new_index name as mapping name
                        bulk_add_documents(elastic_search, elastic_doc, index=new_index, chunk_size=scroll_size, flatten_doc=False)

            new_annotator_obj.save()
            annotator_group_children.append(new_annotator_obj.id)
            logging.getLogger(INFO_LOGGER).info(f"Saving new annotator object ID {new_annotator_obj.id}")

        new_annotator_obj.add_annotation_mapping(new_indices)
        new_annotator_obj.add_texta_meta_mapping(new_indices)

        annotator_obj.annotator_users.clear()
        annotator_obj.save()

        annotator_group, is_created = AnnotatorGroup.objects.get_or_create(project=annotator_obj.project, parent=annotator_obj)
        annotator_group.children.add(*annotator_group_children)

        # declare the job done
        task_object.complete()

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e

    logging.getLogger(INFO_LOGGER).info(f"Annotator with Task ID {annotator_obj.task_id} successfully completed.")
    return True