def start_fact_delete_query_task(self, worker_id: int):
    """
    Prepares the fact deletion task by counting the documents that match the query.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        logging.getLogger(INFO_LOGGER).info(f"Celery: Starting task for deleting facts by query for task with ID: {worker_object.pk}")

        # init progress
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=worker_object.get_indices(),
            output=ElasticSearcher.OUT_DOC,
            callback_progress=show_progress,
            scroll_size=worker_object.scroll_size,
            field_data=["texta_facts"]
        )

        count = searcher.count()
        show_progress.update_step(f'Deleting facts from {count} documents')
        show_progress.update_view(0)
        worker_object.task.set_total(count)
        return True

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
def start_tagger_task(tagger_id: int):
    tagger = Tagger.objects.get(pk=tagger_id)
    task_object = tagger.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting tagging')
    show_progress.update_view(0)
    return tagger_id
def fact_delete_query_task(self, worker_id: int):
    """
    Removes the specified facts from all documents matching the query.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m"
        )

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
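
# Illustrative sketch only: the original query_delete_actions_generator is defined
# elsewhere in the codebase, so this hypothetical version merely shows the assumed
# shape of the bulk-update actions that ed.bulk_update() consumes above, i.e. partial
# document updates whose "texta_facts" list no longer contains the targeted facts.
# The iteration pattern assumes ElasticSearcher.OUT_RAW yields batches of raw hits.
def example_query_delete_actions_generator(searcher, target_facts):
    for batch in searcher:
        for hit in batch:
            existing_facts = hit["_source"].get("texta_facts", [])
            # Keep only facts that do not match a fact marked for deletion
            # (the real matching logic may compare only selected keys).
            remaining_facts = [fact for fact in existing_facts if fact not in target_facts]
            yield {
                "_op_type": "update",
                "_index": hit["_index"],
                "_id": hit["_id"],
                "doc": {"texta_facts": remaining_facts}
            }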
def start_rakun_task(self, object_id: int):
    rakun = RakunExtractor.objects.get(pk=object_id)
    task_object = rakun.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting rakun')
    show_progress.update_view(0)
    return object_id
def start_clustering_task(clustering_id: int):
    clustering_obj = ClusteringResult.objects.get(pk=clustering_id)
    task_object = clustering_obj.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting clustering')
    show_progress.update_view(0)
    return clustering_id
def start_search_fields_tagger_worker(self, object_id: int):
    logging.getLogger(INFO_LOGGER).info(f"Starting applying search fields tagger on the index for model ID: {object_id}")
    searchfieldstagger_object = SearchFieldsTagger.objects.get(pk=object_id)
    show_progress = ShowProgress(searchfieldstagger_object.task, multiplier=1)
    show_progress.update_step('running search fields tagger')
    show_progress.update_view(0)
    return object_id
def start_summarizer_worker(self, summarizer_id: int):
    logging.getLogger(INFO_LOGGER).info(f"Starting applying summarizer on the index for model ID: {summarizer_id}")
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    show_progress = ShowProgress(summarizer_object.task, multiplier=1)
    show_progress.update_step('running summarizer')
    show_progress.update_view(0)
    return summarizer_id
def start_crf_task(crf_id: int):
    """
    Starts the training process for Extractor.
    """
    extractor = CRFExtractorObject.objects.get(pk=crf_id)
    task_object = extractor.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('starting tagging')
    show_progress.update_view(0)
    return crf_id
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character
        )

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def start_mlp_worker(self, mlp_id: int):
    """
    Scrolls the document ID-s and passes them to MLP worker.
    """
    mlp_object = MLPWorker.objects.get(pk=mlp_id)

    try:
        logging.getLogger(INFO_LOGGER).info(f"Applying mlp on the index for MLP Task ID: {mlp_id}")

        # init progress
        show_progress = ShowProgress(mlp_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # Get the necessary fields.
        indices: List[str] = mlp_object.get_indices()
        es_scroll_size = mlp_object.es_scroll_size
        es_timeout = mlp_object.es_timeout

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(
            query=json.loads(mlp_object.query),
            indices=indices,
            output=ElasticSearcher.OUT_META,
            callback_progress=show_progress,
            scroll_size=es_scroll_size,
            scroll_timeout=f"{es_timeout}m"
        )

        # add texta facts mappings to the indices if needed
        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        doc_chunks = list(chunks_iter(searcher, MLP_BATCH_SIZE))

        # update progress
        show_progress.update_step(f'Applying MLP to {len(doc_chunks)} batches of documents')
        show_progress.update_view(0)
        mlp_object.task.set_total(searcher.count())
        mlp_object.task.update_status(Task.STATUS_RUNNING)

        # pass document id-s to the next task
        chain = group(apply_mlp_on_es_docs.s([doc["_id"] for doc in meta_chunk], mlp_id) for meta_chunk in doc_chunks) | end_mlp_task.si(mlp_id)
        chain.delay()
        return True

    except Exception as e:
        mlp_object.task.handle_failed_task(e)
        raise
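
# Illustrative sketch only: chunks_iter above splits the scrolled document metadata
# into MLP_BATCH_SIZE-sized batches before fanning them out as a Celery group. The
# helper itself lives elsewhere in the codebase; a minimal stdlib version under that
# assumption could look like this.
from itertools import islice


def example_chunks_iter(iterable, chunk_size: int):
    """Yield successive lists of at most chunk_size items from any iterable."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk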
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply the analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=fields,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=scroll_timeout
        )

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(
            generator=searcher,
            worker=worker_object,
            detect_lang=detect_lang,
            snowball_language=snowball_language,
            fields_to_parse=fields,
            analyzers=analyzers,
            tokenizer=tokenizer,
            strip_html=strip_html
        )

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()

        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field

        scroll_size = 100
        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[field],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="15m"
        )

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher, field=field, worker_id=apply_worker_id, mlp_class=mlp)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def apply_summarizer_on_index(self, summarizer_id: int):
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task
    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm

        scroll_size = 100
        searcher = ElasticSearcher(
            query=json.loads(summarizer_object.query),
            indices=indices,
            field_data=field_data,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="30m"
        )

        actions = process_actions(searcher, field_data, ratio_data, algorithm=algorithm_data, summarizer_class=sumy, summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def save_tagger_results(result_data: dict):
    try:
        tagger_id = result_data['id']
        logging.getLogger(INFO_LOGGER).info(f"Starting task results for tagger with ID: {tagger_id}!")
        tagger_object = Tagger.objects.get(pk=tagger_id)

        # Handle previous tagger models that exist in case of retrains.
        model_path = pathlib.Path(tagger_object.model.path) if tagger_object.model else None

        task_object = tagger_object.task
        show_progress = ShowProgress(task_object, multiplier=1)

        # update status to saving
        show_progress.update_step('saving')
        show_progress.update_view(0)

        tagger_object.model.name = result_data["tagger_path"]
        tagger_object.precision = result_data["precision"]
        tagger_object.recall = result_data["recall"]
        tagger_object.f1_score = result_data["f1_score"]
        tagger_object.num_features = result_data["num_features"]
        tagger_object.num_examples = json.dumps(result_data["num_examples"])
        tagger_object.model_size = result_data["model_size"]
        tagger_object.plot.name = result_data["plot"]
        tagger_object.confusion_matrix = result_data["confusion_matrix"]
        tagger_object.classes = json.dumps(result_data["classes"], ensure_ascii=False)
        tagger_object.save()

        task_object.complete()

        # Cleanup after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def save_crf_results(result_data: dict):
    """
    Saves task results to database.
    """
    try:
        crf_id = result_data['id']
        logging.getLogger(INFO_LOGGER).info(f"Starting task results for CRFExtractor with ID: {crf_id}!")
        crf_object = CRFExtractorObject.objects.get(pk=crf_id)

        # Handle previous models that exist in case of retrains.
        model_path = pathlib.Path(crf_object.model.path) if crf_object.model else None

        task_object = crf_object.task
        show_progress = ShowProgress(task_object, multiplier=1)

        # update status to saving
        show_progress.update_step('saving')
        show_progress.update_view(0)

        crf_object.best_c1 = result_data["best_c_values"][0]
        crf_object.best_c2 = result_data["best_c_values"][1]
        crf_object.model.name = result_data["extractor_path"]
        crf_object.precision = result_data["precision"]
        crf_object.recall = result_data["recall"]
        crf_object.f1_score = result_data["f1_score"]
        crf_object.model_size = result_data["model_size"]
        crf_object.confusion_matrix = result_data["confusion_matrix"]
        crf_object.plot.name = result_data["plot"]
        crf_object.save()

        task_object.complete()

        # Cleanup after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)
        return True

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def reindex_task(reindexer_task_id: int):
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'reindex' with ID {reindexer_task_id}.")

    try:
        reindexer_obj = Reindexer.objects.get(pk=reindexer_task_id)
        task_object = reindexer_obj.task

        indices = json.loads(reindexer_obj.indices)
        fields = json.loads(reindexer_obj.fields)
        random_size = reindexer_obj.random_size
        field_type = json.loads(reindexer_obj.field_type)
        scroll_size = reindexer_obj.scroll_size
        new_index = reindexer_obj.new_index
        query = json.loads(reindexer_obj.query)

        # if no fields, let's use all fields from all selected indices
        if not fields:
            fields = ElasticCore().get_fields(indices)
            fields = [field["path"] for field in fields]

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        elastic_search = ElasticSearcher(indices=indices, field_data=fields, callback_progress=show_progress, query=query, scroll_size=scroll_size)
        task_object.set_total(elastic_search.count())

        elastic_doc = ElasticDocument(new_index)

        if random_size > 0:
            elastic_search = elastic_search.random_documents(size=random_size)

        logging.getLogger(INFO_LOGGER).info("Updating index schema.")
        # the operations that don't require a mapping update have been completed
        schema_input = update_field_types(indices, fields, field_type, flatten_doc=FLATTEN_DOC)
        updated_schema = update_mapping(schema_input, new_index, reindexer_obj.add_facts_mapping, add_texta_meta_mapping=False)

        logging.getLogger(INFO_LOGGER).info("Creating new index.")
        # create new_index
        create_index_res = ElasticCore().create_index(new_index, updated_schema)
        Index.objects.get_or_create(name=new_index)

        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
        # set new_index name as mapping name, perhaps make it customizable in the future
        bulk_add_documents(elastic_search, elastic_doc, index=new_index, chunk_size=scroll_size, flatten_doc=FLATTEN_DOC, field_data=field_type)

        # declare the job done
        task_object.complete()

        logging.getLogger(INFO_LOGGER).info("Reindexing successfully completed.")
        return True

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
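
# Illustrative sketch only: bulk_add_documents is defined elsewhere in the codebase.
# This hypothetical version shows the assumed flow used by reindex_task above: scroll
# documents from the ElasticSearcher generator and push them to the target index in
# chunk_size batches. The elastic_doc.bulk_add(chunk) call is an assumption about the
# ElasticDocument helper, not a confirmed signature; flatten_doc / field_data handling
# is omitted here.
def example_bulk_add_documents(elastic_search, elastic_doc, index, chunk_size, flatten_doc=False, field_data=None):
    chunk = []
    for document in elastic_search:
        chunk.append(document)
        if len(chunk) >= chunk_size:
            elastic_doc.bulk_add(chunk)  # assumed bulk-indexing helper for the target index
            chunk = []
    # Flush the remainder that did not fill a whole chunk.
    if chunk:
        elastic_doc.bulk_add(chunk)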
def train_crf_task(crf_id: int):
    """
    Trains CRF model.
    """
    try:
        # get task object
        logging.getLogger(INFO_LOGGER).info(f"Starting task 'train_crf' for CRFExtractor with ID: {crf_id}!")
        crf_object = CRFExtractorObject.objects.get(id=crf_id)
        task_object = crf_object.task

        # create progress object
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling documents')
        show_progress.update_view(0)

        # retrieve indices & field data
        indices = get_indices_from_object(crf_object)
        mlp_field = crf_object.mlp_field

        # load embedding if any
        if crf_object.embedding:
            embedding = crf_object.embedding.get_embedding()
            embedding.load_django(crf_object.embedding)
        else:
            embedding = None

        # scroll docs
        logging.getLogger(INFO_LOGGER).info(f"Scrolling data for CRFExtractor with ID: {crf_id}!")
        documents = ElasticSearcher(
            query=crf_object.get_query(),
            indices=indices,
            callback_progress=show_progress,
            text_processor=None,
            field_data=[mlp_field, "texta_facts"],
            output=ElasticSearcher.OUT_DOC,
            flatten=False
        )

        # create config
        config = crf_object.get_crf_config()

        # start training
        logging.getLogger(INFO_LOGGER).info(f"Training the model for CRFExtractor with ID: {crf_id}!")

        # create extractor
        extractor = CRFExtractor(config=config, embedding=embedding)

        # train the CRF model
        model_full_path, relative_model_path = crf_object.generate_name("crf")
        report, _ = extractor.train(documents, save_path=model_full_path, mlp_field=mlp_field)

        # Save the image before its path.
        image_name = f'{secrets.token_hex(15)}.png'
        crf_object.plot.save(image_name, create_tagger_plot(report.to_dict()), save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name

        # pass results to next task
        return {
            "id": crf_id,
            "best_c_values": extractor.best_c_values,
            "extractor_path": relative_model_path,
            "precision": float(report.precision),
            "recall": float(report.recall),
            "f1_score": float(report.f1_score),
            "confusion_matrix": report.confusion.tolist(),
            "model_size": round(float(os.path.getsize(model_full_path)) / 1000000, 1),  # bytes to mb
            "plot": str(image_path),
        }

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def perform_data_clustering(clustering_id):
    clustering_model = ClusteringResult.objects.get(id=clustering_id)

    try:
        num_clusters = clustering_model.num_cluster
        clustering_algorithm = clustering_model.clustering_algorithm
        stop_words = json.loads(clustering_model.stop_words)
        indices = clustering_model.get_indices()
        query = json.loads(clustering_model.query)
        ignored_ids = json.loads(clustering_model.ignored_ids)
        fields = json.loads(clustering_model.fields)
        display_fields = json.loads(clustering_model.display_fields)
        document_limit = clustering_model.document_limit
        vectorizer = clustering_model.vectorizer
        num_dims = clustering_model.num_dims
        use_lsi = clustering_model.use_lsi
        num_topics = clustering_model.num_topics
        significant_words_filter = clustering_model.significant_words_filter

        # Removing stopwords, ignored ids while fetching the documents.
        show_progress = ShowProgress(clustering_model.task, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        # load phraser from embedding
        if clustering_model.embedding:
            embedding = clustering_model.embedding.get_embedding()
            embedding.load_django(clustering_model.embedding)
            phraser = embedding.phraser
        else:
            phraser = None

        # Can't give parser to TextProcessor as some processing is also done in Clustering class
        text_processor = TextProcessor(remove_stop_words=True, custom_stop_words=stop_words)

        elastic_search = ElasticSearcher(
            indices=indices,
            query=query,
            callback_progress=show_progress,
            text_processor=text_processor,
            ignore_ids=set(ignored_ids),
            output=ElasticSearcher.OUT_TEXT_WITH_ID,
            field_data=fields,
            scroll_limit=document_limit
        )

        docs = [{"id": doc_id, "document": document} for doc_id, document in elastic_search]

        # Group em up!
        clusters = Clustering(
            docs=docs,
            num_clusters=num_clusters,
            stop_words=stop_words,
            clustering_algorithm=clustering_algorithm,
            vectorizer=vectorizer,
            num_dims=num_dims,
            use_lsi=use_lsi,
            num_topics=num_topics,
            phraser=phraser
        )
        clusters.cluster()

        # Save the vector path.
        full_vector_path, relative_vector_path = clustering_model.generate_name()
        clusters.save_transformation(full_vector_path)

        clustering_info = {
            "pk": clustering_model.pk,
            "results": list(clusters.clustering_result.items()),
            "fields": fields,
            "indices": indices,
            "display_fields": display_fields,
            "vectors_filepath": relative_vector_path,
            "stop_words": stop_words,
            "significant_words_filter": significant_words_filter
        }

        return clustering_info

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        clustering_model.task.add_error(str(e))
        clustering_model.task.update_status(status=Task.STATUS_FAILED)
        clustering_model.save()
        raise e
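
# Illustrative sketch only: the Clustering class used above belongs to this codebase
# and is not shown here. As a rough analogy of the "vectorize and group" step it
# performs, a plain scikit-learn TF-IDF + KMeans pipeline over the same docs structure
# could look like this; every name below is hypothetical, not the project's own
# implementation.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def example_cluster_documents(docs, num_clusters=10):
    """Group {"id": ..., "document": ...} dicts into num_clusters clusters by text similarity."""
    texts = [doc["document"] for doc in docs]
    vectors = TfidfVectorizer().fit_transform(texts)
    labels = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit_predict(vectors)
    # Map cluster label -> list of document ids, mirroring the idea of clustering_result.
    result = {}
    for doc, label in zip(docs, labels):
        result.setdefault(int(label), []).append(doc["id"])
    return result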
def train_torchtagger(tagger_id, testing=False):
    try:
        # retrieve neurotagger & task objects
        tagger_object = TorchTaggerObject.objects.get(pk=tagger_id)

        # Handle previous tagger models that exist in case of retrains.
        model_path = pathlib.Path(tagger_object.model.path) if tagger_object.model else None

        task_object = tagger_object.task
        model_type = TorchTaggerObject.MODEL_TYPE
        show_progress = ShowProgress(task_object, multiplier=1)

        # get fields & indices
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)

        # load embedding
        embedding = W2VEmbedding()
        embedding.load_django(tagger_object.embedding)

        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit
        )
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # get num examples and save to model
        num_examples = {k: len(v) for k, v in data_sample.data.items()}
        tagger_object.num_examples = json.dumps(num_examples)
        tagger_object.save()

        # create TorchTagger
        tagger = TorchTagger(embedding, model_arch=tagger_object.model_architecture)

        # train tagger and get result statistics
        report = tagger.train(data_sample.data, num_epochs=int(tagger_object.num_epochs), pos_label=tagger_object.pos_label)

        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # save tagger to disk
        tagger_path = os.path.join(RELATIVE_MODELS_PATH, model_type, f'{model_type}_{tagger_id}_{secrets.token_hex(10)}')
        tagger.save(tagger_path)

        # set tagger location
        tagger_object.model.name = tagger_path

        # save tagger plot
        report_dict = report.to_dict()
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png', create_tagger_plot(report_dict), save=False)

        # save label index
        tagger_object.label_index = json.dumps(tagger.label_reverse_index)

        # stats to model object
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.epoch_reports = json.dumps([a.to_dict() for a in tagger.epoch_reports])
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        # save tagger object
        tagger_object.save()

        # declare the job done
        task_object.complete()

        # Cleanup after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)
        return True

    except Exception as e:
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
def train_bert_tagger(tagger_id, testing=False):
    # retrieve neurotagger & task objects
    tagger_object = BertTaggerObject.objects.get(pk=tagger_id)

    # Handle previous tagger models that exist in case of retrains.
    model_path = pathlib.Path(tagger_object.model.path) if tagger_object.model else None

    task_object = tagger_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)

        # get fields & indices
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)

        # set loading model from a checkpoint False by default
        from_checkpoint = False
        checkpoint_model = tagger_object.checkpoint_model

        pos_label = tagger_object.pos_label

        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit
        )
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # select sklearn average function based on the number of classes
        if data_sample.is_binary:
            sklearn_avg_function = choices.DEFAULT_SKLEARN_AVG_BINARY
        else:
            sklearn_avg_function = choices.DEFAULT_SKLEARN_AVG_MULTICLASS

        # if checkpoint model is detected, load it and use it for further training
        if checkpoint_model:
            logging.getLogger(INFO_LOGGER).info(f"Loading model from a checkpoint stored in '{tagger_object}'...")

            # use the same pre-trained bert model as the checkpoint model
            tagger_object.bert_model = checkpoint_model.bert_model
            tagger = checkpoint_model.load_tagger()

            # set sklearn avg function in case the number of classes has changed
            tagger.sklearn_avg_function = sklearn_avg_function

            # set loading model from a checkpoint True
            from_checkpoint = True

        # if no checkpoint model is given, train a new model
        else:
            logging.getLogger(INFO_LOGGER).info("No checkpoint model detected, training a new model...")
            # NB! saving pretrained models must be disabled!
            tagger = BertTagger(
                allow_standard_output=choices.DEFAULT_ALLOW_STANDARD_OUTPUT,
                autoadjust_batch_size=choices.DEFAULT_AUTOADJUST_BATCH_SIZE,
                sklearn_avg_function=sklearn_avg_function,
                use_gpu=tagger_object.use_gpu,
                save_pretrained=False,
                pretrained_models_dir=BERT_PRETRAINED_MODEL_DIRECTORY,
                logger=logging.getLogger(INFO_LOGGER),
                cache_dir=BERT_CACHE_DIR
            )

        # use state dict for binary taggers
        if data_sample.is_binary:
            tagger.config.use_state_dict = True
        else:
            tagger.config.use_state_dict = False
            pos_label = ""

        # train tagger and get result statistics
        report = tagger.train(
            data_sample.data,
            from_checkpoint=from_checkpoint,
            pos_label=pos_label,
            n_epochs=tagger_object.num_epochs,
            max_length=tagger_object.max_length,
            batch_size=tagger_object.batch_size,
            lr=tagger_object.learning_rate,
            eps=tagger_object.eps,
            split_ratio=tagger_object.split_ratio,
            bert_model=tagger_object.bert_model
        )

        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # save tagger to disk
        tagger_path = os.path.join(BERT_FINETUNED_MODEL_DIRECTORY, f'{tagger_object.MODEL_TYPE}_{tagger_id}_{secrets.token_hex(10)}')
        tagger.save(tagger_path)

        # set tagger location
        tagger_object.model.name = tagger_path

        report_dict = report.to_dict()

        # save tagger plot
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png', create_tagger_plot(report_dict), save=False)

        # save label index
        tagger_object.label_index = json.dumps(tagger.config.label_reverse_index)

        # stats to model object
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.validation_loss = report.validation_loss
        tagger_object.epoch_reports = json.dumps([a.to_dict() for a in tagger.epoch_reports])
        tagger_object.num_examples = json.dumps({k: len(v) for k, v in list(data_sample.data.items())})
        tagger_object.adjusted_batch_size = tagger.config.batch_size
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        # save tagger object
        tagger_object.save()

        # declare the job done
        task_object.complete()

        # Cleanup after the transaction to ensure the integrity of database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
        raise
def train_embedding(embedding_id):
    # retrieve embedding & task objects
    embedding_object = Embedding.objects.get(pk=embedding_id)
    task_object = embedding_object.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('training')
    show_progress.update_view(0)
    try:
        # retrieve indices from project
        indices = get_indices_from_object(embedding_object)
        field_data = json.loads(embedding_object.fields)
        max_documents = embedding_object.max_documents
        use_phraser = embedding_object.use_phraser
        snowball_language = embedding_object.snowball_language

        # add stemmer if asked
        if snowball_language:
            snowball_lemmatizer = ElasticAnalyzer(language=snowball_language)
        else:
            snowball_lemmatizer = None

        # iterator for texts
        sentences = ElasticSearcher(
            query=json.loads(embedding_object.query),
            indices=indices,
            field_data=field_data,
            callback_progress=show_progress,
            scroll_limit=max_documents,
            text_processor=TextProcessor(sentences=True, remove_stop_words=True, words_as_list=True, lemmatizer=snowball_lemmatizer),
            output=ElasticSearcher.OUT_TEXT
        )

        # create embedding object & train
        embedding = embedding_object.get_embedding()
        embedding.train(sentences, use_phraser=use_phraser)

        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # save model
        show_progress.update_step('saving')
        full_model_path, relative_model_path = embedding_object.generate_name("embedding")
        embedding.save(full_model_path)

        # save gensim model
        if embedding_object.embedding_type == "FastTextEmbedding":
            fast_text_embedding_model = joblib.load(full_model_path)["embedding"]
            gensim_full_model_path = full_model_path + "_" + FACEBOOK_MODEL_SUFFIX
            gensim.models.fasttext.save_facebook_model(fast_text_embedding_model, gensim_full_model_path, encoding='utf-8')

        # save model path
        embedding_object.embedding_model.name = relative_model_path
        embedding_object.vocab_size = embedding.model.wv.vectors.shape[0]
        embedding_object.save()

        # declare the job done
        task_object.complete()
        return True

    except Exception as e:
        # declare the job failed
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
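
# Illustrative sketch only: the `sentences` iterator above yields tokenised sentences
# (lists of words) thanks to TextProcessor(sentences=True, words_as_list=True), and the
# trained model is gensim-backed (see the `embedding.model.wv` access). A minimal
# stand-alone gensim Word2Vec run over such token lists could look like this; the
# parameters are illustrative, not the project's defaults.
from gensim.models import Word2Vec


def example_train_word2vec(token_lists):
    """Train a small Word2Vec model over an iterable of token lists."""
    model = Word2Vec(sentences=list(token_lists), vector_size=100, window=5, min_count=2, workers=4)
    return model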
def train_tagger_task(tagger_id: int):
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'train_tagger' for tagger with ID: {tagger_id}!")
    tagger_object = Tagger.objects.get(id=tagger_id)
    task_object = tagger_object.task
    try:
        # create progress object
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling positives')
        show_progress.update_view(0)

        # retrieve indices & field data
        indices = get_indices_from_object(tagger_object)
        field_data = json.loads(tagger_object.fields)

        # split stop words by space or newline and remove empties
        stop_words = load_stop_words(tagger_object.stop_words)
        ignore_numbers = tagger_object.ignore_numbers

        # get scoring function
        if tagger_object.scoring_function != "default":
            scoring_function = tagger_object.scoring_function
        else:
            scoring_function = None

        logging.getLogger(INFO_LOGGER).info(f"Using scoring function: {scoring_function}.")

        # load embedding if any
        if tagger_object.embedding:
            embedding = W2VEmbedding()
            embedding.load_django(tagger_object.embedding)
        else:
            embedding = None

        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices=indices,
            field_data=field_data,
            show_progress=show_progress,
            snowball_language=tagger_object.snowball_language,
            detect_lang=tagger_object.detect_lang,
            balance=tagger_object.balance,
            balance_to_max_limit=tagger_object.balance_to_max_limit
        )

        # update status to training
        show_progress.update_step("training")
        show_progress.update_view(0)

        # train model
        tagger = TextTagger(
            embedding=embedding,
            custom_stop_words=stop_words,
            ignore_numbers=ignore_numbers,
            classifier=tagger_object.classifier,
            vectorizer=tagger_object.vectorizer,
            analyzer=tagger_object.analyzer
        )
        tagger.train(
            data_sample.data,
            pos_label=tagger_object.pos_label,
            field_list=field_data,
            scoring=scoring_function
        )

        # save tagger to disk
        tagger_full_path, relative_tagger_path = tagger_object.generate_name("tagger")
        tagger.save(tagger_full_path)

        # Save the image before its path.
        image_name = f'{secrets.token_hex(15)}.png'
        tagger_object.plot.save(image_name, create_tagger_plot(tagger.report.to_dict()), save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name

        # get num examples
        num_examples = {k: len(v) for k, v in data_sample.data.items()}

        return {
            "id": tagger_id,
            "tagger_path": relative_tagger_path,
            "precision": float(tagger.report.precision),
            "recall": float(tagger.report.recall),
            "f1_score": float(tagger.report.f1_score),
            "num_features": tagger.report.num_features,
            "num_examples": num_examples,
            "confusion_matrix": tagger.report.confusion.tolist(),
            "model_size": round(float(os.path.getsize(tagger_full_path)) / 1000000, 1),  # bytes to mb
            "plot": str(image_path),
            "classes": tagger.report.classes
        }

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def annotator_task(self, annotator_task_id):
    """
    Creates a per-user copy of the input indices and a child Annotator object for every annotator user.
    """
    annotator_obj = Annotator.objects.get(pk=annotator_task_id)
    annotator_group_children = []

    indices = annotator_obj.get_indices()
    users = [user.pk for user in annotator_obj.annotator_users.all()]
    task_object = annotator_obj.task

    annotator_fields = json.loads(annotator_obj.fields)
    all_fields = annotator_fields
    all_fields.append("texta_meta.document_uuid")

    if annotator_obj.annotation_type == 'entity':
        all_fields.append("texta_facts")
        all_fields.append(texta_mlp.settings.META_KEY)  # Include MLP Meta key here so it would be pulled from Elasticsearch.

    project_obj = Project.objects.get(id=annotator_obj.project_id)
    new_field_type = get_selected_fields(indices, annotator_fields)
    field_type = add_field_type(new_field_type)
    add_facts_mapping = annotator_obj.add_facts_mapping
    scroll_size = 100

    new_indices = []
    new_annotators = []

    for user in users:
        annotating_user = User.objects.get(pk=user)
        new_annotators.append(annotating_user.pk)
        for index in indices:
            new_indices.append(f"{index}_{user}_{annotator_obj.task_id}")

    query = annotator_obj.query

    logging.getLogger(INFO_LOGGER).info(f"Starting task annotator with Task ID {annotator_obj.task_id}.")

    try:
        ec = ElasticCore()
        index_fields = ec.get_fields(indices)
        index_fields = [index_field["path"] for index_field in index_fields]

        # ElasticSearcher seems to be broken when handling scrolls with only the main field in its field_data instead of all of them in dot notation.
        # Hence this ugly hack is needed if I want to include the MLP meta field inside the output.
        for annotator_field in json.loads(annotator_obj.fields):
            for index_field in index_fields:
                stripped_mlp_field = annotator_field.split("_mlp.")[0] if "_mlp." in annotator_field else annotator_field
                if texta_mlp.settings.META_KEY in index_field and stripped_mlp_field in index_field:
                    all_fields.append(index_field)

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        __add_meta_to_original_index(indices, index_fields, show_progress, query, scroll_size, ec)

        for new_annotator in new_annotators:
            new_annotator_obj = Annotator.objects.create(
                annotator_uid=f"{annotator_obj.description}_{new_annotator}_{annotator_obj.task_id}",
                description=f"{annotator_obj.description}",
                author=annotator_obj.author,
                project=annotator_obj.project,
                total=annotator_obj.total,
                fields=annotator_obj.fields,
                add_facts_mapping=add_facts_mapping,
                annotation_type=annotator_obj.annotation_type,
                binary_configuration=annotator_obj.binary_configuration,
                multilabel_configuration=annotator_obj.multilabel_configuration,
                entity_configuration=annotator_obj.entity_configuration,
            )
            new_annotator_obj.annotator_users.add(new_annotator)

            for new_index in new_indices:
                logging.getLogger(INFO_LOGGER).info(f"New Index check {new_index} for user {new_annotator}")
                logging.getLogger(INFO_LOGGER).info(f"Index object {indices}")

                for index in indices:
                    if new_index == f"{index}_{new_annotator}_{annotator_obj.task_id}":
                        elastic_search = ElasticSearcher(indices=indices, field_data=all_fields, callback_progress=show_progress, query=query, scroll_size=scroll_size)
                        elastic_doc = ElasticDocument(new_index)

                        logging.getLogger(INFO_LOGGER).info(f"Updating index schema for index {new_index}")
                        # the operations that don't require a mapping update have been completed
                        schema_input = update_field_types(indices, all_fields, field_type, flatten_doc=False)
                        updated_schema = update_mapping(schema_input, new_index, add_facts_mapping, add_texta_meta_mapping=True)

                        logging.getLogger(INFO_LOGGER).info(f"Creating new index {new_index} for user {new_annotator}")
                        # create new_index
                        create_index_res = ElasticCore().create_index(new_index, updated_schema)

                        index_model, is_created = Index.objects.get_or_create(name=new_index)
                        project_obj.indices.add(index_model)
                        index_user = index_model.name.rsplit('_', 2)[1]
                        if str(index_user) == str(new_annotator):
                            new_annotator_obj.indices.add(index_model)

                        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
                        # set new_index name as mapping name
                        bulk_add_documents(elastic_search, elastic_doc, index=new_index, chunk_size=scroll_size, flatten_doc=False)

            new_annotator_obj.save()
            annotator_group_children.append(new_annotator_obj.id)
            logging.getLogger(INFO_LOGGER).info(f"Saving new annotator object ID {new_annotator_obj.id}")

            new_annotator_obj.add_annotation_mapping(new_indices)
            new_annotator_obj.add_texta_meta_mapping(new_indices)

        annotator_obj.annotator_users.clear()
        annotator_obj.save()

        annotator_group, is_created = AnnotatorGroup.objects.get_or_create(project=annotator_obj.project, parent=annotator_obj)
        annotator_group.children.add(*annotator_group_children)

        # declare the job done
        task_object.complete()

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e

    logging.getLogger(INFO_LOGGER).info(f"Annotator with Task ID {annotator_obj.task_id} successfully completed.")
    return True