def reevaluate(self, request, pk=None, project_pk=None):
    """Queue a re-evaluation task for the Evaluator returned by ``get_object``.

    The evaluator's parsed query, indices, Elasticsearch timeout and scroll
    size are handed to the entity-evaluation task when the evaluation type
    equals ``choices.ENTITY_EVALUATION`` and to the regular tag-evaluation
    task otherwise. The task is only dispatched (``apply_async``); this
    method does not wait for its outcome.

    Args:
        request: Incoming request; not read by this method.
        pk: Not read here (presumably the evaluator key from the URL).
        project_pk: Not read here (presumably the owning project's key).

    Returns:
        Response: HTTP 200 carrying a confirmation message.
    """
    evaluator = self.get_object()

    timeout = evaluator.es_timeout
    batch_size = evaluator.scroll_size
    parsed_query = json.loads(evaluator.query)
    kind = evaluator.evaluation_type
    target_indices = get_indices_from_object(evaluator)

    # Both tasks take the same positional arguments, so only the task differs.
    if kind == choices.ENTITY_EVALUATION:
        task = evaluate_entity_tags_task
    else:
        task = evaluate_tags_task

    task.apply_async(
        args=(evaluator.pk, target_indices, parsed_query, timeout, batch_size),
        queue=CELERY_LONG_TERM_TASK_QUEUE,
    )

    payload = {"success": "Re-evaluation task created"}
    return Response(payload, status=status.HTTP_200_OK)
def train_embedding(embedding_id):
    """Train, persist and register the embedding model with the given id.

    Texts are streamed from Elasticsearch through a ``TextProcessor``
    (optionally with a Snowball-language ``ElasticAnalyzer`` as lemmatizer),
    the embedding is trained and saved, and the model path and vocabulary
    size are written back to the ``Embedding`` record. For
    ``"FastTextEmbedding"`` models an additional Facebook-format copy is
    written next to the main model file.

    Args:
        embedding_id: Primary key of the ``Embedding`` record to train.

    Returns:
        bool: ``True`` once the task has been marked complete.

    Raises:
        Exception: Any error raised inside the training section is recorded
            on the task, the task is marked failed, and the error is
            re-raised unchanged.
    """
    # Fetch the embedding record and the task that tracks this job.
    embedding_object = Embedding.objects.get(pk=embedding_id)
    task_object = embedding_object.task

    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('training')
    show_progress.update_view(0)

    try:
        # Training configuration stored on the record.
        indices = get_indices_from_object(embedding_object)
        field_data = json.loads(embedding_object.fields)
        max_documents = embedding_object.max_documents
        use_phraser = embedding_object.use_phraser
        language = embedding_object.snowball_language

        # A lemmatizer is only used when a Snowball language is configured.
        lemmatizer = ElasticAnalyzer(language=language) if language else None

        # Iterator over the processed texts.
        query = json.loads(embedding_object.query)
        text_processor = TextProcessor(
            sentences=True,
            remove_stop_words=True,
            words_as_list=True,
            lemmatizer=lemmatizer,
        )
        sentences = ElasticSearcher(
            query=query,
            indices=indices,
            field_data=field_data,
            callback_progress=show_progress,
            scroll_limit=max_documents,
            text_processor=text_processor,
            output=ElasticSearcher.OUT_TEXT,
        )

        # Build the embedding wrapper and train it.
        embedding = embedding_object.get_embedding()
        embedding.train(sentences, use_phraser=use_phraser)

        # Drop database connections that are no longer usable after training.
        for connection in connections.all():
            connection.close_if_unusable_or_obsolete()

        # Persist the trained model.
        show_progress.update_step('saving')
        full_model_path, relative_model_path = embedding_object.generate_name("embedding")
        embedding.save(full_model_path)

        # FastText models additionally get a Facebook-format copy.
        if embedding_object.embedding_type == "FastTextEmbedding":
            fasttext_model = joblib.load(full_model_path)["embedding"]
            facebook_model_path = full_model_path + "_" + FACEBOOK_MODEL_SUFFIX
            gensim.models.fasttext.save_facebook_model(
                fasttext_model,
                facebook_model_path,
                encoding='utf-8',
            )

        # Record where the model lives and how large its vocabulary is.
        embedding_object.embedding_model.name = relative_model_path
        embedding_object.vocab_size = embedding.model.wv.vectors.shape[0]
        embedding_object.save()

        # Declare the job done.
        task_object.complete()
    except Exception as error:
        # Declare the job failed, then let the error propagate.
        task_object.add_error(str(error))
        task_object.update_status(Task.STATUS_FAILED)
        raise
    else:
        return True
def train_tagger_task(tagger_id: int):
    """Train the tagger with the given id and return its result summary.

    Collects training data through ``DataSample``, trains a ``TextTagger``
    (optionally backed by a ``W2VEmbedding``), saves the model and its
    evaluation plot, and returns the metrics as a plain dictionary.

    Args:
        tagger_id: Primary key of the ``Tagger`` record to train.

    Returns:
        dict: Keys ``id``, ``tagger_path``, ``precision``, ``recall``,
        ``f1_score``, ``num_features``, ``num_examples``,
        ``confusion_matrix``, ``model_size`` (megabytes), ``plot`` and
        ``classes``.

    Raises:
        Exception: Any error raised during training is logged, recorded on
            the task, the task is marked failed, and the error is re-raised.
    """
    info_logger = logging.getLogger(INFO_LOGGER)
    info_logger.info(f"Starting task 'train_tagger' for tagger with ID: {tagger_id}!")

    tagger_object = Tagger.objects.get(id=tagger_id)
    task_object = tagger_object.task

    try:
        # Progress reporting starts with the document-scrolling phase.
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling positives')
        show_progress.update_view(0)

        # Indices and fields to read the training data from.
        indices = get_indices_from_object(tagger_object)
        field_data = json.loads(tagger_object.fields)

        # Stop words are parsed by the load_stop_words helper.
        stop_words = load_stop_words(tagger_object.stop_words)
        ignore_numbers = tagger_object.ignore_numbers

        # "default" means no explicit scoring function is passed on.
        scoring_function = (
            tagger_object.scoring_function
            if tagger_object.scoring_function != "default"
            else None
        )
        logging.getLogger(INFO_LOGGER).info(f"Using scoring function: {scoring_function}.")

        # Load the embedding only when one is attached to the tagger.
        embedding = None
        if tagger_object.embedding:
            embedding = W2VEmbedding()
            embedding.load_django(tagger_object.embedding)

        # Gather positive and negative samples.
        data_sample = DataSample(
            tagger_object,
            indices=indices,
            field_data=field_data,
            show_progress=show_progress,
            snowball_language=tagger_object.snowball_language,
            detect_lang=tagger_object.detect_lang,
            balance=tagger_object.balance,
            balance_to_max_limit=tagger_object.balance_to_max_limit,
        )

        # Switch the progress display to the training phase.
        show_progress.update_step("training")
        show_progress.update_view(0)

        # Build and train the model.
        tagger = TextTagger(
            embedding=embedding,
            custom_stop_words=stop_words,
            ignore_numbers=ignore_numbers,
            classifier=tagger_object.classifier,
            vectorizer=tagger_object.vectorizer,
            analyzer=tagger_object.analyzer,
        )
        tagger.train(
            data_sample.data,
            pos_label=tagger_object.pos_label,
            field_list=field_data,
            scoring=scoring_function,
        )

        # Write the trained model to disk.
        tagger_full_path, relative_tagger_path = tagger_object.generate_name("tagger")
        tagger.save(tagger_full_path)

        # Save the image before its path.
        image_name = f'{secrets.token_hex(15)}.png'
        plot_content = create_tagger_plot(tagger.report.to_dict())
        tagger_object.plot.save(image_name, plot_content, save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name

        # Number of examples per class.
        num_examples = {label: len(examples) for label, examples in data_sample.data.items()}

        return {
            "id": tagger_id,
            "tagger_path": relative_tagger_path,
            "precision": float(tagger.report.precision),
            "recall": float(tagger.report.recall),
            "f1_score": float(tagger.report.f1_score),
            "num_features": tagger.report.num_features,
            "num_examples": num_examples,
            "confusion_matrix": tagger.report.confusion.tolist(),
            "model_size": round(float(os.path.getsize(tagger_full_path)) / 1000000, 1),  # bytes to mb
            "plot": str(image_path),
            "classes": tagger.report.classes,
        }
    except Exception as error:
        logging.getLogger(ERROR_LOGGER).exception(error)
        task_object.add_error(str(error))
        task_object.update_status(Task.STATUS_FAILED)
        raise error
def train_bert_tagger(tagger_id, testing=False):
    """Train (or continue training) the BERT tagger with the given id.

    When the record references a checkpoint model, that model is loaded and
    trained further; otherwise a new ``BertTagger`` is created. After
    training, the model, its plot, label index and evaluation statistics are
    stored on the record, the task is completed, and the model file from a
    previous training run (if one exists) is removed.

    Args:
        tagger_id: Primary key of the ``BertTaggerObject`` to train.
        testing: Not read by this function.

    Returns:
        bool: ``True`` once the task has been marked complete.

    Raises:
        Exception: Any error raised during training is logged, a truncated
            message is recorded on the task, the task is marked failed, and
            the error is re-raised unchanged.
    """
    # Fetch the tagger record.
    tagger_object = BertTaggerObject.objects.get(pk=tagger_id)

    # Remember the model file of a previous run so it can be removed on a retrain.
    if tagger_object.model:
        model_path = pathlib.Path(tagger_object.model.path)
    else:
        model_path = None

    task_object = tagger_object.task

    try:
        show_progress = ShowProgress(task_object, multiplier=1)

        # Fields and indices to read the training data from.
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)

        # Training starts from scratch unless a checkpoint model is found below.
        from_checkpoint = False
        checkpoint_model = tagger_object.checkpoint_model
        pos_label = tagger_object.pos_label

        # Gather positive and negative samples.
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit,
        )

        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # The averaging function depends on whether the data is binary.
        sklearn_avg_function = (
            choices.DEFAULT_SKLEARN_AVG_BINARY
            if data_sample.is_binary
            else choices.DEFAULT_SKLEARN_AVG_MULTICLASS
        )

        if checkpoint_model:
            # Continue training from the checkpoint model.
            logging.getLogger(INFO_LOGGER).info(
                f"Loading model from a checkpoint stored in '{tagger_object}'..."
            )
            # Use the same pre-trained BERT model as the checkpoint model.
            tagger_object.bert_model = checkpoint_model.bert_model
            tagger = checkpoint_model.load_tagger()
            # The number of classes may have changed since the checkpoint.
            tagger.sklearn_avg_function = sklearn_avg_function
            from_checkpoint = True
        else:
            # No checkpoint model: train a new model.
            logging.getLogger(INFO_LOGGER).info(
                "No checkpoint model detected, training a new model..."
            )
            # NB! saving pretrained models must be disabled!
            tagger = BertTagger(
                allow_standard_output=choices.DEFAULT_ALLOW_STANDARD_OUTPUT,
                autoadjust_batch_size=choices.DEFAULT_AUTOADJUST_BATCH_SIZE,
                sklearn_avg_function=sklearn_avg_function,
                use_gpu=tagger_object.use_gpu,
                save_pretrained=False,
                pretrained_models_dir=BERT_PRETRAINED_MODEL_DIRECTORY,
                logger=logging.getLogger(INFO_LOGGER),
                cache_dir=BERT_CACHE_DIR,
            )

        # Binary taggers use the state dict; other taggers train without a positive label.
        if data_sample.is_binary:
            tagger.config.use_state_dict = True
        else:
            tagger.config.use_state_dict = False
            pos_label = ""

        # Train the tagger and collect the result statistics.
        report = tagger.train(
            data_sample.data,
            from_checkpoint=from_checkpoint,
            pos_label=pos_label,
            n_epochs=tagger_object.num_epochs,
            max_length=tagger_object.max_length,
            batch_size=tagger_object.batch_size,
            lr=tagger_object.learning_rate,
            eps=tagger_object.eps,
            split_ratio=tagger_object.split_ratio,
            bert_model=tagger_object.bert_model,
        )

        # Drop database connections that are no longer usable after training.
        for connection in connections.all():
            connection.close_if_unusable_or_obsolete()

        # Save the tagger under a fresh random name and point the record at it.
        model_file_name = f'{tagger_object.MODEL_TYPE}_{tagger_id}_{secrets.token_hex(10)}'
        tagger_path = os.path.join(BERT_FINETUNED_MODEL_DIRECTORY, model_file_name)
        tagger.save(tagger_path)
        tagger_object.model.name = tagger_path

        # Save the evaluation plot.
        report_dict = report.to_dict()
        plot_name = f'{secrets.token_hex(15)}.png'
        tagger_object.plot.save(plot_name, create_tagger_plot(report_dict), save=False)

        # Save the label index.
        tagger_object.label_index = json.dumps(tagger.config.label_reverse_index)

        # Copy the scalar statistics from the report onto the record, in order.
        for stat_name in (
            "f1_score",
            "precision",
            "recall",
            "accuracy",
            "training_loss",
            "validation_loss",
        ):
            setattr(tagger_object, stat_name, getattr(report, stat_name))

        epoch_reports = [epoch_report.to_dict() for epoch_report in tagger.epoch_reports]
        tagger_object.epoch_reports = json.dumps(epoch_reports)

        example_counts = {label: len(examples) for label, examples in data_sample.data.items()}
        tagger_object.num_examples = json.dumps(example_counts)

        tagger_object.adjusted_batch_size = tagger.config.batch_size
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        # Persist the record and declare the job done.
        tagger_object.save()
        task_object.complete()

        # Remove the previous model file only after the record has been saved.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        # Take first 100 characters in case the error message is massive.
        error_message = f"{str(e)[:100]}..."
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
        raise
def train_torchtagger(tagger_id, testing=False):
    """Train the Torch tagger with the given id and store its results.

    Loads the tagger's embedding, gathers samples through ``DataSample``,
    trains a ``TorchTagger``, then stores the model, plot, label index and
    evaluation statistics on the record. After the task is completed, the
    model file from a previous training run (if one exists) is removed.

    Args:
        tagger_id: Primary key of the ``TorchTaggerObject`` to train.
        testing: Not read by this function.

    Returns:
        bool: ``True`` once the task has been marked complete.

    Raises:
        Exception: Any error raised during training is recorded on the task,
            the task is marked failed, and the error is re-raised unchanged.
            A failure while looking up the tagger record or its task
            propagates directly, since no task exists yet to record it on.
    """
    # The record and its task are looked up BEFORE the try block: the except
    # handler below uses task_object, so binding it inside the try would turn
    # a failed lookup into an UnboundLocalError that hides the real error.
    tagger_object = TorchTaggerObject.objects.get(pk=tagger_id)
    task_object = tagger_object.task

    try:
        # Remember the model file of a previous run so it can be removed on a retrain.
        model_path = pathlib.Path(tagger_object.model.path) if tagger_object.model else None
        model_type = TorchTaggerObject.MODEL_TYPE
        show_progress = ShowProgress(task_object, multiplier=1)

        # Fields and indices to read the training data from.
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)

        # Load the embedding attached to the tagger.
        embedding = W2VEmbedding()
        embedding.load_django(tagger_object.embedding)

        # Gather positive and negative samples.
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit
        )
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # Store the per-class example counts before training starts.
        num_examples = {k: len(v) for k, v in data_sample.data.items()}
        tagger_object.num_examples = json.dumps(num_examples)
        tagger_object.save()

        # Create the TorchTagger.
        tagger = TorchTagger(
            embedding,
            model_arch=tagger_object.model_architecture
        )

        # Train the tagger and collect the result statistics.
        report = tagger.train(
            data_sample.data,
            num_epochs=int(tagger_object.num_epochs),
            pos_label=tagger_object.pos_label
        )

        # Drop database connections that are no longer usable after training.
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Save the tagger under a fresh random name and point the record at it.
        tagger_path = os.path.join(
            RELATIVE_MODELS_PATH,
            model_type,
            f'{model_type}_{tagger_id}_{secrets.token_hex(10)}'
        )
        tagger.save(tagger_path)
        tagger_object.model.name = tagger_path

        # Save the evaluation plot.
        report_dict = report.to_dict()
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png', create_tagger_plot(report_dict), save=False)

        # Save the label index.
        tagger_object.label_index = json.dumps(tagger.label_reverse_index)

        # Copy the statistics from the report onto the record.
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.epoch_reports = json.dumps([a.to_dict() for a in tagger.epoch_reports])
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        # Persist the record and declare the job done.
        tagger_object.save()
        task_object.complete()

        # Remove the previous model file only after the record has been saved.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True
    except Exception as e:
        # Declare the job failed, then let the original error propagate.
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
def train_crf_task(crf_id: int):
    """Train the CRF extractor with the given id and return its result summary.

    Scrolls the documents selected by the extractor's query, trains a
    ``CRFExtractor`` (optionally with an embedding), saves the evaluation
    plot, and returns the metrics as a plain dictionary for the next task.

    Args:
        crf_id: Primary key of the ``CRFExtractorObject`` to train.

    Returns:
        dict: Keys ``id``, ``best_c_values``, ``extractor_path``,
        ``precision``, ``recall``, ``f1_score``, ``confusion_matrix``,
        ``model_size`` (megabytes) and ``plot``.

    Raises:
        Exception: Any error raised during training is passed to the task's
            ``handle_failed_task`` and then re-raised unchanged. A failure
            while looking up the extractor record or its task propagates
            directly, since no task exists yet to record it on.
    """
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'train_crf' for CRFExtractor with ID: {crf_id}!")

    # The record and its task are looked up BEFORE the try block: the except
    # handler below uses task_object, so binding it inside the try would turn
    # a failed lookup into an UnboundLocalError that hides the real error.
    crf_object = CRFExtractorObject.objects.get(id=crf_id)
    task_object = crf_object.task

    try:
        # Progress reporting starts with the document-scrolling phase.
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling documents')
        show_progress.update_view(0)

        # Indices and the MLP field to read the training data from.
        indices = get_indices_from_object(crf_object)
        mlp_field = crf_object.mlp_field

        # Load the embedding only when one is attached to the extractor.
        if crf_object.embedding:
            embedding = crf_object.embedding.get_embedding()
            embedding.load_django(crf_object.embedding)
        else:
            embedding = None

        # Scroll the documents.
        logging.getLogger(INFO_LOGGER).info(
            f"Scrolling data for CRFExtractor with ID: {crf_id}!")
        documents = ElasticSearcher(
            query=crf_object.get_query(),
            indices=indices,
            callback_progress=show_progress,
            text_processor=None,
            field_data=[mlp_field, "texta_facts"],
            output=ElasticSearcher.OUT_DOC,
            flatten=False
        )

        # Build the extractor configuration.
        config = crf_object.get_crf_config()

        logging.getLogger(INFO_LOGGER).info(
            f"Training the model for CRFExtractor with ID: {crf_id}!")

        # Create the extractor and train the CRF model.
        extractor = CRFExtractor(config=config, embedding=embedding)
        model_full_path, relative_model_path = crf_object.generate_name("crf")
        report, _ = extractor.train(documents, save_path=model_full_path, mlp_field=mlp_field)

        # Save the image before its path.
        image_name = f'{secrets.token_hex(15)}.png'
        crf_object.plot.save(image_name, create_tagger_plot(report.to_dict()), save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name

        # Pass the results to the next task.
        return {
            "id": crf_id,
            "best_c_values": extractor.best_c_values,
            "extractor_path": relative_model_path,
            "precision": float(report.precision),
            "recall": float(report.recall),
            "f1_score": float(report.f1_score),
            "confusion_matrix": report.confusion.tolist(),
            "model_size": round(float(os.path.getsize(model_full_path)) / 1000000, 1),  # bytes to mb
            "plot": str(image_path),
        }
    except Exception as e:
        # Record the failure on the task, then let the original error propagate.
        task_object.handle_failed_task(e)
        raise