def __init__(self, tasks: list, result_filename: str, detok=True,
             model_filename=None, just_last=False):
    self.tasks = tasks
    self.DETOK = 'detok_' if detok else ''
    self._DETOK = '_detok' if detok else ''
    for task in tasks:
        model_service = ModelService(task)
        self.tokenizer = model_service.get_tokenizer()
        task = self.getAllMetrics(task)
        fm.to_pickle(task, loc.abs_path([loc.TMP, task.id + ".pickle"]))
    if just_last:
        best_partial = self.find_last_configuration(tasks, COMPARATOR_MODE.PARTIAL)
        best_strict = self.find_last_configuration(tasks, COMPARATOR_MODE.STRICT)
    else:
        best_partial = self.find_best_configuration(tasks, COMPARATOR_MODE.PARTIAL)
        best_strict = self.find_best_configuration(tasks, COMPARATOR_MODE.STRICT)
    print(f"\n\nBEST PARTIAL:\n\n{self.best_to_str(best_partial)}")
    print(f"\n\nBEST STRICT:\n\n{self.best_to_str(best_strict)}")
    best = Best(best_strict, best_partial)
    fm.to_pickle(best, loc.abs_path([loc.TMP, result_filename]))
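# Illustrative sketch (not the project's code) of the best-configuration
# selection the constructor delegates to find_best_configuration: pick the
# task whose chosen F1 flavour (strict or partial) is highest. The helper
# names (pick_best, f1_of) and the toy metric dicts are hypothetical.
def f1_of(metrics: dict, mode: str) -> float:
    return metrics[mode]['f1score']

def pick_best(all_metrics: list, mode: str) -> dict:
    return max(all_metrics, key=lambda m: f1_of(m, mode))

runs = [
    {'strict': {'f1score': 0.61}, 'partial': {'f1score': 0.72}},
    {'strict': {'f1score': 0.64}, 'partial': {'f1score': 0.70}},
]
assert pick_best(runs, 'strict') is runs[1]
assert pick_best(runs, 'partial') is runs[0]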
def __init__(self, task: Task):
    super(DirksonTaskLoader, self).__init__(task)
    LOG.info('dataset export in progress...')
    split_svc = SplitService()
    self.task.split = split_svc.load_split(self.task)
    train = self.load(task.split.train)
    validation = self.load(task.split.validation)
    train_full = train + validation
    test = self.load(task.split.test)
    all_doc_text = [[y.text for y in x.doc.tokens] for x in train_full]
    all_labels = [x.tags for x in train_full]
    prep_text, prep_labels = self.preprocess_text(all_doc_text, all_labels)
    self.save(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON,
            task.corpus.name + loc.DIRKSON_VALIDATION_TXT
        ]), prep_text, prep_labels)
    all_doc_text = [[y.text for y in x.doc.tokens] for x in test]
    all_labels = [x.tags for x in test]
    prep_text, prep_labels = self.preprocess_text(all_doc_text, all_labels)
    self.save(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON,
            task.corpus.name + loc.DIRKSON_TEST_TXT
        ]), prep_text, prep_labels)
    LOG.info('dataset exported successfully!')
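# Minimal sketch of the two-column token/tag layout that save() presumably
# writes and that the loaders below read back: one 'token TAG' pair per line,
# with a blank line between sentences. The helper name and file name are
# illustrative.
def save_conll(filepath: str, texts: list, labels: list) -> None:
    with open(filepath, 'w', encoding='utf8') as out:
        for tokens, tags in zip(texts, labels):
            for token, tag in zip(tokens, tags):
                out.write(f'{token} {tag}\n')
            out.write('\n')  # sentence separator

save_conll('example.txt', [['I', 'felt', 'dizzy']], [['O', 'O', 'B']])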
def __init__(self, corpus_name: str):
    # parse the exported test file: one 'token TAG' pair per line,
    # blank lines separate sentences
    all_doc_text = []
    all_labels = []
    temp_text = ""
    temp_labels = []
    with open(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON,
                corpus_name + loc.DIRKSON_TEST_TXT
            ]), "r", encoding="utf8") as f:
        for line in f:
            if line == "\n":
                all_doc_text.append(temp_text[1:])  # drop the leading space
                all_labels.append(temp_labels)
                temp_text = ""
                temp_labels = []
            else:
                temp_text = temp_text + " " + line[0:-3]  # strip the ' X\n' tail
                temp_labels.append(line[-2:-1])  # the one-character tag
    model = SequenceTagger.load_from_file(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, "best-model.pt"
        ]))
    doc_text = []
    truth_labels = []
    pred_labels = []
    for text, labels in tqdm(zip(all_doc_text, all_labels), total=len(all_doc_text)):
        sentence = Sentence(text)
        model.predict(sentence)
        pred = self.fromTaggedToIOB(sentence.to_tagged_string())
        doc_text.append(text.split(" "))
        truth_labels.append(labels)
        pred_labels.append(pred)
        if len(pred) != len(labels):  # flag truth/prediction misalignments
            print(text)
            print(sentence)
    df = pd.DataFrame({
        'doc_text': doc_text,
        'labels': truth_labels,
        'pred_labels': pred_labels
    })
    df.to_pickle(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON,
            corpus_name + loc.DIRKSON_TEST_RESULTS_PICKLE
        ]))
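# The slicing above assumes each data line is exactly 'token X\n' with a
# one-character IOB tag X: line[0:-3] strips the trailing ' X\n' to recover
# the token, and line[-2:-1] picks out the tag. A quick check of that
# assumption on a sample line:
line = 'dizzy B\n'
assert line[0:-3] == 'dizzy'
assert line[-2:-1] == 'B'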
def __init__(self, corpus_name: str):
    corpus = NLPTaskDataFetcher.load_column_corpus(
        loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON]),
        {0: 'text', 1: 'ner'},
        train_file=corpus_name + loc.DIRKSON_VALIDATION_TXT,
        test_file=corpus_name + loc.DIRKSON_TEST_TXT)
    embedding_types = [
        BertEmbeddings('bert-base-uncased'),
        FlairEmbeddings('mix-forward'),
        FlairEmbeddings('mix-backward')
    ]
    tag_type = 'ner'
    embeddings = StackedEmbeddings(embeddings=embedding_types)
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    model_dir = loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name])
    if not path.exists(model_dir):
        os.mkdir(model_dir)
    trainer.train(model_dir,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
    plotter = Plotter()
    plotter.plot_training_curves(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, loc.LOSS_TSV
        ]))
    plotter.plot_weights(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, loc.WEIGHTS_TXT
        ]))
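# A sturdier directory-creation idiom than the exists/mkdir pair above; it is
# idempotent and avoids the race between the check and the mkdir call (the
# path here is illustrative):
import os
os.makedirs('/tmp/dirkson-demo/some-corpus', exist_ok=True)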
def run_handler(self, run_path):
    runs = fm.from_json(loc.abs_path([loc.ASSETS, loc.RUNS, run_path]))
    for task in runs:
        random_seed = int(task['train_config']['random_seed'])
        self.set_all_seed(random_seed)
        train_config = TrainConfig(
            int(task['train_config']['max_patience']),
            float(task['train_config']['learning_rate']),
            float(task['train_config']['dropout']),
            int(task['train_config']['epochs']),
            random_seed,
            float(task['train_config']['epsilon']))
        task = Task(task['id'], task['split_folder'],
                    enums_by_list(TIDY_MODE, task['tidy_modes']),
                    enum_by_name(CORPUS, task['corpus']),
                    enum_by_name(NOTATION, task['notation']),
                    enum_by_name(MODEL, task['model']),
                    enum_by_name(ARCHITECTURE, task['architecture']),
                    enums_by_list(ANNOTATION_TYPE, task['goal']),
                    enum_by_name(TRAIN_MODE, task['train_mode']),
                    train_config)
        if task.model == MODEL.DIRKSON:
            self.dirkson_task_handler(task)
        else:
            self.bert_task_handler(task)
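# Sketch of the run-definition JSON this handler expects, reconstructed from
# the fields read above. Every value shown is illustrative, and each enum
# string must match a member of CORPUS, NOTATION, MODEL, ARCHITECTURE,
# ANNOTATION_TYPE, TIDY_MODE, or TRAIN_MODE respectively.
run_example = [{
    'id': 'demo-run-001',
    'split_folder': 'split_0',
    'tidy_modes': ['<TIDY_MODE member>'],
    'corpus': '<CORPUS member>',
    'notation': '<NOTATION member>',
    'model': 'BASE_UNCASED',
    'architecture': '<ARCHITECTURE member>',
    'goal': ['<ANNOTATION_TYPE member>'],
    'train_mode': '<TRAIN_MODE member>',
    'train_config': {
        'max_patience': 5,
        'learning_rate': 5e-05,
        'dropout': 0.1,
        'epochs': 40,
        'random_seed': 42,
        'epsilon': 1e-08
    }
}]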
def load_obj(self, name):
    with open(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, loc.OBJ_LEX,
                f'{name}.pkl'
            ]), 'rb') as f:
        return pickle.load(f, encoding='latin1')
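# encoding='latin1' is the standard trick for reading Python 2 pickles under
# Python 3, which is presumably why it is used for the shipped lexicon
# objects. Round-trip sketch with a stand-in object and file name:
import pickle
with open('demo.pkl', 'wb') as fh:
    pickle.dump({'oov': 'iv'}, fh)
with open('demo.pkl', 'rb') as fh:
    assert pickle.load(fh, encoding='latin1') == {'oov': 'iv'}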
def loadEnglishToAmericanDict(self):
    # the two lexicon files are line-aligned: line i of the English file
    # maps to line i of the American file
    etoa = {}
    with open(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, loc.OBJ_LEX,
                'englishspellings.txt'
            ])) as english, open(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, loc.OBJ_LEX,
                'americanspellings.txt'
            ])) as american:
        for line in english:
            etoa[line.strip()] = american.readline().strip()
    return etoa
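# The same line-alignment contract, shown on in-memory lines (file contents
# here are illustrative):
english_lines = ['colour\n', 'organise\n']
american_lines = ['color\n', 'organize\n']
etoa = {e.strip(): a.strip() for e, a in zip(english_lines, american_lines)}
assert etoa == {'colour': 'color', 'organise': 'organize'}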
class MODEL(str, enum.Enum):
    SPAN_BERT_CASED = 'SpanBERT/spanbert-base-cased'
    BIO_BERT = 'dmis-lab/biobert-v1.1'
    BIO_BERT_GIT = loc.abs_path([loc.TMP, loc.BIO_BERT_GIT])
    SCI_BERT = 'allenai/scibert_scivocab_cased'
    BIO_CLINICAL_BERT = 'emilyalsentzer/Bio_ClinicalBERT'
    BERT_TWEET = 'vinai/bertweet-base'
    PUB_MED_BERT = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
    BASE_UNCASED = 'bert-base-uncased'
    DIRKSON = 'dirkson'
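# Because MODEL subclasses str, its members can be passed wherever a
# Hugging Face model name is expected; ModelService presumably does something
# along these lines (a sketch, requiring the transformers package):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL.BASE_UNCASED.value)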
def __init__(self, args):
    try:
        if not path.exists(loc.abs_path([loc.TMP, loc.BIO_BERT_GIT])):
            ModelService.get_bio_git_model()
    except Exception:
        pass  # best-effort download: offline runs proceed without the git model
    if args.import_ds:
        self.import_handler()
    elif args.run is not None:
        self.run_handler(args.run[0])
    elif args.clean:
        self.clean_handler()
    else:
        self.default_handler()
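# The flags consumed above suggest an argparse setup along these lines
# (help strings and the sample argv are illustrative):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--import_ds', action='store_true', help='export the datasets')
parser.add_argument('--run', nargs=1, help='run-definition JSON under assets/runs')
parser.add_argument('--clean', action='store_true', help='remove temporary artifacts')
args = parser.parse_args(['--clean'])
assert args.clean and args.run is None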
def loadDictionaryData(self):
    """Load the lexicon dictionaries used to map out-of-vocabulary (OOV)
    tokens to in-vocabulary (IV) forms."""
    n_dict = {}
    with open(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, loc.OBJ_LEX,
                'noslang_mod.txt'
            ])) as infile:
        for line in infile:
            items = line.split(' - ')
            if len(items[0]) > 0 and len(items) > 1:
                n_dict[items[0].strip()] = items[1].strip()
    return n_dict
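# The parser above expects one 'slang - expansion' pair per line, split on
# ' - '. A toy line illustrates the contract (the content is illustrative):
line = 'u - you\n'
items = line.split(' - ')
assert items[0].strip() == 'u' and items[1].strip() == 'you'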
def compare_dirkson(corpus_name: str):
    df = pd.read_pickle(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON,
            corpus_name + loc.DIRKSON_TEST_RESULTS_PICKLE
        ]))
    all_true = list(df.labels.values)
    all_pred = list(df.pred_labels.values)
    evaluator = Evaluator(all_true, all_pred, [""])
    results, results_agg = evaluator.evaluate()
    fm.to_json(
        results,
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON,
            corpus_name + "-results.json"
        ]))
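# Hand-rolled illustration of the strict matching scheme behind the
# Evaluator's 'strict' results: a predicted span counts as correct only if
# its boundaries match a gold span exactly. The helper is a simplification
# for untyped single-letter IOB tags.
def spans(tags):
    found, start = [], None
    for i, t in enumerate(tags + ['O']):  # sentinel flushes an open span
        if t == 'B':
            if start is not None:
                found.append((start, i))
            start = i
        elif t != 'I' and start is not None:
            found.append((start, i))
            start = None
    return found

true, pred = ['O', 'B', 'I', 'O'], ['O', 'B', 'O', 'O']
correct = len(set(spans(true)) & set(spans(pred)))
precision, recall = correct / len(spans(pred)), correct / len(spans(true))
assert (precision, recall) == (0.0, 0.0)  # boundaries differ, strict match fails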
def __init__(self, task: Task):
    self.task = task
    self.TEST_ONLY = task.train_mode == TRAIN_MODE.JUST_TESTING
    if task.train_mode == TRAIN_MODE.VALIDATION:
        self.train_dataset = task.split.to_tensor_dataset(task.split.train)
        self.validation_dataset = task.split.to_tensor_dataset(task.split.validation)
    elif task.train_mode == TRAIN_MODE.TESTING:
        self.train_dataset = task.split.to_tensor_dataset(task.split.train + task.split.validation)
        self.validation_dataset = task.split.to_tensor_dataset(task.split.test)
    elif task.train_mode == TRAIN_MODE.JUST_TESTING:
        self.train_dataset = None
        self.validation_dataset = task.split.to_tensor_dataset(task.split.test)

    LOG.info('cuda selection...')
    if torch.cuda.is_available():
        self.device = torch.device('cuda')
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        self.device = torch.device('cpu')
    pd.set_option('precision', 2)

    LOG.info('init model in progress...')
    model_svc = ModelService(task)
    self.tokenizer = model_svc.get_tokenizer()
    self.config = model_svc.get_config()
    model_classes = {
        ARCHITECTURE.BERT_WRAPPER: Bert_wrapper,
        ARCHITECTURE.BERT_CRF: Bert_CRF,
        ARCHITECTURE.BERT_LSTM: Bert_LSTM,
    }
    MODEL_CLASS = model_classes[task.architecture]
    if self.TEST_ONLY:
        # test-only runs load pretrained weights instead of training from scratch
        self.model = MODEL_CLASS.from_pretrained(task.model.value, config=self.config)
    else:
        self.model = MODEL_CLASS(self.config)
    self.model.to(self.device)  # runs the model on the GPU (if available)
    LOG.info('init completed successfully!')

    train_dataloader, validation_dataloader = self.make_dataloaders(self.train_dataset, self.validation_dataset)
    LOG.info('Dataloader fitted')
    val_df = self.init_val_df(self.validation_dataset)
    (val_df, df) = self.do_train_val(train_dataloader, validation_dataloader, val_df)
    LOG.info('Train/Val/Test completed successfully!')

    if not self.TEST_ONLY and task.train_mode != TRAIN_MODE.VALIDATION:
        LOG.info('model weights caching in progress...')
        if not path.exists(loc.abs_path([loc.TMP, loc.MODELS])):
            os.mkdir(loc.abs_path([loc.TMP, loc.MODELS]))
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(loc.abs_path([loc.TMP, loc.MODELS, self.task.id]))
        LOG.info('model weights saved successfully!')

    LOG.info('results export in progress...')
    try:
        val_df = self.add_detok_preds(val_df)
        LOG.info('detokenized successfully!')
    except Exception:
        LOG.warning('detokenization issue!')
    self.task.val_df = val_df
    self.task.df = df
    fm.to_pickle(self.task, loc.abs_path([loc.TMP, self.task.id + '.pickle']))
    LOG.info('export completed successfully!')
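# The TEST_ONLY branch pairs with the weight caching at the end of training:
# weights written by save_pretrained() can later be restored with
# from_pretrained(). Minimal transformers sketch (model class, label count
# and directory are illustrative):
from transformers import BertConfig, BertForTokenClassification

config = BertConfig(num_labels=3)
model = BertForTokenClassification(config)
model.save_pretrained('/tmp/demo-weights')
restored = BertForTokenClassification.from_pretrained('/tmp/demo-weights')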
def make_graphics(metrics_strict_df, metrics_partial_df, TYPE, NOTATION,
                  LR, DROPOUT, prefix, asset_folder):
    title_suffix = f'{TYPE}, {NOTATION}, lr: {LR}, dropout: {DROPOUT}'

    def write_figure(title, filename, df, traces):
        # traces: (column, legend name, color) triples; a column may also be
        # given as an explicit (frame, column) pair
        fig = go.Figure()
        for column, name, color in traces:
            frame, col = column if isinstance(column, tuple) else (df, column)
            fig.add_trace(go.Scatter(x=frame['epoch'], y=frame[col],
                                     name=name,
                                     line=dict(color=color, width=2)))
        fig.update_layout(title=title, xaxis_title='Epochs',
                          yaxis_title='Value')
        # fig.show()
        fig.write_html(loc.abs_path([loc.ASSETS, asset_folder, filename]))

    count_traces = [('correct', 'Correct', '#06D6A0'),
                    ('incorrect', 'Incorrect', '#ef476f'),
                    ('partial', 'Partial', '#118AB2'),
                    ('missed', 'Missed', '#f78c6b'),
                    ('spurious', 'Spurious', '#FFD166'),
                    ('possible', 'Possible', '#073B4C'),
                    ('actual', 'Actual', 'mediumorchid')]
    score_traces = [('precision', 'Precision', '#f4a261'),
                    ('recall', 'Recall', '#e76f51'),
                    ('f1score', 'F1-Score', '#2a9d8f')]
    # the losses are logged once per epoch, so both score figures read them
    # from the strict frame
    loss_traces = [((metrics_strict_df, 'training_loss'), 'Training Loss', '#ef476f'),
                   ((metrics_strict_df, 'validation_loss'), 'Validation Loss', 'mediumorchid')]

    write_figure('Strict metrics - ' + title_suffix,
                 f'{prefix}___strict___{TYPE}.html', metrics_strict_df,
                 [(f'str_{c}', n, clr) for c, n, clr in count_traces])
    write_figure('Strict metrics - ' + title_suffix,
                 f'{prefix}___strict_prec_rec_f1___{TYPE}.html', metrics_strict_df,
                 [(f'str_{c}', n, clr) for c, n, clr in score_traces] + loss_traces)
    write_figure('Partial metrics - ' + title_suffix,
                 f'{prefix}___partial___{TYPE}.html', metrics_partial_df,
                 [(f'prt_{c}', n, clr) for c, n, clr in count_traces])
    write_figure('Partial metrics - ' + title_suffix,
                 f'{prefix}___partial_prec_rec_f1___{TYPE}.html', metrics_partial_df,
                 [(f'prt_{c}', n, clr) for c, n, clr in score_traces] + loss_traces)
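# Toy invocation sketch: one frame per matching scheme, carrying the columns
# read above (all values are illustrative; the assets/plots folder is assumed
# to exist):
import pandas as pd

cols = ['correct', 'incorrect', 'partial', 'missed', 'spurious',
        'possible', 'actual', 'precision', 'recall', 'f1score']
strict_df = pd.DataFrame({'epoch': [1, 2, 3],
                          **{f'str_{c}': [0, 0, 0] for c in cols},
                          'training_loss': [0.9, 0.5, 0.3],
                          'validation_loss': [1.0, 0.6, 0.4]})
partial_df = strict_df.rename(columns=lambda c: c.replace('str_', 'prt_'))
make_graphics(strict_df, partial_df, 'ADR', 'IOB', '5e-05', '0.1',
              'demo', 'plots')  # writes the four HTML files named above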