Exemplo n.º 1
0
    def __init__(self,
                 tasks: list,
                 result_filename: str,
                 detok=True,
                 model_filename=None,
                 just_last=False):
        """Compute metrics for every task, select the best strict/partial
        configurations and persist the aggregate result as a pickle.

        Args:
            tasks: tasks to evaluate.
            result_filename: pickle name for the aggregated ``Best`` result.
            detok: whether detokenized artifact markers should be used.
            model_filename: unused here; kept for caller compatibility.
            just_last: pick the last configuration instead of the best one.
        """
        self.tasks = tasks
        # Markers used elsewhere to address detokenized artifacts.
        self.DETOK = 'detok_' if detok else ''
        self._DETOK = '_detok' if detok else ''

        for current in tasks:
            # NOTE: the tokenizer of the last task wins, as in the original.
            self.tokenizer = ModelService(current).get_tokenizer()
            current = self.getAllMetrics(current)
            fm.to_pickle(current, loc.abs_path([loc.TMP, current.id + ".pickle"]))

        # Same selector for both comparator modes.
        finder = (self.find_last_configuration
                  if just_last else self.find_best_configuration)
        best_partial = finder(tasks, COMPARATOR_MODE.PARTIAL)
        best_strict = finder(tasks, COMPARATOR_MODE.STRICT)

        print(f"\n\nBEST PARTIAL:\n\n{self.best_to_str(best_partial)}")
        print(f"\n\nBEST STRICT:\n\n{self.best_to_str(best_strict)}")
        fm.to_pickle(Best(best_strict, best_partial),
                     loc.abs_path([loc.TMP, result_filename]))
Exemplo n.º 2
0
    def __init__(self, task: Task):
        """Load the task's splits, preprocess them and export the
        Dirkson-format text files for this corpus."""
        super(DirksonTaskLoader, self).__init__(task)
        LOG.info('dataset export in progress...')

        self.task.split = SplitService().load_split(self.task)

        train = self.load(task.split.train)
        validation = self.load(task.split.validation)
        test = self.load(task.split.test)

        def export(samples, suffix):
            # Flatten docs into token texts + tag sequences, preprocess,
            # then write next to the other Dirkson model assets.
            texts = [[token.text for token in sample.doc.tokens]
                     for sample in samples]
            labels = [sample.tags for sample in samples]
            prep_text, prep_labels = self.preprocess_text(texts, labels)
            self.save(
                loc.abs_path([
                    loc.ASSETS, loc.MODELS, loc.DIRKSON,
                    task.corpus.name + suffix
                ]), prep_text, prep_labels)

        # The "validation" file deliberately contains train + validation.
        export(train + validation, loc.DIRKSON_VALIDATION_TXT)
        export(test, loc.DIRKSON_TEST_TXT)

        LOG.info('dataset exported successfully!...')
Exemplo n.º 3
0
    def __init__(self, corpus_name: str):
        """Run the trained Dirkson tagger over the exported test file and
        pickle a DataFrame with texts, gold labels and predictions.
        """
        # Exported test file: one "token<space>LABEL" pair per line, sentences
        # separated by blank lines (see the loader that writes this file).
        f = open(loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON,
            corpus_name + loc.DIRKSON_TEST_TXT
        ]),
                 "r",
                 encoding="utf8")

        all_doc_text = []
        all_labels = []

        temp_text = ""
        temp_labels = []
        for line in f:
            if line == "\n":
                # Sentence boundary: flush the accumulators.
                # [1:] drops the leading space added by the concatenation below.
                all_doc_text.append(temp_text[1:])
                all_labels.append(temp_labels)
                temp_text = ""
                temp_labels = []
            else:
                # line[0:-3] strips " X\n" (space + label + newline);
                # line[-2:-1] is the single-character label.
                # NOTE(review): assumes labels are exactly one char — confirm
                # against the file writer.
                temp_text = temp_text + " " + line[0:-3]
                temp_labels.append(line[-2:-1])

        model = SequenceTagger.load_from_file(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name,
                "best-model.pt"
            ]))

        doc_text = []
        truth_labels = []
        pred_labels = []

        for text, labels in tqdm(zip(all_doc_text, all_labels),
                                 total=len(all_doc_text)):
            sentence = Sentence(text)
            model.predict(sentence)
            # Convert Flair's tagged string back to per-token IOB labels.
            pred = self.fromTaggedToIOB(sentence.to_tagged_string())
            doc_text.append(text.split(" "))
            truth_labels.append(labels)
            pred_labels.append(pred)

            # Length mismatch means tokenization drifted; print for debugging.
            if len(pred) != len(labels):
                print(text)
                print(sentence)

        df = pd.DataFrame({
            'doc_text': doc_text,
            'labels': truth_labels,
            'pred_labels': pred_labels
        })

        df.to_pickle(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON,
                corpus_name + loc.DIRKSON_TEST_RESULTS_PICKLE
            ]))
Exemplo n.º 4
0
    def __init__(self, corpus_name: str):
        """Train a Flair SequenceTagger (BERT + Flair embeddings, CRF) on the
        exported Dirkson column corpus and plot the training curves/weights.
        """
        corpus = NLPTaskDataFetcher.load_column_corpus(
            loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON]), {
                0: 'text',
                1: 'ner'
            },
            train_file=corpus_name + loc.DIRKSON_VALIDATION_TXT,
            test_file=corpus_name + loc.DIRKSON_TEST_TXT)

        # BERT token embeddings stacked with bidirectional Flair char LMs.
        embedding_types = [
            BertEmbeddings('bert-base-uncased'),
            FlairEmbeddings('mix-forward'),
            FlairEmbeddings('mix-backward')
        ]

        tag_type = 'ner'
        embeddings = StackedEmbeddings(embeddings=embedding_types)
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=True)

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        model_dir = loc.abs_path(
            [loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name])
        # BUG FIX: the original tested `path.exists` (the function object,
        # which is always truthy) instead of calling it, so the output
        # directory was never created here.
        if not path.exists(model_dir):
            os.mkdir(model_dir)
        trainer.train(model_dir,
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150)

        plotter = Plotter()
        plotter.plot_training_curves(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, loc.LOSS_TSV
            ]))
        plotter.plot_weights(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name,
                loc.WEIGHTS_TXT
            ]))
Exemplo n.º 5
0
    def run_handler(self, run_path):
        """Execute every task described in the run JSON file at *run_path*."""
        run_spec = fm.from_json(loc.abs_path([loc.ASSETS, loc.RUNS, run_path]))
        for entry in run_spec:
            cfg = entry['train_config']
            random_seed = int(cfg['random_seed'])
            # Seed everything before any model/task construction.
            self.set_all_seed(random_seed)

            train_config = TrainConfig(int(cfg['max_patience']),
                                       float(cfg['learning_rate']),
                                       float(cfg['dropout']),
                                       int(cfg['epochs']), random_seed,
                                       float(cfg['epsilon']))

            task = Task(entry['id'], entry['split_folder'],
                        enums_by_list(TIDY_MODE, entry['tidy_modes']),
                        enum_by_name(CORPUS, entry['corpus']),
                        enum_by_name(NOTATION, entry['notation']),
                        enum_by_name(MODEL, entry['model']),
                        enum_by_name(ARCHITECTURE, entry['architecture']),
                        enums_by_list(ANNOTATION_TYPE, entry['goal']),
                        enum_by_name(TRAIN_MODE, entry['train_mode']),
                        train_config)

            # Dirkson has its own pipeline; everything else goes through BERT.
            if task.model == MODEL.DIRKSON:
                self.dirkson_task_handler(task)
            else:
                self.bert_task_handler(task)
Exemplo n.º 6
0
 def load_obj(self, name):
     """Unpickle the lexicon object *name* from the Dirkson OBJ_LEX folder."""
     obj_path = loc.abs_path([
         loc.ASSETS, loc.MODELS, loc.DIRKSON, loc.OBJ_LEX, f'{name}.pkl'
     ])
     # latin1 keeps Python-2-era pickles loadable under Python 3.
     with open(obj_path, 'rb') as handle:
         return pickle.load(handle, encoding='latin1')
Exemplo n.º 7
0
    def loadEnglishToAmericanDict(self):
        """Build a British->American spelling map from the paired lexicon
        files (line *i* of each file refers to the same word).

        Returns:
            dict: british spelling -> american spelling.
        """
        def _lex_path(filename):
            # Both lexicons live in the Dirkson OBJ_LEX asset folder.
            return loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, loc.OBJ_LEX, filename
            ])

        etoa = {}
        # FIX: the original never closed either file handle; context managers
        # close them deterministically. zip() pairs the files line by line,
        # exactly like the previous english-loop + american.readline().
        with open(_lex_path('englishspellings.txt')) as english, \
                open(_lex_path('americanspellings.txt')) as american:
            for eng_line, amer_line in zip(english, american):
                etoa[eng_line.strip()] = amer_line.strip()
        return etoa
Exemplo n.º 8
0
class MODEL(str, enum.Enum):
    """Selectable models: HuggingFace hub identifiers, a locally cached
    BioBERT checkout, and the Dirkson pipeline sentinel.

    Subclassing ``str`` lets members be passed directly wherever a model
    name string is expected (e.g. ``from_pretrained``).
    """
    SPAN_BERT_CASED = 'SpanBERT/spanbert-base-cased'
    BIO_BERT = 'dmis-lab/biobert-v1.1'
    # Local path: resolved at import time to the git-downloaded BioBERT copy.
    BIO_BERT_GIT = loc.abs_path([loc.TMP, loc.BIO_BERT_GIT])
    SCI_BERT = 'allenai/scibert_scivocab_cased'
    BIO_CLINICAL_BERT = 'emilyalsentzer/Bio_ClinicalBERT'
    BERT_TWEET = 'vinai/bertweet-base'
    PUB_MED_BERT = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
    BASE_UNCASED = 'bert-base-uncased'
    # Not a HuggingFace id: routes the task to the Dirkson handler instead.
    DIRKSON = 'dirkson'
Exemplo n.º 9
0
 def __init__(self, args):
     """Dispatch to the handler selected by the parsed CLI arguments.

     Args:
         args: parsed argparse namespace with ``import_ds``, ``run`` and
             ``clean`` flags.
     """
     try:
         # Best-effort: pre-download the BioBERT git model if it is missing;
         # a failure here must not prevent the requested command from running.
         if not path.exists(loc.abs_path([loc.TMP, loc.BIO_BERT_GIT])):
             ModelService.get_bio_git_model()
     except Exception:
         # FIX: was a bare `except:`, which also swallowed
         # SystemExit/KeyboardInterrupt.
         pass
     if args.import_ds:
         self.import_handler()
     elif args.run is not None:
         self.run_handler(args.run[0])
     elif args.clean:
         self.clean_handler()
     else:
         self.default_handler()
Exemplo n.º 10
0
 def loadDictionaryData(self):
     """Load the noslang dictionary used to map OOV tokens to in-vocabulary
     equivalents.

     Returns:
         dict: slang term -> expansion, parsed from ``noslang_mod.txt``
         (one ``term - expansion`` pair per line).
     """
     n_dict = {}
     # FIX: use a context manager so the file handle is always closed
     # (the original leaked it).
     with open(
             loc.abs_path([
                 loc.ASSETS, loc.MODELS, loc.DIRKSON, loc.OBJ_LEX,
                 'noslang_mod.txt'
             ])) as infile:
         for line in infile:
             items = line.split(' - ')
             # Skip malformed lines: empty term or no " - " separator.
             if len(items[0]) > 0 and len(items) > 1:
                 n_dict[items[0].strip()] = items[1].strip()
     return n_dict
Exemplo n.º 11
0
    def compare_dirkson(corpus_name: str):
        """Evaluate the stored Dirkson predictions against the gold labels
        and write the evaluation results to a JSON file."""
        df = pd.read_pickle(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON,
                corpus_name + loc.DIRKSON_TEST_RESULTS_PICKLE
            ]))

        # Materialize the object arrays as plain lists for the evaluator.
        all_true = list(df.labels.values)
        all_pred = list(df.pred_labels.values)

        evaluator = Evaluator(all_true, all_pred, [""])
        results, results_agg = evaluator.evaluate()
        fm.to_json(
            results,
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON,
                corpus_name + "-results.json"
            ]))
Exemplo n.º 12
0
    def __init__(self, task: Task):
        """Build datasets, model and device for *task*, run train/validation,
        optionally cache the trained weights, and export results as a pickle.
        """
        self.task = task
        # JUST_TESTING = evaluate a previously trained model, no training.
        self.TEST_ONLY = task.train_mode == TRAIN_MODE.JUST_TESTING
        # Dataset selection depends on the train mode:
        #  - VALIDATION: train on train, validate on validation
        #  - TESTING: train on train+validation, validate on test
        #  - JUST_TESTING: no training set, validate on test
        if task.train_mode == TRAIN_MODE.VALIDATION:
            self.train_dataset = task.split.to_tensor_dataset(task.split.train)
            self.validation_dataset = task.split.to_tensor_dataset(task.split.validation)
        elif task.train_mode == TRAIN_MODE.TESTING:
            self.train_dataset = task.split.to_tensor_dataset(task.split.train + task.split.validation)
            self.validation_dataset = task.split.to_tensor_dataset(task.split.test)
        elif task.train_mode == TRAIN_MODE.JUST_TESTING:
            self.train_dataset = None
            self.validation_dataset = task.split.to_tensor_dataset(task.split.test)

        LOG.info('cuda selection...')
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('We will use the GPU:', torch.cuda.get_device_name(0))
        else:
            print('No GPU available, using the CPU instead.')
            self.device = torch.device('cpu')
        # Display floats with 2 decimals in pandas output.
        pd.set_option('precision', 2)

        LOG.info('init model in progress...')
        model_svc = ModelService(task)
        self.tokenizer = model_svc.get_tokenizer()
        self.config = model_svc.get_config()

        # Map the task architecture to its wrapper class.
        model_classes = {
            ARCHITECTURE.BERT_WRAPPER: Bert_wrapper,
            ARCHITECTURE.BERT_CRF: Bert_CRF,
            ARCHITECTURE.BERT_LSTM: Bert_LSTM,
        }

        MODEL_CLASS = model_classes[task.architecture]

        if self.TEST_ONLY:
            # Load previously trained weights instead of a fresh model.
            self.model = MODEL_CLASS.from_pretrained(task.model.value, config=self.config)
        else:
            self.model = MODEL_CLASS(self.config)

        self.model.to(self.device) # Runs the model on the GPU (if available)

        LOG.info('init completed successfully!')

        train_dataloader, validation_dataloader = self.make_dataloaders(self.train_dataset,
                                                                        self.validation_dataset)
        LOG.info('Dataloader fitted')

        val_df = self.init_val_df(self.validation_dataset)

        (val_df, df) = self.do_train_val(train_dataloader, validation_dataloader, val_df)
        LOG.info('Train/Val/Test completed successfully!')

        # Cache weights only after a full training run (not validation-only,
        # not test-only).
        if not self.TEST_ONLY and task.train_mode != TRAIN_MODE.VALIDATION:
            LOG.info('model weights caching in progress...')
            if not path.exists(loc.abs_path([loc.TMP, loc.MODELS])):
                os.mkdir(loc.abs_path([loc.TMP, loc.MODELS]))
            # Unwrap DataParallel-style containers before saving.
            model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
            model_to_save.save_pretrained(loc.abs_path([loc.TMP, loc.MODELS, self.task.id]))
            LOG.info('model weights saved successfully!')

        LOG.info('results export in progress...')

        try:
            # Detokenization is best-effort; failures are only logged.
            val_df = self.add_detok_preds(val_df)
            LOG.info('detokenized successfully!')
        except:
            LOG.warning('detokenization issue!')

        self.task.val_df = val_df
        self.task.df = df
        fm.to_pickle(self.task, loc.abs_path([loc.TMP, self.task.id + '.pickle']))
        LOG.info('export completed successfully!')
Exemplo n.º 13
0
def _plot_series(traces, title, out_path):
    """Build a line chart with one trace per (frame, column, label, color)
    tuple, plotted against that frame's 'epoch' column, and write it to HTML.
    """
    fig = go.Figure()
    for frame, column, label, color in traces:
        fig.add_trace(go.Scatter(x=frame['epoch'],
                                 y=frame[column],
                                 name=label,
                                 line=dict(color=color, width=2)))
    fig.update_layout(title=title,
                      xaxis_title='Epochs',
                      yaxis_title='Value')
    fig.write_html(out_path)


def make_graphics(metrics_strict_df, metrics_partial_df, TYPE, NOTATION, LR, DROPOUT, prefix, asset_folder):
    """Export four HTML charts for a run: strict/partial match counts and
    strict/partial precision-recall-F1 (with train/validation losses).

    Args:
        metrics_strict_df: per-epoch strict metrics ('str_*' columns).
        metrics_partial_df: per-epoch partial metrics ('prt_*' columns).
        TYPE, NOTATION, LR, DROPOUT: run descriptors used in chart titles.
        prefix: filename prefix for the exported HTML files.
        asset_folder: asset subfolder the charts are written into.
    """
    # FIX: the original repeated the trace-building boilerplate ~20 times;
    # drive it from data instead. Colors/labels/titles/filenames unchanged.
    run_info = TYPE + ', ' + NOTATION + ', lr: ' + LR + ', dropout: ' + DROPOUT
    strict_title = 'Strict metrics - ' + run_info
    partial_title = 'Partial metrics - ' + run_info

    count_palette = [('correct', 'Correct', '#06D6A0'),
                     ('incorrect', 'Incorrect', '#ef476f'),
                     ('partial', 'Partial', '#118AB2'),
                     ('missed', 'Missed', '#f78c6b'),
                     ('spurious', 'Spurious', '#FFD166'),
                     ('possible', 'Possible', '#073B4C'),
                     ('actual', 'Actual', 'mediumorchid')]
    prf_palette = [('precision', 'Precision', '#f4a261'),
                   ('recall', 'Recall', '#e76f51'),
                   ('f1score', 'F1-Score', '#2a9d8f')]
    # NOTE: both loss curves are read from the STRICT frame in every chart
    # (the original did this too, even on the partial P/R/F1 figure).
    losses = [(metrics_strict_df, 'training_loss', 'Training Loss', '#ef476f'),
              (metrics_strict_df, 'validation_loss', 'Validation Loss',
               'mediumorchid')]

    def series(frame, col_prefix, palette):
        return [(frame, col_prefix + column, label, color)
                for column, label, color in palette]

    def out_path(tag):
        return loc.abs_path(
            [loc.ASSETS, asset_folder, f'{prefix}___{tag}___{TYPE}.html'])

    _plot_series(series(metrics_strict_df, 'str_', count_palette),
                 strict_title, out_path('strict'))
    _plot_series(series(metrics_strict_df, 'str_', prf_palette) + losses,
                 strict_title, out_path('strict_prec_rec_f1'))
    _plot_series(series(metrics_partial_df, 'prt_', count_palette),
                 partial_title, out_path('partial'))
    _plot_series(series(metrics_partial_df, 'prt_', prf_palette) + losses,
                 partial_title, out_path('partial_prec_rec_f1'))