def test_transformers3(self):
    '''
    Prepare test data (no labels) for two sentence BERT classification problem

    COVERAGE: bert_prepare_data() in bert_utils.py
              class BertDMH() in bert_utils.py

    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isdir(
            self.data_dir_local)):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or it does not exist.")

    if not self.necessary_packages_installed:
        unittest.TestCase.skipTest(self, "missing transformers package")

    if not os.path.isfile(
            os.path.join(self.data_dir_local, 'qnli_train.tsv')):
        unittest.TestCase.skipTest(
            self, "cannot locate qnli_train.tsv in DLPY_DATA_DIR_LOCAL")

    from transformers import BertTokenizer

    model_name = 'bert-base-uncased'

    # instantiate BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=self.data_dir_local)

    # read QNLI data set (use the local copy checked for above)
    train_data = pd.read_csv(
        os.path.join(self.data_dir_local, 'qnli_train.tsv'),
        header=0,
        sep='\t',
        error_bad_lines=False,
        warn_bad_lines=False,
        names=['index', 'question', 'sentence', 'label'])

    input_a_label = 'question'
    input_b_label = 'sentence'

    input_a = train_data[input_a_label].to_list()
    input_b = train_data[input_b_label].to_list()

    # limit the number of observations to 1000
    if len(input_a) > 1000:
        input_a = input_a[:1000]
        input_b = input_b[:1000]

    # prepare data
    num_tgt_var, test = bert_prepare_data(self.s,
                                          tokenizer,
                                          128,
                                          input_a=input_a,
                                          input_b=input_b,
                                          classification_problem=True)

    # check for the existence of the test data table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=test)
    self.assertTrue(res['exists'] != 0, "Test table not created.")

    # ensure the table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=test)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 3,
        "Test table has extra/missing columns.")

    # clean up data table if it exists
    try:
        model_tbl_opts = input_table_check(test)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up tokenizer
    del tokenizer
def test_transformers1(self):
    '''
    Prepare labeled data for single sentence BERT classification problem

    COVERAGE: bert_prepare_data() in bert_utils.py
              class BertDMH() in bert_utils.py

    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isdir(
            self.data_dir_local)):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or it does not exist.")

    if not self.necessary_packages_installed:
        unittest.TestCase.skipTest(self, "missing transformers package")

    if not os.path.isfile(
            os.path.join(self.data_dir_local, 'imdb_master.csv')):
        unittest.TestCase.skipTest(
            self, "cannot locate imdb_master.csv in DLPY_DATA_DIR_LOCAL")

    from transformers import BertTokenizer

    model_name = 'bert-base-uncased'

    # instantiate BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=self.data_dir_local)

    # read data set for IMDB movie review sentiment classification
    reviews = pd.read_csv(os.path.join(self.data_dir_local,
                                       'imdb_master.csv'),
                          header=0,
                          names=['type', 'review', 'label', 'file'],
                          encoding='latin_1')

    input_label = 'review'   # input data is review text
    target_label = 'label'   # target data is sentiment label

    # extract "train" data, skipping unlabeled (unsup) reviews
    t_idx1 = reviews['type'] == 'train'
    t_idx2 = reviews['label'] != 'unsup'
    inputs = reviews[t_idx1 & t_idx2][input_label].to_list()
    targets = reviews[t_idx1 & t_idx2][target_label].to_list()

    # limit the number of observations to 1000
    if len(inputs) > 1000:
        inputs = inputs[:1000]
        targets = targets[:1000]

    # strip HTML line breaks and map sentiment labels to numeric targets
    for ii, val in enumerate(targets):
        inputs[ii] = inputs[ii].replace("<br />", "")
        if val == 'neg':
            targets[ii] = 1
        elif val == 'pos':
            targets[ii] = 2

    # prepare data
    num_tgt_var, train = bert_prepare_data(self.s,
                                           tokenizer,
                                           128,
                                           input_a=list(inputs),
                                           target=list(targets),
                                           classification_problem=True)

    # check for the existence of the training table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=train)
    self.assertTrue(res['exists'] != 0, "Training table not created.")

    # ensure the table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=train)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 5,
        "Training table has extra/missing columns.")

    # clean up data table if it exists
    try:
        model_tbl_opts = input_table_check(train)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up tokenizer
    del tokenizer
def test_transformers2(self):
    '''
    Prepare labeled data for single sentence BERT regression problem

    COVERAGE: bert_prepare_data() in bert_utils.py
              class BertDMH() in bert_utils.py

    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isdir(
            self.data_dir_local)):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or it does not exist.")

    if not self.necessary_packages_installed:
        unittest.TestCase.skipTest(self, "missing transformers package")

    if not os.path.isfile(
            os.path.join(self.data_dir_local, 'task1_training_edited.csv')):
        unittest.TestCase.skipTest(
            self,
            "cannot locate task1_training_edited.csv in DLPY_DATA_DIR_LOCAL")

    from transformers import BertTokenizer

    model_name = 'bert-base-uncased'

    # instantiate BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=self.data_dir_local)

    # read regression data set
    reviews = pd.read_csv(
        os.path.join(self.data_dir_local, 'task1_training_edited.csv'),
        header=None,
        names=['id', 'original', 'edit', 'grades', 'meanGrade'])

    # the first row holds the original column headers, so drop it;
    # coerce the grades to numeric and round them to integer targets
    inputs = reviews['original'].tolist()[1:]
    reviews['meanGrade'] = pd.to_numeric(reviews['meanGrade'],
                                         errors='coerce').fillna(0)
    targets = reviews['meanGrade'].tolist()[1:]
    for ii, val in enumerate(targets):
        targets[ii] = round(val)

    # limit the number of observations to 1000
    if len(inputs) > 1000:
        inputs = inputs[:1000]
        targets = targets[:1000]

    # prepare data
    num_tgt_var, train, valid = bert_prepare_data(
        self.s,
        tokenizer,
        128,
        input_a=list(inputs),
        target=list(targets),
        train_fraction=0.8,
        classification_problem=False)

    # check for the existence of the training table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=train)
    self.assertTrue(res['exists'] != 0, "Training table not created.")

    # ensure the table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=train)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 5,
        "Training table has extra/missing columns.")

    # check for the existence of the validation table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=valid)
    self.assertTrue(res['exists'] != 0, "Validation table not created.")

    # ensure the table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=valid)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 5,
        "Validation table has extra/missing columns.")

    # clean up training table if it exists
    try:
        model_tbl_opts = input_table_check(train)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up validation table if it exists
    try:
        model_tbl_opts = input_table_check(valid)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up tokenizer
    del tokenizer
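# ---------------------------------------------------------------------------
# Minimal usage sketch (not a test): how bert_prepare_data() is driven by the
# tests above. It assumes the same module-level imports the tests rely on
# (bert_prepare_data, etc.), an already-connected SWAT CAS session `s`, and a
# writable tokenizer cache directory; the toy reviews and labels below are
# hypothetical.
# ---------------------------------------------------------------------------
def _bert_prepare_data_sketch(s, cache_dir):
    from transformers import BertTokenizer

    # tokenizer matching the pretrained checkpoint used by the tests above
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              cache_dir=cache_dir)

    # toy labeled data; class labels are 1-based integers, as in
    # test_transformers1 above
    inputs = ['a great movie', 'a terrible movie']
    targets = [2, 1]

    # encode the data with a maximum sequence length of 128; as exercised by
    # test_transformers1, the call returns the number of target variables and
    # the name of the CAS table it created for training
    num_tgt_var, train = bert_prepare_data(s,
                                           tokenizer,
                                           128,
                                           input_a=inputs,
                                           target=targets,
                                           classification_problem=True)
    return num_tgt_var, train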