Example #1
    def test_transformers3(self):
        '''
        Prepare test data (no labels) for two sentence BERT classification problem
        COVERAGE: bert_prepare_data() in bert_utils.py
                  class BertDMH() in bert_utils.py
        '''

        if self.data_dir is None:
            unittest.TestCase.skipTest(
                self, "DLPY_DATA_DIR is not set in the environment variables")

        if (self.data_dir_local is None) or (not os.path.isdir(
                self.data_dir_local)):
            unittest.TestCase.skipTest(
                self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
                "variables or it does not exist.")

        if not self.necessary_packages_installed:
            unittest.TestCase.skipTest(self, "missing transformers package")

        if not os.path.isfile(
                os.path.join(self.data_dir_local, 'qnli_train.tsv')):
            unittest.TestCase.skipTest(
                self, "cannot locate qnli_train.tsv in DLPY_DATA_DIR_LOCAL")

        from transformers import BertTokenizer
        model_name = 'bert-base-uncased'

        # instantiate BERT tokenizer
        tokenizer = BertTokenizer.from_pretrained(
            model_name, cache_dir=self.data_dir_local)

        # read QNLI dataset from the local copy verified above
        # (on_bad_lines='skip' supersedes the error_bad_lines/
        # warn_bad_lines arguments removed in pandas 2.0)
        train_data = pd.read_csv(
            os.path.join(self.data_dir_local, 'qnli_train.tsv'),
            header=0,
            sep='\t',
            on_bad_lines='skip',
            names=['index', 'question', 'sentence', 'label'])

        input_a_label = 'question'
        input_b_label = 'sentence'

        input_a = train_data[input_a_label].to_list()
        input_b = train_data[input_b_label].to_list()

        # limit the number of observations to 1000
        if len(input_a) > 1000:
            input_a = input_a[:1000]
            input_b = input_b[:1000]

        # prepare data
        num_tgt_var, test = bert_prepare_data(self.s,
                                              tokenizer,
                                              128,
                                              input_a=input_a,
                                              input_b=input_b,
                                              classification_problem=True)

        # check for the existence of the training table
        res = self.s.retrieve('table.tableexists',
                              _messagelevel='error',
                              name=test)
        self.assertTrue(res['exists'] != 0, "Test table not created.")

        # ensure table has the proper number of columns
        res = self.s.retrieve('table.columninfo',
                              _messagelevel='error',
                              table=test)
        self.assertTrue(
            len(res['ColumnInfo']['Column'].to_list()) == 3,
            "Test table has extra/missing columns.")

        # clean up data table if it exists
        try:
            model_tbl_opts = input_table_check(test)
            self.s.table.droptable(quiet=True, **model_tbl_opts)
        except TypeError:
            self.assertTrue(False, "BERT data preparation failed")

        # clean up tokenizer
        del tokenizer
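
For reference, a minimal standalone sketch (using only the standard HuggingFace transformers API, independent of DLPy) of how a BertTokenizer encodes a two-sentence input. The rows that bert_prepare_data() uploads to CAS are derived from encodings like this, though the exact table layout is DLPy-internal; the question/sentence pair below is hypothetical.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer('Where is the Grotto located?',
                'The Grotto is a Marian place of prayer and reflection.',
                max_length=128,
                padding='max_length',
                truncation=True)

# [CLS] question [SEP] sentence [SEP], padded with [PAD] up to 128 tokens
print(enc['input_ids'][:16])
# 0 marks tokens in the first sentence, 1 those in the second
print(enc['token_type_ids'][:16])
# 1 for real tokens, 0 for padding positions
print(enc['attention_mask'][:16])
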
Example #2
    def test_transformers1(self):
        '''
        Prepare labeled data for single sentence BERT classification problem
        COVERAGE: bert_prepare_data() in bert_utils.py
                  class BertDMH() in bert_utils.py
        '''

        if self.data_dir is None:
            unittest.TestCase.skipTest(
                self, "DLPY_DATA_DIR is not set in the environment variables")

        if (self.data_dir_local is None) or (not os.path.isdir(
                self.data_dir_local)):
            unittest.TestCase.skipTest(
                self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
                "variables or it does not exist.")

        if not self.necessary_packages_installed:
            unittest.TestCase.skipTest(self, "missing transformers package")

        if not os.path.isfile(
                os.path.join(self.data_dir_local, 'imdb_master.csv')):
            unittest.TestCase.skipTest(
                self, "cannot locate imdb_master.csv in DLPY_DATA_DIR_LOCAL")

        from transformers import BertTokenizer
        model_name = 'bert-base-uncased'

        # instantiate BERT tokenizer
        tokenizer = BertTokenizer.from_pretrained(
            model_name, cache_dir=self.data_dir_local)

        # read dataset for IMDB movie review sentiment classification
        reviews = pd.read_csv(os.path.join(self.data_dir_local,
                                           'imdb_master.csv'),
                              header=0,
                              names=['type', 'review', 'label', 'file'],
                              encoding='latin_1')

        input_label = 'review'  # input data is review text
        target_label = 'label'  # target data is sentiment label

        # extract "train" data
        t_idx1 = reviews['type'] == 'train'
        t_idx2 = reviews['label'] != 'unsup'
        inputs = reviews[t_idx1 & t_idx2][input_label].to_list()
        targets = reviews[t_idx1 & t_idx2][target_label].to_list()

        # limit the number of observations to 1000
        if len(inputs) > 1000:
            inputs = inputs[:1000]
            targets = targets[:1000]

        # strip HTML line breaks and map sentiment labels to numeric values
        for ii, val in enumerate(targets):
            inputs[ii] = inputs[ii].replace("<br />", "")
            if val == 'neg':
                targets[ii] = 1
            elif val == 'pos':
                targets[ii] = 2

        # prepare data
        num_tgt_var, train = bert_prepare_data(self.s,
                                               tokenizer,
                                               128,
                                               input_a=list(inputs),
                                               target=list(targets),
                                               classification_problem=True)

        # check for the existence of the training table
        res = self.s.retrieve('table.tableexists',
                              _messagelevel='error',
                              name=train)
        self.assertTrue(res['exists'] != 0, "Training table not created.")

        # ensure table has the proper number of columns
        res = self.s.retrieve('table.columninfo',
                              _messagelevel='error',
                              table=train)
        self.assertTrue(
            len(res['ColumnInfo']['Column'].to_list()) == 5,
            "Training table has extra/missing columns.")

        # clean up data table if it exists
        try:
            model_tbl_opts = input_table_check(train)
            self.s.table.droptable(quiet=True, **model_tbl_opts)
        except TypeError:
            self.assertTrue(False, "BERT data preparation failed")

        # clean up tokenizer
        del tokenizer
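
The row filtering and label mapping above can also be written in vectorized pandas. A sketch assuming the same imdb_master.csv layout (columns type, review, label, file); the test keeps the explicit loop, so this is purely illustrative.

import pandas as pd

reviews = pd.read_csv('imdb_master.csv', header=0,
                      names=['type', 'review', 'label', 'file'],
                      encoding='latin_1')

# keep labeled training rows only
train = reviews[(reviews['type'] == 'train') & (reviews['label'] != 'unsup')]

# strip HTML line breaks and map neg/pos to the numeric levels 1/2
inputs = train['review'].str.replace('<br />', '', regex=False).to_list()
targets = train['label'].map({'neg': 1, 'pos': 2}).to_list()
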
Example #3
    def test_transformers2(self):
        '''
        Prepare labeled data for single sentence BERT regression problem
        COVERAGE: bert_prepare_data() in bert_utils.py
                  class BertDMH() in bert_utils.py
        '''

        if self.data_dir is None:
            unittest.TestCase.skipTest(
                self, "DLPY_DATA_DIR is not set in the environment variables")

        if (self.data_dir_local is None) or (not os.path.isdir(
                self.data_dir_local)):
            unittest.TestCase.skipTest(
                self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
                "variables or it does not exist.")

        if not self.necessary_packages_installed:
            unittest.TestCase.skipTest(self, "missing transformers package")

        if not os.path.isfile(
                os.path.join(self.data_dir_local,
                             'task1_training_edited.csv')):
            unittest.TestCase.skipTest(
                self,
                "cannot locate task1_training_edited.csv in DLPY_DATA_DIR_LOCAL"
            )

        from transformers import BertTokenizer
        model_name = 'bert-base-uncased'

        # instantiate BERT tokenizer
        tokenizer = BertTokenizer.from_pretrained(
            model_name, cache_dir=self.data_dir_local)

        # read regression data set; pandas consumes the header row, so no
        # manual slicing of the first record is needed
        reviews = pd.read_csv(
            os.path.join(self.data_dir_local, 'task1_training_edited.csv'),
            header=0,
            names=['id', 'original', 'edit', 'grades', 'meanGrade'])

        inputs = reviews['original'].tolist()
        reviews['meanGrade'] = pd.to_numeric(reviews['meanGrade'],
                                             errors='coerce').fillna(0)
        targets = reviews['meanGrade'].tolist()
        # quantize the mean grades to integer levels
        for ii, val in enumerate(targets):
            targets[ii] = round(val)

        # limit the number of observations to 1000
        if len(inputs) > 1000:
            inputs = inputs[:1000]
            targets = targets[:1000]

        # prepare data
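        # (with train_fraction specified, bert_prepare_data returns the
        # target-variable count plus BOTH a training and a validation
        # table name; compare the two-value returns in the examples above)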
        num_tgt_var, train, valid = bert_prepare_data(
            self.s,
            tokenizer,
            128,
            input_a=list(inputs),
            target=list(targets),
            train_fraction=0.8,
            classification_problem=False)

        # check for the existence of the training table
        res = self.s.retrieve('table.tableexists',
                              _messagelevel='error',
                              name=train)
        self.assertTrue(res['exists'] != 0, "Training table not created.")

        # ensure table has the proper number of columns
        res = self.s.retrieve('table.columninfo',
                              _messagelevel='error',
                              table=train)
        self.assertTrue(
            len(res['ColumnInfo']['Column'].to_list()) == 5,
            "Training table has extra/missing columns.")

        # check for the existence of the validation table
        res = self.s.retrieve('table.tableexists',
                              _messagelevel='error',
                              name=valid)
        self.assertTrue(res['exists'] != 0, "Validation table not created.")

        # ensure table has the proper number of columns
        res = self.s.retrieve('table.columninfo',
                              _messagelevel='error',
                              table=valid)
        self.assertTrue(
            len(res['ColumnInfo']['Column'].to_list()) == 5,
            "Validation table has extra/missing columns.")

        # clean up training table if it exists
        try:
            model_tbl_opts = input_table_check(train)
            self.s.table.droptable(quiet=True, **model_tbl_opts)
        except TypeError:
            self.assertTrue(False, "BERT data preparation failed")

        # clean up validation table if it exists
        try:
            model_tbl_opts = input_table_check(valid)
            self.s.table.droptable(quiet=True, **model_tbl_opts)
        except TypeError:
            self.assertTrue(False, "BERT data preparation failed")

        # clean up tokenizer
        del tokenizer
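
When train_fraction is passed, bert_prepare_data() performs the training/validation split itself; whether it shuffles the observations first is internal to DLPy. A plain-Python sketch of the 80/20 idea, using the inputs/targets lists built above:

# hypothetical standalone split, mirroring train_fraction=0.8
split = int(0.8 * len(inputs))
train_inputs, valid_inputs = inputs[:split], inputs[split:]
train_targets, valid_targets = targets[:split], targets[split:]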