Example no. 1
def test_read_csv_positive03(self):
    file_name = os.path.join(os.path.dirname(__file__),
                             'csv_for_testing2.csv')
    true_classes = [
        'Криминал', 'Культура', 'Общество', 'Политика', 'Экономика'
    ]
    true_texts = [
        'Германия разрешила третий пол.',
        'Лукашенко объяснил поставки оружия Азербайджану.',
        'Шакиру уличили в неуплате налогов.',
        'Люди в масках ворвались на собрание врачей в Киеве и отказались их выпускать.',
        'Подсчитаны расходы на российскую сверхтяжелую ракету для освоения Луны.',
        'Популярные актеры дебютируют в короткометражных хоррорах.',
        'Россия ответила на высылку дипломата из Словакии.',
        'Российские нефтяники заработали триллионы на фоне бензинового кризиса.',
        'Google выбрал своего «Человека года».'
    ]
    true_labels = [2, {3, 4}, {0, 1}, 0, 4, 1, 3, 4, 2]
    loaded_texts, loaded_labels, loaded_classes = read_csv(file_name, 1)
    self.assertIsInstance(loaded_classes, list)
    self.assertEqual(true_classes, loaded_classes)
    self.assertIsInstance(loaded_texts, np.ndarray)
    self.assertIsInstance(loaded_labels, np.ndarray)
    self.assertEqual(loaded_texts.shape, (len(true_texts), ))
    self.assertEqual(loaded_labels.shape, (len(true_labels), ))
    self.assertEqual(true_texts, loaded_texts.tolist())
    self.assertEqual(true_labels, loaded_labels.tolist())
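The assertions above pin down the contract of read_csv: texts and labels come back as NumPy arrays of equal length, the class names as a plain Python list, and each label is either a single class index or a set of indices for multi-label samples. A minimal sketch of that contract (the semantics of the second argument are not revealed by the test and are left as-is):

    # Hypothetical illustration of the interface asserted by the test:
    texts, labels, classes = read_csv('csv_for_testing2.csv', 1)
    assert isinstance(classes, list)       # class names (str)
    assert isinstance(texts, np.ndarray)   # 1-D array of texts (str)
    assert isinstance(labels, np.ndarray)  # 1-D array; each item is an int
                                           # class index or a set of indices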
Example no. 2
def test_read_csv_positive04(self):
    file_name = os.path.join(os.path.dirname(__file__),
                             'csv_for_testing1.csv')
    true_classes = ['Криминал', 'Политика', 'Спорт']
    true_texts = [
        'Испанские клубы открестились от Неймара.',
        'Семилетняя беженка погибла после задержания на границе США.',
        'Главная реформа Обамы признана неконституционной.',
        'Бывший чемпион UFC не выдержал кровопролития и сдался.',
        'Охранника магазина зарезали из-за трех бутылок водки.',
        'Лукашенко пожаловался Путину на украинских «отмороженных нацменов».'
    ]
    true_labels = [2, 0, 1, 2, 0, 1]
    loaded_texts, loaded_labels, loaded_classes = read_csv(file_name, 1)
    self.assertIsInstance(loaded_classes, list)
    self.assertEqual(true_classes, loaded_classes)
    self.assertIsInstance(loaded_texts, np.ndarray)
    self.assertIsInstance(loaded_labels, np.ndarray)
    self.assertEqual(loaded_texts.shape, (len(true_texts), ))
    self.assertEqual(loaded_labels.shape, (len(true_labels), ))
    self.assertEqual(true_texts, loaded_texts.tolist())
    self.assertEqual(true_labels, loaded_labels.tolist())
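Both methods belong to a unittest.TestCase subclass. A minimal harness to run them might look like this (the class name and the read_csv import path are assumptions, not taken from the original project):

    import os
    import unittest

    import numpy as np

    from myproject.utils import read_csv  # assumed import path


    class TestReadCsv(unittest.TestCase):
        pass  # paste the two test methods shown above into this class


    if __name__ == '__main__':
        unittest.main()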
Example no. 3
import os
import pickle
from argparse import ArgumentParser

import numpy as np
from sklearn.metrics import classification_report
from skopt import gp_minimize
from skopt.space import Integer

# `ImpatialTextClassifier`, `read_csv`, `load_unlabeled_texts` and
# `print_classes_distribution` are provided by the surrounding project;
# their exact import paths depend on the project layout.


def main():
    def func(args):
        conv1_ = int(args[0])
        conv2_ = int(args[1])
        conv3_ = int(args[2])
        conv4_ = int(args[3])
        conv5_ = int(args[4])
        quality = 0.0
        print(
            'Numbers of filters for the convolution kernels: ({0}, {1}, {2}, {3}, {4})'
            .format(conv1_, conv2_, conv3_, conv4_, conv5_))
        for fold_idx, (train_index, test_index) in enumerate(indices_for_cv):
            cls = ImpatialTextClassifier(
                bert_hub_module_handle=(None if os.path.exists(
                    os.path.normpath(bert_handle)) else bert_handle),
                filters_for_conv1=conv1_,
                filters_for_conv2=conv2_,
                filters_for_conv3=conv3_,
                filters_for_conv4=conv4_,
                filters_for_conv5=conv5_,
                multioutput=multioutput,
                gpu_memory_frac=gpu_memory_frac,
                num_monte_carlo=num_monte_carlo,
                verbose=False,
                random_seed=42,
                max_epochs=100,
                patience=5,
                batch_size=16,
                bayesian=(nn_type == 'bayesian'))
            if os.path.exists(os.path.normpath(bert_handle)):
                cls.PATH_TO_BERT = os.path.normpath(bert_handle)
            train_texts = labeled_texts[train_index]
            train_labels = labels[train_index]
            train_index_, val_index = cls.train_test_split(train_labels, 0.1)
            val_texts = train_texts[val_index]
            val_labels = train_labels[val_index]
            if unlabeled_texts_for_training is None:
                train_texts = train_texts[train_index_]
                train_labels = train_labels[train_index_]
            else:
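                # Unlabeled training texts get the dummy label -1; the rest of
                # the pipeline treats -1 as "no known class" (see score()).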
                train_texts = np.concatenate(
                    (train_texts[train_index_], unlabeled_texts_for_training))
                train_labels = np.concatenate(
                    (train_labels[train_index_],
                     np.full(shape=(len(unlabeled_texts_for_training), ),
                             fill_value=-1,
                             dtype=np.int32)))
            cls.fit(train_texts,
                    train_labels,
                    validation_data=(val_texts, val_labels))
            del train_texts, train_labels, val_texts, val_labels, train_index_, val_index
            if unlabeled_texts_for_testing is None:
                texts_for_final_testing = labeled_texts[test_index]
                labels_for_final_testing = labels[test_index]
            else:
                texts_for_final_testing = np.concatenate(
                    (labeled_texts[test_index], unlabeled_texts_for_testing))
                labels_for_final_testing = np.concatenate(
                    (labels[test_index],
                     np.full(shape=(len(unlabeled_texts_for_testing), ),
                             fill_value=-1,
                             dtype=np.int32)))
            instant_quality = cls.score(texts_for_final_testing,
                                        labels_for_final_testing)
            quality += instant_quality
            print('Fold {0}: {1:.6f}.'.format(fold_idx + 1, instant_quality))
            del cls, texts_for_final_testing, labels_for_final_testing
        quality /= float(len(indices_for_cv))
        print('Total quality = {0:.6f}.'.format(quality))
        print('')
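        # gp_minimize searches for a minimum, so return the negated quality.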
        return -quality

    def score(args):
        conv1_ = int(args[0])
        conv2_ = int(args[1])
        conv3_ = int(args[2])
        conv4_ = int(args[3])
        conv5_ = int(args[4])
        print(
            'Optimal numbers of filters for the convolution kernels: ({0}, {1}, {2}, {3}, {4})'
            .format(conv1_, conv2_, conv3_, conv4_, conv5_))
        print('')
        y_pred = []
        y_true = []
        unlabeled_is_added = False
        for train_index, test_index in indices_for_cv:
            cls = ImpatialTextClassifier(
                bert_hub_module_handle=(None if os.path.exists(
                    os.path.normpath(bert_handle)) else bert_handle),
                filters_for_conv1=conv1_,
                filters_for_conv2=conv2_,
                filters_for_conv3=conv3_,
                filters_for_conv4=conv4_,
                filters_for_conv5=conv5_,
                batch_size=16,
                gpu_memory_frac=gpu_memory_frac,
                num_monte_carlo=num_monte_carlo,
                verbose=True,
                random_seed=42,
                max_epochs=100,
                patience=5,
                multioutput=multioutput,
                bayesian=(nn_type == 'bayesian'))
            if os.path.exists(os.path.normpath(bert_handle)):
                cls.PATH_TO_BERT = os.path.normpath(bert_handle)
            train_texts = labeled_texts[train_index]
            train_labels = labels[train_index]
            train_index_, val_index = cls.train_test_split(train_labels, 0.1)
            val_texts = train_texts[val_index]
            val_labels = train_labels[val_index]
            if unlabeled_texts_for_training is None:
                train_texts = train_texts[train_index_]
                train_labels = train_labels[train_index_]
            else:
                train_texts = np.concatenate(
                    (train_texts[train_index_], unlabeled_texts_for_training))
                train_labels = np.concatenate(
                    (train_labels[train_index_],
                     np.full(shape=(len(unlabeled_texts_for_training), ),
                             fill_value=-1,
                             dtype=np.int32)))
            cls.fit(train_texts,
                    train_labels,
                    validation_data=(val_texts, val_labels))
            print('')
            del train_texts, train_labels, val_texts, val_labels, train_index_, val_index
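            # Score the unlabeled evaluation texts only once across the folds;
            # their ground-truth label is the dummy value -1.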
            if (not unlabeled_is_added) and (unlabeled_texts_for_testing
                                             is not None):
                y_pred.append(cls.predict(unlabeled_texts_for_testing))
                unlabeled_is_added = True
                y_true.append(
                    np.full(shape=(len(unlabeled_texts_for_testing), ),
                            fill_value=-1,
                            dtype=np.int32))
            y_pred.append(cls.predict(labeled_texts[test_index]))
            y_true.append(labels[test_index])
            del cls
        y_pred = np.concatenate(y_pred)
        y_true = np.concatenate(y_true)
        print('')
        if multioutput:
            for class_idx in range(len(classes_list)):
                y_true_ = np.zeros((len(y_true), ), dtype=np.int32)
                y_pred_ = np.zeros((len(y_pred), ), dtype=np.int32)
                for sample_idx in range(len(y_true)):
                    if isinstance(y_true[sample_idx], set):
                        if class_idx in y_true[sample_idx]:
                            y_true_[sample_idx] = 1
                    elif class_idx == y_true[sample_idx]:
                        y_true_[sample_idx] = 1
                    if isinstance(y_pred[sample_idx], set):
                        if class_idx in y_pred[sample_idx]:
                            y_pred_[sample_idx] = 1
                    elif class_idx == y_pred[sample_idx]:
                        y_pred_[sample_idx] = 1
                # Report on the binarized per-class vectors (y_true_, y_pred_)
                # computed above, not on the raw multi-label arrays.
                print(
                    classification_report(
                        y_true_,
                        y_pred_,
                        target_names=['OTHER', classes_list[class_idx]]))
        else:
            for sample_idx in range(len(y_true)):
                if y_true[sample_idx] < 0:
                    y_true[sample_idx] = len(classes_list)
                if y_pred[sample_idx] < 0:
                    y_pred[sample_idx] = len(classes_list)
            print(
                classification_report(y_true,
                                      y_pred,
                                      target_names=classes_list + ['UNKNOWN']))
            print('')

    def train(args) -> ImpatialTextClassifier:
        conv1_ = int(args[0])
        conv2_ = int(args[1])
        conv3_ = int(args[2])
        conv4_ = int(args[3])
        conv5_ = int(args[4])
        train_index, val_index = ImpatialTextClassifier.train_test_split(
            labels, 0.1)
        if unlabeled_texts_for_training is None:
            train_texts = labeled_texts[train_index]
            train_labels = labels[train_index]
        else:
            train_texts = np.concatenate(
                (labeled_texts[train_index], unlabeled_texts_for_training))
            train_labels = np.concatenate(
                (labels[train_index],
                 np.full(shape=(len(unlabeled_texts_for_training), ),
                         fill_value=-1,
                         dtype=np.int32)))
        val_texts = labeled_texts[val_index]
        val_labels = labels[val_index]
        cls = ImpatialTextClassifier(
            bert_hub_module_handle=(None if os.path.exists(
                os.path.normpath(bert_handle)) else bert_handle),
            filters_for_conv1=conv1_,
            filters_for_conv2=conv2_,
            filters_for_conv3=conv3_,
            filters_for_conv4=conv4_,
            filters_for_conv5=conv5_,
            batch_size=16,
            gpu_memory_frac=gpu_memory_frac,
            num_monte_carlo=num_monte_carlo,
            verbose=True,
            random_seed=42,
            max_epochs=100,
            patience=5,
            multioutput=multioutput,
            bayesian=(nn_type == 'bayesian'))
        if os.path.exists(os.path.normpath(bert_handle)):
            cls.PATH_TO_BERT = os.path.normpath(bert_handle)
        cls.fit(train_texts,
                train_labels,
                validation_data=(val_texts, val_labels))
        del train_texts, train_labels, val_texts, val_labels
        return cls

    parser = ArgumentParser()
    parser.add_argument('-m',
                        '--model',
                        dest='model_name',
                        type=str,
                        required=True,
                        help='The binary file with the text classifier.')
    parser.add_argument(
        '-b',
        '--bert',
        dest='bert',
        type=str,
        required=False,
        default='https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1',
        help='URL of the TF-Hub BERT model (or a path to the BERT model on '
        'the local drive).'
    )
    parser.add_argument('-c',
                        '--csv',
                        dest='csv_data_file',
                        type=str,
                        required=True,
                        help='Path to the CSV file with labeled data.')
    parser.add_argument(
        '-t',
        '--train',
        dest='train_file_name',
        type=str,
        required=False,
        default='',
        help='Path to the text file with unlabeled data for training.')
    parser.add_argument(
        '-e',
        '--test',
        dest='test_file_name',
        type=str,
        required=False,
        default='',
        help='Path to the text file with unlabeled data for evaluation.')
    parser.add_argument(
        '--gpu_frac',
        dest='gpu_memory_frac',
        type=float,
        required=False,
        default=0.9,
        help='Allocable part of the GPU memory for the classifier.')
    parser.add_argument(
        '--nn_type',
        dest='nn_type',
        type=str,
        choices=['bayesian', 'usual'],
        required=False,
        default='bayesian',
        help='Neural network type: `bayesian` (a Bayesian neural network) or '
        '`usual` (a conventional one).')
    parser.add_argument(
        '--num_monte_carlo',
        dest='num_monte_carlo',
        type=int,
        required=False,
        default=100,
        help='Number of generated Monte Carlo samples for each data sample.')
    parser.add_argument(
        '--conv1',
        dest='size_of_conv1',
        type=int,
        required=False,
        default=20,
        help='Number of filters in the convolution layer with kernel size 1.')
    parser.add_argument(
        '--conv2',
        dest='size_of_conv2',
        type=int,
        required=False,
        default=20,
        help='Number of filters in the convolution layer with kernel size 2.')
    parser.add_argument(
        '--conv3',
        dest='size_of_conv3',
        type=int,
        required=False,
        default=20,
        help='Number of filters in the convolution layer with kernel size 3.')
    parser.add_argument(
        '--conv4',
        dest='size_of_conv4',
        type=int,
        required=False,
        default=20,
        help='Number of filters in the convolution layer with kernel size 4.')
    parser.add_argument(
        '--conv5',
        dest='size_of_conv5',
        type=int,
        required=False,
        default=20,
        help='Number of filters in the convolution layer with kernel size 5.')
    parser.add_argument(
        '--search',
        dest='search_hyperparameters',
        required=False,
        action='store_true',
        default=False,
        help='Search for optimal hyperparameters using Bayesian optimization.')
    cmd_args = parser.parse_args()

    num_monte_carlo = cmd_args.num_monte_carlo
    gpu_memory_frac = cmd_args.gpu_memory_frac
    bert_handle = cmd_args.bert
    nn_type = cmd_args.nn_type
    model_name = os.path.normpath(cmd_args.model_name)
    labeled_data_name = os.path.normpath(cmd_args.csv_data_file)
    unlabeled_train_data_name = cmd_args.train_file_name.strip()
    if len(unlabeled_train_data_name) > 0:
        unlabeled_train_data_name = os.path.normpath(unlabeled_train_data_name)
        unlabeled_texts_for_training = load_unlabeled_texts(
            unlabeled_train_data_name)
        assert len(unlabeled_texts_for_training) > 0, \
            'File `{0}` is empty!'.format(unlabeled_train_data_name)
    else:
        unlabeled_texts_for_training = None
    unlabeled_test_data_name = cmd_args.test_file_name.strip()
    if len(unlabeled_test_data_name) > 0:
        unlabeled_test_data_name = os.path.normpath(unlabeled_test_data_name)
        unlabeled_texts_for_testing = load_unlabeled_texts(
            unlabeled_test_data_name)
        assert len(unlabeled_texts_for_testing) > 0, \
            'File `{0}` is empty!'.format(unlabeled_test_data_name)
    else:
        unlabeled_texts_for_testing = None
    labeled_texts, labels, classes_list = read_csv(labeled_data_name, 7)
    print('Number of labeled texts is {0}.'.format(len(labeled_texts)))
    print('Number of classes is {0}.'.format(len(classes_list)))
    if any(map(lambda it: isinstance(it, set), labels)):
        print('Some data samples correspond to several labels at once.')
        multioutput = True
    else:
        multioutput = False
    print('')
    print_classes_distribution(labels, classes_list)
    np.random.seed(42)
    indices_for_cv = ImpatialTextClassifier.cv_split(labels, 5)
    if cmd_args.search_hyperparameters:
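        # Bayesian optimization over the five filter counts (0..200 each);
        # func() returns the negated cross-validation quality.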
        optimal_res = gp_minimize(func,
                                  dimensions=[
                                      Integer(0, 200),
                                      Integer(0, 200),
                                      Integer(0, 200),
                                      Integer(0, 200),
                                      Integer(0, 200)
                                  ],
                                  n_calls=20,
                                  n_random_starts=5,
                                  random_state=42,
                                  verbose=False,
                                  n_jobs=1)
        print('')
        hyperparameters = optimal_res.x
    else:
        hyperparameters = [
            cmd_args.size_of_conv1, cmd_args.size_of_conv2,
            cmd_args.size_of_conv3, cmd_args.size_of_conv4,
            cmd_args.size_of_conv5
        ]
    score(hyperparameters)
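    # Retrain on the whole labeled set with the chosen hyperparameters and
    # serialize the resulting classifier.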
    with open(model_name, 'wb') as fp:
        pickle.dump(train(hyperparameters), fp)
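
A hypothetical invocation of the script (all file names here are illustrative):

    python train_classifier.py -m cls.pkl -c labeled_news.csv \
        -t unlabeled_train.txt -e unlabeled_test.txt --search

The pickled classifier can be restored later in an environment where the same project is importable:

    import pickle

    with open('cls.pkl', 'rb') as fp:
        cls = pickle.load(fp)
    print(cls.predict(['Example text to classify.']))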