Exemplo n.º 1
0
 def test_read_csv_positive03(self):
     """Check `read_csv` on `csv_for_testing2.csv`, a file with multi-label samples.

     The expected labels mix plain class indices with sets of class indices,
     so some samples belong to several classes at once. The loader must
     return the texts and labels as one-dimensional NumPy arrays and the
     class names as a list.
     """
     expected_classes = [
         'Криминал', 'Культура', 'Общество', 'Политика', 'Экономика'
     ]
     expected_labels = [2, {3, 4}, {0, 1}, 0, 4, 1, 3, 4, 2]
     expected_texts = [
         'Германия разрешила третий пол.',
         'Лукашенко объяснил поставки оружия Азербайджану.',
         'Шакиру уличили в неуплате налогов.',
         'Люди в масках ворвались на собрание врачей в Киеве и отказались их выпускать.',
         'Подсчитаны расходы на российскую сверхтяжелую ракету для освоения Луны.',
         'Популярные актеры дебютируют в короткометражных хоррорах.',
         'Россия ответила на высылку дипломата из Словакии.',
         'Российские нефтяники заработали триллионы на фоне бензинового кризиса.',
         'Google выбрал своего «Человека года».'
     ]
     data_dir = os.path.dirname(__file__)
     csv_path = os.path.join(data_dir, 'csv_for_testing2.csv')
     # NOTE(review): the meaning of the second argument (1) is defined by
     # `read_csv` itself, which is not visible here.
     texts, labels, classes = read_csv(csv_path, 1)
     # Class names come back as a plain list.
     self.assertIsInstance(classes, list)
     self.assertEqual(expected_classes, classes)
     # Texts and labels come back as NumPy arrays ...
     for loaded in (texts, labels):
         self.assertIsInstance(loaded, np.ndarray)
     expected_and_loaded = ((expected_texts, texts), (expected_labels, labels))
     # ... which are one-dimensional and of the expected length ...
     for expected, loaded in expected_and_loaded:
         self.assertEqual(loaded.shape, (len(expected), ))
     # ... and hold exactly the expected content.
     for expected, loaded in expected_and_loaded:
         self.assertEqual(expected, loaded.tolist())
Exemplo n.º 2
0
 def test_read_csv_positive04(self):
     """Check `read_csv` on `csv_for_testing1.csv`, a file with single-label samples.

     Every expected label is a plain class index. The loader must return the
     texts and labels as one-dimensional NumPy arrays and the class names as
     a list.
     """
     expected_classes = ['Криминал', 'Политика', 'Спорт']
     expected_labels = [2, 0, 1, 2, 0, 1]
     expected_texts = [
         'Испанские клубы открестились от Неймара.',
         'Семилетняя беженка погибла после задержания на границе США.',
         'Главная реформа Обамы признана неконституционной.',
         'Бывший чемпион UFC не выдержал кровопролития и сдался.',
         'Охранника магазина зарезали из-за трех бутылок водки.',
         'Лукашенко пожаловался Путину на украинских «отмороженных нацменов».'
     ]
     data_dir = os.path.dirname(__file__)
     csv_path = os.path.join(data_dir, 'csv_for_testing1.csv')
     # NOTE(review): the meaning of the second argument (1) is defined by
     # `read_csv` itself, which is not visible here.
     texts, labels, classes = read_csv(csv_path, 1)
     # Class names come back as a plain list.
     self.assertIsInstance(classes, list)
     self.assertEqual(expected_classes, classes)
     # Texts and labels come back as NumPy arrays ...
     for loaded in (texts, labels):
         self.assertIsInstance(loaded, np.ndarray)
     expected_and_loaded = ((expected_texts, texts), (expected_labels, labels))
     # ... which are one-dimensional and of the expected length ...
     for expected, loaded in expected_and_loaded:
         self.assertEqual(loaded.shape, (len(expected), ))
     # ... and hold exactly the expected content.
     for expected, loaded in expected_and_loaded:
         self.assertEqual(expected, loaded.tolist())
Exemplo n.º 3
0
def main():
    """Train, evaluate and save a text classifier from the command line.

    Steps performed:

    1. Parse the command-line arguments.
    2. Load the labeled CSV corpus and, optionally, unlabeled texts for
       training and for evaluation (unlabeled samples get the label -1).
    3. Build a 5-fold cross-validation split of the labeled data.
    4. Either search hyperparameters with `gp_minimize` (option `--search`)
       or take them from the command line.
    5. Print cross-validated classification reports for these hyperparameters.
    6. Fit a final classifier and pickle it into the file given by `--model`.

    A hyperparameter vector is a flat sequence: five convolution filter
    counts, the hidden layer size, the number of hidden layers and, for the
    Bayesian network, the initial and the final KL weights.
    """

    def _parse_hyperparameters(args):
        """Convert a flat hyperparameter vector into classifier keyword arguments."""
        hidden_layer_size_ = int(args[5])
        n_hidden_layers_ = int(args[6])
        # A zero in either value disables hidden layers, so both are zeroed.
        if (n_hidden_layers_ == 0) or (hidden_layer_size_ == 0):
            hidden_layer_size_ = 0
            n_hidden_layers_ = 0
        if nn_type == 'bayesian':
            init_kl_weight = float(args[7])
            fin_kl_weight = float(args[8])
        else:
            # KL weights are not read from `args` for the usual network.
            init_kl_weight = 1.0
            fin_kl_weight = 1.0
        return {
            'filters_for_conv1': int(args[0]),
            'filters_for_conv2': int(args[1]),
            'filters_for_conv3': int(args[2]),
            'filters_for_conv4': int(args[3]),
            'filters_for_conv5': int(args[4]),
            'hidden_layer_size': hidden_layer_size_,
            'n_hidden_layers': n_hidden_layers_,
            'kl_weight_init': init_kl_weight,
            'kl_weight_fin': fin_kl_weight,
        }

    def _conv_sizes(params):
        """Return the five convolution filter counts as a tuple (for printing)."""
        return tuple(params['filters_for_conv{0}'.format(kernel_size)] for kernel_size in range(1, 6))

    def _print_kl_weights(params):
        """Print the KL weights; they are reported for the Bayesian network only."""
        if nn_type == 'bayesian':
            print('Optimal value of initial KL weight is {0:.6f}.'.format(params['kl_weight_init']))
            print('Optimal value of final KL weight is {0:.6f}.'.format(params['kl_weight_fin']))

    def _create_classifier(params, verbose):
        """Create an unfitted classifier for the given hyperparameters.

        If `--bert` points to an existing local path, that path is assigned
        to `PATH_TO_BERT` and no TF-Hub handle is passed; otherwise the value
        is passed as the TF-Hub module handle.
        """
        local_bert_path = os.path.normpath(bert_handle)
        bert_is_local = os.path.exists(local_bert_path)
        cls = ImpartialTextClassifier(
            bert_hub_module_handle=(None if bert_is_local else bert_handle),
            multioutput=multioutput, gpu_memory_frac=gpu_memory_frac, num_monte_carlo=num_monte_carlo,
            verbose=verbose, random_seed=42, max_epochs=100, patience=5, batch_size=16,
            bayesian=(nn_type == 'bayesian'), **params
        )
        if bert_is_local:
            cls.PATH_TO_BERT = local_bert_path
        return cls

    def _append_unlabeled(texts, labels_, unlabeled_texts):
        """Append unlabeled texts (labeled with -1) to a labeled sample, if there are any."""
        if unlabeled_texts is None:
            return texts, labels_
        unknown_labels = np.full(shape=(len(unlabeled_texts),), fill_value=-1, dtype=np.int32)
        return np.concatenate((texts, unlabeled_texts)), np.concatenate((labels_, unknown_labels))

    def _split_for_fitting(texts, labels_):
        """Split labeled data into training and validation parts.

        The split is produced by `ImpartialTextClassifier.train_test_split`
        with the ratio 0.1. Unlabeled training texts, if given, are appended
        to the training part only; validation data stay purely labeled.
        """
        train_index_, val_index = ImpartialTextClassifier.train_test_split(labels_, 0.1)
        train_texts, train_labels = _append_unlabeled(texts[train_index_], labels_[train_index_],
                                                      unlabeled_texts_for_training)
        return train_texts, train_labels, texts[val_index], labels_[val_index]

    def _binarize_for_class(values, class_idx):
        """Return a 0/1 vector marking samples whose label (or label set) contains `class_idx`."""
        binary = np.zeros((len(values),), dtype=np.int32)
        for sample_idx, value in enumerate(values):
            if isinstance(value, set):
                if class_idx in value:
                    binary[sample_idx] = 1
            elif class_idx == value:
                binary[sample_idx] = 1
        return binary

    def func(args):
        """Objective function for `gp_minimize`.

        Fits a classifier on every cross-validation fold and returns the
        negated mean of `cls.score(...)` over the folds, because the
        optimizer minimizes its objective.
        """
        params = _parse_hyperparameters(args)
        print('Filters number for different convolution kernels: ({0}, {1}, {2}, {3}, {4})'.format(
            *_conv_sizes(params)))
        if params['n_hidden_layers'] > 0:
            print('Hidden layer size is {0}.'.format(params['hidden_layer_size']))
            print('Number of hidden layers is {0}.'.format(params['n_hidden_layers']))
        _print_kl_weights(params)
        # NOTE(review): with the search space defined in `main` the hidden layer size is at least 100,
        # so this sum is never zero; probably only the convolution filter counts were meant - confirm.
        if sum(args) == 0:
            return 1.0
        quality = 0.0
        for fold_idx, (train_index, test_index) in enumerate(indices_for_cv):
            cls = _create_classifier(params, verbose=False)
            train_texts, train_labels, val_texts, val_labels = _split_for_fitting(
                labeled_texts[train_index], labels[train_index])
            cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
            del train_texts, train_labels, val_texts, val_labels
            texts_for_final_testing, labels_for_final_testing = _append_unlabeled(
                labeled_texts[test_index], labels[test_index], unlabeled_texts_for_testing)
            instant_quality = cls.score(texts_for_final_testing, labels_for_final_testing)
            quality += instant_quality
            print('Fold {0}: {1:.6f}.'.format(fold_idx + 1, instant_quality))
            # Drop the fitted classifier and the fold data before the next fold starts.
            del cls, texts_for_final_testing, labels_for_final_testing
        quality /= float(len(indices_for_cv))
        print('Total quality = {0:.6f}.'.format(quality))
        print('')
        return -quality

    def score(args):
        """Print cross-validated classification reports for the given hyperparameters.

        Predictions for the test part of every fold are pooled together.
        Unlabeled test texts, if given, are predicted once (by the classifier
        of the first fold) with the true label -1. For multi-output data one
        binary report is printed per class; otherwise a single multi-class
        report with an additional `UNKNOWN` class is printed.
        """
        params = _parse_hyperparameters(args)
        print('Optimal filters number for different convolution kernels: ({0}, {1}, {2}, {3}, {4})'.format(
            *_conv_sizes(params)))
        if params['n_hidden_layers'] > 0:
            print('Optimal size of the hidden layer is {0}.'.format(params['hidden_layer_size']))
            print('Optimal number of hidden layers is {0}.'.format(params['n_hidden_layers']))
        _print_kl_weights(params)
        print('')
        y_pred = []
        y_true = []
        unlabeled_is_added = False
        for train_index, test_index in indices_for_cv:
            cls = _create_classifier(params, verbose=True)
            train_texts, train_labels, val_texts, val_labels = _split_for_fitting(
                labeled_texts[train_index], labels[train_index])
            cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
            print('')
            del train_texts, train_labels, val_texts, val_labels
            # Unlabeled test texts are the same for all folds, so they are counted only once.
            if (not unlabeled_is_added) and (unlabeled_texts_for_testing is not None):
                y_pred.append(cls.predict(unlabeled_texts_for_testing))
                unlabeled_is_added = True
                y_true.append(np.full(shape=(len(unlabeled_texts_for_testing),), fill_value=-1, dtype=np.int32))
            y_pred.append(cls.predict(labeled_texts[test_index]))
            y_true.append(labels[test_index])
            del cls
        y_pred = np.concatenate(y_pred)
        y_true = np.concatenate(y_true)
        print('')
        if multioutput:
            for class_idx, class_name in enumerate(classes_list):
                y_true_ = _binarize_for_class(y_true, class_idx)
                y_pred_ = _binarize_for_class(y_pred, class_idx)
                # The report must be built from the binarized vectors, not from the raw labels,
                # which may contain sets. Explicit `labels` keep `target_names` aligned even if
                # one of the two binary values does not occur.
                print(classification_report(y_true_, y_pred_, labels=[0, 1],
                                            target_names=['OTHER', class_name], digits=4))
        else:
            # Negative labels denote samples outside the known classes; they are moved to an extra class.
            for sample_idx in range(len(y_true)):
                if y_true[sample_idx] < 0:
                    y_true[sample_idx] = len(classes_list)
                if y_pred[sample_idx] < 0:
                    y_pred[sample_idx] = len(classes_list)
            # Explicit `labels` prevent a size mismatch with `target_names` when no sample
            # is (or is predicted as) unknown.
            print(classification_report(y_true, y_pred, labels=list(range(len(classes_list) + 1)),
                                        target_names=classes_list + ['UNKNOWN'], digits=4))
            print('')

    def train(args) -> ImpartialTextClassifier:
        """Fit and return the final classifier on the labeled corpus.

        The labeled data are split by `train_test_split(labels, 0.1)` into
        training and validation parts; unlabeled training texts, if given,
        are appended to the training part.
        """
        params = _parse_hyperparameters(args)
        train_texts, train_labels, val_texts, val_labels = _split_for_fitting(labeled_texts, labels)
        cls = _create_classifier(params, verbose=True)
        cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
        del train_texts, train_labels, val_texts, val_labels
        return cls

    parser = ArgumentParser()
    parser.add_argument('-m', '--model', dest='model_name', type=str, required=True,
                        help='The binary file with the text classifier.')
    parser.add_argument('-b', '--bert', dest='bert', type=str, required=False,
                        default='https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1',
                        help='URL of used TF-Hub BERT model (or path to the BERT model in local drive).')
    parser.add_argument('-c', '--csv', dest='csv_data_file', type=str, required=True,
                        help='Path to the CSV file with labeled data.')
    parser.add_argument('-t', '--train', dest='train_file_name', type=str, required=False, default='',
                        help='Path to the text file with unlabeled data for training.')
    parser.add_argument('-e', '--test', dest='test_file_name', type=str, required=False, default='',
                        help='Path to the text file with unlabeled data for evaluation.')
    parser.add_argument('--gpu_frac', dest='gpu_memory_frac', type=float, required=False, default=0.9,
                        help='Allocable part of the GPU memory for the classifier.')
    parser.add_argument('--nn_type', dest='nn_type', type=str, choices=['bayesian', 'usual'], required=False,
                        default='bayesian', help='Neural network type: `bayesian` or `usual`.')
    parser.add_argument('--num_monte_carlo', dest='num_monte_carlo', type=int, required=False, default=100,
                        help='Number of generated Monte Carlo samples for each data sample.')
    parser.add_argument('--conv1', dest='size_of_conv1', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 1.')
    parser.add_argument('--conv2', dest='size_of_conv2', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 2.')
    parser.add_argument('--conv3', dest='size_of_conv3', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 3.')
    parser.add_argument('--conv4', dest='size_of_conv4', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 4.')
    parser.add_argument('--conv5', dest='size_of_conv5', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 5.')
    parser.add_argument('--hidden', dest='hidden_layer_size', type=str, required=False, default='500',
                        help='Size of each hidden layer and total number of hidden layers (separate them with colons).')
    parser.add_argument('--init_kl_weight', dest='init_kl_weight', type=float, required=False, default=1e-1,
                        help='Initial value of KL weight.')
    parser.add_argument('--fin_kl_weight', dest='fin_kl_weight', type=float, required=False, default=1e-2,
                        help='Final value of KL weight.')
    parser.add_argument('--search', dest='search_hyperparameters', required=False, action='store_true',
                        default=False, help='Will be hyperparameters found by the Bayesian optimization?')
    cmd_args = parser.parse_args()

    # These names are read by the nested functions above through the closure.
    num_monte_carlo = cmd_args.num_monte_carlo
    gpu_memory_frac = cmd_args.gpu_memory_frac
    bert_handle = cmd_args.bert
    nn_type = cmd_args.nn_type
    model_name = os.path.normpath(cmd_args.model_name)
    labeled_data_name = os.path.normpath(cmd_args.csv_data_file)
    unlabeled_train_data_name = cmd_args.train_file_name.strip()
    hidden_layer_size, n_hidden_layers = parse_hidden_layers_description(cmd_args.hidden_layer_size)
    if len(unlabeled_train_data_name) > 0:
        unlabeled_train_data_name = os.path.normpath(unlabeled_train_data_name)
        unlabeled_texts_for_training = load_unlabeled_texts(unlabeled_train_data_name)
        assert len(unlabeled_texts_for_training) > 0, 'File `{0}` is empty!'.format(unlabeled_train_data_name)
    else:
        unlabeled_texts_for_training = None
    unlabeled_test_data_name = cmd_args.test_file_name.strip()
    if len(unlabeled_test_data_name) > 0:
        unlabeled_test_data_name = os.path.normpath(unlabeled_test_data_name)
        unlabeled_texts_for_testing = load_unlabeled_texts(unlabeled_test_data_name)
        assert len(unlabeled_texts_for_testing) > 0, 'File `{0}` is empty!'.format(unlabeled_test_data_name)
    else:
        unlabeled_texts_for_testing = None
    # NOTE(review): the meaning of the second argument (7) is defined by `read_csv`, not visible here.
    labeled_texts, labels, classes_list = read_csv(labeled_data_name, 7)
    print('Number of labeled texts is {0}.'.format(len(labeled_texts)))
    print('Number of classes is {0}.'.format(len(classes_list)))
    # A sample with several classes is represented by a set of class indices.
    if any(map(lambda it: isinstance(it, set), labels)):
        print('Some data samples can be corresponded to several labels at once.')
        multioutput = True
    else:
        multioutput = False
    print('')
    print_classes_distribution(labels, classes_list)
    np.random.seed(42)
    indices_for_cv = ImpartialTextClassifier.cv_split(labels, 5)
    if cmd_args.search_hyperparameters:
        # Search space: five filter counts, hidden layer size, number of hidden layers.
        dimensions = [Integer(0, 300), Integer(0, 300), Integer(0, 300), Integer(0, 300), Integer(0, 300),
                      Integer(100, 2000), Integer(0, 3)]
        if nn_type == 'bayesian':
            # Plus the initial and the final KL weights for the Bayesian network.
            dimensions += [Real(1e-5, 1.0, prior='log-uniform'), Real(1e-5, 1.0, prior='log-uniform')]
        optimal_res = gp_minimize(
            func, dimensions=dimensions,
            n_calls=100, n_random_starts=5, random_state=42, verbose=False, n_jobs=1
        )
        print('')
        hyperparameters = optimal_res.x
    else:
        hyperparameters = [cmd_args.size_of_conv1, cmd_args.size_of_conv2, cmd_args.size_of_conv3,
                           cmd_args.size_of_conv4, cmd_args.size_of_conv5, hidden_layer_size, n_hidden_layers,
                           cmd_args.init_kl_weight, cmd_args.fin_kl_weight]
    score(hyperparameters)
    # Train before opening the output file, so that a failed training
    # does not truncate an already existing model file.
    final_classifier = train(hyperparameters)
    with open(model_name, 'wb') as fp:
        pickle.dump(final_classifier, fp)