Пример #1
0
def test_scoring_logreg_tune_correct(data_fixture, request):
    train_data, test_data = request.getfixturevalue(data_fixture)

    train_data.features = Scaling().fit(train_data.features).apply(
        train_data.features)
    test_data.features = Scaling().fit(test_data.features).apply(
        test_data.features)

    logreg = Model(model_type='logit')

    model, _ = logreg.fit(train_data)
    test_predicted = logreg.predict(fitted_model=model, data=test_data)

    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    logreg_for_tune = Model(model_type='logit')

    model_tuned, _ = logreg_for_tune.fine_tune(
        train_data, iterations=50, max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = logreg_for_tune.predict(fitted_model=model_tuned,
                                                   data=test_data)

    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)

    roc_threshold = 0.6

    assert round(test_roc_auc_tuned, 2) >= round(test_roc_auc,
                                                 2) > roc_threshold
Пример #2
0
def test_classification_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    knn_for_tune = Model(model_type='knn')
    knn_for_tune.params = {'n_neighbors': 1}
    model, _ = knn_for_tune.fit(data=train_data)

    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
Пример #3
0
def main(args):

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)

    default_path = create_default_path()
    print('\n*** Set default saving/loading path to:', default_path)

    if args.dataset == AIFB or args.dataset == MUTAG:
        module = importlib.import_module(MODULE.format('dglrgcn'))
        data = module.load_dglrgcn(args.data_path)
        data = to_cuda(data) if cuda else data
        mode = NODE_CLASSIFICATION
    elif args.dataset == MUTAGENICITY or args.dataset == PTC_MR or args.dataset == PTC_MM or args.dataset == PTC_FR or args.dataset == PTC_FM:
        module = importlib.import_module(MODULE.format('dortmund'))
        data = module.load_dortmund(args.data_path)
        data = to_cuda(data) if cuda else data
        mode = GRAPH_CLASSIFICATION
    else:
        raise ValueError('Unable to load dataset', args.dataset)

    print_graph_stats(data[GRAPH])

    config_params = read_params(args.config_fpath, verbose=True)

    # create GNN model
    model = Model(g=data[GRAPH],
                  config_params=config_params[0],
                  n_classes=data[N_CLASSES],
                  n_rels=data[N_RELS] if N_RELS in data else None,
                  n_entities=data[N_ENTITIES] if N_ENTITIES in data else None,
                  is_cuda=cuda,
                  mode=mode)

    if cuda:
        model.cuda()

    # 1. Training
    app = App()
    learning_config = {
        'lr': args.lr,
        'n_epochs': args.n_epochs,
        'weight_decay': args.weight_decay,
        'batch_size': args.batch_size,
        'cuda': cuda
    }
    print('\n*** Start training ***\n')
    app.train(data, config_params[0], learning_config, default_path, mode=mode)

    # 2. Testing
    print('\n*** Start testing ***\n')
    app.test(data, default_path, mode=mode)

    # 3. Delete model
    remove_model(default_path)
Пример #4
0
def test_node_factory_log_reg_correct(data_setup):
    model_type = 'logit'
    node = PrimaryNode(model_type=model_type)

    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
Пример #5
0
def test_node_factory_log_reg_correct(data_setup):
    model_type = ModelTypesIdsEnum.logit
    node = NodeGenerator().primary_node(model_type=model_type)

    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
Пример #6
0
def test_pca_model_removes_redunant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    _, train_predicted = pca.fit(data=train_data)

    assert train_predicted.shape[1] < data.features.shape[1]
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    kmeans = Model(model_type=ModelTypesIdsEnum.kmeans)

    _, train_predicted = kmeans.fit(data=train_data)

    assert all(np.unique(train_predicted) == [0, 1])
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type=ModelTypesIdsEnum.logit)

    _, train_predicted = log_reg.fit(data=train_data)
    roc_on_train = roc_auc(y_true=train_data.target, y_score=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_qda_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    qda = Model(model_type=ModelTypesIdsEnum.qda)

    _, train_predicted = qda.fit(data=train_data)
    roc_on_train = roc_auc(y_true=train_data.target, y_score=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Пример #10
0
def test_lda_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    lda = Model(model_type='lda')

    _, train_predicted = lda.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Пример #11
0
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type='logit')

    _, train_predicted = log_reg.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Пример #12
0
def test_knn_classification_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data,
                                      iterations=10,
                                      max_lead_time=timedelta(minutes=1))

    test_predicted_tuned = knn.predict(fitted_model=model, data=test_data)

    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)
    roc_threshold = 0.6
    assert roc_on_test_tuned > roc_on_test > roc_threshold
Пример #13
0
def test_pca_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    model, _ = pca.fit(data=train_data)
    test_predicted = pca.predict(fitted_model=model, data=test_data)

    pca_for_tune = Model(model_type='pca_data_model')

    pca_for_tune.params = {
        'svd_solver': 'randomized',
        'iterated_power': 'auto',
        'dim_reduction_expl_thr': 0.7,
        'dim_reduction_min_expl': 0.001
    }

    model, _ = pca_for_tune.fit(data=train_data)
    test_predicted_tuned = pca_for_tune.predict(fitted_model=model,
                                                data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
Пример #14
0
def fit_template(chain_template, classes, with_gaussian=False, skip_fit=False):
    templates_by_models = []
    for model_template in itertools.chain.from_iterable(chain_template):
        model_instance = Model(model_type=model_template.model_type)
        model_template.model_instance = model_instance
        templates_by_models.append((model_template, model_instance))
    if skip_fit:
        return

    for template, instance in templates_by_models:
        samples, features_amount = template.input_shape

        if with_gaussian:
            features, target = gauss_quantiles(samples_amount=samples,
                                               features_amount=features_amount,
                                               classes_amount=classes)
        else:
            options = {
                'informative': features_amount,
                'redundant': 0,
                'repeated': 0,
                'clusters_per_class': 1
            }
            features, target = synthetic_dataset(
                samples_amount=samples,
                features_amount=features_amount,
                classes_amount=classes,
                features_options=options)
        target = np.expand_dims(target, axis=1)
        data_train = InputData(idx=np.arange(0, samples),
                               features=features,
                               target=target,
                               data_type=DataTypesEnum.table,
                               task=Task(TaskTypesEnum.classification))

        preproc_data = copy(data_train)
        preprocessor = Normalization().fit(preproc_data.features)
        preproc_data.features = preprocessor.apply(preproc_data.features)
        print(f'Fit {instance}')
        fitted_model, predictions = instance.fit(data=preproc_data)

        template.fitted_model = fitted_model
        template.data_fit = preproc_data
        template.preprocessor = preprocessor
Пример #15
0
def test_arima_tune_correct():
    data = get_synthetic_ts_data()
    train_data, test_data = train_test_data_setup(data=data)

    arima_for_tune = Model(model_type='arima')
    model, _ = arima_for_tune.fine_tune(data=train_data,
                                        iterations=5,
                                        max_lead_time=timedelta(minutes=0.1))

    test_predicted_tuned = arima_for_tune.predict(fitted_model=model,
                                                  data=test_data)

    rmse_on_test_tuned = mse(y_true=test_data.target,
                             y_pred=test_predicted_tuned,
                             squared=False)

    rmse_threshold = np.std(test_data.target)

    assert rmse_on_test_tuned < rmse_threshold
Пример #16
0
def test_max_lead_time_in_tune_process(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    start = datetime.now()

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data,
                                      max_lead_time=timedelta(minutes=0.05),
                                      iterations=100)
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)

    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)
    roc_threshold = 0.6

    spent_time = (datetime.now() - start).seconds

    assert roc_on_test_tuned > roc_threshold
    assert spent_time == 3
Пример #17
0
def test_rf_class_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    rf = Model(model_type='rf')

    model, _ = rf.fit(train_data)
    test_predicted = rf.predict(fitted_model=model, data=test_data)

    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    model_tuned, _ = rf.fine_tune(data=train_data,
                                  iterations=12,
                                  max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = rf.predict(fitted_model=model_tuned, data=test_data)

    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)
    roc_threshold = 0.7

    assert test_roc_auc_tuned != test_roc_auc
    assert test_roc_auc_tuned > roc_threshold
Пример #18
0
 def __init__(self, model_type: str):
     model = Model(model_type=model_type)
     super().__init__(nodes_from=None, model=model)
Пример #19
0
    def train(self, data, model_config, learning_config, save_path='', mode=NODE_CLASSIFICATION):

        loss_fcn = torch.nn.CrossEntropyLoss()

        labels = data[LABELS]
        # initialize graph
        if mode == NODE_CLASSIFICATION:
            train_mask = data[TRAIN_MASK]
            val_mask = data[VAL_MASK]
            dur = []

            # create GNN model
            self.model = Model(g=data[GRAPH],
                               config_params=model_config,
                               n_classes=data[N_CLASSES],
                               n_rels=data[N_RELS] if N_RELS in data else None,
                               n_entities=data[N_ENTITIES] if N_ENTITIES in data else None,
                               is_cuda=learning_config['cuda'],
                               mode=mode)

            optimizer = torch.optim.Adam(self.model.parameters(),
                                         lr=learning_config['lr'],
                                         weight_decay=learning_config['weight_decay'])

            for epoch in range(learning_config['n_epochs']):
                self.model.train()
                if epoch >= 3:
                    t0 = time.time()
                # forward
                logits = self.model(None)
                loss = loss_fcn(logits[train_mask], labels[train_mask])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if epoch >= 3:
                    dur.append(time.time() - t0)

                val_acc, val_loss = self.model.eval_node_classification(labels, val_mask)
                print("Epoch {:05d} | Time(s) {:.4f} | Train loss {:.4f} | Val accuracy {:.4f} | "
                      "Val loss {:.4f}".format(epoch,
                                               np.mean(dur),
                                               loss.item(),
                                               val_acc,
                                               val_loss))

                self.early_stopping(val_loss, self.model, save_path)

                if self.early_stopping.early_stop:
                    print("Early stopping")
                    break

        elif mode == GRAPH_CLASSIFICATION:
            self.accuracies = np.zeros(10)
            graphs = data[GRAPH]                 # load all the graphs

            # debug purposes: reshuffle all the data before the splitting
            random_indices = list(range(len(graphs)))
            random.shuffle(random_indices)
            graphs = [graphs[i] for i in random_indices]
            labels = labels[random_indices]

            K = 10
            for k in range(K):                  # K-fold cross validation

                # create GNN model
                self.model = Model(g=data[GRAPH],
                                   config_params=model_config,
                                   n_classes=data[N_CLASSES],
                                   n_rels=data[N_RELS] if N_RELS in data else None,
                                   n_entities=data[N_ENTITIES] if N_ENTITIES in data else None,
                                   is_cuda=learning_config['cuda'],
                                   mode=mode)

                optimizer = torch.optim.Adam(self.model.parameters(),
                                             lr=learning_config['lr'],
                                             weight_decay=learning_config['weight_decay'])

                if learning_config['cuda']:
                    self.model.cuda()

                print('\n\n\nProcess new k')
                start = int(len(graphs)/K) * k
                end = int(len(graphs)/K) * (k+1)

                # testing batch
                testing_graphs = graphs[start:end]
                self.testing_labels = labels[start:end]
                self.testing_batch = dgl.batch(testing_graphs)

                # training batch
                training_graphs = graphs[:start] + graphs[end:]
                training_labels = labels[list(range(0, start)) + list(range(end+1, len(graphs)))]
                training_samples = list(map(list, zip(training_graphs, training_labels)))
                training_batches = DataLoader(training_samples,
                                              batch_size=learning_config['batch_size'],
                                              shuffle=True,
                                              collate_fn=collate)

                dur = []
                for epoch in range(learning_config['n_epochs']):
                    self.model.train()
                    if epoch >= 3:
                        t0 = time.time()
                    losses = []
                    training_accuracies = []
                    for iter, (bg, label) in enumerate(training_batches):
                        logits = self.model(bg)
                        loss = loss_fcn(logits, label)
                        losses.append(loss.item())
                        _, indices = torch.max(logits, dim=1)
                        correct = torch.sum(indices == label)
                        training_accuracies.append(correct.item() * 1.0 / len(label))

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    if epoch >= 3:
                        dur.append(time.time() - t0)
                    val_acc, val_loss = self.model.eval_graph_classification(self.testing_labels, self.testing_batch)
                    print("Epoch {:05d} | Time(s) {:.4f} | Train acc {:.4f} | Train loss {:.4f} "
                          "| Val accuracy {:.4f} | Val loss {:.4f}".format(epoch,
                                                                           np.mean(dur) if dur else 0,
                                                                           np.mean(training_accuracies),
                                                                           np.mean(losses),
                                                                           val_acc,
                                                                           val_loss))

                    is_better = self.early_stopping(val_loss, self.model, save_path)
                    if is_better:
                        self.accuracies[k] = val_acc

                    if self.early_stopping.early_stop:
                        print("Early stopping")
                        break
                self.early_stopping.reset()
        else:
            raise RuntimeError
Пример #20
0
 def __init__(self, model_type: ModelTypesIdsEnum):
     model = Model(model_type=model_type)
     super().__init__(nodes_from=None, model=model)
Пример #21
0
 def __init__(self, model_type: str, nodes_from: Optional[List['Node']] = None):
     model = Model(model_type=model_type)
     nodes_from = [] if nodes_from is None else nodes_from
     super().__init__(nodes_from=nodes_from, model=model)
Пример #22
0
 def __init__(self, nodes_from: Optional[List['Node']], model_type: str,
              manual_preprocessing_func: Optional[Callable] = None):
     self.nodes_from = nodes_from
     self.model = Model(model_type=model_type)
     self.cache = FittedModelCache(self)
     self.manual_preprocessing_func = manual_preprocessing_func
Пример #23
0
 def __init__(self, nodes_from: Optional[List['Node']],
              model_type: ModelTypesIdsEnum):
     model = Model(model_type=model_type)
     nodes_from = [] if nodes_from is None else nodes_from
     super().__init__(nodes_from=nodes_from, model=model)