def test_scoring_logreg_tune_correct(data_fixture, request):
    train_data, test_data = request.getfixturevalue(data_fixture)

    train_data.features = Scaling().fit(train_data.features).apply(train_data.features)
    test_data.features = Scaling().fit(test_data.features).apply(test_data.features)

    logreg = Model(model_type='logit')
    model, _ = logreg.fit(train_data)
    test_predicted = logreg.predict(fitted_model=model, data=test_data)
    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    logreg_for_tune = Model(model_type='logit')
    model_tuned, _ = logreg_for_tune.fine_tune(train_data, iterations=50,
                                               max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = logreg_for_tune.predict(fitted_model=model_tuned,
                                                   data=test_data)
    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)

    roc_threshold = 0.6
    assert round(test_roc_auc_tuned, 2) >= round(test_roc_auc, 2) > roc_threshold
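# Note (illustrative, not part of the source): tests in this file receive a
# `data_fixture` name and resolve it with `request.getfixturevalue`, the standard
# pytest idiom for parametrizing a test over fixtures. A minimal sketch of how such
# a test is typically wired, assuming a fixture named 'classification_dataset':
#
# @pytest.mark.parametrize('data_fixture', ['classification_dataset'])
# def test_with_parametrized_fixture(data_fixture, request):
#     data = request.getfixturevalue(data_fixture)
#     ...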
def test_classification_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    knn_for_tune = Model(model_type='knn')
    knn_for_tune.params = {'n_neighbors': 1}
    model, _ = knn_for_tune.fit(data=train_data)
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model, data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
def main(args):
    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)

    default_path = create_default_path()
    print('\n*** Set default saving/loading path to:', default_path)

    if args.dataset == AIFB or args.dataset == MUTAG:
        module = importlib.import_module(MODULE.format('dglrgcn'))
        data = module.load_dglrgcn(args.data_path)
        data = to_cuda(data) if cuda else data
        mode = NODE_CLASSIFICATION
    elif args.dataset in (MUTAGENICITY, PTC_MR, PTC_MM, PTC_FR, PTC_FM):
        module = importlib.import_module(MODULE.format('dortmund'))
        data = module.load_dortmund(args.data_path)
        data = to_cuda(data) if cuda else data
        mode = GRAPH_CLASSIFICATION
    else:
        raise ValueError('Unable to load dataset', args.dataset)

    print_graph_stats(data[GRAPH])

    config_params = read_params(args.config_fpath, verbose=True)

    # create GNN model
    model = Model(g=data[GRAPH],
                  config_params=config_params[0],
                  n_classes=data[N_CLASSES],
                  n_rels=data[N_RELS] if N_RELS in data else None,
                  n_entities=data[N_ENTITIES] if N_ENTITIES in data else None,
                  is_cuda=cuda,
                  mode=mode)

    if cuda:
        model.cuda()

    # 1. Training
    app = App()
    learning_config = {'lr': args.lr,
                       'n_epochs': args.n_epochs,
                       'weight_decay': args.weight_decay,
                       'batch_size': args.batch_size,
                       'cuda': cuda}
    print('\n*** Start training ***\n')
    app.train(data, config_params[0], learning_config, default_path, mode=mode)

    # 2. Testing
    print('\n*** Start testing ***\n')
    app.test(data, default_path, mode=mode)

    # 3. Delete model
    remove_model(default_path)
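# Illustrative sketch (not from the source): `main` expects an `args` namespace with
# the attributes it reads above (gpu, dataset, data_path, config_fpath, lr, n_epochs,
# weight_decay, batch_size). One way to supply them via argparse; every default below
# is a hypothetical placeholder.
def build_arg_parser():
    import argparse
    parser = argparse.ArgumentParser(description='GNN training/testing entry point')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU id; a negative value disables CUDA')
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--data_path', type=str, required=True)
    parser.add_argument('--config_fpath', type=str, required=True)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--n_epochs', type=int, default=200)
    parser.add_argument('--weight_decay', type=float, default=5e-4)
    parser.add_argument('--batch_size', type=int, default=16)
    return parser

# usage: main(build_arg_parser().parse_args())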
def test_node_factory_log_reg_correct(data_setup):
    model_type = 'logit'
    node = PrimaryNode(model_type=model_type)

    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
def test_node_factory_log_reg_correct(data_setup):
    model_type = ModelTypesIdsEnum.logit
    node = NodeGenerator().primary_node(model_type=model_type)

    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
def test_pca_model_removes_redunant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redunant_features(n_samples=1000,
                                                         n_features=100,
                                                         n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    _, train_predicted = pca.fit(data=train_data)

    assert train_predicted.shape[1] < data.features.shape[1]
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    kmeans = Model(model_type=ModelTypesIdsEnum.kmeans)
    _, train_predicted = kmeans.fit(data=train_data)

    assert all(np.unique(train_predicted) == [0, 1])
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type=ModelTypesIdsEnum.logit)
    _, train_predicted = log_reg.fit(data=train_data)
    roc_on_train = roc_auc(y_true=train_data.target, y_score=train_predicted)

    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_qda_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    qda = Model(model_type=ModelTypesIdsEnum.qda)
    _, train_predicted = qda.fit(data=train_data)
    roc_on_train = roc_auc(y_true=train_data.target, y_score=train_predicted)

    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_lda_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    lda = Model(model_type='lda')
    _, train_predicted = lda.fit(data=train_data)
    roc_on_train = get_roc_auc(train_data, train_predicted)

    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type='logit')
    _, train_predicted = log_reg.fit(data=train_data)
    roc_on_train = get_roc_auc(train_data, train_predicted)

    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_knn_classification_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)
    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data, iterations=10,
                                      max_lead_time=timedelta(minutes=1))
    # predict with the tuned model instance, not the untuned one
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model, data=test_data)
    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)

    roc_threshold = 0.6
    assert roc_on_test_tuned > roc_on_test > roc_threshold
def test_pca_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    model, _ = pca.fit(data=train_data)
    test_predicted = pca.predict(fitted_model=model, data=test_data)

    pca_for_tune = Model(model_type='pca_data_model')
    pca_for_tune.params = {'svd_solver': 'randomized',
                           'iterated_power': 'auto',
                           'dim_reduction_expl_thr': 0.7,
                           'dim_reduction_min_expl': 0.001}
    model, _ = pca_for_tune.fit(data=train_data)
    test_predicted_tuned = pca_for_tune.predict(fitted_model=model, data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
def fit_template(chain_template, classes, with_gaussian=False, skip_fit=False):
    templates_by_models = []
    for model_template in itertools.chain.from_iterable(chain_template):
        model_instance = Model(model_type=model_template.model_type)
        model_template.model_instance = model_instance
        templates_by_models.append((model_template, model_instance))
    if skip_fit:
        return

    for template, instance in templates_by_models:
        samples, features_amount = template.input_shape

        if with_gaussian:
            features, target = gauss_quantiles(samples_amount=samples,
                                               features_amount=features_amount,
                                               classes_amount=classes)
        else:
            options = {'informative': features_amount,
                       'redundant': 0,
                       'repeated': 0,
                       'clusters_per_class': 1}
            features, target = synthetic_dataset(samples_amount=samples,
                                                 features_amount=features_amount,
                                                 classes_amount=classes,
                                                 features_options=options)
        target = np.expand_dims(target, axis=1)
        data_train = InputData(idx=np.arange(0, samples),
                               features=features, target=target,
                               data_type=DataTypesEnum.table,
                               task=Task(TaskTypesEnum.classification))

        preproc_data = copy(data_train)
        preprocessor = Normalization().fit(preproc_data.features)
        preproc_data.features = preprocessor.apply(preproc_data.features)
        print(f'Fit {instance}')
        fitted_model, predictions = instance.fit(data=preproc_data)

        template.fitted_model = fitted_model
        template.data_fit = preproc_data
        template.preprocessor = preprocessor
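# Illustrative sketch (not from the source): `fit_template` only relies on template
# objects exposing `model_type` and `input_shape`, and it attaches `model_instance`,
# `fitted_model`, `data_fit` and `preprocessor` back onto them. A hypothetical
# minimal template and call, assuming the 'logit' model type is available:
from dataclasses import dataclass
from typing import Any, Tuple

@dataclass
class StubModelTemplate:
    model_type: str
    input_shape: Tuple[int, int]  # (samples, features)
    model_instance: Any = None
    fitted_model: Any = None
    data_fit: Any = None
    preprocessor: Any = None

# chain_template is an iterable of layers, each a list of templates:
# fit_template([[StubModelTemplate('logit', (100, 5))]], classes=2)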
def test_arima_tune_correct():
    data = get_synthetic_ts_data()
    train_data, test_data = train_test_data_setup(data=data)

    arima_for_tune = Model(model_type='arima')
    model, _ = arima_for_tune.fine_tune(data=train_data, iterations=5,
                                        max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = arima_for_tune.predict(fitted_model=model, data=test_data)

    rmse_on_test_tuned = mse(y_true=test_data.target, y_pred=test_predicted_tuned,
                             squared=False)
    rmse_threshold = np.std(test_data.target)

    assert rmse_on_test_tuned < rmse_threshold
def test_max_lead_time_in_tune_process(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    start = datetime.now()

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data,
                                      max_lead_time=timedelta(minutes=0.05),
                                      iterations=100)
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model, data=test_data)
    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)

    roc_threshold = 0.6
    spent_time = (datetime.now() - start).seconds

    assert roc_on_test_tuned > roc_threshold
    # tuning must respect the 0.05-minute (3-second) lead-time cap;
    # exact equality would be flaky if tuning finishes early
    assert spent_time <= 3
def test_rf_class_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    rf = Model(model_type='rf')
    model, _ = rf.fit(train_data)
    test_predicted = rf.predict(fitted_model=model, data=test_data)
    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    model_tuned, _ = rf.fine_tune(data=train_data, iterations=12,
                                  max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = rf.predict(fitted_model=model_tuned, data=test_data)
    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)

    roc_threshold = 0.7
    assert test_roc_auc_tuned != test_roc_auc
    assert test_roc_auc_tuned > roc_threshold
def __init__(self, model_type: str):
    model = Model(model_type=model_type)
    super().__init__(nodes_from=None, model=model)
def train(self, data, model_config, learning_config, save_path='',
          mode=NODE_CLASSIFICATION):
    loss_fcn = torch.nn.CrossEntropyLoss()

    labels = data[LABELS]

    # initialize graph
    if mode == NODE_CLASSIFICATION:
        train_mask = data[TRAIN_MASK]
        val_mask = data[VAL_MASK]
        dur = []

        # create GNN model
        self.model = Model(g=data[GRAPH],
                           config_params=model_config,
                           n_classes=data[N_CLASSES],
                           n_rels=data[N_RELS] if N_RELS in data else None,
                           n_entities=data[N_ENTITIES] if N_ENTITIES in data else None,
                           is_cuda=learning_config['cuda'],
                           mode=mode)

        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=learning_config['lr'],
                                     weight_decay=learning_config['weight_decay'])

        for epoch in range(learning_config['n_epochs']):
            self.model.train()
            if epoch >= 3:
                t0 = time.time()

            # forward
            logits = self.model(None)
            loss = loss_fcn(logits[train_mask], labels[train_mask])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if epoch >= 3:
                dur.append(time.time() - t0)

            val_acc, val_loss = self.model.eval_node_classification(labels, val_mask)
            print("Epoch {:05d} | Time(s) {:.4f} | Train loss {:.4f} | "
                  "Val accuracy {:.4f} | Val loss {:.4f}".format(
                      epoch, np.mean(dur), loss.item(), val_acc, val_loss))

            self.early_stopping(val_loss, self.model, save_path)

            if self.early_stopping.early_stop:
                print("Early stopping")
                break

    elif mode == GRAPH_CLASSIFICATION:
        self.accuracies = np.zeros(10)
        graphs = data[GRAPH]  # load all the graphs

        # debug purposes: reshuffle all the data before the splitting
        random_indices = list(range(len(graphs)))
        random.shuffle(random_indices)
        graphs = [graphs[i] for i in random_indices]
        labels = labels[random_indices]

        K = 10
        for k in range(K):  # K-fold cross validation
            # create GNN model
            self.model = Model(g=data[GRAPH],
                               config_params=model_config,
                               n_classes=data[N_CLASSES],
                               n_rels=data[N_RELS] if N_RELS in data else None,
                               n_entities=data[N_ENTITIES] if N_ENTITIES in data else None,
                               is_cuda=learning_config['cuda'],
                               mode=mode)

            optimizer = torch.optim.Adam(self.model.parameters(),
                                         lr=learning_config['lr'],
                                         weight_decay=learning_config['weight_decay'])

            if learning_config['cuda']:
                self.model.cuda()

            print('\n\n\nProcess new k')
            start = int(len(graphs) / K) * k
            end = int(len(graphs) / K) * (k + 1)

            # testing batch
            testing_graphs = graphs[start:end]
            self.testing_labels = labels[start:end]
            self.testing_batch = dgl.batch(testing_graphs)

            # training batch; the label slice must start at `end` to stay aligned
            # with graphs[end:] (the original `end + 1` dropped one label)
            training_graphs = graphs[:start] + graphs[end:]
            training_labels = labels[list(range(0, start)) +
                                     list(range(end, len(graphs)))]
            training_samples = list(map(list, zip(training_graphs, training_labels)))
            training_batches = DataLoader(training_samples,
                                          batch_size=learning_config['batch_size'],
                                          shuffle=True,
                                          collate_fn=collate)

            dur = []
            for epoch in range(learning_config['n_epochs']):
                self.model.train()
                if epoch >= 3:
                    t0 = time.time()
                losses = []
                training_accuracies = []
                for iter, (bg, label) in enumerate(training_batches):
                    logits = self.model(bg)
                    loss = loss_fcn(logits, label)
                    losses.append(loss.item())
                    _, indices = torch.max(logits, dim=1)
                    correct = torch.sum(indices == label)
                    training_accuracies.append(correct.item() * 1.0 / len(label))

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                if epoch >= 3:
                    dur.append(time.time() - t0)

                val_acc, val_loss = self.model.eval_graph_classification(
                    self.testing_labels, self.testing_batch)
                print("Epoch {:05d} | Time(s) {:.4f} | Train acc {:.4f} | "
                      "Train loss {:.4f} | Val accuracy {:.4f} | "
                      "Val loss {:.4f}".format(epoch,
                                               np.mean(dur) if dur else 0,
                                               np.mean(training_accuracies),
                                               np.mean(losses),
                                               val_acc,
                                               val_loss))

                is_better = self.early_stopping(val_loss, self.model, save_path)
                if is_better:
                    self.accuracies[k] = val_acc

                if self.early_stopping.early_stop:
                    print("Early stopping")
                    break

            self.early_stopping.reset()
    else:
        raise RuntimeError
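# Illustrative sketch (not from the source): `collate` above is passed to the
# DataLoader over (graph, label) samples. A common DGL-style batching function,
# assuming labels are scalar class ids:
import dgl
import torch

def collate(samples):
    # each sample is a [graph, label] pair, as built by the zip above
    graphs, labels = map(list, zip(*samples))
    return dgl.batch(graphs), torch.tensor(labels)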
def __init__(self, model_type: ModelTypesIdsEnum):
    model = Model(model_type=model_type)
    super().__init__(nodes_from=None, model=model)
def __init__(self, model_type: str,
             nodes_from: Optional[List['Node']] = None):
    model = Model(model_type=model_type)
    nodes_from = [] if nodes_from is None else nodes_from
    super().__init__(nodes_from=nodes_from, model=model)
def __init__(self, nodes_from: Optional[List['Node']],
             model_type: str,
             manual_preprocessing_func: Optional[Callable] = None):
    self.nodes_from = nodes_from
    self.model = Model(model_type=model_type)
    self.cache = FittedModelCache(self)
    self.manual_preprocessing_func = manual_preprocessing_func
def __init__(self, nodes_from: Optional[List['Node']],
             model_type: ModelTypesIdsEnum):
    model = Model(model_type=model_type)
    nodes_from = [] if nodes_from is None else nodes_from
    super().__init__(nodes_from=nodes_from, model=model)
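# Illustrative sketch (not from the source): the constructors above wrap a Model in
# a chain node. Assuming PrimaryNode takes only a model_type (as in the tests above)
# and the nodes_from-accepting constructor belongs to a class here called
# SecondaryNode (an assumed name), a two-node chain could be composed as:
#
# first = PrimaryNode(model_type='logit')
# second = SecondaryNode(nodes_from=[first], model_type='knn')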