示例#1
0
def run(i_cv):
    """Run one cross-validation iteration of the GradientBoosting benchmark.

    A training sample and a test sample are drawn from two ``Generator``
    instances seeded from ``SEED`` and ``i_cv``. A
    ``GradientBoostingClassifier`` is fitted on the training sample and then
    evaluated on both samples; ``evaluate_pivotal`` is finally called with
    the test generator.

    Args:
        i_cv: Iteration index. It names the ``cv_<i_cv>`` sub-directory of
            ``DIRECTORY`` and offsets the seeds by ``5 * i_cv``.

    Returns:
        A dict starting as ``{'i_cv': i_cv}`` and updated with the values
        returned by ``evaluate_classifier`` for the train and test samples.
    """
    n_sig = 15000
    n_bkg = n_sig
    n_samples = n_sig + n_bkg
    mix = n_sig / n_samples

    model_name = 'GradientBoosting'
    directory = os.path.join(DIRECTORY, f'cv_{i_cv}')
    os.makedirs(directory, exist_ok=True)
    print(f'running iter {i_cv}...')

    # Train and test seeds are consecutive so the two samples differ.
    train_seed = SEED + 5 * i_cv
    test_seed = train_seed + 1

    # Training sample
    train_generator = Generator(train_seed)
    z_train = train_generator.sample_nuisance(n_sig)
    X_train, y_train = train_generator.sample_event(z_train, mix, n_samples)

    # Fit the classifier
    model = GradientBoostingClassifier(n_estimators=400, learning_rate=5e-2)
    model.fit(X_train, y_train)

    # Test sample
    test_generator = Generator(test_seed)
    z_test = test_generator.sample_nuisance(n_sig)
    X_test, y_test = test_generator.sample_event(z_test, mix, n_samples)

    # Keyword arguments shared by every evaluation helper below.
    report = dict(model_name=model_name, directory=directory)

    results = {'i_cv': i_cv}
    samples = (('train', X_train, y_train), ('test', X_test, y_test))
    for prefix, X, y in samples:
        results.update(evaluate_classifier(model, X, y, prefix=prefix, **report))

    evaluate_pivotal(model, test_generator, prefix='test', **report)

    return results
def evaluate_parameters():
    """Evaluate the classifier pipeline on every hyper-parameter combination.

    Loads the training data with ``get_train_data(limit=25)``. For each
    combination of the candidate values listed below, a classifier is built
    with ``get_classifier`` and passed to ``e.evaluate_classifier`` along
    with the plot name for that run.

    Returns:
        None.
    """
    X, y = get_train_data(limit=25)

    print('Start learning...')
    # One list of candidate values per hyper-parameter; extend a list to
    # widen the sweep.
    forests = [70]
    rbm_components = [1100]
    rbm_learning_rate = [0.06]
    rbm_n_iter = [20]

    grid = itertools.product(forests, rbm_components, rbm_learning_rate,
                             rbm_n_iter)

    for trees, components, learning_rate, n_iter in grid:
        classifier = get_classifier(trees, components, learning_rate, n_iter)
        # NOTE: the plot name encodes only the forest size, so combinations
        # sharing a `trees` value receive the same `name`.
        name = "plots_pipeline/pipeline_{}.png".format(trees)
        e.evaluate_classifier(classifier, X, y, name=name)
示例#3
0
def evaluate_parameters():
    """Evaluate the classifier pipeline on every hyper-parameter combination.

    Loads the training data with ``get_train_data(limit=25)``. For each
    combination of the candidate values listed below, a classifier is built
    with ``get_classifier`` and passed to ``e.evaluate_classifier`` along
    with the plot name for that run.

    Returns:
        None.
    """
    X, y = get_train_data(limit=25)

    print('Start learning...')
    # One list of candidate values per hyper-parameter; extend a list to
    # widen the sweep.
    forests = [70]
    rbm_components = [1100]
    rbm_learning_rate = [0.06]
    rbm_n_iter = [20]

    grid = itertools.product(forests, rbm_components, rbm_learning_rate,
                             rbm_n_iter)

    for trees, components, learning_rate, n_iter in grid:
        classifier = get_classifier(trees, components, learning_rate, n_iter)
        # NOTE: the plot name encodes only the forest size, so combinations
        # sharing a `trees` value receive the same `name`.
        name = "plots_pipeline/pipeline_{}.png".format(trees)
        e.evaluate_classifier(classifier, X, y, name=name)
示例#4
0
def run_arc_test(X_train, y_train, X_test, y_test, params, algo_name):
    """Train and evaluate one network per architecture in ``params['arcs']``.

    The feature inputs are converted to NumPy arrays and transposed; the
    label inputs are reshaped into a single row of shape ``(1, n)``. For each
    architecture a network is trained with ``simple_ann.train_network``. The
    parameters stored under the ``params['num_iter']`` key of its second
    return value are used for a forward pass over the test features. The
    outputs are thresholded at 0.5 and handed to ``evaluate_classifier``
    together with the test labels.

    Args:
        X_train, y_train: Training features and labels (array-likes).
        X_test, y_test: Test features and labels (array-likes).
        params: Mapping read for the keys ``'arcs'``, ``'num_iter'``,
            ``'checkpoints'``, ``'learning_rate'``, ``'learn_adjust'`` and
            ``'weights'``.
        algo_name: Label used as the prefix of the printed banner title.

    Returns:
        None. Progress and results are printed.
    """
    train_features = np.array(X_train).T
    train_labels = np.array(y_train).reshape(-1, 1).T
    test_features = np.array(X_test).T
    test_labels = np.array(y_test).reshape(-1, 1).T

    for arc in params['arcs']:
        layer_dims = list(arc)
        n_iter = params['num_iter']

        # Only the checkpointed parameters are needed; the other two return
        # values of train_network are not used here.
        _, checkpoints, _ = simple_ann.train_network(
            train_features,
            train_labels,
            test_features,
            test_labels,
            layer_dims,
            num_iterations=n_iter,
            num_checkpoints=n_iter / params['checkpoints'],
            c_plot=False,
            learning_rate=params['learning_rate'],
            learn_adjust=params['learn_adjust'],
            weights=params['weights'])

        # Forward pass with the parameters saved at the final iteration.
        _, probabilities = simple_ann.forward_propagation(
            test_features, checkpoints[n_iter], layer_dims)
        predictions = (probabilities > 0.5).astype(float)

        print('\tPredictions == 1: ', predictions.sum())

        title = '%s_test_%s' % (algo_name, str(layer_dims))
        rule = '-------------------------------'
        print(rule)
        print('\t\t%s\t\t' % title)
        print(rule)

        evaluate_classifier(np.array(test_labels).reshape(1, -1),
                            predictions.reshape(1, -1))
        print('\n\n')
def evaluate_with_comparison(networks, dataloader, comparison_dataloader=None, **options):
    """Evaluate ``networks`` under an ``openset_*`` fold and return that fold's results.

    The fold name is ``openset_<last path component of options['data_dir']>``,
    followed by ``_<mode>`` when ``options['mode']`` is set and by
    ``_<count>`` when ``options['aux_dataset']`` is set. The count comes from
    ``dsf.count()`` of a ``CustomDataloader`` built on the aux dataset.

    Args:
        networks: Passed through to the ``evaluation`` helpers.
        dataloader: Passed through to the ``evaluation`` helpers.
        comparison_dataloader: Optional second dataloader. When it is not
            ``None``, the results of ``evaluation.evaluate_openset`` are
            merged into the fold's entry.
        **options: Must contain ``'data_dir'``. ``'mode'`` and
            ``'aux_dataset'`` are optional. All options, plus the computed
            ``'fold'``, are forwarded to the ``evaluation`` helpers.

    Returns:
        The entry stored under the fold name in the results returned by
        ``evaluation.evaluate_classifier``.
    """
    # The fold name used to be derived from the comparison dataloader's
    # dataset name; it is now derived from the data directory instead.
    fold = 'openset_{}'.format(options['data_dir'].split('/')[-1])

    mode = options.get('mode')
    if mode:
        fold += '_{}'.format(mode)

    if options.get('aux_dataset'):
        aux_dataset = CustomDataloader(options['aux_dataset'])
        fold = '{}_{}'.format(fold, aux_dataset.dsf.count())

    options['fold'] = fold

    new_results = evaluation.evaluate_classifier(
        networks, dataloader, comparison_dataloader, **options)

    if comparison_dataloader is not None:
        openset_results = evaluation.evaluate_openset(
            networks, dataloader, comparison_dataloader, **options)
        new_results[fold].update(openset_results)

    return new_results[fold]
示例#6
0
# Build the comparison dataloader (if a comparison dataset was requested) and
# name the evaluation fold after that dataset.
comparison_dataloader = None
if options['comparison_dataset']:
    comparison_options = options.copy()
    comparison_options['dataset'] = options['comparison_dataset']
    comparison_dataloader = CustomDataloader(
        last_batch=True, shuffle=False, **comparison_options)

    # Dataset file name, truncated at its first dot.
    comparison_name = options['comparison_dataset'].split('/')[-1]
    comparison_name = comparison_name.split('.')[0]

    # Number of entries in <result_dir>/labels, or 0 if it does not exist.
    labels_dir = os.path.join(options['result_dir'], 'labels')
    label_count = len(os.listdir(labels_dir)) if os.path.exists(labels_dir) else 0

    # Hack: the label count is deliberately left out of the fold name.
    # The earlier form was:
    #   'openset_{}_{:04d}'.format(comparison_name, label_count)
    options['fold'] = 'openset_{}'.format(comparison_name)

new_results = evaluate_classifier(
    networks, dataloader, comparison_dataloader, **options)

if options['comparison_dataset']:
    openset_results = evaluate_openset(
        networks, dataloader, comparison_dataloader, **options)
    pprint(openset_results)
    # Open-set metrics get their own key; the label count is attached to the
    # closed-set entry of the same fold.
    new_results['{}_openset'.format(options['fold'])] = openset_results
    new_results[options['fold']]['active_learning_label_count'] = label_count

save_evaluation(new_results, options['result_dir'], options['epoch'])
def run(i_cv):
    """Run one cross-validation iteration of the Pivot classifier benchmark.

    A training sample and a test sample are drawn from two ``Generator``
    instances seeded from ``SEED`` and ``i_cv``. A ``PivotBinaryClassifier``
    (classifier network plus adversarial density network) is fitted on the
    training sample together with its nuisance values, then evaluated.

    Args:
        i_cv: Iteration index. It names the ``cv_<i_cv>`` sub-directory of
            ``DIRECTORY`` and offsets the seeds by ``5 * i_cv``.

    Returns:
        A dict starting as ``{'i_cv': i_cv}`` and updated with the values
        returned by ``evaluate_neural_net`` and by ``evaluate_classifier``
        for the train and test samples.
    """
    n_sig = 15000
    n_bkg = n_sig
    n_samples = n_sig + n_bkg
    mix = n_sig / n_samples

    model_name = 'PivotClassifier'
    directory = os.path.join(DIRECTORY, f'cv_{i_cv}')
    os.makedirs(directory, exist_ok=True)
    print(f'running iter {i_cv}...')

    # Train and test seeds are consecutive so the two samples differ.
    train_seed = SEED + 5 * i_cv
    test_seed = train_seed + 1

    # Training sample
    train_generator = Generator(train_seed)
    nuisance = train_generator.sample_nuisance(n_sig)
    X_train, y_train = train_generator.sample_event(nuisance, mix, n_samples)
    # Prepend one zero per background event.
    # NOTE(review): presumably this lines z_train up with the n_samples
    # events (background first) - confirm against Generator.sample_event.
    z_train = np.concatenate((np.zeros(n_bkg), nuisance), axis=0)

    # Pivot components: classifier, adversary and their losses.
    net = F3Classifier(n_in=2, n_out=1)
    adv_net = F3GausianMixtureDensity(n_in=1, n_components=5)
    # nn.CrossEntropyLoss() was the earlier choice for the classifier loss.
    net_criterion = nn.BCEWithLogitsLoss()
    adv_criterion = ADVLoss()

    # Adam with lowered beta1/beta2: reducing optimizer inertia helps the
    # density network. (Plain SGD with lr=1e-3 was the alternative tried.)
    adam_settings = dict(lr=1e-3, betas=(0.5, 0.9))
    net_optimizer = optim.Adam(net.parameters(), **adam_settings)
    adv_optimizer = optim.Adam(adv_net.parameters(), **adam_settings)

    model = PivotBinaryClassifier(
        net,
        adv_net,
        net_criterion,
        adv_criterion,
        TRADE_OFF,
        net_optimizer,
        adv_optimizer,
        n_net_pre_training_steps=500,
        n_adv_pre_training_steps=3000,
        n_steps=2000,
        n_recovery_steps=20,
        batch_size=128,
        rescale=True,
        cuda=False,
        verbose=0,
    )

    # Fit the Pivot model
    model.fit(X_train, y_train, z_train)

    # Test sample
    test_generator = Generator(test_seed)
    z_test = test_generator.sample_nuisance(n_sig)
    X_test, y_test = test_generator.sample_event(z_test, mix, n_samples)

    # Keyword arguments shared by every evaluation helper below.
    report = dict(model_name=model_name, directory=directory)

    results = {'i_cv': i_cv}
    results.update(evaluate_neural_net(model, prefix='train', **report))

    evaluate_pivotal(model, test_generator, prefix='test', **report)

    samples = (('train', X_train, y_train), ('test', X_test, y_test))
    for prefix, X, y in samples:
        results.update(evaluate_classifier(model, X, y, prefix=prefix, **report))

    return results
import numpy as np
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# Read the training set, then run it through rd.nudge_dataset.
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

# Holders for cross-validation statistics; they stay empty while the
# cross_val_score lines in the loop remain disabled.
scores, scores_std = [], []

# Progress marker: shows the run got past data loading.
print('Start learning...')
# Forest sizes to sweep; the largest ones may be excessive.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]

for tree in forests:
    print("This forest has {} trees!".format(tree))
    classifier = RandomForestClassifier(n_estimators=tree)
    # Disabled cross-validation scoring:
    #   score = cross_validation.cross_val_score(classifier, X, y)
    #   scores.append(np.mean(score))
    #   scores_std.append(np.std(score))
    name = "plots_extended/RandomForest_{}_trees.png".format(tree)
    e.evaluate_classifier(classifier, X, y, name=name)

# Disabled summary of the cross-validation statistics:
#   print('Score: ', np.array(scores))
#   print('Std  : ', np.array(scores_std))
示例#9
0
    options['result_dir']))
# Resolve the complete option set before anything reads from it.
options = load_options(options)
result_dir = options['result_dir']

# Pin the run to the newest epoch saved in the result directory.
print("Switching to the most recent version of the network saved in {}".format(
    result_dir))
options['epoch'] = get_current_epoch(result_dir)

print("Loading dataset from file {}".format(options['dataset']))
dataloader = CustomDataloader(last_batch=True, shuffle=False, **options)

print("Loading neural network weights...")
nets = build_networks(dataloader.num_classes, **options)

examples.run_example_code(nets, dataloader, **options)

print("Evaluating the accuracy of the classifier on the {} fold".format(
    options['fold']))
new_results = evaluate_classifier(nets, dataloader, verbose=False, **options)

print("Results from evaluate_classifier:")
pprint(new_results)

# Hold the result-directory lock while writing; it is released even if the
# write fails.
acquire_lock(result_dir)
try:
    print("Saving results in {}".format(result_dir))
    filename = os.path.join(result_dir, 'example_results.json')
    with open(filename, 'w') as fp:
        fp.write(json.dumps(new_results, indent=2))
finally:
    release_lock(result_dir)
示例#10
0
# Train the classifier labelled "ADASYN" with its own parameter set.
lr_adasyn = train_classifier(X_train, y_train, log_reg, lr_adasyn_params,
                             "ADASYN")


# In[110]:


# Lay the test labels out as a single row, the same layout the predictions
# below are reshaped to.
y_test = np.array(y_test).reshape(1, -1)


# #####  Evaluation Logistic Regression + SMOTE

# In[111]:


y_lr_smote_pred = lr_smote.predict(X_test)
y_lr_smote_pred = y_lr_smote_pred.reshape(1, -1)
evaluate_classifier(y_test, y_lr_smote_pred)


# #####  Evaluation Logistic Regression + Borderline

# In[112]:


y_lr_borderline_pred = lr_borderline.predict(X_test)
y_lr_borderline_pred = y_lr_borderline_pred.reshape(1, -1)
evaluate_classifier(y_test, y_lr_borderline_pred)


# #####  Evaluation Logistic Regression + ADASYN

# In[113]:
示例#11
0
import numpy as np
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# Read the training set, then run it through rd.nudge_dataset.
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

# Holders for cross-validation statistics; they stay empty while the
# cross_val_score lines in the loop remain disabled.
scores, scores_std = [], []

# Progress marker: shows the run got past data loading.
print('Start learning...')
# Forest sizes to sweep; the largest ones may be excessive.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]

for tree in forests:
    print("This forest has {} trees!".format(tree))
    classifier = RandomForestClassifier(n_estimators=tree)
    # Disabled cross-validation scoring:
    #   score = cross_validation.cross_val_score(classifier, X, y)
    #   scores.append(np.mean(score))
    #   scores_std.append(np.std(score))
    name = "plots_extended/RandomForest_{}_trees.png".format(tree)
    e.evaluate_classifier(classifier, X, y, name=name)

# Disabled summary of the cross-validation statistics:
#   print('Score: ', np.array(scores))
#   print('Std  : ', np.array(scores_std))