示例#1
0
def q20():
	"""Select the ridge-regression lambda by 5-fold cross-validation.

	Splits the training set into five fixed folds of 40 samples, averages
	the fold validation error for each candidate lambda, then refits the
	best lambda on the full training set and prints E_in / E_out.
	"""
	def cross_validate(X, y, lamda):
		# Fixed, non-shuffled 5-fold split; assumes len(X) == 200 — TODO confirm.
		Xs = X[:40], X[40:80], X[80:120], X[120:160], X[160:]
		ys = y[:40], y[40:80], y[80:120], y[120:160], y[160:]
		reg = ridge.RidgeRegression(lamda)
		e_cv = 0.0
		for i in range(5):
			X_val, y_val = Xs[i], ys[i]
			# Train on the other four folds.
			X_train = np.concatenate([Xs[j] for j in range(5) if i != j])
			y_train = np.concatenate([ys[j] for j in range(5) if i != j])
			reg.fit(X_train, y_train)
			e_cv += reg.evaluate(X_val, y_val, sign)
		return e_cv / 5
	X_train, y_train = load_train()
	X_test, y_test = load_test()
	# Candidate lambdas: 10^-10 .. 10^2, one per decade.
	lamdas = np.logspace(-10, 2, 13)
	best = (1.0, None)  # (lowest E_cv so far, its lambda); 0/1 error <= 1.0
	for lamda in lamdas:
		e_cv = cross_validate(X_train, y_train, lamda)
		# "<=" keeps the larger lambda on ties (lamdas ascend).
		if e_cv <= best[0]:
			best = (e_cv, lamda)
	best_reg = ridge.RidgeRegression(best[1])
	best_reg.fit(X_train, y_train)
	print "lamda: %e, E_in: %.3f, E_out: %.3f" % (best[1],
		best_reg.evaluate(X_train, y_train, sign),
		best_reg.evaluate(X_test, y_test, sign))
示例#2
0
 def predict_test(self, path: str, batch_size: int, fout: TextIO):
     """
     Predict on the test dataset and write a 'id,prediction' CSV to fout.
     Args:
         path:  test dataset path
         batch_size:  mini-batch size used for inference
         fout:  output file
     """
     data = dataset.load_test(path)
     device = torch.device(
         'cuda') if torch.cuda.is_available() else torch.device('cpu')  # pylint: disable=no-member
     # Batches are sorted by descending comment length to minimize padding;
     # shuffle/train are off so output order is deterministic.
     tst_itr = BucketIterator(data,
                              device=device,
                              batch_size=batch_size,
                              shuffle=False,
                              train=False,
                              sort_within_batch=True,
                              sort_key=lambda exam: -len(exam.comment_text))
     print('id,prediction', file=fout)
     for step, batch in enumerate(tqdm(tst_itr, mininterval=1, ncols=100),
                                  start=1):
         if step % 1000 == 0:
             # BUG FIX: the format string had no argument, so the log always
             # printed the literal '%dk-th step..'. Supply the step count.
             logging.info('%dk-th step..', step // 1000)
         with torch.no_grad():
             outputs = self.model(batch.comment_text)
             # Column 1 of the softmax is the positive-class probability.
             for id_, output in zip(batch.id, torch.softmax(outputs,
                                                            dim=1)):  # pylint: disable=no-member
                 print(f'{id_},{output[1].item()}', file=fout)
def compare_manual_vs_model():
    """Compare model predictions with manual (and rule-improved manual)
    classifications on the files in FILES, printing multilabel metrics."""

    # NOTE(review): y_dict is loaded here but never used in this function.
    with open(DATA_FOLDER + "labels_int.p", "r") as f:
        y_dict = pickle.load(f)

    print "Loading test data"
    X_test, y_test, filenames_test = dataset.load_test()
    # Pre-computed model predictions, aligned with the test-set order.
    y_pred = joblib.load("../models/pred_ml_improved.pkl")

    # Keep only files that also have a manual classification.
    relevant = []
    for pred, correct, filename in zip(y_pred, y_test, filenames_test):
        if filename in FILES:
            relevant.append((pred, correct, filename, CLASSIFICATIONS[filename]))

    model_predictions, correct, filename, manual_predictions = zip(*relevant)
    # Binarize label lists into multilabel indicator matrices.
    manual_predictions = learn.multilabel_binary_y(manual_predictions)
    model_predictions = np.array(model_predictions)
    correct = learn.multilabel_binary_y(correct)

    # Improve the manual labels with inferred topology rules.
    rules = infer_topology.infer_topology_rules()
    improved_manual = infer_topology.apply_topology_rules(rules, manual_predictions)

    prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"]
    predictions = [model_predictions, manual_predictions, improved_manual]

    # Report the same metric suite for each prediction source.
    for name, pred in zip(prediction_names, predictions):

        print "\n{}\n--".format(name)
        print "Zero-one classification loss", zero_one_loss(correct, pred)
        print "Hamming loss", hamming_loss(correct, pred)
        print "Precision:", precision_score(correct, pred, average="weighted", labels=label_list)
        print "Recall   :", recall_score(correct, pred, average="weighted", labels=label_list)
        print "F1 score :", f1_score(correct, pred, average="weighted", labels=label_list)
示例#4
0
def lr_with_scale3():
    """
    Check the performance of normalizing TEST SET.

    Submission: lr_with_scale3_0707_04.csv
    E_val:
    E_in: 0.879233
    E_out: 0.8770121701777971

    Submission: lr_with_scale3_0712_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    # Deliberately fit the scaler on train + test features combined —
    # this experiment measures the effect of normalizing with test-set
    # statistics (see docstring), so the leakage is intentional.
    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    # 'auto' is the old sklearn spelling of 'balanced' class weights.
    clf = LogisticRegression(C=0.03, class_weight='auto')
    clf.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    # Dump scaler + classifier together so the submission applies the same
    # scaling to the raw test features.
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('lr', clf)]), 'lr_with_scale3_0712_01')

    # Cross-validated AUC as the validation estimate (default fold count).
    scores = cross_val_score(clf, X_scaled, y, scoring='roc_auc', n_jobs=-1)
    logger.debug('E_val: %f <- %s', np.average(scores), scores)
示例#5
0
def test():
	"""Baseline run: ridge regression with lambda = 0 (plain least squares);
	print the in-sample and out-of-sample 0/1 errors."""
	X_train, y_train = load_train()
	X_test, y_test = load_test()
	lamda = 0
	reg = ridge.RidgeRegression(lamda)
	reg.fit(X_train, y_train)
	e_in = reg.evaluate(X_train, y_train, sign)
	e_out = reg.evaluate(X_test, y_test, sign)
	print "E_in: %.3f, E_out: %.3f" % (e_in, e_out)
示例#6
0
def rf():
    """
    Submission: rf_0708_01.csv
    3000 trees
    E_val: 0.871837
    E_in: 0.999998
    E_out: 0.882316801296279
    15000 trees
    E_val: 0.872011
    E_in: 0.999998
    E_out: 0.8824869811781106
    30000 trees
    E_val: 0.871928
    E_in:
    E_out:

    depth=4; 12000 trees
    E_val: 0.969158
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    # depth=1 feature set — presumably selects a feature depth in
    # dataset.load_train; TODO confirm against the dataset module.
    X, y = dataset.load_train(depth=1)

    # Scaler is fitted on train + test features combined (intentional here,
    # matching the other experiments in this file).
    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)
    # Drop the unscaled copy before training the large forest to cut peak RSS.
    del X
    import gc
    gc.collect()

    # oob_score=True provides the out-of-bag validation estimate logged below.
    rf = RandomForestClassifier(n_estimators=12000, oob_score=True, n_jobs=-1,
                                class_weight='auto')
    rf.fit(X_scaled, y)

    logger.debug('RandomForestClassifier fitted')

    logger.debug('E_val(oob): %f', rf.oob_score_)
    logger.debug('E_in(full): %f', Util.auc_score(rf, X_scaled, y))

    # Also report E_in on the default (depth=0) feature set for comparison.
    X, y = dataset.load_train()
    X_scaled = raw_scaler.transform(X)
    logger.debug('E_in (depth=0): %f', Util.auc_score(rf, X_scaled, y))
    del X
    gc.collect()

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('rf', rf)]), 'rf_0708_01')

    logger.debug('caching fitted RandomForestClassifier')
    IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.12000.pkl'))
    logger.debug('cached fitted RandomForestClassifier')
示例#7
0
def to_submission(clf, filename):
    """Write the classifier's positive-class probabilities for the test set
    to a CSV file of '<enrollment_id>,<probability>' rows.

    The path is normalized to live under submission/, and unless it already
    ends in .csv it is suffixed with '.not-submitted.csv' as a marker.
    """
    path = filename
    if not path.startswith('submission/'):
        path = 'submission/' + path
    if not path.endswith('.csv'):
        path += '.not-submitted.csv'
    enroll_ids = util.load_enrollment_test()['enrollment_id']
    probs = clf.predict_proba(dataset.load_test())[:, 1]
    with open(path, 'w') as out:
        for pair in zip(enroll_ids, probs):
            out.write('%d,%f\n' % pair)
示例#8
0
def to_submission(clf, filename):
    """Dump test-set positive-class probabilities as '<id>,<prob>' CSV rows.

    Paths are forced under submission/; anything not already ending in .csv
    gets a '.not-submitted.csv' suffix so unfinished files are obvious.
    """
    if filename.startswith('submission/'):
        path = filename
    else:
        path = 'submission/' + filename
    if not path.endswith('.csv'):
        path = path + '.not-submitted.csv'
    ids = util.load_enrollment_test()['enrollment_id']
    scores = clf.predict_proba(dataset.load_test())[:, 1]
    rows = ['%d,%f\n' % pair for pair in zip(ids, scores)]
    with open(path, 'w') as sink:
        sink.writelines(rows)
示例#9
0
def gbdt_search():
    """
    Grid search for best n_estimators.
    Best params: {'loss': 'deviance', 'n_estimators': 100}
    Submission: gbdt_search_0707_01.csv
    E_val: 0.883786743214
    E_in: 0.887785
    E_out: 0.8848760405053878
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()
    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    param_grid = {
        'loss': ['deviance', 'exponential'],
        'n_estimators': np.arange(100, 1001, 100)
    }

    # Fixed (non-searched) hyperparameters shared by every candidate.
    params = {'learning_rate': 0.1, 'subsample': 0.5}

    gb = GradientBoostingClassifier(**params)
    grid = GridSearchCV(gb, param_grid, scoring='roc_auc', n_jobs=-1,
                        cv=StratifiedKFold(y, 5), refit=True, verbose=1)
    grid.fit(X_scaled, y)

    logger.debug('Got best GBDT.')
    logger.debug('Grid scores: ')
    for i, grid_score in enumerate(grid.grid_scores_):
        print('\t%d00: %s' % (i + 1, grid_score))
    logger.debug('Best score (E_val): %s', grid.best_score_)
    logger.debug('Best params: %s', grid.best_params_)

    IO.cache(grid, Path.of_cache('gbdt_search.GridSearchCV.pkl'))

    # Re-fit the scaler on train + test features (intentional transductive
    # scaling, consistent with the other experiments in this file).
    X_test = dataset.load_test()
    raw_scaler.fit(np.r_[X, X_test])
    X_scaled = raw_scaler.transform(X)

    # Retrain a fresh classifier with the best searched params on the
    # re-scaled training data.
    params.update(grid.best_params_)
    clf = GradientBoostingClassifier(**params)
    clf.fit(X_scaled, y)

    # BUG FIX: the original fitted `clf` on the re-scaled data but then
    # evaluated and dumped `grid` (fitted under the old scaling), discarding
    # `clf` entirely. Use the freshly trained `clf` for both.
    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scaler', raw_scaler),
                                 ('gbdt', clf)]), 'gbdt_search_0707_01')
示例#10
0
def q15():
	"""Select lambda by minimizing E_out directly (test-set selection —
	presumably an exercise baseline to compare against validation-based
	selection); print the chosen lambda with its E_in / E_out."""
	X_train, y_train = load_train()
	X_test, y_test = load_test()
	# Candidate lambdas: 10^-10 .. 10^2, one per decade.
	lamdas = np.logspace(-10, 2, 13)
	best = (1.0, None)  # (lowest E_out so far, its fitted regressor)
	for lamda in lamdas:
		reg = ridge.RidgeRegression(lamda)
		reg.fit(X_train, y_train)
		e_out = reg.evaluate(X_test, y_test, sign)
		# "<=" keeps the larger lambda on ties (lamdas ascend).
		if e_out <= best[0]:
			best = (e_out, reg)
	best_reg = best[1]
	print "lamda: %e, E_in: %.3f, E_out: %.3f" % (
		best_reg.lamda,
		best_reg.evaluate(X_train, y_train, sign),
		best_reg.evaluate(X_test, y_test, sign))
示例#11
0
def gbdt_grid():
    """
    Grid search for best params.
    Best params: {'learning_rate': 0.05, 'subsample': 0.3}
    Submission: gbdt_grid_0706_03.csv
    E_val: 0.860118290628
    E_in: 0.882949
    E_out: 0.8809314555068068
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()
    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    # Only learning rate and subsample ratio are searched; the tree count is
    # fixed at 3000 for every candidate.
    param_grid = {
        'learning_rate': [0.05, 0.1],
        'subsample': [0.3, 0.5, 0.7]
    }

    grid = GridSearchCV(GradientBoostingClassifier(n_estimators=3000),
                        param_grid, scoring='roc_auc', n_jobs=-1,
                        cv=StratifiedKFold(y, 5), refit=False, verbose=1)
    grid.fit(X_scaled, y)

    logger.debug('Got best GBDT.')
    logger.debug('Grid scores: %s', grid.grid_scores_)
    logger.debug('Best score (E_val): %s', grid.best_score_)
    logger.debug('Best params: %s', grid.best_params_)

    # Re-fit the scaler on train + test features (intentional transductive
    # scaling), then rescale the training set. The original called
    # fit_transform and discarded the transformed array; fit() suffices.
    X_test = dataset.load_test()
    raw_scaler.fit(np.r_[X, X_test])
    X_scaled = raw_scaler.transform(X)

    # BUG FIX: grid.best_params_ only contains the searched keys, so the
    # original rebuild silently fell back to the default n_estimators=100
    # even though the search ran with 3000 trees. Pin it explicitly.
    clf = GradientBoostingClassifier(n_estimators=3000, **grid.best_params_)
    clf.fit(X_scaled, y)

    IO.cache(grid, Path.of_cache('gbdt_grid.GridSearchCV.pkl'))

    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scaler', raw_scaler),
                                 ('gbdt', clf)]), 'gbdt_grid_0706_03')
示例#12
0
def gbdt():
    """
    Submission: gbdt_0708_02.csv
    n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.858235
    E_in: 0.908622
    E_out: 0.8873906795559863
    n_estimators: 500, learning_rate: 0.1, subsample: 0.5
    E_val: 0.870976
    E_in: 0.899593
    E_out: 0.88711101837711
    n_estimators: 3000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.836049
    E_in: 0.936056
    E_out: 0.8833930861722906

    depth=4; n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.947301
    E_in: 0.983812 (on depth=4) // 0.85089646325496504 (on depth=0)
    E_out: 0.8855316272153549
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import numpy as np

    gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,
                                    subsample=0.5)

    # Feature depth for this run — presumably selects a feature set in
    # dataset.load_train; TODO confirm against the dataset module.
    d = 0
    X, y = dataset.load_train(depth=d)
    # Scaler fitted on train + test features combined (intentional here,
    # matching the other experiments in this file).
    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)
    gb.fit(X_scaled, y)

    # Cache the fitted model and dump scaler + model as one pipeline.
    IO.cache(gb, Path.of_cache('gbdt.GradientBoostingClassifier.d%d.pkl' % d))
    IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', gb)]),
                       'gbdt_0708_02.1000.d%d' % d)

    logger.debug('E_in(full): %f', Util.auc_score(gb, X_scaled, y))

    # Also report E_in on the default (depth=0) feature set for comparison.
    X, y = dataset.load_train()
    X_scaled = raw_scaler.transform(X)
    logger.debug('E_in(depth=0): %f', Util.auc_score(gb, X_scaled, y))
def compare_manual_vs_model():
    """Compare model predictions with manual (and rule-improved manual)
    classifications on the files in FILES, printing multilabel metrics."""

    # NOTE(review): y_dict is loaded here but never used in this function.
    with open(DATA_FOLDER + 'labels_int.p', 'r') as f:
        y_dict = pickle.load(f)

    print "Loading test data"
    X_test, y_test, filenames_test = dataset.load_test()
    # Pre-computed model predictions, aligned with the test-set order.
    y_pred = joblib.load('../models/pred_ml_improved.pkl')

    # Keep only files that also have a manual classification.
    relevant = []
    for pred, correct, filename in zip(y_pred, y_test, filenames_test):
        if filename in FILES:
            relevant.append(
                (pred, correct, filename, CLASSIFICATIONS[filename]))

    model_predictions, correct, filename, manual_predictions = zip(*relevant)
    # Binarize label lists into multilabel indicator matrices.
    manual_predictions = learn.multilabel_binary_y(manual_predictions)
    model_predictions = np.array(model_predictions)
    correct = learn.multilabel_binary_y(correct)

    # Improve the manual labels with inferred topology rules.
    rules = infer_topology.infer_topology_rules()
    improved_manual = infer_topology.apply_topology_rules(
        rules, manual_predictions)

    prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"]
    predictions = [model_predictions, manual_predictions, improved_manual]

    # Report the same metric suite for each prediction source.
    for name, pred in zip(prediction_names, predictions):

        print "\n{}\n--".format(name)
        print "Zero-one classification loss", zero_one_loss(correct, pred)
        print "Hamming loss", hamming_loss(correct, pred)
        print "Precision:", precision_score(correct,
                                            pred,
                                            average='weighted',
                                            labels=label_list)
        print "Recall   :", recall_score(correct,
                                         pred,
                                         average='weighted',
                                         labels=label_list)
        print "F1 score :", f1_score(correct,
                                     pred,
                                     average='weighted',
                                     labels=label_list)
示例#14
0
def q17():
	"""Select lambda on a held-out validation split (first 120 samples
	train, rest validate); print the best lambda's E_train / E_val / E_out."""
	X, y = load_train()
	# Fixed, non-shuffled 120 / remainder train-validation split.
	X_train, X_val = X[:120], X[120:]
	y_train, y_val = y[:120], y[120:]
	X_test, y_test = load_test()
	# Candidate lambdas: 10^-10 .. 10^2, one per decade.
	lamdas = np.logspace(-10, 2, 13)
	best = (1.0, None)  # (lowest E_val so far, its fitted regressor)
	for lamda in lamdas:
		reg = ridge.RidgeRegression(lamda)
		reg.fit(X_train, y_train)
		e_val = reg.evaluate(X_val, y_val, sign)
		# "<=" keeps the larger lambda on ties (lamdas ascend).
		if e_val <= best[0]:
			best = (e_val, reg)
	best_reg = best[1]
	print "lamda: %e, E_train: %.3f, E_val: %.3f, E_out: %.3f" % (
		best_reg.lamda,
		best_reg.evaluate(X_train, y_train, sign),
		best_reg.evaluate(X_val, y_val, sign),
		best_reg.evaluate(X_test, y_test, sign))
示例#15
0
    print "Zero-one classification loss", zero_one_loss(y_test_mlb, y_pred)
    print "Hamming loss", hamming_loss(y_test_mlb, y_pred)

    im = y_test_mlb + y_pred * 2
    scipy.misc.imsave('predictions.png', im)


if __name__ == '__main__':

    # Load the label vocabulary used by training and evaluation below.
    print "Loading labels"
    label_list = dataset.load_labels()

    # Train the multilabel classifier on the full training set.
    print "Loading train set"
    X_train, y_train, filenames_train = dataset.load_train()
    print "Size of train set", len(X_train)
    multilabel_classifier(X_train, y_train)

    # Unload train set from memory before loading the test set.
    del X_train, y_train, filenames_train

    print "Loading test set"
    X_test, y_test, filenames_test = dataset.load_test()
    print "Size of test set", len(X_test)

    # Predict, post-process with topology inference, then evaluate the
    # raw (non-improved) predictions file.
    predict(X_test, y_test)
    improve_predictions(use_infer_topology=True)
    #evaluate_multilabel(y_test, label_list, '../models/pred_ml_improved.pkl')
    evaluate_multilabel(y_test, label_list, '../models/pred_ml.pkl')
示例#16
0
    print "Zero-one classification loss", zero_one_loss(y_test_mlb, y_pred)
    print "Hamming loss", hamming_loss(y_test_mlb, y_pred)

    im = y_test_mlb + y_pred * 2
    scipy.misc.imsave("predictions.png", im)


if __name__ == "__main__":

    # Load the label vocabulary used by training and evaluation below.
    print "Loading labels"
    label_list = dataset.load_labels()

    # Train the multilabel classifier on the full training set.
    print "Loading train set"
    X_train, y_train, filenames_train = dataset.load_train()
    print "Size of train set", len(X_train)
    multilabel_classifier(X_train, y_train)

    # Unload train set from memory before loading the test set.
    del X_train, y_train, filenames_train

    print "Loading test set"
    X_test, y_test, filenames_test = dataset.load_test()
    print "Size of test set", len(X_test)

    # Predict, post-process with topology inference, then evaluate the
    # raw (non-improved) predictions file.
    predict(X_test, y_test)
    improve_predictions(use_infer_topology=True)
    # evaluate_multilabel(y_test, label_list, '../models/pred_ml_improved.pkl')
    evaluate_multilabel(y_test, label_list, "../models/pred_ml.pkl")
示例#17
0
def gbdt2():
    """
    Submission: gbdt2_0708_03.csv
    n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.852035
    E_in: 0.910251
    E_out: 0.8874428893001793
    n_estimators: 3000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.827988
    E_in: 0.938593
    E_out: 0.8844206314551558

    depth=4; n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.941602
    E_in: 0.983938 (on depth=4) // 0.87209181108731892 (on depth=0)
    E_out: 0.8872206627768779

    depth=0:
    E_val:
    E_in: 0.909368 // 0.909368
    E_out: 0.8864839071529611

    depth=1:
    E_val:
    E_in: 0.956676 // 0.903537
    E_out: 0.8851856544683128

    depth=2:
    E_val:
    E_in: 0.971240 // 0.899843
    E_out:

    depth=3:
    E_val:
    E_in: 0.978190 // 0.896956
    E_out:
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import numpy as np

    # Same setup as gbdt() but with the exponential (AdaBoost-like) loss.
    gb = GradientBoostingClassifier(loss='exponential', n_estimators=1000,
                                    learning_rate=0.1, subsample=0.5)

    # Feature depth for this run — presumably selects a feature set in
    # dataset.load_train; TODO confirm against the dataset module.
    d = 3
    X, y = dataset.load_train(depth=d)
    # Scaler fitted on train + test features combined (intentional here,
    # matching the other experiments in this file).
    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)
    gb.fit(X_scaled, y)

    # Cache the fitted model and dump scaler + model as one pipeline.
    IO.cache(gb, Path.of_cache('gbdt2.GradientBoostingClassifier.d%d.pkl' % d))
    IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', gb)]),
                       'gbdt2_0708_03.1000.d%d' % d)

    logger.debug('E_in(full): %f', Util.auc_score(gb, X_scaled, y))

    # Also report E_in on the default (depth=0) feature set for comparison.
    X, y = dataset.load_train()
    X_scaled = raw_scaler.transform(X)
    logger.debug('E_in(depth=0): %f', Util.auc_score(gb, X_scaled, y))
示例#18
0
    # NOTE(review): this is the interior of a function/script whose start is
    # outside this chunk; train_path / test_path are defined above it.
    # Train path has the class structure: one sub-directory per class name.
    classes = os.listdir(train_path)
    num_classes = len(classes)
    print("num of classes:", num_classes)

    # Images are handled at 128x128, 3 channels — presumably RGB; TODO
    # confirm read_train_sets resizes to img_size.
    img_size = 128
    num_channels = 3

    # We shall load all the training and validation images and labels into memory using openCV and use that during training
    train_data = dataset.read_train_sets(train_path, img_size, classes, validation_size=0.2)
    print("Complete reading input data. Will Now print a snippet of it")
    print("Number of files in Training-set:\t{}".format(len(train_data.train.labels)))
    print("Number of files in Validation-set:\t{}".format(len(train_data.valid.labels)))

    test_data = dataset.load_test(test_path, img_size)
    print("Number of files in test-set:\t{}".format(len(test_data)))

    batch_size = 8
    num_iterations = 3

    # NOTE(review): the rest of the pipeline (model load, placeholders,
    # prediction, checkpoint saving) is commented out below.
    #session, y_pred_cls = tensorflow_model.load_tensorflow(train_data, batch_size, num_iterations, img_size)

    #x, y_true = tensorflow_model.input_placeholders(img_size, num_channels, num_classes)
    #y_true_cls = tf.argmax(y_true, dimension=1)

    #prediction = session.run(y_pred_cls, feed_dict={'x': test_data})
    #print(prediction)

    # save the model
    #saver = tf.train.Saver()