Пример #1
0
def grid_search():
    """Grid-search ``logr__C`` for the NLTK model, plot the scores and write test predictions.

    Steps, in order:
      1. Load the training comments/labels and build the NLTK model.
      2. Run ``GridSearchCV`` over ``logr__C`` in ``np.arange(1, 20, 5)`` using
         10 shuffle splits with a 20% test size, scored with ``auc_score``.
      3. Print the best score and the best parameters.
      4. Call ``tracer()`` (defined outside this block; presumably a debugger
         hook -- confirm before running unattended).
      5. For every searched parameter, save an error-bar plot to
         ``grid_plot_<param>.png``.
      6. Predict probabilities on the test comments with the best estimator
         and pass column 1 to ``write_test``.
    """
    comments, labels = load_data()
    c_candidates = {"logr__C": np.arange(1, 20, 5)}
    pipeline = build_nltk_model()

    splitter = ShuffleSplit(len(comments), n_iterations=10, test_size=0.2)
    search = GridSearchCV(
        pipeline,
        param_grid=c_candidates,
        cv=splitter,
        score_func=auc_score,
        n_jobs=12,
        verbose=4,
    )
    search.fit(comments, labels)
    print(search.best_score_)
    print(search.best_params_)

    tracer()
    score_grid = search.scores_
    for param in score_grid.params:
        means, errors = score_grid.accumulated(param, 'max')
        plt.errorbar(score_grid.values[param], means, yerr=errors)
        plt.xlabel(param)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param)
        plt.close()

    # load_test() yields two values here; only the comments are used.
    comments_test, _ = load_test()
    positive_probs = search.best_estimator_.predict_proba(comments_test)[:, 1]
    write_test(positive_probs)
Пример #2
0
def loadData():
    """Load training data, test queries and users according to ``dataChoice``.

    Reads the module-level ``dataChoice``:
      * ``'sim'`` selects 'r-train100.csv', 'r-test100.csv' and 'u100.csv';
      * any other value selects 'ratings-train.csv', 'ratings-test.csv' and
        'users.csv' (the original comment names 'validate' and 'full').

    Unless ``dataChoice`` is ``'full'``, the last 20% of the training data is
    split off as a validation set.

    Returns:
        (training_data, test_queries, user_list, validation_set). For
        ``'full'`` the validation set is an empty dict; otherwise it is a
        slice of the training data.
    """
    global dataChoice
    if dataChoice == 'sim':
        filenames = ('r-train100.csv', 'r-test100.csv', 'u100.csv')
    else:
        filenames = ('ratings-train.csv', 'ratings-test.csv', 'users.csv')
    train_filename, test_filename, user_filename = filenames

    training_data = util.load_train(train_filename)
    test_queries = util.load_test(test_filename)
    user_list = util.load_users(user_filename)

    if dataChoice == 'full':
        # No hold-out in 'full' mode: everything is used for training.
        return training_data, test_queries, user_list, {}

    # 80% training / 20% validation, cut by position (no shuffling here).
    cut = int(len(training_data) * 0.8)
    held_out = training_data[cut:]
    kept = training_data[:cut]
    return kept, test_queries, user_list, held_out
Пример #3
0
def loadData():
    """Load training data, test queries and users according to ``dataChoice``.

    Reads the module-level ``dataChoice``:
      * ``'sim'`` selects 'r-train100.csv', 'r-test100.csv' and 'u100.csv';
      * any other value selects 'ratings-train.csv', 'ratings-test.csv' and
        'users.csv' (the original comment names 'validate' and 'full').

    Unless ``dataChoice`` is ``'full'``, the last 20% of the training data is
    split off as a validation set.

    Returns:
        (training_data, test_queries, user_list, validation_set). For
        ``'full'`` the validation set is an empty dict; otherwise it is a
        slice of the training data.
    """
    global dataChoice
    if dataChoice == 'sim':
        filenames = ('r-train100.csv', 'r-test100.csv', 'u100.csv')
    else:
        filenames = ('ratings-train.csv', 'ratings-test.csv', 'users.csv')
    train_filename, test_filename, user_filename = filenames

    training_data = util.load_train(train_filename)
    test_queries = util.load_test(test_filename)
    user_list = util.load_users(user_filename)

    if dataChoice == 'full':
        # No hold-out in 'full' mode: everything is used for training.
        return training_data, test_queries, user_list, {}

    # 80% training / 20% validation, cut by position (no shuffling here).
    cut = int(len(training_data) * 0.8)
    held_out = training_data[cut:]
    kept = training_data[:cut]
    return kept, test_queries, user_list, held_out
Пример #4
0
def make_predictions(ratings_data, mfact_data):
	"""
	Fill in a predicted rating for every query in the books test file.

	Arguments:
		ratings_data	: mapping with "center", "scale" and "book_isbn_to_index"
						  (per the original docs, the object returned by build_ratings)
		mfact_data 		: mapping with "P", "Q", "Bn", "Bd" and "mean"
						  (per the original docs, the object returned by mfact)

	Returns:
		the list loaded from "../data/books/ratings-test.csv", where every
		query has gained "rating" (clamped to the range 1..5, not rounded)
		and "rating_f" (the unclamped value)

	"""

	# values taken from the original training set
	center, scale = ratings_data["center"], ratings_data["scale"]
	isbn_lookup = ratings_data["book_isbn_to_index"]

	# values calculated by the matrix factorization
	user_factors = mfact_data["P"]
	book_factors = mfact_data["Q"]
	# Only the disabled bias-aware formula needs these three; they are still
	# read so that a missing key fails here, exactly as before.
	user_bias = mfact_data["Bn"]
	book_bias = mfact_data["Bd"]
	global_mean = mfact_data["mean"]

	# the requested predictions
	queries = util.load_test("../data/books/ratings-test.csv")
	debug("Making %d predictions", len(queries))

	for query in queries:

		# user ids are shifted down by one to get a row index; books go through the ISBN table
		u, b = query["user"] - 1, isbn_lookup[query["isbn"]]

		# factor dot product mapped back through scale/center (biases and mean are not applied)
		raw_rating = np.dot(user_factors[u, :], book_factors[b, :]) * scale + center

		# clamp into 1..5; no rounding takes place
		capped = min(5, raw_rating)
		bounded = max(1, capped)

		# keep both values so distributions can be visualized later
		query["rating"] = bounded
		query["rating_f"] = raw_rating

	return queries
Пример #5
0
def test_data():
    """Evaluate the saved Keras model on the test set and print the result.

    Loads the model from 'my_model.h5', obtains the tokenizer from
    ``util.load_train_val()``, loads the test data with it and prints whatever
    ``model.evaluate`` returns.
    """
    path = 'my_model.h5'
    model = keras.models.load_model(path)

    # Only the fifth item returned by the train/val loader (the tokenizer) is needed.
    _, _, _, _, tokenizer = util.load_train_val()

    # NOTE(review): path_data is not defined in this function; presumably a
    # module-level name -- confirm.
    X, y = util.load_test(path_data, tokenizer)

    # Keras models are scored with evaluate(); they have no eval() method.
    res = model.evaluate(x=X, y=y, batch_size=200)
    print(res)
    print(res)
Пример #6
0
def apply_models():
    """Fit four models on the extended data and write individual and averaged predictions.

    Each model (base, elastic net, stacked, NLTK) is fitted on the extended
    training data and applied to "impermium_verification_set_.csv". Column 1
    of its probabilities goes to "test_prediction_model_<i>.csv"; column 1 of
    the mean over all four models goes to "test_prediction_combined.csv".
    """
    verification_file = "impermium_verification_set_.csv"

    comments, labels = load_extended_data()
    comments_test = load_test(verification_file)

    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]

    # Running sum of the two-column probability outputs of all models.
    probs_common = np.zeros((len(comments_test), 2))
    for i, clf in enumerate(models):
        clf.fit(comments, labels)
        probs = clf.predict_proba(comments_test)
        probs_common += probs
        write_test(probs[:, 1], "test_prediction_model_%d.csv" % i,
                   ds=verification_file)

    # Turn the sum into the average over the four models.
    probs_common /= len(models)
    write_test(probs_common[:, 1], "test_prediction_combined.csv",
               ds=verification_file)
Пример #7
0
def apply_models():
    """Fit four models on the extended data and write individual and averaged predictions.

    Each model (base, elastic net, stacked, NLTK) is fitted on the extended
    training data and applied to "impermium_verification_set_.csv". Column 1
    of its probabilities goes to "test_prediction_model_<i>.csv"; column 1 of
    the mean over all four models goes to "test_prediction_combined.csv".
    """
    verification_file = "impermium_verification_set_.csv"

    comments, labels = load_extended_data()
    comments_test = load_test(verification_file)

    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]

    # Running sum of the two-column probability outputs of all models.
    probs_common = np.zeros((len(comments_test), 2))
    for i, clf in enumerate(models):
        clf.fit(comments, labels)
        probs = clf.predict_proba(comments_test)
        probs_common += probs
        write_test(probs[:, 1], "test_prediction_model_%d.csv" % i,
                   ds=verification_file)

    # Turn the sum into the average over the four models.
    probs_common /= len(models)
    write_test(probs_common[:, 1], "test_prediction_combined.csv",
               ds=verification_file)
Пример #8
0
def grid_search():
    """Grid-search ``logr__C`` for the NLTK model, plot the scores and write test predictions.

    Steps, in order:
      1. Load the training comments/labels and build the NLTK model.
      2. Run ``GridSearchCV`` over ``logr__C`` in ``np.arange(1, 20, 5)`` using
         10 shuffle splits with a 20% test size, scored with ``auc_score``.
      3. Print the best score and the best parameters.
      4. Call ``tracer()`` (defined outside this block; presumably a debugger
         hook -- confirm before running unattended).
      5. For every searched parameter, save an error-bar plot to
         ``grid_plot_<param>.png``.
      6. Predict probabilities on the test comments with the best estimator
         and pass column 1 to ``write_test``.
    """
    comments, labels = load_data()
    c_candidates = {"logr__C": np.arange(1, 20, 5)}
    pipeline = build_nltk_model()

    splitter = ShuffleSplit(len(comments), n_iterations=10, test_size=0.2)
    search = GridSearchCV(
        pipeline,
        param_grid=c_candidates,
        cv=splitter,
        score_func=auc_score,
        n_jobs=12,
        verbose=4,
    )
    search.fit(comments, labels)
    print(search.best_score_)
    print(search.best_params_)

    tracer()
    score_grid = search.scores_
    for param in score_grid.params:
        means, errors = score_grid.accumulated(param, 'max')
        plt.errorbar(score_grid.values[param], means, yerr=errors)
        plt.xlabel(param)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param)
        plt.close()

    # load_test() yields two values here; only the comments are used.
    comments_test, _ = load_test()
    positive_probs = search.best_estimator_.predict_proba(comments_test)[:, 1]
    write_test(positive_probs)
Пример #9
0
        # acc = util.model_roc_score(learner, test_dat)
        acc = util.model_score(learner, test_dat)

        test_acc.append(acc)
        learned_size.append(total_pool_size - prof.get_pool_size() + init_size)

        if ITER_ENABLE:
            if count < 0: break
            count -= 1

    return test_acc, learned_size


# Load the three data sets used by both runs in this script.
pool_dat = load_pool()
init_dat = load_init()
test_dat = load_test()

# Build the training pool as an array and shuffle it.
# NOTE(review): the visible run_stl_landm call is given pool_dat, not
# train_pool -- confirm where train_pool is consumed.
train_pool = np.array(gen_land_pool(pool_dat))
shuffle(train_pool)

# Passive run (do_active=False); its accuracy / learned-size curve is saved as CSV.
print "[info]Start passive learning..."
test_acc_ps, learned_size_ps = run_stl_landm(pool_dat,
                                             init_dat,
                                             test_dat,
                                             do_active=False)
util.curve_to_csv("res/ps_stl_non.csv", test_acc_ps, learned_size_ps)

print "[info]Start active learning..."
test_acc_ac, learned_size_ac = run_stl_landm(pool_dat,
                                             init_dat,
                                             test_dat,
Пример #10
0
# coding=utf8
import numpy as np
import pandas as pd
from scipy import interp
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc

# Load data.
# X1 = pd.read_csv(r'Data/Train/PPD_Training_Master_GBK_3_1_Training_Set.csv', encoding='gbk')
# The raw test CSV is read without an explicit encoding (the gbk option is disabled).
X2 = pd.read_csv(r'Data/Test/PPD_Master_GBK_2_Test_Set.csv')# , encoding='gbk')
from util import load_train, load_test
# load_train() yields three values here; load_test() yields one.
# NOTE(review): w_train looks like sample weights -- confirm against util.
X_train, y_train, w_train = load_train()
X_test = load_test()
print 'data loaded, transforming...'

''' 3.19 commented
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# train and predict
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(n_jobs=-1, class_weight='balanced', penalty='l1')
clf.fit(X_train, y_train)
probas_ = clf.predict_proba(X_test)

# visualization on training set
fpr, tpr, thresholds = roc_curve(y_train, p2[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
Пример #11
0
# # do prediction based on matrix factorization
# K = 30
# run = 0
# step = 280
# mfact = su.unpickle("output/mfact_%d_run_%d/mfact_%d_%d" % (K, run, K, step))
# P = mfact["P"]
# Q = mfact["Q"]
# Bn = mfact["Bn"]
# Bd = mfact["Bd"]
# # this is the mean of the standardized training data, used for the learning/
# # prediction
# standard_mean = mfact["mean"] 

# Load the set of requested predictions.
queries = util.load_test("../data/books/ratings-test.csv")
# Total number of queries; used for the progress output in the loop that follows.
L = len(queries)

# for each query
for (i,query) in enumerate(queries):
	print ("%d / %d : " % (i+1,L)),
	user_index = query["user"] - 1
	book_index = book_isbn_to_index[query["isbn"]]

	# calculate predicted rating
	rating_float = (np.dot(P[user_index,:],Q[book_index,:]) + standard_mean + Bn[user_index] + Bd[book_index]) * std + mean
	
	# coerce to range (1,5); round, convert to int
	rating = int(round(max(1,min(5,rating_float))))

	# store both values so we can do visualization of distributions later
Пример #12
0
def generateFullTestMatrix(detectorSettings=(1, 1, 1), partitions=(9, 9, 9)):
    """Return one edge-feature vector per loaded test item.

    Test items ``0 .. util.TEST_COUNT - 2`` are loaded with ``util.load_test``
    and converted with ``generateEdgeFeaturesVector``.

    NOTE(review): the range stops one short of ``util.TEST_COUNT`` -- confirm
    that skipping the last index is intended.
    NOTE(review): ``detectorSettings`` and ``partitions`` are accepted but not
    used in this function.
    """
    return [
        generateEdgeFeaturesVector(util.load_test(index))
        for index in range(util.TEST_COUNT - 1)
    ]
Пример #13
0

def predict(train, test, pred_file):
    """Predict a rating for every test entry and write the predictions out.

    Runs ``run_model(train, test, 'prediction', 0)``, clips its predictions
    into the range 0..5 in place, stores each one as a float under
    ``entry['rating']`` and hands ``test`` to ``util.write_predictions``.

    Args:
        train: training data passed through to ``run_model``.
        test: sequence of entries supporting item assignment; modified in place.
        pred_file: destination passed to ``util.write_predictions``.
    """
    # The second value returned by run_model is not needed here.
    predictions, _ = run_model(train, test, 'prediction', 0)

    # Clip out-of-range predictions; values already inside 0..5 are left untouched.
    for pos, value in enumerate(predictions):
        if value < 0:
            predictions[pos] = 0
        elif value > 5:
            predictions[pos] = 5

    for pos, entry in enumerate(test):
        entry['rating'] = float(predictions[pos])

    util.write_predictions(test, pred_file)

# Prediction mode: load the test queries, then predict and write the results.
test_filename  = 'data/ratings-test.csv'
test           =  util.load_test(test_filename)
pred_filename  = 'predictions/sgd_converged.csv'
# train_valid is defined outside this excerpt.
predict(train_valid, test, pred_filename)



"""
x = np.zeros((n, r, 2)) # 2 layers, 1 for train predictions and 1 for valid predictions
y = np.zeros((n, 1))
for i, entry in enumerate(train_valid):
        y[i] = float(entry['rating'])

       
def build_matrix(m, v, fold_idx, param_idx):
    span = np.shape(v)[0] 
    # train predictions
Пример #14
0
import numpy as np
import util

# This makes predictions based on the mean rating for each user in the
# training data.  When there are no training data for a user, it
# defaults to the global mean.

# Input and output file names.
pred_filename = 'pred-user-mean.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'
user_filename = 'users.csv'

# Load ratings, test queries and the user list through the util helpers.
training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)
user_list = util.load_users(user_filename)

# Compute the global mean rating for a fallback.
# An empty training set makes the division below raise ZeroDivisionError.
num_train = len(training_data)
mean_rating = float(sum(map(lambda x: x['rating'], training_data))) / num_train
print "The global mean rating is %0.3f." % (mean_rating)

# Turn the list of users into a dictionary keyed by each entry's 'user' value.
# Store data for each user to keep track of the per-user average.
users = {}
for user in user_list:
    users[user['user']] = {
        'total': 0,  # For storing the total of ratings.
        'count': 0,  # For storing the number of ratings.
    }

# Iterate over the training data to compute means.
Пример #15
0
import time
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from util import load_training, load_test, evaluate, standard
import warnings
warnings.filterwarnings('ignore')

### Parameters are defined here
n_components = 100  # number of dimensions PCA reduces the data to
C = 1  # soft-margin coefficient
decision_function = 'ovr'  # 'ovo' for OneVsOne and 'ovr' for OneVsRest
kernel = 'rbf'  # kernel type: 'rbf', 'linear', 'poly' or 'sigmoid'
gamma = 1e-5  # for rbf; original note: the larger gamma, the fewer support vectors (NOTE(review): verify)
#####
# Each loader yields a (data, labels) pair.
training_data, training_label = load_training()
test_data, test_label = load_test()

print('training size: {}'.format(len(training_label)))
print('test size: {}'.format(len(test_label)))

# Flatten every sample to one dimension and convert data and labels to arrays
training_data = np.array([x.flatten() for x in training_data])
training_label = np.array(training_label)
test_data = np.array([x.flatten() for x in test_data])
test_label = np.array(test_label)

pca = PCA(n_components=n_components)
model = SVC(C=C,
            random_state=0,
            max_iter=1000,
            kernel=kernel,
Пример #16
0
import numpy as np
import ensemble
from train_model_library import train_model_library
import util
import makePred

# Train the model library on a single fold; this yields the library predictions,
# the validation labels, the scaler and the model grid.
ensemble_library_pred, validation_labels, scaler, model_grid = train_model_library(
    n_folds_to_compute=1)

# Build the ensemble from the library predictions.
# NOTE(review): the result is bound to the name `ensemble`, which shadows the
# imported `ensemble` module from this line on.
ensemble, acc, n, c1acc = ensemble.generate_ensemble(ensemble_library_pred,
                                                     validation_labels,
                                                     n_init=3,
                                                     tolerance=.00001)

# Load the test features and label them with the ensemble.
# NOTE(review): `ids` is not used in the visible code -- confirm that
# write_predictions does not need it.
ids, features = util.load_test("kaggle_test_tf_idf_l1_norm.csv")
labels = makePred.makePrediction(ensemble, model_grid, features, scaler)

util.write_predictions(labels, "idflabels_lean_2.csv")

print("done")
Пример #17
0
def reload_dat():
    """Run a garbage collection, then reload and return the three data sets.

    Returns:
        (pool_dat, init_dat, test_dat) as produced by ``load_pool()``,
        ``load_init()`` and ``load_test()``, called in that order.
    """
    gc.collect()
    return load_pool(), load_init(), load_test()
Пример #18
0
# Register the project's `ml` folder and the ELLA library (with its subfolders)
# on the engine's path.
# NOTE(review): both paths are absolute and machine-specific; `eng` is created
# outside this excerpt (presumably a MATLAB engine -- confirm).
ELLA_DIR = "/home/stpanda/Dropbox/STDreamSoft/Academics/SeniorThesis/Projects/al_ella/lib/ELLAv1.0"
eng.addpath("/home/stpanda/Dropbox/STDreamSoft/Academics/SeniorThesis/Projects/al_ella/ml")
eng.addpath(eng.genpath(ELLA_DIR))
# res = eng.runExperimentActiveTask()
# print res


######## Panda ######
# Comparing Multiple Active Learner vs ELLA + ATS vs ELLA + ATS + AL vs
# ELLA + AL
# This file runs ELLA + Active Task Selection
#####################

## Load all files; each data set is passed through util.add_bias
test_dat = util.add_bias(load_test())
pool_dat = util.add_bias(load_pool())
init_dat = util.add_bias(load_init())
init_size = util.dat_size(init_dat)

## Init ELLA Model with init set ##
ella_model = ELLA(eng, init_dat)
# The initial score and the initial set size seed the two result lists.
init_acc = ella_score(ella_model, test_dat)
test_acc = [init_acc]
learned_size = [init_size]


# The Professor is built from the init and pool data; its pool size is
# recorded before any learning starts.
prof = Professor(init_dat, pool_dat, multi_t=True, random=True)
total_pool_size = prof.get_pool_size()
print "train pool size", total_pool_size
Пример #19
0
import math

NUM_CLUSTERS = 8

# This makes predictions based on the mean rating for each book in the
# training data.  When there are no training data for a book, it
# defaults to the global mean.
# NOTE(review): the output name below mentions age and k-means, so the
# description above may be left over from another script -- confirm.

pred_filename  = 'predictor_age-kmeans1.csv'
train_filename = 'ratings-train.csv'
test_filename  = 'ratings-test.csv'
book_filename  = 'books.csv'
user_filename  = 'users.csv'

training_data  = util.load_train(train_filename)
test_queries   = util.load_test(test_filename)
book_list      = util.load_books(book_filename)
user_list      = util.load_users(user_filename)

# Randomly split the labelled training data in two: each datum goes to
# train_data or test_data with equal probability. test_data is therefore a
# held-out part of the training file, not the test queries loaded above.
train_data = []
test_data = []

for datum in training_data:
    if (random.randrange(2) == 0):
        train_data.append(datum)
    else:
        test_data.append(datum)

# Compute the global mean rating for a fallback.
# Only the train_data half is used; an empty train_data raises ZeroDivisionError.
num_train = len(train_data)
mean_rating = float(sum(map(lambda x: x['rating'], train_data)))/num_train
Пример #20
0
ImageFile.LOAD_TRUNCATED_IMAGES = True
import shutil
from torchvision.models import vgg16

import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from dataset import DepthEigenDataset
from network import GlobalCoarseNet, LocalFineNet
from loss import ScaleInvariantLoss
import util

# Test data: batches of `bs` samples read from the NYU test folder.
# util.load_test yields the data loader and a length value, which is printed.
data_dir_test = Path('nyu/test')
bs = 32
dataloader_test, datalen_test = util.load_test(data_dir_test, bs)
print(datalen_test)

# Load both networks straight onto `device`, the same device the input batches
# are moved to. Without map_location the models stay on the device they were
# saved from, which fails on CPU-only hosts or on a device mismatch.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
global_model = torch.load('models/global_model.pt', map_location=device)
global_model.eval()

local_model = torch.load('models/local_model.pt', map_location=device)
local_model.eval()

for i, samples in enumerate(dataloader_test):
    rgbs = samples['rgb'].float().to(device)
    depths = samples['depth'].float().to(device)

    # results from global coarse network
    with torch.no_grad():
        global_output = global_model(rgbs).unsqueeze(1)