예제 #1
0
def loadData():
    """Load ratings, test queries and users for the mode in ``dataChoice``.

    The module-level ``dataChoice`` selects the input files: ``'sim'`` reads
    the small ``*100.csv`` files, any other value reads the full-size files.

    Returns:
        A 4-tuple ``(training_data, test_queries, user_list, validation_set)``.
        When ``dataChoice`` is ``'full'`` nothing is held out and
        ``validation_set`` is an empty dict.  Otherwise the first 80% of the
        loaded training data is returned as ``training_data`` and the
        remaining 20% as ``validation_set``.
    """
    global dataChoice

    use_small_files = dataChoice == 'sim'
    train_filename = 'r-train100.csv' if use_small_files else 'ratings-train.csv'
    test_filename = 'r-test100.csv' if use_small_files else 'ratings-test.csv'
    user_filename = 'u100.csv' if use_small_files else 'users.csv'

    all_ratings = util.load_train(train_filename)
    test_queries = util.load_test(test_filename)
    user_list = util.load_users(user_filename)

    # 'full' mode trains on everything, so there is no hold-out part.
    if dataChoice == 'full':
        return all_ratings, test_queries, user_list, {}

    # Positional 80/20 split: head for training, tail for validation.
    cut = int(len(all_ratings) * 0.8)
    held_out = all_ratings[cut:]
    kept = all_ratings[:cut]
    return kept, test_queries, user_list, held_out
예제 #2
0
def loadData():
    """Read the data files selected by the module-level ``dataChoice``.

    ``'sim'`` picks the reduced ``r-train100`` / ``r-test100`` / ``u100``
    files; every other value (``'validate'`` or ``'full'``) picks the complete
    ratings and users files.

    Returns:
        ``(training_data, test_queries, user_list, validation_set)``.  Unless
        ``dataChoice`` is ``'full'``, the trailing 20% of the training data is
        moved into ``validation_set``; in ``'full'`` mode ``validation_set``
        stays an empty dict.
    """
    global dataChoice

    small_files = ('r-train100.csv', 'r-test100.csv', 'u100.csv')
    complete_files = ('ratings-train.csv', 'ratings-test.csv', 'users.csv')
    if dataChoice == 'sim':
        chosen = small_files
    else:
        chosen = complete_files
    train_filename, test_filename, user_filename = chosen

    training_data = util.load_train(train_filename)
    test_queries = util.load_test(test_filename)
    user_list = util.load_users(user_filename)

    validation_set = {}
    if not dataChoice == 'full':
        # Hold out the last fifth of the training data for validation.
        boundary = int(len(training_data) * 0.8)
        training_data, validation_set = (training_data[:boundary],
                                         training_data[boundary:])

    return training_data, test_queries, user_list, validation_set
예제 #3
0
def _view(arg):
    """Display the MRI designated by ``arg``.

    ``arg`` is first converted to an integer and handed to ``load_train``.
    If the conversion or the load raises, ``arg`` is handed unchanged to
    ``load_nifti1`` instead.  The loaded volume is then passed to ``view``.

    Args:
        arg: text holding either a number accepted by ``load_train`` or a
            value accepted by ``load_nifti1`` (presumably a file path).
    """
    try:
        mri = load_train(int(arg))
    except Exception:
        # Deliberate best-effort fallback.  ``Exception`` rather than a bare
        # ``except`` so Ctrl-C (KeyboardInterrupt) and SystemExit still
        # propagate instead of triggering a second load attempt.
        mri = load_nifti1(arg)

    view(mri)
예제 #4
0
def build_ratings_tuple():
    """Load users, books and training ratings and index the ratings.

    Returns:
        A dict with the following entries:

        - ``ratings``: list of ``(i, j, r)`` tuples where ``i`` is the rating's
          ``"user"`` value minus one, ``j`` is the index of the rated book and
          ``r`` is the rating value.
        - ``book_isbn_to_index``: dict mapping every book ISBN to its position
          in the sorted ISBN list.
        - ``N``: number of loaded users.
        - ``D``: number of loaded books.
        - ``T``: number of ratings in the training set.
    """
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("Loading Users...")
    users = util.load_users("../../data/books/users.csv")
    user_ids = sorted(entry["user"] for entry in users)
    N = len(user_ids)
    del users
    print("Loaded %d users." % N)

    print("Loading Books...")
    books = util.load_books("../../data/books/books.csv")
    book_isbns = sorted(entry["isbn"] for entry in books)
    # Position in the sorted ISBN list serves as the numeric book index.
    book_isbn_to_index = {}
    for position, isbn in enumerate(book_isbns):
        book_isbn_to_index[isbn] = position
    D = len(book_isbns)
    print("Loaded %d books." % D)

    print("Loading Trainings...")
    train = util.load_train("../../data/books/ratings-train.csv")
    T = len(train)
    print("Loaded %d ratings." % T)

    ratings = []
    for row in train:
        # The user value is shifted down by one (presumably 1-based ids).
        user_index = row["user"] - 1
        book_index = book_isbn_to_index[row["isbn"]]
        ratings.append((user_index, book_index, row["rating"]))

    result = {
        "ratings": ratings,
        "book_isbn_to_index": book_isbn_to_index,
        "N": N,
        "D": D,
        "T": T,
    }
    return result
예제 #5
0
def book_biases():
    """Compute the mean training rating of every book.

    Loads ``ratings-train.csv`` and ``books.csv`` through ``util``, sums the
    rating values and counts per ISBN, and stores each book's mean rating in
    a NumPy vector at position ``isbnIndex[isbn]`` (``isbnIndex`` is a
    module-level mapping defined outside this block).

    Returns:
        numpy.ndarray of length ``len(book_list)``.  Books that have no
        rating in the training data keep the initial value ``0.0``.

    Raises:
        KeyError: if a training rating refers to an ISBN that is not in the
            book list, or an ISBN is missing from ``isbnIndex``.
    """
    train_filename = 'ratings-train.csv'
    book_filename = 'books.csv'

    training_data = util.load_train(train_filename)
    book_list = util.load_books(book_filename)

    # Per-ISBN accumulator: running total and number of ratings.
    books = {}
    for book in book_list:
        books[book['isbn']] = {'total': 0, 'count': 0}

    # Iterate over the training data to accumulate totals and counts.
    for rating in training_data:
        stats = books[rating['isbn']]
        stats['total'] += rating['rating']
        stats['count'] += 1

    bBooks = np.zeros(len(book_list))

    for book in book_list:
        isbn = book['isbn']
        # Read the accumulator, not the raw book record: the records from
        # ``load_books`` are never given 'total'/'count' in this function.
        stats = books[isbn]
        if stats['count']:
            # Skip unrated books to avoid a ZeroDivisionError.
            bBooks[isbnIndex[isbn]] = float(stats['total']) / stats['count']

    # NOTE(review): the original ended in a stray no-op expression and
    # returned nothing; returning the vector makes the result usable.
    return bBooks
예제 #6
0
# Suppress every warning for the rest of the run.
import warnings
warnings.filterwarnings('ignore')

# Training device: use CUDA when it is available, otherwise fall back to the
# CPU so the script still runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters
epochs_num = 4
rate = 1 # rate = 5 in paper; but result is bad so it is tuned by hand

# specified in paper
batch_size = 32

# Load the dataset: two data loaders plus the two counts printed below.
dataloader_train, dataloader_valid, train_num, val_num = util.load_train(Path('nyu/train'), Path('nyu/val'), batch_size)

print("training number:{}, validation number:{}".format(train_num, val_num))

#########################################################
#              initializing the model                   #
#########################################################

# Instantiate both networks and move them to the training device.
global_net = GlobalCoarseNet().to(device)
local_net = LocalFineNet().to(device)

# One loss instance per network.
global_loss = ScaleInvariantLoss()
local_loss = ScaleInvariantLoss()
예제 #7
0
Created on Sun Feb  9 16:20:34 2014

Type: Driver

cross_validation to tune parameters

@author: vincentli2010
"""
import numpy as np
import util
from matplotlib import pyplot as plt

# Shared user and book lists exposed as attributes of the util module.
user_list      = util.user_list
book_list      = util.book_list
# Ratings loaded here are used for both training and validation
# (hence the name train_valid).
train_filename = 'data/ratings-train.csv'
train_valid    = util.load_train(train_filename)

######### Tuning Parameters #########

# Candidate values of the parameter being tuned.
# NOTE(review): which model parameter this is cannot be told from this
# chunk -- confirm against the code that consumes PARAM.
PARAM = [0.05, 0.1, 0.3, 0.5]
# Alternative, much finer grid (disabled):
#PARAM = np.arange(0.05, 5, 0.05)

# The data is always split 5-fold; num_folds only decides how many of those
# folds are actually run.
num_folds = 1




##import data_processing as dp
#dphelper = dp.data_processing()
#dense, sparse = dphelper.split(train_valid)
#train_valid = dense
예제 #8
0
파일: for_submit.py 프로젝트: cyxtj/ppd
# coding=utf8
import numpy as np
import pandas as pd
from scipy import interp
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc

# Load data.
# Reading the raw training master file is disabled:
# X1 = pd.read_csv(r'Data/Train/PPD_Training_Master_GBK_3_1_Training_Set.csv', encoding='gbk')
# Raw test master file, read without an explicit encoding (the
# encoding='gbk' argument is disabled).
X2 = pd.read_csv(r'Data/Test/PPD_Master_GBK_2_Test_Set.csv')  # , encoding='gbk')
from util import load_train, load_test
# load_train() yields three values and load_test() one.
# NOTE(review): presumably features, labels and sample weights for training
# and a feature matrix for testing -- confirm in util.
X_train, y_train, w_train = load_train()
X_test = load_test()
print 'data loaded, transforming...'

''' 3.19 commented
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# train and predict
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(n_jobs=-1, class_weight='balanced', penalty='l1')
clf.fit(X_train, y_train)
probas_ = clf.predict_proba(X_test)

# visualization on training set
fpr, tpr, thresholds = roc_curve(y_train, p2[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
예제 #9
0
import os
import util
import artifacts

# This script must run under CPython, not PyPy.

# Build a dict like {filename: label} for the whole training set and store
# it as the artifact named 'train_dict'.
# NOTE(review): the meaning of the True argument to load_train is not visible
# here -- confirm in util.
train = util.load_train(True)
artifacts.put_artifact(train, 'train_dict')

# Build a similar dict holding a sample of 20k positive and 20k negative
# instances.
# It is used for determining frequent tags, tokens, etc. for features.
# The dict is saved as artifacts/sample_20_20.pkl.
sample = util.create_sample('sample_20_20', 20000, 20000)
예제 #10
0
import random
import math

# Cluster count; not used within this part of the script.
NUM_CLUSTERS = 8

# This makes predictions based on the mean rating for each book in the
# training data.  When there are no training data for a book, it
# defaults to the global mean.
# NOTE(review): this description looks copied from a book-mean baseline; the
# output file name and NUM_CLUSTERS suggest an age/k-means predictor --
# confirm and update.

# Output file for the predictions and the input data files.
pred_filename  = 'predictor_age-kmeans1.csv'
train_filename = 'ratings-train.csv'
test_filename  = 'ratings-test.csv'
book_filename  = 'books.csv'
user_filename  = 'users.csv'

training_data  = util.load_train(train_filename)
test_queries   = util.load_test(test_filename)
book_list      = util.load_books(book_filename)
user_list      = util.load_users(user_filename)

# Local split of the labelled ratings: test_data holds ratings taken from
# training_data and is separate from test_queries.
train_data = []
test_data = []

# Each rating goes to train_data or test_data with equal probability.  No
# random seed is set in the visible code, so the split differs between runs.
for datum in training_data:
    if (random.randrange(2) == 0):
        train_data.append(datum)
    else:
        test_data.append(datum)

# Compute the global mean rating for a fallback.
# Only the local training half is counted here.
num_train = len(train_data)
def train_model_library(n_folds=5, n_folds_to_compute=5):
    """Train every model of the grid on cross-validation folds.

    The training set is loaded from ``TRAIN_PATH`` and split into ``n_folds``
    shuffled folds; at most ``n_folds_to_compute`` of them are processed.  For
    each processed fold every model of the grid is instantiated, fitted on the
    training part and asked for the positive-class probability of the
    validation part.

    Each entry ``m`` of the model grid is a list:

    - ``m[0]``: model index, i.e. its column in ``ensemble_library_pred``.
    - ``m[1]``: list collecting one fitted model object per processed fold.
    - ``m[2]``: per-fold predictions, held until the end and then emptied.
    - ``m[3]``: constructor of the model.
    - ``m[4]``: dict of keyword parameters for the constructor.
    - ``m[5]``: ``'unstandardized'`` to train on the raw features or
      ``'standardized'`` to train on the scaled features.

    Returns:
        ``(ensemble_library_pred, validation_labels, scaler, model_grid)``:
        ``ensemble_library_pred`` has one row per validation observation and
        one column per model; ``validation_labels`` holds the labels of those
        observations; ``scaler`` has been refitted on the complete feature
        matrix for later use on test data.

    Raises:
        ValueError: if a grid entry names an unknown dataset type.
    """
    ids, features, labels = util.load_train(TRAIN_PATH)
    folds = cross_validation.KFold(features.shape[0],
                                   n_folds=n_folds,
                                   shuffle=True)
    scaler = preprocessing.StandardScaler()

    model_grid = _generate_model_grid()

    tot_v_size = 0
    validation_labels = []
    for step, (train_idx, validate_idx) in enumerate(folds, 1):
        # Same output as the Python 2 statement form: text, space, number.
        print('cross validation step # ' + ' ' + str(step))
        training_features = scaler.fit_transform(features[train_idx, :])
        training_labels = labels[train_idx]
        validation_features = scaler.transform(features[validate_idx, :])
        validation_labels.append(labels[validate_idx])

        # Train each (model type, parameters) pair and predict the
        # validation points of the current fold.
        for entry in model_grid:
            print(entry)
            estimator = entry[3](**entry[4])
            dataset_kind = entry[5]
            if dataset_kind == 'unstandardized':
                fit_inputs = features[train_idx, :]
                predict_inputs = features[validate_idx, :]
            elif dataset_kind == 'standardized':
                fit_inputs = training_features
                predict_inputs = validation_features
            else:
                raise ValueError('dataset type not recognized')
            entry[1].append(estimator.fit(fit_inputs, training_labels))
            entry[2].append(estimator.predict_proba(predict_inputs)[:, 1])

        tot_v_size += validate_idx.size
        if step >= n_folds_to_compute:
            break

    # Calibrate the scaler on the entire training set for later testing.
    scaler.fit(features)
    # Stack the individual validation folds.
    validation_labels = np.concatenate(validation_labels)
    # Fill the prediction matrix and drop the per-fold predictions to
    # reduce memory.
    ensemble_library_pred = np.zeros((tot_v_size, len(model_grid)))
    for entry in model_grid:
        ensemble_library_pred[:, entry[0]] = np.concatenate(entry[2])
        entry[2] = []

    return ensemble_library_pred, validation_labels, scaler, model_grid
예제 #12
0
파일: amazon.py 프로젝트: ayoung01/cs181
import numpy as np
import util
import operator
import math
# import matplotlib.pyplot as plt

# User and book lists exposed by the util module.
# NOTE(review): both names are assigned again from the CSV files below, so
# these two assignments are overwritten before being used in this chunk.
user_list      = util.user_list
book_list      = util.book_list

# Output file for the predictions and the input data files.
pred_filename  = 'pred-amazon-baseline.csv'
train_filename = 'data/ratings-train.csv'
test_filename  = 'data/ratings-test.csv'
user_filename  = 'data/users.csv'
book_filename  = 'data/books.csv'

train  = util.load_train(train_filename)
test_queries   = util.load_test(test_filename)
user_list      = util.load_users(user_filename)
book_list      = util.load_books(book_filename)

# Compute the mean rating over all training ratings (4.070495 according to
# the original author).
train_mean = float(sum(map(lambda x: x['rating'], train)))/len(train)

# Turn the list of users into a dictionary keyed by user id.
# Every user starts with an empty dict; the intended final shape is the one
# sketched in the trailing comment.
users = {} # {user1: {isbn1: 4, isbn2: 5, ...}, user2: {...}, ...}
for user in user_list:
    users[user['user']] = {}

items = {} # {isbn1: {user1: 4, user2: 5, ...}, isbn2: {...}, ...} 
for item in book_list:
예제 #13
0
import numpy as np
import util

# This makes predictions based on the mean rating for each user in the
# training data.  When there are no training data for a user, it
# defaults to the global mean.

# Output file for the predictions and the input data files.
pred_filename = 'pred-user-mean.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'
user_filename = 'users.csv'

training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)
user_list = util.load_users(user_filename)

# Compute the global mean rating for a fallback.
# The division raises ZeroDivisionError if the training data is empty.
num_train = len(training_data)
mean_rating = float(sum(map(lambda x: x['rating'], training_data))) / num_train
print "The global mean rating is %0.3f." % (mean_rating)

# Turn the list of users into a dictionary keyed by user id.
# Store data for each user to keep track of the per-user average.
users = {}
for user in user_list:
    users[user['user']] = {
        'total': 0,  # For storing the total of ratings.
        'count': 0,  # For storing the number of ratings.
    }

# Iterate over the training data to compute means.
예제 #14
0
파일: viewer.py 프로젝트: BenWeber42/ml
from util import load_train, load_nifti1


def usage():
    """Print the command-line synopsis and terminate the interpreter.

    The program name shown is ``argv[0]``; ``exit()`` is then called without
    an argument.
    """
    help_lines = (
        '%s <num> | <path>' % argv[0],
        '  num: number of training mri to display',
        '  path: path to nni file',
    )
    for help_line in help_lines:
        print(help_line)
    exit()


# Exactly one command-line argument is required; otherwise show the usage
# text (usage() terminates the program).
if len(argv) != 2:
    usage()

# Interpret the argument as a training-sample number first; if the
# conversion or the load fails, treat it as a path to a NIfTI file.
try:
    num = int(argv[1])
    mri = load_train(num)
except Exception:
    # Deliberate best-effort fallback.  ``Exception`` rather than a bare
    # ``except`` so Ctrl-C (KeyboardInterrupt) and SystemExit still
    # propagate instead of triggering a second load attempt.
    mri = load_nifti1(argv[1])

# Slice indices along the third axis: the last valid index and the middle
# slice that is displayed first.
max_z = mri.shape[2] - 1
initial_z = int(max_z / 2)

# set up figure
fig = plt.figure()
ax = fig.add_subplot(111)
ax.autoscale(True)
# Leave room on the left and at the bottom of the axes.
plt.subplots_adjust(left=0.25, bottom=0.25)

# plot first data set
frame = 0
mri_plot = ax.imshow(mri[:, :, initial_z], cmap=plt.cm.gray)
예제 #15
0
파일: main.py 프로젝트: cyxtj/ppd
# coding=utf8
import numpy as np
import pandas as pd

from util import load_train
# Loaded at import time, i.e. also when this module is merely imported.
# NOTE(review): presumably features, labels and sample weights -- confirm
# in util.
X, y, w = load_train()

if __name__ == '__main__':
    import sys
    # The classifier is chosen by the first command-line argument; running
    # the script without one raises IndexError here.
    clf_name = sys.argv[1]
    print clf_name + '======================='
    # Disabled SVM variant (SVC does not provide probabilities here):
    # from sklearn import svm
    # wclf = svm.SVC(kernel='linear', class_weight={1: 10}) # svm don't provide proba

    # clf stays None for 'XGB', which is evaluated directly by test_xgb;
    # the other branches only construct the estimator.
    clf = None
    sample_weighted = True
    if clf_name == 'XGB':
        from train_predict import test_xgb
        test_xgb(X, y, w)

    elif clf_name =='Ada':
        from sklearn.ensemble import AdaBoostClassifier
        clf = AdaBoostClassifier(n_estimators=100)

    elif clf_name == 'DT':
        from sklearn.tree import DecisionTreeClassifier
        # Fixed random_state.
        clf = DecisionTreeClassifier(random_state=0)

    elif clf_name == 'GB':
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=500)
예제 #16
0
def generateFullTrainMatrix(detectorSettings=(1, 1, 1), partitions=(9, 9, 9)):
    """Build the list of edge-feature vectors of the training samples.

    Samples ``0`` to ``util.TRAIN_COUNT - 2`` are loaded with
    ``util.load_train`` and converted with ``generateEdgeFeaturesVector``.

    NOTE(review): ``detectorSettings`` and ``partitions`` are accepted but
    never used, and the range stops one short of ``util.TRAIN_COUNT`` --
    confirm whether both are intended.

    Returns:
        A list with one feature vector per loaded sample, in index order.
    """
    return [
        generateEdgeFeaturesVector(util.load_train(sample_index))
        for sample_index in range(util.TRAIN_COUNT - 1)
    ]
예제 #17
0
# coding=utf8
import numpy as np
import pandas as pd

from util import load_train
# Loaded at import time, i.e. also when this module is merely imported.
# NOTE(review): presumably features and labels -- confirm in util.
X, y = load_train()

if __name__ == '__main__':
    import sys
    # The classifier is chosen by the first command-line argument; running
    # the script without one raises IndexError here.
    clf_name = sys.argv[1]
    print clf_name + '======================='

    # 'XGB' is the only name handled in this chunk.
    if clf_name == 'XGB':
        from train_predict import test_xgb
        test_xgb(X, y)
예제 #18
0
import os
import util
import artifacts

# This script must run under CPython, not PyPy.

# Build a dict like {filename: label} for the whole training set and store
# it as the artifact named 'train_dict'.
# NOTE(review): the meaning of the True argument to load_train is not visible
# here -- confirm in util.
train = util.load_train(True)
artifacts.put_artifact(train, 'train_dict')

# Build a similar dict holding a sample of 20k positive and 20k negative
# instances.
# It is used for determining frequent tags, tokens, etc. for features.
# The dict is saved as artifacts/sample_20_20.pkl.
sample = util.create_sample('sample_20_20', 20000, 20000)
예제 #19
0
from dataset import DepthEigenDataset
from network import GlobalCoarseNet, LocalFineNet
from loss import ScaleInvariantLoss
import util

# Training device: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters.
num_epochs = 100  # not specified in paper

# Data directories and batch size.
# NOTE(review): the validation loader reads from 'nyu/test' -- confirm that
# validating on the test directory is intended.
data_dir_train = Path('nyu/train')
data_dir_valid = Path('nyu/test')
bs = 32
# load_train yields two data loaders plus the two counts printed below.
dataloader_train, dataloader_valid, datalen_train, datalen_valid = util.load_train(
    data_dir_train, data_dir_valid, bs)
print(datalen_train, datalen_valid)

# Networks.
# Instantiate both networks with init=False and move them to the device.
global_model = GlobalCoarseNet(init=False).to(device)
local_model = LocalFineNet(init=False).to(device)

# One loss instance per network.
global_criterion = ScaleInvariantLoss()
local_criterion = ScaleInvariantLoss()

# optimizer
r = 0.1
global_optimizer = torch.optim.SGD([{
    'params': global_model.coarse6.parameters(),