def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The CSV is parsed as floats and its first row is discarded. Column 0 is
    used as the target and the remaining columns as features. A 100-tree
    random forest is fitted on each of 5 K-Fold splits, each held-out fold is
    scored with ``logloss.llfun``, and the mean of the fold scores is printed.
    """
    # Read in data, parse into training and target sets. The context manager
    # closes the file handle once parsing is done (it used to be leaked).
    with open('Data/train.csv', 'r') as csv_file:
        dataset = np.genfromtxt(csv_file, delimiter=',', dtype='f8')[1:]

    # Column slicing replaces the per-row Python loops; the values are the same.
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier.
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation. 5 folds.
    # (Note: in older scikit-learn versions the "n_folds" argument is named "k".)
    cv = cross_validation.KFold(len(train), n_folds=5, indices=False)

    # Iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is passed to the scorer.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # Print out the mean of the cross-validated results. A single parenthesised
    # argument prints identically on Python 2 and Python 3.
    print("Results: " + str(np.array(results).mean()))
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The CSV is loaded with pandas. Column 0 is used as the target and the
    remaining columns as features. A 100-tree random forest (``n_jobs=-1``) is
    fitted on each of 10 K-Fold splits, each held-out fold is scored with
    ``logloss.llfun``, and the mean of the fold scores is printed.

    If the file cannot be read, the error message is printed and the function
    returns without doing any further work.
    """
    # Read data from csv.
    try:
        frame = pd.read_csv('Data/train.csv')
    except IOError:
        print("io ERROR-->Could not locate file.")
        # Without the data nothing below can run; falling through used to
        # raise NameError on the unbound variable.
        return

    # Iterating a DataFrame yields column labels, not rows, so the split has
    # to be done on the underlying array.
    data = frame.values
    target = data[:, 0]
    train = data[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier.
    model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Simple K-Fold cross validation. 10 folds.
    cv = KFold(n=len(train), n_folds=10, indices=False)

    # Iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list.
    results = []
    for traincv, testcv in cv:
        prob = model.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is passed to the scorer.
        results.append(logloss.llfun(target[testcv], [x[1] for x in prob]))

    # Print out the mean of the cross-validated results.
    print('Results: ', str(np.array(results).mean()))
def main():
    """Run 10-fold cross-validation of a random forest over ``Data/train.csv``.

    The CSV is read with pandas; its first column becomes the target vector
    and the other columns the feature matrix. Each fold is scored with
    ``logloss.llfun`` and the mean fold score is printed.

    When the CSV cannot be opened, the error message is printed and the
    function returns early.
    """
    # Read data from csv.
    try:
        frame = pd.read_csv('Data/train.csv')
    except IOError:
        print("io ERROR-->Could not locate file.")
        # Stop here: continuing would fail with NameError on the missing data.
        return

    # Use the row-wise array. Looping over the DataFrame itself walks its
    # column names, which is what the old comprehension was slicing.
    values = frame.values
    target = values[:, 0]
    train = values[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier.
    model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Simple K-Fold cross validation. 10 folds.
    cv = KFold(n=len(train), n_folds=10, indices=False)

    # Fit on each training segment, score the matching held-out segment, and
    # collect the per-fold scores.
    results = []
    for traincv, testcv in cv:
        prob = model.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is passed to the scorer.
        results.append(logloss.llfun(target[testcv], [x[1] for x in prob]))

    # Print out the mean of the cross-validated results.
    print('Results: ', str(np.array(results).mean()))
def main():
    """Cross-validate a random forest on ``train.csv`` and save its predictions.

    The CSV is parsed as floats and its first row is discarded. Column 0 is
    the target and the remaining columns are features. A 1000-tree random
    forest (``n_jobs=4``) is fitted on each of 5 K-Fold splits and every
    held-out fold is scored with ``logloss.llfun``. The list of fold scores is
    printed and the held-out probabilities are written to ``submission.csv``.
    """
    # Close the file handle once parsing is done (it used to be leaked).
    with open('train.csv', 'r') as csv_file:
        dataset = numpy.genfromtxt(csv_file, delimiter=',', dtype='f8')[1:]

    # Column slicing replaces the per-row Python loops; the values are the same.
    target = dataset[:, 0]
    train = dataset[:, 1:]

    rf = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    cv = sklearn.cross_validation.KFold(len(train), k=5, indices=False)

    results = []
    # NOTE(review): the original passed an undefined ``predicted_probs`` to
    # savetxt, which raised NameError. No test set is loaded in this function,
    # so the out-of-fold probabilities are the only predictions available
    # here -- confirm that this is what the submission file should contain.
    predicted_probs = numpy.zeros(len(train))
    for traincv, testcv in cv:
        probas = rf.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is scored and stored.
        fold_probs = [x[1] for x in probas]
        predicted_probs[testcv] = fold_probs
        results.append(logloss.llfun(target[testcv], fold_probs))

    # Single parenthesised argument: same output on Python 2 and Python 3.
    print(results)
    numpy.savetxt('submission.csv', predicted_probs, delimiter=',', fmt='%f')
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The first parsed row of the CSV is discarded. Column 0 is the target and
    the remaining columns are features. Each of 5 K-Fold held-out segments is
    scored with ``logloss.llfun`` and the mean of those scores is printed.
    """
    # Read in data, parse into training and target sets. The ``with`` block
    # releases the file handle that used to stay open.
    with open('Data/train.csv', 'r') as csv_file:
        dataset = np.genfromtxt(csv_file, delimiter=',', dtype='f8')[1:]

    # One slice per set instead of a Python-level loop over every row.
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier.
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), k=5, indices=False)

    # Iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is passed to the scorer.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # Print out the mean of the cross-validated results (same text on
    # Python 2 and Python 3).
    print("Results: " + str(np.array(results).mean()))
def main():
    """Print the mean 5-fold log-loss score of a random forest on ``Data/train.csv``.

    The CSV is parsed as floats and the first parsed row is dropped. Column 0
    is the target and the remaining columns are features. Every fold produced
    by ``KFold(n_splits=5)`` is fitted and scored with ``logloss.llfun``.
    """
    # Load the raw table, then discard its first row.
    raw_table = np.genfromtxt('Data/train.csv', delimiter=',', dtype='f8')
    dataset = raw_table[1:]

    # Split every row into its label (first value) and its features (the rest).
    target = np.array([row[0] for row in dataset])
    train = np.array([row[1:] for row in dataset])

    # A random forest here, but any classifier with the same API would do.
    cfr = RandomForestClassifier(n_estimators=100)
    kf = KFold(n_splits=5)

    def score_fold(fit_rows, held_out_rows):
        """Fit on one training segment and score the matching held-out segment."""
        fitted = cfr.fit(train[fit_rows], target[fit_rows])
        fold_probas = fitted.predict_proba(train[held_out_rows])
        # Only the second predict_proba column is passed to the scorer.
        second_column = [row[1] for row in fold_probas]
        return logloss.llfun(target[held_out_rows], second_column)

    # One score per fold, in the order the splitter yields them.
    results = [score_fold(fit_rows, held_out_rows)
               for fit_rows, held_out_rows in kf.split(train)]

    # Report the mean of the cross-validated scores.
    mean_score = np.array(results).mean()
    print("Results: ", str(mean_score))
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The first parsed row of the CSV is discarded. Column 0 is the target and
    the remaining columns are features. Each of 5 K-Fold held-out segments is
    scored with ``logloss.llfun`` and the mean of those scores is printed.
    """
    # Read in Data - Parse into Training - Target sets. The ``with`` block
    # closes the file handle that used to be leaked.
    with open('Data/train.csv', 'r') as csv_file:
        dataset = np.genfromtxt(csv_file, delimiter=',', dtype='f8')[1:]

    # The original handed bare generator expressions to np.array, which wraps
    # the generator in a 0-d object array instead of materialising the values;
    # len() and indexing then fail. Slice the columns directly instead.
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # Use RandomForestClassifier.
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation: 5 folds.
    # Note: In older scikit-learn versions the n_folds argument is named k.
    cv = cross_validation.KFold(len(train), n_folds=5, indices=False)

    # Iterate through Training - Test Cross Validation segments.
    # Run Classifier on each one - Aggregate the results into a list.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is passed to the scorer.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # Print out the mean of the cross-validated results (same text on
    # Python 2 and Python 3).
    print("Results: " + str(np.array(results).mean()))
def main():
    """Print the mean 5-fold log-loss score of a random forest on ``train.csv``.

    The ``Activity`` column is the target and every other column is a feature.
    Feature values marked ``'NaN'`` are replaced by their column mean before
    cross-validation. Each fold is scored with ``logloss.llfun``.
    """
    # Load the table and separate the label column from the features.
    dataset = pd.read_csv('train.csv')
    target = dataset['Activity'].values
    feature_frame = dataset.drop('Activity', axis=1)
    train = feature_frame.values

    # Fill missing feature values with the per-column mean.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    new_train_data = imp.fit_transform(train)

    # Other classifiers could be used as well.
    cfr = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(new_train_data), n_folds=5, indices=False)

    def score_fold(fit_rows, held_out_rows):
        """Fit on one training segment and score the matching held-out segment."""
        fitted = cfr.fit(new_train_data[fit_rows], target[fit_rows])
        fold_probas = fitted.predict_proba(new_train_data[held_out_rows])
        # Only the second predict_proba column is passed to the scorer.
        second_column = [row[1] for row in fold_probas]
        return logloss.llfun(target[held_out_rows], second_column)

    # One score per fold, in the order the splitter yields them.
    results = [score_fold(fit_rows, held_out_rows)
               for fit_rows, held_out_rows in cv]

    # Report the mean of the cross-validated scores.
    mean_score = np.array(results).mean()
    print("Results: " + str(mean_score))
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The ``Activity`` column is the target and every other column is a feature.
    Feature values marked ``'NaN'`` are replaced by their column mean, then a
    100-tree random forest (``n_jobs=-1``) is fitted on each of 5 K-Fold
    splits. Each held-out fold is scored with ``logloss.llfun`` and the mean
    of the fold scores is printed.
    """
    # Read in data, parse into training and target sets.
    dataset = pd.read_csv('Data/train.csv')
    target = dataset.Activity.values
    train = dataset.drop('Activity', axis=1).values

    # Fill missing feature values with the per-column mean.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    new_train_data = imp.fit_transform(train)

    # In this case we'll use a random forest, but this could be any classifier.
    cfr = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(new_train_data), n_folds=5, indices=False)

    # Iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(new_train_data[traincv], target[traincv]).predict_proba(new_train_data[testcv])
        # Only the second predict_proba column is passed to the scorer.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # Print out the mean of the cross-validated results. The parenthesised
    # single argument replaces a Python-2-only print statement and produces
    # the same text on both major versions.
    print("Results: " + str(np.array(results).mean()))
def main():
    """Cross-validate a random forest on CSV data read from standard input.

    The stream is parsed as comma-separated floats and the first parsed row is
    discarded. Column 0 is the target and the remaining columns are features.
    A fresh 100-tree random forest is fitted for each of 5 K-Fold splits, each
    held-out fold is scored with ``logloss.llfun``, and the mean of the fold
    scores is printed.
    """
    # Pull in data from the input stream (rather than a fixed file path).
    infile = sys.stdin

    # Read in data, parse into training and target sets.
    dataset = np.genfromtxt(infile, delimiter=',', dtype='f8')[1:]

    # Column slicing replaces the per-row Python loops; the values are the same.
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), k=5, indices=False)

    # Iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list.
    results = []
    for traincv, testcv in cv:
        # In this case we'll use a random forest, but this could be any
        # classifier. A new instance is built for every fold.
        cfr = RandomForestClassifier(n_estimators=100)

        # NOTE: a disabled experiment used to sit here. It trained a network
        # (buildNetwork / SupervisedDataSet / BackpropTrainer) on the same
        # fold and averaged its outputs with the forest's probabilities. Only
        # the forest is scored.
        fold_probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is passed to the scorer.
        probasRFC = [x[1] for x in fold_probas]
        results.append(logloss.llfun(target[testcv], probasRFC))

    # Print out the mean of the cross-validated results (same text on
    # Python 2 and Python 3).
    print("Results: " + str(np.array(results).mean()))
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The first parsed row of the CSV is discarded. Column 0 is the target and
    the remaining columns are features. Each of 5 K-Fold held-out segments is
    scored with ``logloss.llfun`` and the mean of those scores is printed.
    """
    # Read in data, parse into training and target sets. The ``with`` block
    # closes the file handle that used to be leaked.
    with open('Data/train.csv', 'r') as csv_file:
        dataset = np.genfromtxt(csv_file, delimiter=',', dtype='f8')[1:]

    # train and target must be numpy arrays; if they are lists, indexing them
    # with a fold fails with "TypeError: only integer arrays with one element
    # can be converted to an index". Slicing the parsed array keeps them so.
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier.
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation. 5 folds.
    # (Note: in older scikit-learn versions the "n_folds" argument is named "k".)
    cv = cross_validation.KFold(len(train), n_folds=5)

    # Iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second predict_proba column is passed to the scorer.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # Print out the mean of the cross-validated results (same text on
    # Python 2 and Python 3).
    print("Results: " + str(np.array(results).mean()))
__author__ = 'azmi'

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
import logloss
import numpy as np

# Script: 5-fold cross-validation of a random forest on the bioresponse
# training CSV, printing the mean of the per-fold logloss.llfun scores.

# Read in data, parse into training and target sets. The ``with`` block closes
# the file handle that used to be leaked; the first parsed row is discarded.
with open('/home/azmi/data/bioresponse/train.csv', 'r') as csv_file:
    dataset = np.genfromtxt(csv_file, delimiter=',', dtype='f8')[1:]

# Column 0 is the target, the remaining columns are the features. Slicing
# replaces the per-row Python loops; the values are the same.
target = dataset[:, 0]
train = dataset[:, 1:]

# In this case we'll use a random forest, but this could be any classifier.
cfr = RandomForestClassifier(n_estimators=100)

# Simple K-Fold cross validation. 5 folds.
cv = cross_validation.KFold(len(train), n_folds=5)

# Iterate through the training and test cross validation segments and
# run the classifier on each one, aggregating the results into a list.
results = []
for traincv, testcv in cv:
    probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
    # Only the second predict_proba column is passed to the scorer.
    results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

# Print out the mean of the cross-validated results (same text on Python 2
# and Python 3).
print("Results: " + str(np.array(results).mean()))
# Script fragment: reports timings, refits the model on the full data to
# write a submission file, then scores the same model with 5-fold cross
# validation. ``tick``, ``mse``, ``clf``, ``X``, ``y``, ``test``, ``get_tick``
# and ``get_dt`` come from earlier in the file.
print("Time to test training data = ", get_dt(tick))
print("Test Data MSE: %.4f" % mse)

# ##############################################################################
# Fit on the full data set and time it.
tick = get_tick()
clf.fit(X, y)
print("Time to fit data = ", get_tick() - tick)

# Predict the test set and time it.
tick = get_tick()
y_test = clf.predict(test)
print("Time to predict = ", get_dt(tick))

# One row per prediction: a 1-based id followed by the predicted value.
predicted_probs = [[molecule_id, x] for molecule_id, x in enumerate(y_test, 1)]
savetxt('GBR_full_submission.csv', predicted_probs, delimiter=',', fmt='%d,%f',
        header='MoleculeId,PredictedProbability', comments='')

###############################################################################
print("Fit regression model using KFold datasets")
tick = get_tick()
cv = cross_validation.KFold(len(y), n_folds=5)  # , indices=False)

# Iterate through the training and test cross validation segments and
# run the model on each one, aggregating the results into a list.
results = []
for traincv, testcv in cv:
    tick1 = get_tick()
    probas = clf.fit(X[traincv], y[traincv]).predict(X[testcv])
    results.append(logloss.llfun(y[testcv], probas))
    # Was a Python-2 print statement. The rest of this fragment already uses
    # the call form, and mixing the two is a SyntaxError on Python 3.
    print('Time to Train/Test set = ', get_dt(tick1))

# Print out the mean of the cross-validated results.
print("Time for KFold regression = ", get_dt(tick))
# NOTE(review): the label says "MSE" but the values averaged here come from
# logloss.llfun -- confirm which metric is intended before renaming.
print("GBR KFold Train/Test MSE: " + str(np.array(results).mean()))