예제 #1
0
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    Column 0 of the CSV is used as the target and the remaining columns as
    the features.  The classifier is refit on every training segment of a
    5-fold split, the held-out segment is scored with ``logloss.llfun``, and
    the mean of the per-fold scores is printed.
    """
    # Read inside a context manager so the file handle is closed promptly.
    # The first parsed row is dropped (presumably the CSV header -- confirm).
    with open('Data/train.csv', 'r') as handle:
        dataset = np.genfromtxt(handle, delimiter=',', dtype='f8')[1:]
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation. 5 folds.
    # (Note: in older scikit-learn versions the "n_folds" argument is named "k".)
    cv = cross_validation.KFold(len(train), n_folds=5, indices=False)

    # Fit on each training segment and score the matching held-out segment.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv],
                         target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is scored.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Results: " + str(np.array(results).mean()))
예제 #2
0
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The first CSV column is used as the target and the remaining columns as
    the features.  A 10-fold split is iterated; each held-out segment is
    scored with ``logloss.llfun`` and the mean of the scores is printed.
    If the file cannot be read, a message is printed and the function
    returns without training.
    """
    try:
        data = pd.read_csv('Data/train.csv')
    except IOError:
        print("io ERROR-->Could not locate file.")
        # Nothing can be trained without the data; stop here instead of
        # falling through to an unbound-variable error below.
        return

    # Iterating a DataFrame yields its column labels, not its rows, so the
    # target and feature arrays are taken by column position instead.
    target = data.iloc[:, 0].values
    train = data.iloc[:, 1:].values

    # in this case we'll use a random forest, but this could be any classifier
    model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # simple K-Fold cross validation. 10 folds.
    cv = KFold(n=len(train), n_folds=10, indices=False)

    # Fit on each training segment and score the matching held-out segment.
    results = []
    for traincv, testcv in cv:
        prob = model.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is scored.
        results.append(logloss.llfun(target[testcv], [x[1] for x in prob]))

    # print out the mean of the cross-validated results
    print('Results: ', str(np.array(results).mean()))
예제 #3
0
def main():
    """Run 10-fold cross validation of a random forest on ``Data/train.csv``.

    The target is the first CSV column; every other column is a feature.
    Each fold's held-out rows are scored with ``logloss.llfun`` and the mean
    score is printed.  When the CSV cannot be opened, an error message is
    printed and the function returns early.
    """
    try:
        frame = pd.read_csv('Data/train.csv')
    except IOError:
        print("io ERROR-->Could not locate file.")
        # Without data there is nothing to fit; returning avoids an
        # unbound-variable error on the lines below.
        return

    # A DataFrame iterates over column labels, so rows cannot be sliced with
    # a comprehension; select the columns positionally instead.
    target = frame.iloc[:, 0].values
    train = frame.iloc[:, 1:].values

    # in this case we'll use a random forest, but this could be any classifier
    model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # simple K-Fold cross validation. 10 folds.
    cv = KFold(n=len(train), n_folds=10, indices=False)

    # Fit on each training segment and score the matching held-out segment.
    results = []
    for traincv, testcv in cv:
        prob = model.fit(train[traincv],
                         target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is scored.
        results.append(logloss.llfun(target[testcv], [x[1] for x in prob]))

    # print out the mean of the cross-validated results
    print('Results: ', str(np.array(results).mean()))
def main():
    """Cross-validate a random forest on ``train.csv`` and save its predictions.

    Column 0 of the CSV is the target, the remaining columns are features.
    For each of 5 folds the forest is refit on the training segment and the
    held-out segment is scored with ``logloss.llfun``.  The list of per-fold
    scores is printed, and the held-out probabilities for every row are
    written to ``submission.csv``.
    """
    # Read inside a context manager so the file handle is closed promptly.
    # The first parsed row is dropped (presumably the CSV header -- confirm).
    with open('train.csv', 'r') as handle:
        dataset = numpy.genfromtxt(handle, delimiter=',', dtype='f8')[1:]
    target = dataset[:, 0]
    train = dataset[:, 1:]

    rf = RandomForestClassifier(n_estimators=1000, n_jobs=4)

    # The fold count is passed positionally because its keyword name differs
    # between scikit-learn releases ("k" in older ones, "n_folds" later).
    cv = sklearn.cross_validation.KFold(len(train), 5, indices=False)

    results = []
    # NOTE(review): the original referenced an undefined ``predicted_probs``
    # when saving.  It is assumed here to mean the out-of-fold probability of
    # each training row -- confirm this is what the submission should hold.
    predicted_probs = numpy.zeros(len(train))
    for traincv, testcv in cv:
        probas = rf.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is kept.
        fold_probs = [x[1] for x in probas]
        predicted_probs[testcv] = fold_probs
        results.append(logloss.llfun(target[testcv], fold_probs))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print(results)

    numpy.savetxt('submission.csv', predicted_probs, delimiter=',', fmt='%f')
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    Column 0 of the CSV is used as the target and the remaining columns as
    the features.  The classifier is refit on every training segment of a
    5-fold split, the held-out segment is scored with ``logloss.llfun``, and
    the mean of the per-fold scores is printed.
    """
    # Read inside a context manager so the file handle is closed promptly.
    # The first parsed row is dropped (presumably the CSV header -- confirm).
    with open('Data/train.csv', 'r') as handle:
        dataset = np.genfromtxt(handle, delimiter=',', dtype='f8')[1:]
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation. 5 folds.
    # The fold count is passed positionally because its keyword name differs
    # between scikit-learn releases ("k" in older ones, "n_folds" later).
    cv = cross_validation.KFold(len(train), 5, indices=False)

    # Fit on each training segment and score the matching held-out segment.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is scored.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Results: " + str(np.array(results).mean()))
예제 #6
0
def main():
    """Score a random forest with 5-fold cross validation and print the mean.

    ``Data/train.csv`` is parsed as floats; the first parsed row is
    discarded, column 0 becomes the target and the remaining columns the
    features.  Every fold's held-out rows are scored with ``logloss.llfun``.
    """
    parsed = np.genfromtxt('Data/train.csv', delimiter=',', dtype='f8')
    rows = parsed[1:]
    labels = rows[:, 0]
    features = rows[:, 1:]

    forest = RandomForestClassifier(n_estimators=100)
    splitter = KFold(n_splits=5)

    fold_scores = []
    for fit_idx, holdout_idx in splitter.split(features):
        forest.fit(features[fit_idx], labels[fit_idx])
        holdout_probas = forest.predict_proba(features[holdout_idx])
        # Only the second column of predict_proba's output is scored.
        second_column = [row[1] for row in holdout_probas]
        fold_scores.append(logloss.llfun(labels[holdout_idx], second_column))

    mean_score = np.array(fold_scores).mean()
    print("Results: ", str(mean_score))
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    Column 0 of the CSV is used as the target and the remaining columns as
    the features.  The classifier is refit on every training segment of a
    5-fold split, the held-out segment is scored with ``logloss.llfun``, and
    the mean of the per-fold scores is printed.
    """
    # Read inside a context manager so the file handle is closed promptly.
    # The first parsed row is dropped (presumably the CSV header -- confirm).
    with open('Data/train.csv', 'r') as handle:
        dataset = np.genfromtxt(handle, delimiter=',', dtype='f8')[1:]
    # Slice the columns directly: handing np.array() a bare generator wraps
    # the generator object itself in a 0-d array, which has no len() and
    # cannot be indexed by the fold masks.
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # Use RandomForestClassifier
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation: 5 folds
    # Note: In older scikit-learn versions the n_folds argument is named k
    cv = cross_validation.KFold(len(train), n_folds=5, indices=False)

    # Fit on each training segment and score the matching held-out segment.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is scored.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Results: " + str(np.array(results).mean()))
def main():
    """Score a random forest with 5-fold cross validation and print the mean.

    ``train.csv`` is loaded with pandas; the ``Activity`` column is the
    target and all other columns are features.  Missing feature values are
    mean-imputed before the folds are built, and every fold's held-out rows
    are scored with ``logloss.llfun``.
    """
    frame = pd.read_csv('train.csv')
    labels = frame['Activity'].values
    raw_features = frame.drop('Activity', axis=1).values

    # Fill NaN entries with the mean along axis 0.
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    features = imputer.fit_transform(raw_features)

    # Other classifiers could be used as well
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    folds = cross_validation.KFold(len(features), n_folds=5, indices=False)

    fold_scores = []
    for fit_rows, holdout_rows in folds:
        forest.fit(features[fit_rows], labels[fit_rows])
        holdout_probas = forest.predict_proba(features[holdout_rows])
        # Only the second column of predict_proba's output is scored.
        second_column = [row[1] for row in holdout_probas]
        fold_scores.append(logloss.llfun(labels[holdout_rows], second_column))

    mean_score = np.array(fold_scores).mean()
    print("Results: " + str(mean_score))
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    The ``Activity`` column is the target and all other columns are
    features.  Missing feature values are mean-imputed, the classifier is
    refit on every training segment of a 5-fold split, each held-out segment
    is scored with ``logloss.llfun``, and the mean score is printed.
    """
    # read in data, parse into training and target sets
    dataset = pd.read_csv('Data/train.csv')
    target = dataset.Activity.values
    train = dataset.drop('Activity', axis=1).values

    # Fill NaN entries with the mean along axis 0.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    new_train_data = imp.fit_transform(train)

    # In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(new_train_data), n_folds=5, indices=False)

    # Fit on each training segment and score the matching held-out segment.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(new_train_data[traincv], target[traincv]).predict_proba(new_train_data[testcv])
        # Only the second column of predict_proba's output is scored.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Results: " + str(np.array(results).mean()))
예제 #10
0
def main():
    """Cross-validate a random forest on CSV data read from standard input.

    Column 0 of the input is used as the target and the remaining columns as
    the features.  A new classifier is created and fit for every training
    segment of a 5-fold split, the held-out segment is scored with
    ``logloss.llfun``, and the mean of the per-fold scores is printed.
    """
    # pull in data from input stream
    infile = sys.stdin

    # The first parsed row is dropped (presumably the CSV header -- confirm).
    dataset = np.genfromtxt(infile, delimiter=',', dtype='f8')[1:]
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # Simple K-Fold cross validation. 5 folds.
    # The fold count is passed positionally because its keyword name differs
    # between scikit-learn releases ("k" in older ones, "n_folds" later).
    cv = cross_validation.KFold(len(train), 5, indices=False)

    # Fit on each training segment and score the matching held-out segment.
    # NOTE: a disabled experiment that averaged these probabilities with a
    # small backprop network lived here as commented-out code; it was removed
    # as dead code.
    results = []
    for traincv, testcv in cv:
        # In this case we'll use a random forest, but this could be any
        # classifier.  A new instance is built for every fold.
        cfr = RandomForestClassifier(n_estimators=100)
        fold_probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is scored.
        probasRFC = [x[1] for x in fold_probas]
        results.append(logloss.llfun(target[testcv], probasRFC))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Results: " + str(np.array(results).mean()))
예제 #11
0
파일: cv.py 프로젝트: flian2/kaggle
def main():
    """Cross-validate a random forest on ``Data/train.csv`` and print the mean score.

    Column 0 of the CSV is used as the target and the remaining columns as
    the features.  The classifier is refit on every training segment of a
    5-fold split, the held-out segment is scored with ``logloss.llfun``, and
    the mean of the per-fold scores is printed.
    """
    # Read inside a context manager so the file handle is closed promptly.
    # The first parsed row is dropped (presumably the CSV header -- confirm).
    with open('Data/train.csv', 'r') as handle:
        dataset = np.genfromtxt(handle, delimiter=',', dtype='f8')[1:]
    # train and target must be numpy arrays: indexing a plain list with the
    # fold indices raises "TypeError: only integer arrays with one element
    # can be converted to an index".
    target = dataset[:, 0]
    train = dataset[:, 1:]

    # In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=100)

    # Simple K-Fold cross validation. 5 folds.
    # (Note: in older scikit-learn versions the "n_folds" argument is named "k".)
    cv = cross_validation.KFold(len(train), n_folds=5)

    # Fit on each training segment and score the matching held-out segment.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        # Only the second column of predict_proba's output is scored.
        results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Results: " + str(np.array(results).mean()))
__author__ = 'azmi'

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
import logloss
import numpy as np


# Read the training data and split it into the target (column 0) and the
# feature matrix (remaining columns).  The file is opened in a context manager
# so the handle is closed promptly; the first parsed row is dropped
# (presumably the CSV header -- confirm).
with open('/home/azmi/data/bioresponse/train.csv', 'r') as handle:
    dataset = np.genfromtxt(handle, delimiter=',', dtype='f8')[1:]
target = dataset[:, 0]
train = dataset[:, 1:]

# In this case we'll use a random forest, but this could be any classifier
cfr = RandomForestClassifier(n_estimators=100)

# Simple K-Fold cross validation. 5 folds.
cv = cross_validation.KFold(len(train), n_folds=5)

# Fit on each training segment, score the matching held-out segment with
# logloss.llfun, and collect the per-fold scores.
results = []
for traincv, testcv in cv:
    probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
    # Only the second column of predict_proba's output is scored.
    results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

# Print the mean of the cross-validated results.  A single parenthesised
# argument prints identically on Python 2 and 3.
print("Results: " + str(np.array(results).mean()))

예제 #13
0
# Every message below is built as ONE string before printing.  The original
# mixed print(a, b) calls with "print a, b" statements, which prints tuples on
# Python 2 and fails to parse on Python 3; "%s %s" reproduces the
# space-separated output on both.
print("%s %s" % ("Time to test training data = ", get_dt(tick)))
print("Test Data MSE: %.4f" % mse)
#
##############################################################################
# Refit on the full data set, predict the test set and write the submission.
tick = get_tick()
clf.fit(X, y)
print("%s %s" % ("Time to fit data = ", get_tick() - tick))
tick = get_tick()
y_test = clf.predict(test)
print("%s %s" % ("Time to predict = ", get_dt(tick)))
# Pair each prediction with a 1-based id to match the two-column header.
predicted_probs = [[index + 1, x] for index, x in enumerate(y_test)]
savetxt('GBR_full_submission.csv', predicted_probs, delimiter=',', fmt='%d,%f',
        header='MoleculeId,PredictedProbability', comments='')
###############################################################################
print("Fit regression model using KFold datasets")
tick = get_tick()
cv = cross_validation.KFold(len(y), n_folds=5)  #, indices=False)

# Fit on each training segment, score the matching held-out segment with
# logloss.llfun, and collect the per-fold scores.
results = []
for traincv, testcv in cv:
    tick1 = get_tick()
    probas = clf.fit(X[traincv], y[traincv]).predict(X[testcv])
    results.append(logloss.llfun(y[testcv], probas))
    print("%s %s" % ("Time to Train/Test set = ", get_dt(tick1)))

# Print the mean of the cross-validated results.
# NOTE(review): the label says "MSE" but the values come from logloss.llfun --
# confirm which metric is intended before renaming the message.
print("%s %s" % ("Time for KFold regression = ", get_dt(tick)))
print("GBR KFold Train/Test MSE: " + str(np.array(results).mean()))
예제 #14
0
__author__ = 'azmi'

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
import logloss
import numpy as np

# Read the training data and split it into the target (column 0) and the
# feature matrix (remaining columns).  The file is opened in a context manager
# so the handle is closed promptly; the first parsed row is dropped
# (presumably the CSV header -- confirm).
with open('/home/azmi/data/bioresponse/train.csv', 'r') as handle:
    dataset = np.genfromtxt(handle,
                            delimiter=',',
                            dtype='f8')[1:]
target = dataset[:, 0]
train = dataset[:, 1:]

# In this case we'll use a random forest, but this could be any classifier
cfr = RandomForestClassifier(n_estimators=100)

# Simple K-Fold cross validation. 5 folds.
cv = cross_validation.KFold(len(train), n_folds=5)

# Fit on each training segment, score the matching held-out segment with
# logloss.llfun, and collect the per-fold scores.
results = []
for traincv, testcv in cv:
    probas = cfr.fit(train[traincv],
                     target[traincv]).predict_proba(train[testcv])
    # Only the second column of predict_proba's output is scored.
    results.append(logloss.llfun(target[testcv], [x[1] for x in probas]))

# Print the mean of the cross-validated results.  A single parenthesised
# argument prints identically on Python 2 and 3.
print("Results: " + str(np.array(results).mean()))