Example #1
def main():
    #read in data, parse into training and target sets
    train = csv.read_data("../Data/train.csv")
    target = np.array( [x[0] for x in train] )
    train = np.array( [x[1:280] for x in train] )

    #In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=120, min_samples_split=2, n_jobs=-1, max_depth=None) #.46
    #cfr = GradientBoostingClassifier(n_estimators=120, learn_rate=0.57, max_depth=1) #.50
    #cfr = ExtraTreesClassifier(n_estimators=120, max_depth=None, min_samples_split=1) #.489

    #Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), k=5, indices=False)

    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    results = []
    count = 0
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        result = logloss.llfun(target[testcv], [x[1] for x in probas])
        count += 1
        print('fold: %d, result: %f' % (count, result))
        results.append( result )

    #print out the mean of the cross-validated results
    print "Results: " + str( np.array(results).mean() )

    test = csv.read_data("../Data/test.csv")
    predicted_probs = cfr.predict_proba( [x[0:279] for x in test])
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv.write_delimited_file("../Submissions/rf_cv.csv",
                                predicted_probs)
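The logloss module used above is not shown on this page. A minimal sketch of what llfun plausibly computes, assuming it is the standard binary log loss that Examples #11 and #50 spell out inline (the clipping keeps log() finite); the real helper in these repositories may differ:

import math

def llfun(act, pred):
    # mean negative log-likelihood of the true labels under the
    # predicted probabilities, clipped to avoid log(0)
    epsilon = 1e-15
    total = 0.0
    for a, p in zip(act, pred):
        p = min(max(p, epsilon), 1 - epsilon)
        total += a * math.log(p) + (1 - a) * math.log(1 - p)
    return -total / len(act)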
Example #2
File: Analyze1.py Project: mb16/Kaggle
def Analyze1():

	Threshold = 4.0  
	targetFile = "Target_Stack_20121017110223_3.06649134025_GradientBoos.csv"

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv", skipFirstLine = False, split = "\t")
	shutil.copy2("PreProcessData/test_PreProcess3.csv", "PreProcessData/test_PreProcess8.csv")	
	shutil.copy2("PreProcessData/DataClassList3.csv", "PreProcessData/DataClassList8.csv")	
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)
	
	target = [x[0] for x in trainBase]
	
	
	print "Loading Data"
	trainNew = []
	
	probSum = 0.0
	weightSum = 0
	
	trn = csv_io.read_data("../predictions/" + targetFile, split="," ,skipFirstLine = False)
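	# keep rows whose prediction missed the target by more than Threshold,
	# accumulating a weighted mean absolute error over those rows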
	for row, datum in enumerate(trn):

		if ( abs(datum[0] - target[row]) > Threshold):
			print datum[0], target[row]
			trainNew.append(trainBase[row])
			
			probSum += weights[row][0] * math.fabs(target[row] - datum[0])
			weightSum += weights[row][0]
		
		
	print "Train Score: ", (probSum/weightSum)	
	print len(trainNew)
	csv_io.write_delimited_file("PreProcessData/training_PreProcess8" + ".csv", trainNew, delimiter="\t")	
Example #3
def main():
    #read in the training file
    train = csv_io.read_data("../data/train.csv")
    #set the training responses
    target = [x[0] for x in train]
    #set the training features
    train = [x[1:] for x in train]
    #read in the test file
    realtest = csv_io.read_data("../data/test.csv")

    # random forest code
    rf = RandomForestClassifier(n_estimators=150,
                                min_samples_split=2,
                                n_jobs=-1)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)
    # run model against test data
    predicted_probs = rf.predict_proba(realtest)

    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)

    print(
        'Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle'
    )
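Nearly every example on this page calls a project-local csv_io module that is never shown. A minimal sketch of its two most common helpers, assuming numeric CSV files and the call signatures seen in these examples (read_data(filename, skipFirstLine, split) and write_delimited_file(filename, rows, ...)); the real module also has variants such as write_delimited_file_single and write_delimited_file_GUID that are not sketched here:

import csv

def read_data(file_name, skipFirstLine=False, split=","):
    # parse each non-empty line into a list of floats
    with open(file_name) as f:
        if skipFirstLine:
            f.readline()
        return [[float(s) for s in line.strip().split(split)]
                for line in f if line.strip()]

def write_delimited_file(file_name, rows, header=None, delimiter=",", filemode="w"):
    # rows may be lists of values or single preformatted strings
    with open(file_name, filemode) as f:
        writer = csv.writer(f, delimiter=delimiter)
        if header:
            writer.writerow(header)
        for row in rows:
            writer.writerow(row if isinstance(row, list) else [row])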
Example #4
def main():
    #read in the training file
    train = csv_io.read_data("data/train.csv")
    target = ravel(csv_io.read_data("data/trainLabels.csv"))

    realtest = csv_io.read_data("data/test.csv")
    print len(realtest)

    # random forest code
    rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1, random_state=1, oob_score=True)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)

    # run model against test data
    predicted_probs = rf.predict_proba(realtest)
    predicted_class = rf.predict(realtest)
    print predicted_class[1:10]
    print(len(predicted_class))

    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))

    csv_io.write_delimited_file("results/random_forest_solution.csv", predicted_class, header=['Id', 'Solution'])
Example #5
def PreProcess3():

    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2_temp.csv", False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2_temp.csv", False)

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    NumFeatures = 200

    #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True)
    chi = chi2(train, target)
    print "Training"
    #clf.fit(train, target)

    chi = SelectKBest(chi2, k=NumFeatures).fit(train, target)
    print chi.get_support(indices=True)
    print chi.transform(train), np.array(train)[:, [0]]

    return

    # NOTE: everything below this return is unreachable legacy code; it
    # relies on the RandomForestClassifier (clf) that is commented out above.

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_
    #print importances
    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]
        #print "Sorted and deleted importances"
        #print importancesTemp

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    #print impIndex, len(importances)
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess2_chi.csv",
                                trainNew)
    csv_io.write_delimited_file("PreProcessData/test_PreProcess2_chi.csv",
                                testNew)
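As written, Example #5 transforms an undefined X and then returns before the importance-based path, so the selection never completes. A minimal sketch of the chi-squared feature selection it appears to be aiming for, assuming non-negative feature values (a requirement of chi2); select_features is a hypothetical helper name:

from sklearn.feature_selection import SelectKBest, chi2

def select_features(train, target, test, num_features=200):
    # fit the selector on the training features only, then apply the same
    # fitted column mask to both sets so their columns stay aligned
    selector = SelectKBest(chi2, k=num_features).fit(train, target)
    return selector.transform(train), selector.transform(test)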
Example #6
File: blend.py Project: mb16/Kaggle
def Blend():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine = False, split = "\t")
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)
		
	SEED = 448
	#random.seed(SEED)
	#random.shuffle(trainBase)
	
	target = [x[0] for x in trainBase]
	
	dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)
	clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
		]
	
	
	# clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]
	
	test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
	dataset_blend_test_set = np.zeros((len(test), len(clfs)))
	
	for ExecutionIndex, clf in enumerate(clfs):

		clf.fit(dataset_blend_train, target)
		submission = clf.predict(dataset_blend_test)
		
		submission = ["%f" % x for x in submission]
		now = datetime.datetime.now()
		csv_io.write_delimited_file("../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv", submission)	
		

		
		# attempt to score the training set to predict score for blend...
		probSum = 0.0
		weightSum = 0
		
		trainPrediction = clf.predict(dataset_blend_train)
		for i in range(0, len(trainPrediction)):
			probX = trainPrediction[i]
			

			probSum += weights[i][0] * math.fabs(target[i] - probX)
			weightSum += weights[i][0]
			#probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
			 
		print "Train Score: ", (probSum/weightSum)
	
		dataset_blend_test_set[:, ExecutionIndex] = submission
	
	
	
	csv_io.write_delimited_file_single("../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv", dataset_blend_test_set.mean(1))	
Example #7
File: blend.py Project: zyx061212/Kaggle
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv",
                                 skipFirstLine=False,
                                 split="\t")
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    SEED = 448
    #random.seed(SEED)
    #random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)
    clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]

    # clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]

    test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
    dataset_blend_test_set = np.zeros((len(test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):

        clf.fit(dataset_blend_train, target)
        submission = clf.predict(dataset_blend_test)

        submission = ["%f" % x for x in submission]
        now = datetime.datetime.now()
        csv_io.write_delimited_file(
            "../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") +
            ".csv", submission)

        # attempt to score the training set to predict score for blend...
        probSum = 0.0
        weightSum = 0

        trainPrediction = clf.predict(dataset_blend_train)
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]

            probSum += weights[i][0] * math.fabs(target[i] - probX)
            weightSum += weights[i][0]
            #probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)

        print "Train Score: ", (probSum / weightSum)

        dataset_blend_test_set[:, ExecutionIndex] = submission

    csv_io.write_delimited_file_single(
        "../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        dataset_blend_test_set.mean(1))
Example #8
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    svc = svm.SVC(kernel='poly', degree=2)
    scores = cross_val_score(svc, training, target, cv=10)
    print np.mean(scores)
Example #9
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    rf = RandomForestClassifier(n_estimators=150, max_features=0.012)
    scores = cross_val_score(rf, training, target, cv=10)
    print np.mean(scores)
Example #10
def main():
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../Data/test.csv")

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(train, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #11
File: blend.py Project: mb16/Kaggle
def Blend():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)
	
	SEED = 448
	random.seed(SEED)
	random.shuffle(trainBase)
	
	target = [x[0] for x in trainBase]
	
	dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

	clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]
	
	test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
	dataset_blend_test_j = np.zeros((len(test), len(clfs)))
	
	for ExecutionIndex, clf in enumerate(clfs):
		#clf = LogisticRegression()
		clf.fit(dataset_blend_train, target)
		submission = clf.predict_proba(dataset_blend_test)[:,1]
		
		submission = ["%f" % x for x in submission]
		now = datetime.datetime.now()
		csv_io.write_delimited_file_GUID("../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", submission)	
		

		
		# attempt to score the training set to predict score for blend...
		probSum = 0.0
		trainPrediction = clf.predict_proba(dataset_blend_train)[:,1]
		for i in range(0, len(trainPrediction)):
			probX = trainPrediction[i]
			if ( probX > 0.999):
				probX = 0.999;		
			if ( probX < 0.001):
				probX = 0.001;

			probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
			 
		print "Train Score: ", (-probSum/len(trainPrediction))
	
		dataset_blend_test_j[:, ExecutionIndex] = submission
	
	
	
	csv_io.write_delimited_file_GUID_numpy("../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))	
	var = raw_input("Enter to terminate.")	
Example #12
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    test, throwaway = csv_io.read_data("../Data/test.csv")
    
    n_test = len(test)
    n_target = len(set(target))

    predicted_probs = [[0.001 for x in range(n_target)] 
                       for y in range(n_test)]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/uniform_benchmark.csv",
                                predicted_probs)
Example #13
def main():
    train = csv_io.read_data("../Data/train.csv")
    targets = [int(x[0]) for x in train]
    num_targets = len(targets)
    num_ones = np.sum(targets)
    optimized_value = float(num_ones) / num_targets

    test = csv_io.read_data("../Data/test.csv")
    
    predicted_probs = ["%f" % optimized_value for x in test] 
    csv_io.write_delimited_file("../Submissions/optimized_value_benchmark.csv",
                                predicted_probs)
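For a constant submission p scored with log loss, the training loss is -(1/N) * sum_i [y_i*log(p) + (1 - y_i)*log(1 - p)], which is minimized exactly at p = num_ones / num_targets, so submitting the positive-class rate for every test row is the best possible constant predictor.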
Example #14
def main():
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../Data/test.csv")

    svc = svm.SVC(probability=True)
    svc.fit(train, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv",
                                predicted_probs)
Example #15
def main():
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../Data/test.csv")

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(train, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #16
def main():
    train = csv_io.read_data("../data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../data/test.csv")

    svc = svm.SVC(probability=True)
    svc.fit(train, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../submissions/svm_benchmark.csv",
                                predicted_probs)
Example #17
def PreProcessRun(dataSet):
    print
    print "DataSet: ", dataSet

    print "Loading Data"
    data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess.csv",
                            split="\t",
                            skipFirstLine=False)
    print dataSet, "Size: ", len(data[0])

    if (dataSet == "training"):  # do only once.
        shutil.copy2("PreProcessData/DataClassList.csv",
                     "PreProcessData/DataClassList1.csv")

    DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv",
                                     False)

    offset = 0
    offset2 = 0
    if (dataSet == "test"):
        offset = 1
        offset2 = -1

    print DataClassList

    print "Appending New Data"
    firstTime = True
    for row in data:

        text = ""

        val = row[136 + offset2] / row[139 + offset2]
        row.append(val)
        if (firstTime and dataSet == "training"):  # do only once.
            text = DataClassList[135 + offset][0] + "_DIV_" + DataClassList[
                139 + offset][0]
            csv_io.write_delimited_file("PreProcessData/DataClassList1.csv",
                                        [text],
                                        filemode="a")
        if (firstTime):
            print row[136 + offset2], row[139 + offset2], val, text

        firstTime = False

    csv_io.write_delimited_file("PreProcessData/" + dataSet +
                                "_PreProcess1.csv",
                                data,
                                delimiter="\t")

    print "Done."
Example #18
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    svc = svm.SVC(probability=True)
    svc.fit(training, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = [[min(max(x,0.001),0.999) for x in y] 
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv",
                                predicted_probs)
Example #19
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    svc = svm.SVC(probability=True)
    svc.fit(training, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = [[min(max(x, 0.001), 0.999) for x in y]
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv",
                                predicted_probs)
Example #20
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(training, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = [[min(max(x,0.001),0.999) for x in y] 
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #21
def main():
    np.random.seed(42)
    #read in the training file
    modelone = np.asarray(csv_io.read_data("results/gmm_pca12_6.csv",header=True))
    modeltwo = np.asarray(csv_io.read_data("results/random_forest_solution-12pca-4.csv",header=True))
    modelthree= np.asarray(csv_io.read_data("results/svm_pca12_5.csv",header=True))
    bagmodel = np.column_stack((modelone[:,1], modeltwo[:,1], modelthree[:,1]))
    bagsum = bagmodel.sum(axis=1)
    predicted_class = np.zeros(bagsum.shape)
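    # majority vote: label 1 whenever at least two of the three models predict 1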
    predicted_class[bagsum >=2] = 1
    
    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))
    csv_io.write_delimited_file("results/bagging_solution_7.csv", predicted_class, header=['Id', 'Solution'])
Example #22
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(training, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = [[min(max(x, 0.001), 0.999) for x in y]
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #23
def main(strat = False, visualization = False):
    #read in the training file
    X = csv_io.read_data("data/train.csv")
    target = ravel(csv_io.read_data("data/trainLabels.csv"))
    realtest = csv_io.read_data("data/test.csv")
    print len(realtest)

    #pca
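    # num_pca is assumed to be defined at module level; the output file name
    # svm_pca12_5.csv suggests 12 components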
    pca = PCA(n_components=num_pca)
    pca.fit(X)
    train = pca.transform(X)
    test_transformed = pca.transform(realtest)
    print('performed pca')

    # random forest code
    clf = svm.SVC()
    if strat:
        print "stratified cross-validation on shuffled data"    
        # adapted from http://stackoverflow.com/a/8281241
        crossval = []
        for i in range(strat):
            X, y = shuffle(train, target, random_state=i)
            skf = StratifiedKFold(y, 10)
            crossval.append([min(cross_val_score(clf, X, y, cv=skf)), np.median(cross_val_score(clf, X, y, cv=skf)), max(cross_val_score(clf, X, y, cv=skf))]) 
        print crossval

    if visualization:
        print "preparing visualization"
        data_train, data_test, target_train, target_test = train_test_split(train, target, test_size=0.20, random_state=42)
        plot1 = drawLearningCurve(clf, data_train, target_train, data_test, target_test)
        pp = PdfPages('figures/learningCurve.pdf')
        pp.savefig(plot1)
        pp.close()

    print('fitting the model')
    clf.fit(train, target)
    # run model against test data
    predicted_class = clf.predict(test_transformed)
    print predicted_class[0:9]
    print(len(predicted_class))

    print('Writing output')
    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))
    csv_io.write_delimited_file("results/svm_pca12_5.csv", predicted_class, header=['Id', 'Solution'])

    print ('Finished. Exiting.')
Example #24
File: blend_knn.py Project: mb16/Kaggle
def Blend():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)
	target = [x[0] for x in trainBase]
	
	dataset_blend_train, dataset_blend_test = stack_knn.run_stack()

	clf = LogisticRegression()
	clf.fit(dataset_blend_train, target)
	submission = clf.predict_proba(dataset_blend_test)[:,1]
	
 	submission = ["%f" % x for x in submission]
	now = datetime.datetime.now()
	csv_io.write_delimited_file_GUID("../Submissions/stack" + now.strftime("%Y%m%d%H%M") + ".csv", "PreProcessData/test_PatientGuid.csv", submission)	
	

	
	# attempt to score the training set to predict score for blend...
	probSum = 0.0
	trainPrediction = clf.predict_proba(dataset_blend_train)[:,1]
	for i in range(0, len(trainPrediction)):
		probX = trainPrediction[i]
		if ( probX > 0.999):
			probX = 0.999;		
		if ( probX < 0.001):
			probX = 0.001;

		probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
		 
	print "Train Score: ", (-probSum/len(trainPrediction))
	
	
	
	var = raw_input("Enter to terminate.")	
Example #25
def main():
    train = csv_io.read_data("{}/Data/train.csv".format(os.getcwd()), True)

    target = [float(x[0]) for x in train]

    # Remove the target from the training
    train = [x[1:] for x in train]

    # Remove the categoricals that I can't convert
    for x in train:
        del x[1]
        del x[1]
        del x[5]
        del x[6]
    cats = preprocess.enum_categ_data(train, "f", 10)
    preprocess.strf_to_floats(train, missing="average")

    #    test = csv_io.read_data("{}/Data/test.csv".format(os.getcwd()), True)
    #
    #    # Remove the categoricals that I can't convert
    #    for x in test:
    #        del x[1]
    #        del x[1]
    #        del x[5]
    #        del x[6]

    # I can't just run enum_categ_data on test data, need to match the right cat to the right index!!!

    #    cats = preprocess.enum_categ_data(test, 'f', 10)
    #    preprocess.strf_to_floats(test, missing='average')

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(train, target)

    print rf.score(train, target)
Example #26
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./hotness_features.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )

    hotness_col = target
    familaritycol = train[:, 30]

    mydict = {}
    familarity_classes = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    for fclass in familarity_classes:
        mydict[fclass]=[]

    print len(target)
    for i in range(0, len(target)):
        key = familaritycol[i]
        if (key < 0.2):
            keyclass = 0.0
        elif (key < 0.4):
            keyclass = 0.2
        elif (key < 0.6):
            keyclass = 0.4
        elif (key < 0.8):
            keyclass = 0.6
        else:
            keyclass = 0.8

        value = hotness_col[i]
        mydict[keyclass].append(value)

    print len(mydict[0.0]) + len(mydict[0.2]) + len(mydict[0.4]) + len(mydict[0.6]) + len(mydict[0.8])
    plt.hist(mydict[0.8], normed=True)
    plt.show()
Example #27
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )

    hotness_col = target
    print "Length of target", len(hotness_col)
    col = train[:, 5]

    mydict = {}
    mydict[0] = []
    mydict[1] = []
    for i in range(0, len(hotness_col)):
        key = hotness_col[i]
        val = col[i]
        mydict[key].append(val)
   
    mylabels = ['Not Hot','Hot']
    print len(mydict[0]), len(mydict[1])
    plt.hist([mydict[0], mydict[1]], label=mylabels, normed=True) 
    plt.xlabel('Artist Hotness')
    plt.ylabel('Frequency of songs(normalized)')
    plt.legend(loc='upper left')
    plt.show()
Example #28
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)
    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack_gb.run_stack()

    clf = LogisticRegression()
    clf.fit(dataset_blend_train, target)
    submission = clf.predict_proba(dataset_blend_test)[:, 1]

    submission = ["%f" % x for x in submission]
    now = datetime.datetime.now()
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack_" + now.strftime("%Y%m%d%H%M") + ".csv",
        "PreProcessData/test_PatientGuid.csv", submission)

    # attempt to score the training set to predict score for blend...
    probSum = 0.0
    trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
    for i in range(0, len(trainPrediction)):
        probX = trainPrediction[i]
        if (probX > 0.999):
            probX = 0.999
        if (probX < 0.001):
            probX = 0.001

        probSum += int(
            target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

    print "Train Score: ", (-probSum / len(trainPrediction))

    var = raw_input("Enter to terminate.")
Example #29
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./hotness_features.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )
    plt.plot(train[:,0], target,'ro')
    plt.xlim(-40,0)
    plt.show()
Example #30
def main():
    train=csv_io.read_data("Data/train.csv")
    train=train[0:10000]
    target=[x[0] for x in train]
    train=[x[1:] for x in train]
    realtest=csv_io.read_data("Data/test.csv")
    forest = RandomForestClassifier(n_estimators = 100)
    forest=forest.fit(train,target)
    predicted_probs=forest.predict_proba(realtest)
    fr=open('Result2.csv','w')
    fr.write("ImageId,Label"+"\n")
    count=1
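    # write the argmax class (the most probable label) for each test row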
    for y in predicted_probs:
        index, value = max(enumerate(y), key=operator.itemgetter(1))
        fr.write(str(count)+","+str(index)+"\n")
        count+=1
    fr.close()
Example #31
def main():
    train=csv_io.read_data("Data/train.csv")
    train=train[0:3000]
    target=[x[0] for x in train]
    train=[x[1:] for x in train]
    realtest=csv_io.read_data("Data/test.csv")
    lr=LogisticRegression()
    lr.fit(train,target)
    predicted_probs=lr.predict_proba(realtest)
    fr=open('Results.csv','w')
    fr.write("ImageId,Label"+"\n")
    count=1
    for y in predicted_probs:
        index, value = max(enumerate(y), key=operator.itemgetter(1))
        fr.write(str(count)+","+str(index)+"\n")
        count+=1
    fr.close()
Example #32
def main():
    #read in data, parse into training and target sets
    train = csv_io.read_data("./hotness_features_classes.csv")
    target = np.array( [x[0] for x in train] )
    train = np.array( [x[1:] for x in train] )
    train_scaled = preprocessing.scale(train)
    clf = tree.DecisionTreeClassifier(random_state = 0)
    scores = cross_validation.cross_val_score(clf, train_scaled, target,None, cv=10)
    print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()/2)
Example #33
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./hotness_features.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )

    mydict = {}

    yearcol = train[:, 27]
    durationcol = train[:, 2]

    mydict[1960]=[]
    mydict[1970]=[]
    mydict[1980]=[]
    mydict[1990]=[]
    mydict[2000]=[]
    mydict[2010]=[]
    decade2010 = 0

    for i in range(0, len(yearcol)):
        year = yearcol[i]
        if ((year >= 1960) and (year < 1970)):
            decade = 1960
        elif ((year >= 1970) and (year < 1980)):
            decade = 1970
        elif ((year >= 1980) and (year < 1990)):
            decade = 1980
        elif ((year >= 1990) and (year < 2000)):
            decade = 1990
        elif ((year >= 2000) and (year < 2010)):
            decade = 2000
        else:
            decade2010 = decade2010 + 1
            decade = 2010

        mydict[decade].append(durationcol[i])

    print decade2010
   
    data = [mydict[1960], mydict[1970], mydict[1980], mydict[1990], mydict[2000], mydict[2010]] 
    labels = ['1960', '1970','1980','1990','2000','2010']

    mean60 = np.mean(mydict[1960])
    mean70 = np.mean(mydict[1970])
    mean80 = np.mean(mydict[1980])
    mean90 = np.mean(mydict[1990])
    mean2000 = np.mean(mydict[2000])
    mean2010  = np.mean(mydict[2010])


    plt.hist(data,bins=10, normed=True, label=labels, histtype='bar', cumulative=True)
    plt.legend()
    plt.figure(2)
    plt.plot(labels, [mean60, mean70, mean80, mean90, mean2000, mean2010], "ro")


    plt.show()
Example #34
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes_musiconly.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )
    train_scaled = preprocessing.scale(train)
    clf  = SVC(kernel='rbf', C=1000, gamma=0.001)
    scores = cross_validation.cross_val_score(clf, train_scaled, target, metrics.classification_report, cv=10)
    print scores
Example #35
File: PreProcess2.py Project: mb16/Kaggle
def PreProcessRun(dataSet):
	print
	print "DataSet: ", dataSet
	
	print "Loading Data"
	data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess1.csv", split="\t" ,skipFirstLine = False)
	print dataSet, "Size: ", len(data[0])
	
	if ( os.path.exists("PreProcessData/" + dataSet + "_PreProcess2.csv") ):
		os.remove("PreProcessData/" + dataSet + "_PreProcess2.csv")
	
	SkipArr = [0,2,4,172]

	
	DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv", False)
	DataClassListNew = []
	
	firstTime = True
	for rowIndex, item in enumerate(data):
	
		rowNew = []
		#print item
		
		for index, val in enumerate(item):
			if dataSet == "training" and (index - 1) in SkipArr:
				continue
			elif dataSet == "test" and index in SkipArr:
				continue
			rowNew.append(val)
		
			#print val
			if dataSet == "test" and firstTime == True:
				print DataClassList[index]
				DataClassListNew.append(DataClassList[index])
				
		csv_io.write_delimited_file("PreProcessData/" + dataSet + "_PreProcess2.csv", [copy.deepcopy(rowNew)], filemode="a", delimiter="\t")

		firstTime = False

	if dataSet == "test":
		csv_io.write_delimited_file("PreProcessData/DataClassList2.csv", DataClassListNew)

	
	print "Done."		
Example #36
def PostProcess():

    lossThreshold = 4.0  # best seems to be about 4.0
    model = "Long-Lat KNN5"

    #used only for targets values.
    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_40.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess4_40.csv", False)
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    target = [x[0] for x in trainBase]

    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
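        # each prediction file name embeds its CV loss as the third underscore-separated field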
        if (filename[0:5] == "Stack" and float(parts[2]) < lossThreshold):

            stackFiles.append(filename)

    dataset_blend_train = np.zeros((len(trainBase), len(stackFiles)))
    dataset_blend_test = np.zeros((len(test), len(stackFiles)))

    print "Loading Data"
    for fileNum, file in enumerate(stackFiles):
        print file
        trn = csv_io.read_data("../predictions/Target_" + file,
                               split=",",
                               skipFirstLine=False)
        for row, datum in enumerate(trn):
            dataset_blend_train[row, fileNum] = datum[0]

        tst = csv_io.read_data("../predictions/" + file,
                               split=",",
                               skipFirstLine=False)
        for row, datum in enumerate(tst):
            dataset_blend_test[row, fileNum] = datum[0]

    np.savetxt('temp/dataset_blend_trainX.txt', dataset_blend_train)
    np.savetxt('temp/dataset_blend_testX.txt', dataset_blend_test)
    np.savetxt('temp/dataset_blend_trainY.txt', target)
    print "Num file processed: ", len(stackFiles), "Threshold: ", lossThreshold
Example #37
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )

    mydict = {}

    yearcol = train[:, 9]
    energycol = train[:, 13]

    mydict[1960]=[]
    mydict[1970]=[]
    mydict[1980]=[]
    mydict[1990]=[]
    mydict[2000]=[]
    mydict[2010]=[]

    for i in range(0, len(yearcol)):
        year = yearcol[i]
        if ((year >= 1960) and (year < 1970)):
            decade = 1960
        elif ((year >= 1970) and (year < 1980)):
            decade = 1970
        elif ((year >= 1980) and (year < 1990)):
            decade = 1980
        elif ((year >= 1990) and (year < 2000)):
            decade = 1990
        elif ((year >= 2000) and (year < 2010)):
            decade = 2000
        else:
            decade = 2010

        mydict[decade].append(energycol[i])

    mean60 = np.median(mydict[1960])
    mean70 = np.median(mydict[1970])
    mean80 = np.median(mydict[1980])
    mean90 = np.median(mydict[1990])
    mean2000 = np.median(mydict[2000])
    mean2010  = np.median(mydict[2010])
   
    print mean60, mean70, mean80, mean90, mean2000, mean2010
    print len(mydict[1960]),len(mydict[1970]), len(mydict[1980]), len(mydict[1990]), len(mydict[2000]), len(mydict[2010])
    
    data = [mydict[1960], mydict[1970], mydict[1980], mydict[1990], mydict[2000], mydict[2010]]
    labels = ['1960', '1970', '1980', '1990', '2000','2010']
    plt.hist(data, bins=20, normed=True, label=labels)
    
#    plt.hist(mydict[1960],bins=50, normed=True, label='1960',histtype='stepfilled', cumulative=True)
#    plt.hist(mydict[1980],bins=50, normed=True, label='1980', histtype='stepfilled', cumulative=True)
#    plt.hist(mydict[2010],bins=50 ,normed=True, label='2010', histtype='stepfilled', cumulative=True)
    plt.legend(loc='upper left')
    plt.figure(2)
    plt.plot(labels, [mean60, mean70, mean80, mean90, mean2000, mean2010])
    plt.show()
Example #38
File: stack.py Project: zyx061212/Kaggle
def main():

    et = csv_io.read_data("../Submissions/et_stack_avg_benchmark.csv", False)
    rbf = csv_io.read_data(
        "../Submissions/svm-rbf-bootstrap-stack_meanSpan_benchmark.csv", False)
    poly = csv_io.read_data(
        "../Submissions/svm-poly-bootstrap-stack_meanSpan_benchmark.csv",
        False)
    rf = csv_io.read_data("../Submissions/rf2_avg_benchmark.csv", False)
    gb = csv_io.read_data("../Submissions/gb_avg_benchmark.csv", False)

    stack = []
    stack.append(et)
    stack.append(rbf)
    stack.append(poly)
    stack.append(rf)
    stack.append(gb)

    spanDistance = 3
    finalList = []
    for p in range(0, len(stack[0])):
        temp_list = []
        for q in range(0, len(stack)):
            temp_list.append(stack[q][p][0])

        avg = sum(temp_list) / float(len(stack))

        if (avg < 0.5):
            finalList.append(0.2)
            #finalList.append(min(temp_list))
            print p, q, temp_list, avg, min(temp_list)
        else:
            finalList.append(0.80)
            #finalList.append(max(temp_list))
            print p, q, temp_list, avg, max(temp_list)

        #finalList.append( meanSpan(temp_list, spanDistance) )
        #print p, q, temp_list, meanSpan(temp_list, spanDistance)

    finalStack = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/stack.csv", finalStack)

    var = raw_input("Enter to terminate.")
Example #39
File: stack.py Project: mb16/Kaggle
def main():

    et = csv_io.read_data("../Submissions/et_stack_avg_benchmark.csv", False)
    rbf = csv_io.read_data("../Submissions/svm-rbf-bootstrap-stack_meanSpan_benchmark.csv", False)
    poly = csv_io.read_data("../Submissions/svm-poly-bootstrap-stack_meanSpan_benchmark.csv", False)
    rf = csv_io.read_data("../Submissions/rf2_avg_benchmark.csv", False)
    gb = csv_io.read_data("../Submissions/gb_avg_benchmark.csv", False)

    stack = []
    stack.append(et)
    stack.append(rbf)
    stack.append(poly)
    stack.append(rf)
    stack.append(gb)

    spanDistance = 3
    finalList = []
    for p in range(0, len(stack[0])):
        temp_list = []
        for q in range(0, len(stack)):
            temp_list.append(stack[q][p][0])

        avg = sum(temp_list) / float(len(stack))

        if (avg < 0.5):
            finalList.append(0.2)
            #finalList.append(min(temp_list))
            print p, q, temp_list, avg, min(temp_list)
        else:
            finalList.append(0.80)
            #finalList.append(max(temp_list))
            print p, q, temp_list, avg, max(temp_list)

        #finalList.append( meanSpan(temp_list, spanDistance) )
        #print p, q, temp_list, meanSpan(temp_list, spanDistance)

    finalStack = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/stack.csv", finalStack)

    var = raw_input("Enter to terminate.")
Example #40
def PreProcess3():
    filename = "stack201208301510"

    data = csv_io.read_data("../Submissions/" + filename + ".csv", False)
    data = SimpleScale(
        data, floor=0.05,
        ceiling=0.90)  # took a 0.389 score and lowered it to 0.40, not good...

    csv_io.write_delimited_file(
        "../Submissions/" + filename + "_SimpleScale.csv", data)
Example #41
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./hotness_features_classes.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )
    train_scaled = preprocessing.scale(train)
    clf = SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=10)
    selector = selector.fit(train_scaled, target)
    print selector.support_
Example #42
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )
    train_scaled = preprocessing.scale(train)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_scaled, target, test_size = 0.8)
    clf  = SVC(kernel='linear', C=0.005).fit(X_train, y_train)
    print clf.score(X_test, y_test)
Example #43
def main():
    #read in the training file
    train = csv_io.read_data("train.csv")
    #set the training responses
    target = [x[0] for x in train]
    #set the training features
    train = [x[1:] for x in train]
    #read in the test file
    realtest = csv_io.read_data("test.csv")

    # code for logistic regression
    lr = LogisticRegression()
    lr.fit(train, target)
    predicted_probs = lr.predict_proba(realtest)
    
    # write solutions to file
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("log_solution.csv", predicted_probs)
    
    print ('Logistic Regression Complete! Submit log_solution.csv to Kaggle')
Example #44
def main():
    #read in data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes_musiconly.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )
    train_scaled = preprocessing.scale(train)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_scaled, target, test_size = 0.8)
    clf  = SVC(kernel='rbf', C = 1000.0, gamma=0.001).fit(X_train, y_train)
    y_val_predict = clf.predict(X_test)
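    # metrics.zero_one_score comes from old scikit-learn; later releases renamed it accuracy_score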
    print metrics.zero_one_score(y_test, y_val_predict)
Example #45
def main():
    #read in the training file
    train = csv_io.read_data("train.csv")
    #set the training responses
    target = [x[0] for x in train]
    #set the training features
    train = [x[1:] for x in train]
    #read in the test file
    realtest = csv_io.read_data("test.csv")

    # code for logistic regression
    lr = LogisticRegression()
    lr.fit(train, target)
    predicted_probs = lr.predict_proba(realtest)
    
    # write solutions to file
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("log_solution.csv", predicted_probs)
    
    print ('Logistic Regression Complete! Submit log_solution.csv to Kaggle')
Example #46
File: PostProcess.py Project: mb16/Kaggle
def PostProcess():

	lossThreshold = 4.0  # best seems to be about 4.0
	model = "Long-Lat KNN5"

	#used only for targets values.
	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_40.csv", skipFirstLine = False, split = "\t")
	test = csv_io.read_data("PreProcessData/test_PreProcess4_40.csv", False)
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)

	target = [x[0] for x in trainBase]
	
	
	
	
	stackFiles = []
	for filename in os.listdir("../predictions"):
		parts = filename.split("_")
		if ( filename[0:5] == "Stack" and float(parts[2]) < lossThreshold):

			stackFiles.append(filename)
	
	
	dataset_blend_train = np.zeros((len(trainBase), len(stackFiles)))
	dataset_blend_test = np.zeros((len(test), len(stackFiles)))
	
	print "Loading Data"
	for fileNum, file in enumerate(stackFiles):
		print file
		trn = csv_io.read_data("../predictions/Target_" + file, split="," ,skipFirstLine = False)
		for row, datum in enumerate(trn):
			dataset_blend_train[row, fileNum] = datum[0]
		
		tst = csv_io.read_data("../predictions/" + file, split="," ,skipFirstLine = False)
		for row, datum in enumerate(tst):
			dataset_blend_test[row, fileNum] = datum[0]

	np.savetxt('temp/dataset_blend_trainX.txt', dataset_blend_train)
	np.savetxt('temp/dataset_blend_testX.txt', dataset_blend_test)
	np.savetxt('temp/dataset_blend_trainY.txt', target)
	print "Num file processed: ", len(stackFiles), "Threshold: ", lossThreshold
Example #47
def mp_worker(fn):

    data = csv_io.read_data(fn, 0, l=85)

    np.random.shuffle(data)  # shuffle rows in place; shuffling a temporary np.array(data) copy has no effect
    #print len(data[0])
    y = [x[len(x) - 1] for x in data]
    X = [x[0:len(x) - 1] for x in data]
    #print len(X), len(X[0])
    lrf = LBLRFImbalanced(fn, X, y)

    return
Example #48
def Analyze1():

    Threshold = 4.0
    targetFile = "Target_Stack_20121017110223_3.06649134025_GradientBoos.csv"

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False,
                                 split="\t")
    shutil.copy2("PreProcessData/test_PreProcess3.csv",
                 "PreProcessData/test_PreProcess8.csv")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList8.csv")
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    target = [x[0] for x in trainBase]

    print "Loading Data"
    trainNew = []

    probSum = 0.0
    weightSum = 0

    trn = csv_io.read_data("../predictions/" + targetFile,
                           split=",",
                           skipFirstLine=False)
    for row, datum in enumerate(trn):

        if (abs(datum[0] - target[row]) > Threshold):
            print datum[0], target[row]
            trainNew.append(trainBase[row])

            probSum += weights[row][0] * math.fabs(target[row] - datum[0])
            weightSum += weights[row][0]

    print "Train Score: ", (probSum / weightSum)
    print len(trainNew)
    csv_io.write_delimited_file("PreProcessData/training_PreProcess8" + ".csv",
                                trainNew,
                                delimiter="\t")
Example #49
def main():
    #read in the training file
    train = csv_io.read_data("train.csv")

    #set the training responses
    target = [x[0] for x in train]

    #set the training features
    train = [[x[1], x[3], x[4], x[5], x[6]] for x in train]

    #read in the test file
    realtest = csv_io.read_data("test.csv")

    # random forest code
    rf = RandomForestClassifier(n_estimators=10)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)
    # run model against test data
    predicted_probs = rf.predict_proba(realtest)

    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)
Example #50
File: logloss.py Project: zyx061212/Kaggle
def main():
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:]  # skip the headers
    probabilities = csv_io.read_data("../Submissions/svm_benchmark.csv")
    prob = [x[0] for x in probabilities]

    probSum = 0

    for i in range(0, len(prob)):
        #tempProb = max(prob[i], 0.000001)
        #tempProb = min(tempProb, 0.999999)
        #tempProb = max(prob[i], 0.1)
        #tempProb = min(tempProb, 0.9)
        print i, probSum, prob[i], target[i]
        print target[i] * log(prob[i]), (1 - target[i]) * log(1 - prob[i])
        probSum += target[i] * log(
            prob[i]) + (1 - target[i]) * log(1 - prob[i])

    print probSum
    print len(prob)
    print -probSum / len(prob)
    #result = (-1/len(probs))*mySum;

    var = raw_input("Enter to terminate.")
Example #51
def getBenchmark(test_label):
    tfile = open('MyBenchmark.csv', 'wb')
    test_info = csv_io.read_data('test_info.csv')

    try:
        twriter = csv.writer(tfile, delimiter=',')
        twriter.writerow(['Id', 'Prediction'])
        index = 0

        print np.array(test_info)
        print test_info[0][0], test_label[0]

        #print np.array(test_info)
        for id in test_info:
            #print 'converted line', index
            twriter.writerow([int(id[0]), test_label[index]])
            index += 1

    finally:
        tfile.close()
Example #52
def runKmeans(data_file):
    train_data = csv_io.read_data(data_file)
    print len(train_data)
    num_clusters = 10
    model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)

    max_score = 0
    iteration = 2
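    # note: range(1, iteration) with iteration = 2 runs the fitting loop exactly once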
    best_classification = []
    for i in range(1, iteration):
        print "Iteration number " + str(i)
        model.fit(train_data)
        score = model.score(train_data)

        if i == 1 or score > max_score:
            max_score = score
            best_classification = model.predict(train_data)

    print len(best_classification.tolist())
    return best_classification.tolist()
Example #53
def run_stack(SEED):

	model = "Long-Lat KNN5 - 50 Features"

	print "Running GB, RF, ET stack."

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)

	
	#random.seed(SEED)
	#random.shuffle(trainBase)
	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant. (note, predictions are less reliable when using 10).


	predicted_list = []
	bootstrapLists = []

	# use this for quick runs.
	# note RF with 150 crashes on 30 features
	# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	# GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	# RandomForestRegressor(n_estimators=100, n_jobs=1),
	#RandomForestRegressor(n_estimators=75, n_jobs=1),
	# clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1),
		# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False)
		# ]	
	#knn 5 at 3.45
	#knn 15 at 3.31
	#knn 25 at 3.30
	#knn 40 at 3.31
	# KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# LinearRegression at 3.77
	# Ridge at 3.77
	# SGD 4.23
	#Gauss at 13
	# LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
	# Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001),
	# SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False),
	# GaussianNB()
	# clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
		 # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)	
		# ]
		
	# GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ******************
	# clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166)
			# ]	
			
	# about 1 hour run time, and 3.10 score.		
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.05
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
	# about 4 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)	
	
	clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166)
			]		
	
	
		# use this for quick runs.
	# clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)]	
	
	
	
	# use this for quick runs (the commented regressors below drop to 50 estimators).
	# clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
        # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
        # tol=0.001, verbose=False)
			# ]	
			
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)
	
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)]
			
			
	# full algorithm stack.
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]
	

	
	print "Data size: ", len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Scaling"
	targetPre = [x[0] for x in trainBase]
	trainPre = [x[1:] for x in trainBase]
	testPre = [x[0:] for x in test]
	#print trainPre[0]
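	# preprocessing.Scaler was renamed StandardScaler in later scikit-learn releases.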
	scaler = preprocessing.Scaler().fit(trainPre)
	trainScaled = scaler.transform(trainPre)
	testScaled = scaler.transform(testPre)	

	#print scaler.mean_
	#print scaler.std_
	print "Begin Training"
	
	
	for ExecutionIndex, clf in enumerate(clfs):
		print str(clf)
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((len(test), NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True)
		for train_index, test_index in Folds:

			#trainBaseTemp = [trainBase[i] for i in train_index]
			#target = [x[0] for x in trainBaseTemp]
			#train = [x[1:] for x in trainBaseTemp]
	
			#testBaseTemp = [trainBase[i] for i in test_index]
			#targetTest = [x[0] for x in testBaseTemp]
			#trainTest = [x[1:] for x in testBaseTemp]
		
			#test = [x[0:] for x in test]
	
			target = [targetPre[i] for i in train_index]
			train = [trainScaled[i] for i in train_index]
			
			targetTest = [targetPre[i] for i in test_index]	
			trainTest = [trainScaled[i] for i in test_index]	
	
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(train), len(target)
			
			clf.fit(train, target)
			prob = clf.predict(trainTest) 
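			# prob holds this fold's out-of-fold predictions; the assignment below
			# stores them as this model's feature column for the stage-2 blender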
			
			dataset_blend_train[test_index, ExecutionIndex] = prob



	
			probSum = 0
			weightSum = 0
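			# metric below: weighted mean absolute error, sum(w * |y - pred|) / sum(w)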
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i]

				probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				weightSum += weights[test_index[i]][0] 
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			print "Score: ", probSum/weightSum
 
			avg += (probSum/weightSum)/NumFolds

			predicted_probs = clf.predict(testScaled) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
		
				
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single("../predictions_50/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions_40/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
		print now
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
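run_stack above is the classic out-of-fold stacking recipe: each base model contributes one training-set column built from its out-of-fold predictions, and one test-set column built by averaging its per-fold test predictions. A minimal sketch of the same pattern against the newer scikit-learn model_selection API (the function and variable names are illustrative assumptions, not from this repo):

import numpy as np
from sklearn.model_selection import KFold

def oof_stack(clfs, X, y, X_test, n_folds=5, seed=166):
    # X, y, X_test are numpy arrays; one blend column per base model,
    # mirroring dataset_blend_train / dataset_blend_test above
    blend_train = np.zeros((len(X), len(clfs)))
    blend_test = np.zeros((len(X_test), len(clfs)))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for j, clf in enumerate(clfs):
        fold_preds = np.zeros((len(X_test), n_folds))
        for i, (tr, te) in enumerate(kf.split(X)):
            clf.fit(X[tr], y[tr])
            blend_train[te, j] = clf.predict(X[te])  # out-of-fold predictions
            fold_preds[:, i] = clf.predict(X_test)   # per-fold test predictions
        blend_test[:, j] = fold_preds.mean(axis=1)   # average across folds
    return blend_train, blend_test

The fixed random_state keeps the fold assignment identical for every base model, which is what makes the stage-2 feature columns line up row for row.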
Example #54
0
def PreProcess3():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 split="\t",
                                 skipFirstLine=False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv",
                            split="\t",
                            skipFirstLine=False)
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    print "Train Size: ", len(trainBase[0]), "Test Size: ", len(test[0])

    shutil.copy2("PreProcessData/DataClassList2.csv",
                 "PreProcessData/DataClassList3.csv")

    lat = len(trainBase[0]) - 2
    long = len(trainBase[0]) - 1

    target = [x[0] for x in trainBase]
    train = [x[lat:long + 1] for x in trainBase]

    n_neighborsArr = [5]
    leaf_sizeArr = [30]
    for n_neighbor in n_neighborsArr:
        for leaf_s in leaf_sizeArr:

            print "Training neighbors: ", n_neighbor, "leaf_size: ", leaf_s

            neigh = KNeighborsRegressor(n_neighbors=n_neighbor,
                                        warn_on_equidistant=False,
                                        leaf_size=leaf_s,
                                        algorithm="ball_tree",
                                        weights=myFunc)
            neigh.fit(train, target)

            probSum = 0
            weightSum = 0

            for index, data in enumerate(trainBase):
                pred = neigh.predict([data[lat], data[long]])
                #print data[lat], data[long], "Prediction: ", pred[0], "Target: ", target[index]
                if (len(n_neighborsArr) == 1):
                    trainBase[index].append(pred[0])

                probSum += weights[index][0] * math.fabs(target[index] -
                                                         pred[0])
                weightSum += weights[index][0]

            print "Score: ", probSum / weightSum
            if (len(n_neighborsArr) > 1):
                continue

            for index, data in enumerate(test):
                pred = neigh.predict([data[lat - 1], data[long - 1]])
                #print data[lat - 1], data[long - 1], "Prediction: ", pred[0]
                if (len(n_neighborsArr) == 1):
                    test[index].append(pred[0])

    if (len(n_neighborsArr) > 1):
        return

    with open("PreProcessData/DataClassList3.csv", "a") as myfile:
        myfile.write("Lat-Long-Predictor\n")

    print "Writing Data"
    csv_io.write_delimited_file("PreProcessData/training_PreProcess3.csv",
                                trainBase,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess3.csv",
                                test,
                                delimiter="\t")
    print "Done."
Example #55
0
File: blend2.py Project: zyx061212/Kaggle
def run_stack(SEED):

    model = "Lasso"
    lossThreshold = 0.38

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBaseOrig['var11']
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    trainBaseID = trainBaseOrig['id']
    testID = testOrig['id']

    avg = 0
    NumFolds = 5

    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
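        # Stack files are named Stack_<timestamp>_<score>_<model>.csv, so parts[2] is the CV score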
        if (filename[0:5] == "Stack" and float(parts[2]) > lossThreshold):

            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, file in enumerate(stackFiles):
        print(file)
        trn = csv_io.read_data(
            "../predictions/Target_" + file, split=",",
            skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[1]  # column 0 is the id, column 1 the prediction

        tst = csv_io.read_data(
            "../predictions/" + file, split=",",
            skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles)) + " " +
          "Threshold: " + str(lossThreshold))

    print("Starting Scale")

    allVals = np.vstack((trainBase, test))

    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)  # should fit on the combined sets.

    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    clfs = [
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),
        Lasso(alpha=0.000016681005372000593),
        #Ridge(),
        #LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for train_index, test_index in Folds:

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            #print "LEN: ", len(train), len(target)

            target = np.array(np.reshape(target, (-1, 1)))
            #train = np.array(np.reshape(train, (-1, 1))  )
            weight = np.array(np.reshape(weight, (-1, 1)))

            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)) )
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight = weight
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            #print(predicted[:,0])
            print(predicted)
            dataset_blend_train[
                test_index,
                ExecutionIndex] = predicted  #[:,0] #needed for Ridge

            #print(targetTest.shape)
            #print(predicted.shape)
            #print(weightTest.shape)

            print(
                str(
                    score.normalized_weighted_gini(targetTest.ravel(),
                                                   predicted.ravel(),
                                                   weightTest.ravel())))
            avg += score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(),
                weightTest.ravel()) / NumFolds
            #print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())))
            #avg += score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())/NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted  #[:,0]

            foldCount = foldCount + 1

            #break

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))

        submission = pd.DataFrame(np.zeros((len(testID), 2)),
                                  columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:, ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../submission/Blend_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + ".csv",
                          index=False)

        #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)),
                                  columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:, ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../submission/Target_Blend_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + ".csv",
                          index=False)

        csv_io.write_delimited_file("../log/RunLogBlend.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles
        ],
                                    filemode="a",
                                    delimiter=",")

        print("------------------------Average: " + str(avg))

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
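The score.normalized_weighted_gini module used above is not shown. A common formulation of the normalized weighted Gini (an assumption about that module, not its actual source) sorts by prediction and compares the weighted capture curve against the random baseline:

import numpy as np

def weighted_gini(actual, pred, weight):
    actual, pred, weight = (np.asarray(v, dtype=float) for v in (actual, pred, weight))
    order = np.argsort(pred)[::-1]                  # sort by prediction, descending
    a, w = actual[order], weight[order]
    random = np.cumsum(w) / w.sum()                 # baseline: cumulative weight share
    lorentz = np.cumsum(a * w) / (a * w).sum()      # cumulative captured actuals
    return np.sum((lorentz - random) * w) / w.sum()

def normalized_weighted_gini(actual, pred, weight):
    # dividing by the Gini of a perfect ordering pins an ideal ranking at 1.0
    return weighted_gini(actual, pred, weight) / weighted_gini(actual, actual, weight)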
Example #56
0
def main():

    trainBase = csv_io.read_data("PreProcessData/PreProcess2.csv", False)

    avg = 0
    NumFolds = 5  # should be odd for median

    predicted_list = []

    spanDistance = 12
    bootstrapLists = []

    NeighborsArray = [10]
    for Neighbors in NeighborsArray:

        predicted_list = []

        Folds = cross_validation.KFold(len(trainBase) - 1,
                                       k=NumFolds,
                                       indices=True,
                                       shuffle=False,
                                       random_state=None)
        for train_index, test_index in Folds:

            trainBaseTemp = [trainBase[i + 1] for i in train_index]
            #trainBaseTemp = trainBase
            target = [x[0] for x in trainBaseTemp]
            train = [x[1:] for x in trainBaseTemp]

            testBaseTemp = [trainBase[i + 1] for i in test_index]
            #testBaseTemp = trainBase
            targetTest = [x[0] for x in testBaseTemp]
            trainTest = [x[1:] for x in testBaseTemp]

            test = csv_io.read_data("PreProcessData/PreTestData2.csv", False)
            test = [x[0:] for x in test]

            kn = neighbors.KNeighborsClassifier(n_neighbors=Neighbors,
                                                weights='distance',
                                                algorithm='brute',
                                                leaf_size=100,
                                                warn_on_equidistant=True,
                                                p=2)

            kn.fit(train, target)
            prob = kn.predict_proba(trainTest)

            prob = SimpleScale(prob)  # scale the output probabilities

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i][1]  # [1]
                if (probX > 0.999):
                    probX = 0.999
                if (probX < 0.001):
                    probX = 0.001
                #print i, probSum, probX, targetTest[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += int(targetTest[i]) * log(probX) + (
                    1 - int(targetTest[i])) * log(1 - probX)
                if (math.fabs(probX - int(targetTest[i])) > 0.5):
                    totalOffByHalf = totalOffByHalf + 1

                if (int(targetTest[i]) == 1):
                    totalPositive = totalPositive + 1
                if (int(targetTest[i]) == 1 and probX < 0.5):
                    totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                if (probX > 0.5):
                    totalPositivePredictions = totalPositivePredictions + 1

            print "Total Off By > 0.5 ", totalOffByHalf
            print "Total Positive ", totalPositive
            print "Total Positive Off By Half ", totalPositiveOffByHalf
            print "Total Positive Predictions ", totalPositivePredictions
            print "Neighbors: ", Neighbors
            print -probSum / len(prob)

            avg += (-probSum / len(prob)) / NumFolds

            predicted_probs = kn.predict_proba(test)

            predicted_probs = SimpleScale(predicted_probs)  # scale the output probabilities

            predicted_list.append([x[1] for x in predicted_probs])

        avg_list = []
        med_list = []

        # For N folds, get the average/median for each prediction item in test set.
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(predicted_list)):
                temp_list.append(predicted_list[q][p])

            avg_list.append(mean(temp_list))
            med_list.append(getMedian(temp_list))

            #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

        bootstrapLists.append(avg_list)

    # This would be used if we ran multiple runs with different training values.
    # Primitive stacking, should rather save data, and do formal stacking.
    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])

            finalList.append(meanSpan(temp_list, spanDistance))

            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/rf2_stack_avg.csv", avg_values)

    print "Average: ", avg

    var = raw_input("Enter to terminate.")
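The avg_list/med_list loops above compute a per-row mean and median across folds by hand; the same aggregation is a few lines of numpy (a sketch, with predicted_list shaped one fold per row as in the example):

import numpy as np

def fold_aggregate(predicted_list):
    fold_preds = np.asarray(predicted_list)     # shape: (NumFolds, len(test))
    return fold_preds.mean(axis=0), np.median(fold_preds, axis=0)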
Example #57
0
def PreProcess4(N_Features):

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv",
                            skipFirstLine=False,
                            split="\t")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList4.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv",
                                     False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = N_Features
    # NOTE: going from 30 to 20 features on the KNN5 set has almost no effect; down to 15 is a significant loss.
    # For GBM at depth 6 and 400 estimators, 30 features scores 3.01 and 20 scores 3.05.

    print "Scaling"
    term = 5000  #  scaler has memory errors between 5000 and 10000
    #term = len(trainBase)
    trainPre = [x[1:] for x in trainBase][0:term]
    #testPre = [x[0:] for x in test][0:term]
    targetPre = target[0:term]
    #print trainPre[term - 1]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True)
    clf = GradientBoostingRegressor(loss='ls',
                                    learn_rate=0.05,
                                    subsample=0.5,
                                    max_depth=6,
                                    n_estimators=400,
                                    random_state=166,
                                    min_samples_leaf=30)

    print "Training"

    clf.fit(trainScaled, targetPre)

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print DataClass[0], importances[DataIndex]
        DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_" + str(NumFeatures) +
        ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew,
                                   key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_sorted_" + str(NumFeatures) +
        ".csv", DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]

        print "Importance threshold: ", threshold

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess4_" +
                                str(NumFeatures) + ".csv",
                                trainNew,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess4_" +
                                str(NumFeatures) + ".csv",
                                testNew,
                                delimiter="\t")
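PreProcess4 rebuilds every row by hand to keep only the columns whose importance clears the threshold. The same selection is shorter in vectorized form (a sketch; assumes the feature matrix is a numpy array and importances comes from the fitted model, e.g. clf.feature_importances_):

import numpy as np

def select_top_features(X, importances, num_features):
    # indices of the num_features most important columns, original order preserved
    keep = np.sort(np.argsort(importances)[::-1][:num_features])
    return np.asarray(X)[:, keep]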
Example #58
0
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)

    SEED = 448
    random.seed(SEED)
    random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

    clfs = [
        LogisticRegression(penalty='l2',
                           dual=False,
                           tol=0.0001,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l2',
                           dual=False,
                           tol=0.0001,
                           C=0.5,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l2',
                           dual=False,
                           tol=0.0001,
                           C=0.1,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l1',
                           dual=False,
                           tol=0.0001,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l1',
                           dual=False,
                           tol=0.0001,
                           C=0.5,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l1',
                           dual=False,
                           tol=0.0001,
                           C=0.1,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None)
    ]

    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
    dataset_blend_test_j = np.zeros((len(test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):
        #clf = LogisticRegression()
        clf.fit(dataset_blend_train, target)
        submission = clf.predict_proba(dataset_blend_test)[:, 1]

        submission = ["%f" % x for x in submission]
        now = datetime.datetime.now()
        csv_io.write_delimited_file_GUID(
            "../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv",
            "PreProcessData/test_PatientGuid.csv", submission)

        # attempt to score the training set to predict score for blend...
        probSum = 0.0
        trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]
            if (probX > 0.999):
                probX = 0.999
            if (probX < 0.001):
                probX = 0.001

            probSum += int(
                target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

        print "Train Score: ", (-probSum / len(trainPrediction))

        dataset_blend_test_j[:, ExecutionIndex] = submission

    csv_io.write_delimited_file_GUID_numpy(
        "../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))
    var = raw_input("Enter to terminate.")
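The hand-rolled scoring loop above is a clipped binary log loss. As a self-contained sketch (eps mirrors the 0.001 clipping in the loop):

import numpy as np

def clipped_log_loss(y_true, y_prob, eps=0.001):
    p = np.clip(np.asarray(y_prob, dtype=float), eps, 1.0 - eps)
    y = np.asarray(y_true, dtype=float)
    return -np.mean(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))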