def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 50000  #len(training_data)
    #random.shuffle(training_data)
    clf = []
    y_pred_list = []
    M = 3  # number of classifiers, one per feature-vector mode
    name_list = ['Original', 'Original+Condensed', 'Condensed']
    # mode 0: normal, 1: normal+condensed, 2: only condensed
    for i in range(M):
        rows, CFA_list, testing_rows, test_CFA = process_data(training_data, \
            testing_data, testing_result_data, NumOfLineToTrain, False, i)
        print 'Training rows:', len(rows), 'Testing rows:', len(testing_rows), \
            '# of features:', len(rows[0])
        ##############################################################
        #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
        clf.append(KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10, p=2))
        #clf.append(RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=False))
        #clf.append(svm.LinearSVC(verbose=False, C=4.0))
        #clf = tree.DecisionTreeClassifier()
        #clf.append(GaussianNB())
        #clf = MultinomialNB(alpha=1.0)
        #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))
        #clf.append(linear_model.SGDClassifier(n_jobs=-1, n_iter=1000))
        #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))
        #############################################################
        # Train and do prediction for each feature-vector mode
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapsed: ", end - start, " sec"
    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        print "first 30 items of prediction: ", [int(round(float(p))) for p in y_pred_list[i][:30]]
    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return
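# rmse(pred, gt) is called throughout these scripts but its definition is not
# part of this listing. A minimal sketch, assuming it is the standard
# root-mean-square error over two equal-length sequences (the project's own
# helper may differ in signature or casting):
import math

def rmse(predictions, targets):
    # square root of the mean squared difference between prediction and ground truth
    return math.sqrt(sum((float(p) - float(t)) ** 2
                         for p, t in zip(predictions, targets)) / len(predictions))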
def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    student_result, overall_result, student_kc = training(training_data)
    predict_result = predict(student_result, overall_result, student_kc, testing_data)
    predict_error = rmse(predict_result, [float(i[13]) for i in testing_result_data[1:]])
    predict_result = predict(student_result, overall_result, student_kc, training_data)
    training_error = rmse(predict_result, [float(i[13]) for i in training_data[1:]])
    print '|', dataset, '|', training_error, '|', predict_error, '|'
    return
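# The training()/predict() pair used above is not part of this listing. Judging
# from the returned names, it looks like a correct-first-attempt (CFA) rate
# baseline. The sketch below is hypothetical: it takes already-extracted
# (student, kc) pairs rather than raw rows, and backs off from the
# per-student/per-KC rate to the per-student rate and finally to the overall rate.
def predict_cfa_baseline(student_result, overall_result, student_kc, pairs):
    # student_result: {student: mean CFA}; student_kc: {(student, kc): mean CFA}
    # overall_result: global mean CFA; pairs: iterable of (student, kc)
    predictions = []
    for student, kc in pairs:
        if (student, kc) in student_kc:
            predictions.append(student_kc[(student, kc)])
        elif student in student_result:
            predictions.append(student_result[student])
        else:
            predictions.append(overall_result)
    return predictions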
def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    # Shuffle the training data (note: random.shuffle works in place and returns None)
    #training_data = random.shuffle(training_data)
    learnrate = 0.01
    regular = 0.02
    numofstep = 30
    matrix, students, problems, testing_sample = training(training_data, learnrate, regular, numofstep)
    predict_result = predict_from_matrix(matrix, students, problems, \
        [(data[0].upper(), data[1].upper()) for data in testing_sample])
    training_error = rmse(predict_result, [float(i[2]) for i in testing_sample])
    predict_test_result = predict_from_matrix(matrix, students, problems, \
        [(data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]])
    predict_error = rmse(predict_test_result, [float(i[13]) for i in testing_result_data[1:]])
    print "first 50 items of prediction before rounding: ", [float(i) for i in predict_test_result[:50]]
    print "first 50 items of prediction: ", [int(round(float(i))) for i in predict_test_result[:50]]
    print "first 50 items of test GT: ", [int(i[13]) for i in testing_result_data[1:51]]
    print '|', dataset, '|', training_error, '|', predict_error, '|'
    plotroc([float(i[2]) for i in testing_sample], predict_result, \
        [float(i[13]) for i in testing_result_data[1:]], predict_test_result)
    return
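# training() and predict_from_matrix() implement the collaborative-filtering
# model but are not included here. A minimal sketch of the usual SGD update for
# latent-factor matrix factorization, reusing the hyperparameter names above
# (learnrate, regular, numofstep); the project's actual factorization and its
# (student, step) indexing may differ.
import numpy

def factorize(ratings, num_factors=10, learnrate=0.01, regular=0.02, numofstep=30):
    # ratings: list of (student_index, step_index, cfa_value) triples
    num_rows = max(r for r, _, _ in ratings) + 1
    num_cols = max(c for _, c, _ in ratings) + 1
    P = numpy.random.rand(num_rows, num_factors) * 0.1  # student latent factors
    Q = numpy.random.rand(num_cols, num_factors) * 0.1  # step latent factors
    for _ in range(numofstep):
        for r, c, v in ratings:
            err = v - numpy.dot(P[r], Q[c])
            p_old = P[r].copy()
            # gradient step with L2 regularization on both factor vectors
            P[r] += learnrate * (err * Q[c] - regular * P[r])
            Q[c] += learnrate * (err * p_old - regular * Q[c])
    return numpy.dot(P, Q.T)  # entry [r, c] approximates the CFA for (student r, step c)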
def main(arg):
    #pdb.set_trace()
    numpy.seterr(all='raise')
    dataset = arg[1]  #'algebra_2005_2006'

    start = time.time()
    training_data, testing_data, testing_result_data = load_data(dataset)
    end = time.time()
    print "Time to load data", end - start, " sec"

    start = time.time()
    rows, CFA_list, testing_rows, test_CFA = process_data(training_data, \
        testing_data, testing_result_data, 100000, False, 2)
    end = time.time()
    print "Time to process data", end - start, " sec"
    print 'Training rows:', len(rows), 'Testing rows:', len(testing_rows), \
        '# of features:', len(rows[0])
    #print rows[:200]
    #print testing_rows[:200]

    del training_data, testing_data
    gc.collect()
    process = psutil.Process(os.getpid())
    print "RAM usage (MB):", process.memory_info().rss / 1024 / 1024

    write_file("preprocessed_train.txt", rows)
    write_file("preprocessed_test.txt", testing_rows)

    start = time.time()
    ##############################################################
    #clf = linear_model.SGDClassifier(n_jobs=-1, n_iter=1000)
    #clf = linear_model.LogisticRegressionCV(n_jobs=-1, verbose=True)
    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10, p=2)
    #clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=True)
    #clf = svm.LinearSVC(verbose=True, C=1.0)
    #clf = svm.SVC(verbose=True, cache_size=5000, C=1.0)
    #clf = tree.DecisionTreeClassifier()
    #clf = GaussianNB()
    #clf = MultinomialNB(alpha=1.0)
    #clf = BernoulliNB(alpha=2.0, binarize=1.0)
    #############################################################
    clf.fit(rows, CFA_list)
    print clf
    #print clf.feature_importances_
    end = time.time()
    print "Time to train classifier", end - start, " sec"
    process = psutil.Process(os.getpid())
    print "RAM usage (MB):", process.memory_info().rss / 1024 / 1024

    start = time.time()
    predict_result = clf.predict(rows[:1500])
    end = time.time()
    print "Time to do prediction of 1.5k self-test", end - start, " sec"
    #print "Mean accuracy", clf.score(rows, CFA_list)
    print "first 30 items of predict: ", [int(i) for i in predict_result[:30]]
    print "first 30 items of GT: ", [int(i) for i in CFA_list[:30]]
    predict_result = [float(i) for i in predict_result]
    #training_error = rmse(predict_result, [float(i) for i in CFA_list[:1500]])
    Classifier_Eval(CFA_list[:1500], predict_result, True)
    print "rmse of first 50 items ", rmse([float(i) for i in predict_result[:50]], [float(i) for i in CFA_list[:50]])
    print "rmse of first 150 items ", rmse([float(i) for i in predict_result[:150]], [float(i) for i in CFA_list[:150]])
    print "rmse of first 500 items ", rmse([float(i) for i in predict_result[:500]], [float(i) for i in CFA_list[:500]])
    print "rmse of first 1500 items ", rmse([float(i) for i in predict_result[:1500]], [float(i) for i in CFA_list[:1500]])
    #print "rmse of first 5000 items ", rmse([float(i) for i in predict_result[:5000]], [float(i) for i in CFA_list[:5000]])
    #print "rmse of first 15000 items ", rmse([float(i) for i in predict_result[:15000]], [float(i) for i in CFA_list[:15000]])
    #print "rmse of first 45000 items ", rmse([float(i) for i in predict_result[:45000]], [float(i) for i in CFA_list[:45000]])

    start = time.time()
    predict_test_result = clf.predict(testing_rows)
    end = time.time()
    print "Time to do prediction of testing rows", end - start, " sec"
    print "first 30 items of test predict: ", [int(i) for i in predict_test_result[:30]]
    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    predict_test_result = [float(i) for i in predict_test_result]
    Classifier_Eval(test_CFA, predict_test_result, False)
    #predict_error = rmse(predict_test_result, [float(i) for i in test_CFA])
    #print '|', dataset, '|', training_error, '|', predict_error, '|'
    plotroc(CFA_list[:1500], predict_result, test_CFA, predict_test_result)
    return
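# Classifier_Eval(ground_truth, predictions, is_training) is another project
# helper that is not included here. A plausible minimal sketch using standard
# scikit-learn metrics; the real helper's output format is unknown, and the
# boolean flag is assumed to only label the printout.
from sklearn.metrics import accuracy_score, confusion_matrix

def Classifier_Eval(ground_truth, predictions, is_training):
    y_true = [int(round(float(v))) for v in ground_truth]
    y_pred = [int(round(float(v))) for v in predictions]
    label = "training (self-test)" if is_training else "testing"
    print "Evaluation on", label, "set"
    print "accuracy:", accuracy_score(y_true, y_pred)
    print "confusion matrix:"
    print confusion_matrix(y_true, y_pred)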
def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 300000  #len(training_data)
    # mode 0: normal, 1: normal+condensed, 2: only condensed
    Feature_vector_mode = 0
    rows, CFA_list, testing_rows, test_CFA = process_data(training_data, \
        testing_data, testing_result_data, NumOfLineToTrain, False, Feature_vector_mode)
    print 'Training rows:', len(rows), 'Testing rows:', len(testing_rows), \
        '# of features:', len(rows[0])
    clf = []
    y_pred_list = []
    M = 3  # number of scikit-learn classifiers; collaborative filtering is added below
    name_list = ['KNN', 'RandomForest', 'LinearSVM', \
        'Collaborative filtering']
    #y_pred_list.append([random.randint(0, 1) for i in testing_rows])
    ##############################################################
    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf.append(KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10, p=2))
    clf.append(RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=False))
    clf.append(svm.LinearSVC(verbose=False, C=4.0))
    #clf = tree.DecisionTreeClassifier()
    #clf.append(GaussianNB())
    #clf = MultinomialNB(alpha=1.0)
    #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))
    #clf.append(linear_model.SGDClassifier(n_jobs=-1, n_iter=1000))
    #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))
    #############################################################
    # Train and do prediction for each scikit-learn classifier
    for i in range(M):
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapsed: ", end - start, " sec"
    # Collaborative filtering via matrix factorization
    start = time.time()
    learnrate = 0.01
    regular = 0.02
    numofstep = 100
    matrix, students, problems, testing_sample = training(training_data[:NumOfLineToTrain], learnrate, regular, numofstep)
    y_pred_list.append(predict_from_matrix(matrix, students, problems, \
        [(data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]]))
    end = time.time()
    print "Time elapsed: ", end - start, " sec"
    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        print "first 30 items of prediction: ", [int(round(float(p))) for p in y_pred_list[i][:30]]
    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return
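# plotrocmany(test_CFA, y_pred_list, name_list) is referenced but not defined in
# this listing. A minimal sketch, assuming it overlays one ROC curve per
# classifier on a single matplotlib figure (the project's plot styling may differ):
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plotrocmany(ground_truth, prediction_list, name_list):
    y_true = [int(round(float(v))) for v in ground_truth]
    for preds, name in zip(prediction_list, name_list):
        fpr, tpr, _ = roc_curve(y_true, [float(p) for p in preds])
        plt.plot(fpr, tpr, label='%s (AUC = %.3f)' % (name, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()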