Example No. 1
def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 50000  #len(training_data)

    #random.shuffle(training_data)

    clf = []
    y_pred_list = []
    M = 3  # number of classifiers (one per feature-vector mode)
    name_list = ['Original', 'Original+Condensed', 'Condensed']

    # mode0: normal, 1: normal+condensed, 2: only condensed
    for i in range(M):
        rows, CFA_list, testing_rows, test_CFA = process_data(training_data,\
         testing_data, testing_result_data, NumOfLineToTrain, False, i)
        print 'Training rows:', len(rows), 'Testing rows:', len(testing_rows), \
            '# of features:', len(rows[0])

        ##############################################################

        #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
        clf.append(
            KNeighborsClassifier(n_jobs=-1,
                                 weights='distance',
                                 n_neighbors=10,
                                 p=2))
        #clf.append(RandomForestClassifier(n_estimators=100,n_jobs=-1, verbose=False))
        #clf.append(svm.LinearSVC(verbose=False, C=4.0))
        #clf = tree.DecisionTreeClassifier()

        #clf.append(GaussianNB())
        #clf = MultinomialNB(alpha=1.0)
        #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))

        #clf.append(linear_model.SGDClassifier(n_jobs=-1,n_iter=1000))
        #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))

        #############################################################

        #Train and do prediction for each method
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapse: ", end - start, " sec"

    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        print "first 30 items of prediction: ", [
            int(round(float(i))) for i in y_pred_list[i][:30]
        ]

    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return
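All five examples call an rmse() helper that is not defined on this page. A minimal sketch of what such a helper could look like, assuming it simply takes two equal-length sequences of numeric values:

import math

def rmse(predictions, targets):
    # Root-mean-square error between two equal-length numeric sequences (assumed helper).
    diffs = [(float(p) - float(t)) ** 2 for p, t in zip(predictions, targets)]
    return math.sqrt(sum(diffs) / len(diffs))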
Example No. 2
def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)

    student_result, overall_result, student_kc = training(training_data)

    predict_result = predict(student_result, overall_result, student_kc,
                             testing_data)
    predict_error = rmse(predict_result,
                         [float(i[13]) for i in testing_result_data[1:]])

    predict_result = predict(student_result, overall_result, student_kc,
                             training_data)
    training_error = rmse(predict_result,
                          [float(i[13]) for i in training_data[1:]])

    print '|', dataset, '|', training_error, '|', predict_error, '|'
    return
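The training()/predict() pair used above is not shown either. Judging only from the returned names (student_result, overall_result, student_kc) and the Correct First Attempt column indexed as i[13], one plausible reading is a per-student CFA rate with an overall fallback; the sketch below illustrates that idea only, under those assumptions, and both function names are hypothetical.

def cfa_rate_baseline(rows):
    # Assumed sketch: mean Correct First Attempt (CFA) per student, plus the overall mean.
    # Column layout follows the usage visible above: student id in column 1, CFA in column 13.
    totals, counts = {}, {}
    overall_sum, overall_n = 0.0, 0
    for row in rows[1:]:  # skip the header row
        student, cfa = row[1], float(row[13])
        totals[student] = totals.get(student, 0.0) + cfa
        counts[student] = counts.get(student, 0) + 1
        overall_sum += cfa
        overall_n += 1
    rates = dict((s, totals[s] / counts[s]) for s in totals)
    return rates, (overall_sum / overall_n if overall_n else 0.5)

def predict_cfa(rates, overall, rows):
    # Predict each row's CFA as that student's training average, or the overall mean if unseen.
    return [rates.get(row[1], overall) for row in rows[1:]]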
Example No. 3
def main(arg):
    dataset = arg[1] #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)

    #shuffle the training data
    #training_data = random.shuffle(training_data)

    learnrate = 0.01
    regular = 0.02
    numofstep = 30
    matrix, students, problems, testing_sample = training(training_data, learnrate, regular, numofstep)
    predict_result = predict_from_matrix(matrix, students, problems,
        [(data[0].upper(), data[1].upper()) for data in testing_sample])
    training_error = rmse(predict_result, [float(i[2]) for i in testing_sample])

    predict_test_result = predict_from_matrix(matrix, students, problems,
        [(data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]])
    predict_error = rmse(predict_test_result,
        [float(i[13]) for i in testing_result_data[1:]])

    print "first 50 items of prediction before rounding: ",[float(i) for i in predict_test_result[:50]]
    print "first 50 items of prediction: ",[int(round(float(i))) for i in predict_test_result[:50]]
    print "first 50 items of test GT: ", [int(i[13]) for i in testing_result_data[1:50]]
    print '|', dataset, '|', training_error, '|', predict_error ,'|'
    plotroc([float(i[2]) for i in testing_sample], predict_result,\
     [float(i[13]) for i in testing_result_data[1:]], predict_test_result)
    return
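training() and predict_from_matrix() are again external to this page, but the learnrate/regular/numofstep parameters match the standard regularized matrix-factorization SGD update. The sketch below shows that textbook update on generic (student, step, value) triples; the factorize name and the num_factors parameter are assumptions, not the project's API.

import random

def factorize(triples, num_factors=10, learnrate=0.01, regular=0.02, numofstep=30):
    # Minimize (value - p_student . q_step)^2 + regular * (|p|^2 + |q|^2) by SGD,
    # taking numofstep passes over the (student, step, value) triples.
    P, Q = {}, {}
    for student, step, _ in triples:
        P.setdefault(student, [random.uniform(-0.1, 0.1) for _ in range(num_factors)])
        Q.setdefault(step, [random.uniform(-0.1, 0.1) for _ in range(num_factors)])
    for _ in range(numofstep):
        for student, step, value in triples:
            err = value - sum(p * q for p, q in zip(P[student], Q[step]))
            for k in range(num_factors):
                p, q = P[student][k], Q[step][k]
                P[student][k] += learnrate * (err * q - regular * p)
                Q[step][k] += learnrate * (err * p - regular * q)
    return P, Q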
Example No. 4
def main(arg):
    #pdb.set_trace()
    numpy.seterr(all='raise')
    dataset = arg[1]  #'algebra_2005_2006'
    start = time.time()
    training_data, testing_data, testing_result_data = load_data(dataset)
    end = time.time()
    print "Time to load data", end - start, " sec"

    start = time.time()
    rows, CFA_list, testing_rows, test_CFA = process_data(training_data,\
     testing_data, testing_result_data, 100000, False, 2)
    end = time.time()
    print "Time to process data", end - start, " sec"
    print 'Training rows:', len(rows), 'Testing rows:', len(testing_rows), \
        '# of features:', len(rows[0])

    #print rows[:200]
    #print testing_rows[:200]

    del training_data, testing_data
    gc.collect()
    process = psutil.Process(os.getpid())
    print "RAM usage (MB):", process.memory_info().rss / 1024 / 1024

    write_file("preprocessed_train.txt", rows)
    write_file("preprocessed_test.txt", testing_rows)

    start = time.time()

    ##############################################################

    #clf = linear_model.SGDClassifier(n_jobs=-1,n_iter=1000)
    #clf = linear_model.LogisticRegressionCV(n_jobs=-1, verbose=True)

    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf = KNeighborsClassifier(n_jobs=-1,
                               weights='distance',
                               n_neighbors=10,
                               p=2)

    #clf = RandomForestClassifier(n_estimators=100,n_jobs=-1, verbose=True)
    #clf = svm.LinearSVC(verbose=True,  C=1.0)
    #clf = svm.SVC(verbose=True, cache_size=5000, C=1.0)
    #clf = tree.DecisionTreeClassifier()

    #clf = GaussianNB()
    #clf = MultinomialNB(alpha=1.0)
    #clf = BernoulliNB(alpha=2.0, binarize=1.0)

    #############################################################

    clf.fit(rows, CFA_list)
    print clf
    #print clf.feature_importances_

    end = time.time()
    print "Time to train classifier", end - start, " sec"

    process = psutil.Process(os.getpid())
    print "RAM usage (MB):", process.memory_info().rss / 1024 / 1024

    start = time.time()
    predict_result = clf.predict(rows[:1500])
    end = time.time()
    print "Time to do prediction of 1.5k self-test", end - start, " sec"

    #print "Mean accuracy" , clf.score(rows, CFA_list)
    print "first 30 items of predict: ", [int(i) for i in predict_result[:30]]
    print "first 30 items of GT: ", [int(i) for i in CFA_list[:30]]
    predict_result = [float(i) for i in predict_result]
    #training_error = rmse(predict_result, [ float(i) for i in CFA_list[:1500]])
    Classifier_Eval(CFA_list[:1500], predict_result, True)

    print "rmse of first 50 items ", rmse(
        [float(i) for i in predict_result[:50]],
        [float(i) for i in CFA_list[:50]])
    print "rmse of first 150 items ", rmse(
        [float(i) for i in predict_result[:150]],
        [float(i) for i in CFA_list[:150]])
    print "rmse of first 500 items ", rmse(
        [float(i) for i in predict_result[:500]],
        [float(i) for i in CFA_list[:500]])
    print "rmse of first 1500 items ", rmse(
        [float(i) for i in predict_result[:1500]],
        [float(i) for i in CFA_list[:1500]])
    #print "rmse of first 5000 items ", rmse([ float(i) for i in predict_result[:5000]], [ float(i) for i in CFA_list[:5000]])
    #print "rmse of first 15000 items ", rmse([ float(i) for i in predict_result[:15000]], [ float(i) for i in CFA_list[:15000]])
    #print "rmse of first 45000 items ", rmse([ float(i) for i in predict_result[:45000]], [ float(i) for i in CFA_list[:45000]])

    start = time.time()
    predict_test_result = clf.predict(testing_rows)
    end = time.time()
    print "Time to do prediction of testing rows", end - start, " sec"

    print "first 30 items of test predict: ", [
        int(i) for i in predict_test_result[:30]
    ]
    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]

    predict_test_result = [float(i) for i in predict_test_result]
    Classifier_Eval(test_CFA, predict_test_result, False)
    #predict_error =  rmse(predict_test_result, [ float(i) for i in test_CFA])
    #print '|', dataset, '|', training_error, '|', predict_error ,'|'

    plotroc(CFA_list[:1500], predict_result, test_CFA, predict_test_result)
    return
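plotroc() and plotrocmany() are also not shown on this page. A minimal, assumed sketch of an equivalent single-curve ROC plot with scikit-learn and matplotlib (the plot_roc_sketch name is hypothetical):

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def plot_roc_sketch(y_true, y_score, label):
    # One ROC curve with its AUC; the helpers above presumably do this for
    # the train/test pair (plotroc) or for several classifiers (plotrocmany).
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.plot(fpr, tpr, label='%s (AUC = %.3f)' % (label, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()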
Example No. 5
def main(arg):
    dataset = arg[1] #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 300000 #len(training_data)

    # mode0: normal, 1: normal+condensed, 2: only condensed
    Feature_vector_mode = 0
    rows, CFA_list, testing_rows, test_CFA = process_data(training_data,\
     testing_data, testing_result_data, NumOfLineToTrain, False, Feature_vector_mode)
    print 'Training rows:', len(rows), 'Testing rows:', len(testing_rows), \
        '# of features:', len(rows[0])


    clf = []
    y_pred_list = []
    M = 3  # number of scikit-learn classifiers; collaborative filtering is added separately below
    name_list = ['KNN', 'RandomForest', 'LinearSVM',
                 'Collaborative filtering']
    #y_pred_list.append([random.randint(0,1) for i in testing_rows])

    ##############################################################

    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf.append(KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10, p=2))
    clf.append(RandomForestClassifier(n_estimators=100,n_jobs=-1, verbose=False))
    clf.append(svm.LinearSVC(verbose=False, C=4.0))
    #clf = tree.DecisionTreeClassifier()

    #clf.append(GaussianNB())
    #clf = MultinomialNB(alpha=1.0)
    #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))

    #clf.append(linear_model.SGDClassifier(n_jobs=-1,n_iter=1000))
    #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))

    #############################################################

    #Train and do prediction for each method
    for i in range(M):
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapse: ", end-start, " sec"

    start = time.time()
    learnrate = 0.01
    regular = 0.02
    numofstep = 100
    matrix, students, problems, testing_sample = training(training_data[:NumOfLineToTrain], learnrate, regular, numofstep)
    y_pred_list.append(predict_from_matrix(matrix, students, problems,\
        [ (data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]]))
    end = time.time()
    print "Time elapse: ", end-start, " sec"

    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        print "first 30 items of prediction: ",[int(round(float(i))) for i in y_pred_list[i][:30]]

    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return
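None of the examples show how main() is invoked; since every variant reads the dataset name from arg[1], a likely entry point (an assumption, not part of the source) is:

import sys

if __name__ == '__main__':
    # e.g.  python example.py algebra_2005_2006
    main(sys.argv)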