示例#1
0
def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 50000  #len(training_data)

    #random.shuffle(training_data)

    clf = []
    y_pred_list = []
    M = 3  #number of classifier
    name_list = ['Original', 'Original+Condensed', 'Condensed']

    # mode0: normal, 1: normal+condensed, 2: only condensed
    for i in range(3):
        rows, CFA_list, testing_rows, test_CFA = process_data(training_data,\
         testing_data, testing_result_data, NumOfLineToTrain, False, i)
        print 'Training rows:', len(rows),'Testing rows:', len(testing_rows), \
        '# of features:', len(rows[0])

        ##############################################################

        #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
        clf.append(
            KNeighborsClassifier(n_jobs=-1,
                                 weights='distance',
                                 n_neighbors=10,
                                 p=2))
        #clf.append(RandomForestClassifier(n_estimators=100,n_jobs=-1, verbose=False))
        #clf.append(svm.LinearSVC(verbose=False, C=4.0))
        #clf = tree.DecisionTreeClassifier()

        #clf.append(GaussianNB())
        #clf = MultinomialNB(alpha=1.0)
        #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))

        #clf.append(linear_model.SGDClassifier(n_jobs=-1,n_iter=1000))
        #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))

        #############################################################

        #Train and do prediction for each method
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapse: ", end - start, " sec"

    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        print "first 30 items of prediction: ", [
            int(round(float(i))) for i in y_pred_list[i][:30]
        ]

    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return
示例#2
0
def main(arg):
    """Evaluate a random-guess baseline on the training and test labels.

    A uniformly random 0/1 guess is drawn for every data row (the first row
    of each table is skipped), compared against column 13 of that row with
    ``Classifier_Eval`` and finally plotted with ``plotroc``.

    Args:
        arg: Argument vector; ``arg[1]`` is the dataset name passed to
            ``load_data``.
    """
    dataset = arg[1]  # e.g. 'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)

    # Baseline on the training rows.
    train_guess = [random.randint(0, 1) for _ in training_data[1:]]
    train_CFA = [float(row[13]) for row in training_data[1:]]
    #training_error = rmse(train_guess, train_CFA)
    Classifier_Eval(train_CFA, train_guess, True)

    # Baseline on the held-out test results.
    test_guess = [random.randint(0, 1) for _ in testing_result_data[1:]]
    test_CFA = [float(row[13]) for row in testing_result_data[1:]]
    #predict_error = rmse(test_guess, test_CFA)
    Classifier_Eval(test_CFA, test_guess, False)

    #print '|', dataset, '|', training_error, '|', predict_error ,'|'
    # Training pair first, test pair second. The previous code had the two
    # pairs swapped because its locals were named after the wrong data set.
    # NOTE(review): argument order inferred from the other plotroc callers
    # in this file -- confirm against plotroc's signature.
    plotroc(train_CFA, train_guess, test_CFA, test_guess)
    return
示例#3
0
def main(arg):
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)

    student_result, overall_result, student_kc = training(training_data)

    predict_result = predict(student_result, overall_result, student_kc,
                             testing_data)
    predict_error = rmse(predict_result,
                         [float(i[13]) for i in testing_result_data[1:]])

    predict_result = predict(student_result, overall_result, student_kc,
                             training_data)
    training_error = rmse(predict_result,
                          [float(i[13]) for i in training_data[1:]])

    print '|', dataset, '|', training_error, '|', predict_error, '|'
    return
示例#4
0
文件: cf.py 项目: leungpeng/kdd2010
def main(arg):
    dataset = arg[1] #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)

    #shuffle the training data
    #training_data = random.shuffle(training_data)

    learnrate = 0.01
    regular = 0.02
    numofstep = 30
    matrix, students, problems, testing_sample = training(training_data, learnrate, regular, numofstep)
    predict_result = predict_from_matrix(matrix, students, problems,[ (data[0].upper(), data[1].upper()) for data in testing_sample])
    training_error = rmse(predict_result, [float(i[2]) for i in testing_sample])

    predict_test_result = predict_from_matrix(matrix, students, problems,[ (data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]])
    predict_error = rmse(predict_test_result, [float(i[13]) for i in testing_result_data[1:]])

    print "first 50 items of prediction before rounding: ",[float(i) for i in predict_test_result[:50]]
    print "first 50 items of prediction: ",[int(round(float(i))) for i in predict_test_result[:50]]
    print "first 50 items of test GT: ", [int(i[13]) for i in testing_result_data[1:50]]
    print '|', dataset, '|', training_error, '|', predict_error ,'|'
    plotroc([float(i[2]) for i in testing_sample], predict_result,\
     [float(i[13]) for i in testing_result_data[1:]], predict_test_result)
    return
示例#5
0
def refreshData(request):
    """Reload the project data and answer with a fixed confirmation message.

    The ``request`` argument is accepted but never read.
    """
    project.load_data()
    return HttpResponse("Data loaded successfully")
示例#6
0
def run(nt=1024):
    """Run the GPU max search over all ratings and print the kernel time.

    Loads the project data, packs ``project.ratings_dict`` into a dense 2-D
    float array (one row per item, short rows padded with -1.0), launches
    ``find_max_parallel`` over it and prints the wall-clock seconds the
    kernel took.

    Args:
        nt: Requested number of GPU threads; it is clamped to the number of
            items so that no thread is left without a row.
    """
    num_threads = nt

    project.load_data()

    # Snapshot keys and values as lists so that row i of the array always
    # belongs to asins[i].
    asins = list(project.ratings_dict.keys())
    ratings = list(project.ratings_dict.values())
    num_items = len(ratings)

    # The item with the most reviews decides the row width (-1 when there are
    # no items, as before).
    most_reviews = max([len(item) for item in ratings] or [-1])

    # Pad every row to the same width; -1.0 marks "no rating".
    ratings_list = [
        list(item) + [-1.0] * (most_reviews - len(item)) for item in ratings
    ]
    # np.float / np.int were removed in NumPy 1.24; use explicit dtypes.
    ratings_array = np.array(ratings_list, dtype=np.float64)

    # Decide how much work each thread should do.
    if num_threads >= num_items:
        num_threads = num_items
        rows_per_thread = 1
    else:
        # float() keeps the division exact on Python 2 as well, and int()
        # keeps the value integral there (math.ceil returns a float on 2.x).
        rows_per_thread = int(math.ceil(float(num_items) / num_threads))

    # Decide how to allocate threads/blocks: add blocks until a block is small
    # enough; if the block limit is hit, double the allowed block size and
    # start over.
    num_blocks = 1
    threads_per_block = num_threads
    max_threads_per_block = 32

    while threads_per_block > max_threads_per_block:
        num_blocks += 1
        threads_per_block = int(
            math.ceil(float(num_threads) / float(num_blocks)))

        # Check if we're using too many blocks.
        if num_blocks > 65535:
            num_blocks = 1
            threads_per_block = num_threads
            max_threads_per_block *= 2

    # Device allocation. The scalar arguments are passed to the kernel by
    # value, so they need no device copies.
    ratings_array_d = cuda.to_device(ratings_array)
    maxes_d = cuda.device_array((num_threads, ), dtype=np.float64)
    max_indexes_d = cuda.device_array((num_threads, ), dtype=np.int64)

    # Do the parallel computation. Kernel launches return immediately, so
    # wait for the device before stopping the clock.
    start = time.time()
    find_max_parallel[num_blocks,
                      threads_per_block](ratings_array_d, maxes_d,
                                         max_indexes_d, rows_per_thread,
                                         num_items)
    cuda.synchronize()
    elapsed = time.time() - start

    # Fetch the per-thread results.
    maxes = maxes_d.copy_to_host()
    max_indexes = max_indexes_d.copy_to_host()

    # Linear search over the per-thread values; the outcome is only consumed
    # by the diagnostic prints below.
    max_val = -1
    max_idx = -1
    for i in range(num_threads):
        if maxes[i] > max_val:
            max_val = maxes[i]
            max_idx = max_indexes[i]

    #print('Best Asin: ' + str(asins[max_idx]))
    #print('Rating: ' + str(max_val))
    #print('Time Elapsed: ' + str(elapsed))
    print(str(elapsed))
示例#7
0
    if index >= num_searchers: return

    input_idx = search_length * index
    best = -1.0
    best_index = -1
    for i in range(input_idx, input_idx + search_length):
        if index + i >= num_items: return

        if ip[index + i] > best:
            best = ip[index + i]
            best_index = index + i
    op[index] = best_index


"""                           MAIN                                        """
# Load the project data first; project.ratings_dict is read right below.
# NOTE(review): presumably load_data() is what populates ratings_dict -- confirm.
project.load_data()

num_items = len(project.ratings_dict)

# Find the largest number of ratings stored for any single item
# (stays at -1 when the dict is empty).
most_reviews = -1
for ratings in project.ratings_dict.values():
    if len(ratings) > most_reviews:
        most_reviews = len(ratings)

# Generate intermediary lists so ordering can be preserved: asins[i] and
# ratings[i] come from the same dict entry.
asins = list(project.ratings_dict.keys())
ratings = list(project.ratings_dict.values())

ratings_list = []
for i in range(num_items):
示例#8
0
 def ready(self):
     """Load the project data by calling ``project.load_data()``."""
     # NOTE(review): presumably a framework start-up hook (e.g. Django's
     # AppConfig.ready) -- confirm against the enclosing class.
     project.load_data()
示例#9
0
def main(arg):
    #pdb.set_trace()
    numpy.seterr(all='raise')
    dataset = arg[1]  #'algebra_2005_2006'
    start = time.time()
    training_data, testing_data, testing_result_data = load_data(dataset)
    end = time.time()
    print "Time to load data", end - start, " sec"

    start = time.time()
    rows, CFA_list, testing_rows, test_CFA = process_data(training_data,\
     testing_data, testing_result_data, 100000, False, 2)
    end = time.time()
    print "Time to process data", end - start, " sec"
    print 'Training rows:', len(rows),'Testing rows:', len(testing_rows), \
    '# of features:', len(rows[0])

    #print rows[:200]
    #print testing_rows[:200]

    del training_data, testing_data
    gc.collect()
    process = psutil.Process(os.getpid())
    print "RAM usage (MB):", process.memory_info().rss / 1024 / 1024

    write_file("preprocessed_train.txt", rows)
    write_file("preprocessed_test.txt", testing_rows)

    start = time.time()

    ##############################################################

    #clf = linear_model.SGDClassifier(n_jobs=-1,n_iter=1000)
    #clf = linear_model.LogisticRegressionCV(n_jobs=-1, verbose=True)

    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf = KNeighborsClassifier(n_jobs=-1,
                               weights='distance',
                               n_neighbors=10,
                               p=2)

    #clf = RandomForestClassifier(n_estimators=100,n_jobs=-1, verbose=True)
    #clf = svm.LinearSVC(verbose=True,  C=1.0)
    #clf = svm.SVC(verbose=True, cache_size=5000, C=1.0)
    #clf = tree.DecisionTreeClassifier()

    #clf = GaussianNB()
    #clf = MultinomialNB(alpha=1.0)
    #clf = BernoulliNB(alpha=2.0, binarize=1.0)

    #############################################################

    clf.fit(rows, CFA_list)
    print clf
    #print clf.feature_importances_

    end = time.time()
    print "Time to train classifier", end - start, " sec"

    process = psutil.Process(os.getpid())
    print "RAM usage (MB):", process.memory_info().rss / 1024 / 1024

    start = time.time()
    predict_result = clf.predict(rows[:1500])
    end = time.time()
    print "Time to do prediction of 1.5k self-test", end - start, " sec"

    #print "Mean accuracy" , clf.score(rows, CFA_list)
    print "first 30 items of predict: ", [int(i) for i in predict_result[:30]]
    print "first 30 items of GT: ", [int(i) for i in CFA_list[:30]]
    predict_result = [float(i) for i in predict_result]
    #training_error = rmse(predict_result, [ float(i) for i in CFA_list[:1500]])
    Classifier_Eval(CFA_list[:1500], predict_result, True)

    print "rmse of first 50 items ", rmse(
        [float(i) for i in predict_result[:50]],
        [float(i) for i in CFA_list[:50]])
    print "rmse of first 150 items ", rmse(
        [float(i) for i in predict_result[:150]],
        [float(i) for i in CFA_list[:150]])
    print "rmse of first 500 items ", rmse(
        [float(i) for i in predict_result[:500]],
        [float(i) for i in CFA_list[:500]])
    print "rmse of first 1500 items ", rmse(
        [float(i) for i in predict_result[:1500]],
        [float(i) for i in CFA_list[:1500]])
    #print "rmse of first 5000 items ", rmse([ float(i) for i in predict_result[:5000]], [ float(i) for i in CFA_list[:5000]])
    #print "rmse of first 15000 items ", rmse([ float(i) for i in predict_result[:15000]], [ float(i) for i in CFA_list[:15000]])
    #print "rmse of first 45000 items ", rmse([ float(i) for i in predict_result[:45000]], [ float(i) for i in CFA_list[:45000]])

    start = time.time()
    predict_test_result = clf.predict(testing_rows)
    end = time.time()
    print "Time to do prediction of testing rows", end - start, " sec"

    print "first 30 items of test predict: ", [
        int(i) for i in predict_test_result[:30]
    ]
    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]

    predict_test_result = [float(i) for i in predict_test_result]
    Classifier_Eval(test_CFA, predict_test_result, False)
    #predict_error =  rmse(predict_test_result, [ float(i) for i in test_CFA])
    #print '|', dataset, '|', training_error, '|', predict_error ,'|'

    plotroc(CFA_list[:1500], predict_result, test_CFA, predict_test_result)
    return
示例#10
0
def main(arg):
    dataset = arg[1] #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 300000 #len(training_data)

    # mode0: normal, 1: normal+condensed, 2: only condensed
    Feature_vector_mode = 0
    rows, CFA_list, testing_rows, test_CFA = process_data(training_data,\
     testing_data, testing_result_data, NumOfLineToTrain, False, Feature_vector_mode)
    print 'Training rows:', len(rows),'Testing rows:', len(testing_rows), \
    '# of features:', len(rows[0])


    clf=[]; y_pred_list=[]; M=3 #number of classifier
    name_list=['KNN', 'RandomForest', 'LinearSVM', \
    'Collabrative filtering']
    #y_pred_list.append([random.randint(0,1) for i in testing_rows])

    ##############################################################

    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf.append(KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10, p=2))
    clf.append(RandomForestClassifier(n_estimators=100,n_jobs=-1, verbose=False))
    clf.append(svm.LinearSVC(verbose=False, C=4.0))
    #clf = tree.DecisionTreeClassifier()

    #clf.append(GaussianNB())
    #clf = MultinomialNB(alpha=1.0)
    #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))

    #clf.append(linear_model.SGDClassifier(n_jobs=-1,n_iter=1000))
    #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))

    #############################################################

    #Train and do prediction for each method
    for i in range(M):
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapse: ", end-start, " sec"

    start = time.time()
    learnrate = 0.01; regular = 0.02; numofstep = 100
    matrix, students, problems, testing_sample = training(training_data[:NumOfLineToTrain], learnrate, regular, numofstep)
    y_pred_list.append(predict_from_matrix(matrix, students, problems,\
        [ (data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]]))
    end = time.time()
    print "Time elapse: ", end-start, " sec"

    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        print "first 30 items of prediction: ",[int(round(float(i))) for i in y_pred_list[i][:30]]

    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return