def libsvm(train_path, test_path, top_k, token_size, model_type):
    """Prepare the training and test data for a Liblinear SVM run.

    Reads both review files, builds a feature matrix for each split and
    hands every matrix, together with its ratings, to ``file_process``.

    Args:
        train_path: Location of the training reviews, forwarded to
            ``fr.json_reader`` in ``"train"`` mode.
        test_path: Location of the test reviews, forwarded to
            ``fr.json_reader`` in ``"test"`` mode.
        top_k: Forwarded to ``fr.json_reader`` and ``fr.get_dict``.
        token_size: Forwarded to ``fe.feature_matrix`` and ``file_process``.
        model_type: Forwarded to ``fe.feature_matrix``.

    Returns:
        None. Results are passed to ``file_process`` (presumably written to
        disk -- confirm against its definition) and a completion message is
        printed.
    """
    # Training split. The reader returns three values in "train" mode; the
    # middle one is not needed for the SVM preparation.
    review_token, _, review_rating = fr.json_reader(train_path, top_k, "train")
    token_list, df_dict = fr.get_dict(review_token, top_k)
    train_mtx = fe.feature_matrix(
        review_token, token_list, df_dict, token_size, model_type
    )
    file_process(train_mtx, review_rating, token_size, "train")

    # Test split. No ratings are read in "test" mode, so an all-zero vector
    # of matching length is used in their place.
    test_token = fr.json_reader(test_path, top_k, "test")
    test_rating = np.zeros(len(test_token))
    # Only the second value of the test-side dictionary is used; the test
    # matrix is indexed by the *training* token list.
    # NOTE(review): test_df is computed from the test set rather than reusing
    # the training df_dict -- confirm this is intended.
    _, test_df = fr.get_dict(test_token, top_k)
    test_mtx = fe.feature_matrix(
        test_token, token_list, test_df, token_size, model_type
    )
    file_process(test_mtx, test_rating, token_size, "test")

    # Single-argument call form prints identically on Python 2 and Python 3.
    print("Liblinear SVM preparation finished.")
def rmlr(train_path, test_path, top_k, token_size, model_type, k_size=250):
    """Train a logistic-regression model on the reviews and predict the test set.

    Args:
        train_path: Location of the training reviews, forwarded to
            ``fr.json_reader`` in ``"train"`` mode.
        test_path: Location of the test reviews, forwarded to
            ``fr.json_reader`` in ``"test"`` mode.
        top_k: Forwarded to ``fr.json_reader`` and ``fr.get_dict``.
        token_size: Forwarded to ``fe.feature_matrix`` and
            ``cf.logistic_regression``.
        model_type: Forwarded to ``fe.feature_matrix``.
        k_size: Fourth argument of ``cf.logistic_regression``; defaults to
            250, the value that was previously hard-coded. Presumably the
            batch size of the gradient step -- confirm against
            ``cf.logistic_regression``.

    Returns:
        None. The test matrix and the trained model are passed to
        ``predict``.
    """
    # read the training file
    review_token, review_star, review_rating = fr.json_reader(
        train_path, top_k, "train"
    )
    # build the token list and document-frequency dictionary from the
    # training reviews (size governed by top_k)
    token_list, df_dict = fr.get_dict(review_token, top_k)
    # get the training data matrix
    train_mtx = fe.feature_matrix(
        review_token, token_list, df_dict, token_size, model_type
    )

    # perform gradient ascent on training set, stochastic or batched
    model_mtx = cf.logistic_regression(
        train_mtx, review_star, review_rating, k_size, token_size
    )

    # read the test file
    test_token = fr.json_reader(test_path, top_k, "test")
    # Only the second value of the test-side dictionary is used; the test
    # matrix is indexed by the *training* token list.
    # NOTE(review): test_df is computed from the test set rather than reusing
    # the training df_dict -- confirm this is intended.
    _, test_df = fr.get_dict(test_token, top_k)
    test_mtx = fe.feature_matrix(
        test_token, token_list, test_df, token_size, model_type
    )

    # predict the result
    predict(model_mtx, test_mtx)