def libsvm(train_path, test_path, top_k, token_size, model_type):
    """Build train and test feature matrices and pass them to ``file_process``.

    The training reviews are read, a token list and document-frequency
    dictionary are derived from them, and the resulting feature matrix is
    passed to ``file_process`` together with the training ratings. The test
    reviews go through the same steps with an all-zero rating vector.

    Args:
        train_path: Path of the training file, forwarded to ``fr.json_reader``.
        test_path: Path of the test file, forwarded to ``fr.json_reader``.
        top_k: Forwarded to ``fr.json_reader`` and ``fr.get_dict``.
        token_size: Forwarded to ``fe.feature_matrix`` and ``file_process``.
        model_type: Forwarded to ``fe.feature_matrix``.

    Returns:
        None. Results are handed to ``file_process`` (defined elsewhere;
        presumably it writes the LIBSVM-format files -- TODO confirm).
    """
    # In "train" mode json_reader yields three values; the middle one
    # is not needed for this pipeline.
    review_token, _, review_rating = fr.json_reader(train_path, top_k, "train")
    token_list, df_dict = fr.get_dict(review_token, top_k)
    train_mtx = fe.feature_matrix(review_token, token_list, df_dict, token_size, model_type)
    file_process(train_mtx, review_rating, token_size, "train")

    # In "test" mode json_reader yields only the tokens.
    test_token = fr.json_reader(test_path, top_k, "test")
    # No ratings are read for the test set, so zeros are used as placeholders.
    test_rating = np.zeros(len(test_token))
    # Only the document frequencies of the test set are used; the token list
    # comes from the training set so both matrices share the same tokens.
    # NOTE(review): the test matrix uses test-set document frequencies rather
    # than the training ones -- confirm this is intentional.
    _, test_df = fr.get_dict(test_token, top_k)
    test_mtx = fe.feature_matrix(test_token, token_list, test_df, token_size, model_type)
    file_process(test_mtx, test_rating, token_size, "test")
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("Liblinear SVM preparation finished.")
def rmlr(train_path, test_path, top_k, token_size, model_type, k_size=250):
    """Train a logistic-regression model on the training file and predict the test file.

    Args:
        train_path: Path of the training file, forwarded to ``fr.json_reader``.
        test_path: Path of the test file, forwarded to ``fr.json_reader``.
        top_k: Forwarded to ``fr.json_reader`` and ``fr.get_dict``.
        token_size: Forwarded to ``fe.feature_matrix`` and
            ``cf.logistic_regression``.
        model_type: Forwarded to ``fe.feature_matrix``.
        k_size: Forwarded to ``cf.logistic_regression``. Defaults to 250, the
            value that used to be hard-coded here (presumably the batch size
            of the gradient ascent -- TODO confirm).

    Returns:
        None. The outcome of ``predict`` (defined elsewhere) is not returned.
    """
    # read the training file
    review_token, review_star, review_rating = fr.json_reader(train_path, top_k, "train")
    # process all the unigram and bigram in review and pick the top_k
    token_list, df_dict = fr.get_dict(review_token, top_k)
    # get the unigram and bigram data matrix
    train_mtx = fe.feature_matrix(review_token, token_list, df_dict, token_size, model_type)
    # perform gradient ascent on training set, stochastic or batched
    model_mtx = cf.logistic_regression(train_mtx, review_star, review_rating, k_size, token_size)
    # read the test file
    test_token = fr.json_reader(test_path, top_k, "test")
    # Only the document frequencies of the test set are used; the token list
    # comes from the training set so both matrices share the same tokens.
    # NOTE(review): the test matrix uses test-set document frequencies rather
    # than the training ones -- confirm this is intentional.
    _, test_df = fr.get_dict(test_token, top_k)
    test_mtx = fe.feature_matrix(test_token, token_list, test_df, token_size, model_type)
    # predict the result
    predict(model_mtx, test_mtx)