Exemplo n.º 1
0
def regressior_baseline(city="Phoenix"):
    """Run the lasso and extra-trees baselines for ``city`` and print each RMSE.

    Two feature sets are evaluated in turn: the one returned by
    ``gen_data_no_review`` and the one returned by
    ``gen_data_with_top_review``. For each feature set,
    ``baseline_lasso_leave_one_out`` and ``baseline_extra_leave_one_out`` are
    called and their return value is printed with an ``rmse:`` label. Every
    call also receives a ``result_path``; what is written there is decided by
    the helper, not by this function.

    Args:
        city: City name forwarded to ``get_data`` and
            ``gen_data_with_top_review``.
    """
    bus_df = get_data(city)
    # Read the held-out business ids with a context manager so the file is
    # closed as soon as the list is built.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    train_raw_x, test_raw_x = gen_data_no_review(bus_df, test_ids)
    print("baseline lasso no review")
    # 335 rmse: 0.792623
    # 410 rmse: 1.255996
    print("rmse: %f" % baseline_lasso_leave_one_out(
        train_raw_x, test_raw_x, test_ids, alpha=0.001, result_path="regression_lasso_no_review.txt"
    ))
    print("baseline extra no review")
    # 335 rmse: 0.818625
    print("rmse: %f" % baseline_extra_leave_one_out(
        train_raw_x, test_raw_x, test_ids, n=20, d=10, result_path="regression_extra_no_review.txt"
    ))

    train_raw_x, test_raw_x = gen_data_with_top_review(bus_df, test_ids, city)
    print("baseline lasso with 10 reviews")
    # 335 rmse: 0.808528
    # 410 rmse: 1.256638
    print("rmse: %f" % baseline_lasso_leave_one_out(
        train_raw_x, test_raw_x, test_ids, alpha=0.001, result_path="regression_with_review.txt"
    ))
    # This run uses the with-review features; the label used to say
    # "no review", which made the two extra-trees results indistinguishable.
    print("baseline extra with 10 reviews")
    print("rmse: %f" % baseline_extra_leave_one_out(
        train_raw_x, test_raw_x, test_ids, n=40, d=40, result_path="regression_extra_with_review.txt"
    ))
Exemplo n.º 2
0
def build_feature(input_file, top_review_number=50):
    """Build star-distribution features for each source business in a file.

    Every non-blank line of ``input_file`` must contain exactly three
    comma-separated fields, ``source,dest,pagerank`` (any ``BUSINESS#``
    marker is removed first). Consecutive lines with the same ``source`` form
    one group, so lines of one source must be contiguous. Within a group only
    the first ``top_review_number`` lines are counted.

    For each group one feature row is produced: for every star value seen in
    ``get_data().stars`` (ascending order), the share of counted ``dest``
    businesses having that star value minus the share of that star value
    over all businesses. The target for the group is the star value of
    ``source``.

    Args:
        input_file: Path of the edge-list file described above.
        top_review_number: Maximum number of lines counted per source group.
            Must be at least 1; with a smaller value nothing is counted and
            the share computation divides by zero.

    Returns:
        A tuple ``(x, y)`` of numpy arrays: ``x`` holds one feature row per
        source group, ``y`` the star value looked up for each source. Both
        are empty when the file contains no data lines.

    Raises:
        KeyError: If a ``source`` or ``dest`` id is not among the business
            ids returned by ``get_data``.
        ValueError: If a non-blank line does not split into three fields.
    """
    bus_df = get_data()
    # Overall share of each star value across all businesses.
    all_stars_dist_dict = bus_df.stars.value_counts(normalize=True).to_dict()
    stars_dict = dict(zip(bus_df.business_id, bus_df.stars))
    star_levels = sorted(all_stars_dist_dict)

    def make_row(histogram, total):
        # Local share of each star value minus its overall share.
        return [(histogram[k] / total) - all_stars_dist_dict[k]
                for k in star_levels]

    x = []
    y = []
    xi = {}
    now_b_id = ""
    count = 0
    with open(input_file) as edge_file:
        for line in edge_file:
            record = line.replace("BUSINESS#", "").strip()
            if not record:
                # A blank (e.g. trailing) line carries no edge; skip it
                # instead of failing on the three-way unpack below.
                continue
            source, dest, _pagerank = record.split(',')
            if now_b_id != source:
                if xi:
                    # A new source starts: emit the finished group's row.
                    x.append(make_row(xi, count))
                # Float zeros keep the share a true division on Python 2.
                xi = dict.fromkeys(star_levels, 0.0)
                now_b_id = source
                count = 0
                y.append(stars_dict[now_b_id])
            if count < top_review_number:
                xi[stars_dict[dest]] += 1
                count += 1
    if xi:
        # Emit the last group; skipped when the file had no data lines.
        x.append(make_row(xi, count))
    return np.array(x), np.array(y)
Exemplo n.º 3
0
def baseline_city_average(city="Phoenix"):
    """Print the RMSE of a leave-one-out city-average baseline on review_count.

    The held-out businesses are the rows of ``get_data(city)`` whose
    ``business_id`` is listed in the ``test_business_list`` file. For each of
    them the prediction is the mean ``review_count`` of all rows with a
    different ``business_id``. The square root of ``mean_squared_error``
    between predictions and the true counts is printed.

    Args:
        city: City name forwarded to ``get_data``.
    """
    bus_df = get_data(city)
    # Context manager so the id file is closed once it has been read.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]
    x = bus_df[['business_id', 'review_count', 'categories']]
    test_x = x[bus_df['business_id'].isin(test_ids)].reset_index()
    # Iterate over the held-out rows themselves (not the id file) so that
    # predict[i] is always paired with row i of test_x, whatever the order of
    # the id file and even if it lists ids missing from the data.
    predict = [
        x[x.business_id != test_id].review_count.mean()
        for test_id in test_x.business_id
    ]
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    rmse = mean_squared_error(predict, test_x.review_count.values) ** 0.5
    print(rmse)
Exemplo n.º 4
0
def search_n_d(city="Phoenix"):
    """Run ``tune_extra`` on the training rows of both feature sets.

    For the features from ``gen_data_no_review`` and then for those from
    ``gen_data_with_top_review``, the rows whose ``business_id`` is listed in
    the ``test_business_list`` file are removed, ``stars`` becomes the target
    vector, the ``business_id`` and ``stars`` columns are dropped from the
    feature matrix, and both arrays are handed to ``tune_extra``. A label is
    printed before each run.

    Args:
        city: City name forwarded to ``get_data`` and
            ``gen_data_with_top_review``.
    """
    bus_df = get_data(city)
    # Context manager so the id file is closed once it has been read.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    # Feature builders are wrapped in lambdas so each one runs only after its
    # label has been printed, as before.
    feature_sets = (
        ("no review",
         lambda: gen_data_no_review(bus_df, test_ids)),
        ("with top 10 review",
         lambda: gen_data_with_top_review(bus_df, test_ids, city)),
    )
    for label, make_features in feature_sets:
        print(label)
        train_raw_x, _test_raw_x = make_features()
        train_rows = train_raw_x[~train_raw_x.business_id.isin(test_ids)]
        # ``.values`` and the explicit ``axis=1`` keyword work on old and new
        # pandas alike (``as_matrix`` and positional ``axis`` do not).
        train_y = train_rows.stars.values
        train_x = train_rows.drop(["business_id", "stars"], axis=1).values
        tune_extra(train_x, train_y)
Exemplo n.º 5
0
def baseline_city_average(city="Phoenix"):
    """Evaluate a leave-one-out city-average baseline on stars.

    The held-out businesses are the rows of ``get_data(city)`` whose
    ``business_id`` is listed in the ``test_business_list`` file. For each of
    them the prediction is the mean ``stars`` of all rows with a different
    ``business_id``. True and predicted values are written to ``city.csv``
    (columns ``stars`` and ``predict``, sorted by ``stars`` descending), and
    the square root of ``mean_squared_error`` between them is printed.

    Args:
        city: City name forwarded to ``get_data``.
    """
    bus_df = get_data(city)
    # Context manager so the id file is closed once it has been read.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]
    x = bus_df[["business_id", "stars"]]
    test_x = x[bus_df["business_id"].isin(test_ids)].reset_index()
    # Iterate over the held-out rows themselves (not the id file) so that
    # predict[i] is always paired with row i of test_x, whatever the order of
    # the id file and even if it lists ids missing from the data.
    predict = [
        x[x.business_id != test_id].stars.mean()
        for test_id in test_x.business_id
    ]

    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    actual = test_x.stars.values
    rmse = mean_squared_error(predict, actual) ** 0.5

    result = pd.DataFrame(
        {"stars": actual, "predict": predict}, columns=["stars", "predict"])
    # ``DataFrame.sort`` was removed from pandas; prefer ``sort_values`` and
    # fall back to ``sort`` only on releases that predate it.
    sort_rows = getattr(result, "sort_values", None) or result.sort
    result = sort_rows("stars", ascending=False)
    result.to_csv("city.csv", index=False)
    print(rmse)
Exemplo n.º 6
0
def baseline_category_average(city="Phoenix"):
    """Print the RMSE of a leave-one-out category-average baseline on review_count.

    The held-out businesses are the rows of ``get_data(city)`` whose
    ``business_id`` is listed in the ``test_business_list`` file. For each of
    them the prediction is the mean ``review_count`` of the *other*
    businesses sharing at least one category with it. When the business has
    no categories, or no other business shares one, the mean over all other
    businesses is used instead. The square root of ``mean_squared_error``
    between predictions and the true counts is printed.

    Args:
        city: City name forwarded to ``get_data``.
    """
    bus_df = get_data(city)
    # Context manager so the id file is closed once it has been read.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]
    x = bus_df[['business_id', 'review_count', 'categories']]
    test_x = x[bus_df['business_id'].isin(test_ids)].reset_index()
    predict = []
    # Walk the held-out rows themselves so predict[i] matches row i of test_x
    # regardless of the order of the id file.
    for test_id, own_categories in zip(test_x.business_id, test_x.categories):
        categories = set(own_categories)
        # Never let the held-out business contribute to its own prediction.
        others = x.business_id != test_id
        reference = others
        if categories:
            shares_category = x.categories.apply(
                lambda cats: not categories.isdisjoint(cats))
            peers = shares_category & others
            if peers.any():
                reference = peers
        predict.append(x[reference].review_count.mean())
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    rmse = mean_squared_error(predict, test_x.review_count.values) ** 0.5
    print(rmse)
Exemplo n.º 7
0
def search_alpha(city="Phoenix"):
    """Run ``tune_lasso`` on both feature sets and print the best alpha of each.

    For the features from ``gen_data_no_review`` and then for those from
    ``gen_data_with_top_review``, the rows whose ``business_id`` is listed in
    the ``test_business_list`` file are removed, ``stars`` becomes the target
    vector, and the ``business_id`` and ``stars`` columns are dropped from the
    feature matrix. ``tune_lasso`` returns ``(coefs, scores, alpha)``; the
    entry of ``alpha`` at the position of the smallest score is printed.

    Args:
        city: City name forwarded to ``get_data`` and
            ``gen_data_with_top_review``.
    """
    bus_df = get_data(city)
    # Context manager so the id file is closed once it has been read.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]

    # Feature builders are wrapped in lambdas so each one runs only after its
    # label has been printed, as before.
    feature_sets = (
        ("no review",
         lambda: gen_data_no_review(bus_df, test_ids)),
        ("with top 10 review",
         lambda: gen_data_with_top_review(bus_df, test_ids, city)),
    )
    for label, make_features in feature_sets:
        print(label)
        train_raw_x, _test_raw_x = make_features()
        train_rows = train_raw_x[~train_raw_x.business_id.isin(test_ids)]
        # ``.values`` and the explicit ``axis=1`` keyword work on old and new
        # pandas alike (``as_matrix`` and positional ``axis`` do not).
        train_y = train_rows.stars.values
        train_x = train_rows.drop(["business_id", "stars"], axis=1).values
        _coefs, scores, alpha = tune_lasso(train_x, train_y)
        best_alpha = alpha[scores.index(min(scores))]
        print("best alpha=%f" % best_alpha)
Exemplo n.º 8
0
def baseline_category_average(city="Phoenix"):
    """Evaluate a leave-one-out category-average baseline on stars.

    The held-out businesses are the rows of ``get_data(city)`` whose
    ``business_id`` is listed in the ``test_business_list`` file. For each of
    them the prediction is the mean ``stars`` of the *other* businesses
    sharing at least one category with it. When the business has no
    categories, or no other business shares one, the mean over all other
    businesses is used instead. True and predicted values are written to
    ``category.csv`` (columns ``stars`` and ``predict``, sorted by ``stars``
    descending), and the square root of ``mean_squared_error`` between them
    is printed.

    Args:
        city: City name forwarded to ``get_data``.
    """
    bus_df = get_data(city)
    # Context manager so the id file is closed once it has been read.
    with open(test_business_list) as id_file:
        test_ids = [line.strip() for line in id_file]
    x = bus_df[["business_id", "stars", "categories"]]
    test_x = x[bus_df["business_id"].isin(test_ids)].reset_index()
    predict = []
    # Walk the held-out rows themselves so predict[i] matches row i of test_x
    # regardless of the order of the id file.
    for test_id, own_categories in zip(test_x.business_id, test_x.categories):
        categories = set(own_categories)
        # Never let the held-out business contribute to its own prediction.
        others = x.business_id != test_id
        reference = others
        if categories:
            shares_category = x.categories.apply(
                lambda cats: not categories.isdisjoint(cats))
            peers = shares_category & others
            if peers.any():
                reference = peers
        predict.append(x[reference].stars.mean())

    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    actual = test_x.stars.values
    result = pd.DataFrame(
        {"stars": actual, "predict": predict}, columns=["stars", "predict"])
    # ``DataFrame.sort`` was removed from pandas; prefer ``sort_values`` and
    # fall back to ``sort`` only on releases that predate it.
    sort_rows = getattr(result, "sort_values", None) or result.sort
    result = sort_rows("stars", ascending=False)
    result.to_csv("category.csv", index=False)
    rmse = mean_squared_error(predict, actual) ** 0.5
    print(rmse)