示例#1
0
# Tools
from utils import disp, data

# ML
from collections import Counter
from math import log

# Announce the loading phase on stdout.
print "> Loading data"

# Base path used as the prefix of the pickled input files loaded below.
root = data.getParent(__file__)

# Pre-computed inputs. reviews_feature is indexed first by review and then by
# token further down in this script; alltoken is loaded here but is not used
# in the part of the script visible in this excerpt.
alltoken = data.loadFile(root + '/computed/alltoken.pkl')
reviews_feature = data.loadFile(root + '/computed/reviews_feature.pkl')

# Number of entries in reviews_feature.
n = len(reviews_feature)

print "Total reviews:", n

# TF-IDF
print "> Computing TF"
TF = dict()
i = 0
for review in reviews_feature:
  i += 1
  disp.tempPrint(str(i))
  TF[review] = Counter()
  for token in reviews_feature[review]:
    TF[review][token] = float(reviews_feature[review][token]) / float(max(reviews_feature[review].values()))

print "> Computing IDF"
# Starts empty; the code that fills it is not visible in this excerpt.
# NOTE(review): presumably maps each token to its inverse document
# frequency - confirm against the rest of the script.
IDF = dict()
"""
    This script looks at the distribution of the number of reviews per category, to decide which category to use for the per-category sLDA.
"""

from __future__ import print_function
import json
from utils import tokenizer, disp, data
from collections import Counter
import numpy as np

""" Files & Folders Parameters """
# Base path prefix; categories_info() appends "/dataset/..." to it to locate
# the business JSON file.
root = data.getParent("")

def categories_info():
    filepath = root + "/dataset/yelp_academic_dataset_business.json"

    """ Generate the count of reviews per category """
    business_file = open(filepath);
    lines_file = business_file.readlines();
    business_file.close();

    business_by_category = dict();
    categories_business_counts = Counter();
    categories_reviews_counts = Counter();

    for line_json in lines_file:
        business_dict = json.loads(line_json);
        business_id = business_dict["business_id"];
        categories_list = business_dict["categories"];

        for category in categories_list: