def generateExample(filename):
    """Yield a ``(tokens, stars)`` pair for every line of *filename*.

    Each line produced by ``data.generateLine(filename)`` is decoded as
    JSON. The record's ``"text"`` field is passed to ``tok.tokenize`` and
    its ``"stars"`` field is converted to ``int``.

    NOTE: ``tok`` is not created in this function; it must already exist
    at module level by the time the generator is iterated.
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield tok.tokenize(record["text"]), int(record["stars"])
def generateYelpUser(filename):
    """Yield ``(user_id, yelping_since, elite)`` for every line of *filename*.

    Each line produced by ``data.generateLine(filename)`` is decoded as
    JSON and the three fields are returned unchanged, in that order.
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield record['user_id'], record['yelping_since'], record['elite']
def generateYelpSentenceExample(filename):
    """Yield a ``(sentence_tokens, stars)`` pair for every line of *filename*.

    A single ``Tokenizer(preserve_case=False)`` is built when iteration
    starts and reused for all records. Each line produced by
    ``data.generateLine(filename)`` is decoded as JSON; its ``'text'``
    field goes through ``sentence_tokenize`` and its ``'stars'`` field is
    converted to ``int``.
    """
    sentence_tokenizer = Tokenizer(preserve_case=False)
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        sentences = sentence_tokenizer.sentence_tokenize(record['text'])
        yield sentences, int(record['stars'])
def generateYelpReview(filename):
    """Yield ``(review_id, user_id, business_id, date)`` per line of *filename*.

    Each line produced by ``data.generateLine(filename)`` is decoded as
    JSON and the four fields are returned unchanged, in that order.
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield (
            record['review_id'],
            record['user_id'],
            record['business_id'],
            record['date'],
        )
def generateYelpBusiness(filename):
    """Yield ``(business_id, lng, lat, stars, state)`` per line of *filename*.

    Each line produced by ``data.generateLine(filename)`` is decoded as
    JSON. ``lng`` and ``lat`` come from the ``'longitude'`` and
    ``'latitude'`` fields; ``state`` has surrounding whitespace stripped.
    The other values are returned unchanged.
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield (
            record['business_id'],
            record['longitude'],
            record['latitude'],
            record['stars'],
            record['state'].strip(),
        )
# Script section: builds per-review n-gram counts and star scores, then
# saves both mappings under root + "/computed/".
# `filename` and `root` are not assigned here; they must already be defined
# before this section runs.

# number of reviews a token has to appear to be kept
# NOTE(review): not referenced anywhere in this section - confirm it is
# still needed elsewhere.
hardthreshold = 2

# Single-argument, parenthesised prints give the same output on Python 2
# and Python 3, unlike the print statement, which is Python 2 only.
print("> Loading data")
alltoken = data.loadFile(root + '/computed/alltoken.pkl')

print("> Scanning data")
# A one-element tuple keeps the formatting safe even if filename is a tuple.
print("Loading file %s" % (filename,))
reviews_feature = dict()
reviews_score = dict()
tok = Tokenizer(preserve_case=True)

# extracting tokens
for line in data.generateLine(filename):
    review = json.loads(line)
    reviewid = review['review_id']
    ngrams = tok.ngrams(review['text'], 1, 3)
    score = int(review['stars'])

    # Count only the n-grams that are present in the model vocabulary.
    # NOTE(review): the cost of `in alltoken` depends on the type that
    # data.loadFile returns (set/dict vs. list) - confirm.
    reviews_feature[reviewid] = Counter(
        gram for gram in ngrams if gram in alltoken
    )
    reviews_score[reviewid] = score

print("> End of full scan")
print("> Saving")
data.saveFile(reviews_feature, root + "/computed/reviews_feature.pkl")
data.saveFile(reviews_score, root + "/computed/reviews_score.pkl")