Exemplo n.º 1
0
def generateExample(filename):
    """Yield a ``(tokens, stars)`` pair for each line read from *filename*.

    Every line produced by ``data.generateLine(filename)`` is decoded as
    JSON; the decoded object must provide the keys ``"text"`` and
    ``"stars"``.  The text is tokenized with ``tok.tokenize`` and the star
    rating is converted with ``int``.

    NOTE(review): ``tok`` is not created here -- it has to exist at module
    level before the generator is consumed.
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        # Tokenize first, then convert the rating (same order as before).
        yield tok.tokenize(record["text"]), int(record["stars"])
Exemplo n.º 2
0
def generateYelpUser(filename):
    """Yield ``(user_id, yelping_since, elite)`` for each line of *filename*.

    Lines come from ``data.generateLine(filename)`` and are decoded as JSON;
    the three values are returned exactly as stored in the decoded object
    (no conversion is applied).
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield record['user_id'], record['yelping_since'], record['elite']
Exemplo n.º 3
0
def generateYelpSentenceExample(filename):
    """Yield ``(tokens, stars)`` for each line read from *filename*.

    A case-folding ``Tokenizer`` (``preserve_case=False``) is built once,
    when iteration starts.  Each line from ``data.generateLine(filename)``
    is decoded as JSON, its ``'text'`` is passed through
    ``sentence_tokenize`` and its ``'stars'`` is converted with ``int``.
    """
    sentence_tok = Tokenizer(preserve_case=False)
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield sentence_tok.sentence_tokenize(record['text']), int(record['stars'])
Exemplo n.º 4
0
def generateYelpUser(filename):
    """Yield a ``(user_id, yelping_since, elite)`` tuple per input line.

    Each line obtained from ``data.generateLine(filename)`` is parsed with
    ``json.loads``; the listed keys are looked up in order and their values
    are yielded unchanged.
    """
    wanted = ('user_id', 'yelping_since', 'elite')
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield tuple(record[key] for key in wanted)
Exemplo n.º 5
0
def generateYelpReview(filename):
    """Yield ``(review_id, user_id, business_id, date)`` per input line.

    Each line from ``data.generateLine(filename)`` is decoded as JSON and
    the four identifying fields are yielded as stored, without conversion.
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield (
            record['review_id'],
            record['user_id'],
            record['business_id'],
            record['date'],
        )
Exemplo n.º 6
0
def generateYelpBusiness(filename):
    """Yield ``(business_id, longitude, latitude, stars, state)`` per line.

    Each line from ``data.generateLine(filename)`` is decoded as JSON.  All
    values are yielded as stored except ``state``, which has surrounding
    whitespace stripped.  Note the coordinate order: longitude comes first.
    """
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield (
            record['business_id'],
            record['longitude'],
            record['latitude'],
            record['stars'],
            record['state'].strip(),
        )
Exemplo n.º 7
0
def generateYelpReview(filename):
    """Yield a ``(review_id, user_id, business_id, date)`` tuple per line.

    Lines are pulled from ``data.generateLine(filename)`` and parsed with
    ``json.loads``; the listed keys are looked up in order and their values
    are yielded unchanged.
    """
    wanted = ('review_id', 'user_id', 'business_id', 'date')
    for raw_line in data.generateLine(filename):
        record = json.loads(raw_line)
        yield tuple(record[key] for key in wanted)
Exemplo n.º 8
0
def generateYelpBusiness(filename):
    """Yield ``(business_id, lng, lat, stars, state)`` for each input line.

    Every line supplied by ``data.generateLine(filename)`` is parsed with
    ``json.loads``.  The state value is stripped of leading and trailing
    whitespace; the remaining fields are passed through untouched.
    """
    for raw_line in data.generateLine(filename):
        entry = json.loads(raw_line)
        identifier = entry['business_id']
        coordinates = (entry['longitude'], entry['latitude'])
        rating = entry['stars']
        region = entry['state'].strip()
        yield (identifier,) + coordinates + (rating, region)
# number of reviews a token has to appear to be kept
hardthreshold = 2

print "> Loading data"
alltoken = data.loadFile(root + '/computed/alltoken.pkl')

print "> Scanning data"
print "Loading file", filename

reviews_feature = dict()
reviews_score = dict()

tok = Tokenizer(preserve_case=True)
# extracting tokens
for line in data.generateLine(filename):
  review = json.loads(line)
  reviewid = review['review_id']
  text = tok.ngrams(review['text'], 1, 3)
  score = int(review['stars'])
 
  # filtering tokens by the ones in the model
  text = filter(lambda k: k in alltoken, text)
  reviews_feature[reviewid] = Counter(text)
  reviews_score[reviewid] = score

print "> End of full scan"

print "> Saving"
data.saveFile(reviews_feature, root + "/computed/reviews_feature.pkl")
data.saveFile(reviews_score, root + "/computed/reviews_score.pkl")