import argparse
import json
import sys

# Query front end: loads the inverted index, then answers queries read from
# stdin. Tokenizer, one_word_query, phrase_query, free_text_query and rank
# are defined elsewhere in this project.

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-index', required=True,
                        help='Path to inverted index file')
    parser.add_argument('-business', required=False,
                        help='Path to yelp business data json file',
                        default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
    opts = parser.parse_args()

    # The index file is a single JSON object, so it can be loaded in one call.
    print "loading index file..."
    with open(opts.index, 'r') as f_index:
        wordsmap = json.load(f_index)
    print "done"

    # Map each business's line number (stringified, to match the index keys
    # after the JSON round trip) to the metadata used for ranking.
    print "loading business file..."
    b_map = {}
    with open(opts.business, 'r') as f_b:
        for line_num, line in enumerate(f_b):
            b_json = json.loads(line)
            b_map[str(line_num)] = {"business_id": b_json['business_id'],
                                    "review_count": int(b_json['review_count']),
                                    "stars": float(b_json['stars'])}
    print "done"

    tokenizer = Tokenizer()

    # TODO: validate malformed query input.
    # Read queries until EOF (Ctrl-D): a double-quoted line is a phrase
    # query, a single word is a one-word query, anything else is free text.
    for line in sys.stdin:
        line = line.strip('\n')
        if len(line) == 0:
            continue
        elif line[0] == '"':
            words = tokenizer.process_review(line.strip('"'))
            result = phrase_query(words, wordsmap)
        elif len(line.split()) == 1:
            words = tokenizer.process_review(line)
            result = one_word_query(words[0], wordsmap)
        else:
            words = tokenizer.process_review(line)
            result = free_text_query(words, wordsmap)
        rank_res = rank(words, result, b_map, wordsmap)
        print rank_res
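# The query helpers above are defined elsewhere in the project and are not
# shown here. A minimal sketch of two of them, assuming the positional index
# layout produced by the builder below
# (word -> {business line -> {review line -> [word positions]}}, with the
# numeric keys stringified by the JSON round trip):

def one_word_query(word, wordsmap):
    # Every (business line, review line) pair whose review contains the word.
    results = []
    for b_line, reviews in wordsmap.get(word, {}).items():
        for r_line, positions in reviews.items():
            results.append((b_line, r_line))
    return results

def phrase_query(words, wordsmap):
    # Keep only reviews where word i+1 occurs exactly one position after
    # word i; `ends` tracks where the phrase matched so far can end.
    if not words:
        return []
    candidates = {}
    for b_line, reviews in wordsmap.get(words[0], {}).items():
        for r_line, positions in reviews.items():
            candidates[(b_line, r_line)] = set(positions)
    for w in words[1:]:
        postings = wordsmap.get(w, {})
        survivors = {}
        for (b_line, r_line), ends in candidates.items():
            positions = postings.get(b_line, {}).get(r_line, [])
            hits = set(p for p in positions if p - 1 in ends)
            if hits:
                survivors[(b_line, r_line)] = hits
        candidates = survivors
    return list(candidates.keys())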
import argparse
import json

# Index builder: constructs a positional inverted index over the review file,
#   word -> {business line -> {review line -> [word positions]}}
# Tokenizer is defined elsewhere in this project.

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-review_file', required=True, help='Path to review data')
    parser.add_argument('-business_file', required=True, help='Path to business data')
    parser.add_argument('-output', required=True, help='Path to output index file')
    opts = parser.parse_args()

    # Map each business_id to its line number in the business file; the index
    # stores these compact line numbers instead of the long id strings.
    b_map = {}
    with open(opts.business_file, 'r') as f_business:
        for line_num, line in enumerate(f_business):
            b_obj = json.loads(line)
            b_map[b_obj['business_id']] = line_num

    tokenizer = Tokenizer()
    wordsmap = {}
    with open(opts.review_file, 'r') as f_reviews:
        line_num = 0
        for line in f_reviews:
            r = json.loads(line)
            b_id = b_map[r['business_id']]
            for w_idx, w in enumerate(tokenizer.process_review(r['text'])):
                if w == "":
                    continue
                # Record the word's position under its business and review.
                postings = wordsmap.setdefault(w, {}).setdefault(b_id, {})
                postings.setdefault(line_num, []).append(w_idx)
            line_num += 1
            if line_num % 1000 == 0:
                print line_num

    with open(opts.output, 'w') as f_out:
        json.dump(wordsmap, f_out)
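# Usage sketch, assuming hypothetical script names (build_index.py and
# query.py for the two mains above) and a review file named to mirror the
# business file from the dataset:
#
#   python build_index.py \
#       -review_file yelp_academic_dataset_review.json \
#       -business_file yelp_academic_dataset_business.json \
#       -output index.json
#   python query.py -index index.json \
#       -business yelp_academic_dataset_business.json
#
# Quick sanity check of the dumped index ('pizza' is just an example term).
# json.dump writes the integer business/review line numbers as strings,
# which is why the query front end keys b_map with str(line_num).
import json

with open('index.json', 'r') as f:
    idx = json.load(f)
for b_line, reviews in idx.get('pizza', {}).items():
    for r_line, positions in reviews.items():
        print b_line, r_line, positions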