예제 #1
0
def main():
	"""Answer search queries read from stdin against a pre-built inverted index.

	Command-line options:
		-index     path to the inverted index file, loaded as one JSON document (required)
		-business  path to the Yelp business file, one JSON object per line
		           (defaults to the course data location)

	Every non-empty input line is one query:
		* a line starting with a double quote is run as a phrase query;
		* a line holding a single whitespace-separated term is a one-word query;
		* anything else is a free-text query.
	The matches are handed to rank() and the ranked result is printed.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('-index', required=True, help='Path to inverted index file')
	parser.add_argument('-business', required=False, help='Path to yelp business data json file', default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
	opts = parser.parse_args()

	# Pre-processing: load the whole index into memory.
	with open(opts.index, 'r') as f_index:
		print("loading index file...")
		wordsmap = json.load(f_index)
		print("done")

	# Per-business ranking attributes, keyed by the stringified line number of
	# the business record (presumably to match the string keys of the
	# JSON-loaded index -- verify against the index builder).
	b_map = {}
	print("loading business file...")
	with open(opts.business, 'r') as f_b:
		for line_num, line in enumerate(f_b):
			b_json = json.loads(line)
			b_map[str(line_num)] = {
				"business_id": b_json['business_id'],
				"review_count": int(b_json['review_count']),
				"stars": float(b_json['stars']),
			}
	print("done")

	tokenizer = Tokenizer()
	# TODO: malformed queries are still not validated beyond the checks below.

	# Read with readline() rather than iterating sys.stdin directly: Python 2's
	# file iterator reads ahead in large chunks, so interactive queries would
	# not be answered until the buffer fills or end-of-file is sent.
	for line in iter(sys.stdin.readline, ''):
		line = line.strip('\n')
		if len(line) == 0:
			continue

		if line[0] == '"':
			words = tokenizer.process_review(line.strip('"'))
			result = phrase_query(words, wordsmap)
		elif len(line.split()) == 1:
			words = tokenizer.process_review(line)
			if len(words) == 0:
				# The tokenizer dropped the only term; there is nothing to
				# look up, and words[0] would raise IndexError.
				continue
			result = one_word_query(words[0], wordsmap)
		else:
			words = tokenizer.process_review(line)
			result = free_text_query(words, wordsmap)

		rank_res = rank(words, result, b_map, wordsmap)
		print(rank_res)
예제 #2
0
def main():
	"""Build a positional inverted index from review data and write it as JSON.

	Command-line options (all required):
		-review_file    path to the review data, one JSON object per line
		-business_file  path to the business data, one JSON object per line
		-output         path of the index file to write

	The index has the shape
		{word: {business line number: {review line number: [positions]}}}
	where a position is the token's ordinal among the non-empty tokens of the
	review. The line numbers are ints in memory; json.dump writes them as
	string keys.

	Raises:
		KeyError: if a review refers to a business_id that is not present in
			the business file.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('-review_file', required=True, help='Path to review data')
	parser.add_argument('-business_file', required=True, help='Path to business data')
	parser.add_argument('-output', required=True, help='Path to output index file')
	opts = parser.parse_args()

	with open(opts.review_file, 'r') as f_reviews, open(opts.business_file, 'r') as f_business:
		# business_id -> line number of that business in the business file.
		b_map = {}
		for b_line_num, line in enumerate(f_business):
			b_obj = json.loads(line)
			b_map[b_obj['business_id']] = b_line_num

		tokenizer = Tokenizer()
		wordsmap = {}
		line_num = 0
		for line in f_reviews:
			r = json.loads(line)
			words = tokenizer.process_review(r['text'])
			# The business is the same for every token of the review.
			b_id = b_map[r['business_id']]
			w_idx = 0
			for w in words:
				if w == "":
					# Empty tokens are skipped and do not consume a position.
					continue
				# Look the business up under the word's own entry (not in the
				# top-level word map) and keep the result in its own name so
				# the business_id lookup table is never overwritten.
				review_positions = wordsmap.setdefault(w, {}).setdefault(b_id, {})
				review_positions.setdefault(line_num, []).append(w_idx)
				w_idx += 1
			line_num += 1
			# Progress indicator for long runs.
			if line_num % 1000 == 0:
				print(line_num)

	with open(opts.output, 'w') as f_out:
		json.dump(wordsmap, f_out)