def replace_score(predictor_name = 'MaxEntropy'):
    '''
    Replace all sentiment scores stored in the db with scores computed by
    the predictor named predictor_name (default: MaxEntropy).
    '''
    from database import update_score_for_product_id, select_ft_score
    from predictor import loadTrainedPredictor
    from srs_local import get_ft_dicts_from_contents
    res = select_ft_score()
    predictor = loadTrainedPredictor(predictor_name)
    for r in res: 
        product_id = r["product_id"]
        prod_contents = r["contents"]
        prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor)
        update_score_for_product_id(product_id, prod_ft_score_dict, prod_ft_senIdx_dict)
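
# Usage sketch (hedged): recompute every stored ft_score/ft_senIdx pair with
# the default MaxEntropy predictor. Assumes database.py, predictor.py and
# srs_local.py are importable and the backing db is reachable.
#
#   replace_score('MaxEntropy')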
def calculate_ft_dict_for_all_products(predictor_name):
	from database import connect_to_db  # assumed to live alongside the other db helpers
	from predictor import loadTrainedPredictor
	from srs_local import get_ft_dicts_from_contents
	predictor = loadTrainedPredictor(predictor_name)
	client, db = connect_to_db()
	db_product_collection = db.product_collection
	cursor = db_product_collection.find()
	
	i=0
	for product in cursor:
		i+=1
		product_id = product['product_id']
		prod_contents = product['contents']
		prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor)
		query = {"product_id": product_id}
		update_field = {
			"ft_score": prod_ft_score_dict,
			"ft_senIdx": prod_ft_senIdx_dict
		}
		# upsert (old pymongo API): insert the product if no document matches
		db_product_collection.update(query, {"$set": update_field}, upsert=True)
		if i % 10 == 0:
			print i  # progress indicator

	client.close()
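
# Usage sketch (hedged): recompute ft_score/ft_senIdx for every product in the
# collection. Assumes a running MongoDB with a product_collection and a
# trained model on disk for the named predictor.
#
#   calculate_ft_dict_for_all_products('MaxEntropy')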
def fill_in_db(product_id, predictor_name = 'MaxEntropy', review_ratio_threshold = 0.8, scrape_time_limit = 30):
	'''
	Ensure product_id has review contents and sentiment scores in the db:
	scrape the product if it is absent, scrape additional reviews when fewer
	than review_ratio_threshold of its total reviews (and fewer than 100) are
	stored, and backfill missing ft_score/ft_senIdx dicts. Returns True on
	success, False if no reviews could be found.
	'''
	# fetch product info from db
	prod_contents, prod_ft_score_dict, prod_ft_senIdx_dict = loadScraperDataFromDB(product_id)

	if len(prod_contents) == 0: # not in db yet
		print "{0} not in db, now scraping...".format(product_id)
		# scrape product info and review contents:
		amazonScraper = createAmazonScraper()
		product_name, prod_contents, prod_review_ids, prod_ratings, review_ending_sentence = scraper_main(amazonScraper, product_id, True, scrape_time_limit)
		prod_num_reviews, prod_category = scrape_num_review_and_category(product_id)
		if prod_num_reviews == -1:
			prod_num_reviews = len(prod_review_ids)

		# classify, sentiment score
		predictor = loadTrainedPredictor(predictor_name)
		prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor)
		
		# insert new entry
		if len(prod_contents) > 0:
			upsert_contents_for_product_id(product_id, product_name, prod_contents, \
				prod_review_ids, prod_ratings, review_ending_sentence, prod_num_reviews, prod_category,\
				prod_ft_score_dict, prod_ft_senIdx_dict)
			return True
		else:
			print "Do not find reviews for %s" % product_id
			return False

	else:

		print "{0} already in db".format(product_id)
		# scrape for total number of review and category
		prod_num_reviews, prod_category = scrape_num_review_and_category(product_id)
		query_res = select_for_product_id(product_id)
		if prod_num_reviews == -1:
			prod_num_reviews = query_res[0]['num_reviews']
		num_review_db = len(query_res[0]["review_ids"])
		prod_num_reviews = max(prod_num_reviews, num_review_db)
		update_num_reviews_for_product_id(product_id, prod_num_reviews)


		if num_review_db < review_ratio_threshold * prod_num_reviews and num_review_db < 100: 
			print "But not enough reviews in db, scrapping for more..."
			# scrape contents
			amazonScraper = createAmazonScraper()
			_, prod_contents_new, prod_review_ids, prod_ratings, review_ending_sentence = scraper_main(amazonScraper, product_id, True, scrape_time_limit)		

			# classify, get sentiment score
			predictor = loadTrainedPredictor(predictor_name)
			if len(prod_contents_new) > 0:
				print "Filling scraped new reviews into db..."
				if len(prod_ft_score_dict) == 0 or len(prod_ft_senIdx_dict) == 0:
					# no scores stored yet: score old and new contents together from index 0
					prod_contents = prod_contents + prod_contents_new
					prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor, start_idx = 0)
				else: # scores already exist for the previous contents:
					# only score the new reviews, offsetting indices past the existing ones
					start_idx = len(prod_contents)
					prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents_new, predictor, start_idx = start_idx)
				
				# append new entry to existing entry
				update_contents_for_product_id(product_id, prod_contents_new, prod_review_ids, \
					prod_ratings, review_ending_sentence, prod_category, \
					prod_ft_score_dict, prod_ft_senIdx_dict)
				return True

			else:
				print "No new reviews found for %s" % product_id
				if len(prod_ft_score_dict) == 0 or len(prod_ft_senIdx_dict) == 0:
					# prod_contents_new is empty here, so score the existing contents
					prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor)

					update_score_for_product_id(product_id, prod_ft_score_dict, prod_ft_senIdx_dict)

				return True

		else:
			print "enough reviews in db, getting scores..."
			if len(prod_ft_score_dict) == 0 or len(prod_ft_senIdx_dict) == 0:
				# only triggered when the product reviews were loaded from file
				# rather than scraped directly; classify and compute sentiment scores
				predictor = loadTrainedPredictor(predictor_name)
				prod_ft_score_dict, prod_ft_senIdx_dict = \
					get_ft_dicts_from_contents(prod_contents, predictor)
				
				# update old entry
				update_score_for_product_id(product_id, prod_ft_score_dict, prod_ft_senIdx_dict)

			return True
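
# Usage sketch (hedged): ensure one product's reviews and scores are present,
# scraping Amazon when needed. The ASIN below is a placeholder.
#
#   ok = fill_in_db('B00EXAMPLE', predictor_name = 'MaxEntropy')
#   if not ok:
#       print "no reviews found for this product"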
def Word2Vec_Predictor_test(sentence_list, aspectPattern_names, criteria_for_choosing_class, similarity_measure, cp_threshold, ratio_threshold, lookup, Isprint=0):
	# load the Word2Vec predictor and initialize
	p = loadTrainedPredictor('Word2Vec')
	model = p.model
	aspectPatterns = AspectPatterns(aspectPattern_names)
	static_aspect_list = p.static_aspects_all["static_aspect_list"]
	static_wordlist_vec = static_aspect_to_vec(p.static_aspects_all, model)
	static_aspect_list_show = p.static_aspects_all['static_aspect_list_show']
	static_aspect_list_fortraining = p.static_aspects_all['static_aspect_list_fortraining']
	num_useful = len(static_aspect_list_fortraining)
	# map each training aspect to its index; all remaining aspects and the
	# explicit 'useless' label share the catch-all index num_useful
	static_aspect_list_lookup = {static_aspect_list[i]: i for i in range(num_useful)}
	for i in range(num_useful, len(static_aspect_list)):
		static_aspect_list_lookup[static_aspect_list[i]] = num_useful
	static_aspect_list_lookup['useless'] = num_useful

	# Initialize classification bookkeeping (num_useful was computed above)
	classification_list = []
	count = 0
	count_correct = 0
	count_useless_true = 0
	count_useless_prediction = 0
	count_useful_correct = 0
	# confusion matrix: rows are true labels, columns are predictions
	correctness_matrix = np.zeros([num_useful+1, num_useful+1], dtype=int)
	classified_sentences_true = [[] for i in range(num_useful+1)]
	classified_sentences_predict = [[] for i in range(num_useful+1)]
	classified_sentences_matrix = [[[] for j in range(num_useful+1)] for i in range(num_useful+1)]

	# Classify each sentence
	predictions = []
	for sentence in sentence_list:
		count=count+1
		aspect_prediction = p.predict(sentence, criteria_for_choosing_class, similarity_measure, cp_threshold, ratio_threshold)

		predictions.append(aspect_prediction[0])
		
		aspect_true=sentence.labeled_aspects
		classification=(aspect_true,aspect_prediction[0],sentence.word2vec_features_list,aspect_prediction[1][:3],sentence.content)

		#print classification
		classification_list.append(classification)
		correctness_matrix[static_aspect_list_lookup[aspect_true],static_aspect_list_lookup[aspect_prediction[0]]]+=1
		classified_sentences_true[static_aspect_list_lookup[aspect_true]].append(classification)
		classified_sentences_predict[static_aspect_list_lookup[aspect_prediction[0]]].append(classification)
		classified_sentences_matrix[static_aspect_list_lookup[aspect_true]][static_aspect_list_lookup[aspect_prediction[0]]].append(classification)

	# aggregate counts from the confusion matrix (rows = true, columns = predicted);
	# the first num_useful indices are the useful aspect classes
	count_useful_total = sum(sum(correctness_matrix[:num_useful, :]))
	count_useful_prediction_total = sum(sum(correctness_matrix[:, :num_useful]))
	count_useful_correct = sum(correctness_matrix.diagonal()[0:num_useful])
	count_correct = sum(correctness_matrix.diagonal())
	count_useless2useful = sum(sum(correctness_matrix[num_useful:num_useful+1, 0:num_useful]))
	count_useful2useless = sum(sum(correctness_matrix[0:num_useful, num_useful:num_useful+1]))
	count_item_prediction = np.sum(correctness_matrix, axis=0)
	count_item_truelabel = np.sum(correctness_matrix, axis=1)
	correct_all = round(1.*count_correct/count, 3)
	# precision/recall over the useful classes, guarding against zero predictions
	if abs(count_useful_prediction_total) > 0.01:
		correct_useful_precision = round(1.*count_useful_correct/count_useful_prediction_total, 3)
	else:
		correct_useful_precision = -1
	correct_useful_recall = round(1.*count_useful_correct/count_useful_total, 3)
	if Isprint == 1:
		# per-class precision; -1 marks classes that were never predicted
		correct_item_precision = np.zeros(num_useful+1)
		for ii in range(num_useful+1):
			if abs(count_item_prediction[ii]) > 0.01:
				correct_item_precision[ii] = np.divide(1.*correctness_matrix.diagonal()[ii], count_item_prediction[ii])
			else:
				correct_item_precision[ii] = -1
		correct_item_recall = np.divide(1.*correctness_matrix.diagonal(), count_item_truelabel)
		print ((count,str(round(100.*correct_all,2))+'%'),(count_useful_total,str(round(100.*correct_useful_precision))+'%',str(round(100.*correct_useful_recall))+'%'),count_useless2useful,count_useful2useless)
		print static_aspect_list_show
		print correctness_matrix
		for i in range(len(classified_sentences_matrix[lookup[0]][lookup[1]])):
			print classified_sentences_matrix[lookup[0]][lookup[1]][i]
			print " "
	return np.array(predictions)
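
# Usage sketch (hedged): evaluate the trained Word2Vec predictor on a list of
# labeled sentence objects (each exposing labeled_aspects,
# word2vec_features_list and content, as used above). All argument values
# below are placeholders, not values from the original code.
#
#   predictions = Word2Vec_Predictor_test(labeled_sentences, aspectPattern_names,
#       criteria_for_choosing_class, similarity_measure, cp_threshold=0.5,
#       ratio_threshold=1.2, lookup=(0, 1), Isprint=1)
#   # lookup=(true_idx, predicted_idx) selects which confusion-matrix cell's
#   # sentences get printed when Isprint == 1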