def replace_score(predictor_name = 'MaxEntropy'):
    '''
    Recompute and overwrite the stored ft_score / ft_senIdx dicts for every
    product returned by select_ft_score(), using the trained predictor named
    by predictor_name (default 'MaxEntropy').

    Fixes vs. original: the docstring used to sit AFTER the imports (so it was
    a dead string statement, not a docstring) and wrongly claimed the scores
    came from "vanderSentiment" regardless of predictor_name; a commented-out
    debug print was also removed.
    '''
    # Imports are function-local in this module's style; kept that way.
    from database import update_score_for_product_id, select_ft_score
    from predictor import loadTrainedPredictor
    from srs_local import get_ft_dicts_from_contents

    res = select_ft_score()
    # Load the predictor once, outside the per-product loop.
    predictor = loadTrainedPredictor(predictor_name)
    for r in res:
        product_id = r["product_id"]
        prod_contents = r["contents"]
        prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor)
        update_score_for_product_id(product_id, prod_ft_score_dict, prod_ft_senIdx_dict)
def calculate_ft_dict_for_all_products(predictor_name): predictor = loadTrainedPredictor() client, db = connect_to_db() db_product_collection = db.product_collection cursor = db_product_collection.find() i=0 for product in cursor: i+=1 product_id = product['product_id'] prod_contents = product['contents'] prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor) query = {"product_id": product_id} update_field = { "ft_score": prod_ft_score_dict, "ft_senIdx": prod_ft_senIdx_dict } db_product_collection.update(query, {"$set": update_field}, True) if i%10 == 0: print i client.close()
def fill_in_db(product_id, predictor_name = 'MaxEntropy', review_ratio_threshold = 0.8, scrape_time_limit = 30):
    '''
    Ensure the db has review contents and sentiment-feature scores for
    product_id, scraping Amazon for whatever is missing.

    Three paths:
      1. product not in db  -> scrape everything, score, upsert; False if no
         reviews were found.
      2. in db but with fewer than review_ratio_threshold * total reviews
         (and fewer than 100) -> scrape additional reviews and append.
      3. in db with enough reviews -> just fill in scores if they are missing.

    Returns True on success, False only when a brand-new product yields no
    reviews at all.
    '''
    # fetch product info from db
    prod_contents, prod_ft_score_dict, prod_ft_senIdx_dict = loadScraperDataFromDB(product_id)
    if len(prod_contents) == 0: # not in db yet
        print "{0} not in db, now scraping...".format(product_id)
        # scrape product info and review contents:
        amazonScraper = createAmazonScraper()
        product_name, prod_contents, prod_review_ids, prod_ratings, review_ending_sentence = scraper_main(amazonScraper, product_id, True, scrape_time_limit)
        prod_num_reviews, prod_category = scrape_num_review_and_category(product_id)
        # -1 signals the total-review count could not be scraped; fall back to
        # the number of review ids actually fetched.
        if prod_num_reviews == -1:
            prod_num_reviews = len(prod_review_ids)
        # classify, sentiment score
        predictor = loadTrainedPredictor(predictor_name)
        prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor)
        # insert new entry
        if len(prod_contents) > 0:
            upsert_contents_for_product_id(product_id, product_name, prod_contents, \
                prod_review_ids, prod_ratings, review_ending_sentence, prod_num_reviews, prod_category,\
                prod_ft_score_dict, prod_ft_senIdx_dict)
            return True
        else:
            print "Do not find reviews for %s" % product_id
            return False
    else:
        print "{0} already in db".format(product_id)
        # scrape for total number of review and category
        prod_num_reviews, prod_category = scrape_num_review_and_category(product_id)
        query_res = select_for_product_id(product_id)
        # Scrape failed (-1): trust the count already stored in the db.
        if prod_num_reviews == -1:
            prod_num_reviews = query_res[0]['num_reviews']
        num_review_db = len(query_res[0]["review_ids"])
        # The db can never hold more reviews than exist; keep the larger count.
        prod_num_reviews = max(prod_num_reviews, num_review_db)
        update_num_reviews_for_product_id(product_id, prod_num_reviews)
        # Re-scrape only when coverage is below threshold AND below the hard
        # cap of 100 stored reviews.
        if num_review_db < review_ratio_threshold * prod_num_reviews and num_review_db < 100:
            print "But not enough reviews in db, scrapping for more..."
            # scrape contents
            amazonScraper = createAmazonScraper()
            _, prod_contents_new, prod_review_ids, prod_ratings, review_ending_sentence = scraper_main(amazonScraper, product_id, True, scrape_time_limit)
            # classify, get sentiment score
            predictor = loadTrainedPredictor(predictor_name)
            if len(prod_contents_new) > 0:
                print "Filling scraped new reviews into db..."
                if len(prod_ft_score_dict) == 0 or len(prod_ft_senIdx_dict) == 0:
                    # No scores stored yet: score old + new contents from scratch.
                    prod_contents = prod_contents + prod_contents_new
                    prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor, start_idx = 0)
                else: # already has ft_scores calculated for previous contents:
                    # Score only the new contents, offsetting sentence indices
                    # past the already-scored ones.
                    start_idx = len(prod_contents)
                    prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents_new, predictor, start_idx = start_idx)
                # append new entry to existing entry
                update_contents_for_product_id(product_id, prod_contents_new, prod_review_ids, \
                    prod_ratings, review_ending_sentence, prod_category, \
                    prod_ft_score_dict, prod_ft_senIdx_dict)
                return True
            else:
                print "Do not find new reviews for %s" % product_id
                # No new reviews, but scores may still be missing for the
                # contents already in the db; fill them in if so.
                # (prod_contents_new is empty here, so the concatenation is a no-op.)
                if len(prod_ft_score_dict) == 0 or len(prod_ft_senIdx_dict) == 0:
                    prod_contents = prod_contents + prod_contents_new
                    prod_ft_score_dict, prod_ft_senIdx_dict = get_ft_dicts_from_contents(prod_contents, predictor)
                    update_score_for_product_id(product_id, prod_ft_score_dict, prod_ft_senIdx_dict)
                return True
        else:
            print "enough reviews in db, getting scores..."
            if len(prod_ft_score_dict) == 0 or len(prod_ft_senIdx_dict) == 0:
                ''' This only triggered if product review is loaded from file and not scraped directly '''
                # classify, sentiment score
                predictor = loadTrainedPredictor(predictor_name)
                prod_ft_score_dict, prod_ft_senIdx_dict = \
                    get_ft_dicts_from_contents(prod_contents, predictor)
                # update old entry
                # NOTE(review): this calls update_for_product_id while the
                # sibling branch calls update_score_for_product_id — confirm
                # these are two distinct db helpers and not a typo.
                update_for_product_id(product_id, prod_ft_score_dict, prod_ft_senIdx_dict)
                return True
            else:
                return True
def Word2Vec_Predictor_test(sentence_list, aspectPattern_names, criteria_for_choosing_class, similarity_measure, cp_threshold, ratio_threshold, lookup, Isprint=0): # load Word2Vec predictor and initilize p = loadTrainedPredictor('Word2Vec') model = p.model aspectPatterns = AspectPatterns(aspectPattern_names) static_aspect_list = p.static_aspects_all["static_aspect_list"] static_wordlist_vec = static_aspect_to_vec(p.static_aspects_all, model) static_aspect_list_show = p.static_aspects_all['static_aspect_list_show'] static_aspect_list_fortraining = p.static_aspects_all['static_aspect_list_fortraining'] num_useful = len(static_aspect_list_fortraining) static_aspect_list_lookup={static_aspect_list[i]:i for i in range(num_useful)} for i in range(num_useful,len(static_aspect_list)): static_aspect_list_lookup[static_aspect_list[i]]=num_useful static_aspect_list_lookup['useless']=num_useful # Initilize classification matrix classification_list=[] num_useful=len(static_aspect_list_fortraining) count=0 count_correct=0 count_useless_true=0 count_useless_prediction=0 count_useful_correct=0 correctness_matrix=np.zeros([num_useful+1,num_useful+1],dtype=np.int) classified_sentences_true=[[] for i in range(num_useful+1)] classified_sentences_predict=[[] for i in range(num_useful+1)] classified_sentences_matrix=[[[] for j in range(num_useful+1)] for i in range(num_useful+1)] # Classify each sentence predictions = [] for sentence in sentence_list: count=count+1 aspect_prediction =p.predict(sentence, criteria_for_choosing_class, similarity_measure, cp_threshold, ratio_threshold) predictions.append(aspect_prediction[0]) aspect_true=sentence.labeled_aspects classification=(aspect_true,aspect_prediction[0],sentence.word2vec_features_list,aspect_prediction[1][:3],sentence.content) #print classification classification_list.append(classification) correctness_matrix[static_aspect_list_lookup[aspect_true],static_aspect_list_lookup[aspect_prediction[0]]]+=1 
classified_sentences_true[static_aspect_list_lookup[aspect_true]].append(classification) classified_sentences_predict[static_aspect_list_lookup[aspect_prediction[0]]].append(classification) classified_sentences_matrix[static_aspect_list_lookup[aspect_true]][static_aspect_list_lookup[aspect_prediction[0]]].append(classification) count_useful_total=sum(sum(correctness_matrix[:num_useful,:])) count_useful_prediction_total=sum(sum(correctness_matrix[:,:num_useful])) count_useful_correct=sum(correctness_matrix.diagonal()[0:num_useful]) count_correct=sum(correctness_matrix.diagonal()) count_useless2useful=sum(sum(correctness_matrix[num_useful:num_useful+1,0:num_useful])) count_useful2useless=sum(sum(correctness_matrix[0:num_useful,num_useful:num_useful+1])) count_item_prediction=np.sum(correctness_matrix,axis=0) count_item_truelabel=np.sum(correctness_matrix,axis=1) correct_all=round(1.*count_correct/count,3) if abs(count_useful_prediction_total)>0.01: correct_useful_precision=round(1.*count_useful_correct/count_useful_prediction_total,3) else: correct_useful_precision=-1 correct_useful_recall=round(1.*count_useful_correct/count_useful_total,3) if Isprint==1: correct_item_precision=np.zeros(num_useful+1) for ii in range(num_useful+1): if abs(count_item_prediction[ii])>0.01: correct_item_precision[ii]=np.divide(1.*correctness_matrix.diagonal()[ii],count_item_prediction[ii]) else: correct_item_precision[ii]=-1 correct_item_recall=np.divide(1.*correctness_matrix.diagonal(),count_item_truelabel) print ((count,str(round(100.*correct_all,2))+'%'),(count_useful_total,str(round(100.*correct_useful_precision))+'%',str(round(100.*correct_useful_recall))+'%'),count_useless2useful,count_useful2useless) print static_aspect_list_show print correctness_matrix for i in range(len(classified_sentences_matrix[lookup[0]][lookup[1]])): print classified_sentences_matrix[lookup[0]][lookup[1]][i] print " " return np.array(predictions)