def count_vote_dist():
    """Bucket Amazon review `total_vote` counts into fixed-width bins,
    dump the histogram to JSON and show a seaborn distribution plot.

    Side effects: writes <PROJECT_PATH>/data/amazon_data/vote_counts.json
    and opens a matplotlib window.
    """
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    delta = 2
    # 1000 half-open buckets: [0, 2), [2, 4), ...
    x_list = [(i * delta, (i + 1) * delta) for i in range(1000)]
    y_list = []
    xx = []
    for lo, hi in x_list:
        try:
            y_list.append(db_inst.find({"total_vote": {"$gte": lo, "$lt": hi}}).count())
            xx.append(lo)
            print(y_list[-1])
        except Exception as e:
            # narrowed from a bare except; a failed query counts as an empty bucket
            print(e)
            xx.append(lo)
            y_list.append(0)
    # overflow bucket: everything beyond the last bin edge
    y_list.append(db_inst.find({"total_vote": {"$gt": x_list[-1][1]}}).count())
    xx.append(xx[-1] + 1)
    res = {"x": x_list, 'y': y_list}
    # 'with' guarantees the file is flushed and closed (original leaked the handle)
    with open('%s/data/amazon_data/%s' % (PROJECT_PATH, 'vote_counts.json'), 'w') as fout:
        fout.write(json.dumps(res))
    sns.distplot(y_list)
    plt.show()
def structure_data():
    """Normalize the scenic-point records stored in the DB.

    For every document in ProjectNavi.ScenicPoins, parse the free-text
    open/visit times into structured values, attach them under
    'structured_infos', stamp an md5 dedup key, and write the document back.

    :return: None
    """
    db_sp = get_db_inst('ProjectNavi', 'ScenicPoins')
    find_result = db_sp.find()
    for item in find_result:
        title = item['title']
        city = item['city']
        addr = item['address']
        # md5 over city+title+address acts as a stable dedup key;
        # explicit encode — md5 of a unicode string raises/garbles otherwise
        md5 = hashlib.md5((city + title + addr).encode('utf-8')).hexdigest()
        open_time = item['infos'].get('open_time', None)
        structured_open_time = analyze_open_time(open_time)
        visit_time = item['infos'].get('visit_time', None)
        structured_visit_time = analyze_visit_time(visit_time)
        print(title)
        print('%s %s' % (open_time, structured_open_time))
        print('%s %s' % (visit_time, structured_visit_time))
        # assumes analyze_open_time returns a (start, end) pair — TODO confirm
        item['structured_infos'] = {
            'open_time': {"start": structured_open_time[0], 'end': structured_open_time[1]},
            'visit_time': structured_visit_time,
        }
        item['md5'] = md5
        try:
            db_sp.update({'title': title, 'city': city}, item)
        except Exception as e:
            print(e)
def _create_scenic_point_by_title(title):
    """Build a ScenicPoint instance from the DB document matching *title*.

    :param title: scenic point title to look up
    :return: ScenicPoint wrapping the first matching document
    """
    collection = get_db_inst('ProjectNavi', 'ScenicPoins')
    return ScenicPoint(collection.find_one({'title': title}))
def handle_amazon_result(fin_path):
    """Parse an amazon evaluation result file and load per-item stats into mongo.

    Each input line is tab-separated "label: value" fields in the order:
    itemID, total reviews, oprank_errors, textrank_errors,
    sum_oprank_errors, sum_textrank_errors.

    :param fin_path: path to the tab-separated result file
    :return: None (inserts into AmazonReviews.AndroidAPP_result)
    """
    import ast  # literal_eval: safe replacement for eval() on file-sourced text
    itemlist = []
    with open(fin_path, 'r') as fin:
        for line in fin:
            splits = line.split('\t')
            itemlist.append({
                'item_id': splits[0].replace('itemID: ', ''),
                'total_reviews': ast.literal_eval(splits[1].replace('total reviews: ', '')),
                'oprank_errors': ast.literal_eval(splits[2].replace('oprank_errors: ', '')),
                'textrank_errors': ast.literal_eval(splits[3].replace('textrank_errors: ', '')),
                'sum_oprank_errors': ast.literal_eval(splits[4].replace('sum_oprank_errors: ', '')),
                'sum_textrank_errors': ast.literal_eval(splits[5].replace('sum_textrank_errors: ', '')),
            })
    db_result = get_db_inst('AmazonReviews', 'AndroidAPP_result')
    for item in itemlist:
        # only the per-item error fields are persisted; the sum_* fields stay local
        db_result.insert({'itemID': item['item_id'],
                          'total_reviews': item['total_reviews'],
                          'oprank_errors': item['oprank_errors'],
                          'textrank_errors': item['textrank_errors']})
    print('handled!')
def get_city_scenic_points(city_name):
    """Crawl every scenic point of *city_name* from mafengwo and store it.

    Iterates query_by_name results, fetches detail pages, inserts each
    record into ProjectNavi.ScenicPoins, and sleeps a random 0-2s between
    requests to throttle the crawl.

    :param city_name: city to crawl
    :return: None
    """
    db_sp = get_db_inst('ProjectNavi', 'ScenicPoins')
    for info in query_by_name(city_name):
        print(info['name'])
        url = info['url']
        print(url)
        try:
            scenic_info = get_details(url)
            scenic_info['city'] = city_name
            db_sp.insert(scenic_info)
            api_logger.info('%s %s inserted' % (city_name, scenic_info['title']))
        except Exception as e:
            # best-effort crawl: log the failure and keep going
            print(e)
        t = random.random() * 2
        print('sleep %s seconds' % t)
        time.sleep(t)
def amazon_preprocess(start=0, end=10, label_rate=0.65, min_vote=0):
    """Build labeled SentenceNode groups from amazon reviews.

    Reads a shuffled slice [start:end) of the asin list, pulls each app's
    reviews with total_vote > min_vote, scores every review with a
    harmonic-mean vote rank, labels it 1/0 against *label_rate*, and
    normalizes the resulting sentence-node vectors.

    :param start: slice start into the shuffled asin list
    :param end: slice end into the shuffled asin list
    :param label_rate: vote-rank threshold separating label 1 from label 0
    :param min_vote: only reviews with total_vote strictly above this are used
    :return: (veclist, sentlist, labellist, tokenlist, nodelist, manager_groups)
    """
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    manager_groups = {}
    # 'with' guarantees the asin file is closed (original leaked the handle)
    with open('%s/process/data/asin.list' % PROJECT_PATH, 'r') as asin_file:
        lines = asin_file.readlines()
    shuffle(lines)
    tlines = lines[start:end]
    review_dicts = {}
    asin_list = []
    for asin in tlines:
        asin = asin.replace('\n', '')
        asin_list.append(asin)
        print('loading %s' % asin)
        # collect this app's reviews and the max total_vote for normalization
        a_reviews = []
        max_vote = 0
        for find_item in db_inst.find({"asin": asin, 'total_vote': {"$gt": min_vote}}):
            max_vote = max(find_item['total_vote'], max_vote)
            a_reviews.append(find_item)
        review_rank = []
        print('%s has %s reviews' % (asin, len(a_reviews)))
        snm = SentenceNodeManager()
        for review in a_reviews:
            alpha_const = 0
            # guard against max_vote == 0 (possible if min_vote < 0 admits zero-vote reviews)
            T = float(review['total_vote']) / max_vote if max_vote else 0.0
            # sigmoid over net up-votes; always in (0, 1)
            V = 1 / (1.0 + math.exp(-0.01 * (2 * review['up_vote'] - review['total_vote'])))
            # harmonic-mean style combination of volume (T) and quality (V)
            vote_rank_value = 2 * (T + alpha_const) * (V + alpha_const) / (T + V + 2 * alpha_const)
            # single add_node call replaces the duplicated >=/< branches
            label = 1 if vote_rank_value >= label_rate else 0
            snm.add_node(SentenceNode(review['reviewText'].lower(),
                                      extra=(label, vote_rank_value, review['reviewerID']),
                                      get_pos_func=tag_sents,
                                      get_keywords_func=cal_en_tfidf))
            review_rank.append((review, vote_rank_value))
        manager_groups[asin] = snm
        review_dicts[asin] = review_rank
    veclist = []
    sentlist = []
    labellist = []
    tokenlist = []
    nodelist = []
    group_nodelist = []
    print('start normalizing vecs')
    for pid, manager in manager_groups.items():
        manager.normalize_all_sentnodes(tfidf_func=tag_sents)
        veclist.extend(manager.get_vec_list())
        sentlist.extend(manager.get_sent_list())
        gnodelist = []
        for node in manager.node_list:
            labellist.append(node.extra[0])
            tokenlist.append(node.feature2token())
            nodelist.append(node)
            gnodelist.append(node)
        group_nodelist.append(gnodelist)
    print('end normalizing vecs')
    return veclist, sentlist, labellist, tokenlist, nodelist, manager_groups
def save_list2mongo(item_list, db_name, collection_name):
    """Persist *item_list* into the given mongo collection in one bulk call.

    :param item_list: list of documents to insert
    :param db_name: target database name
    :param collection_name: target collection name
    :return: None
    """
    get_db_inst(db_name, collection_name).insert_many(item_list)