def save2opinion_corpus(task_id, opinion_results):
    """Persist opinion-corpus results for *task_id* and mark the task computed.

    Writes the JSON-serialized results to the intel corpus index, then sets
    compute_status = 3 on the matching writing-task document.
    """
    corpus_doc = {
        'task_id': task_id,
        'corpus_results': json.dumps(opinion_results),
    }
    es_intel.index(index=opinion_corpus_results_index_name,
                   doc_type=opinion_corpus_results_index_type,
                   id=task_id, body=corpus_doc)
    # Corpus saved -- advance the task's compute status.
    status_patch = {'compute_status': 3}
    es_xnr.update(index=writing_task_index_name,
                  doc_type=writing_task_index_type,
                  id=task_id, body={'doc': status_patch})
def save2models_text(task_id, model_text_dict): item_exist = dict() item_exist['task_id'] = task_id item_exist['model_text_pos'] = model_text_dict['model_text_pos'] item_exist['model_text_neg'] = model_text_dict['model_text_neg'] item_exist['model_text_news'] = model_text_dict['model_text_news'] # 保存智能发帖模板文本结果 print 'item_exist...', item_exist es_intel.index(index=intel_models_text_index_name,doc_type=intel_models_text_index_type,\ id=task_id,body=item_exist) item_task = dict() item_task['compute_status'] = 2 ## 保存智能发帖模板文本结果,更新计算状态 es_xnr.update(index=writing_task_index_name,doc_type=writing_task_index_type,\ id=task_id, body={'doc':{'compute_status':2}})
def save_intelligent_opinion_results(task_id, sub_opinion_results, summary, intel_type):
    """Persist sub-opinion clustering results and mark the task computed.

    Stores the JSON-serialized sub-opinion tweets and their summary in the
    intel opinion-results index (doc_type selected by *intel_type*), then
    sets compute_status = 2 on the matching writing-task document.

    :param task_id: id of the writing task (also the ES doc id)
    :param sub_opinion_results: clustering result, JSON-serializable
    :param summary: summary text for the sub-opinions
    :param intel_type: ES doc_type to index the results under
    :returns: True on success, False if indexing/updating raised
    """
    mark = True
    try:
        item_exist = {
            'task_id': task_id,
            'subopinion_tweets': json.dumps(sub_opinion_results),
            'summary': summary,
        }
        # Save the sub-opinion results.
        es_intel.index(index=intel_opinion_results_index_name,
                       doc_type=intel_type,
                       id=task_id, body=item_exist)
        # Results saved -- advance the task's compute status.
        es_xnr.update(index=writing_task_index_name,
                      doc_type=writing_task_index_type,
                      id=task_id, body={'doc': {'compute_status': 2}})
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed; serialization/ES failures still yield False.
        mark = False
    return mark
def news_comments_list(task_source, taskid, weibo_list, cluster_num=-1, cluster_eva_min_size=default_cluster_eva_min_size, vsm=default_vsm, calculation_label=1): # weibo_list: the weibo texts to process
    """Cluster weibo texts, compute pie-chart ratios, deduplicate per
    cluster/sentiment, index the cluster summary into ES, and return the
    features + deduplicated clusters as a JSON string.
    """
    print 'weibo_list..len...', len(weibo_list)
    # NOTE(review): `params` is built but never used below -- candidate for removal.
    params = {"taskid": taskid, "cluster_num": cluster_num, "cluster_eva_min_size": cluster_eva_min_size, \
        "vsm": vsm, "calculation_label": calculation_label}
    comments = weibo_list
    logfile = os.path.join(LOG_FOLDER, taskid + '.log')
    # Run the clustering/sentiment pipeline; result carries cluster features
    # and per-item labels (clusterid, sentiment, ad_label, duplicate info).
    cal_results = comments_calculation_v2(comments, logfile=logfile, cluster_num=cluster_num, \
        cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    #print cal_results
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']
    cluster_ratio = dict()        # clusterid -> item count
    senti_ratio = dict()          # sentiment -> item count
    sentiment_results = dict()    # sentiment -> [comments]
    cluster_results = dict()      # clusterid -> [comments]
    rub_results = []              # 'nonsense' (junk) comments
    # text count before filtering
    before_filter_count = len(item_infos)
    # text count after filtering
    after_filter_count = 0
    download_items = []
    for comment in item_infos:
        # print comment
        download_item = {}
        download_item["id"] = comment["id"]
        download_item["title"] = comment["title"]
        download_item["text"] = comment["text"]
        # download_item["timestamp"] = comment["timestamp"]
        download_item["datetime"] = comment["datetime"]
        download_item["clusterid"] = comment["clusterid"]
        download_item["sentiment"] = comment["sentiment"]
        download_item["ad_label"] = comment["ad_label"]
        # Duplicate info only exists for items assigned to a real cluster
        # (clusterids prefixed 'nonsense' or equal to 'other' carry none).
        if (comment["clusterid"][:8] != 'nonsense') and (comment["clusterid"] != 'other'):
            download_item["duplicate"] = comment["duplicate"]
            download_item["same_from"] = comment["same_from"]
        download_items.append(download_item)
        # Tally cluster membership, skipping junk clusters.
        if ('clusterid' in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']
            try:
                cluster_ratio[clusterid] += 1
            except KeyError:
                cluster_ratio[clusterid] = 1
            try:
                cluster_results[clusterid].append(comment)
            except KeyError:
                cluster_results[clusterid] = [comment]
        # Tally sentiment for non-junk items with a known sentiment label.
        if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']
            try:
                senti_ratio[sentiment] += 1
            except KeyError:
                senti_ratio[sentiment] = 1
            try:
                sentiment_results[sentiment].append(comment)
            except KeyError:
                sentiment_results[sentiment] = [comment]
            after_filter_count += 1
        if comment['clusterid'][:8] == 'nonsense':
            rub_results.append(comment)
    # Pie-chart data: cluster share keyed by the top-3 feature words.
    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, ratio in cluster_ratio.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                ratio_results[','.join(feature[:3])] = float(ratio) / float(ratio_total_count)
    # Pie-chart data: sentiment share keyed by the human-readable label.
    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, ratio in senti_ratio.iteritems():
        if sentiment in emotions_vk_v1:
            label = emotions_vk_v1[sentiment]
            if label and len(label):
                sentiratio_results[label] = float(ratio) / float(sentiratio_total_count)
    # Sentiment-level deduplication: group comments by their
    # 'same_from_sentiment' duplicate-group id.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            same_from_sentiment = comment["same_from_sentiment"]
            try:
                dump_dict[same_from_sentiment].append(comment)
            except KeyError:
                dump_dict[same_from_sentiment] = [comment]
        sentiment_dump_dict[sentiment] = dump_dict
    # Sub-opinion (cluster) deduplication: group by 'same_from', sort each
    # duplicate group by weight descending.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                dump_dict = dict()
                for comment in contents:
                    same_from_cluster = comment["same_from"]
                    try:
                        dump_dict[same_from_cluster].append(comment)
                    except KeyError:
                        dump_dict[same_from_cluster] = [comment]
                # NOTE(review): `sort_dump_dict` is reassigned on every pass of
                # this loop, so only the LAST duplicate group's sorted list
                # survives into cluster_dump_dict[clusterid]. Looks like a bug
                # (all groups were probably meant to be kept) -- confirm intent
                # against the consumers of 'cluster_dump_dict' before changing.
                for k, v in dump_dict.iteritems():
                    sort_dump_dict = sorted(v, key=lambda x: x['weight'], reverse=True)
                cluster_dump_dict[clusterid] = sort_dump_dict
    #task = taskid.split('_')
    # Persist the cluster summary for the topics-river view.
    index_body = {
        'name': taskid,
        'features': json.dumps(features),
        'cluster_dump_dict': json.dumps(cluster_dump_dict)
    }
    es_intel.index(index=topics_river_index_name, doc_type=topics_river_index_type, id=taskid, body=index_body)
    return json.dumps({
        "features": features,
        "cluster_dump_dict": cluster_dump_dict
    })