def news_content(task_source, task_id, news_limit=NEWS_LIMIT):
    """Fetch up to *news_limit* news-like posts for a task and normalise them.

    Weibo documents are matched by the '*【*】*' headline or '*#*#*' topic
    wildcard patterns; other sources use '*【*】*' / '*#*' and are additionally
    sorted by their 'share' count.

    :param task_source: ES doc_type, one of 'weibo', 'facebook', or another
        twitter-style source (anything else falls through to the 'tid' branch).
    :param task_id: ES index name for the task.
    :param news_limit: maximum number of hits to request (defaults to the
        module-level NEWS_LIMIT).
    :return: list of dicts with keys news_id / content168 / id / datetime /
        comment / retweeted.
    """
    headline_clause = {'wildcard': {'text': '*【*】*'}}
    if task_source == 'weibo':
        query_body = {
            'query': {
                'bool': {
                    'should': [
                        headline_clause,
                        {'wildcard': {'text': '*#*#*'}},
                    ]
                }
            },
            'size': news_limit,
        }
    else:
        query_body = {
            'query': {
                'bool': {
                    'should': [
                        headline_clause,
                        {'wildcard': {'text': '*#*'}},
                    ]
                }
            },
            'sort': 'share',
            'size': news_limit,
        }
    news_results = es_intel.search(index=task_id, doc_type=task_source,
                                   body=query_body)['hits']['hits']
    news_list = []
    for hit in news_results:
        source = hit['_source']
        # Each platform names its document-id and forward-count fields
        # differently; normalise them into id / retweeted.
        if task_source == 'weibo':
            doc_id = source['mid']
            retweeted = source['retweeted']
        elif task_source == 'facebook':
            doc_id = source['fid']
            retweeted = source['share']
        else:
            doc_id = source['tid']
            retweeted = source['share']
        news_list.append({
            'news_id': 'news',
            'content168': source['text'],
            'id': doc_id,
            'datetime': ts2datetime_full(source['timestamp']),
            'comment': source['comment'],
            'retweeted': retweeted,
        })
    return news_list
def get_models_text(task_id, task_source, opinion_keywords_list): if task_source == 'weibo': sort_item = 'retweeted' else: sort_item = 'share' query_body_pos = { 'query': { 'terms': { 'sentiment': SENTIMENT_POS } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } query_body_neg = { 'query': { 'terms': { 'sentiment': SENTIMENT_NEG } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } query_body_news = { 'query': { 'bool': { 'must': [{ 'wildcard': { 'text': '*【*】*' } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } results_pos = es_intel.search(index=task_id, doc_type=task_source, body=query_body_pos)['hits']['hits'] results_neg = es_intel.search(index=task_id, doc_type=task_source, body=query_body_neg)['hits']['hits'] results_news = es_intel.search(index=task_id, doc_type=task_source, body=query_body_news)['hits']['hits'] text_list_pos = [] text_list_neg = [] text_list_news = [] for result_pos in results_pos: text_list_pos.append(result_pos['_source']['text']) for result_neg in results_neg: text_list_neg.append(result_neg['_source']['text']) for result_news in results_news: text_list_news.append(result_news['_source']['text']) model_text_dict = {} model_text_pos = text_generation_main(text_list_pos, opinion_keywords_list) model_text_neg = text_generation_main(text_list_neg, opinion_keywords_list) model_text_news = text_generation_main(text_list_news, opinion_keywords_list) model_text_dict['model_text_pos'] = model_text_pos model_text_dict['model_text_neg'] = model_text_neg model_text_dict['model_text_news'] = model_text_news print 'model_text_dict..', model_text_dict save2models_text(task_id, model_text_dict)