class ArticleSync():
    '''Sync articles from the data-pool ES into the yqtj ES, enriching each
    record with keywords, sentiment, summary, word cloud, weight and hot-id.
    '''

    def __init__(self, table):
        # Per-table file persisting the last sync position (record_time).
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        self._record_time = tools.get_json(
            tools.read_file(self._sync_time_file)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._yqtj_es = ES(YQTJ)
        self._data_pool_es = ES(DATA_POOL)
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._province_filter = ProvinceFilter()
        self._table = table
        # Key under which this table's sync position is stored in the file.
        self._per_record_time_key = '{table}_record_time'.format(
            table=self._table)

        # Background workers: vip list and keyword list are loaded async.
        self._vip_checked.start()
        self._compare_keywords.start()

    def get_article_info(self):
        '''
        @summary: Return a blank article skeleton with every field the
                  tab_iopm_article_info index expects.
        ---------
        ---------
        @result: dict with all fields set to '' / None defaults
        '''
        article_info = {
            "EMOTION": None,
            "HOST": "",
            "AUTHOR": "",
            "URL": "",
            "WEBSITE_NAME": "",
            "ACCOUNT": "",
            "REVIEW_COUNT": None,
            "KEYWORDS_COUNT": None,
            "RELEASE_TIME": "",
            "CONTENT": "",
            "ID": None,
            "UUID": "",
            "WEIGHT": None,
            "CLUES_IDS": "",
            "UP_COUNT": None,
            "INTERACTION_COUNT": None,
            "RECORD_TIME": None,
            "COMMENT_COUNT": None,
            "IS_VIP": None,
            "INFO_TYPE": None,
            "HOT_ID": None,
            "KEYWORD_CLUES_ID": "",
            "MAY_INVALID": None,
            "TITLE": "",
            "KEYWORDS": "",
            "TRANSMIT_COUNT": None,
            "ZERO_ID": None,
            "FIRST_ID": None,
            "SECOND_ID": None,
            "SUMMARY": "",
            "WORD_CLOUD": "",
            "IMAGE_URL": ""
        }
        return article_info

    def get_article_clues_src(self):
        '''Blank skeleton for the article/clue relation table.'''
        article_clues_src = {"CLUES_ID": "", "ARTICLE_ID": "", "ID": ""}
        return article_clues_src

    def get_per_record_time(self):
        '''Return the persisted sync position for this table (or None).'''
        per_record_time = self._record_time.get(self._per_record_time_key)
        return per_record_time

    def record_now_record_time(self, record_time):
        '''Persist the new sync position for this table to the time file.'''
        self._record_time[self._per_record_time_key] = record_time
        tools.write_file(self._sync_time_file,
                         tools.dumps_json(self._record_time))

    def get_article(self):
        '''
        @summary: Fetch the next batch (up to 1500) of articles to process.
                  Currently paginated on record_time to guarantee data;
                  should really be keyed on release_time. TODO
        ---------
        ---------
        @result: list of ES hits (possibly empty)
        '''
        log.debug("取代做种子集...")
        per_record_time = self.get_per_record_time()
        today_time = tools.get_current_date("%Y-%m-%d")
        min_day_ago = tools.get_before_date(today_time,
                                            -30,
                                            current_date_format='%Y-%m-%d',
                                            return_date_format='%Y-%m-%d')

        if per_record_time:
            # Resume after the saved position, but only keep articles
            # released within the last 30 days.
            body = {
                "size": 1500,
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            "record_time": {
                                                "gt": per_record_time
                                            }
                                        }
                                    },
                                    {
                                        "range": {
                                            "release_time": {
                                                "gte": min_day_ago + ' 00:00:00',  # 30 days ago
                                                "lte": today_time + ' 23:59:59'  # today
                                            }
                                        }
                                    }
                                ]
                            }
                        }
                    }
                },
                "sort": [{
                    "record_time": "asc"
                }]
            }
        else:
            # First run (no saved position): take the last three days.
            # FIX: three_day_ago was referenced but never defined -> NameError.
            three_day_ago = tools.get_before_date(
                today_time,
                -3,
                current_date_format='%Y-%m-%d',
                return_date_format='%Y-%m-%d')
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "release_time": {
                                    "gte": three_day_ago + ' 00:00:00',  # three days ago
                                    "lte": today_time + ' 23:59:59'  # today
                                }
                            }
                        }
                    }
                },
                "size": 1500,
                "sort": [{
                    "record_time": "asc"
                }]
            }

        log.debug(self._table + " => " + tools.dumps_json(body))
        article = self._data_pool_es.search(self._table, body)
        return article.get('hits', {}).get('hits', [])

    def deal_article(self, article_list):
        '''
        @summary: Enrich each article (keywords, sentiment, summary, word
                  cloud, weight, hot id) and bulk-write the batch to the
                  tab_iopm_article_info index. Articles already present only
                  get their interaction counters refreshed.
        ---------
        @param article_list: list of article dicts as returned by get_article
        ---------
        @result: None (writes to ES)
        '''
        article_infos = []

        for article_info in article_list:
            # Interaction volume = ups + transmits + reviews + comments.
            article_info['INTERACTION_COUNT'] = (
                article_info['UP_COUNT'] or 0) + (
                    article_info['TRANSMIT_COUNT'] or 0) + (
                        article_info['REVIEW_COUNT'] or 0) + (
                            article_info['COMMENT_COUNT'] or 0)

            # Already indexed -> only refresh the interaction counters.
            if self._yqtj_es.get('tab_iopm_article_info', article_info["ID"]):
                log.debug('%s 已存在' % article_info['TITLE'])
                data = {
                    "INTERACTION_COUNT": article_info['INTERACTION_COUNT'],
                    "UP_COUNT": article_info['UP_COUNT'],
                    "TRANSMIT_COUNT": article_info['TRANSMIT_COUNT'],
                    "REVIEW_COUNT": article_info['REVIEW_COUNT'],
                    "COMMENT_COUNT": article_info['COMMENT_COUNT']
                }
                self._yqtj_es.update_by_id("tab_iopm_article_info",
                                           data_id=article_info.get("ID"),
                                           data=data)
                continue

            # Plain text used for all analyses: title + tag-stripped content.
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content

            # Region filter: articles not mentioning the local province get a
            # strongly reduced weight (weight = raw weight * weight_factor).
            contain_airs = ','.join(
                self._province_filter.find_contain_air(text))
            weight_factor = 1
            if not contain_airs and PROVINCE:
                weight_factor = 0.01

            # Clue keyword comparison.
            keywords, clues_ids, zero_ids, first_ids, second_ids, keyword_clues = \
                self._compare_keywords.get_contained_keys(text)

            # KEYWORDS = matched keywords + matched regions, de-duplicated.
            # NOTE: ','.join(set(...)) keeps behavior but yields an
            # unspecified ordering.
            article_info[
                'KEYWORDS'] = keywords + ',' + contain_airs if keywords else contain_airs
            article_info['KEYWORDS'] = ','.join(
                set(article_info['KEYWORDS'].split(',')))
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_ids
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # Word cloud.
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # Summary (only if the source did not supply one).
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # Sentiment mapping: local codes 1=positive 2=negative 3=neutral;
            # Baidu returns 0=negative 1=neutral 2=positive.
            emotion = self._emotion.get_emotion(article_info['SUMMARY'])
            if emotion == 0:
                emotion = 2
            elif emotion == 1:
                emotion = 3
            elif emotion == 2:
                emotion = 1
            else:
                emotion = 3
            article_info['EMOTION'] = emotion

            # Mainstream (vip) media check; vip hosts contribute extra
            # category ids.
            is_vip, zero_id, first_id, second_id = self._vip_checked.is_vip(
                article_info['HOST'], article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip
            if is_vip:
                article_info['ZERO_ID'] = article_info[
                    'ZERO_ID'] + ',' + zero_id if article_info[
                        'ZERO_ID'] else zero_id
                article_info['FIRST_ID'] = article_info[
                    'FIRST_ID'] + ',' + first_id if article_info[
                        'FIRST_ID'] else first_id
                article_info['SECOND_ID'] = article_info[
                    'SECOND_ID'] + ',' + second_id if article_info[
                        'SECOND_ID'] else second_id

            # Relevance weight computed by the iopm service.
            url = IOPM_SERVICE_ADDRESS + 'related_sort'
            data = {
                'article_id': article_info['ID'],  # article id
                'clues_ids': article_info['CLUES_IDS'],  # clue ids
                'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                'vip_count': article_info['IS_VIP'],  # mainstream media count
                'negative_emotion_count':
                1 if article_info['EMOTION'] == 2 else 0,  # negative count
                'zero_ids': article_info['ZERO_ID']
            }
            # FIX: guard against a failed request returning None.
            result = tools.get_json_by_requests(url, data=data) or {}
            article_info['WEIGHT'] = result.get('weight', 0) * weight_factor

            # Similar-article / hot-spot clustering.
            if article_info['INFO_TYPE'] == 3:  # weibo: use summary as title
                article_info['TITLE'] = article_info['SUMMARY'][:30]
            article_info['HOT_ID'] = self._hot_sync.get_hot_id(
                article_info, contain_airs, weight_factor)

            # FIX: "一级分类" slot logged first_id (vip id) instead of
            # first_ids (clue comparison result).
            log.debug('''
                title        %s
                release_time %s
                record_time  %s
                url          %s
                匹配的关键字:%s
                线索id       %s
                一级分类     %s
                二级分类     %s
                三级分类     %s
                关键词-线索  %s
                地域 %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info['RECORD_TIME'], article_info["URL"],
                       keywords, clues_ids, zero_ids, first_ids, second_ids,
                       keyword_clues, contain_airs))

            article_infos.append(article_info)

        # Bulk insert the whole batch.
        print('article批量入库 size = %s' % len(article_infos))
        self._yqtj_es.add_batch(article_infos, "ID", 'tab_iopm_article_info')
class WechatService():
    '''Round-robin provider of wechat accounts to crawl, paginated out of
    TAB_IOPM_SITE, plus helpers to persist crawled articles/accounts to ES.

    State is class-level on purpose: all instances share one queue and one
    pagination cursor.
    '''
    _todo_accounts = collections.deque()  # pending (rownum, id, domain, biz) rows
    _rownum = 1  # 1-based pagination cursor into TAB_IOPM_SITE
    _is_done = False  # True once a full pass over the table completed

    def __init__(self):
        self._db = OracleDB()
        self._es = ES()
        self.__load_todo_account()

    def __load_todo_account(self):
        '''Refill the shared queue with the next page of SIZE accounts.

        When the page is empty the cursor wraps to 1 and _is_done is set,
        so callers can detect that a full pass finished.
        NOTE(review): if the table has no rows with a non-null biz at all,
        this recurses without bound — confirm the table is never empty.
        '''
        if not WechatService._todo_accounts:
            sql = '''
                select * from
                (select rownum r, t.id, t.domain, t.biz from TAB_IOPM_SITE t where t.biz is not null and rownum < {size})
                where r >= {rownum}
            '''.format(rownum=WechatService._rownum,
                       size=WechatService._rownum + SIZE)
            results = self._db.find(sql)
            if not results:
                # End of table: flag completion, wrap around, reload page 1.
                WechatService._is_done = True
                WechatService._rownum = 1
                self.__load_todo_account()
            else:
                # Queue the page and advance the cursor.
                WechatService._todo_accounts = collections.deque(results)
                WechatService._rownum += SIZE

    def get_next_account(self):
        '''
        @summary: Pop the next account to crawl, refilling the queue when it
                  runs dry.
        ---------
        ---------
        @result: (id, biz, is_done) — is_done True exactly once per full pass
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        next_account_info = WechatService._todo_accounts.popleft()
        # NOTE(review): index 2 is t.domain in the SQL above, yet it is used
        # as the account id — confirm the intended column.
        next_account_id = next_account_info[2]
        next_account_biz = next_account_info[3]
        next_account = next_account_id, next_account_biz, WechatService._is_done

        # Reset the done flag so it is reported only once.
        WechatService._is_done = False
        return next_account

    def is_exist(self, table, data_id):
        '''Return True if a document with data_id already exists in table.'''
        # FIX: idiomatic truth test instead of if/else returning literals.
        return bool(self._es.get(table, data_id=data_id, doc_type=table))

    def add_article_info(self, article_info):
        '''
        @summary: Persist one crawled article to the wechat_article index.
        ---------
        @param article_info: article dict; 'article_id' is used as doc id
        ---------
        @result: None
        '''
        log.debug('''
            -----文章信息-----
            %s''' % tools.dumps_json(article_info))

        self._es.add('wechat_article', article_info,
                     article_info.get('article_id'))

    def add_account_info(self, account_info):
        '''Persist one crawled account to the wechat_account index,
        keyed on its '__biz' value.'''
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        self._es.add('wechat_account', account_info,
                     account_info.get('__biz'))