class SyncES():
    def __init__(self):
        self._es = ES()
        self._db = OracleDB()
        self._max_id = tools.read_file(STO_MAX_ID_FILE)
        self._max_id = self._max_id and eval(self._max_id) or {}

    def get_data(self, sql):
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: the id column must be set as the primary key first
        ---------
        @param table:
        @param step: batch size per query; suitable even for small datasets
        ---------
        @result:
        '''
        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)
        while True:
            inner_sql = 'select * from %s where id > %d and rownum <= %d order by id' % (
                table, max_id, step)
            datas = self.get_data(inner_sql)
            if not datas:
                self.close()
                break

            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                print(data)
                print(data_id)
                max_id = data_id
                self.export_to_es(table, data, data_id)

            self._max_id[table] = max_id

    def close(self):
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))
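# A minimal driver sketch for the incremental sync above: export one Oracle
# table to Elasticsearch in batches, persisting the high-water-mark id between
# runs. The table name is hypothetical, not one from this project.
if __name__ == '__main__':
    sync_es = SyncES()
    sync_es.sync_data('TAB_NEWS_INFO', step=100)  # example table name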
def main():
    oracledb = OracleDB()
    esdb = ES()

    # sql = 'select MSG_ID from TAB_IOPM_USER_ACTION t where action_type=301 and msg_type = 502 and record_time>=sysdate-1'
    # article_ids = oracledb.find(sql)
    article_ids = [8888515, 8888293, 8891299]
    for article_id in article_ids:
        # article_id = article_id[0]
        body = {"WEIGHT": 0}
        print(article_id)
        esdb.update_by_id('tab_iopm_article_info', article_id, body)
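# ES.update_by_id presumably wraps a partial-document update. A rough stand-in
# using the official elasticsearch-py client is sketched below; the host is
# taken from ElasticSearchServer further down, and the doc_type argument
# assumes the pre-7.x client this codebase's "filtered" queries imply.
from elasticsearch import Elasticsearch

es = Elasticsearch(['192.168.60.40'])
es.update(index='tab_iopm_article_info', doc_type='tab_iopm_article_info',
          id=8888515, body={'doc': {'WEIGHT': 0}})  # only touches WEIGHT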
class ElasticSearchServer():
    def __init__(self):
        self._es = ES('192.168.60.40')
        # self._es = ES('localhost')

    def search(self, table, body):
        return self._es.search(table, body)
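# Hypothetical usage of the thin wrapper above: run a match_all query against
# an index and print the titles of the returned hits.
server = ElasticSearchServer()
result = server.search('tab_iopm_article_info',
                       {'query': {'match_all': {}}, 'size': 10})
for hit in result.get('hits', {}).get('hits', []):
    print(hit.get('_source', {}).get('TITLE'))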
class SyncArtice(threading.Thread):
    def __init__(self):
        super(SyncArtice, self).__init__()
        self._es = ES()
        self._redis = RedisDB()
        self._sync_count = 0

    def run(self):
        is_show_tip = False
        while True:
            try:
                datas = self.get_data_from_redis(SYNC_STEP)
                if not datas:
                    if not is_show_tip:
                        print('\n{time} no data, sleeping... '.format(
                            time=tools.get_current_date()))
                        is_show_tip = True
                elif self.add_data_to_es(datas):
                    is_show_tip = False
                    self._sync_count += len(datas)
                    tools.print_one_line('synced %d records' % self._sync_count)

                tools.delay_time(1)
            except Exception as e:
                log.error(e)

    def get_data_from_redis(self, count):
        datas = self._redis.sget('news:news_article', count=count)
        return_datas = []
        for data in datas:
            data = eval(data)  # items are stored as repr'd dicts
            release_time = data.get('release_time')
            if release_time and len(release_time) == 19:  # 'YYYY-mm-dd HH:MM:SS'
                return_datas.append(data)

        return return_datas

    def add_data_to_es(self, datas):
        return self._es.add_batch(datas,
                                  primary_key='uuid',
                                  table='news_article')
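# The consumer above eval()s whatever is in redis, which executes arbitrary
# expressions. An alternative sketch, assuming the producers could be switched
# to json.dumps, parses items defensively instead; parse_redis_item is a
# hypothetical helper, not part of the project.
import json

def parse_redis_item(raw):
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        return None  # skip malformed items instead of raising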
class HotWeekSync():
    def __init__(self):
        self._es = ES()
        self._event_filter = EventFilter()
        self._event_filter.start()

    def _get_week_hots(self, text, release_time):
        before_week = tools.get_before_date(release_time, -7)

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news released within the last week
                                "gte": before_week,
                                "lte": release_time
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY * 100))  # share of keywords that must match
                        }
                    }
                }
            },
            "_source": [
                "ID",
                "TITLE",
                # "CONTENT",
                "HOT",
                "ARTICLE_COUNT",
                "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT",
                "HOT_DAY_IDS",
                "WEIGHT"
            ],
            # "highlight": {
            #     "fields": {
            #         "TITLE": {}
            #     }
            # }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_week_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def cluster_week_hot(self, day_hot, hot_value=None, article_count=None,
                         vip_count=None, negative_emotion_count=None,
                         weight=None):
        '''
        @summary: cluster a daily hot topic into the weekly hot topics
        ---------
        @param day_hot: daily hot-topic record
        @param hot_value: heat of a single article (when not None, the daily
            hot was an update to an existing topic the weekly hot has already
            absorbed, so only this article's heat should be added)
        @param article_count:
        @param vip_count:
        @param negative_emotion_count:
        @param weight:
        ---------
        @result:
        '''
        article_text = day_hot.get("TITLE")  # + hot.get("CONTENT")
        release_time = day_hot.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)
        hots = self._get_week_hots(article_text, release_time)

        # find the most similar hot topic
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first is the most similar

        if similar_hot:  # a similar hot topic exists
            if similar_hot["ID"] != day_hot["ID"]:  # avoid comparing a record against itself
                data = {}

                # update heat and article count
                data['HOT'] = similar_hot['HOT'] + (hot_value or day_hot.get('HOT'))
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + (
                    article_count or day_hot.get('ARTICLE_COUNT'))

                # update mainstream-media and negative-sentiment counts
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    vip_count or day_hot.get('VIP_COUNT'))
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                    negative_emotion_count or day_hot.get('NEGATIVE_EMOTION_COUNT'))

                # update weight
                # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

                # append to hot_day_ids
                if not hot_value:
                    data["HOT_DAY_IDS"] = similar_hot['HOT_DAY_IDS'] + ',' + day_hot['ID']

                # update the hot topic
                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # return the hot-topic id
            return similar_hot.get("ID")

        else:  # no similar topic: promote this record to a new weekly hot
            hot_info = deepcopy(day_hot)

            # resolve event types
            del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
            text = hot_info['TITLE'] + del_tag_content
            contain_event_ids = self._event_filter.find_contain_event(text)
            hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

            hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

            self._es.add('tab_iopm_hot_week_info', hot_info,
                         data_id=hot_info['ID'])

            # return the hot-topic id
            return hot_info['ID']
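# compare_text is imported from elsewhere in the project and is not shown in
# this section. A rough stand-in, assuming only that it returns a similarity
# in [0, 1] to be compared against MIN_SIMILARITY, could be:
import difflib

def compare_text(text1, text2):
    if not text1 or not text2:
        return 0
    return difflib.SequenceMatcher(None, text1, text2).ratio()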
class ArticleSync():
    def __init__(self, table):
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        self._record_time = tools.get_json(
            tools.read_file(self._sync_time_file)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._yqtj_es = ES(YQTJ)
        self._data_pool_es = ES(DATA_POOL)
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._province_filter = ProvinceFilter()
        # self._event_filter = EventFilter()
        self._table = table
        self._per_record_time_key = '{table}_record_time'.format(
            table=self._table)

        self._vip_checked.start()
        self._compare_keywords.start()
        # self._event_filter.start()

    def get_article_info(self):
        '''
        @summary: structure template for an article record
        ---------
        ---------
        @result:
        '''
        article_info = {
            "EMOTION": None,
            "HOST": "",
            "AUTHOR": "",
            "URL": "",
            "WEBSITE_NAME": "",
            "ACCOUNT": "",
            "REVIEW_COUNT": None,
            "KEYWORDS_COUNT": None,
            "RELEASE_TIME": "",
            "CONTENT": "",
            "ID": None,
            "UUID": "",
            "WEIGHT": None,
            "CLUES_IDS": "",
            "UP_COUNT": None,
            "INTERACTION_COUNT": None,
            "RECORD_TIME": None,
            "COMMENT_COUNT": None,
            "IS_VIP": None,
            "INFO_TYPE": None,
            "HOT_ID": None,
            "KEYWORD_CLUES_ID": "",
            "MAY_INVALID": None,
            "TITLE": "",
            "KEYWORDS": "",
            "TRANSMIT_COUNT": None,
            "ZERO_ID": None,
            "FIRST_ID": None,
            "SECOND_ID": None,
            "SUMMARY": "",
            "WORD_CLOUD": "",
            "IMAGE_URL": ""
        }
        return article_info

    def get_article_clues_src(self):
        article_clues_src = {"CLUES_ID": "", "ARTICLE_ID": "", "ID": ""}
        return article_clues_src

    def get_per_record_time(self):
        per_record_time = self._record_time.get(self._per_record_time_key)
        return per_record_time

    def record_now_record_time(self, record_time):
        self._record_time[self._per_record_time_key] = record_time
        tools.write_file(self._sync_time_file,
                         tools.dumps_json(self._record_time))

    def get_article(self):
        '''
        @summary: currently keyed on record_time to guarantee data;
                  it should properly be keyed on release_time  TODO
        ---------
        ---------
        @result:
        '''
        log.debug("fetching seed articles...")

        per_record_time = self.get_per_record_time()
        today_time = tools.get_current_date("%Y-%m-%d")
        min_day_ago = tools.get_before_date(today_time,
                                            -30,
                                            current_date_format='%Y-%m-%d',
                                            return_date_format='%Y-%m-%d')
        if per_record_time:
            body = {
                "size": 1500,
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            "record_time": {
                                                "gt": per_record_time
                                            }
                                        }
                                    },
                                    {
                                        "range": {
                                            "release_time": {
                                                "gte": min_day_ago + ' 00:00:00',  # 30 days ago
                                                "lte": today_time + ' 23:59:59'  # today
                                            }
                                        }
                                    }
                                ]
                            }
                        }
                    }
                },
                "sort": [{
                    "record_time": "asc"
                }]
            }
        else:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "release_time": {
                                    "gte": min_day_ago + ' 00:00:00',  # 30 days ago
                                    "lte": today_time + ' 23:59:59'  # today
                                }
                            }
                        }
                    }
                },
                "size": 1500,
                "sort": [{
                    "record_time": "asc"
                }]
            }

        log.debug(self._table + " => " + tools.dumps_json(body))

        article = self._data_pool_es.search(self._table, body)
        return article.get('hits', {}).get('hits', [])

    def deal_article(self, article_list):
        '''
        @summary: process articles
        ---------
        @param article_list:
        ---------
        @result:
        '''
        article_infos = []

        # fill in the remaining fields
        for article_info in article_list:
            # print(tools.dumps_json(article_info))

            # interaction count
            article_info['INTERACTION_COUNT'] = (
                article_info['UP_COUNT'] or 0) + (
                    article_info['TRANSMIT_COUNT'] or 0) + (
                        article_info['REVIEW_COUNT'] or 0) + (
                            article_info['COMMENT_COUNT'] or 0)

            # if the article already exists, only update its interaction counts
            if self._yqtj_es.get('tab_iopm_article_info', article_info["ID"]):
                log.debug('%s already exists' % article_info['TITLE'])
                data = {
                    "INTERACTION_COUNT": article_info['INTERACTION_COUNT'],
                    "UP_COUNT": article_info['UP_COUNT'],
                    "TRANSMIT_COUNT": article_info['TRANSMIT_COUNT'],
                    "REVIEW_COUNT": article_info['REVIEW_COUNT'],
                    "COMMENT_COUNT": article_info['COMMENT_COUNT']
                }
                # update the record
                self._yqtj_es.update_by_id("tab_iopm_article_info",
                                           data_id=article_info.get("ID"),
                                           data=data)
                continue

            # title + content as plain text
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content
            # print(text)

            # region filter
            contain_airs = ','.join(
                self._province_filter.find_contain_air(text))
            weight_factor = 1  # weight coefficient
            if not contain_airs and PROVINCE:
                # log.debug('%s does not mention a local place name' % article_info['TITLE'])
                weight_factor = 0.01  # non-local articles get a small coefficient; weight = raw weight * coefficient

            # clue-keyword matching
            keywords, clues_ids, zero_ids, first_ids, second_ids, keyword_clues = self._compare_keywords.get_contained_keys(
                text)

            article_info['KEYWORDS'] = keywords + ',' + contain_airs if keywords else contain_airs
            article_info['KEYWORDS'] = ','.join(
                set(article_info['KEYWORDS'].split(',')))
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_ids
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # # clue-article relation table
            # article_clues_srcs = []
            # if clues_ids:
            #     for clues_id in clues_ids.split(','):
            #         article_clues_src = self.get_article_clues_src()
            #         article_clues_src['ID'] = tools.get_uuid(clues_id, article_info['ID'])
            #         article_clues_src['CLUES_ID'] = clues_id
            #         article_clues_src['ARTICLE_ID'] = article_info['ID']
            #         article_clues_srcs.append(article_clues_src)
            # self._yqtj_es.add_batch(article_clues_srcs, "ID", 'tab_iopm_article_clues_src')

            # word cloud
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # summary
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # sentiment analysis (internal: 1 positive, 2 negative, 3 neutral;
            # Baidu: 0 negative, 1 neutral, 2 positive)
            emotion = self._emotion.get_emotion(article_info['SUMMARY'])
            if emotion == 0:
                emotion = 2
            elif emotion == 1:
                emotion = 3
            elif emotion == 2:
                emotion = 1
            else:
                emotion = 3
            article_info['EMOTION'] = emotion

            # mainstream media
            is_vip, zero_id, first_id, second_id = self._vip_checked.is_vip(
                article_info['HOST'], article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip
            if is_vip:
                article_info['ZERO_ID'] = article_info['ZERO_ID'] + ',' + zero_id if article_info['ZERO_ID'] else zero_id
                article_info['FIRST_ID'] = article_info['FIRST_ID'] + ',' + first_id if article_info['FIRST_ID'] else first_id
                article_info['SECOND_ID'] = article_info['SECOND_ID'] + ',' + second_id if article_info['SECOND_ID'] else second_id

            # compute relevance
            url = IOPM_SERVICE_ADDRESS + 'related_sort'
            data = {
                'article_id': article_info['ID'],  # article id
                'clues_ids': article_info['CLUES_IDS'],  # clue ids
                'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                'vip_count': article_info['IS_VIP'],  # mainstream media count
                'negative_emotion_count': 1 if article_info['EMOTION'] == 2 else 0,  # negative-sentiment count
                'zero_ids': article_info['ZERO_ID']
            }
            result = tools.get_json_by_requests(url, data=data)
            article_info['WEIGHT'] = result.get('weight', 0) * weight_factor

            # cluster similar articles into hot topics
            if article_info['INFO_TYPE'] == 3:  # weibo
                article_info['TITLE'] = article_info['SUMMARY'][:30]

            article_info['HOT_ID'] = self._hot_sync.get_hot_id(
                article_info, contain_airs, weight_factor)

            log.debug('''
                title         %s
                release_time  %s
                record_time   %s
                url           %s
                matched keywords %s
                clue ids      %s
                level-1 ids   %s
                level-2 ids   %s
                level-3 ids   %s
                keyword-clues %s
                regions       %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info['RECORD_TIME'], article_info["URL"],
                       keywords, clues_ids, zero_ids, first_ids, second_ids,
                       keyword_clues, contain_airs))

            # print(tools.dumps_json(article_info))
            article_infos.append(article_info)

        # bulk insert the articles
        # self._yqtj_es.add('tab_iopm_article_info', article_info, article_info["ID"])
        print('bulk inserting articles, size = %s' % len(article_infos))
        # print(tools.dumps_json(article_infos))
        self._yqtj_es.add_batch(article_infos, "ID", 'tab_iopm_article_info')
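# The Baidu-to-internal sentiment remapping above is an if/elif chain; a
# table-driven equivalent is sketched below (Baidu NLP: 0 negative, 1 neutral,
# 2 positive -> internal: 1 positive, 2 negative, 3 neutral). map_emotion is a
# hypothetical helper; unknown codes default to neutral, matching the chain.
BAIDU_TO_INTERNAL_EMOTION = {0: 2, 1: 3, 2: 1}

def map_emotion(baidu_code):
    return BAIDU_TO_INTERNAL_EMOTION.get(baidu_code, 3)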
class HotSync():
    def __init__(self):
        self._es = ES()
        self._hot_week_sync = HotWeekSync()
        self._cut_text = CutText()
        self._cut_text.set_stop_words('utils/stop_words.txt')

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news released the same day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY * 100))  # share of keywords that must match
                        }
                    }
                }
            }  # ,
            # "_source": [
            #     "ID",
            #     "TITLE",
            #     # "CONTENT",
            #     "RELEASE_TIME",
            #     "WEIGHT",
            #     "HOT",
            #     "ARTICLE_COUNT",
            #     "CLUES_IDS",
            #     "VIP_COUNT",
            #     "NEGATIVE_EMOTION_COUNT"
            # ],
            # "highlight": {
            #     "fields": {
            #         "TITLE": {}
            #     }
            # }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info, positions, weight_factor):
        '''
        @summary: cluster an article into the daily hot topics
        ---------
        @param article_info:
        ---------
        @result:
        '''
        # weight_factor = 1
        article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)
        hots = self._get_today_hots(article_text, release_time)

        # find the most similar hot topic
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first is the most similar

        if similar_hot:  # a similar hot topic exists
            if similar_hot["ID"] != article_info["ID"]:  # avoid comparing a record against itself
                data = {}

                # update heat and article count
                data['HOT'] = similar_hot['HOT'] + INFO_WEIGHT.get(
                    article_info["INFO_TYPE"], 0) * weight_factor
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1

                # update mainstream-media and negative-sentiment counts
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + article_info["IS_VIP"]
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                    1 if article_info['EMOTION'] == 2 else 0)

                weight_temp = 0  # weight delta before/after the update

                # update relevance
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],  # hot-topic id
                        'hot_value': data['HOT'],  # heat value
                        'clues_ids': similar_hot['CLUES_IDS'],  # clue ids matched by related articles
                        'article_count': data['ARTICLE_COUNT'],  # total articles
                        'vip_count': data["VIP_COUNT"],  # mainstream media count
                        'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                        'zero_ids': article_info['ZERO_ID']
                    }
                    result = tools.get_json_by_requests(url, data=data_args)
                    weight_temp = similar_hot['WEIGHT'] - result.get('weight', 0)

                    data['WEIGHT'] = result.get('weight', 0) * weight_factor

                # update the hot topic
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

                # sync to the 7-day hot topics
                self._hot_week_sync.cluster_week_hot(
                    similar_hot,
                    hot_value=INFO_WEIGHT.get(article_info["INFO_TYPE"], 0),
                    article_count=1,
                    vip_count=article_info["IS_VIP"],
                    negative_emotion_count=1 if article_info['EMOTION'] == 2 else 0,
                    weight=weight_temp)

            # return the hot-topic id
            return similar_hot.get("ID")

        else:  # no similar topic: promote this article to a new hot topic
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID')  # the hot table has no hot_id

            # user-action counters default to zero
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # other values
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0
            hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
            hot_info['ID'] = article_info.get("ID")
            hot_info['ARTICLE_COUNT'] = 1
            hot_info['HOT_KEYWORDS'] = ','.join(
                self._cut_text.cut_for_keyword(article_info["TITLE"]))  # keywords; could reuse the tokens from the similarity step  TODO
            hot_info['POSITIONS'] = positions
            hot_info['EVENT_IDS'] = ''  # event types (not needed for daily hots TODO | already added for weekly hots)

            self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

            # sync to the 7-day hot topics
            self._hot_week_sync.cluster_week_hot(hot_info)

            # return the hot-topic id
            return hot_info['ID']
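# INFO_WEIGHT is defined elsewhere and maps an article's INFO_TYPE to a base
# heat contribution. Purely for illustration, assuming hypothetical values
# like {1: 1, 2: 2, 3: 0.5} (news, forum, weibo), a non-local weibo article
# (weight_factor = 0.01) would add only 0.5 * 0.01 = 0.005 to the matched
# topic's HOT, so local mainstream coverage dominates the heat score.
INFO_WEIGHT = {1: 1, 2: 2, 3: 0.5}  # hypothetical values, not the project's
heat_increment = INFO_WEIGHT.get(3, 0) * 0.01  # 0.005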
sys.path.append('../../')
import init

import base.constance as Constance
import base.base_parser as base_parser
import video_info.parsers.base_parser as self_base_parser
import utils.tools as tools
from utils.log import log
import datetime
from db.elastic_search import ES
import random

SITE_ID = 3
NAME = '新浪微博'  # Sina Weibo

es = ES()


def get_release_time(mblog):
    try:
        release_time = mblog['created_at']
        data = tools.time.time()
        ltime = tools.time.localtime(data)
        timeStr = tools.time.strftime("%Y-%m-%d", ltime)
        if tools.re.compile('今天').findall(release_time):  # '今天' = "today"
            release_time = release_time.replace('今天', '%s' % timeStr)
        elif tools.re.compile('昨天').findall(release_time):  # '昨天' = "yesterday"
            today = datetime.date.today()
            yesterday = today - datetime.timedelta(days=1)
            release_time = release_time.replace('昨天', '%s' % yesterday)
        elif '小时前' in release_time:  # 'N小时前' = "N hours ago"
@summary:
---------
@author: Boris
'''

import sys
import os
sys.path.append('../')

from db.oracledb import OracleDB
from db.elastic_search import ES
import utils.tools as tools
from utils.log import log

oracledb = OracleDB()
esdb = ES()

########### helpers for expanding keyword groups ################


def match_keys(keys_list):
    '''
    @summary: expand a keyword group expressed as a cross product
    ---------
    @param keys_list: list of keyword groups
    ---------
    @result:
    '''
    list_size = len(keys_list)
    if list_size < 2:
class ArticleSync():
    def __init__(self, table):
        self._record_time = tools.get_json(
            tools.read_file(SYNC_TIME_FILE)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._es = ES()
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._table = table
        self._per_record_time_key = '{table}_record_time'.format(
            table=self._table)

    def get_article_info(self):
        '''
        @summary: structure template for an article record
        ---------
        ---------
        @result:
        '''
        article_info = {
            "EMOTION": None,
            "HOST": "",
            "AUTHOR": "",
            "URL": "",
            "WEBSITE_NAME": "",
            "ACCOUNT": "",
            "REVIEW_COUNT": None,
            "KEYWORDS_COUNT": None,
            "RELEASE_TIME": "",
            "CONTENT": "",
            "ID": None,
            "UUID": "",
            "WEIGHT": None,
            "CLUES_IDS": "",
            "UP_COUNT": None,
            "INTERACTION_COUNT": None,
            "RECORD_TIME": tools.get_current_date(),
            "COMMENT_COUNT": None,
            "IS_VIP": None,
            "INFO_TYPE": None,
            "HOT_ID": None,
            "KEYWORD_CLUES_ID": "",
            "MAY_INVALID": None,
            "TITLE": "",
            "KEYWORDS": "",
            "TRANSMIT_COUNT": None,
            "ZERO_ID": None,
            "FIRST_ID": None,
            "SECOND_ID": None,
            "SUMMARY": "",
            "WORD_CLOUD": "",
            "IMAGE_URL": ""
        }
        return article_info

    def get_article_clues_src(self):
        article_clues_src = {"CLUES_ID": "", "ARTICLE_ID": "", "ID": ""}
        return article_clues_src

    def get_per_record_time(self):
        per_record_time = self._record_time.get(self._per_record_time_key)
        return per_record_time

    def record_now_record_time(self, record_time):
        self._record_time[self._per_record_time_key] = record_time
        tools.write_file(SYNC_TIME_FILE, tools.dumps_json(self._record_time))

    def get_article(self):
        '''
        @summary: currently keyed on record_time to guarantee data;
                  it should properly be keyed on release_time  TODO
        ---------
        ---------
        @result:
        '''
        per_record_time = self.get_per_record_time()
        today_time = tools.get_current_date('%Y-%m-%d')
        if per_record_time:
            sql = "select * from {table} where record_time > '{record_time}' and release_time >= '{today_time} 00:00:00' and release_time <= '{today_time} 23:59:59' order by record_time".format(
                table=self._table,
                record_time=per_record_time,
                today_time=today_time)
        else:
            sql = "select * from {table} where release_time >= '{today_time} 00:00:00' and release_time <= '{today_time} 23:59:59' order by record_time".format(
                table=self._table, today_time=today_time)

        url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS,
                                                       sql=sql)
        log.debug(url)

        article = tools.get_json_by_requests(url)
        return article.get('hits', {}).get('hits', [])

    def deal_article(self, article_list):
        '''
        @summary: process articles
        ---------
        @param article_list:
        ---------
        @result:
        '''
        article_infos = []

        # fill in the remaining fields
        for article_info in article_list:
            # interaction count
            # print(tools.dumps_json(article_info))
            article_info['INTERACTION_COUNT'] = (
                article_info['UP_COUNT'] or 0) + (
                    article_info['TRANSMIT_COUNT'] or 0) + (
                        article_info['REVIEW_COUNT'] or 0) + (
                            article_info['COMMENT_COUNT'] or 0)

            # clue-keyword matching
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content
            # print(text)
            keywords, clues_ids, zero_ids, first_id, second_ids, keyword_clues = self._compare_keywords.get_contained_keys(
                text)

            article_info['KEYWORDS'] = keywords
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_id
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # clue-article relation table
            article_clues_srcs = []
            if clues_ids:
                for clues_id in clues_ids.split(','):
                    article_clues_src = self.get_article_clues_src()
                    article_clues_src['ID'] = tools.get_uuid(
                        clues_id, article_info['ID'])
                    article_clues_src['CLUES_ID'] = clues_id
                    article_clues_src['ARTICLE_ID'] = article_info['ID']
                    article_clues_srcs.append(article_clues_src)

            self._es.add_batch(article_clues_srcs, "ID",
                               'tab_iopm_article_clues_src')

            # sentiment analysis (internal: 1 positive, 2 negative, 3 neutral;
            # Baidu: 0 negative, 1 neutral, 2 positive)
            emotion = self._emotion.get_emotion(del_tag_content)
            if emotion == 0:
                emotion = 2
            elif emotion == 1:
                emotion = 3
            elif emotion == 2:
                emotion = 1
            else:
                emotion = 3
            article_info['EMOTION'] = emotion

            # mainstream media
            is_vip = self._vip_checked.is_vip(
                article_info['URL']) or self._vip_checked.is_vip(
                    article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip

            # compute relevance
            if article_info['CLUES_IDS']:
                url = IOPM_SERVICE_ADDRESS + 'related_sort'
                data = {
                    'article_id': article_info['ID'],  # article id
                    'clues_ids': article_info['CLUES_IDS'],  # clue ids
                    'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                    'vip_count': article_info['IS_VIP'],  # mainstream media count
                    'negative_emotion_count': 1 if article_info['EMOTION'] == 2 else 0,  # negative-sentiment count
                    'zero_ids': article_info['ZERO_ID']
                }
                result = tools.get_json_by_requests(url, data=data)
                article_info['WEIGHT'] = result.get('weight', 0)
            else:
                article_info['WEIGHT'] = 0

            # word cloud
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # summary
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # cluster similar articles into hot topics
            if article_info['INFO_TYPE'] == 3:  # weibo
                article_info['TITLE'] = article_info['SUMMARY'][:30]

            article_info['HOT_ID'] = self._hot_sync.get_hot_id(article_info)

            log.debug('''
                title         %s
                release_time  %s
                url           %s
                matched keywords %s
                clue ids      %s
                level-1 ids   %s
                level-2 ids   %s
                level-3 ids   %s
                keyword-clues %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info["URL"], keywords, clues_ids, zero_ids,
                       first_id, second_ids, keyword_clues))

            # print(tools.dumps_json(article_info))
            article_infos.append(article_info)

        # bulk insert the articles
        print('bulk inserting articles')
        # print(tools.dumps_json(article_infos))
        self._es.add_batch(article_infos, "ID", 'tab_iopm_article_info')
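# get_article above interpolates raw SQL (spaces, quotes) straight into the
# query string of what looks like the elasticsearch-sql plugin's /_sql
# endpoint. A sketch of a safer URL builder, percent-encoding the SQL first
# (build_sql_url is a hypothetical helper, not part of the project):
from urllib.parse import quote

def build_sql_url(address, sql):
    return 'http://{address}/_sql?sql={sql}'.format(address=address,
                                                    sql=quote(sql))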
class WechatService():
    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # finished one full round
    _is_all_done = False  # all articles published today have been crawled

    # whether wechat_sogou is usable, and when it was last marked blocked
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # whether wechat_public_platform is usable, and when it was last marked blocked
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        accounts = WechatService._redisdb.sget('wechat:account', count=1)
        for account in accounts:
            account = eval(account)
            WechatService._todo_accounts.append(account)

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: check whether the account has newly published articles
        ---------
        @param account_id:
        @param __biz:
        ---------
        @result:
        '''
        result = ''
        if WechatService._wechat_sogou_enable:  # Sogou Weixin is usable
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # new articles published; crawl them
                pass
            elif result == constance.NOT_UPDATE:
                # no new articles
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                # blocked; record the failure time
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        elif tools.get_current_timestamp(
        ) - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # Sogou Weixin has been disabled for over a day; worth another try
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:  # Sogou Weixin is usable again
                WechatService._wechat_sogou_enable = True
            elif result == constance.NOT_UPDATE:
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                pass

            # refresh the last-checked time
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # if Sogou Weixin is unusable, check via the WeChat public platform instead
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # new articles published; crawl them
                    pass
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked; record the failure time
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

            elif tools.get_current_timestamp(
            ) - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                # disabled for over a day; worth another try
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # usable again; new articles published; crawl them
                    WechatService._wechat_public_platform_enable = True
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked; request failed
                    pass

                # refresh the last-checked time
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        return result

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: returns (biz, whether a full round is done), e.g. (biz, True)
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()
        if not WechatService._todo_accounts:
            return None

        oracle_id, account_id, account_name, last_article_release_time, biz = WechatService._todo_accounts.popleft()

        next_account_id = account_id
        next_account_biz = biz
        next_account_name = account_name

        next_account = next_account_id, next_account_biz

        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            next_account_biz)
        WechatService._db.update(sql)

        return next_account

    def update_account_article_num(self, __biz):
        # query es for message counts
        # today
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": tools.get_current_date('%Y-%m-%d') + ' 00:00:00',
                                "lte": tools.get_current_date('%Y-%m-%d') + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # all-time total
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''
        log.debug('''
            ----- article info -----
            title        %s
            release_time %s
            author       %s
            account      %s
            url          %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            ----- account info -----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
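# update_account_article_num reads hits.total from size-0 queries. On
# Elasticsearch 7+, hits.total is an object like {"value": 123, "relation":
# "eq"} rather than an int, so a version-tolerant reader avoids breakage if
# the cluster is ever upgraded. get_total_hits is a hypothetical helper,
# assuming the wrapper returns the raw response dict.
def get_total_hits(result):
    total = result.get('hits', {}).get('total', 0)
    if isinstance(total, dict):  # ES 7+ shape
        return total.get('value', 0)
    return total  # pre-7.x: plain int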
class UpdateWeight():
    """docstring for UpdateWeight"""

    def __init__(self):
        self._yqtj_es = ES(YQTJ)

    def get_articles(self, table, record_time, release_time_begin,
                     release_time_end):
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "range": {
                                        "RECORD_TIME": {  # records newer than record_time
                                            "gt": record_time
                                        }
                                    }
                                },
                                {
                                    "range": {
                                        "RELEASE_TIME": {
                                            "gte": release_time_begin,
                                            "lte": release_time_end
                                        }
                                    }
                                }
                            ]
                        }
                    }
                }
            },
            "size": 1500,
            "sort": [{
                "RECORD_TIME": "asc"
            }]
        }

        print(tools.dumps_json(body))
        article = self._yqtj_es.search(table, body)
        return article.get('hits', {}).get('hits', [])

    def update_article_weight(self, articles):
        release_time = ''
        record_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'article_id': article_info['ID'],  # article id
                'clues_ids': article_info['CLUES_IDS'],  # clue ids
                'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
                'vip_count': article_info['IS_VIP'],  # mainstream media count
                'negative_emotion_count': article_info['EMOTION'],  # negative-sentiment count
                'zero_ids': article_info['ZERO_ID']
            }
            print(article_info["TITLE"])
            print(article_info["RELEASE_TIME"])

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor  (region not taken into account)
            tools.print_one_line("updating weight %s -> %s" %
                                 (article_info['WEIGHT'], weight))
            if self._yqtj_es.update_by_id('tab_iopm_article_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                release_time, record_time = article_info["RELEASE_TIME"], article_info["RECORD_TIME"]

        return release_time, record_time

    def update_hot_weight(self, articles):
        record_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'hot_id': article_info['ID'],  # hot-topic id
                'hot_value': article_info['HOT'],  # heat value
                'clues_ids': article_info['CLUES_IDS'],  # clue ids matched by related articles
                'article_count': article_info['ARTICLE_COUNT'],  # total articles
                'vip_count': article_info["VIP_COUNT"],  # mainstream media count
                'negative_emotion_count': article_info["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                'zero_ids': article_info['ZERO_ID']
            }
            print('''
                release_time %s
                record_time  %s
                ''' % (article_info["RELEASE_TIME"],
                       article_info["RECORD_TIME"]))

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor  (region not taken into account)
            tools.print_one_line("updating weight %s -> %s" %
                                 (article_info['WEIGHT'], weight))
            if self._yqtj_es.update_by_id('tab_iopm_hot_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                record_time = article_info['RECORD_TIME']

        return record_time
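# A hypothetical driver for UpdateWeight: pull one batch of articles recorded
# after a checkpoint and recompute their weights. The timestamps are example
# values only; a real caller would persist the returned record_time as the
# next checkpoint.
if __name__ == '__main__':
    updater = UpdateWeight()
    articles = updater.get_articles('tab_iopm_article_info',
                                    record_time='2018-01-01 00:00:00',
                                    release_time_begin='2018-01-01 00:00:00',
                                    release_time_end='2018-01-31 23:59:59')
    updater.update_article_weight(articles)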
class NewsCluster():
    def __init__(self):
        self._es = ES()
        self._current_csr_res_id = tools.read_file(STO_CURRENT_ID_FILE)
        self._current_csr_res_id = self._current_csr_res_id and int(
            self._current_csr_res_id) or 0

    def _get_same_day_hots(self, text, start_time):
        news_day_time = start_time[:start_time.find(' ')]
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "start_time": {
                                "gte": news_day_time + ' 00:00:00',
                                "lte": news_day_time + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["csr_content"],
                            "operator": "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY * 100))  # share of keywords that must match
                        }
                    }
                }
            },
            "_source": ["hot_id", "csr_res_ids", "csr_content", 'hot'],
            "highlight": {
                "fields": {
                    "csr_content": {}
                }
            }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_news_csr_hot', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def _save_current_id(self):
        '''
        @summary: persist the last processed id so the next run resumes from it
        ---------
        ---------
        @result:
        '''
        tools.write_file(STO_CURRENT_ID_FILE, str(self._current_csr_res_id))

    def deal_news(self):
        '''
        @summary: consume records from tab_news_csr_result
        ---------
        ---------
        @result:
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # records with csr_res_id greater than the checkpoint
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{
                    "csr_res_id": "asc"
                }]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                log.debug(
                    'tab_news_csr_result has no csr_res_id greater than %s\nsleep %s...'
                    % (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    processing tab_news_csr_result
                    csr_res_id  %s
                    start_time  %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # find similar articles
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)

                # compare similarity against the candidates
                for hot_info in hots:
                    hot = hot_info.get('_source')
                    hot_text = hot.get('csr_content')

                    temp_similarity = compare_text(csr_content, hot_text)
                    if temp_similarity > MIN_SIMILARITY:
                        similar_hot = hot

                    break  # hots are sorted by match score, so the first is the most similar

                # if a similar article exists, append csr_res_id and bump hot;
                # otherwise this record becomes a new hot topic
                if similar_hot:  # a similar hot topic exists
                    log.debug('belongs to hot topic: %s' %
                              similar_hot.get('csr_content'))

                    data = {}

                    # bump the heat and append the article id
                    data["hot"] = similar_hot["hot"] + 1
                    data["csr_res_ids"] = similar_hot["csr_res_ids"] + ',' + csr_res_id

                    # update the hot topic
                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:  # no similar hot topic; this record becomes a new one
                    log.debug('no matching hot topic')

                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot', hot_info,
                                 data_id=csr_res_id)

                # save the checkpoint
                self._current_csr_res_id = csr_res_id
                self._save_current_id()
class WechatService():
    _todo_accounts = collections.deque()
    _rownum = 1
    _is_done = False

    def __init__(self):
        self._db = OracleDB()
        self._es = ES()
        self.__load_todo_account()

    def __load_todo_account(self):
        if not WechatService._todo_accounts:
            sql = '''
                select * from
                    (select rownum r, t.id, t.domain, t.biz
                       from TAB_IOPM_SITE t
                      where t.biz is not null and rownum < {size})
                 where r >= {rownum}
            '''.format(rownum=WechatService._rownum,
                       size=WechatService._rownum + SIZE)

            results = self._db.find(sql)
            if not results:
                WechatService._is_done = True
                WechatService._rownum = 1
                self.__load_todo_account()
            else:
                WechatService._todo_accounts = collections.deque(results)  # convert to a queue
                WechatService._rownum += SIZE

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: returns (biz, whether a full round is done), e.g. (biz, True)
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        next_account_info = WechatService._todo_accounts.popleft()
        next_account_id = next_account_info[2]
        next_account_biz = next_account_info[3]

        next_account = next_account_id, next_account_biz, WechatService._is_done

        # reset the _is_done flag
        WechatService._is_done = False

        return next_account

    def is_exist(self, table, data_id):
        if self._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''
        log.debug('''
            ----- article info -----
            %s''' % tools.dumps_json(article_info))

        self._es.add('wechat_article', article_info,
                     article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            ----- account info -----
            %s''' % tools.dumps_json(account_info))

        self._es.add('wechat_account', account_info,
                     account_info.get('__biz'))
class ExportData():
    INSERT = 1
    UPDATE = 2
    EXCEPTION = 3

    def __init__(self, source_table='', aim_table='', key_map='',
                 unique_key=None, unique_key_mapping_source_key=None,
                 update_read_status=True, condition={'read_status': 0},
                 datas=[], callback='', sync_to_es=False):
        '''
        @summary: constructor
        ---------
        @param source_table: source table (MongoDB)
        @param aim_table: target table
        @param key_map: mapping between target-table keys and source-table keys
            eg: key_map = {
                'aim_key1': 'str_source_key2',   # target key = source value, as str
                'aim_key2': 'int_source_key3',   # target key = source value, as int
                'aim_key3': 'date_source_key4',  # target key = source value, as date
                'aim_key4': 'vint_id',           # target key = literal value, as int
                'aim_key5': 'vstr_name',         # target key = literal value, as str
                'aim_key6': 'vdate_name',        # target key = literal value, as date
                'aim_key7': 'sint_select id from xxx',    # target key = result of the sql, as int
                'aim_key8': 'sstr_select name from xxx',  # target key = result of the sql, as str
                'aim_key9': 'clob_key8',         # target key = source value, as clob
                'aim_key10': 'clob_key8'         # target key = source value, as str
            }
        @param unique_key: key the target database deduplicates on
        @param unique_key_mapping_source_key: source key corresponding to the
            target table's unique key; when given, existing rows in the target
            table are updated
            eg: unique_key_mapping_source_key = {
                'url': 'str_url'  # target key = source value, as str
            }
        @param condition: which rows to export; defaults to read_status = 0
        @param datas: data to export, as [{...}, {...}] or {}; used to import a
            json array directly into the target table. When empty, MongoDB
            data is exported instead.
        @param callback: called once per exported row: callback(execute_type, sql)
            execute_type is ExportData.INSERT / UPDATE / EXCEPTION and sql is
            the executed statement
        ---------
        @result:
        '''
        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        self._condition = condition
        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas
        self._sync_to_es = sync_to_es
        self._callback = callback
        self._is_oracle = False
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key

    def export_to_oracle(self, source_table='', aim_table='', key_map='',
                         unique_key=None, unique_key_mapping_source_key=None,
                         update_read_status=True, condition={'read_status': 0},
                         datas=[], callback='', sync_to_es=False):
        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False

            self._es = ES() if sync_to_es else None
            self._mongodb = MongoDB() if source_table else ''
            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

    def export_to_mysql(self, source_table='', aim_table='', key_map='',
                        unique_key=None, unique_key_mapping_source_key=None,
                        update_read_status=True, condition={'read_status': 0},
                        datas=[], callback=''):
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()

        return self.__export()

    def make_sql(self, data):
        '''
        @summary:
        ---------
        @param data: data dict
        ---------
        @result: returns insert_sql, update_sql when
                 unique_key_mapping_source_key is set, else insert_sql
        '''
        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # split each source key into its type prefix and the actual key
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join(
            aim_keys) + ") values ("
        update_sql = 'update ' + self._aim_table + " set "

        data_json = {}  # used for the ES sync
        values = []
        for i in range(len(keys)):
            if (value_types[i] != 'vint' and value_types[i] != 'vstr'
                    and value_types[i] != 'vdate' and value_types[i] != 'sint'
                    and value_types[i] != 'sstr') and (not data[keys[i]]
                                                       and data[keys[i]] != 0):
                values.append('null')
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = None

            elif value_types[i] == 'str':
                # escape single quotes, otherwise the insert_sql is malformed
                values.append(str(data[keys[i]]).replace("'", "''"))
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'clob':
                text = str(data[keys[i]]).replace("'", "''")
                if not text:
                    insert_sql += "'%s', "
                    values.append(text)
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = None
                else:
                    # split long text into to_clob() chunks of at most 1000 chars
                    values_ = tools.cut_string(text, 1000)
                    clob_text = ''
                    for value in values_:
                        clob_text += "to_clob('%s') || " % value
                    clob_text = clob_text[:-len(' || ')]
                    values.append(clob_text)
                    insert_sql += "%s, "
                    update_sql += aim_keys[i] + " = %s, " % values[-1]
                    data_json[aim_keys[i].upper()] = data[keys[i]]

            elif value_types[i] == 'int':
                if isinstance(data[keys[i]], int) or isinstance(
                        data[keys[i]], float) or isinstance(
                            data[keys[i]], str):
                    values.append(data[keys[i]])
                elif isinstance(data[keys[i]], bool):
                    values.append(data[keys[i]] and 1 or 0)
                else:  # _id
                    values.append(int(str(data[keys[i]])[-6:], 16))
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = eval(values[-1]) if isinstance(
                    values[-1], str) else values[-1]

            elif value_types[i] == 'date':
                # normalize Chinese date units (年/月/日) into '-' separators
                values.append(data[keys[i]].replace('年', '-').replace(
                    '月', '-').replace('日', ''))
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(values[-1])
                                                          if len(values[-1]) <= 10
                                                          else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vint':
                if tools.get_english_words(keys[i]):
                    sql = 'select %s from dual' % keys[i]
                    value = self._aim_db.find(sql)[0][0]
                    values.append(value)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    values.append(keys[i])
                    data_json[aim_keys[i].upper()] = eval(values[-1])
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]

            elif value_types[i] == 'vstr':
                values.append(keys[i])
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vdate':
                values.append(keys[i])
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(values[-1])
                                                          if len(values[-1]) <= 10
                                                          else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sint':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % value
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sstr':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % value
                data_json[aim_keys[i].upper()] = values[-1]

            else:
                error_msg = '%s does not match the key_map format' % value_types[i]
                raise (Exception(error_msg))

        insert_sql = insert_sql[:-2] + ")"
        insert_sql = insert_sql % tuple(values)
        # tools.print(data_json)
        # log.debug(insert_sql)

        if self._unique_key_mapping_source_key:
            # aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]
            # value = tuple(self._unique_key_mapping_source_key.values())[0]
            # temp_var = value.split('_', 1)
            # source_key_types = temp_var[0]
            # source_key = temp_var[1]
            # if source_key_types == 'str':
            #     update_sql = update_sql[:-2] + " where %s = '%s'" % (aim_key, data[source_key])
            # elif source_key_types == 'int':
            #     update_sql = update_sql[:-2] + " where %s = %s" % (aim_key, data[source_key])
            # # log.debug(update_sql)
            return insert_sql, update_sql[:-2], data_json
        else:
            return insert_sql, data_json

    # @tools.run_safe_model(__name__)
    def __export(self):
        if self._unique_key and not self._is_set_unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)
            self._is_set_unique_key = True

        datas = self._mongodb.find(
            self._source_table,
            condition=self._condition) if self._mongodb else (
                self._datas
                if isinstance(self._datas, list) else [self._datas])

        for data in datas:
            if self._unique_key_mapping_source_key:
                insert_sql, update_sql, data_json = self.make_sql(data)
            else:
                insert_sql, data_json = self.make_sql(data)

            # tools.write_file(self._aim_table + '.txt', insert_sql, 'w+')

            def exception_callfunc(e):
                if 'ORA-00001' in str(e):
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})
                else:
                    log.error(insert_sql)

            execute_type = ExportData.EXCEPTION
            sql = ''
            # log.debug(insert_sql)
            if self._aim_db.add(insert_sql, exception_callfunc):
                self._export_count += 1
                sql = insert_sql
                execute_type = ExportData.INSERT
                if self._update_read_status:
                    self._mongodb.update(self._source_table, data,
                                         {'read_status': 1})

            elif self._unique_key_mapping_source_key:
                # look up the id column
                aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]
                value = tuple(self._unique_key_mapping_source_key.values())[0]
                temp_var = value.split('_', 1)
                source_key_types = temp_var[0]
                source_key = temp_var[1]

                select_sql = 'select id from ' + self._aim_table
                if source_key_types == 'str':
                    select_sql = select_sql + " where %s = '%s'" % (
                        aim_key, data[source_key])
                elif source_key_types == 'int':
                    select_sql = select_sql + " where %s = %s" % (
                        aim_key, data[source_key])

                data_id = self._aim_db.find(select_sql)
                if data_id:
                    data_id = data_id[0][0]
                else:
                    continue

                # finish the update statement
                update_sql += " where id = %s" % data_id
                log.debug(update_sql)

                # drop the "id = xxx" assignment from the set clause so the
                # row keeps its ID after the update
                id_info = ''.join(
                    tools.get_info(update_sql, [' id .*?,', ' ID .*?,']))
                update_sql = update_sql.replace(id_info, '')

                # fix up the ID in data_json as well
                if "ID" in data_json.keys():
                    data_json["ID"] = data_id

                # update
                if self._aim_db.update(update_sql):
                    self._update_count += 1
                    sql = update_sql
                    execute_type = ExportData.UPDATE
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})

            # sync to ES
            if self._sync_to_es and execute_type != ExportData.EXCEPTION:
                self._es.add(table=self._aim_table,
                             data=data_json,
                             data_id=data_json.get('ID'))

            if self._callback:
                self._callback(execute_type, sql, data_json)

        log.debug('''
            exported %s rows
            updated  %s rows
            ''' % (self._export_count, self._update_count))

        return self._export_count + self._update_count

    def close(self):
        self._aim_db.close()
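# Hypothetical usage of ExportData, following the key_map conventions
# documented in its constructor: export unread MongoDB docs into an Oracle
# table and mirror them to ES. All table and field names here are
# illustrative, not from this project.
export = ExportData()
export.export_to_oracle(
    source_table='news_article',           # MongoDB collection
    aim_table='TAB_NEWS_ARTICLE',          # Oracle table
    key_map={
        'id': 'int__id',                   # Mongo _id -> int
        'title': 'str_title',              # str column
        'content': 'clob_content',         # CLOB column
        'release_time': 'date_release_time',
        'site_id': 'vint_3'                # literal int value
    },
    unique_key='url',
    sync_to_es=True)
export.close()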
import sys
sys.path.append('..')
import init

import base.constance as Constance
import utils.tools as tools
from utils.log import log
from db.mongodb import MongoDB
from db.elastic_search import ES

db = MongoDB()
es = ES()


def remove_table(tab_list):
    for tab in tab_list:
        db.delete(tab)


def reset_table(tab_list):
    for tab in tab_list:
        db.update(tab, {'status': 3}, {'status': 0})


def add_url(table,
            site_id='',
            url='',
            depth=0,
            remark='',
            status=Constance.TODO,
            title='',
class WechatService():
    _db = OracleDB()
    _es = ES()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # finished one full round
    _is_all_done = False  # all articles published today have been crawled

    # whether wechat_sogou is usable, and when it was last marked blocked
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # whether wechat_public_platform is usable, and when it was last marked blocked
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        if not WechatService._todo_accounts:
            sql = ''
            if not WechatService._is_all_done:
                sql = '''
                    select * from
                        (select rownum r, t.id, t.domain, t.biz, t.name
                           from TAB_IOPM_SITE t
                          where t.biz is not null and mointor_status = 701
                            and (today_msg is null or today_msg = 0)
                            and rownum < {size})
                     where r >= {rownum}
                '''.format(rownum=WechatService._rownum,
                           size=WechatService._rownum + SIZE)
            else:  # every account's articles published today have been crawled
                sql = '''
                    select * from
                        (select rownum r, t.id, t.domain, t.biz, t.name
                           from TAB_IOPM_SITE t
                          where t.biz is not null and mointor_status = 701
                            and rownum < {size})
                     where r >= {rownum}
                '''.format(rownum=WechatService._rownum,
                           size=WechatService._rownum + SIZE)
            print(sql)

            results = WechatService._db.find(sql)
            if not results:
                if WechatService._rownum == 1:
                    # everything published today is crawled; the spider rests until tomorrow
                    WechatService._is_all_done = True  # lets WeichatAction pick the sleep interval
                    # load the next day's accounts
                    self.__load_todo_account()
                else:
                    WechatService._is_done = True
                    WechatService._rownum = 1
                    self.__load_todo_account()
            else:
                WechatService._todo_accounts = collections.deque(results)  # convert to a queue
                WechatService._rownum += SIZE

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: check whether the account has newly published articles
        ---------
        @param account_id:
        @param __biz:
        ---------
        @result:
        '''
        result = ''
        if WechatService._wechat_sogou_enable:  # Sogou Weixin is usable
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # new articles published; crawl them
                pass
            elif result == constance.NOT_UPDATE:
                # no new articles
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                # blocked; record the failure time
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        elif tools.get_current_timestamp(
        ) - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # Sogou Weixin has been disabled for over a day; worth another try
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:  # Sogou Weixin is usable again
                WechatService._wechat_sogou_enable = True
            elif result == constance.NOT_UPDATE:
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                pass

            # refresh the last-checked time
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # if Sogou Weixin is unusable, check via the WeChat public platform instead
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # new articles published; crawl them
                    pass
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked; record the failure time
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

            elif tools.get_current_timestamp(
            ) - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                # disabled for over a day; worth another try
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # usable again; new articles published; crawl them
                    WechatService._wechat_public_platform_enable = True
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked; request failed
                    pass

                # refresh the last-checked time
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        return result

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: returns (biz, whether a full round is done), e.g. (biz, True)
        '''
        while True:
            if not WechatService._todo_accounts:
                self.__load_todo_account()

            next_account_info = WechatService._todo_accounts.popleft()
            next_account_id = next_account_info[2]
            next_account_biz = next_account_info[3]
            next_account_name = next_account_info[4]

            next_account = next_account_id, next_account_biz, WechatService._is_done, WechatService._is_all_done

            if not WechatService._wechat_sogou_enable:
                log.debug('Sogou Weixin is unavailable')
            if not WechatService._wechat_public_platform_enable:
                log.debug('the WeChat public platform is unavailable')

            # skip the new-article check and crawl directly
            if not CHECK_NEW_ARTICLE:
                break

            # both Sogou Weixin and the public platform are blocked
            if not WechatService._wechat_sogou_enable and not WechatService._wechat_public_platform_enable:
                break

            # when the new-article check is on, break out with some probability
            # and crawl via the WeChat client directly, so Sogou Weixin is not
            # used so often that it starts serving captchas
            if random.randint(1, 5) == 1:
                log.debug('breaking out to keep Sogou Weixin from being blocked')
                break

            # check whether the account published anything today
            result = self.is_have_new_article(next_account_id,
                                              next_account_name,
                                              next_account_biz)
            if result == constance.UPDATE:
                break
            elif result == constance.NOT_UPDATE:
                # guard against looping forever when no account has updates:
                # after a full round with nothing new, break out
                if WechatService._is_done:
                    break
                else:
                    # tools.delay_time(5)
                    continue
            elif result == constance.ERROR:
                break
            elif result == constance.VERIFICATION_CODE:
                break
            else:
                # the update check is unavailable; crawl via the client directly
                break

        # reset the _is_done and _is_all_done flags
        WechatService._is_done = False
        WechatService._is_all_done = False

        return next_account

    def update_account_article_num(self, __biz):
        # query es for message counts
        # today
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": tools.get_current_date('%Y-%m-%d') + ' 00:00:00',
                                "lte": tools.get_current_date('%Y-%m-%d') + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # all-time total
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''
        log.debug('''
            ----- article info -----
            title        %s
            release_time %s
            author       %s
            account      %s
            url          %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            ----- account info -----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
class HotSync():
    def __init__(self):
        self._es = ES()

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news released the same day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY * 100))  # share of keywords that must match
                        }
                    }
                }
            },
            "_source": [
                "ID", "TITLE", "CONTENT", "HOT", "CLUES_IDS", "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT"
            ],
            "highlight": {
                "fields": {
                    "TITLE": {}
                }
            }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info):
        article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)
        hots = self._get_today_hots(article_text, release_time)

        # find the most similar hot topic
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first is the most similar

        if similar_hot:  # a similar hot topic exists
            if similar_hot["ID"] != article_info["ID"]:  # avoid comparing a record against itself
                data = {}

                # update the heat
                data["HOT"] = similar_hot["HOT"] + 1

                # update mainstream-media and negative-sentiment counts
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    1 if article_info["IS_VIP"] else 0)
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (
                    1 if article_info['EMOTION'] == 2 else 0)

                # update relevance
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],  # hot-topic id
                        'hot_value': data['HOT'],  # heat value
                        'clues_id': similar_hot['CLUES_IDS'],  # clue ids matched by related articles
                        'article_count': data['HOT'],  # total articles
                        'vip_count': data["VIP_COUNT"],  # mainstream media count
                        'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                        'zero_ids': article_info['ZERO_ID']
                    }
                    result = tools.get_json_by_requests(url, data=data_args)
                    if result:
                        data['WEIGHT'] = result.get('weight', 0)

                # update the hot topic
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # return the hot-topic id
            return similar_hot.get("ID")

        else:  # no similar topic: promote this article to a new hot topic
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID')  # the hot table has no hot_id

            # user-action counters default to zero
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # other values
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0
            hot_info['HOT'] = 1
            hot_info['ID'] = article_info.get("ID")

            self._es.add('tab_iopm_hot_info', hot_info,
                         data_id=hot_info['ID'])

            # return the hot-topic id
            return hot_info['ID']