Example #1
def main():
    oracledb = OracleDB()
    esdb = ES()

    # sql = 'select MSG_ID from TAB_IOPM_USER_ACTION t where action_type=301 and msg_type = 502 and record_time>=sysdate-1'
    # article_ids = oracledb.find(sql)

    article_ids = [8888515, 8888293, 8891299]
    for article_id in article_ids:
        # article_id = article_id[0]

        body = {"WEIGHT": 0}

        print(article_id)
        esdb.update_by_id('tab_iopm_article_info', article_id, body)
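These examples all go through a project-specific ES wrapper whose source is not shown here. For orientation, a minimal sketch of what that wrapper might look like, with method names and signatures inferred from the call sites below (search, get, add, add_batch, update_by_id); the underlying elasticsearch-py calls are an assumption, not the project's actual implementation.

# Hypothetical ES wrapper, reconstructed from how it is called in these examples.
from elasticsearch import Elasticsearch, helpers

class ES:
    def __init__(self, address='127.0.0.1:9200'):
        self._es = Elasticsearch([address])

    def search(self, table, body):
        # callers read the raw response via response['hits']['hits']
        return self._es.search(index=table, body=body)

    def get(self, table, data_id):
        try:
            return self._es.get(index=table, id=data_id)
        except Exception:
            return None

    def add(self, table, data, data_id=None):
        return self._es.index(index=table, id=data_id, body=data)

    def add_batch(self, datas, primary_key, table):
        actions = [{'_index': table, '_id': data[primary_key], '_source': data}
                   for data in datas]
        return helpers.bulk(self._es, actions)

    def update_by_id(self, table, data_id, data):
        # partial update: only the fields present in `data` are changed
        return self._es.update(index=table, id=data_id, body={'doc': data})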
Example #2
class HotWeekSync():
    def __init__(self):
        self._es = ES()
        self._event_filter = EventFilter()
        self._event_filter.start()

    def _get_week_hots(self, text, release_time):
        before_week = tools.get_before_date(release_time, -7)

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # published within the past week
                                "gte": before_week,
                                "lte": release_time
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            # proportion of query terms that must match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            },
            "_source": [
                "ID",
                "TITLE",
                # "CONTENT",
                "HOT",
                "ARTICLE_COUNT",
                "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT",
                "HOT_DAY_IDS",
                "WEIGHT"
            ],
            # "highlight": {
            #     "fields": {
            #         "TITLE": {}
            #     }
            # }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_week_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def cluster_week_hot(self,
                         day_hot,
                         hot_value=None,
                         article_count=None,
                         vip_count=None,
                         negative_emotion_count=None,
                         weight=None):
        '''
        @summary: clustering
        ---------
        @param day_hot: one daily hot-topic record
        @param hot_value: heat of a single article (when set, this daily hot topic is an
                          update the weekly index has already clustered, so only this
                          article's heat should be added)
        @param article_count:
        @param vip_count:
        @param negative_emotion_count:
        @param weight:
        ---------
        @result:
        '''

        article_text = day_hot.get("TITLE")  # + day_hot.get("CONTENT")
        release_time = day_hot.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_week_hots(article_text, release_time)

        # find the most similar weekly hot topic
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first hit is the most similar; no need to compare further

        if similar_hot:  # a similar hot topic was found
            if similar_hot["ID"] != day_hot["ID"]:  # avoid matching the record against itself
                data = {}

                # update the hot topic's heat and article count
                data['HOT'] = similar_hot['HOT'] + (hot_value
                                                    or day_hot.get('HOT'))
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + (
                    article_count or day_hot.get('ARTICLE_COUNT'))

                # update the mainstream-media count and negative-sentiment count
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    vip_count or day_hot.get('VIP_COUNT'))
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + (
                        negative_emotion_count
                        or day_hot.get('NEGATIVE_EMOTION_COUNT'))

                # update relevance
                # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

                # update HOT_DAY_IDS
                if not hot_value:
                    data["HOT_DAY_IDS"] = similar_hot[
                        'HOT_DAY_IDS'] + ',' + day_hot['ID']

                # update the weekly hot topic
                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # return the hot topic id
            return similar_hot.get("ID")
        else:
            # add this record as a new weekly hot topic
            hot_info = deepcopy(day_hot)

            # tag event types
            del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
            text = hot_info['TITLE'] + del_tag_content
            contain_event_ids = self._event_filter.find_contain_event(text)
            hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

            hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

            self._es.add('tab_iopm_hot_week_info',
                         hot_info,
                         data_id=hot_info['ID'])

            # return the hot topic id
            return hot_info['ID']
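The `filtered` query used in `_get_week_hots` (and in the other examples below) is Elasticsearch 1.x/2.x syntax; it was removed in Elasticsearch 5.0. Against a newer cluster the same search would be written with a `bool` query, roughly as in this sketch (`min_similarity` stands in for the MIN_SIMILARITY constant):

# Sketch: the weekly-hot search expressed for Elasticsearch 5.x+,
# where 'filtered' was replaced by 'bool' + 'filter'.
def build_week_hots_body(text, before_week, release_time, min_similarity=0.5):
    return {
        "size": 1,
        "query": {
            "bool": {
                "filter": {
                    "range": {
                        "RELEASE_TIME": {"gte": before_week, "lte": release_time}
                    }
                },
                "must": {
                    "multi_match": {
                        "query": text,
                        "fields": ["TITLE"],
                        "operator": "or",
                        "minimum_should_match": "{percent}%".format(
                            percent=int(min_similarity * 100))
                    }
                }
            }
        },
        "_source": ["ID", "TITLE", "HOT", "ARTICLE_COUNT", "VIP_COUNT",
                    "NEGATIVE_EMOTION_COUNT", "HOT_DAY_IDS", "WEIGHT"]
    }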
Example #3
class ArticleSync():
    def __init__(self, table):
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        self._record_time = tools.get_json(
            tools.read_file(self._sync_time_file)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._yqtj_es = ES(YQTJ)
        self._data_pool_es = ES(DATA_POOL)
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._province_filter = ProvinceFilter()
        # self._event_filter = EventFilter()
        self._table = table
        self._per_record_time_key = '{table}_record_time'.format(
            table=self._table)

        self._vip_checked.start()
        self._compare_keywords.start()
        # self._event_filter.start()

    def get_article_info(self):
        '''
        @summary: return an empty article record with all expected fields
        ---------
        ---------
        @result:
        '''

        article_info = {
            "EMOTION": None,
            "HOST": "",
            "AUTHOR": "",
            "URL": "",
            "WEBSITE_NAME": "",
            "ACCOUNT": "",
            "REVIEW_COUNT": None,
            "KEYWORDS_COUNT": None,
            "RELEASE_TIME": "",
            "CONTENT": "",
            "ID": None,
            "UUID": "",
            "WEIGHT": None,
            "CLUES_IDS": "",
            "UP_COUNT": None,
            "INTERACTION_COUNT": None,
            "RECORD_TIME": None,
            "COMMENT_COUNT": None,
            "IS_VIP": None,
            "INFO_TYPE": None,
            "HOT_ID": None,
            "KEYWORD_CLUES_ID": "",
            "MAY_INVALID": None,
            "TITLE": "",
            "KEYWORDS": "",
            "TRANSMIT_COUNT": None,
            "ZERO_ID": None,
            "FIRST_ID": None,
            "SECOND_ID": None,
            "SUMMARY": "",
            "WORD_CLOUD": "",
            "IMAGE_URL": ""
        }

        return article_info

    def get_article_clues_src(self):
        article_clues_src = {"CLUES_ID": "", "ARTICLE_ID": "", "ID": ""}

        return article_clues_src

    def get_per_record_time(self):
        per_record_time = self._record_time.get(self._per_record_time_key)
        return per_record_time

    def record_now_record_time(self, record_time):
        self._record_time[self._per_record_time_key] = record_time
        tools.write_file(self._sync_time_file,
                         tools.dumps_json(self._record_time))

    def get_article(self):
        '''
        @summary: currently filters on record_time to make sure data comes back; it should really filter on release_time TODO
        ---------
        ---------
        @result:
        '''
        log.debug("Fetching pending articles as the seed set...")

        per_record_time = self.get_per_record_time()
        today_time = tools.get_current_date("%Y-%m-%d")
        min_day_ago = tools.get_before_date(today_time,
                                            -30,
                                            current_date_format='%Y-%m-%d',
                                            return_date_format='%Y-%m-%d')

        if per_record_time:
            # body = {
            #     "size":1500,
            #     "query": {
            #         "filtered": {
            #           "filter": {
            #             "range": {
            #                 "record_time" : {
            #                     "gt": per_record_time
            #                 }
            #             }
            #           }
            #         }
            #     },
            #     "sort":[{"record_time":"asc"}]
            # }

            body = {
                "size": 1500,
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            "record_time": {
                                                "gt": per_record_time
                                            }
                                        }
                                    },
                                    {
                                        "range": {
                                            "release_time": {
                                                "gte": min_day_ago +
                                                ' 00:00:00',  # 30日前
                                                "lte":
                                                today_time + ' 23:59:59'  # 今日
                                            }
                                        }
                                    }
                                ]
                            }
                        }
                    }
                },
                "sort": [{
                    "record_time": "asc"
                }]
            }

        else:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "release_time": {
                                    "gte": three_day_ago + ' 00:00:00',  # 三日前
                                    "lte": today_time + ' 23:59:59'  # 今日
                                }
                            }
                        }
                    }
                },
                "size": 1500,
                "sort": [{
                    "record_time": "asc"
                }]
            }

        log.debug(self._table + " => " + tools.dumps_json(body))

        article = self._data_pool_es.search(self._table, body)
        return article.get('hits', {}).get('hits', [])

    def deal_article(self, article_list):
        '''
        @summary: process articles
        ---------
        @param article_list:
        ---------
        @result:
        '''
        article_infos = []
        # fill in the remaining fields
        for article_info in article_list:
            # print(tools.dumps_json(article_info))
            # interaction count = likes + reposts + reviews + comments
            article_info['INTERACTION_COUNT'] = (
                article_info['UP_COUNT']
                or 0) + (article_info['TRANSMIT_COUNT']
                         or 0) + (article_info['REVIEW_COUNT']
                                  or 0) + (article_info['COMMENT_COUNT'] or 0)

            # if the article already exists, only update its interaction counts
            if self._yqtj_es.get('tab_iopm_article_info', article_info["ID"]):
                log.debug('%s already exists' % article_info['TITLE'])
                data = {
                    "INTERACTION_COUNT": article_info['INTERACTION_COUNT'],
                    "UP_COUNT": article_info['UP_COUNT'],
                    "TRANSMIT_COUNT": article_info['TRANSMIT_COUNT'],
                    "REVIEW_COUNT": article_info['REVIEW_COUNT'],
                    "COMMENT_COUNT": article_info['COMMENT_COUNT']
                }

                # update the existing record
                self._yqtj_es.update_by_id("tab_iopm_article_info",
                                           data_id=article_info.get("ID"),
                                           data=data)
                continue

            # title + content text
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content
            # print(text)

            # region filtering
            contain_airs = ','.join(
                self._province_filter.find_contain_air(text))
            weight_factor = 1  # weight factor
            if not contain_airs and PROVINCE:
                # log.debug('%s does not mention a local place name, skip' % article_info['TITLE'])
                weight_factor = 0.01  # not local, so scale down; weight = weight * weight_factor

            # match against clue keywords
            keywords, clues_ids, zero_ids, first_ids, second_ids, keyword_clues = self._compare_keywords.get_contained_keys(
                text)

            article_info[
                'KEYWORDS'] = keywords + ',' + contain_airs if keywords else contain_airs
            article_info['KEYWORDS'] = ','.join(
                set(article_info['KEYWORDS'].split(',')))
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_ids
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # # clue <-> article mapping table
            # article_clues_srcs = []
            # if clues_ids:
            #     for clues_id in clues_ids.split(','):
            #         article_clues_src = self.get_article_clues_src()
            #         article_clues_src['ID'] =  tools.get_uuid(clues_id, article_info['ID'])
            #         article_clues_src['CLUES_ID'] =  clues_id
            #         article_clues_src['ARTICLE_ID'] = article_info['ID']

            #         article_clues_srcs.append(article_clues_src)
            #         self._yqtj_es.add_batch(article_clues_srcs, "ID", 'tab_iopm_article_clues_src')

            # word cloud
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # summary
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # sentiment analysis (ours: 1 positive, 2 negative, 3 neutral; Baidu: 0 negative, 1 neutral, 2 positive)
            emotion = self._emotion.get_emotion(article_info['SUMMARY'])
            if emotion == 0:
                emotion = 2

            elif emotion == 1:
                emotion = 3

            elif emotion == 2:
                emotion = 1

            else:
                emotion = 3

            article_info['EMOTION'] = emotion

            # mainstream media
            is_vip, zero_id, first_id, second_id = self._vip_checked.is_vip(
                article_info['HOST'], article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip
            if is_vip:
                article_info['ZERO_ID'] = article_info[
                    'ZERO_ID'] + ',' + zero_id if article_info[
                        'ZERO_ID'] else zero_id
                article_info['FIRST_ID'] = article_info[
                    'FIRST_ID'] + ',' + first_id if article_info[
                        'FIRST_ID'] else first_id
                article_info['SECOND_ID'] = article_info[
                    'SECOND_ID'] + ',' + second_id if article_info[
                        'SECOND_ID'] else second_id

            # compute relevance via the IOPM service
            url = IOPM_SERVICE_ADDRESS + 'related_sort'
            data = {
                'article_id': article_info['ID'],  # article id
                'clues_ids': article_info['CLUES_IDS'],  # clue ids
                'may_invalid': 0,  # possibly invalid (Weibo posts containing @ or #)
                'vip_count': article_info['IS_VIP'],  # mainstream-media count
                'negative_emotion_count':
                1 if article_info['EMOTION'] == 2 else 0,  # negative-sentiment count
                'zero_ids': article_info['ZERO_ID']
            }

            result = tools.get_json_by_requests(url, data=data)
            article_info['WEIGHT'] = result.get('weight', 0) * weight_factor

            # cluster similar articles into hot topics
            if article_info['INFO_TYPE'] == 3:  # Weibo
                article_info['TITLE'] = article_info['SUMMARY'][:30]

            article_info['HOT_ID'] = self._hot_sync.get_hot_id(
                article_info, contain_airs, weight_factor)

            log.debug('''
                title             %s
                release_time      %s
                record_time       %s
                url               %s
                matched keywords  %s
                clue ids          %s
                level-1 category  %s
                level-2 category  %s
                level-3 category  %s
                keyword-clue map  %s
                regions           %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info['RECORD_TIME'], article_info["URL"],
                       keywords, clues_ids, zero_ids, first_ids, second_ids,
                       keyword_clues, contain_airs))

            # print(tools.dumps_json(article_info))
            article_infos.append(article_info)

            # print('inserting article')
            # self._yqtj_es.add('tab_iopm_article_info', article_info, article_info["ID"])

        # bulk insert articles
        print('bulk inserting articles, size = %s' % len(article_infos))
        # print(tools.dumps_json(article_infos))
        self._yqtj_es.add_batch(article_infos, "ID", 'tab_iopm_article_info')
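The driver for this class is not shown in the example; presumably something polls get_article, converts the raw ES hits into the uppercase article_info structure, and records the new sync position. A rough sketch of such a loop, assuming the same `tools` helper module; the field-mapping step and the sleep interval are guesses, not the project's actual code.

# Hypothetical polling loop around ArticleSync; the _source-to-article_info
# mapping is assumed, since the example does not show it.
def run(table, sleep_time=60):
    sync = ArticleSync(table)
    while True:
        hits = sync.get_article()
        if not hits:
            tools.delay_time(sleep_time)
            continue

        article_list = []
        record_time = None
        for hit in hits:
            source = hit.get('_source', {})
            info = sync.get_article_info()
            # map the lowercase data-pool fields onto the uppercase article structure
            for key in info:
                info[key] = source.get(key, source.get(key.lower(), info[key]))
            article_list.append(info)
            record_time = source.get('record_time') or record_time

        sync.deal_article(article_list)
        if record_time:
            sync.record_now_record_time(record_time)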
Example #4
class HotSync():
    def __init__(self):
        self._es = ES()
        self._hot_week_sync = HotWeekSync()
        self._cut_text = CutText()
        self._cut_text.set_stop_words('utils/stop_words.txt')

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]

        body = {
            "size": 1,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # news published on the same day
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query": text,
                            "fields": ["TITLE"],
                            "operator": "or",
                            # proportion of query terms that must match
                            "minimum_should_match": "{percent}%".format(
                                percent=int(MIN_SIMILARITY * 100))
                        }
                    }
                }
            }  # ,
            # "_source": [
            #     "ID",
            #     "TITLE",
            #     # "CONTENT",
            #     "RELEASE_TIME",
            #     "WEIGHT",
            #     "HOT",
            #     "ARTICLE_COUNT",
            #     "CLUES_IDS",
            #     "VIP_COUNT",
            #     "NEGATIVE_EMOTION_COUNT"
            # ],
            # "highlight": {
            #     "fields": {
            #         "TITLE": {}
            #     }
            # }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])


    def get_hot_id(self, article_info, positions, weight_factor):
        '''
        @summary: clustering
        ---------
        @param article_info:
        ---------
        @result:
        '''
        # weight_factor = 1

        article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_today_hots(article_text, release_time)

        # find the most similar hot topic
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first hit is the most similar; no need to compare further

        if similar_hot:  # a similar hot topic was found
            if similar_hot["ID"] != article_info["ID"]:  # avoid matching the article against itself
                data = {}

                # update the hot topic's heat and article count
                data['HOT'] = similar_hot['HOT'] + INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1

                # update the mainstream-media count and negative-sentiment count
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + article_info["IS_VIP"]
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot['NEGATIVE_EMOTION_COUNT'] + (1 if article_info['EMOTION'] == 2 else 0)

                weight_temp = 0  # difference between the weight before and after the update
                # update relevance
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],  # hot topic id
                        'hot_value': data['HOT'],  # heat value
                        'clues_ids': similar_hot['CLUES_IDS'],  # clue ids matched by related articles
                        'article_count': data['ARTICLE_COUNT'],  # total article count
                        'vip_count': data["VIP_COUNT"],  # mainstream-media count
                        'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                        'zero_ids': article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data=data_args)
                    weight_temp = similar_hot['WEIGHT'] - result.get('weight', 0)
                    data['WEIGHT'] = result.get('weight', 0) * weight_factor

                # update the daily hot topic
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)
                # sync to the 7-day hot topics
                self._hot_week_sync.cluster_week_hot(
                    similar_hot,
                    hot_value=INFO_WEIGHT.get(article_info["INFO_TYPE"], 0),
                    article_count=1,
                    vip_count=article_info["IS_VIP"],
                    negative_emotion_count=1 if article_info['EMOTION'] == 2 else 0,
                    weight=weight_temp)


            # return the hot topic id
            return similar_hot.get("ID")
        else:
            # add this article as a new hot topic
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID')  # the hot-topic table has no HOT_ID column

            # user-action counters default to zero
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # other fields
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0

            hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
            hot_info['ID'] = article_info.get("ID")
            hot_info['ARTICLE_COUNT'] = 1
            hot_info['HOT_KEYWORDS'] = ','.join(self._cut_text.cut_for_keyword(article_info["TITLE"]))  # keywords; could be faster, the text was already segmented during the similarity check TODO
            hot_info['POSITIONS'] = positions
            hot_info['EVENT_IDS'] = ''  # event types (not needed for daily hot topics TODO | already added for weekly ones)

            self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])
            # sync to the 7-day hot topics
            self._hot_week_sync.cluster_week_hot(hot_info)

            # return the hot topic id
            return hot_info['ID']
Example #5
class UpdateWeight():
    """docstring for UpdateWeight"""
    def __init__(self):
        self._yqtj_es = ES(YQTJ)

    def get_articles(self, table, record_time, release_time_begin,
                     release_time_end):
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "range": {
                                        "RECORD_TIME": {  # 查询大于该csr_res_id 的信息
                                            "gt": record_time
                                        }
                                    }
                                },
                                {
                                    "range": {
                                        "RELEASE_TIME": {
                                            "gte": release_time_begin,
                                            "lte": release_time_end
                                        }
                                    }
                                }
                            ]
                        }
                    }
                }
            },
            "size": 1500,
            "sort": [{
                "RECORD_TIME": "asc"
            }]
        }

        print(tools.dumps_json(body))

        article = self._yqtj_es.search(table, body)
        return article.get('hits', {}).get('hits', [])

    def update_article_weight(self, articles):
        release_time = ''
        record_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'article_id': article_info['ID'],  # article id
                'clues_ids': article_info['CLUES_IDS'],  # clue ids
                'may_invalid': 0,  # possibly invalid (Weibo posts containing @ or #)
                'vip_count': article_info['IS_VIP'],  # mainstream-media count
                'negative_emotion_count': article_info['EMOTION'],  # negative-sentiment count
                'zero_ids': article_info['ZERO_ID']
            }
            print(article_info["TITLE"])
            print(article_info["RELEASE_TIME"])

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor is not applied; region not taken into account
            tools.print_one_line("updating relevance %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_article_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                release_time, record_time = article_info[
                    "RELEASE_TIME"], article_info["RECORD_TIME"]

        return release_time, record_time

    def update_hot_weight(self, articles):
        record_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'hot_id': article_info['ID'],  # hot topic id
                'hot_value': article_info['HOT'],  # heat value
                'clues_ids': article_info['CLUES_IDS'],  # clue ids matched by related articles
                'article_count': article_info['ARTICLE_COUNT'],  # total article count
                'vip_count': article_info["VIP_COUNT"],  # mainstream-media count
                'negative_emotion_count':
                article_info["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                'zero_ids': article_info['ZERO_ID']
            }
            print('''
                release_time %s
                record_time  %s
                ''' %
                  (article_info["RELEASE_TIME"], article_info["RECORD_TIME"]))

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor is not applied; region not taken into account
            tools.print_one_line("updating relevance %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_hot_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                record_time = article_info['RECORD_TIME']

        return record_time
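As with ArticleSync, the code that drives UpdateWeight is not shown. A plausible driver polls get_articles in batches and feeds the hits to update_article_weight (and similarly to update_hot_weight); the table name, time window, and starting record_time below are placeholders, not values from the project.

# Hypothetical driver for UpdateWeight; table name and window are assumptions.
if __name__ == '__main__':
    updater = UpdateWeight()

    record_time = '2018-01-01 00:00:00'          # assumed starting point
    release_time_begin = '2018-01-01 00:00:00'   # assumed window start
    release_time_end = '2018-01-31 23:59:59'     # assumed window end

    # recompute article weights in batches of 1500 until the range is exhausted
    while True:
        articles = updater.get_articles('tab_iopm_article_info', record_time,
                                        release_time_begin, release_time_end)
        if not articles:
            break
        _, record_time = updater.update_article_weight(articles)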
Example #6
class NewsCluster():
    def __init__(self):
        self._es = ES()
        self._current_csr_res_id = tools.read_file(STO_CURRENT_ID_FILE)
        self._current_csr_res_id = self._current_csr_res_id and int(
            self._current_csr_res_id) or 0

    def _get_same_day_hots(self, text, start_time):
        news_day_time = start_time[:start_time.find(' ')]
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "start_time": {
                                "gte": news_day_time + ' 00:00:00',
                                'lte': news_day_time + ' 59:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query":
                            text,
                            "fields": ["csr_content"],
                            "operator":
                            "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY *
                                                            100))  # 匹配到的关键词占比
                        }
                    }
                }
            },
            "_source": ["hot_id", "csr_res_ids", "csr_content", 'hot'],
            "highlight": {
                "fields": {
                    "csr_content": {}
                }
            }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_news_csr_hot', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def _save_current_id(self):
        '''
        @summary: save the last processed id so the next run can resume from it
        ---------
        ---------
        @result:
        '''

        tools.write_file(STO_CURRENT_ID_FILE, str(self._current_csr_res_id))

    def deal_news(self):
        '''
        @summary: fetch rows from tab_news_csr_result
        ---------
        ---------
        @result:
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # 查询大于该csr_res_id 的信息
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{
                    "csr_res_id": "asc"
                }]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                log.debug(
                    'no csr_res_id greater than %s in tab_news_csr_result\nsleep %s...'
                    % (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    processing tab_news_csr_result
                    csr_res_id  %s
                    start_time  %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # find similar articles
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)

                # iterate over candidate hot topics and compare similarity
                for hot_info in hots:
                    hot = hot_info.get('_source')
                    hot_text = hot.get('csr_content')

                    temp_similarity = compare_text(csr_content, hot_text)
                    if temp_similarity > MIN_SIMILARITY:
                        similar_hot = hot

                    break  # hots are sorted by match score, so the first hit is the most similar; no need to compare further

                # if a similar hot topic is found, append the csr_res_id and bump hot; otherwise make this row a new hot topic
                if similar_hot:  # a similar hot topic was found
                    log.debug('matched hot topic: %s' % similar_hot.get('csr_content'))

                    data = {}

                    # bump the hot topic's heat and append the article id
                    data["hot"] = similar_hot["hot"] + 1
                    data["csr_res_ids"] = similar_hot[
                        "csr_res_ids"] + ',' + csr_res_id

                    # update the hot topic
                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:  # no similar hot topic found; make the current article a new hot topic
                    log.debug('no matching hot topic')

                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot',
                                 hot_info,
                                 data_id=csr_res_id)

                # save the current id
                self._current_csr_res_id = csr_res_id
                self._save_current_id()
Example #7
class HotSync():
    def __init__(self):
        self._es = ES()

    def _get_today_hots(self, text, release_time):
        release_day = release_time[:release_time.find(' ')]

        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "RELEASE_TIME": {  # 当日发布的新闻
                                "gte": release_day + ' 00:00:00',
                                "lte": release_day + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        "multi_match": {
                            "query":
                            text,
                            "fields": ["TITLE"],
                            "operator":
                            "or",
                            "minimum_should_match":
                            "{percent}%".format(percent=int(MIN_SIMILARITY *
                                                            100))  # 匹配到的关键词占比
                        }
                    }
                }
            },
            "_source": [
                "ID", "TITLE", "CONTENT", "HOT", "CLUES_IDS", "VIP_COUNT",
                "NEGATIVE_EMOTION_COUNT"
            ],
            "highlight": {
                "fields": {
                    "TITLE": {}
                }
            }
        }

        # results are sorted by match score by default
        hots = self._es.search('tab_iopm_hot_info', body)
        # print(tools.dumps_json(hots))

        return hots.get('hits', {}).get('hits', [])

    def get_hot_id(self, article_info):
        article_text = article_info.get(
            "TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_today_hots(article_text, release_time)

        # find the most similar hot topic
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first hit is the most similar; no need to compare further

        if similar_hot:  # a similar hot topic was found
            if similar_hot["ID"] != article_info["ID"]:  # avoid matching the article against itself
                data = {}

                # bump the hot topic's heat
                data["HOT"] = similar_hot["HOT"] + 1

                # update the mainstream-media count and negative-sentiment count
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    1 if article_info["IS_VIP"] else 0)
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + (1 if article_info['EMOTION']
                                                 == 2 else 0)

                # update relevance
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],  # hot topic id
                        'hot_value': data['HOT'],  # heat value
                        'clues_id': similar_hot['CLUES_IDS'],  # clue ids matched by related articles
                        'article_count': data['HOT'],  # total article count
                        'vip_count': data["VIP_COUNT"],  # mainstream-media count
                        'negative_emotion_count':
                        data["NEGATIVE_EMOTION_COUNT"],  # negative-sentiment count
                        'zero_ids': article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data=data_args)
                    if result:
                        data['WEIGHT'] = result.get('weight', 0)

                # update the hot topic
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # return the hot topic id
            return similar_hot.get("ID")
        else:
            # add this article as a new hot topic
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID')  # the hot-topic table has no HOT_ID column

            # user-action counters default to zero
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            # other fields
            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info[
                'EMOTION'] == 2 else 0

            hot_info['HOT'] = 1
            hot_info['ID'] = article_info.get("ID")

            self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])

            # return the hot topic id
            return hot_info['ID']
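All of the clustering examples rely on a compare_text(text_a, text_b) helper that returns a similarity score checked against MIN_SIMILARITY; its implementation is not shown. Purely as an illustration of the interface, a stand-in using the standard library could look like this (the real project most likely segments the Chinese text and uses a vector similarity instead):

# Illustrative stand-in for compare_text; not the project's actual metric.
import difflib

MIN_SIMILARITY = 0.5  # assumed threshold; the real value comes from project config

def compare_text(text_a, text_b):
    """Return a similarity score in [0, 1] for two strings."""
    if not text_a or not text_b:
        return 0
    return difflib.SequenceMatcher(None, text_a, text_b).ratio()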