def get_clues():
    '''
    @summary: Dump the clues of TAB_IOPM_CLUES (all rows whose zero_id is not 7)
    to clues/clues.csv and return them as a dict
    ---------
    ---------
    @result: {"message": ..., "status": 1, "data": [...]} where every item of
             "data" carries the keys 线索id / 包含 / 不包含 / 线索
    '''

    def clean_keywords(raw):
        # Normalise one keyword column. A value ending with ',' loses that
        # trailing comma, has '"' replaced by '“' and '、' removed; any other
        # value only has '"' removed. Empty / NULL values become "".
        if not raw:
            return ""
        if raw[-1] == ',':
            return "%s" % raw.replace('"', '“').replace('、', '')[:-1]
        return "%s" % raw.replace('"', '')

    db = OracleDB()
    sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.name  from TAB_IOPM_CLUES t where zero_id != 7'  # zero_id 7 is the "propagation channel" category
    results = db.find(sql)

    clues_json = {"message": "查询成功", "status": 1, "data": []}

    # The context manager closes the csv file even when a row raises half way
    with open('clues/clues.csv', 'w+', encoding='utf8') as csv_file:
        csv_file.write("线索,关键词\n")

        for result in results:
            print(result)
            data = {
                "线索id": result[0] if result[0] else "",
                "包含": clean_keywords(result[1]),
                "不包含": clean_keywords(result[2]),
                "线索": result[3] if result[3] else "",
            }

            print(data)
            clues_json["data"].append(data)
            csv_file.write('"%s","%s"\n' % (data['线索'], data['包含']))

    # NOTE(review): `start` looks like the Windows shell command that opens the
    # output folder - confirm before running this on another platform
    os.system('start clues\\')

    return clues_json
示例#2
0
    def __init__(self):
        '''
        @summary: Prepare the vip site cache; an instance that already has
        `_vip_sites` is left untouched
        '''
        super(VipChecked, self).__init__()

        # Already initialised: do not reconnect or reload
        if hasattr(self, '_vip_sites'):
            return

        self._vip_sites = set()

        self._oracledb = OracleDB()

        self.load_vip_site()
示例#3
0
class SyncES():
    '''
    @summary: Copies the rows of Oracle tables into ES in batches and keeps,
    per table, the id of the last exported row in STO_MAX_ID_FILE
    '''

    def __init__(self):
        self._es = ES()
        self._db = OracleDB()

        # NOTE(review): the checkpoint file content is fed to eval(), which
        # executes whatever the file contains - the file must stay trusted
        self._max_id = tools.read_file(STO_MAX_ID_FILE)
        self._max_id = self._max_id and eval(self._max_id) or {}

    def get_data(self, sql):
        '''
        @summary: Run `sql` on Oracle and return the rows (to_json=True)
        '''
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        '''
        @summary: Add one document to ES under `table` with id `data_id`
        '''
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: Export every row of `table` whose id is greater than the
        stored checkpoint; the id column has to be the primary key
        ---------
        @param table: name of the Oracle table, also used as the ES table
        @param step: number of rows fetched per query
        ---------
        @result:
        '''

        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)

        while True:
            # NOTE(review): Oracle evaluates `rownum <= n` before `order by`,
            # so a batch is not guaranteed to hold the n lowest ids - confirm
            # whether an ordered sub-query is needed here
            inner_sql = 'select * from %s where id > %d and rownum <= %d order by id' % (
                table, max_id, step)
            # Was `sync_es.get_data(...)`: a module-level name that does not
            # exist inside this class
            datas = self.get_data(inner_sql)

            if not datas:
                break

            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                print(data)
                print(data_id)

                max_id = data_id

                self.export_to_es(table, data, data_id)

        # Record the new checkpoint first, then persist it; persisting before
        # the update wrote the old value to disk
        self._max_id[table] = max_id
        self.close()

    def close(self):
        '''
        @summary: Write the checkpoint dict to STO_MAX_ID_FILE
        '''
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))
示例#4
0
    def __init__(self, province_name=PROVINCE):
        '''
        @summary: Collect the place names that belong to `province_name`
        ---------
        @param province_name: province to load; when empty nothing is loaded
        ---------
        @result:
        '''
        self._province_airs = []

        if province_name:
            self._db = OracleDB()
            self._province_airs.append(province_name)

            province_id = self.load_province_id(province_name)
            if province_id:
                # first column of every area row, then of every town row
                for area_row in self.load_province_air(province_id):
                    self._province_airs.append(area_row[0])
                for town_row in self.load_province_town(province_id):
                    self._province_airs.append(town_row[0])

        print(self._province_airs)
示例#5
0
class ProvinceFilter():
    '''
    @summary: Holds a list of place names and reports which of them occur in
    a text
    '''

    def __init__(self, province_name=PROVINCE):
        '''
        @summary: Load the place names
        ---------
        @param province_name: province whose name and areas are loaded; when
                              empty every province name is loaded instead
        ---------
        @result:
        '''
        self._province_airs = []
        self._db = OracleDB()

        if not province_name:
            # whole country: every province name
            for province_row in self.load_province():
                self._province_airs.append(province_row[0])
        else:
            self._province_airs.append(province_name)
            province_id = self.load_province_id(province_name)
            if province_id:
                for area_row in self.load_province_air(province_id):
                    self._province_airs.append(area_row[0])

        print(self._province_airs)

    def load_province_id(self, province_name):
        '''
        @summary: Look up the id of the first province whose name contains
        `province_name`
        ---------
        @result: the id, or None when no row matches
        '''
        rows = self._db.find(
            "select t.id from TAB_MANAGE_PROVINCE_INFO t where t.province_name like '%{province_name}%'".format(
                province_name=province_name))

        if rows:
            province_id = rows[0][0]
        else:
            province_id = None

        if not province_id:
            log.debug('TAB_MANAGE_PROVINCE_INFO 无 %s 省份' % province_name)

        return province_id

    def load_province(self):
        '''
        @summary: Return the province_name rows of TAB_MANAGE_PROVINCE_INFO
        '''
        return self._db.find(
            "select province_name from TAB_MANAGE_PROVINCE_INFO")

    def load_province_air(self, province_id):
        '''
        @summary: Return the area_name rows of the given province
        '''
        return self._db.find(
            "select t.area_name from TAB_MANAGE_AREA_INFO t where t.province_id = %s" % province_id)

    def load_province_town(self, province_id):
        '''
        @summary: Return the town_name rows of the given province
        '''
        return self._db.find(
            "select t.town_name from TAB_MANAGE_TOWN_INFO t where t.province_id = %s" % province_id)

    def find_contain_air(self, text):
        '''
        @summary: Return the loaded place names that occur in `text`, without
        duplicates
        '''
        matched = [name for name in self._province_airs if name in text]
        return list(set(matched))
def main():
    '''
    @summary: Read the active search tasks of type 702 from Oracle and run the
    weibo user spider over them; does nothing when there is no task
    '''
    oracle_db = OracleDB()
    mongodb = MongoDB()

    task_sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time  and search_type = 702'
    result_list = oracle_db.find(task_sql, fetch_one=False)
    if not result_list:
        log.debug('无任务 结束')
        return

    def on_begin():
        log.info('\n********** WWA_weibo_user begin **********')
        mongodb.delete('WWA_weibo_user_urls')

    def on_end():
        # Export the collected data: target column -> source key
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE': 'vint_702',
            'region': 'str_area',
        }

        exporter = ExportData('WWA_weibo_user_info', 'tab_mvms_weibo_info',
                              key_map, 'account_url')
        exporter.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    # Configure the spider
    spider = Spider(
        tab_urls='WWA_weibo_user_urls',
        tab_site='WWA_site_info',
        tab_content='WWA_weibo_user_info',
        parser_count=1,
        begin_callback=on_begin,
        end_callback=on_end,
        parser_params={'result_list': result_list},
    )

    # Register the parser and go
    spider.add_parser(weibo_user_parser)
    spider.start()
示例#7
0
class VipChecked(Singleton):
    '''
    @summary: Keeps the site names stored in the keyword2 column of the
    TAB_IOPM_CLUES rows with zero_id = 7 and checks texts against them
    '''

    def __init__(self):
        super(VipChecked, self).__init__()
        # NOTE(review): presumably Singleton hands back the same object on
        # every construction, hence the guard against loading twice - confirm
        if not hasattr(self, '_vip_sites'):
            self._vip_sites = set()

            self._oracledb = OracleDB()

            self.load_vip_site()

    def load_vip_site(self):
        '''
        @summary: Fill `_vip_sites` from the comma separated keyword2 values
        '''
        sql = 'select to_char(t.keyword2) from TAB_IOPM_CLUES t where zero_id = 7'
        rows = self._oracledb.find(sql) or []
        for row in rows:
            raw_sites = row[0]
            if not raw_sites:
                # NULL / empty keyword2: nothing to split
                continue

            # Skip the empty pieces produced by leading / trailing / double commas
            self._vip_sites.update(
                name for name in raw_sites.split(',') if name)

    def is_vip(self, content):
        '''
        @summary: Tell whether `content` contains, or is contained in, one of
        the vip site names
        ---------
        @param content: text to check; empty / None is never vip
        ---------
        @result: 1 when a site matches, otherwise 0
        '''
        if not content:
            return 0

        return int(
            any((site in content) or (content in site)
                for site in self._vip_sites))
示例#8
0
class Keywords():
    '''
    @summary: Loads the clues of TAB_IOPM_CLUES (zero_id != 7) and flattens
    their keyword2 column into a keyword list
    '''

    def __init__(self):
        self._oracledb = OracleDB()
        self._clues = self.get_clues()

    def get_clues(self):
        '''
        @summary: Return the clue rows
        (id, keyword2, keyword3, zero_id, first_id, second_id)
        '''
        sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id  from TAB_IOPM_CLUES t where zero_id != 7'  # zero_id 7 is the "propagation channel" category
        clues = self._oracledb.find(sql)
        return clues

    def get_keywords(self):
        '''
        @summary: Return every keyword of every clue's keyword2 column, with
        '&' replaced by a space (e.g. '总理&主席' -> '总理 主席')
        ---------
        @result: list of str, in clue order
        '''
        keywords = []

        for clue in self._clues:
            # only keyword2 (column 1) is used; format_keywords splits it into
            # single keys such as ['新闻节目', '总理&主席', 'the xi factor']
            for key in format_keywords(clue[1]):
                keywords.append(key.replace('&', ' '))

        return keywords
示例#9
0
    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition=None,
                         datas=None,
                         callback='',
                         sync_to_es=False):
        '''
        @summary: Store the export settings (only when `aim_table` is given),
        connect to Oracle and run the export
        ---------
        @param source_table: source table name; a MongoDB connection is opened
                             when it is set and the target table changed
        @param aim_table: target table name; when empty the settings already
                          stored on the instance are kept
        @param key_map: mapping handed to the export
        @param unique_key: stored as-is for the export
        @param unique_key_mapping_source_key: stored as-is for the export
        @param update_read_status: forced to False whenever `datas` is given
        @param condition: defaults to {'read_status': 0}
        @param datas: defaults to an empty list
        @param callback: stored as-is for the export
        @param sync_to_es: stored as-is for the export
        ---------
        @result: whatever the private __export() returns
        '''
        # Fresh objects per call: literal dict / list defaults would be shared
        # by every call that relies on them
        if condition is None:
            condition = {'read_status': 0}
        if datas is None:
            datas = []

        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False
                self._es = ES() if sync_to_es else ''
                self._mongodb = MongoDB() if source_table else ''

            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            # NOTE(review): this discards the ES() client created a few lines
            # above when the target table changed - confirm that is intended
            self._es = None

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()
示例#10
0
def main():
    '''
    @summary: Dump id, keyword1..3 and zero_id of every TAB_IOPM_CLUES row as
    json to ./clues.txt (and print it)
    '''

    def tidy(raw):
        # A value ending with ',' loses that comma, gets '"' turned into '“'
        # and '、' removed; any other value only loses '"'. Empty -> "".
        if not raw:
            return ""
        if raw[-1] == ',':
            return "%s" % raw.replace('"', '“').replace('、', '')[:-1]
        return "%s" % raw.replace('"', '')

    oracle = OracleDB()
    rows = oracle.find(
        'select t.id clues_id,to_char(t.keyword1),to_char(t.keyword2),to_char(t.keyword3),t.zero_id  from TAB_IOPM_CLUES t'
    )

    payload = {"message": "查询成功", "status": 1, "data": []}

    for row in rows:
        payload["data"].append({
            "clues_id": row[0] if row[0] else "",
            "keyword1": tidy(row[1]),
            "keyword2": tidy(row[2]),
            "keyword3": tidy(row[3]),
            "zero_id": row[4] if row[4] else "",
        })

    payload = tools.dumps_json(payload)
    print(payload)

    tools.write_file('./clues.txt', payload)
def main():
    '''
    @summary: Read the active search keywords of type 703 from Oracle and run
    the app-store spiders over them; does nothing when there is no task
    '''
    mongo = MongoDB()
    oracle_db = OracleDB()

    def on_begin():
        mongo.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def on_end():
        log.info('\n********** wwa end **********')
        export_data.main()

    result_list = oracle_db.find(
        'select keyword from TAB_MVMS_SEARCH_INFO where  MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate and search_type=703'
    )
    if not result_list:
        log.debug('无任务 结束')
        return

    # Every row holds one comma separated keyword string
    keywords = [
        word for row in result_list for word in row[0].split(',')
    ]

    # Configure the spider
    spider = Spider(
        tab_urls='WWA_search_app_urls',
        tab_site='WWA_search_app_site_info',
        tab_content='WWA_search_app_content_info',
        content_unique_key='title',
        begin_callback=on_begin,
        end_callback=on_end,
        parser_params={'keywords': keywords},
    )

    # Register the parsers
    for parser in (yingyongbao_parser, android_market_parser,
                   baidu_mobile_assistant_parser, mobile360_assistant_parser):
        spider.add_parser(parser)

    spider.start()
示例#12
0
def main():
    '''
    @summary: Reset live_view of every anchor in Oracle, then export the
    anchor records and the violation records from LiveApp_anchor_info
    '''
    oracle_db = OracleDB()
    oracle_db.update('update  tab_nbsp_anchor_info t set t.live_view = 0')

    # Export 1: anchor information
    anchor_key_map = {
        'id': 'vint_sequence.nextval',
        'room_id': 'int_room_id',
        'name': 'str_name',
        'sex': 'int_sex',
        'age': 'int_age',
        'address': 'str_address',
        'image_url': 'str_image_url',
        'fans_count': 'int_fans_count',
        'watched_count': 'int_watched_count',
        'room_url': 'str_room_url',
        'video_path': 'str_video_path',
        'site_id': 'int_site_id',
        'record_time': 'date_record_time',
        'live_view': 'int_live_view',
        'monitor_status': 'vint_401',
        'json_data_url': 'str_watched_count_url',
    }

    anchor_exporter = ExportData(
        source_table='LiveApp_anchor_info',
        aim_table='tab_nbsp_anchor_info',
        key_map=anchor_key_map,
        unique_key='room_id',
        update_read_status=False,
        unique_key_mapping_source_key={'room_id': 'int_room_id'},
    )
    anchor_exporter.export_to_oracle()

    # Export 2: violation information (unread records with violate_content)
    violate_key_map = {
        'id': 'vint_sequence.nextval',
        'TASK_ID': 'int_task_id',
        'ANCHOR_ID': 'int_room_id',
        'FOUND_TIME': 'date_record_time',
        'CONTENT': 'str_violate_content',
        'VIOLATE_IMAGE_STATUS': 'str_sexy_image_status',
        'VIOLATE_IMAGE_URL': 'str_sexy_image_url',
    }

    violate_exporter = ExportData(
        source_table='LiveApp_anchor_info',
        aim_table='tab_nbsp_violate_anchor_info',
        key_map=violate_key_map,
        unique_key='ANCHOR_ID',
        update_read_status=True,
        condition={
            'violate_content': {'$ne': ''},
            'read_status': 0,
        },
        unique_key_mapping_source_key={'ANCHOR_ID': 'int_room_id'},
    )
    violate_exporter.export_to_oracle()
示例#13
0
class CompareKeywords():
    '''
    @summary: Loads the clues of TAB_IOPM_CLUES (zero_id != 7) and matches
    their keyword2 keys against texts
    '''

    def __init__(self):
        self._oracledb = OracleDB()
        self._clues = self.get_clues()

    def get_clues(self):
        '''
        @summary: Return the clue rows
        (id, keyword2, keyword3, zero_id, first_id, second_id)
        '''
        return self._oracledb.find(
            'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id  from TAB_IOPM_CLUES t where zero_id != 7'
        )

    def get_contained_keys(self, text):
        '''
        @summary: Find the clue keys that are contained in a text
        ---------
        @param text: the text to compare against
        ---------
        @result: tuple of five comma joined, de-duplicated strings (keywords,
                 clue ids, zero ids, first ids, second ids) followed by a dict
                 mapping every matched unit keyword to its clue id
        '''
        hit_words = []
        hit_clue_ids = []
        hit_zero_ids = []
        hit_first_ids = []
        hit_second_ids = []
        keyword_clues = {}

        for clue in self._clues:
            clue_id, key2 = clue[0], clue[1]
            zero_id, first_id, second_id = clue[3], clue[4], clue[5]

            # e.g. ['新闻节目', '总理&主席', 'the xi factor']
            for key in format_keywords(key2):
                # A key such as 总理&主席 only counts when every part occurs
                parts = key.split('&')
                if not all(part in text for part in parts):
                    continue

                hit_words.extend(parts)
                hit_clue_ids.append(str(clue_id))
                hit_zero_ids.append(str(zero_id))
                hit_first_ids.append(str(first_id))
                hit_second_ids.append(str(second_id))
                for part in parts:
                    keyword_clues[part] = clue_id

        joined = tuple(
            ','.join(set(values))
            for values in (hit_words, hit_clue_ids, hit_zero_ids,
                           hit_first_ids, hit_second_ids))

        return joined + (keyword_clues, )
示例#14
0
    def check_new_article(self, account):
        '''
        @summary: Check whether an account published a new article today; if
        so mark it in TAB_IOPM_SITE (spider_status 601 plus the new release
        time) and add it to the redis set wechat:account
        ---------
        @param account: (oracle id, account id, account name,
                        last article release time, biz)
        ---------
        @result:
        '''
        oralce_id, account_id, account_name, last_article_release_time, biz = account

        article_release_time = self._wechat_sogo.get_article_release_time(
            account_id=account_id, account=account_name)
        print(article_release_time)
        if not article_release_time:
            return

        last_article_release_time = last_article_release_time or ''
        # New means: released today and later than the last known release
        if not (article_release_time >= tools.get_current_date('%Y-%m-%d')
                and article_release_time > last_article_release_time):
            return

        print('{} 有新文章发布,等待抓取。 发布时间:{}'.format(account_name,
                                               article_release_time))

        sql = '''
                    update TAB_IOPM_SITE t set t.spider_status = 601,
                     t.last_article_release_time =
                           to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                     where id = {}
                '''.format(article_release_time, oralce_id)

        # Multi-threaded: every thread holds its own database connection.
        # try/finally so the connection is closed even when the update raises
        oracledb = OracleDB()
        try:
            oracledb.update(sql)
        finally:
            oracledb.close()

        # Push into redis, which serves as the task pool of the wechat spider
        data = (oralce_id, account_id, account_name,
                last_article_release_time, biz)
        self._redisdb.sadd('wechat:account', data)
示例#15
0
class TaskService():
    '''
    @summary: Hands out site tasks from TAB_IOPM_SITE through a ring buffer
    shared by all instances (buffer, offset, lock and connection are class
    attributes)
    '''
    _task_ring_buff = RingBuff(TASK_BUFFER_SIZE)
    _offset = 1
    _lock = threading.RLock()
    _db = OracleDB()

    def __init__(self):
        pass

    def _query_tasks(self):
        # Fetch the page starting at the current offset and advance the offset
        task_sql = '''
            select *
              from (select t.id, t.name, t.position, t.url, t.domain, rownum r
                      from TAB_IOPM_SITE t
                     where classify = 1
                       and t.mointor_status = 701
                       and t.position != 35
                       and rownum < {page_size})
             where r >= {offset}
        '''.format(page_size=TaskService._offset + TASK_BUFFER_SIZE,
                   offset=TaskService._offset)
        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        return TaskService._db.find(task_sql)

    def load_task(self):
        '''
        @summary: Load the next page of tasks into the ring buffer; when the
        end of the table is reached, start again from the first row
        '''
        started_at = TaskService._offset
        tasks = self._query_tasks()

        if not tasks:
            # Past the last row: wrap around. Retry only when this call did
            # not already start at the first row, so an empty table cannot
            # cause endless re-querying
            TaskService._offset = 1
            if started_at != 1:
                tasks = self._query_tasks()

        if tasks:
            TaskService._task_ring_buff.put_data(tasks)

    def get_task(self, count=TASK_COUNT):
        '''
        @summary: Take up to `count` tasks from the buffer, loading a new page
        first when the buffer is empty
        ---------
        @result: whatever the ring buffer returns for get_data(count)
        '''
        # `with` releases the lock even when loading raises
        with TaskService._lock:
            tasks = TaskService._task_ring_buff.get_data(count)
            if not tasks:
                self.load_task()
                tasks = TaskService._task_ring_buff.get_data(count)

            return tasks

    def update_task_status(self, tasks, status):
        '''
        @summary: Set spider_time (now) and spider_status for every task
        ---------
        @param tasks: task rows; the first column is the site id
        @param status: value written to spider_status
        ---------
        @result:
        '''
        with TaskService._lock:
            for task in tasks:
                website_id = task[0]

                # NOTE(review): the date mask contains ' :' before hh24 -
                # confirm Oracle accepts it for the value get_current_date()
                # produces
                sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd :hh24:mi:ss'), t.spider_status = %s where id = %s" % (
                    tools.get_current_date(), status, website_id)

                TaskService._db.update(sql)
示例#16
0
def main():
    '''
    @summary: Read the active search tasks of type 701 from Oracle and run the
    wechat account spider over them; does nothing when there is no task
    '''
    task_sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time  and search_type = 701'
    # rows look like [(keys, monitor_type), ...]
    result_list = OracleDB().find(task_sql)
    if not result_list:
        log.debug('无任务 结束')
        return

    def on_begin():
        log.info('\n********** WWA_wechat_account begin **********')
        MongoDB().delete('WWA_wechat_account_url', {})

    def on_end():
        log.info('\n********** WWA_wechat_account end **********')
        export_data.account_main()

    # Configure the spider
    spider = Spider(
        tab_urls='WWA_wechat_account_url',
        tab_site='WWA_wechat_site_info',
        tab_content='WWA_wechat_official_accounts',
        content_unique_key='account_id',
        parser_count=1,
        begin_callback=on_begin,
        end_callback=on_end,
        parser_params={'result_list': result_list},
    )

    # Register the parser and go
    spider.add_parser(wechat_account_parser)
    spider.start()
示例#17
0
def main():
    '''
    @summary: Read the wechat accounts with monitor_status 402 from Oracle and
    run the wechat article spider over them; does nothing when there is none
    '''
    account_sql = 'select t.account_id, t.monitor_type from TAB_MVMS_WECHAT_INFO t where monitor_status = 402'
    result_list = OracleDB().find(account_sql)
    if not result_list:
        log.debug('无任务 结束')
        return

    def on_begin():
        log.info('\n********** WWA_wechat_article begin **********')
        MongoDB().delete('WWA_wechat_article_url', {})

    def on_end():
        log.info('\n********** WWA_wechat_article end **********')
        export_data.article_main()

    # Configure the spider; the query rows are handed to the parser unchanged
    spider = Spider(
        tab_urls='WWA_wechat_article_url',
        tab_site='WWA_wechat_site_info',
        tab_content='WWA_wechat_article',
        content_unique_key='title',
        parser_count=1,
        begin_callback=on_begin,
        end_callback=on_end,
        parser_params=result_list,
    )

    # Register the parser and go
    spider.add_parser(wechat_article_parser)
    spider.start()
示例#18
0
def main():
    '''
    @summary: Load the program list (with channel name and type name) from
    Oracle and run the iqiyi search spider with it as parser parameters
    '''
    oracle_db = OracleDB()

    program_sql = '''
        select t.program_id, c.chan_name, program_name, d.name, t.image_url, t.official_blog
          from TAB_MMS_PROGRAM t
          left join tab_mam_chan c
            on c.chan_id = t.chan_id
          left join tab_mms_dictionary d
            on t.type = d.id
           and d.type = 2
    '''
    program_info = oracle_db.find(program_sql)

    def on_begin():
        log.info('\n********** news begin **********')

    def on_end():
        log.info('\n********** news end **********')

    # Configure the spider
    spider = Spider(
        tab_urls='mms_urls',
        begin_callback=on_begin,
        end_callback=on_end,
        delete_tab_urls=True,
        parser_params=program_info,
    )

    # Only the iqiyi search parser is registered
    spider.add_parser(iqiyi_search_parser)

    spider.start()
示例#19
0
def main():
    '''
    @summary: Set WEIGHT to 0 in ES (tab_iopm_article_info) for a fixed list
    of article ids
    '''
    oracledb = OracleDB()
    esdb = ES()

    # Hard-coded ids; the Oracle connection above is opened but not queried
    for article_id in (8888515, 8888293, 8891299):
        print(article_id)
        esdb.update_by_id('tab_iopm_article_info', article_id, {"WEIGHT": 0})
示例#20
0
class EventFilter(threading.Thread):
    '''
    @summary: Thread that keeps the event knowledge base (keyword, type) in
    memory, reloads it periodically and matches texts against it
    '''

    def __init__(self):
        super(EventFilter, self).__init__()

        self._db = OracleDB()
        self._event_knowledges = self.load_event_knowledges()

    def run(self):
        '''
        @summary: Reload the knowledge base after every delay_time(60 * 60)
        '''
        while True:
            tools.delay_time(60 * 60)
            print('更新事件知识库...')
            self._event_knowledges = self.load_event_knowledges()
            print('更新事件知识库完毕')

    def load_event_knowledges(self):
        '''
        @summary: Load every (keyword, type) row of TAB_IOPM_EVENT_KNOWLEDEGE
        Type codes:
        801 current politics
        802 society and livelihood
        803 education reform
        804 health care
        805 science and technology
        806 ideology (none)
        807 policies and regulations
        808 economy (none)
        809 ecological civilisation
        810 sports (none)
        811 emergencies and safety (none)
        ---------
        ---------
        @result: rows returned by the database
        '''
        return self._db.find(
            'select t.keyword, t.type from TAB_IOPM_EVENT_KNOWLEDEGE t')

    def find_contain_event(self, text):
        '''
        @summary: Return the distinct event types (as str) whose keyword
        occurs in `text`
        '''
        matched_types = {
            str(event[1])
            for event in self._event_knowledges if event[0] in text
        }
        return list(matched_types)
示例#21
0
 def __init__(self):
     # Open the Oracle connection first: get_clues() is called right after
     # and its result is cached on the instance as _clues
     self._oracledb = OracleDB()
     self._clues = self.get_clues()
# -*- coding: utf-8 -*-
'''
Created on 2017-07-26 19:04
---------
@summary:
---------
@author: Boris
'''

import sys
sys.path.append('../')

from db.oracledb import OracleDB
import utils.tools as tools

# Module-level Oracle connection used by main()
oracledb = OracleDB()


def main():
    '''
    @summary: Fetch the hot analysis json from the hotspot interface and, for
    every entry of its "data" list, look up the clue name in TAB_IOPM_CLUES
    '''
    url = 'http://192.168.60.38:8001/hotspot_al/interface/getHotAnalysis_self'
    json = tools.get_json_by_requests(url)
    # print(json)

    hot_list = []
    datas = json['data']
    for data in datas:
        # The first key of every entry is used as the clue id
        clus_id = list(data.keys())[0]
        # NOTE(review): the id is concatenated into the SQL text, so it has to
        # be a str; the lookup raises IndexError when no clue has that id
        sql = 'select t.name from TAB_IOPM_CLUES t where id = ' + clus_id
        name = oracledb.find(sql)[0][0]

        hot_infos = data[clus_id]['data']
示例#23
0
    def __init__(self):
        '''
        @summary: Connect to ES and Oracle and load the stored max ids
        '''
        self._es = ES()
        self._db = OracleDB()

        # The file holds the text form of a dict; empty content (or a falsy
        # evaluated value) falls back to an empty dict.
        # NOTE(review): eval() executes the file content - keep it trusted
        stored = tools.read_file(STO_MAX_ID_FILE)
        self._max_id = stored and eval(stored) or {}
示例#24
0
    def __init__(self):
        # Initialise the parent class before touching instance state
        super(EventFilter, self).__init__()

        # The connection must exist before the knowledge base is loaded,
        # because load_event_knowledges() is called right away
        self._db = OracleDB()
        self._event_knowledges = self.load_event_knowledges()
示例#25
0
@author: Boris
'''

import sys

sys.path.append('..')
import init

import utils.tools as tools
from utils.log import log
from db.oracledb import OracleDB
from base.wechat_public_platform import WechatPublicPlatform
from base.wechat_sogou import WechatSogou

if __name__ == '__main__':
    # Register wechat accounts as sites: resolve the biz of every account
    # through sogou and insert one TAB_IOPM_SITE row per resolved account
    db = OracleDB()
    wechat_sogou = WechatSogou()

    # Account names to register
    accounts = ['骨朵网络影视']
    for account in accounts:
        account_id = ''
        account_name = account
        biz = wechat_sogou.get_biz(account_id=account_id, account=account_name)
        if biz:
            # Double embedded single quotes so a name or biz containing "'"
            # cannot break out of the SQL string literal
            sql = "insert into TAB_IOPM_SITE t (t.id, t.name, t.position, t.classify, t.mointor_status, t.biz, t.priority) values (seq_iopm_site.nextval, '{name}', 1, 2, 701, '{biz}', 1)".format(
                name=str(account_name).replace("'", "''"),
                biz=str(biz).replace("'", "''"))
            print(sql)
            db.add(sql)
示例#26
0
def main():
    '''
    @summary: Assign every article (first 100000 rows) to the most similar
    hot topic, or create a new hot topic from the article when none is
    similar enough (similarity must exceed SIMILARITY)
    '''

    def sql_text(value):
        # Double embedded single quotes so a title containing "'" cannot
        # terminate the SQL string literal it is placed in
        return str(value).replace("'", "''")

    db = OracleDB()

    # Load the articles
    sql = '''
        select *
          from (select rownum r, id, title
                  from tab_iopm_article_info
                 where rownum >= 1)
         where r <= 100000
    '''
    articles = db.find(sql)

    # Load the hot topics
    hot_sql = 'select id, title from tab_iopm_hot_info'
    hots = db.find(hot_sql)

    for article in articles:
        # column 0 is the rownum helper column
        article_id = article[1]
        article_text = article[2]

        # Most similar hot topic so far; similarity ranges from 0 to 1
        best_similarity = 0
        best_hot_id = -1
        best_title = ''

        for hot in hots:
            hot_id = hot[0]
            hot_text = hot[1]

            similarity = compare_text(hot_text, article_text)
            if similarity > best_similarity:
                best_similarity = similarity
                best_hot_id = hot_id
                # keep the shorter of the two titles as the topic title
                best_title = article_text if len(hot_text) > len(
                    article_text) else hot_text

        if best_similarity > SIMILARITY:
            sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % (
                best_hot_id, article_id)
            db.update(sql)
            sql = "update tab_iopm_hot_info set hot = hot + 1, title = '%s' where id = %s" % (
                sql_text(best_title), best_hot_id)
            db.update(sql)

        else:
            sql = 'select sequence.nextval from dual'
            hot_id = db.find(sql)[0][0]
            sql = "insert into tab_iopm_hot_info (id, title, hot) values (%s, '%s', 1)" % (
                hot_id, sql_text(article_text))
            db.add(sql)
            sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % (
                hot_id, article_id)
            db.update(sql)

        # Reload so the next article sees the topic just created / renamed
        hots = db.find(hot_sql)
def add_anchor_info(table,
                    site_id,
                    title='',
                    name='',
                    image_url='',
                    room_id='',
                    room_url='',
                    video_path='',
                    watched_count='',
                    fans_count='',
                    sex='',
                    age='',
                    address='',
                    live_view=1,
                    watched_count_url=''):
    '''
    @summary: Store one anchor record; the anchor name is first checked
    against the active violation keywords
    ---------
    @param table: table name
    @param site_id: site id
    @param title: stored as-is in the record
    @param name: anchor name
    @param image_url: cover image url
    @param room_id: room number
    @param room_url: url of the room page
    @param video_path: url of the room video stream
    @param watched_count: number of viewers
    @param fans_count: number of fans
    @param sex:  sex
    @param age:  age
    @param address:   where the anchor is located (city)
    @param live_view: live status (0 not live, 1 live)
    @param watched_count_url: url of the real-time viewer count
    ---------
    @result:
    '''

    # Look the name up in the violation knowledge base
    task_id = 0
    violate_content = ''

    from db.oracledb import OracleDB
    oracle_db = OracleDB()

    sql = 'select t.name, t.keyword, t.task_id from tab_nbsp_violate_knowledge t where t.monitor_start_time <= sysdate and sysdate <= t.monitor_end_time'
    # rows look like [('色情低俗', '性感,枪支,格斗,脱衣,透视,胸器', 1)]
    results = oracle_db.find(sql)

    for result in results:
        name_, keywords, task_id_ = result
        # Skip empty keywords (NULL column, trailing or doubled commas): an
        # empty string is "contained" in every name and would flag everything
        keywords = [keyword for keyword in (keywords or '').split(',') if keyword]
        if any(keyword in name for keyword in keywords):
            # no early exit: the last matching knowledge row wins
            task_id = task_id_
            violate_content = name

    anchor_info_dict = {
        'site_id': site_id,
        'title': title,
        'task_id': task_id,
        'violate_content': violate_content,
        'name': name,
        'image_url': image_url,
        'sex': sex,
        'age': age,
        'address': address,
        'fans_count': fans_count,
        'watched_count': watched_count,
        'room_id': room_id,
        'room_url': room_url,
        'video_path': video_path,
        'live_view': live_view,
        'record_time': tools.get_current_date(),
        'watched_count_url': watched_count_url,
        'read_status': 0,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }

    if not db.add(table, anchor_info_dict):
        # Adding failed: update the record of the same room instead, leaving
        # its image check fields untouched.
        # NOTE(review): presumably db.add() put '_id' into the dict; popping
        # with a default avoids a KeyError when it did not
        anchor_info_dict.pop('_id', None)
        anchor_info_dict.pop('sexy_image_status')
        anchor_info_dict.pop('sexy_image_url')
        anchor_info_dict.pop('image_pron_status')
        db.update(table, {'room_id': room_id}, anchor_info_dict)
示例#28
0
@author: Boris
'''

from cluster.compare_text import compare_text
from db.oracledb import OracleDB
import utils.tools as tools

# Clustering threshold: a similarity greater than n counts as the same
# cluster, with 0 <= n <= 1.
SIMILARITY = 0.45
# NOTE(review): the name is spelled "ZISE"; presumably the maximum number of
# entries kept in cluster_buffer before it is flushed -- usage not visible here.
CLUSTER_BUFFER_ZISE = 100
# NOTE(review): presumably the number of rows fetched per query page -- usage
# not visible here.
PAGE_SIZE = 1000

# In-memory cluster cache keyed by hot id. Example of the entry shape:
cluster_buffer = {
    # "hot_id": {'title': 'xxxx', 'article_ids': [1, 2, 3, 4], 'article_count': 0},
    # "hot_id": {'title': 'xxxx', 'article_ids': [1, 2, 3, 4], 'article_count': 0}
}
# Module-level Oracle handle, created at import time.
db = OracleDB()


def deal_cluster_buffer():
    '''
    @summary: Walk every entry of the module-level cluster_buffer, reporting
    progress through tools.print_loading and reading each entry's article ids,
    article count and title.
    NOTE(review): the visible part of this function stops right after those
    fields are read; the database write announced by the progress message and
    by the last comment of the loop body is not shown here.
    ---------
    ---------
    @result:
    '''
    global cluster_buffer
    cluster_buffer_tota_count = len(cluster_buffer)
    # NOTE(review): never incremented in the visible code, so the progress
    # message keeps showing 0 unless the missing part updates it.
    cluster_buffer_deal_count = 0
    for hot_id, data in cluster_buffer.items():
        tools.print_loading(
            '缓存到达最大限制 正在向数据库中写数据 %d/%d' %
            (cluster_buffer_deal_count, cluster_buffer_tota_count))
        article_ids = data['article_ids']
        article_count = data['article_count']
        # Single quotes are doubled -- presumably so the title can be embedded
        # in a SQL string literal; confirm at the point of use.
        hot_title = data['title'].replace("'", "''")

        # Update the hot id associated with the clue
示例#29
0
def main():
    '''
    @summary: Reset every task whose monitoring window contains the current
    date, then crawl the pending keywords of each such task with a Spider,
    one keyword at a time, until no task is left in the "not started" state.
    ---------
    Status codes, as used by the statements below:
        task_status   501 not started / 502 running / 503 finished
        finish_status 601 not started / 602 running / 603 finished
    ---------
    @result: None
    '''
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    db = OracleDB()

    # Mark every task inside its monitoring window as "not started"
    sql = 'update tab_ivms_task_info t set t.task_status = 501 where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
    db.update(sql)

    # Mark the keywords of those tasks as "not started"
    sql = 'update tab_ivms_task_keyword k set k.finish_status = 601 where k.task_id in (select t.task_id from tab_ivms_task_info t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)'
    db.update(sql)

    while True:
        # Look for a task that has not been started yet
        log.debug('查询任务...')

        sql = 'select t.task_id from TAB_IVMS_TASK_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and t.task_status = 501'
        result = db.find(sql, fetch_one=True)
        if not result:
            break

        task_id = result[0]

        while True:
            # Wait while a keyword of this task is still being processed
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            # Pick the next keyword that has not been started; none left
            # means this task has nothing more to schedule
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break

            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            # The ids are bound as default arguments on purpose: the spider
            # invokes the callbacks later, while this loop keeps rebinding
            # task_id / keyword_id. A plain closure would read whatever value
            # the names hold at call time, so a callback running after the
            # loop has moved on to another task would act on that task.
            def begin_callback(task_id=task_id, keyword_id=keyword_id):
                log.info('\n********** VA begin **********')
                # Task state -> running
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)

                # Keyword state -> running
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback(task_id=task_id, keyword_id=keyword_id):
                # Keyword state -> finished
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)

                # When the task has no keyword left in the "not started"
                # state, export its data and mark the task as finished
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    # Export the collected data
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }

                    export = ExportData('VA_content_info',
                                        'tab_ivms_program_info', key_map,
                                        'program_url')
                    export.export_to_oracle()

                    # Task state -> finished
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)
                    log.info('\n********** VA end **********')

            # Configure the spider
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)

            # Register the parsers
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)

            spider.start()

            # Pause before polling the keyword states again
            time.sleep(search_task_sleep_time)
def add_WWA_search_app_info(table,
                            site_id,
                            url,
                            title='',
                            summary='',
                            update_info='',
                            score='',
                            author='',
                            app_url='',
                            image_url='',
                            software_size='',
                            tag='',
                            platform='',
                            download_count='',
                            release_time='',
                            language='',
                            sensitive_id='',
                            read_status=0):
    '''
    @summary: Add one app search result to `table`, but only when its title,
    summary, update info or author contains at least one keyword configured
    in TAB_MVMS_SEARCH_INFO (search_type = 703). Apps matching no keyword are
    dropped silently. The record time is filled in with the current date.
    ---------
    @param table: table the record is added to
    @param site_id: site id
    @param url: url of the original page
    @param title: title
    @param summary: summary
    @param update_info: update information
    @param score: score
    @param author: author
    @param app_url: download url of the app
    @param image_url: image url (several urls are separated by commas)
    @param software_size: size
    @param tag: version
    @param platform: platform (ios / android)
    @param download_count: number of downloads
    @param release_time: release time
    @param language: language name; '中文' (Chinese) is stored as 601,
                     '英文' (English) as 602, anything else as 603
    @param sensitive_id: ids of the sensitive information (several ids are
                         separated by commas)
    @param read_status: read state (0 unread, 1 read)
    ---------
    @result: None
    '''

    # Filter out apps that match none of the configured keywords
    from db.oracledb import OracleDB
    oracle_db = OracleDB()

    sql = 'select keyword from TAB_MVMS_SEARCH_INFO t where search_type = 703'
    results = oracle_db.find(sql)  # e.g. [('天天快报,今日头条,黑龙江',)]

    # A NULL keyword column contributes no keywords instead of raising, and
    # blank keywords (trailing or doubled commas) are skipped: an empty
    # string is a substring of every text and would let every app through.
    keywords = [
        keyword
        for result in results
        for keyword in (result[0] or '').split(',')
        if keyword.strip()
    ]

    text_content = title + summary + update_info + author
    is_usefull = any(keyword in text_content for keyword in keywords)

    if not is_usefull:
        return

    # Map the language name to its numeric code; unknown names fall back to 603
    language = {'中文': 601, '英文': 602}.get(language, 603)

    title = tools.del_html_tag(title)

    gameApp_info_dict = {
        'site_id': site_id,
        'url': url,
        'summary': tools.del_html_tag(summary, except_line_break=True),
        'title': title,
        'update_info': tools.del_html_tag(update_info, except_line_break=True),
        'score': score,
        'author': author,
        'app_url': app_url,
        'image_url': image_url,
        'software_size': software_size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'record_time': tools.get_current_date(),
        'language': language,
        'sensitive_id': sensitive_id,
        # Honour the caller's value (previously always stored as 0)
        'read_status': read_status,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }
    # NOTE(review): `db` is not defined in this function; it is expected to be
    # a module-level handle declared elsewhere in the file.
    db.add(table, gameApp_info_dict)