Exemplo n.º 1
0
def main():
    """Configure the proxies spider and start it once.

    A ``Spider`` is built over the ``proxies_*`` tables, ``gaoni_parser`` is
    registered on it, and ``start()`` is called a single time.
    """
    mongo = MongoDB()

    def begin_callback():
        """Log the begin banner, then issue a delete on ``proxies_urls``."""
        log.info('\n********** proxies begin **********')
        mongo.delete('proxies_urls')

    def end_callback():
        """Log the end banner (no status update or export is done here)."""
        log.info('\n********** proxies end **********')

    # Spider configuration
    spider_options = {
        'tab_urls': 'proxies_urls',
        'tab_site': 'proxies_site_info',
        'tab_content': 'proxies_content_info',
        'parser_count': 1,
        'begin_callback': begin_callback,
        'end_callback': end_callback,
        'parser_params': {},
        'content_unique_key': 'ip',
    }
    crawler = Spider(**spider_options)

    # Register the parser and launch; only one pass is performed.
    crawler.add_parser(gaoni_parser)
    crawler.start()
Exemplo n.º 2
0
def main():
    """Poll ``task_status.is_doing`` forever and start a spider run when idle.

    While a run is flagged as in progress the loop sleeps ``SLEEP_TIME``
    seconds. Otherwise it sets the flag, fetches the keywords, and starts a
    new ``Spider`` with every parser from ``parser_list``.
    """

    def begin_callback():
        """Log the begin banner."""
        log.info('\n********** spider_main begin **********')

    def end_callback():
        """Log the end banner and clear the in-progress flag."""
        log.info('\n********** spider_main end **********')
        task_status.is_doing = False

    while True:
        if task_status.is_doing:
            # A run is still flagged as in progress: wait and check again.
            log.debug('is doing sleep ...%ss' % SLEEP_TIME)
            time.sleep(SLEEP_TIME)
            continue

        task_status.is_doing = True

        search_terms = Keywords().get_keywords()

        # Spider configuration
        crawler = Spider(
            tab_list,
            tab_unique_key_list,
            tab_ensure_index_list,
            parser_count=1,
            site_parsers=parser_siteid_list,
            begin_callback=begin_callback,
            end_callback=end_callback,
            parser_params=search_terms,
        )

        # Register parsers
        for site_parser in parser_list:
            crawler.add_parser(site_parser)

        crawler.start()
def main():
    """Start the VA spider with a fixed keyword set and ``douban_parser`` only."""

    def begin_callback():
        """Log the begin banner."""
        log.info('\n********** VA begin **********')

    def end_callback():
        """Log the end banner."""
        log.info('\n********** VA end **********')

    # Spider configuration; the second and third keyword groups are empty.
    spider_options = dict(
        tab_urls='VA_urls',
        tab_site='VA_site_info',
        tab_content='VA_content_info',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        search_keyword1=['成龙'],
        search_keyword2=[],
        search_keyword3=[],
    )
    crawler = Spider(**spider_options)

    # Only the douban parser is registered in this entry point.
    crawler.add_parser(douban_parser)

    crawler.start()
Exemplo n.º 4
0
def main():
    """Start the article spider with eight news-site parsers registered."""

    def begin_callback():
        """Log the begin banner."""
        log.info('\n********** spider_article begin **********')

    def end_callback():
        """Log the end banner."""
        log.info('\n********** spider_article end **********')

    # Spider configuration
    crawler = Spider(**{
        'tab_urls': 'article_urls',
        'tab_site': 'article_site_info',
        'tab_content': 'article_text_info',
        'parser_count': 40,
        'begin_callback': begin_callback,
        'end_callback': end_callback,
        'parser_params': {},
    })

    # Register parsers in the original order.
    article_parsers = (
        cctv_parser,
        ifeng_parser,
        xinhua_parser,
        tencent_parser,
        sohu_parser,
        wangyi_parser,
        people_parser,
        sina_parser,
    )
    for site_parser in article_parsers:
        crawler.add_parser(site_parser)

    crawler.start()
Exemplo n.º 5
0
def main():
    """Start the WP spider with ``dongmanla_parser``; ``export_data.main()`` runs at the end."""
    mongo = MongoDB()

    def begin_callback():
        """Log the begin banner, then issue a delete on ``WP_urls`` with an empty filter."""
        log.info('\n********** wp begin **********')
        mongo.delete('WP_urls', {})

    def end_callback():
        """Log the end banner, then run the export."""
        log.info('\n********** wp end **********')
        export_data.main()

    # Spider configuration
    spider_options = {
        'tab_urls': 'WP_urls',
        'tab_site': 'WP_site_info',
        'tab_content': 'WP_content_info',
        'parser_count': 20,
        'begin_callback': begin_callback,
        'end_callback': end_callback,
        'content_unique_key': 'title',
    }
    crawler = Spider(**spider_options)

    # Register the parser
    crawler.add_parser(dongmanla_parser)

    crawler.start()
Exemplo n.º 6
0
def main():
    """Start the live-app spider with the inke, huajiao and momo parsers."""
    mongo = MongoDB()

    def begin_callback():
        """Log the begin banner, delete from ``LiveApp_urls`` and zero three anchor fields."""
        log.info('\n********** live_app begin **********')
        mongo.delete('LiveApp_urls', {})
        # One update call per field, in the same order as before.
        for field_name in ("live_view", "watched_count", 'read_status'):
            mongo.update('LiveApp_anchor_info', {}, {field_name: 0})

    def end_callback():
        """Log the end banner, then run the export."""
        log.info('\n********** live_app end **********')
        export_data.main()

    # Spider configuration
    spider_options = dict(
        tab_urls='LiveApp_urls',
        tab_site='LiveApp_site_info',
        tab_content='LiveApp_anchor_info',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        content_unique_key='room_id',
    )
    crawler = Spider(**spider_options)

    # Register parsers
    for app_parser in (inke_parser, huajiao_parser, momo_parser):
        crawler.add_parser(app_parser)

    crawler.start()
Exemplo n.º 7
0
def main():
    """Poll the master for news tasks forever and run a spider for each batch.

    While ``task_status.is_doing`` is set the loop waits
    ``SEARCH_TASK_SLEEP_TIME`` via ``tools.delay_time``. Otherwise it sets the
    flag, fetches tasks from ``MASTER_ADDRESS``, and starts a ``Spider`` whose
    callbacks post status 602 at the start and 603 at the end.
    """
    while True:
        if task_status.is_doing:
            log.debug('正在做 不取任务')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # Fetch the task batch from the master.
        get_task_url = MASTER_ADDRESS + '/task/get_task'
        print(get_task_url)
        update_task_url = MASTER_ADDRESS + '/task/update_task'
        task_reply = tools.get_json_by_requests(get_task_url)
        print(task_reply)
        tasks = task_reply.get('tasks', [])
        thread_count = task_reply.get('thread_count')

        def report_status(status, success_message):
            """Post ``status`` for the current tasks; log ``success_message`` on a truthy reply."""
            payload = {'tasks': str(tasks), 'status': status}
            if tools.get_json_by_requests(update_task_url, data=payload):
                log.debug(success_message)

        def begin_callback():
            """Log the begin banner and report status 602."""
            log.info('\n********** news begin **********')
            report_status(602, '更新任务状态 正在做...')

        def end_callback():
            """Log the end banner, clear the in-progress flag, then report status 603."""
            log.info('\n********** news end **********')
            task_status.is_doing = False
            report_status(603, '更新任务状态 已做完!')

        # Spider configuration
        crawler = Spider(
            tab_urls='news:news_urls',
            parser_count=thread_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            parser_params=tasks,
            delete_tab_urls=False,
        )

        # Register the parser
        crawler.add_parser(news_parser)

        crawler.start()
def main():
    """Run the Weibo-user spider for active search rows of type 702.

    Returns early, after a debug log, when the Oracle query yields nothing.
    The end callback exports ``WWA_weibo_user_info`` to ``tab_mvms_weibo_info``.
    """
    oracle = OracleDB()
    mongo = MongoDB()

    query = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time  and search_type = 702'
    result_list = oracle.find(query, fetch_one=False)
    if not result_list:
        log.debug('无任务 结束')
        return

    def begin_callback():
        """Log the begin banner, then issue a delete on ``WWA_weibo_user_urls``."""
        log.info('\n********** WWA_weibo_user begin **********')
        mongo.delete('WWA_weibo_user_urls')

    def end_callback():
        """Export the collected users to Oracle, then log the end banner."""
        # Mapping handed to ExportData together with the source/target tables.
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE': 'vint_702',
            'region': 'str_area',
        }
        exporter = ExportData(
            'WWA_weibo_user_info', 'tab_mvms_weibo_info', key_map, 'account_url')
        exporter.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    # Spider configuration
    spider_options = dict(
        tab_urls='WWA_weibo_user_urls',
        tab_site='WWA_site_info',
        tab_content='WWA_weibo_user_info',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params={'result_list': result_list},
    )
    crawler = Spider(**spider_options)

    # Register the parser
    crawler.add_parser(weibo_user_parser)
    crawler.start()
Exemplo n.º 9
0
def main():
    """Template entry point: start one spider run with placeholder keywords.

    The loop body executes once and then breaks; task-status polling and data
    export are still TODO in this template.
    """
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    def begin_callback():
        """Log the begin banner (marking the task as doing is not implemented)."""
        log.info('\n********** template begin **********')

    def end_callback():
        """Log the end banner (marking the task as done / exporting is not implemented)."""
        log.info('\n********** template end **********')

    while True:
        # TODO: query the task status; if a task is running, sleep and continue.

        search_keyword1 = ['hi']
        search_keyword2 = ['hello']
        search_keyword3 = ['hello, hi']
        task_id = 1

        # TODO: if there is no task, sleep and continue.

        # Spider configuration
        spider_options = {
            'tab_urls': 'template_urls',
            'tab_site': 'template_site_info',
            'tab_content': 'template_content_info',
            'parser_count': 1,
            'begin_callback': begin_callback,
            'end_callback': end_callback,
            'search_keyword1': search_keyword1,
            'search_keyword2': search_keyword2,
            'search_keyword3': search_keyword3,
        }
        crawler = Spider(**spider_options)

        # Register parsers
        for template_parser in (xxx_parser, yyy_parser):
            crawler.add_parser(template_parser)

        crawler.start()

        # Single pass only: leave the loop right after starting the spider.
        break
Exemplo n.º 10
0
def main():
    """Start the ``op`` spider with its sixteen site parsers registered."""
    mongo = MongoDB()

    def begin_callback():
        """Log the begin banner, then issue deletes on ``op_urls`` and ``op_content_info``."""
        log.info('\n********** template begin **********')
        for table_name in ('op_urls', 'op_content_info'):
            mongo.delete(table_name, {})

    def end_callback():
        """Log the end banner (no status update or export is done here)."""
        log.info('\n********** template end **********')

    # Spider configuration
    spider_options = dict(
        tab_urls='op_urls',
        tab_site='op_site_info',
        tab_content='op_content_info',
        parser_count=20,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params={},
    )
    crawler = Spider(**spider_options)

    # Register parsers in the original order.
    site_parsers = (
        luzhou_parser,
        longmatan_parser,
        naxi_parser,
        luxian_parser,
        hejiang_parser,
        gulin_parser,
        luzhouzhiye_parser,
        sichuanhuagong_parser,
        luzhougaozhong_parser,
        xuyong_parser,
        jiangyang_parser,
        luzhoutianli_parser,
        sichuanluxian_parser,
        sichuan_police_parser,
        sichuanyikeda_parser,
        luzhoubaidu_parser,
    )
    for site_parser in site_parsers:
        crawler.add_parser(site_parser)
    crawler.start()
def main():
    """Run the WWA app-search spider for the currently monitored keywords.

    Keywords come from ``TAB_MVMS_SEARCH_INFO`` rows of search type 703 whose
    monitoring window contains the current date. Each row holds a
    comma-separated keyword string; all of them are flattened into one list
    and passed to the parsers as ``parser_params['keywords']``.

    Returns early, after a debug log, when the query yields no rows.
    """
    db = MongoDB()
    oracle = OracleDB()

    def begin_callback():
        """Issue a delete on ``WWA_search_app_urls``, then log the begin banner."""
        db.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def end_callback():
        """Log the end banner, then run the export."""
        log.info('\n********** wwa end **********')
        export_data.main()

    result_list = oracle.find(
        'select keyword from TAB_MVMS_SEARCH_INFO where  MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate and search_type=703'
    )
    if not result_list:
        log.debug('无任务 结束')
        return

    keywords = []
    for result in result_list:
        # Skip rows whose keyword column is NULL/empty: calling ``split`` on
        # ``None`` would raise AttributeError and abort the whole run.
        if result[0]:
            keywords.extend(result[0].split(','))

    parser_params = {'keywords': keywords}

    # Spider configuration
    spider = Spider(tab_urls='WWA_search_app_urls',
                    tab_site='WWA_search_app_site_info',
                    tab_content='WWA_search_app_content_info',
                    content_unique_key='title',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register parsers
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)
    spider.start()
Exemplo n.º 12
0
def main():
    """Start the game-app spider with the four app-store parsers registered."""

    def begin_callback():
        """Issue an update on ``GameApp_urls`` (filter ``depth`` 0, value ``status`` 0), then log.

        NOTE(review): ``db`` is not defined inside this function; presumably a
        module-level handle exists — confirm.
        """
        db.update('GameApp_urls', {'depth': 0}, {'status': 0})
        log.info('\n********** game_app begin **********')

    def end_callback():
        """Log the end banner."""
        log.info('\n********** game_app end **********')

    # Spider configuration
    spider_options = {
        'tab_urls': 'GameApp_urls',
        'tab_site': 'GameApp_site_info',
        'tab_content': 'GameApp_content_info',
        'begin_callback': begin_callback,
        'end_callback': end_callback,
    }
    crawler = Spider(**spider_options)

    # Register parsers in the original order.
    store_parsers = (
        yingyongbao_parser,
        android_market_parser,
        baidu_mobile_assistant_parser,
        mobile360_assistant_parser,
    )
    for store_parser in store_parsers:
        crawler.add_parser(store_parser)
    crawler.start()
def main():
    """Start the WWA app spider with the headline and kuaibao parsers."""
    mongo = MongoDB()

    # Key/index setup on the violation table before any crawling starts.
    violation_table = 'WWA_app_vioation_content_info'
    mongo.set_unique_key(violation_table, 'url')
    mongo.set_ensure_index(violation_table, 'read_status')

    def begin_callback():
        """Log the begin banner, then issue a delete on ``WWA_app_urls``."""
        log.info('\n********** WWA_APP begin **********')
        mongo.delete('WWA_app_urls', {})

    def end_callback():
        """Run the export, then log the end banner."""
        export_data.main()
        log.info('\n********** WWA_APP end **********')

    # Spider configuration
    spider_options = dict(
        tab_urls='WWA_app_urls',
        tab_site='WWA_app_site_info',
        tab_content='WWA_app_content_info',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params={},
    )
    crawler = Spider(**spider_options)

    # Register parsers
    for app_parser in (headline_parser, kuaibao_parser):
        crawler.add_parser(app_parser)

    crawler.start()
Exemplo n.º 14
0
def main():
    """Run the WeChat-account spider for active search rows of type 701.

    Returns early, after a debug log, when the Oracle query yields nothing.
    """
    query = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time  and search_type = 701'
    # Rows are (keyword string, monitor_type) per the selected columns.
    result_list = OracleDB().find(query)
    if not result_list:
        log.debug('无任务 结束')
        return

    def begin_callback():
        """Log the begin banner, then issue a delete on ``WWA_wechat_account_url``."""
        log.info('\n********** WWA_wechat_account begin **********')
        MongoDB().delete('WWA_wechat_account_url', {})

    def end_callback():
        """Log the end banner, then run the account export."""
        log.info('\n********** WWA_wechat_account end **********')
        export_data.account_main()

    # Spider configuration
    spider_options = {
        'tab_urls': 'WWA_wechat_account_url',
        'tab_site': 'WWA_wechat_site_info',
        'tab_content': 'WWA_wechat_official_accounts',
        'content_unique_key': 'account_id',
        'parser_count': 1,
        'begin_callback': begin_callback,
        'end_callback': end_callback,
        'parser_params': {'result_list': result_list},
    }
    crawler = Spider(**spider_options)

    # Register the parser
    crawler.add_parser(wechat_account_parser)

    crawler.start()
Exemplo n.º 15
0
def main():
    """Run the WeChat-article spider for accounts with ``monitor_status = 402``.

    Returns early, after a debug log, when the Oracle query yields nothing.
    The query rows are passed to the parser unchanged as ``parser_params``.
    """
    query = 'select t.account_id, t.monitor_type from TAB_MVMS_WECHAT_INFO t where monitor_status = 402'
    result_list = OracleDB().find(query)
    if not result_list:
        log.debug('无任务 结束')
        return

    def begin_callback():
        """Log the begin banner, then issue a delete on ``WWA_wechat_article_url``."""
        log.info('\n********** WWA_wechat_article begin **********')
        MongoDB().delete('WWA_wechat_article_url', {})

    def end_callback():
        """Log the end banner, then run the article export."""
        log.info('\n********** WWA_wechat_article end **********')
        export_data.article_main()

    # Spider configuration
    spider_options = dict(
        tab_urls='WWA_wechat_article_url',
        tab_site='WWA_wechat_site_info',
        tab_content='WWA_wechat_article',
        content_unique_key='title',
        parser_count=1,
        begin_callback=begin_callback,
        end_callback=end_callback,
        parser_params=result_list,
    )
    crawler = Spider(**spider_options)

    # Register the parser
    crawler.add_parser(wechat_article_parser)

    crawler.start()
Exemplo n.º 16
0
def main():
    """Start one spider run over ``tab_list`` with every parser in ``parser_list``."""

    def begin_callback():
        """Log the begin banner."""
        log.info('\n********** spider_main begin **********')

    def end_callback():
        """Log the end banner."""
        log.info('\n********** spider_main end **********')

    # Spider configuration: table lists are positional, the rest are keywords.
    keyword_options = {
        'parser_count': 1,
        'site_parsers': parser_siteid_list,
        'begin_callback': begin_callback,
        'end_callback': end_callback,
        'parser_params': {},
    }
    crawler = Spider(
        tab_list, tab_unique_key_list, tab_ensure_index_list, **keyword_options)

    # Register parsers
    for site_parser in parser_list:
        crawler.add_parser(site_parser)

    crawler.start()
Exemplo n.º 17
0
def main():
    """Start the VA app spider with the headline and kuaibao parsers."""

    def begin_callback():
        """Log the begin banner, then issue a delete on ``VAApp_urls``."""
        log.info('\n********** VA_APP begin **********')
        MongoDB().delete('VAApp_urls', {})

    def end_callback():
        """Run the export, then log the end banner."""
        export_data.main()
        log.info('\n********** VA_APP end **********')

    # Spider configuration
    spider_options = {
        'tab_urls': 'VAApp_urls',
        'tab_site': 'VAApp_site_info',
        'tab_content': 'VAApp_content_info',
        'parser_count': 1,
        'begin_callback': begin_callback,
        'end_callback': end_callback,
        'parser_params': {},
    }
    crawler = Spider(**spider_options)

    # Register parsers
    for app_parser in (headline_parser, kuaibao_parser):
        crawler.add_parser(app_parser)

    crawler.start()
Exemplo n.º 18
0
def main():
    """Start the MMS spider, passing the program rows from Oracle to ``iqiyi_search_parser``."""
    oracle = OracleDB()

    # Program rows joined with their channel name and dictionary entry.
    program_sql = '''
        select t.program_id, c.chan_name, program_name, d.name, t.image_url, t.official_blog
          from TAB_MMS_PROGRAM t
          left join tab_mam_chan c
            on c.chan_id = t.chan_id
          left join tab_mms_dictionary d
            on t.type = d.id
           and d.type = 2
    '''
    program_info = oracle.find(program_sql)

    def begin_callback():
        """Log the begin banner."""
        log.info('\n********** news begin **********')

    def end_callback():
        """Log the end banner."""
        log.info('\n********** news end **********')

    # Spider configuration
    spider_options = dict(
        tab_urls='mms_urls',
        begin_callback=begin_callback,
        end_callback=end_callback,
        delete_tab_urls=True,
        parser_params=program_info,
    )
    crawler = Spider(**spider_options)

    # Only the iqiyi search parser is registered in this entry point.
    crawler.add_parser(iqiyi_search_parser)

    crawler.start()
Exemplo n.º 19
0
def main():
    """Work through pending IVMS tasks, starting one VA spider run per keyword row.

    Steps, as implemented below:

    1. Reset every task inside its monitoring window to status 501 and the
       keyword rows of those tasks to status 601.
    2. Outer loop: pick one task with status 501; stop when none is left.
    3. Inner loop: wait while a keyword row of that task has status 602;
       otherwise pick one row with status 601, build a spider for its keyword
       lists and start it, then sleep ``search_task_sleep_time`` seconds.

    Status codes as named by the original comments: 501/601 = not started,
    502/602 = in progress, 503/603 = finished.
    """
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    db = OracleDB()

    # Mark tasks whose monitoring window contains today as "not started" (501)
    sql = 'update tab_ivms_task_info t set t.task_status = 501 where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
    db.update(sql)

    # Mark the keyword rows of those tasks as "not started" (601)
    sql = 'update tab_ivms_task_keyword k set k.finish_status = 601 where k.task_id in (select t.task_id from tab_ivms_task_info t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)'
    db.update(sql)

    while True:
        # Look up the next pending task
        log.debug('查询任务...')

        sql = 'select t.task_id from TAB_IVMS_TASK_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and t.task_status = 501'
        result = db.find(sql, fetch_one=True)
        if not result:
            break

        task_id = result[0]

        # NOTE(review): a task is only moved off status 501 inside the spider
        # callbacks. If a 501 task has no keyword row in status 601 or 602,
        # the inner loop exits at once and the same task looks re-selectable
        # here without any sleep — confirm this cannot happen in practice.
        while True:
            # Check whether a keyword of this task is currently being processed
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break

            # Columns are read by position: 0 = keyword row id, 1 = task id,
            # 2 and 3 = comma-separated keyword strings (may be empty/None).
            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            # NOTE: both callbacks read ``task_id`` / ``keyword_id`` from the
            # enclosing scope when they are called, not when they are defined.
            def begin_callback():
                """Log the begin banner and mark the task (502) and keyword (602) as in progress."""
                log.info('\n********** VA begin **********')
                # Mark the task as in progress
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)

                # Mark the keyword as in progress
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback():
                """Mark the keyword finished (603); when no 601 keyword remains, export and finish the task (503)."""
                # Mark the keyword as finished
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)

                # If every keyword of this task is done, mark the task as finished
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    # Export the collected data
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }

                    export = ExportData('VA_content_info',
                                        'tab_ivms_program_info', key_map,
                                        'program_url')
                    export.export_to_oracle()

                    # Mark the task as finished
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)
                    # NOTE(review): the end banner is logged only on this
                    # "task finished" path — confirm that is intended.
                    log.info('\n********** VA end **********')

            # Spider configuration
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)

            # Register parsers
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)

            spider.start()

            time.sleep(search_task_sleep_time)
Exemplo n.º 20
0
def main():
    """Run the Weibo-post spider for accounts with ``monitor_status = 402``.

    Returns early, after a debug log, when the Oracle query yields nothing.
    The end callback exports ``WWA_weibo_info_info`` to
    ``tab_mvms_weibo_article_info``.
    """
    oracle = OracleDB()
    mongo = MongoDB()

    query = 'select t.ID, t.monitor_type from TAB_MVMS_WEIBO_INFO t where monitor_status = 402'
    result_list = oracle.find(query, fetch_one=False)
    if not result_list:
        log.debug('无任务 结束')
        return

    def begin_callback():
        """Log the begin banner, then issue a delete on ``WWA_weibo_info_urls``."""
        log.info('\n********** WWA_weibo_info begin **********')
        mongo.delete('WWA_weibo_info_urls')

    def end_callback():
        """Export the collected posts to Oracle, then log the end banner."""
        # Mapping handed to ExportData together with the source/target tables.
        key_map = {
            'id': 'int__id',
            'release_time': 'date_release_time',
            'come_from': 'str_come_from',
            'content': 'clob_content',
            'image_url': 'str_image_url',
            'video_url': 'str_video_url',
            'transpond_count': 'int_transpond_count',
            'praise_count': 'int_praise_count',
            'check_status': 'vint_301',
            'weibo_id': 'int_weibo_id',
            'article_url': 'str_url',
            'violate_status': 'int_violate_id',
            'sensitive_id': 'int_sensitive_id',
            'record_time': 'date_record_time',
            'SEXY_IMAGE_STATUS': 'str_sexy_image_status',
        }
        # Export condition passed through to ExportData unchanged.
        export_condition = {'read_status': 0, "image_pron_status": 2}

        exporter = ExportData(
            'WWA_weibo_info_info',
            'tab_mvms_weibo_article_info',
            key_map,
            unique_key='ARTICLE_url',
            condition=export_condition,
        )
        exporter.export_to_oracle()
        log.info('\n********** WWA_weibo_info end **********')

    # Spider configuration
    spider_options = {
        'tab_urls': 'WWA_weibo_info_urls',
        'tab_site': 'WWA_site_info',
        'tab_content': 'WWA_weibo_info_info',
        'parser_count': 1,
        'begin_callback': begin_callback,
        'end_callback': end_callback,
        'parser_params': result_list,
    }
    crawler = Spider(**spider_options)

    # Register the parser
    crawler.add_parser(weibo_info_parser)
    crawler.start()