Exemplo n.º 1
0
    def check_data_base(self):
        """Check the database structure.

        Instantiates every known table model, hands the instances to
        ``StructureGuard.check`` and logs the returned report when it is
        not empty.
        """
        table_models = (
            TBNews,
            TBArticle,
            TBProgram,
            TBWenzhangInfo,
            TBDictionaryType,
            TBDictionary,
            TBNewsGroup,
            TBNlpFilter,
            TBScrapingTarget,
            TBSoap,
            TBSoapTarget,
            TBGlobalTarget,
            TBMR,
            TBSpecialTarget,
            TBProgramType,
            TBSoapBlackList,
            TBHeavyText,
        )
        # Instances are created in declaration order, one per model.
        tables = [model() for model in table_models]

        report = StructureGuard(Configs()).check(tables)

        if len(report) > 0:
            LogGo.info(str(report))
Exemplo n.º 2
0
    def send_to_queue(result):
        """Forward one list-page result to the detail-page queue.

        ``result`` is a ``(code, value)`` pair; ``value`` unpacks to
        ``(target, detail_page_bundle_list, content_ruler, encode)``.

        code: 1 -- normal result, every detail-page bundle is queued and the
                   target's last access date is refreshed per bundle.
              any other value -- logged and recorded as an error on the
                   target (the original notes describe 0 as "maximum access
                   frequency reached").

        Afterwards the shared transport counters are updated under
        ``target_mutex``.

        :param result: ``(code, value)`` tuple as described above
        :return: None
        """
        global target_mutex, target_count, target_transported_count, all_target_transported

        code, value = result
        target, detail_page_bundle_list, content_ruler, encode = value

        if code == 1:
            for detail_page_bundle in detail_page_bundle_list:
                target_producer.target_queue.queue.put(
                    (target, detail_page_bundle, content_ruler, encode))
                ScrabingTarget.set_last_access_date(target.id)
        else:
            LogGo.error("List Page Error:" + str(target.data_key) + " Code: " +
                        str(code))
            ScrabingTarget.set_elog(target.id, "error code: " + str(code))

        if target_mutex.acquire():
            # Release in ``finally`` so that an exception raised while
            # updating or logging the counters cannot leave the mutex held.
            try:
                if target_count == target_transported_count:
                    all_target_transported = True
                else:
                    target_transported_count += 1
                    LogGo.debug('target_transported_count: ' +
                                str(target_transported_count))
            finally:
                target_mutex.release()
Exemplo n.º 3
0
    def scrape_detail(self, target, detail_page_bundle, content_ruler, encode):
        """Scrape one web detail page through ``UlWebRuler.scan_detail``.

        Returns a ``(code, payload)`` tuple:

        * ``(1, detail_page_result)`` when the ruler reports code 1 with a
          non-``None`` result;
        * ``(-1, message)`` when an exception is raised; ``message`` is the
          exception's first argument, or its ``repr`` when it has none;
        * ``(0, None)`` in every other case.

        When the ruler reports code -3 the request tuple is appended to
        ``self.temp_list`` (presumably for a later retry -- confirm against
        the consumer of ``temp_list``).
        """
        try:
            order = self.web_order

            ulweb = UlWebRuler()
            code, detail_page_result = ulweb.scan_detail(
                target, detail_page_bundle, order, content_ruler, encode)

            if code == 1:
                if detail_page_result is not None:
                    return (1, detail_page_result)
            elif code == -3:
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))

        except WebTargetOutOfDateException as e:
            # Guard against exceptions raised without arguments: indexing
            # ``e.args[0]`` directly would raise IndexError from the handler.
            return (-1, e.args[0] if e.args else repr(e))
        except Exception as e:
            LogGo.warning(repr(e))
            return (-1, e.args[0] if e.args else repr(e))

        return (0, None)
Exemplo n.º 4
0
    def scan_list(self, target, exists):
        """Fetch the article list of *target* and report whether it has new titles.

        Returns ``(1, (target, entries, None, None))`` when at least one
        entry has a title that is not in *exists*; otherwise
        ``(-1, (target, None, None, None))``.
        """
        entries = []

        cap = 'data'
        ruler = 'author:author;title:title;date:posttime;img:picurl;link:url;top:top;click:readnum_newest;vote_up:likenum_newest;subject:content'

        url = self.url.format(target.extra0, target.wx_hao)
        header = {'X-Requested-With': 'XMLHttpRequest'}
        raw = RequestHelper.get(url, header=header, file_cookie=Configs.gsdata_cookie_file)

        try:
            self.looper_js(entries, raw, exists, ruler, cap)
        except Exception as e:
            E.out_err(e)
            return (-1, (target, None, None, None))

        unseen = []
        if len(entries) > 0:
            entries = self.sort(entries)
            entries.reverse()

            unseen = [item for item in entries if item['title'] not in exists]

            LogGo.debug('newrank list length:' + str(len(unseen)))

        if len(unseen) > 0:
            # NOTE(review): the complete sorted list is returned here, not
            # only the unseen entries -- confirm that this is intended.
            return (1, (target, entries, None, None))
        return (-1, (target, None, None, None))
Exemplo n.º 5
0
    def reset_shutdown_status():
        """Write "0" into the system shutdown flag file.

        Best effort: any ordinary error is logged and swallowed. Only
        ``Exception`` is caught so that ``KeyboardInterrupt`` / ``SystemExit``
        still propagate.
        """
        try:
            file_name = Configs().system_shutdown_flag_filename

            FileHelper.create(file_name, "0")
        except Exception as e:
            # Include the cause so a failing flag file can be diagnosed.
            LogGo.error("error while reseting shutdown flag file!!! " + repr(e))
Exemplo n.º 6
0
    def start_mormal_mission(self):
        """Build and start the producer/consumer pipeline, then poll forever.

        Creates the target producer, the target consumer and the upload
        consumer from ``self.config``, starts them (upload consumer first,
        producer last) and then logs
        ``target_producer.is_all_target_transported()`` every five seconds.
        """
        # NOTE(review): declared global but never read or assigned in this method.
        global all_target_transported

        target_list = get_target_list()

        self.target_producer = target_producer(target_list,
                                               self.config.target_pool_size,
                                               self.config.target_queue_size)
        self.target_consumer = target_consumer()
        self.upload_consumer = upload_consumer(self.config.uploader_queue_size)

        # Consumers are started before the producer.
        self.upload_consumer.start()
        self.target_consumer.start()
        self.target_producer.start()

        # self.target_producer.pool.close()
        # self.target_producer.pool.join()

        # NOTE(review): this loop has no ``break``; the status is only logged,
        # never acted upon, so the statements after the loop are unreachable.
        while True:  # LogGo.debug(">>> target queue unfinishd count: " + str(self.target_producer.target_queue.queue.unfinished_tasks))
            time.sleep(5)
            LogGo.debug("target_transported_over: " +
                        str(target_producer.is_all_target_transported()))

        # self.target_consumer.queue.queue.join()
        # time.sleep(6000)

        # Unreachable while the loop above never terminates.
        LogGo.info('Loop Done! task count: ' + str(len(target_list)))
        SMTPServer.launch_mission_report()
Exemplo n.º 7
0
    def newrank_detail(self, target, detail_page_bundle, content_ruler,
                       encode):
        """Scrape one newrank detail page through ``NewrankRuler.scan_detail``.

        Returns ``(1, detail_page_result)`` when the ruler reports code 1
        with a non-``None`` result, ``(-1, None)`` when any exception is
        raised and ``(0, None)`` otherwise. On code -3 the request tuple is
        appended to ``self.temp_list``.
        """
        LogGo.info("On newrank detail: " + str(target.data_key))

        try:
            order = self.wechat_order
            ruler = NewrankRuler()
            code, detail_page_result = ruler.scan_detail(
                target, detail_page_bundle, order, content_ruler, encode)

            if code == -3:
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))
            elif code == 1 and detail_page_result is not None:
                return (1, detail_page_result)
        except Exception:
            LogGo.warning('error in newrank detail')
            return (-1, None)

        return (0, None)
Exemplo n.º 8
0
    def sogou_transfor(self):
        """Log the Sogou transfer task and return its status code.

        The transfer itself is disabled, so the returned status is always 0.
        """
        LogGo.info('搜狗转移')

        # Disabled implementation, kept as an outline:
        #   sogou = SougouTransforRuler()
        #   existsUrls = self.news.get_all_title()
        #   order = self.news.get_max_order_code('wechat')
        #   news, article = sogou.ExtraList(existsUrls, order)
        #   status = 1 if self.store(news, article) else -1
        #   build a report (task name, collected count, success note) and
        #   pass it to SMTPServer.build_mission_report(content)
        status = 0
        return status
Exemplo n.º 9
0
    def store_soap_target(self, targets: [], banned: bool = False):
        """Save every target and report whether at least one was stored.

        Targets go to ``self.banned_program`` when *banned* is true and to
        ``self.soap_target`` otherwise. A failing save is logged with its
        traceback and does not stop the remaining targets.

        Returns ``True`` when at least one target was saved.
        """
        saved = 0

        for target in targets:
            try:
                dao = self.banned_program if banned else self.soap_target
                dao.save(target)
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())
            else:
                saved += 1

        LogGo.info('Total :{} / {} elements Saved!'.format(saved, len(targets)))

        return saved > 0
Exemplo n.º 10
0
    def gs_detail(self, target, detail_page_bundle, content_ruler, encode):
        """Scrape one gsdata detail page through ``GsdataRuler.scan_detail``.

        Returns a ``(code, payload)`` tuple:

        * ``(1, detail_page_result)`` when the ruler reports code 1 with a
          non-``None`` result;
        * ``(-1, message)`` when an exception is raised; ``message`` is the
          exception's first argument, or its ``repr`` when it has none;
        * ``(0, None)`` in every other case.
        """
        LogGo.info("On gs detail: " + str(target.data_key))

        try:
            order = self.wechat_order

            gs = GsdataRuler()
            code, detail_page_result = gs.scan_detail(
                target, detail_page_bundle, order, content_ruler, encode)

            if code == 1:
                if detail_page_result is not None:
                    return (1, detail_page_result)
            # NOTE(review): the sibling detail methods queue the request on
            # code -3, this one on 3 -- confirm which value GsdataRuler uses.
            elif code == 3:
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))

        except Exception as e:
            LogGo.warning(repr(e))
            # Guard against exceptions raised without arguments: indexing
            # ``e.args[0]`` directly would raise IndexError from the handler.
            return (-1, e.args[0] if e.args else repr(e))

        return (0, None)
Exemplo n.º 11
0
    def scan_list(self, target, exists):
        """Collect the posts of *target* and report whether any id is new.

        Copies the target's forward/attitude thresholds onto ``self``, fills
        a list through ``self.loops`` and filters it with ``self.purify``.

        Returns ``(0, ...)`` when nothing was collected, ``(1, (target,
        posts, None, None))`` when at least one purified post has an id that
        is not in *exists*, and ``(-1, ...)`` otherwise.
        """
        self.limited_forward_count = target.limited_forward_count
        self.limited_attitude_count = target.limited_attitude_count

        posts = []

        # Simulated login: the status is hard-coded, so the failure branch
        # below is never taken at the moment.
        status = 'you got it'

        if status == '':
            LogGo.warning("Weibo: Loop scan faild!")
            return (-1, (target, None, None, None))

        # Login succeeded.
        self.loops(target, exists, posts)
        if len(posts) < 1:
            return (0, (target, None, None, None))

        posts = self.purify(posts)
        posts.reverse()

        unseen = [item for item in posts if item['id'] not in exists]

        LogGo.debug('newrank list length:' + str(len(unseen)))

        if len(unseen) > 0:
            # NOTE(review): the complete purified list is returned here, not
            # only the unseen posts -- confirm that this is intended.
            return (1, (target, posts, None, None))
        return (-1, (target, None, None, None))
Exemplo n.º 12
0
    def set_shutdown_status():
        """Write "1" into the system shutdown flag file.

        Best effort: any ordinary error is logged and swallowed. Only
        ``Exception`` is caught so that ``KeyboardInterrupt`` / ``SystemExit``
        still propagate.
        """
        try:
            file_name = Configs().system_shutdown_flag_filename

            FileHelper.create(file_name, "1")
        except Exception as e:
            # Include the cause so a failing flag file can be diagnosed.
            LogGo.error("something wrong at setting shutdown flag file !!! " + repr(e))
Exemplo n.º 13
0
    def purify(self,list):
        """Filter raw posts down to the ones worth keeping.

        A post is kept when all of the following hold:

        * its ``id`` has not been accepted already;
        * its ``text`` contains the character ``《``;
        * its ``attitudes_count`` / ``reposts_count`` are not below
          ``self.limited_attitude_count`` / ``self.limited_forward_count``
          (a check is skipped when either side is ``None``);
        * no more than three entries of ``self.exist_program`` occur in its
          text.

        A post that raises while being examined is logged and skipped.
        """
        if len(list) < 1:
            return []

        kept = []

        for entry in list:
            try:
                entry_id = entry['id']
                body = entry['text']
                attitudes = entry['attitudes_count']
                reposts = entry['reposts_count']

                # Basic filter: drop ids that were already accepted.
                if any(entry_id == earlier['id'] for earlier in reversed(kept)):
                    continue

                # First programme-name filter (may still let non-programmes
                # through).
                if '《' not in body:
                    continue

                # Second filter: engagement thresholds.
                if (self.limited_attitude_count is not None
                        and attitudes is not None
                        and attitudes < self.limited_attitude_count):
                    continue

                if (self.limited_forward_count is not None
                        and reposts is not None
                        and reposts < self.limited_forward_count):
                    continue

                # Third filter: reject texts naming more than three known
                # programmes.
                mentioned = 0
                for program in self.exist_program:
                    if program in body:
                        mentioned += 1

                    if mentioned > 3:
                        break
                else:
                    kept.append(entry)
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())

        return kept
Exemplo n.º 14
0
def base_init():
    """Initialise the shared helpers, passing a new ``Configs()`` call to each.

    Order: LogGo, RequestHelper, SMTPServer, Download, RequestHelperClassVer,
    ProxyHelper, MysqlHelper and finally ``BaseStrategy.init()`` (which takes
    no configuration).
    """
    for helper in (LogGo, RequestHelper, SMTPServer):
        helper.init(Configs())

    # Download is only constructed here; the instance itself is discarded.
    Download(Configs())

    for helper in (RequestHelperClassVer, ProxyHelper, MysqlHelper):
        helper.init(Configs())

    BaseStrategy.init()
Exemplo n.º 15
0
    def ExtraList(self, target, existsUrls, order):
        """Scan the whole message list of *target* and build its base records.

        Fetches ``target.extra0``, extracts the first page with
        ``ExtraJSON.extraWechatList`` and then keeps calling
        ``self.loopToFail`` until it yields no items, its third tuple element
        is not 1, or it raises. The collected items are reversed and handed
        to ``self.build_base_dic``.

        Returns the value of ``build_base_dic``, or an empty list when the
        first page cannot be extracted.
        """
        # order = ScrappdeDataDao.get_max_order_code()  # sort code stored in the database
        url = str(target.extra0)

        # Fetch the list address.
        raw = WechatRuler.req._get(url)

        try:
            first_page = ExtraJSON.extraWechatList(raw, 'msgList', WechatRuler.keys)
            items = first_page[0]
            next_index = str(first_page[1])
        except Exception as e:
            print(e)
            print("ERROR")
            return []

        while True:
            try:
                progress = '>>> scaning id: ' + next_index
                print(progress)
                LogGo.info(progress)
                page = self.loopToFail(url, next_index)

                page_items = page[0]
                next_index = str(page[1])
                is_continue = page[2]

                # An empty page ends the scan.
                if len(page_items) <= 0:
                    break
                items.extend(page_items)

                if is_continue != 1:
                    break

            except Exception as e:
                print(e)
                break

        print('>>> list scaning completed')
        print('>>>')

        items.reverse()

        print('>>> Start Build SQL')
        records = self.build_base_dic(target, items, existsUrls, order)
        print('>>> Build SQL Success')
        print('>>>')

        return records
Exemplo n.º 16
0
    def start(self):
        """
        Run the target pool: submit every target in ``self.targets`` to the
        pool so its scrape list is fetched separately, wait
        ``Configs().work_interval`` and repeat.
        :return: never returns (endless loop)
        """
        while True:
            LogGo.info("Start target pool")
            # Plain loop: the submissions are wanted for their side effect
            # only, the async results are not collected.
            for target in self.targets:
                self.pool.apply_async(target_producer.worker, (target, ))
            time.sleep(Configs().work_interval)
Exemplo n.º 17
0
    def check_shutdown_status():
        """Report whether a shutdown was requested through the flag file.

        Reads the shutdown flag file and parses its trimmed content as an
        integer. When the value is 1 the flag is reset through
        ``somebody_help.reset_shutdown_status()`` and ``True`` is returned.
        Returns ``False`` otherwise, including when the file cannot be read
        or parsed (that error is logged).

        Only ``Exception`` is caught so that ``KeyboardInterrupt`` /
        ``SystemExit`` still propagate.
        """
        try:
            file_name = Configs().system_shutdown_flag_filename

            status = int(StringHelper.trim(FileHelper.read(file_name)))

            if status == 1:
                somebody_help.reset_shutdown_status()
                return True
            return False
        except Exception:
            LogGo.error("system_shutdown_flag_file unavailable!")
            return False
Exemplo n.º 18
0
    def store_program(self, ids, programs):
        """Save each program under its matching id.

        *ids* and *programs* must have the same length, otherwise an error
        is logged and ``False`` is returned. A failing save is logged with
        its traceback and does not stop the remaining pairs.

        Returns ``True`` when at least one program was saved.
        """
        if len(ids) != len(programs):
            LogGo.error("ids count unmatch programs count")
            return False

        total = len(ids)
        if total == 0:
            LogGo.info("0 Element!")
            return False

        saved = 0
        for program_id, program in zip(ids, programs):
            try:
                self.program.save(program, program_id)
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())
            else:
                saved += 1

        LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

        return saved > 0
Exemplo n.º 19
0
    def start(self):
        """Optionally check the database structure, then run the normal mission.

        The structure check runs only when ``self.config.check_table`` is
        truthy. Any exception is logged with its traceback and swallowed.
        """
        try:
            check_requested = self.config.check_table
            if check_requested:
                self.check_data_base()

            # Previously initialised here (now disabled):
            # RequestHelperClassVer.init(self.config)
            # ProxyHelper.init(self.config)
            # MysqlHelper.init(self.config)

            self.start_mormal_mission()
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
Exemplo n.º 20
0
    def newrank_list(self, target):
        """Scan the newrank list page of *target*.

        Returns the ``(code, value)`` pair produced by
        ``NewrankRuler.scan_list`` (called with ``self.exists_title``), or
        ``(0, None)`` when anything raises; the exception is logged.
        """
        LogGo.info("On newrank list: " + str(target.data_key))

        try:
            known_titles = self.exists_title
            ruler = NewrankRuler()
            code, value = ruler.scan_list(target, known_titles)
        except Exception as err:
            LogGo.warning(repr(err))
            return (0, None)

        return (code, value)
Exemplo n.º 21
0
    def gs_list(self, target):
        """Scan the gsdata list page of *target*.

        Returns the ``(code, value)`` pair produced by
        ``GsdataRuler.scan_list`` (called with ``self.exists_signature``), or
        ``(0, None)`` when anything raises; the exception is logged.
        """
        LogGo.info("On gs list: " + str(target.data_key))

        try:
            known_signatures = self.exists_signature
            ruler = GsdataRuler()
            code, value = ruler.scan_list(target, known_signatures)
        except Exception as err:
            LogGo.warning(repr(err))
            return (0, None)

        return (code, value)
Exemplo n.º 22
0
    def weibo_list(self, target):
        """Scan the weibo list of *target*.

        Returns the ``(code, value)`` pair produced by
        ``WeiboRuler.scan_list`` (called with ``self.exists_identifier``), or
        ``(0, None)`` when anything raises; the exception is logged.
        """
        LogGo.info("On weibo list: " + str(target.data_key))

        try:
            known_ids = self.exists_identifier
            ruler = WeiboRuler()
            code, value = ruler.scan_list(target, known_ids)
        except Exception as err:
            LogGo.warning(repr(err))
            return (0, None)

        return (code, value)
Exemplo n.º 23
0
    def store(self, news_list: [], article_list: [], heavy_list: [] = None):
        """Save news/article pairs (and optional heavy texts) under a shared id.

        *news_list* and *article_list* must have the same length, otherwise
        an error is logged and ``False`` is returned. Every row gets one
        generated id that is used for the news, the article and, when
        *heavy_list* is given, the heavy text. A failing row is logged and
        does not stop the remaining rows.

        Returns ``True`` when at least one row was saved completely.
        """
        if len(article_list) != len(news_list):
            LogGo.error("news count unmatch article count")
            return False

        total = len(news_list)
        if total == 0:
            LogGo.info("0 Element!")
            return False

        # A flag, not a None test on the item: heavy items are passed on
        # unchanged whenever a heavy list was supplied.
        with_heavy = heavy_list is not None
        if with_heavy:
            rows = zip(news_list, article_list, heavy_list)
        else:
            rows = ((news, article, None)
                    for news, article in zip(news_list, article_list))

        saved = 0
        for news, article, heavy in rows:
            try:
                record_id = genUUID()

                self.news.save(news, record_id)
                self.article.save(article, record_id)
                if with_heavy:
                    self.heavy.save_with_news_id(heavy, record_id)
                saved += 1
            except Exception as e:
                LogGo.warning(repr(e))

        LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

        return saved > 0
Exemplo n.º 24
0
    def start(self):
        """Log the Sogou crawler task and return its status code.

        Returns 0 on success and -1 when an exception is raised (the
        traceback is printed and the exception is logged).
        """
        try:
            LogGo.info('搜狗爬虫')

            # Updatemp.loot()
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            LogGo.warning(e)
            return -1

        return 0
Exemplo n.º 25
0
    def weibo_detail(self, target, detail_page_bundle, content_ruler, encode):
        """Scrape one weibo detail through ``WeiboRuler.scan_detail``.

        Returns a ``(code, payload)`` tuple:

        * ``(1, detail_page_result_dic)`` when the ruler returns a
          non-``None`` result;
        * ``(-1, message)`` when an exception is raised; ``message`` is the
          exception's first argument, or its ``repr`` when it has none;
        * ``(0, None)`` when the ruler returns ``None``.
        """
        LogGo.info("On weibo detail: " + str(target.data_key))

        try:
            order = self.weibo_order

            weibo = WeiboRuler()
            detail_page_result_dic = weibo.scan_detail(target,
                                                       detail_page_bundle,
                                                       order, content_ruler,
                                                       encode)

            if detail_page_result_dic is not None:
                return (1, detail_page_result_dic)
        except Exception as e:
            LogGo.warning(repr(e))
            # Guard against exceptions raised without arguments: indexing
            # ``e.args[0]`` directly would raise IndexError from the handler.
            return (-1, e.args[0] if e.args else repr(e))

        return (0, None)
Exemplo n.º 26
0
    def scan_list(self, target, exists):
        """Fetch the newrank article list of *target* and report whether it has new titles.

        Posts ``flag``/``uuid`` to ``NewrankRuler.url`` and extracts the
        entries with ``ExtraJSON.extra_newrank_wechat_list``.

        Returns ``(1, (target, entries, None, None))`` when at least one
        entry has a title that is not in *exists*; otherwise (including any
        request or extraction failure, which is logged)
        ``(-1, (target, None, None, None))``.
        """
        # Request parameters.
        par = (['flag', 'true'], ['uuid', target.extra0])
        # Keys to extract.
        # NOTE(review): 'publicTime' is listed twice -- kept as is because the
        # extractor's handling of duplicates is not visible here.
        keys = [
            'title', 'author', 'publicTime', 'url', 'clicksCount', 'likeCount',
            'publicTime', 'summary'
        ]

        try:
            raw = RequestHelper.post(NewrankRuler.url,
                                     par,
                                     file_cookie=Configs.newrank_cookie_file)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
            return (-1, (target, None, None, None))

        try:
            entries = ExtraJSON.extra_newrank_wechat_list(raw, keys)
        except Exception as e:
            # Was a bare ``except:`` that also swallowed SystemExit /
            # KeyboardInterrupt and hid the cause of the failure.
            LogGo.warning(repr(e))
            return (-1, (target, None, None, None))

        unseen = []
        if len(entries) > 0:
            entries.reverse()

            unseen = [item for item in entries if item['title'] not in exists]

            LogGo.debug('newrank list length:' + str(len(unseen)))

        if len(unseen) > 0:
            # NOTE(review): the complete list is returned here, not only the
            # unseen entries -- confirm that this is intended.
            return (1, (target, entries, None, None))
        return (-1, (target, None, None, None))
Exemplo n.º 27
0
    def store_program_type(self, pair):
        """Save every ``(program, type)`` item of *pair*.

        A failing item (including one that cannot be unpacked into two
        values) is logged with its traceback and does not stop the
        remaining items.

        Returns ``True`` when at least one item was saved, ``False``
        otherwise (also for an empty *pair*).
        """
        total = len(pair)
        if total == 0:
            LogGo.info("0 Element!")
            return False

        saved = 0
        for item in pair:
            try:
                program, program_type = item
                self.program_type.save_by_program_type(program, program_type)
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())
            else:
                saved += 1

        LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

        return saved > 0
Exemplo n.º 28
0
    def store_count(self, list):
        """Store every program of *list* that is not known yet.

        A program for which ``self.check_for_exists`` returns 1 is counted
        as updated; every other program is saved through ``self.pc`` under a
        newly generated id. A failing program is logged with its traceback
        and does not stop the remaining ones.

        Returns ``True`` when at least one program was saved or counted as
        updated, ``False`` otherwise (also for an empty *list*).
        """
        total = len(list)
        if total == 0:
            LogGo.info("0 Element!")
            return False

        saved = 0
        updated = 0
        for program in list:
            try:
                if self.check_for_exists(program) == 1:
                    updated += 1
                    continue

                new_id = genUUID()
                self.pc.save(program, new_id)
                saved += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())

        LogGo.info('Total :{} ({} Saved, {} Updated)'.format(total, saved, updated))

        return saved > 0 or updated > 0
Exemplo n.º 29
0
    def gensto(self, dao, list):
        """Save or update every item of *list* through *dao*.

        Each item is passed to ``dao.save_or_update`` together with a newly
        generated id. A failing item is logged and does not stop the
        remaining items.

        :param dao: object providing ``save_or_update(item, id)``
        :param list: items to store
        :return: ``True`` when at least one item was stored, ``False``
                 otherwise (also for an empty *list*)
        """
        count = len(list)
        if count == 0:
            LogGo.info("0 Element!")
            return False

        element = 0
        for item in list:
            try:
                dao.save_or_update(item, genUUID())
                element += 1
            except Exception as e:
                LogGo.warning(repr(e))

        LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                   ' elements Saved!')

        return element > 0
Exemplo n.º 30
0
    def genup(self, dao, list):
        """Apply every ``(update, where)`` pair of *list* through *dao*.

        Each pair is passed to ``dao.update(update, where)``. A failing pair
        is logged and does not stop the remaining pairs.

        :param dao: object providing ``update(update, where)``
        :param list: iterable of ``(update, where)`` pairs
        :return: ``True`` when at least one update succeeded, ``False``
                 otherwise (also for an empty *list*)
        """
        count = len(list)
        if count == 0:
            LogGo.info("0 Element!")
            return False

        element = 0
        for (update, where) in list:
            try:
                dao.update(update, where)
                element += 1
            except Exception as e:
                LogGo.warning(repr(e))

        LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                   ' elements Updated!')

        return element > 0