Example #1
def start_yingyongbao(**kwargs):
    hr = HandleRedis(1)
    type2 = hr.get_data_redis("yingyongbao_types")
    t = type2.split(',')[0][2:-1]
    a = 0
    for i in range(0, 2000):
        try:
            kwargs = dict(module="yingyongbao_data", data=dict(t=t, i=i))
            dicts = start(**kwargs)
            if dicts:
                log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(dicts)))
                details = dict(module="yingyongbao_save_details", data=dicts)
                # start(**details)
                comment = dict(module="yingyongbao_save_comment", data=dicts)
                # start(**comment)
                # Persist the details and comments in background threads.
                threading.Thread(target=start, kwargs=details).start()
                threading.Thread(target=start, kwargs=comment).start()
            else:
                # Count empty pages and give up once ten of them come back.
                a += 1
            if a == 10:
                break
        except Exception as e:
            raise e
Example #2
def main():
    global hr
    hr = HandleRedis(7)
    while True:
        url = hr.get_data_redis("TB_CREDIT_FJ_URL")
        if url:
            kwargs = dict(url=url)
            get_detail(**kwargs)
        else:
            # Back off briefly when the queue is empty instead of busy-looping
            # (assumes `import time` at module level).
            time.sleep(1)
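Every snippet on this page goes through the project's `HandleRedis` wrapper (imported in Example #7 from `tax.model.handle_redis`). The class itself is not shown, so the following is only a minimal sketch of the interface the examples appear to rely on; the redis commands, connection parameters and behaviour are assumptions, not the project's actual implementation.

import redis


class HandleRedis:
    """Hypothetical stub of the wrapper used throughout these examples."""

    def __init__(self, db):
        # Assumed: the constructor argument selects a redis database index.
        self._conn = redis.StrictRedis(host="localhost", port=6379,
                                       db=db, decode_responses=True)

    def get_data_redis(self, key):
        # Assumed: pop the next queued value for a key; None once it is empty.
        return self._conn.lpop(key)

    def cache_list_redis(self, key, items):
        # Assumed: push a batch of items onto a list for later persistence
        # (see TB_SHIXIN_ENTERPRISE / TB_SHIXIN_PERSON in Example #4).
        return self._conn.rpush(key, *items)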
Example #3
def start_wenshu_peichang(**kwargs):
    r = HandleRedis(1)
    name = r.get_data_redis("wenshu_keys")
    if not name:
        log.crawler.info('裁判文书关键词遍历完毕.....')
        return
    print(name)
    index = 1
    while True:
        log.crawler.info("*" * 80)
        log.crawler.info("start crawler wenshu page is:%d" % index)
        kwargs = dict(module="wenshu_peichang_data",
                      data=dict(key=name, index=index),
                      proxies=True)
        items = start(**kwargs)
        # Check the result before calling len(): start() may return nothing.
        if not items or len(items) == 1:
            break
        log.crawler.info("获取的文件ID长度为:%d" % (len(items) - 1))
        run_eval = items[0]['RunEval']
        # monkey.patch_all()
        # pool = Pool(20)
        threads = []
        for item in items[1:]:
            # Prune finished threads and wait while ten crawls are still in flight.
            threads = [t1 for t1 in threads if t1.is_alive()]
            while len(threads) >= 10:
                time.sleep(3)
                threads = [t1 for t1 in threads if t1.is_alive()]
            data = {}
            data["docid"] = item["文书ID"]
            data["CASE_TYPE"] = item["案件类型"]
            data["CASE_TIME"] = item["裁判日期"]
            data["CASE_NAME"] = item["案件名称"]
            data["CASE_NUM"] = item["案号"]
            data["COURT_NAME"] = item["法院名称"]
            data['runeval'] = run_eval
            d = dict(module="wenshu_peichang_detail", data=data, proxies=True)
            t = threading.Thread(target=start, kwargs=d)
            t.daemon = True
            t.start()
            # Keep the thread so the cap above and the final join below see it.
            threads.append(t)
        for t in threads:
            t.join()
        index += 1
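The manual thread bookkeeping in Example #3 (pruning dead threads and capping the count at ten) can be written more compactly with `concurrent.futures`. The sketch below is an alternative illustration, not the author's code; it assumes `start` is the same dispatcher imported from `tax.control_spider` in Example #7 and accepts the same keyword arguments.

from concurrent.futures import ThreadPoolExecutor


def crawl_details(items, run_eval):
    # Cap concurrency at ten detail crawls; the executor queues the rest.
    with ThreadPoolExecutor(max_workers=10) as pool:
        for item in items[1:]:
            data = dict(docid=item["文书ID"],
                        CASE_TYPE=item["案件类型"],
                        CASE_TIME=item["裁判日期"],
                        CASE_NAME=item["案件名称"],
                        CASE_NUM=item["案号"],
                        COURT_NAME=item["法院名称"],
                        runeval=run_eval)
            pool.submit(start, module="wenshu_peichang_detail",
                        data=data, proxies=True)
    # Leaving the with-block waits for every submitted crawl to finish.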
Example #4
def baidu_shixin(**kwargs):
    """
       定时任务调用失信爬虫百度失信的爬取策略
       :return:
       """
    # ip_pool=get_proxies_from_redis()
    r = HandleRedis(1)
    name = r.get_data_redis("shixin_words")
    # flag is a switch that decides whether the keywords need to be traversed again
    # flag = r.get('baidushixin_flag')

    if not name:
        log.crawler.info('百度失信关键词遍历完毕.....')
        return
    try:
        pn = 0
        hr = HandleRedis(7)
        while isinstance(pn, int):
            kwargs = dict(module='baidu', data=dict(name=name, pn=pn))
            # if ip_pool:
            #     proxies = random.choice(ip_pool)
            # else:
            #     ip_pool = get_proxies_from_redis()
            #     proxies = ip_pool.pop()
            # kwargs['data']['proxies'] = proxies
            log.crawler.info("crawler name is:{},pn is:{}".format(name, pn))
            result_dict = start(**kwargs)
            qiye = result_dict['enterprise']
            person = result_dict['person']
            if qiye:
                hr.cache_list_redis('TB_SHIXIN_ENTERPRISE', qiye)
                log.crawler.info(
                    "cache qiye shixin into redis success length is:%s" %
                    len(qiye))
            if person:
                hr.cache_list_redis('TB_SHIXIN_PERSON', person)
                log.crawler.info(
                    "cache person shixin into redis success length is:%s" %
                    len(person))
            pn = result_dict["pn"]
            if pn == "finished":
                log.crawler.info("数据请求完毕name:{},pn:{}".format(name, pn))
                break
            elif pn == 2000:
                break
            else:
                pn += 10
    except Exception as err:
        log.error.info('百度失信爬虫发生异常,信息为:\n%s' % err)
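The TB_SHIXIN_ENTERPRISE and TB_SHIXIN_PERSON lists cached above are presumably drained later by the persist task in Example #8. A hedged driver sketch under that assumption (the table names matching the redis keys, and `get_result_data` returning dicts accepted by `generate_sql_dict`, are guesses, not confirmed by the source):

def flush_shixin_to_mysql():
    # Hypothetical glue code, not part of the project: drain the cached
    # shixin batches and persist each one with the helpers from Example #8.
    for table in ("TB_SHIXIN_ENTERPRISE", "TB_SHIXIN_PERSON"):
        datas = get_result_data(table)
        if datas:
            persis_data_into_mysql(table, datas)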
Example #5
def get_page(**kwargs):
    hr = HandleRedis(7)
    proxies = kwargs.get("proxies", None)
    keyword = kwargs.get('keyword')
    for page in range(1, 101):
        url = 'http://www.fjcredit.gov.cn/creditsearch.redlist.dhtml?source_id=100&kw={}&page={}'.format(keyword, page)
        # `headers` and `get_detail` are expected to be defined at module level.
        response = requests.get(url, headers=headers, proxies=proxies)
        response.encoding = response.apparent_encoding
        kwargs = dict(response=response.text, hr=hr)
        get_detail(**kwargs)
Example #6
def start_sougou(**kwargs):
    hr = HandleRedis(1)
    type2 = hr.get_data_redis("sougou_type").split(',')[0][2:-1]
    a = 0
    for i in range(0, 20000):
        try:
            kwargs = dict(module="sougou_content", data=dict(type1=type2, i=i))
            content = start(**kwargs)
            dicts = {"content": content}
            if content:
                # Log the length of the fetched content.
                log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(content)))
                details = dict(module="sougou_save_data", data=dicts)
                # start(**details)
                threading.Thread(target=start, kwargs=details).start()
            else:
                a += 1
            if a == 10:
                break
        except Exception as e:
            log.crawler.error(e)
Example #7
# @Author  : liuyd
# @Site    :
# @File    : tasks.py
# @Software: PyCharm
import time
from gevent import monkey
import threading
from gevent.pool import Pool
import ast
from tax.monitor.save_mysql import Mysql
from tax.util import log
from tax.control_spider import start
from tax.model.handle_redis import HandleRedis, RedisPool
import multiprocessing

hr = HandleRedis(1)


def enterprise_list(**kwargs):
    k = kwargs.get("k", None)
    if not k:
        raise ValueError("k 参数存在错误......")
    item = hr.get_data_redis(k)
    if item:
        url = item
        # The redis value is expected to be the repr of a dict
        # holding 'url', 'city' and 'prov' keys.
        item = ast.literal_eval(item)
        if isinstance(item, dict):
            url = item['url']
        city = item['city']
        prov = item['prov']
        log.crawler.info("start crawler prov:%s,city:%s,url is:%s" %
                         (prov, city, url))
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time    : 2018/10/9 14:06
@Author  : liuyd
@File    : persist_task.py
@desc    : pull data from redis and store it in mysql
"""
import ast
from tax.config.conf import *
from tax.PublicSpider.get_content_static import log, HandDb, DbHandle
from tax.model.handle_redis import HandleRedis
from tax.PublicSpider.common import getmd5

hr = HandleRedis(7)
db = DbHandle()


def persis_data_into_mysql(table, datas):
    hd = HandDb(table)
    for data in datas:
        sql = hd.generate_sql_dict(data)
        db.insert_db_func(sql=sql)


def get_result_data(table):
    keys_name = table
    pop_data_list = []
    # list used to hold the feedback data
    feedback_data_list = []
    # maximum amount of data passed in one batch