示例#1
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on    : 2019-03-14 18:49
# @Author  : zpy
# @Software: PyCharm

# todo 完善

from conf.config import mongo_storage
from plogger import get_logger

log = get_logger("storage")

class BaseBackend:
    """Base class for storage backends.

    Keeps a reference to the ``app`` object it was created for so that
    subclasses can read it when storing data.
    """

    def __init__(self, app):
        """Remember the ``app`` this backend belongs to."""
        self.app = app

    def pre_check(self):
        """Check the configuration and parameters passed in.

        Placeholder hook: it performs no checks yet.

        :return: ``None``
        """
        return None


class MongoBackend(BaseBackend):
    """Backend that stores a batch of documents through ``mongo_storage``.

    Documents are written to ``mongo_storage[str(self.app)][self.app.group]``.
    """

    def save(self, data):  # TODO: decide whether the data itself should be passed in here
        """Insert ``data`` into the collection selected by the app and its group.

        The number of documents is logged before anything is written.  An
        empty batch is logged and then skipped instead of being sent to the
        storage layer.

        :param data: sized collection of documents to insert
        :return: ``None``
        """
        name = str(self.app)
        group = self.app.group
        count = len(data)
        log.info("{} {} insert {}".format(name, group, count))
        if count == 0:
            # insert_many rejects an empty document list (pymongo raises
            # instead of treating it as a no-op), so there is nothing to do.
            return
        mongo_storage[name][group].insert_many(data)
示例#2
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on    : 2019-03-11 22:25
# @Author  : zpy
# @Software: PyCharm

from plogger import get_logger
from conf.config import redis_client
from backend.storage import MongoBackend

log = get_logger('core_task')

# 1个sdk -> n个 source -> n个group

# todo config 子task粒度的(celery配置) 具体一批任务粒度的(结果配置、备份配置等)


class Task:
    """
    A thin layer on top of celery; task scheduling, execution and
    dispatching all go through this class.
    """

    # Not set here; presumably assigned from outside before use -- confirm with callers.
    app = None

    def __init__(self, **kwargs):  # TODO: interface design
        """Build a task from keyword arguments.

        ``tasks``, ``group`` and ``source`` are required; a missing one
        raises ``KeyError``.  ``code`` starts at ``-1`` and the received
        keyword arguments are logged.
        """
        # Copy the required arguments onto the instance, in this order.
        for field in ('tasks', 'group', 'source'):
            setattr(self, field, kwargs[field])
        self.code = -1
        log.info(('init', kwargs))
示例#3
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on    : 2019-03-06 17:17
# @Author  : zpy
# @Software: PyCharm

# 处理请求相关

import requests
from requests_html import HTMLSession
import random
from conf.config import PROXY as proxy
from plogger import get_logger

log  = get_logger('prequest')

def get_proxy(timeout=10):
    """
    Fetch one proxy from the proxy service.

    Sends a GET request to the URL configured as ``PROXY`` and returns the
    value stored under the ``'proxy'`` key of the JSON response.

    :param timeout: seconds to wait for the proxy service before giving up;
        passed straight to ``requests.get``.  Without it a stalled service
        would block the caller indefinitely.
    :return: the ``'proxy'`` field of the JSON response
    :raises requests.exceptions.Timeout: if the service does not answer in time
    """
    response = requests.get(proxy, timeout=timeout)
    return response.json()['proxy']

def ua():
    headers_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
示例#4
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on    : 2019-03-06 17:19
# @Author  : zpy
# @Software: PyCharm

import abc
from abc import ABCMeta
from plogger import get_logger, func_time_logger
from exceptions import SpiderException
from requests.exceptions import Timeout, ConnectionError
from collections import deque
from spider.prequest import Msession

log = get_logger('pspider')

# todo v1 完成
# 请求解析初版
# 先不考虑链式请求


class Pspider(metaclass=abc.ABCMeta):
    """Abstract base class for spiders.

    Subclasses must implement :meth:`task`; the base class itself cannot be
    instantiated.
    """

    def __init__(self):
        """Create the per-instance state: no tasks, no session, empty result."""
        self.tasks = []
        self.session = None
        self.result = {}

    @abc.abstractmethod
    def task(self):
        """Hook that every concrete spider has to provide."""
示例#5
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on    : 2019-03-12 15:42
# @Author  : zpy
# @Software: PyCharm

from celery import Celery
from kombu import Queue, Exchange
from conf.config import celery_broker
from plogger import get_logger

log = get_logger('celery_init')

# Module-level Celery application; the broker comes from conf.config.
capp = Celery('app', broker=celery_broker)

# Make sure tasks are executed reliably: acknowledge late and reject on
# worker loss, so a task is not dropped when its worker dies mid-run.
capp.conf.update(CELERY_REJECT_ON_WORKER_LOST=True, CELERY_ACKS_LATE=True)


def init_sdks():
    from app.register import _all_sdk_
    from app import tasks
    queues = []
    for s in _all_sdk_:
        s.app = capp
        name = s.__str__()
        log.info("load %s", name)
        tasks.__dict__[name] = s.ptask(name, rate_limit='10/m')
        queues.append(
            Queue(name,
                  exchange=Exchange(name, type='direct'),
示例#6
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on    : 2019-03-12 17:02
# @Author  : zpy
# @Software: PyCharm

from app.task import Task
from example.testspider import LagouSpider
from example.zhihuspider import BihuSpider
from conf.config import redis_client, mongo_storage
import time
from plogger import get_logger

log = get_logger('testsdks')


class TestTask(Task):
    """Minimal task whose ``start`` only reports what it was called with."""

    def start(self, **kwargs):
        """Print the received keyword arguments and return ``'test'``."""
        outcome = 'test'
        print('instance start', kwargs)
        return outcome


class LagouTask(Task):
    # Task that runs a LagouSpider over this task's ``tasks`` and prints
    # what the spider's 'job' result exports.
    def start(self):
        """Run the Lagou spider and print its exported 'job' result."""
        spider = LagouSpider()
        # Hand this task's work items to the spider before starting it.
        spider.tasks = self.tasks
        spider.start()
        # NOTE(review): export_sql('test.test') presumably yields one item per
        # record for the 'test.test' target -- confirm against its definition.
        for d in spider.result['job'].export_sql('test.test'):
            print(d)