def test_EasyJobCrawlerHandler():
    """Run EasyJobCrawlerHandler once against a single entry URL.

    The handler is configured with two spider paths and a writer entry; a
    Job seeded with one Message carrying the entry URL is then executed.
    The function makes no assertions, so it only fails if something raises.
    """
    spiderPaths = [
        "DioSpider.OldSpider.Boss.BossSearchSpider.BossSearchSpider",
        "DioSpider.OldSpider.Boss.BossJobSpider.BossJobSpider",
    ]
    writerConfig = {
        "id": 2,
        "params": {"db_name": "db", "collection_name": "boss"},
    }
    handlerConfig = {
        "id": "3",
        "params": {"spiders": spiderPaths, "writer": writerConfig},
    }
    handler = EasyJobCrawlerHandler(config=handlerConfig)

    # Entry URL the seed message points the crawl at.
    enterUrl = ("https://www.zhipin.com/c101280100/"
                "?query=%E7%88%AC%E8%99%AB&period=3&ka=sel-scale-3")
    seedMsg = Message(info={MSG_FIELD.ENTER_URL: enterUrl})

    seededJob = Job(initMsgs=[seedMsg])
    handler.execute(seededJob)
Exemplo n.º 2
0
def run():
    """Crawl the search page, then crawl and persist every URL it yields.

    Steps:
      1. Build a MessageMongodbWriter and a Job with a fixed id.
      2. Run BossSearchSpider on ENTER_URL to obtain messages.
      3. For each message, run BossJobSpider on its entry URL and pass the
         collected results to the writer.

    A failure while processing one URL is logged together with its traceback
    and the loop moves on to the next URL, so a single bad page does not
    abort the whole run.
    """
    logging.info("准备跑数")
    writer = MessageMongodbWriter(config={
        "id": -1,
        "params": {
            "db_name": "test",
            "collection_name": "boss"
        }
    })
    job = Job(id="boss_crawl_dali")

    logging.info("搜索爬虫跑数")
    msgs = BossSearchSpider().crawl(ENTER_URL, {})
    logging.info("搜索爬虫跑数 结束")

    for msg in msgs:
        enterUrl = msg.getEnterUrl()
        try:
            logging.info("处理url {}".format(enterUrl))
            # Materialize the crawl output so its size can be logged before writing.
            rst = list(BossJobSpider().crawl(enterUrl, {}))
            logging.info("写入数据 {}条".format(len(rst)))
            writer.run(job, rst)
            logging.info("写入成功")
        except Exception:
            # Deliberately broad: this is the per-URL boundary of a best-effort
            # batch. logging.exception records the message and the traceback
            # through the logging handlers instead of printing to stderr.
            logging.exception("{} 跑数失败".format(enterUrl))
    logging.info("跑数结束")
Exemplo n.º 3
0
def test_write():
    """Write one content-type Message through a MongodbWriter.

    Builds a Job, a Message with a small info dict, and a writer configured
    with a database and collection name, then calls ``write`` once. The
    function makes no assertions, so it only fails if something raises.
    """
    targetJob = Job(id="xxxxxx")

    personInfo = {"name": "mryang", "age": "18"}
    contentMsg = Message(info=personInfo, type=SeedType.content)

    writerParams = dict(db_name="dio", collection_name="person")
    MongodbWriter(params=writerParams).write(targetJob, contentMsg)
Exemplo n.º 4
0
    def run(self, job: Job, **kwargs) -> Job:
        """
        Wait until a job has been stored for this runner, then return it.

        Repeatedly reads the entry keyed by the runner id from the hash named
        ``self.runnerJobMatchName`` and sleeps ``self.waitingTime`` between
        attempts while no entry is present.

        :param job: incoming job; its value is not read, the returned job is
            built solely from the stored JSON string
        :param kwargs: must contain ``context``, an object whose
            ``get("runner_id")`` yields this runner's id
        :return: the Job produced by ``Job.form`` from the stored JSON string
        """
        context = kwargs.get("context")
        runnerId = context.get("runner_id")
        while True:
            # Fetch the JSON string stored for this runner.
            runnerJobMatch = Hash(Connection.REDIS_DEFAULT,
                                  self.runnerJobMatchName)
            jobJsonStr = runnerJobMatch.hget(runnerId)

            # Build the job only once an entry actually exists. The check must
            # be on the fetched value, not on the ``job`` argument: testing
            # ``job`` either parsed a missing entry or looped forever.
            # NOTE(review): assumes hget returns None for a missing field - confirm.
            if jobJsonStr is not None:
                job = Job.form(jobJsonStr)
                self.logger.info("{} get job {}".format(runnerId, jobJsonStr))
                return job

            # Nothing assigned yet; pause before polling again.
            TimeUtil.sleep(self.waitingTime)
Exemplo n.º 5
0
def createTestJob():
    """Return a newly constructed Job whose id is ``"dio_test"``."""
    testJob = Job(id="dio_test")
    return testJob
Exemplo n.º 6
0
 def toPython(item):
     """Build a Job object from ``item`` by delegating to ``Job.form``."""
     jobObj = Job.form(item)
     return jobObj
Exemplo n.º 7
0
# @Time         : 18-11-28 下午10:39
# @Author       : DioMryang
# @File         : Example.py
# @Description  :
import uuid

from DioFramework import Const
from DioFramework.Base.Job.Job import Job
from DioFramework.Base.Message import Message

# Shared example fixtures, created once at import time.

# A content-type message whose info carries an entry URL and a content string.
testMsg = Message(type=Message.CONTENT)
testMsg.updateInfo({
    Const.MSG_FIELD.ENTER_URL: "http://dio.com",
    Const.MSG_FIELD.CONTENT: "la la la test Done"
})

# A job with a fixed id.
testJob = Job(id="miao_miao_test")

# Single-element list holding the example message.
testMsgs = [testMsg]