def process_one_cat(url, cat_list):
    """Crawl the listing pages of one category and store every job found.

    Each page URL is built as ``url + <page number> + "/"``. The page number
    and the ``isLastPage`` flag both live in ``global_var``; this function
    only initialises and reads them. Something outside this function
    (presumably ``Lagou.getJobListPerPage`` -- TODO confirm) has to advance
    the page number and set the flag, otherwise the loop never terminates.

    Args:
        url: Category URL prefix to which the page number is appended.
        cat_list: Category labels handed to ``db.insert`` with every job.
    """
    global_var._init()
    global_var.set_value("PAGE_NUM_PROCESSING", 1)
    global_var.set_value("isLastPage", False)
    while True:
        page_num = global_var.get_value('PAGE_NUM_PROCESSING')
        tmp_url = url + str(page_num) + "/"
        print("while True main:" + tmp_url)
        # A fresh session is used for every page, as before; the context
        # manager makes sure it is closed even if crawling raises.
        with requests.Session() as s:
            lg = Lagou()
            pagegen = lg.getJobListPerPage(tmp_url, s)
            for item in pagegen:
                # Random pause between 1.05 and 6 seconds per yielded batch.
                time_wait = 1 + float(random.randint(1, 100)) / 20
                time.sleep(time_wait)
                print("休息时间:" + str(time_wait))
                for job in item:
                    db.insert(job, cat_list)
        print("跳出生成器")
        print("休息5秒钟")
        time.sleep(5)

        if global_var.get_value("isLastPage"):
            print("----------爬取结束---------,共" +
                  str(global_var.get_value('PAGE_NUM_PROCESSING')) + "页")
            break
示例#2
0
文件: main.py 项目: zkj007/crawler
def do_task(city, query):
    """Open the search page on every supported job site for one query.

    A Boss, a Lagou and a Zhilian object are built in that order from the
    same ``city`` and ``query``, and ``open_url()`` is called on each one
    right after it is constructed.
    """
    opened = []
    for site_cls in (Boss, Lagou, Zhilian):
        site = site_cls(city, query)
        site.open_url()
        # Hold a reference so earlier objects stay alive until we return.
        opened.append(site)
示例#3
0
    def __init__(self):
        """Create the queue and the site objects, then set per-site settings.

        Each site gets a queue key ('boss' / 'lagou') plus two level
        numbers: a task level of 2 and a URL level of 1.
        """
        self.mrq = MyRedisQueue()
        self.boss_obj = Boss()
        self.lagou_obj = Lagou()

        # Both sites share the same level numbering; only the key differs.
        self.boss_key, self.lagou_key = 'boss', 'lagou'
        self.boss_task_level = self.lagou_task_level = 2
        self.boss_url_level = self.lagou_url_level = 1
def get_lagou():
    """Prompt for a search keyword and a city, then run the Lagou crawler.

    The keyword prompt repeats until the confirmation answer is exactly
    "y". The crawler is built with the module-level ``rootfile`` and with
    ``username``, ``password`` and ``n`` set to None, after which its
    ``main()`` method is called.
    """
    keyword = input("【拉勾网】 请输入搜索关键词: ")
    judge = input("是否确定?y/n: ")
    # Any answer other than "y" discards the keyword and asks again.
    while judge != "y":
        keyword = input(">> 【拉勾网】 请输入搜索关键词: ")
        judge = input("是否确定?y/n: ")
    city = input(">> 【拉勾网】 请输入城市: ")
    print("Sucessfully! 开始采集【拉勾网 %s-%s】 数据..." % (keyword, city))
    crawler_options = {
        "username": None,
        "password": None,
        "city": city,
        "keywords": keyword,
        "n": None,
        "rootfile": rootfile,
    }
    clawer = Lagou(**crawler_options)
    # Disabled step kept for reference:
    # clawer.dbtable.remove({"城市":clawer.city,"关键词":clawer.keywords})
    clawer.main()
示例#5
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# @Author: Dongyouyuan
# @Software: PyCharm
# @File: main.py
# @Time: 17-11-23 上午11:57
from lagou import Lagou
from db import init_db

if __name__ == "__main__":
    # On the first run, initialise your database.
    init_db()

    # Browser cookie sent with the crawl, one "name=value; " pair per line.
    cookie = (
        "user_trace_token=20171120113056-ba4eb409-b665-4fc9-8cbf-92ea7e4b1520; "
        "LGUID=20171120113057-38d05b37-cda3-11e7-996a-5254005c3644; "
        "index_location_city=%E5%85%A8%E5%9B%BD; "
        "X_MIDDLE_TOKEN=8f7c1b487db070b59111650e8396d3d3; "
        "X_HTTP_TOKEN=9cfa44c75b49564bf4bb24410edeb4e7; "
        "TG-TRACK-CODE=index_hotsearch; "
        "JSESSIONID=ABAAABAACDBAAIAFDC183EB010548C17203563788B54991; "
        "Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511148657,1511172863,1511258038; "
        "Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511328820; "
        "LGSID=20171122133340-b23fe104-cf46-11e7-9986-5254005c3644; "
        "PRE_UTM=; "
        "PRE_HOST=; "
        "PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2FPHP%2F%3FlabelWords%3Dlabel%3FlabelWords%3Dhot; "
        "PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_PHP%3Fpx%3Ddefault%26city%3D%25E5%25B9%25BF%25E5%25B7%259E; "
        "LGRID=20171122133340-b23fe39c-cf46-11e7-9986-5254005c3644; "
        "_ga=GA1.2.525386246.1511148657; "
        "_gid=GA1.2.2044927823.1511148657; "
        "hibext_instdsigdip=1; "
        "SEARCH_ID=107bd6aa53a042e5955ff22f624fe3c5"
    )
    city, lang = "杭州", "python"
    lagou = Lagou(cookie=cookie, city=city, lang=lang)
    lagou.get_data(page_sum=20)
    print(lagou.list_dict_results)
    # Storing the results is currently disabled:
    # lagou.insert_to_db()
            time_wait = 1 + float(random.randint(1, 100)) / 20
            time.sleep(time_wait)
            print("休息时间:" + str(time_wait))
            for job in item:
                db.insert(job, cat_list)
        print("跳出生成器")
        print("休息5秒钟")
        time.sleep(5)

        if global_var.get_value("isLastPage"):
            print("----------爬取结束---------,共" +
                  str(global_var.get_value('PAGE_NUM_PROCESSING')) + "页")
            break


# Walk the three-level category mapping returned by get_all_positions():
# top category -> second-level category -> third-level category -> URL.
# NOTE(review): .items() assumes every level is a plain dict -- confirm.
lg = Lagou()
top_cat_dict = lg.get_all_positions()

for top_cat_key, grade2_cat_dict in top_cat_dict.items():
    for grade2_key, grade3_cat_dict in grade2_cat_dict.items():
        for grade3_key, url in grade3_cat_dict.items():
            print(top_cat_key)
            print(grade2_key)
            print(grade3_key)
            cat_list = [top_cat_key, grade2_key, grade3_key]
示例#7
0
class Main:
    def __init__(self):
        self.mrq = MyRedisQueue()
        self.boss_obj = Boss()
        self.lagou_obj = Lagou()

        self.boss_key = 'boss'
        self.boss_task_level = 2
        self.boss_url_level = 1

        self.lagou_key = 'lagou'
        self.lagou_task_level = 2
        self.lagou_url_level = 1

    def boss_worker(self):
        while True:
            task_type, task = self.mrq.pop_task(keys=[self.boss_key])
            print task_type, task
            level = task_type.split('-')[-1]

            if level == str(self.boss_task_level):
                detail_urls = self.boss_obj.get_position_url(
                    task)  # 获取职位详细信息的url
                self.mrq.push_task(self.boss_key, detail_urls,
                                   self.boss_url_level)

            elif level == str(self.boss_url_level):
                self.boss_obj.position_detail(task)  # 获取每个职位的信息

            time.sleep(15)  # 控制每隔10秒访问一次

    def lagou_worker(self):
        while True:
            task_type, task = self.mrq.pop_task(keys=[self.lagou_key])
            print task_type, task
            level = task_type.split('-')[-1]

            if level == str(self.lagou_task_level):
                detail_urls = self.lagou_obj.get_position_url(
                    task)  # 获取职位详细信息的url
                self.mrq.push_task(self.lagou_key, detail_urls,
                                   self.lagou_url_level)

            elif level == str(self.lagou_url_level):
                self.lagou_obj.position_detail(task)  # 获取每个职位的信息

            time.sleep(15)  # 控制每隔10秒访问一次

    def boss_task(self):
        pos_lst = ['Python', 'Python爬虫', 'Python数据分析', '机器学习', '数据挖掘', '大数据']
        url_str = 'https://www.zhipin.com/c100010000/h_100010000/?query={pos}&page={page}&ka=page-{page}'
        url = [
            url_str.format(page=str(i + 1), pos=p) for p in pos_lst
            for i in xrange(1)
        ]
        self.mrq.push_task(self.boss_key, url, level=self.boss_task_level)

    def lagou_task(self):
        pos_lst = ['Python', 'shujuwajue']
        url = []
        for p in pos_lst:
            url += [
                'https://www.lagou.com/zhaopin/{pos}/{page}/'.format(
                    pos=p, page=str(i + 1)) for i in xrange(5)
            ]
        self.mrq.push_task(self.lagou_key, url, level=self.lagou_task_level)