Example #1
# Shared imports for all five examples below; db, company, email_service and
# webhook are project-local modules.
import json
import logging
import random

import scrapy
from pydispatch import dispatcher
from scrapy import signals
from scrapy.exceptions import CloseSpider
from scrapy.http import HtmlResponse

import company
import db
import email_service
import webhook


class MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'ZhiLian Spider'
    email_content = ''
    start = 0
    totalSize = 0
    start_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=646&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E5%A4%96%E8%B4%B8&kt=3&=0&_v=0.55545747&x-zp-page-request-id=df3b86ad2bca4742ad84253e0958c6e5-1562992810274-844145&x-zp-client-id=5c977c9d-c1ee-4c67-b475-97de3b07b16c'
    # mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "ZhiLian"
    download_timeout = 20

    def __init__(self, table_name='', webhook_url='', **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.page_limit = 5
        super().__init__(**kwargs)

    def start_requests(self):
        yield scrapy.Request(
            self.start_url,
            method="GET",
            headers={
                'Referer': 'https://sou.zhaopin.com/?jl=646&sf=0&st=0&kw=%E5%A4%96%E8%B4%B8&kt=3',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            },
            callback=self.parse,
            errback=self.errback_httpbin,
            dont_filter=True)

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))

    def parse(self, response):
        result_json = json.loads(response.text)
        if result_json['code'] == 200 and result_json['data']['count'] > 0:
            if self.start == 0:
                self.totalSize = result_json['data']['count']

            for company_info in result_json['data']['results']:
                job_title = "暂未开放"  # placeholder: "not yet available"
                location = "暂未开放"
                company_name = company_info['company']['name']
                company_url = company_info['company']['url']
                company_in_db = self.mydb.getByCompanyName(company_name)
                if company_in_db is None:
                    company_obj = company.company(job_title, company_name, company_url, location)
                    self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                    self.mydb.save_company(company_obj)
                else:
                    if not any(self.SITE_NAME in from_site for from_site in company_in_db[3].split(",")):
                        siteName = company_in_db[3] + "," + self.SITE_NAME
                        # NOTE: this 3-argument call assumes an older company
                        # constructor (name, url, sites); the save branch above
                        # passes 4 arguments (job_title, name, url, location).
                        company_obj = company.company(company_name, company_url, siteName)
                        self.mydb.updateCompany(company_obj)

            if self.start + 90 < self.totalSize:  # '<' avoids one final empty page request
                self.start += 90
                follow_url = 'https://fe-api.zhaopin.com/c/i/sou?start=' + str(self.start) + '&pageSize=90&cityId=646&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E5%A4%96%E8%B4%B8&kt=3&=0&_v=0.55545747&x-zp-page-request-id=df3b86ad2bca4742ad84253e0958c6e5-1562992810274-844145&x-zp-client-id=5c977c9d-c1ee-4c67-b475-97de3b07b16c'

                yield scrapy.Request(
                    follow_url,
                    method="GET",
                    headers={
                        'Referer': 'https://sou.zhaopin.com/?jl=646&sf=0&st=0&kw=%E5%A4%96%E8%B4%B8&kt=3',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                    },
                    callback=self.parse,
                    errback=self.errback_httpbin,
                    dont_filter=True)

    def spider_closed(self, spider):
        if self.email_content != '':
            self.emailService.sendEmail(self.email_content)
        self.mydb.close()
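
These spiders receive table_name and webhook_url through their constructors, so Scrapy's -a flags (or CrawlerProcess keyword arguments) are the natural way to launch them. A minimal sketch; the table name and webhook URL below are placeholders:

# From a shell inside the Scrapy project:
#   scrapy crawl "ZhiLian Spider" -a table_name=companies -a webhook_url=https://example.com/hook

# Or programmatically, for a quick local run:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(MainSpider, table_name='companies', webhook_url='https://example.com/hook')
process.start()  # blocks until the crawl finishes
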
Example #2
class Sequential_MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'KSHR Spider'
    email_content = ''
    start_url = 'http://kshr.com.cn/handler/CommonDataHandler.ashx?t='
    # mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "kshr"
    pageNo = 1
    form_request_payload = {
        "Industry": "",
        "Area": "",
        "PFun": "",
        "MonthSalary": "",
        "CompanyProperty": "",
        "CompanyScale": "",
        "Degree": "",
        "WorkYear": "",
        "Sex": "",
        "PublishTime": "",
        "OrderbySalary": "",
        "CurrentPage": pageNo,
        "KeyType": "all",
        "KeyWord": "外贸"
    }
    download_timeout = 20

    def __init__(self, table_name='', webhook_url='', **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.page_limit = 5
        super().__init__(**kwargs)

    def start_requests(self):
        yield scrapy.FormRequest(
            self.start_url + str(random.random()),
            method="POST",
            body='parm=' + str(self.form_request_payload) + '&m=getposition',
            headers={
                'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                'Referer': 'http://kshr.com.cn/CacheSearch.aspx?keyword=%E5%A4%96%E8%B4%B8&strtype=all',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            },
            callback=self.parse,
            errback=self.errback_httpbin,
            dont_filter=True)

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))

    def parse(self, response):
        self.logger.debug("Current Page:" + str(self.pageNo))
        result_json = json.loads(response.text)
        if result_json['ResultPageCount'] > 0:
            # Wrap the returned HTML fragment for selector queries; a separate
            # name avoids shadowing the original JSON response.
            html_response = HtmlResponse(url=self.start_url,
                                         body=result_json['ResultHtml'],
                                         encoding='utf-8')
            selectors = html_response.selector.xpath(
                "//div[contains(@class,'data-fy')]//div[contains(@class, 'yp-search-list')]//p//a"
            )
            if selectors:
                for company_info in selectors:
                    location = "暂未开放"
                    company_name = company_info.css("::text").get()
                    company_url = "http://kshr.com.cn" + company_info.xpath(
                        "@href").get()
                    job_title = "暂未开放"
                    company_in_db = self.mydb.getByCompanyName(company_name)
                    if company_in_db is None:
                        company_obj = company.company(job_title, company_name,
                                                      company_url, location)
                        self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                        self.mydb.save_company(company_obj)
                        # Send a webhook notification for the new company;
                        # as in Examples #3 and #5, notify only on runs after
                        # the initial seeding of an empty table.
                        if not self.isInitialize:
                            formatted_context = self.webhook_service.format_with_template(
                                company_obj)
                            print(formatted_context)
                            self.webhook_service.send_markdown(
                                company_name, formatted_context, True)
                    else:
                        if not any(
                                self.SITE_NAME in from_site
                                for from_site in company_in_db[3].split(",")):
                            siteName = company_in_db[3] + "," + self.SITE_NAME
                            # NOTE: siteName is computed but never passed on;
                            # the 4-argument constructor below has no site
                            # field, so the merged source list is lost.
                            company_obj = company.company(
                                job_title, company_name, company_url, location)
                            self.mydb.updateCompany(company_obj)

            last_page = html_response.selector.xpath(
                "//div[@data-xh][last()]/@data-xh").get()
            if last_page is None or int(last_page) == self.pageNo:
                self.logger.debug("Page crawling finished...")
                return

            self.pageNo += 1
            self.form_request_payload['CurrentPage'] = self.pageNo

            yield scrapy.FormRequest(
                self.start_url + str(random.random()),
                method="POST",
                body='parm=' + str(self.form_request_payload) + '&m=getposition',
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'Referer': 'http://kshr.com.cn/CacheSearch.aspx?keyword=%E5%A4%96%E8%B4%B8&strtype=all',
                    'X-Requested-With': 'XMLHttpRequest'
                },
                callback=self.parse,
                errback=self.errback_httpbin,
                dont_filter=True)

    def spider_closed(self, spider):
        if self.email_content != '':
            self.emailService.sendEmail(self.email_content)
        self.mydb.close()
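
One caveat worth a sketch: body='parm=' + str(self.form_request_payload) sends Python's repr of the dict (single quotes, raw 外贸) with no URL-encoding, which this particular endpoint apparently tolerates. A more robust construction, using only the standard library; the payload below is trimmed to a few keys from the example above:

import json
from urllib import parse

payload = {"CurrentPage": 1, "KeyType": "all", "KeyWord": "外贸"}
# urlencode percent-escapes the JSON (including the UTF-8 keyword) so the
# body is well-formed application/x-www-form-urlencoded data.
body = parse.urlencode({'parm': json.dumps(payload, ensure_ascii=False), 'm': 'getposition'})
# Pass it on as scrapy.Request(url, method="POST", body=body, headers=...)
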
Example #3
class MainSpider(scrapy.Spider, db.MydbOperator):
    name = '58DY Spider'
    email_content = ''
    # start_urls = ['https://nanjing.58.com/job/?key=%E5%A4%96%E8%B4%B8&classpolicy=main_null,job_A&final=1&jump=1']
    emailService = email_service.EmailService()
    SITE_NAME = "58DY"

    def __init__(self,
                 table_name='',
                 webhook_url='',
                 site='',
                 location='',
                 **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.start_urls = [
            f'https://{site}.58.com/job/?key=%E5%A4%96%E8%B4%B8&classpolicy=main_null,job_A&final=1&jump=1'
        ]
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.location = location
        self.page_limit = 5
        super().__init__(**kwargs)

    def parse(self, response):
        for company_info in response.selector.xpath(
                "//ul[@id='list_con']//li[contains(@class,'job_item')]//div[@class='comp_name']//a"
        ):
            # for company_info in response.selector.xpath("//ul[@id='list_con']//li[contains(@class,'job_item')]"):
            # NOTE: the absolute '//' below searches the whole document rather
            # than this company_info node, so it always returns the first match
            # on the page; a relative './/...' expression is probably intended.
            job_title = company_info.xpath(
                "//span[@class = 'cate']//text()").get()
            print("***********************")
            print(job_title)
            company_name = company_info.xpath("@title").get()
            print(company_name)
            company_url = company_info.xpath("@href").get()
            print(company_url)
            location = self.location
            print(location)
            print("##############")
            company_in_db = self.mydb.getByCompanyName(company_name)
            if company_in_db is None:
                company_obj = company.company(job_title, company_name,
                                              company_url, location)
                # self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                self.mydb.save_company(company_obj)
                # Send a webhook notification for the new company.
                if not self.isInitialize:
                    formatted_context = self.webhook_service.format_with_template(
                        company_obj)
                    print(formatted_context)
                    self.webhook_service.send_markdown(company_name,
                                                       formatted_context, True)
            else:
                # Stop once we reach records that are already stored.
                logging.info("Found existing record, hence quitting.")
                raise CloseSpider("There's no new record yet.")
                # if not any(self.SITE_NAME in from_site for from_site in company_in_db[3].split(",")):
                #     siteName = company_in_db[3] + "," + self.SITE_NAME
                #     company_obj = company.company(job_title, company_name, company_url, location)
                #     self.mydb.updateCompany(company_obj)
        for next_page in response.selector.xpath(
                "//div[@class='pagesout']/a[contains(@class,'next')]"):
            # Follow the next page unless it is marked disabled; on the last
            # page, flush the accumulated email instead.
            nextPageSelector = next_page.css(".disabled").get()
            if nextPageSelector is None:
                yield response.follow(next_page, self.parse)
            else:
                if self.email_content != '':
                    self.emailService.sendEmail(self.email_content)

    def spider_closed(self, spider):
        self.mydb.close()
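
The stop-on-known-record logic above assumes listings are ordered newest-first: once a stored company reappears, nothing further down can be new. Isolated as a sketch (self.mydb and the imports are assumed from the examples above; the class and method names are illustrative):

class IncrementalStopSketch(scrapy.Spider):
    name = 'incremental_stop_sketch'

    def handle_company(self, company_name):
        # Raising CloseSpider stops scheduling new requests; requests already
        # in flight still complete, and spider_closed fires afterwards, so
        # cleanup hooks like the email sender still run.
        if self.mydb.getByCompanyName(company_name) is not None:
            raise CloseSpider("There's no new record yet.")
        # ...otherwise persist the record as new...
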
Example #4
class Parallel_MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'KSHR Spider'
    email_content = ''
    start_url = 'http://kshr.com.cn/handler/CommonDataHandler.ashx?t='
    mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "kshr"
    pageNo = 1
    form_request_payload = {
        "Industry": "",
        "Area": "",
        "PFun": "",
        "MonthSalary": "",
        "CompanyProperty": "",
        "CompanyScale": "",
        "Degree": "",
        "WorkYear": "",
        "Sex": "",
        "PublishTime": "",
        "OrderbySalary": "",
        "CurrentPage": pageNo,
        "KeyType": "all",
        "KeyWord": "外贸"
    }
    download_timeout = 30

    def __init__(self, **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Forwarding kwargs to Spider.__init__ keeps Scrapy's -a arguments
        # working; this variant uses the class-level mydb instead of one per run.
        super().__init__(**kwargs)

    def start_requests(self):
        # Queue every page up front (hard-coded cap of 19 pages) and let
        # Scrapy fetch them concurrently.
        while self.pageNo < 20:
            yield scrapy.FormRequest(
                self.start_url + str(random.random()),
                method="POST",
                body='parm=' + str(self.form_request_payload) + '&m=getposition',
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'Referer': 'http://kshr.com.cn/CacheSearch.aspx?keyword=%E5%A4%96%E8%B4%B8&strtype=all',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                    'X-Requested-With': 'XMLHttpRequest'
                },
                callback=self.parse,
                errback=self.errback_httpbin,
                dont_filter=True)
            self.pageNo += 1
            self.form_request_payload['CurrentPage'] = self.pageNo

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))

    def parse(self, response):
        result_json = json.loads(response.text)
        if result_json['ResultPageCount'] > 0:
            # Wrap the returned HTML fragment for selector queries; a separate
            # name avoids shadowing the original JSON response.
            html_response = HtmlResponse(url=self.start_url,
                                         body=result_json['ResultHtml'],
                                         encoding='utf-8')
            selectors = html_response.selector.xpath(
                "//div[contains(@class,'data-fy')]//div[contains(@class, 'yp-search-list')]//p//a"
            )
            if selectors:
                for company_info in selectors:
                    company_name = company_info.css("::text").get()
                    company_url = company_info.xpath("@href").get()
                    company_in_db = self.mydb.getByCompanyName(company_name)
                    if company_in_db is None:
                        companyObj = company.company(company_name, company_url,
                                                     self.SITE_NAME)
                        self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                        self.mydb.saveCompany(companyObj)
                    else:
                        if not any(
                                self.SITE_NAME in from_site
                                for from_site in company_in_db[3].split(",")):
                            siteName = company_in_db[3] + "," + self.SITE_NAME
                            companyObj = company.company(
                                company_name, company_url, siteName)
                            self.mydb.updateCompany(companyObj)

    def spider_closed(self, spider):
        if self.email_content != '':
            self.logger.info("Sending Email...")
            self.emailService.sendEmail(self.email_content)
        self.mydb.close()
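
Because the parallel variant queues every page up front, how many fetches actually run at once is decided by Scrapy's concurrency settings, not by the spider itself. A sketch of per-spider throttling via custom_settings; the class name and values are illustrative, and the dict would sit as a class attribute on Parallel_MainSpider:

class ThrottledSpiderSketch(scrapy.Spider):
    name = 'throttled_sketch'
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,            # global cap on in-flight requests
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,  # cap per target domain
        'DOWNLOAD_DELAY': 0.5,                # seconds between requests, to stay polite
    }
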
Example #5
class MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'DyHR Spider'
    email_content = ''
    # start_urls = ['http://www.dyhr.cn/index.php?m=&c=jobs&a=jobs_list&key=%E5%A4%96%E8%B4%B8']
    start_urls = [
        'https://www.dyhr.cn/index.php/content/jobs?act=AIX_jobslist&key=%E5%A4%96%E8%B4%B8&lng=&lat=&ldLng=&ldLat=&ruLng=&ruLat='
    ]
    # mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "DYHR"

    def __init__(self, table_name='', webhook_url='', **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.page_limit = 5
        super().__init__(**kwargs)

    def parse(self, response):
        for company_info in response.selector.xpath(
                "//div[@class='plist']//a[@class='line_substring']"):
            # NOTE: as in Example #3, the absolute '//' below searches the whole
            # document, not this company_info node; './/...' is probably intended.
            job_title = company_info.xpath(
                "//div[@class ='td-j-name']//@title").get()
            company_name = company_info.css("::text").get()
            # company_url = "http://www.dyhr.cn" + company_info.xpath("@href").get()
            company_url = company_info.xpath("@href").get()
            location = "江苏丹阳"
            company_in_db = self.mydb.getByCompanyName(company_name)
            if company_in_db is None:
                company_obj = company.company(job_title, company_name,
                                              company_url, location)
                self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                self.mydb.save_company(company_obj)
                # Send a webhook notification for the new company.
                if not self.isInitialize:
                    formatted_context = self.webhook_service.format_with_template(
                        company_obj)
                    print(formatted_context)
                    self.webhook_service.send_markdown(company_name,
                                                       formatted_context, True)
            else:
                # Stop once we reach records that are already stored.
                logging.info("Found existing record, hence quitting.")
                raise CloseSpider("There's no new record yet.")
                # if not any(self.SITE_NAME in from_site for from_site in company_in_db[3].split(",")):
                #     siteName = company_in_db[3] + "," + self.SITE_NAME
                #     company_obj = company.company(job_title, company_name, company_url, location)
                #     self.mydb.updateCompany(company_obj)

        for next_page in response.selector.xpath(
                "//div[@class='qspage']/a[contains(text(),'下一页')]"):
            nextPageSelector = next_page.css(".unable").get()
            if nextPageSelector is None:
                yield response.follow(next_page, self.parse)
            else:
                if self.email_content != '':
                    self.emailService.sendEmail(self.email_content)

    def spider_closed(self, spider):
        self.mydb.close()
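
All five examples wire spider_closed through PyDispatcher's dispatcher.connect. Current Scrapy documentation prefers connecting signals in from_crawler, which ties the handler to the crawler's own signal manager. A sketch of that variant, applicable to any class above; the class name is illustrative:

class SignalSpiderSketch(scrapy.Spider):
    name = 'signal_sketch'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # Same cleanup as above: flush the email buffer, close the DB, etc.
        pass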