import json
import logging
import random

import scrapy
from pydispatch import dispatcher
from scrapy import signals
from scrapy.exceptions import CloseSpider
from scrapy.http import HtmlResponse

# Project-local helpers: DB access, the company model, and notification services.
import company
import db
import email_service
import webhook


class MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'KSHR Spider'
    email_content = ''
    start = 0
    totalSize = 0
    start_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=646&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E5%A4%96%E8%B4%B8&kt=3&=0&_v=0.55545747&x-zp-page-request-id=df3b86ad2bca4742ad84253e0958c6e5-1562992810274-844145&x-zp-client-id=5c977c9d-c1ee-4c67-b475-97de3b07b16c'
    # mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "ZhiLian"
    download_timeout = 20

    def __init__(self, table_name='', webhook_url='', **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.page_limit = 5
        super().__init__(**kwargs)

    def start_requests(self):
        yield scrapy.Request(
            self.start_url,
            method="GET",
            headers={
                'Referer': 'https://sou.zhaopin.com/?jl=646&sf=0&st=0&kw=%E5%A4%96%E8%B4%B8&kt=3',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            },
            callback=self.parse,
            errback=self.errback_httpbin,
            dont_filter=True)

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))

    def parse(self, response):
        result_json = json.loads(response.text)
        if result_json['code'] == 200 and result_json['data']['count'] > 0:
            if self.start == 0:
                self.totalSize = result_json['data']['count']
            for company_info in result_json['data']['results']:
                job_title = "暂未开放"   # "not yet available"
                location = "暂未开放"
                company_name = company_info['company']['name']
                company_url = company_info['company']['url']
                company_in_db = self.mydb.getByCompanyName(company_name)
                if company_in_db is None:
                    company_obj = company.company(job_title, company_name,
                                                  company_url, location)
                    self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                    self.mydb.save_company(company_obj)
                else:
                    # Append this site to the record's comma-separated source list if missing.
                    if not any(self.SITE_NAME in from_site
                               for from_site in company_in_db[3].split(",")):
                        siteName = company_in_db[3] + "," + self.SITE_NAME
                        company_obj = company.company(job_title, company_name,
                                                      company_url, siteName)
                        self.mydb.updateCompany(company_obj)
            # Page through the remaining results, 90 records at a time.
            if self.start + 90 <= self.totalSize:
                self.start += 90
                follow_url = 'https://fe-api.zhaopin.com/c/i/sou?start=' + str(self.start) + '&pageSize=90&cityId=646&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E5%A4%96%E8%B4%B8&kt=3&=0&_v=0.55545747&x-zp-page-request-id=df3b86ad2bca4742ad84253e0958c6e5-1562992810274-844145&x-zp-client-id=5c977c9d-c1ee-4c67-b475-97de3b07b16c'
                yield scrapy.Request(
                    follow_url,
                    method="GET",
                    headers={
                        'Referer': 'https://sou.zhaopin.com/?jl=646&sf=0&st=0&kw=%E5%A4%96%E8%B4%B8&kt=3',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                    },
                    callback=self.parse,
                    errback=self.errback_httpbin,
                    dont_filter=True)

    def spider_closed(self, spider):
        if self.email_content != '':
            self.emailService.sendEmail(self.email_content)
        self.mydb.close()
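# A minimal sketch of launching this spider programmatically instead of via the
# `scrapy crawl` CLI. The table name and webhook URL below are hypothetical
# placeholders, not values from this project.
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(MainSpider,
                  table_name='zhilian_companies',          # hypothetical table name
                  webhook_url='https://example.com/hook')  # placeholder URL
    process.start()  # blocks until the crawl finishes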
class Sequential_MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'KSHR Spider'
    email_content = ''
    start_url = 'http://kshr.com.cn/handler/CommonDataHandler.ashx?t='
    # mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "kshr"
    pageNo = 1
    form_request_payload = {
        "Industry": "",
        "Area": "",
        "PFun": "",
        "MonthSalary": "",
        "CompanyProperty": "",
        "CompanyScale": "",
        "Degree": "",
        "WorkYear": "",
        "Sex": "",
        "PublishTime": "",
        "OrderbySalary": "",
        "CurrentPage": pageNo,
        "KeyType": "all",
        "KeyWord": "外贸"
    }
    download_timeout = 20

    def __init__(self, table_name='', webhook_url='', **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.page_limit = 5
        super().__init__(**kwargs)

    def start_requests(self):
        # The trailing random number is a cache-buster on the handler URL.
        yield scrapy.FormRequest(
            self.start_url + str(random.random()),
            method="POST",
            body='parm=' + str(self.form_request_payload) + '&m=getposition',
            headers={
                'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                'Referer': 'http://kshr.com.cn/CacheSearch.aspx?keyword=%E5%A4%96%E8%B4%B8&strtype=all',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            },
            callback=self.parse,
            errback=self.errback_httpbin,
            dont_filter=True)

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))

    def parse(self, response):
        self.logger.debug("Current Page:" + str(self.pageNo))
        result_json = json.loads(response.text)
        if result_json['ResultPageCount'] > 0:
            # The handler returns an HTML fragment; wrap it so selectors work on it.
            response = HtmlResponse(url=self.start_url,
                                    body=result_json['ResultHtml'],
                                    encoding='utf-8')
            selectors = response.selector.xpath(
                "//div[contains(@class,'data-fy')]//div[contains(@class, 'yp-search-list')]//p//a"
            )
            if selectors:
                for company_info in selectors:
                    location = "暂未开放"
                    company_name = company_info.css("::text").get()
                    company_url = "http://kshr.com.cn" + company_info.xpath("@href").get()
                    job_title = "暂未开放"
                    company_in_db = self.mydb.getByCompanyName(company_name)
                    if company_in_db is None:
                        company_obj = company.company(job_title, company_name,
                                                      company_url, location)
                        self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                        self.mydb.save_company(company_obj)
                        # Notify via webhook
                        if self.isInitialize:
                            formatted_context = self.webhook_service.format_with_template(company_obj)
                            print(formatted_context)
                            self.webhook_service.send_markdown(company_name, formatted_context, True)
                    else:
                        if not any(self.SITE_NAME in from_site
                                   for from_site in company_in_db[3].split(",")):
                            # Record the merged source-site list on the update.
                            siteName = company_in_db[3] + "," + self.SITE_NAME
                            company_obj = company.company(job_title, company_name,
                                                          company_url, siteName)
                            self.mydb.updateCompany(company_obj)
            last_page = response.selector.xpath("//div[@data-xh][last()]/@data-xh").get()
            if int(last_page) == self.pageNo:
                self.logger.debug("Page crawling finished...")
                return
            self.pageNo += 1
            self.form_request_payload['CurrentPage'] = self.pageNo
            yield scrapy.FormRequest(
                self.start_url + str(random.random()),
                method="POST",
                body='parm=' + str(self.form_request_payload) + '&m=getposition',
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'Referer': 'http://kshr.com.cn/CacheSearch.aspx?keyword=%E5%A4%96%E8%B4%B8&strtype=all',
                    'X-Requested-With': 'XMLHttpRequest'
                },
                callback=self.parse,
                errback=self.errback_httpbin,
                dont_filter=True)

    def spider_closed(self, spider):
        if self.email_content != '':
            self.emailService.sendEmail(self.email_content)
        self.mydb.close()
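# A note on the POST body above: 'parm=' + str(self.form_request_payload)
# serializes the dict with Python's repr (single quotes, raw Chinese), which
# this ASP.NET handler evidently accepts. The self-contained sketch below,
# using a made-up payload, just shows what goes over the wire compared with a
# JSON serialization, in case the endpoint ever requires strict JSON.
import json

if __name__ == '__main__':
    payload = {"CurrentPage": 1, "KeyType": "all", "KeyWord": "外贸"}
    # repr form, as the spider actually sends it:
    print('parm=' + str(payload) + '&m=getposition')
    # JSON form, for comparison:
    print('parm=' + json.dumps(payload, ensure_ascii=False) + '&m=getposition')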
class MainSpider(scrapy.Spider, db.MydbOperator):
    name = '58DY Spider'
    email_content = ''
    # start_urls = ['https://nanjing.58.com/job/?key=%E5%A4%96%E8%B4%B8&classpolicy=main_null,job_A&final=1&jump=1']
    emailService = email_service.EmailService()
    SITE_NAME = "58DY"

    def __init__(self, table_name='', webhook_url='', site='', location='', **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # `site` is the 58.com city subdomain, e.g. "nanjing".
        self.start_urls = [
            f'https://{site}.58.com/job/?key=%E5%A4%96%E8%B4%B8&classpolicy=main_null,job_A&final=1&jump=1'
        ]
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.location = location
        self.page_limit = 5
        super().__init__(**kwargs)

    def parse(self, response):
        for company_info in response.selector.xpath(
                "//ul[@id='list_con']//li[contains(@class,'job_item')]//div[@class='comp_name']//a"):
            # for company_info in response.selector.xpath("//ul[@id='list_con']//li[contains(@class,'job_item')]"):
            job_title = company_info.xpath("//span[@class='cate']//text()").get()
            print("***********************")
            print(job_title)
            company_name = company_info.xpath("@title").get()
            print(company_name)
            company_url = company_info.xpath("@href").get()
            print(company_url)
            location = self.location
            print(location)
            print("##############")
            company_in_db = self.mydb.getByCompanyName(company_name)
            if company_in_db is None:
                company_obj = company.company(job_title, company_name,
                                              company_url, location)
                # self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                self.mydb.save_company(company_obj)
                # Notify via webhook
                if not self.isInitialize:
                    formatted_context = self.webhook_service.format_with_template(company_obj)
                    print(formatted_context)
                    self.webhook_service.send_markdown(company_name, formatted_context, True)
            else:
                # Quit upon reaching an existing record.
                logging.info("Found existing record, hence quit.")
                raise CloseSpider("There's no new record yet.")
                # if not any(self.SITE_NAME in from_site for from_site in company_in_db[3].split(",")):
                #     siteName = company_in_db[3] + "," + self.SITE_NAME
                #     company_obj = company.company(job_title, company_name, company_url, location)
                #     self.mydb.updateCompany(company_obj)

        for next_page in response.selector.xpath(
                "//div[@class='pagesout']/a[contains(@class,'next')]"):
            nextPageSelector = next_page.css(".disabled").get()
            if nextPageSelector is None:
                yield response.follow(next_page, self.parse)
            else:
                if self.email_content != '':
                    self.emailService.sendEmail(self.email_content)

    def spider_closed(self, spider):
        self.mydb.close()
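# A minimal sketch of passing this spider's keyword arguments; the city
# subdomain, table name, and webhook URL are hypothetical placeholders.
# Equivalent CLI form: scrapy crawl '58DY Spider' -a site=nanjing -a location=南京
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(MainSpider,
                  table_name='dy58_companies',             # hypothetical table name
                  webhook_url='https://example.com/hook',  # placeholder URL
                  site='nanjing',                          # 58.com city subdomain
                  location='南京')
    process.start()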
class Parallel_MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'KSHR Spider'
    email_content = ''
    start_url = 'http://kshr.com.cn/handler/CommonDataHandler.ashx?t='
    mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "kshr"
    pageNo = 1
    form_request_payload = {
        "Industry": "",
        "Area": "",
        "PFun": "",
        "MonthSalary": "",
        "CompanyProperty": "",
        "CompanyScale": "",
        "Degree": "",
        "WorkYear": "",
        "Sex": "",
        "PublishTime": "",
        "OrderbySalary": "",
        "CurrentPage": pageNo,
        "KeyType": "all",
        "KeyWord": "外贸"
    }
    download_timeout = 30

    def __init__(self, **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        super().__init__(**kwargs)

    def start_requests(self):
        # Issue the first 19 result pages up front; Scrapy's scheduler then
        # fetches them concurrently rather than one page at a time.
        while self.pageNo < 20:
            yield scrapy.FormRequest(
                self.start_url + str(random.random()),
                method="POST",
                body='parm=' + str(self.form_request_payload) + '&m=getposition',
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'Referer': 'http://kshr.com.cn/CacheSearch.aspx?keyword=%E5%A4%96%E8%B4%B8&strtype=all',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                    'X-Requested-With': 'XMLHttpRequest'
                },
                callback=self.parse,
                errback=self.errback_httpbin,
                dont_filter=True)
            self.pageNo += 1
            self.form_request_payload['CurrentPage'] = self.pageNo

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))

    def parse(self, response):
        result_json = json.loads(response.text)
        if result_json['ResultPageCount'] > 0:
            response = HtmlResponse(url=self.start_url,
                                    body=result_json['ResultHtml'],
                                    encoding='utf-8')
            selectors = response.selector.xpath(
                "//div[contains(@class,'data-fy')]//div[contains(@class, 'yp-search-list')]//p//a"
            )
            if selectors:
                for company_info in selectors:
                    company_name = company_info.css("::text").get()
                    company_url = company_info.xpath("@href").get()
                    company_in_db = self.mydb.getByCompanyName(company_name)
                    if company_in_db is None:
                        companyObj = company.company(company_name, company_url,
                                                     self.SITE_NAME)
                        self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                        self.mydb.saveCompany(companyObj)
                    else:
                        if not any(self.SITE_NAME in from_site
                                   for from_site in company_in_db[3].split(",")):
                            siteName = company_in_db[3] + "," + self.SITE_NAME
                            companyObj = company.company(company_name,
                                                         company_url, siteName)
                            self.mydb.updateCompany(companyObj)

    def spider_closed(self, spider):
        if self.email_content != '':
            self.logger.info("Sending Email...")
            self.emailService.sendEmail(self.email_content)
        self.mydb.close()
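# Because start_requests() yields all 19 page requests up front, Scrapy fetches
# them in parallel. A sketch of capping that parallelism with standard Scrapy
# settings; the values here are illustrative, not this project's configuration.
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'CONCURRENT_REQUESTS': 4,             # at most 4 requests in flight overall
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,  # and at most 2 per domain
        'DOWNLOAD_DELAY': 0.5,                # be polite to kshr.com.cn
    })
    process.crawl(Parallel_MainSpider)
    process.start()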
class MainSpider(scrapy.Spider, db.MydbOperator):
    name = 'DyHR Spider'
    email_content = ''
    # start_urls = ['http://www.dyhr.cn/index.php?m=&c=jobs&a=jobs_list&key=%E5%A4%96%E8%B4%B8']
    start_urls = [
        'https://www.dyhr.cn/index.php/content/jobs?act=AIX_jobslist&key=%E5%A4%96%E8%B4%B8&lng=&lat=&ldLng=&ldLat=&ruLng=&ruLat='
    ]
    # mydb = db.MydbOperator()
    emailService = email_service.EmailService()
    SITE_NAME = "DYHR"

    def __init__(self, table_name='', webhook_url='', **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.mydb = db.MydbOperator(table_name)
        print(webhook_url)
        self.webhook_service = webhook.WebHook(webhook_url)
        self.mydb.create_table()
        self.isInitialize = self.mydb.is_empty_table()
        print(self.isInitialize)
        self.page_limit = 5
        super().__init__(**kwargs)

    def parse(self, response):
        for company_info in response.selector.xpath(
                "//div[@class='plist']//a[@class='line_substring']"):
            job_title = company_info.xpath("//div[@class='td-j-name']//@title").get()
            company_name = company_info.css("::text").get()
            # company_url = "http://www.dyhr.cn" + company_info.xpath("@href").get()
            company_url = company_info.xpath("@href").get()
            location = "江苏丹阳"  # Danyang, Jiangsu
            company_in_db = self.mydb.getByCompanyName(company_name)
            if company_in_db is None:
                company_obj = company.company(job_title, company_name,
                                              company_url, location)
                self.email_content = self.email_content + company_name + " " + company_url + '\r\n'
                self.mydb.save_company(company_obj)
                # Notify via webhook
                if not self.isInitialize:
                    formatted_context = self.webhook_service.format_with_template(company_obj)
                    print(formatted_context)
                    self.webhook_service.send_markdown(company_name, formatted_context, True)
            else:
                # Quit upon reaching an existing record.
                logging.info("Found existing record, hence quit.")
                raise CloseSpider("There's no new record yet.")
                # if not any(self.SITE_NAME in from_site for from_site in company_in_db[3].split(",")):
                #     siteName = company_in_db[3] + "," + self.SITE_NAME
                #     company_obj = company.company(job_title, company_name, company_url, location)
                #     self.mydb.updateCompany(company_obj)

        for next_page in response.selector.xpath(
                "//div[@class='qspage']/a[contains(text(),'下一页')]"):  # "next page" link
            nextPageSelector = next_page.css(".unable").get()
            if nextPageSelector is None:
                yield response.follow(next_page, self.parse)
            else:
                if self.email_content != '':
                    self.emailService.sendEmail(self.email_content)

    def spider_closed(self, spider):
        self.mydb.close()
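# A quick, self-contained illustration of the selector calls used in parse()
# above, run against a made-up HTML fragment (not real dyhr.cn markup), to show
# what ::text and @href extraction return for each matched anchor.
from scrapy.selector import Selector

if __name__ == '__main__':
    html = '<div class="plist"><a class="line_substring" href="/company/1.html">某外贸公司</a></div>'
    sel = Selector(text=html)
    a = sel.xpath("//div[@class='plist']//a[@class='line_substring']")[0]
    print(a.css("::text").get())   # 某外贸公司 (the anchor's company name text)
    print(a.xpath("@href").get())  # /company/1.html (the relative profile URL)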