def crawler(self, key_words=None, hy=None, city=None):
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    # Step 1: request the home page to obtain the session cookie.
    retry_count = 10
    while True:
        try:
            html_src = webutil.request(zhilian_crawler_data.first_url,
                                       headers=zhilian_crawler_data.first_url_request_header,
                                       timeout=60, encoding='utf-8', proxy=None,
                                       cookie=cookieJar, ua=ua)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'home page response is too small or too large')
            break
        except Exception as e:
            print u'failed to download home page: %s' % e
            retry_count -= 1
            if retry_count <= 0:
                raise Exception(u'failed to download home page after repeated retries, giving up')
            time.sleep(10)
            continue
    # Step 2: search by keyword and walk the result pages.
    search_url = zhilian_crawler_data.get_search_url(key_words, hy, city, page_num=1)
    while True:
        try:
            html_src = self.get_result_page_by_page_num(search_url, cookieJar, ua)
            # Produce the detail URLs of the current page before moving on,
            # so the last result page is not skipped.
            self.get_and_product_detail_url(html_src)
            search_url = self.get_next_page_url(html_src)
            if search_url is None:
                break
        except Exception as e:
            raise Exception(u'failed to download search result page: %s' % e)
def get_result_page(self, search_url, hy=None, city=None, post_data=None,
                    cookieJar=None, ua=None, proxy=None):
    # POST the search parameters to Lagou's position-list AJAX endpoint and
    # return the raw JSON response.
    if post_data is None:
        post_data = {}
    try:
        html_src = webutil.request(lagou_crawler_data.get_lagou_position_ajax_url(hy, city),
                                   headers=lagou_crawler_data.get_lagou_position_ajax_header(search_url),
                                   data=post_data, encoding="utf-8", timeout=60, retry=5,
                                   cookie=cookieJar, method=webutil.POST, ua=ua, proxy=proxy)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'result page response is too small or too large')
        return html_src
    except Exception as e:
        print u'failed to request result page: %s' % e
        raise Exception(u'failed to download result page')
def get_result_page_by_page_num(self, search_url, cookieJar=None, ua=None, proxy=None):
    # Fetch one Zhilian search result page by its URL.
    search_header = zhilian_crawler_data.get_search_url_header()
    try:
        html_src = webutil.request(search_url, headers=search_header, cookie=cookieJar,
                                   ua=ua, encoding='utf-8', retry=5, timeout=60, proxy=proxy)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'result page response is too small or too large')
        return html_src
    except Exception as e:
        print u'failed to download result page: %s' % e
        raise Exception(u'failed to download result page')
def crawl(self, url):
    if url is None:
        return
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    try:
        html_src = webutil.request(url, headers=lagou_crawler_data.get_jobs_url_header(),
                                   ua=ua, cookie=cookieJar, timeout=60, encoding='utf-8',
                                   retry=5, savefile=None)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'downloaded page is too small or too large')
    except Exception as e:
        print u'failed to crawl %s: %s' % (url, e)
        raise Exception(u'failed to crawl data')
    save_data = {}
    # Extract the fields with XPath; missing nodes fall back to empty strings.
    tree = etree.HTML(html_src)
    try:
        # Job title comes from the title attribute of the <h1> node.
        job_title_list = tree.xpath('.//*[@class="clearfix join_tc_icon"]/h1')
        save_data['job_title'] = job_title_list[0].get('title') if job_title_list else ''
        work_place_xpath = tree.xpath('.//*[@class="job_request"]/span[2]/text()')
        save_data['work_place'] = work_place_xpath[0] if work_place_xpath else ''
        publish_time_xpath = tree.xpath('.//*[@class="job_request"]/div[1]/text()')
        save_data['publish_time'] = publish_time_xpath[0] if publish_time_xpath else ''
        work_request_xpath = tree.xpath('.//*[@class="job_bt"]/p/text()')
        save_data['work_request'] = work_request_xpath[0] if work_request_xpath else ''
    except Exception as e:
        print u'failed to parse page: %s' % e
    # Persist the extracted fields.
    try:
        self.save_data(url, save_data)
    except Exception as e:
        print u'failed to save data: %s' % e
        raise Exception(u'failed to save data')
def inter_ip():
    """
    Get the public (external) IP address.
    1. Request http://www.whereismyip.com
    2. Parse the page content and extract the IP address.
    :return: (str) public IP address
    """
    html_src = webutil.request("http://www.whereismyip.com", timeout=30, retry=2,
                               encoding="iso8859-1")
    return re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_src).group(0)
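# A minimal standard-library sketch of the same approach, assuming webutil is
# not available (urllib2 only): fetch the page used above and regex out the
# first dotted quad. The helper name is hypothetical.
def inter_ip_stdlib():
    import re
    import urllib2
    html_src = urllib2.urlopen("http://www.whereismyip.com", timeout=30).read()
    match = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_src)
    return match.group(0) if match else None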
def get_gdxx_detail(self, company_url, detail_url):
    """
    Resolve the relative shareholder-detail URL against the company detail URL,
    request the absolute URL and return the response body.
    :param company_url: (str) company detail URL
    :param detail_url: (str) relative URL of the shareholder detail page
    :return: (str) HTML of the shareholder detail page
    """
    detail_ab_url = urlparse.urljoin(company_url, detail_url)
    urlpret = urlparse.urlparse(detail_ab_url)
    head = {
        "Referer": company_url,
        "Host": urlpret.netloc
    }
    return webutil.request(detail_ab_url, headers=head, encoding=webutil.detect_encoding)
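# A small, self-contained illustration of the urljoin/urlparse step above,
# using hypothetical example.com URLs (assumed for demonstration only):
def _gdxx_urljoin_demo():
    import urlparse
    company_url = "http://example.com/company/12345/profile.html"
    detail_url = "gdxx_detail.html?id=67890"
    detail_ab_url = urlparse.urljoin(company_url, detail_url)
    # -> 'http://example.com/company/12345/gdxx_detail.html?id=67890'
    host = urlparse.urlparse(detail_ab_url).netloc
    # -> 'example.com', which is what goes into the Host header
    return detail_ab_url, host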
def Get_href(self, url):
    # Collect job-fair detail links from a list page and append the new ones
    # to href_zph.txt (the file doubles as a simple de-duplication store).
    f1 = open('href_zph.txt', 'a+')
    seen = f1.read()          # read once; repeated read() calls would return ''
    proxy = None
    count = 10
    while True:
        try:
            #proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
            break
        except Exception as e3:
            print e3
            if count <= 0:
                raise Exception(u"failed 10 times in a row, giving up")
            count -= 1
            time.sleep(1)
    try:
        href_list = re.findall('] <a href="(.*?)" target="_blank"', myPage, re.S)
        address_list = re.findall('<td width="220" class="left">(.*?)</td>', myPage)
        city_list = re.findall('class="city">(.*?)</a>]', myPage)
        date_list = []
        name_list = []
        for i in href_list:
            # Relative links are rooted at the job-fair sub-site.
            if i.find('http://') == -1:
                href = 'http://zph.yingjiesheng.com' + i
            else:
                href = i
            if href not in seen:
                print href
                f1.write(href)
                f1.write('\r\n')
        f1.close()
    except Exception as e2:
        logging.error("product id:%s" % e2)
        print e2
def Get_href(self, url):
    # Collect job-posting links from a list page, push new ones onto a Redis
    # queue and record them in href.txt for de-duplication.
    RedisQueue = RedisQueue_master1.getredisQueuev2('yingjiesheng_href1')
    f1 = open('href.txt', 'a+')
    seen = f1.read()          # read once; repeated read() calls would return ''
    proxy = None
    count = 10
    while True:
        try:
            myPage = webutil.request(url, timeout=10, proxy=proxy).decode('gbk').encode('utf-8')
            break
        except Exception as e3:
            print e3
            proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
            if count <= 0:
                raise Exception(u"failed 10 times in a row, giving up")
            count -= 1
            time.sleep(1)
    try:
        pp = re.findall('<a href="(.*?)" target="_blank"><span style="color:(.*?);">(.*?)</a>(.*?)</td>(.*?)<td class="date">(.*?)</td>', myPage, re.S)
        for i in pp:
            m = i[0]
            p = m.split('href="')[-1]
            # Relative links are rooted at the main site.
            if p.find('http://') == -1:
                href = 'http://www.yingjiesheng.com' + p
            else:
                href = p
            if href not in seen:
                print href, i[-1]
                hh = href + 'BBD' + i[-1]
                RedisQueue.put(hh)
                f1.write(href)
                f1.write('\r\n')
                f1.write(i[-1])
                f1.write('\r\n')
        print len(pp)
        f1.close()
    except Exception as e2:
        logging.error("product id:%s" % e2)
        print e2
def Get_message(self, url):
    # Parse one job-fair detail page (title, city, date, venue) and upsert it
    # into MongoDB, keyed by URL plus the current date.
    db_yjs = mongoutil.getmondbv2(db.mongo_host, db.mongo_port, db.yjs_db_name,
                                  db.yjs_table_name, username=db.mongo_user,
                                  password=db.mongo_pwd)
    now = timeutil.format("%Y-%m-%d", time.time())
    proxy = None
    count = 10
    while True:
        try:
            #proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
            break
        except Exception as e3:
            print e3
            if count <= 0:
                raise Exception(u"failed 10 times in a row, giving up")
            count -= 1
            time.sleep(1)
    tree = etree.HTML(myPage)
    title = xpathutil.get_all_text(tree, ".//*[@id='mainNav']/div[2]/table/caption/h1",
                                   num=0, split=u" ")
    print title
    myPage = myPage.encode('utf-8')
    # The info table holds three <th>/<td> pairs: city, date and venue.
    address1 = re.findall('<th width="90">(.*?)</th>(.*?)<td>(.*?)</td>', myPage, re.S)
    city = date = address = ''
    for j, i in enumerate(address1):
        if j == 0:
            city1 = re.findall('">(.*?)</a>', i[-1])
            city = city1[0]
        elif j == 1:
            date = i[-1]
        elif j == 2:
            address = i[-1]
    print len(address)
    print city
    print date
    print address
    key = url + now
    mongoutil.updatev3(db_yjs, key, {"标题": title, "城市": city, "招聘会时间": date,
                                     "招聘会地点": address, "页面链接": url, "dotime": now,
                                     "uptime": time.time(), "source": "yingjiesheng",
                                     "type": "2"})
def crawl(self, url):
    if url is None or len(url) < 1:
        return
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    data_dict = {}
    data_dict['type'] = 'zhilian'
    data_dict['version'] = 1
    data_dict['url'] = url
    try:
        html_src = webutil.request(url, headers=zhilian_crawler_data.get_search_url_header(),
                                   ua=ua, cookie=cookieJar, timeout=60, retry=5,
                                   encoding='utf-8', proxy=None)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'detail page response is too small or too large')
        data_dict['html'] = html_src
        self.parse_html(html_src, data_dict)
        self.save_data(url, data_dict)
    except Exception as e:
        print u'failed to download detail page: %s' % e
        raise Exception(u'failed to download detail page')
def crawler(self, key_words=None, hy=None, city=None):
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    # Step 1: request the Lagou home page to obtain the session cookie.
    retry_count = 10
    while True:
        try:
            html_src = webutil.request(lagou_crawler_data.lagou_url,
                                       headers=lagou_crawler_data.get_lagou_header(),
                                       timeout=60, encoding='utf-8', cookie=cookieJar, ua=ua)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'home page response is too small or too large')
            break
        except Exception as e:
            print u'failed to fetch home page: %s' % e
            retry_count -= 1
            if retry_count > 0:
                time.sleep(5)
                continue
            raise Exception(u'failed to fetch home page, a proxy or other workaround is needed')
    # Step 2: submit the search request.
    search_url, query_data = lagou_crawler_data.get_lagou_search_url(key_words, hy, city)
    if search_url is None:
        raise Exception(u'search keyword is empty')
    try:
        html_src = webutil.request(search_url,
                                   headers=lagou_crawler_data.get_lagou_search_header(),
                                   data=query_data, cookie=cookieJar, ua=ua, proxy=None,
                                   encoding='utf-8', retry=5, timeout=60)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'search %s failed' % search_url)
    except Exception as e:
        print u'failed to download search page: %s' % e
        raise Exception(u'failed to download search page')
    # Step 3: POST to the position-list AJAX endpoint to get the actual data.
    # Fetch the first result page and read the total page count from it.
    post_data = lagou_crawler_data.get_lagou_position_post_data(first='true',
                                                                keyword=key_words,
                                                                page_num=1)
    position_id_list = []
    total_page_count = 0
    try:
        html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy=None)
        data_dict = self.json_to_dict(html_src)
        if "success" in data_dict:
            if data_dict["success"] != "true" and data_dict["success"] != True:
                return
        if "content" in data_dict:
            content = data_dict["content"]
            total_page_count = int(content["totalPageCount"])
            if total_page_count == 0:
                return
            search_results = content["result"]
            if search_results:
                [self.product(str(result['positionId'])) for result in search_results]
                position_id_list.extend([result['positionId'] for result in search_results])
    except Exception as e:
        print u'failed to request first result page: %s' % e
        raise Exception(u'failed to request first result page')
    # Fetch the remaining result pages (2 .. total_page_count inclusive).
    if total_page_count > 1:
        post_data['first'] = 'false'
        for i in xrange(2, total_page_count + 1):
            post_data['pn'] = i
            try:
                html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy=None)
                data_dict = self.json_to_dict(html_src)
                if "content" in data_dict:
                    content = data_dict["content"]
                    search_results = content["result"]
                    if search_results:
                        [self.product(str(result['positionId'])) for result in search_results]
                        position_id_list.extend([result['positionId'] for result in search_results])
            except Exception as e:
                print u'failed to request result page: %s' % e
                time.sleep(2)
                continue
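# Sketch of the JSON shape the parsing code above expects from the Lagou
# position AJAX endpoint. The field names (success, content, totalPageCount,
# result, positionId) come from the code itself; the concrete values below are
# made up for illustration:
#
#   {
#     "success": true,
#     "content": {
#       "totalPageCount": 3,
#       "result": [
#         {"positionId": 123456},
#         {"positionId": 123457}
#       ]
#     }
#   }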
def Get_message(self, url, date):
    # Parse one yingjiesheng job-posting page, pull out the structured fields
    # (contact info, posting metadata, body text) and upsert the record into
    # MongoDB, keyed by URL plus the current date.
    db_yjs = mongoutil.getmondbv2(db.mongo_host, db.mongo_port, db.yjs_db_name,
                                  db.yjs_table_name, username=db.mongo_user,
                                  password=db.mongo_pwd, timeout=30)
    now = timeutil.format("%Y-%m-%d", time.time())
    proxy = None
    count = 10
    while True:
        try:
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
            break
        except Exception as e3:
            print e3
            proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
            if count <= 0:
                raise Exception(u"failed 10 times in a row, giving up")
            count -= 1
            time.sleep(1)
    tree = etree.HTML(myPage)
    # Basic-info list and main body text; fall back to the whole container when
    # the body div is (nearly) empty.
    jiben = xpathutil.get_all_text(tree, ".//*[@id='container']/div[3]/div[2]/div/ul",
                                   num=0, split=u" ")  # .//*[@id='container']/div[3]/div[2]/div/ul/li[2]
    text = xpathutil.get_all_text(tree, ".//*[@id='wordDiv']/div/div", num=0, split=u" ")
    print len(text)
    if len(text) <= 10:
        text = xpathutil.get_all_text(tree, ".//*[@id='container']/div[3]", num=0, split=u" ")
    p = functions.remove_all_space_char(text)
    p = functions.remove_all_space_char(p)
    email_pattern = re.compile(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
    e = email_pattern.findall(p)
    a = []
    phone_number = re.compile(r'^(?:\+86)?(\d{3})\d{8}$|^(?:\+86)?(0\d{2,3})\d{7,8}$')
    pn = phone_number.findall(p)
    # Locate the first occurrence of each configured keyword group in the text,
    # then cut the text into key:value segments between those positions.
    posdict = dict()
    for key in self.key_word:
        found = False
        for j in key:
            index = p.find(j)
            if index >= 0:
                if found:
                    print "error"
                else:
                    posdict[j] = index
                    found = True
    for key in posdict:
        a.append(posdict[key])
    a.sort()
    save_data = dict()
    for i in range(0, len(a)):
        if i + 1 < len(a):
            text3 = p[int(a[i]):int(a[i + 1])].replace('：', ':', 1).replace(":", ":", 1).replace(":", ":", 1)
            text3 = text3.split(':')
            if len(text3) > 1:
                if len(text3) == 2:
                    save_data[text3[0]] = text3[1]
                else:
                    save_data[text3[0]] = text3[1] + text3[2]
            elif len(text3) == 1:
                save_data[text3[0]] = 'None'
    if e:
        save_data["邮箱"] = e[0]
    else:
        save_data["邮箱"] = "无"
    if pn:
        save_data['电话'] = pn[0]
    else:
        save_data["电话"] = "无"
    myPage = myPage.encode('utf-8')
    title = re.findall('<title>(.*?)</title>', myPage)
    if not title:
        title = xpathutil.get_all_text(tree, ".//*[@id='container']/div[3]/div[1]/h1/a",
                                       num=0, split=u" ")
        name = title
    else:
        name = title[0]
    print name
    # Build the basic-info text block, either from the info list or, when it is
    # missing, from the info <div> (the \xe5.. escapes below are the UTF-8 bytes
    # of 发布时间 / 工作地点 / 职位类型 / 来源).
    p = ''
    if not jiben:
        pp = re.findall('<div class="info clearfix"><ol><li>\xe5\x8f\x91\xe5\xb8\x83\xe6\x97\xb6\xe9\x97\xb4:<u>(.*?)</u></li><li>\xe5\xb7\xa5\xe4\xbd\x9c\xe5\x9c\xb0\xe7\x82\xb9:<u>(.*?) </u></li><li>\xe8\x81\x8c\xe4\xbd\x8d\xe7\xb1\xbb\xe5\x9e\x8b:<u>(.*?)</u></li><li>\xe6\x9d\xa5\xe6\xba\x90:<a href="#" onclick="window.open(.*?)">(.*?)</a></li>', myPage)
        for i in pp:
            for j in i:
                if j.find('(') == -1:
                    p = p + j + '\r\n'
    else:
        jiben = jiben.replace('\t', '')
        jiben = jiben.split('\r\n')
        for i in jiben:
            p = p + i.split('：')[-1].replace('\n', '') + '\r\n'
    print p
    myPage = myPage.decode('utf-8')
    keys = url + now
    save_data["公司名称"] = name
    save_data["发布时间"] = date
    save_data["文本1"] = p
    save_data['文本2'] = text
    save_data["页面链接"] = url
    save_data["页面源码"] = myPage
    save_data['dotime'] = now
    save_data['uptime'] = time.time()
    save_data['source'] = "yingjiesheng"
    save_data["type"] = "1"
    mongoutil.updatev3(db_yjs, keys, save_data)
    print u'record saved successfully'
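# Quick sanity check of the contact-extraction regex used above, on a made-up
# sample string (illustrative only). Note that the phone pattern above is
# anchored with ^ and $, so it only matches when the text is exactly a phone
# number; the email pattern works on free text.
def _contact_regex_demo():
    import re
    sample = u'联系人: 张先生  邮箱: hr@example.com  电话: 02887654321'
    email_pattern = re.compile(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
    print email_pattern.findall(sample)   # [u'hr@example.com']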