Example #1
    def crawler(self, key_words = None, hy = None, city = None):
        ua = webutil.get_user_agent()
        cookieJar = cookielib.MozillaCookieJar()

        # Step 1: request the home page to obtain the cookies
        retry_count = 10
        while True:
            try:
                html_src = webutil.request(zhilian_crawler_data.first_url, headers = zhilian_crawler_data.first_url_request_header, timeout = 60, encoding = 'utf-8', proxy = None, cookie = cookieJar, ua = ua)
                if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                    raise Exception(u'下载首页太大或太小')
                break
            except Exception as e:
                print u'下载首页异常%s' % e
                retry_count -= 1
                if retry_count <= 0:
                    raise Exception(u'下次首页%s都失败,抛出异常' % retry_count)
                time.sleep(10)
                continue

        search_url = zhilian_crawler_data.get_search_url(key_words, hy, city, page_num = 1)
        # Step 2: search by keyword, page by page
        while True:
            try:
                html_src = self.get_result_page_by_page_num(search_url, cookieJar, ua)
                # process the detail links on the current page before fetching the next one,
                # so the last result page is not skipped
                self.get_and_product_detail_url(html_src)
                search_url = self.get_next_page_url(html_src)
                if search_url is None:
                    break
            except Exception as e:
                raise Exception(u'下载搜索首页异常%s' % e)
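
webutil is an internal wrapper, so the request() signature used above is only known from these snippets. As a point of reference, a minimal sketch of the same first step (a shared cookie jar plus a bounded retry loop) using only the Python 2 standard library might look like this:

# Sketch only: stdlib equivalent of "fetch the home page first to populate the cookie jar".
# The URL below is a placeholder; the size limits and delays mirror the example above.
import cookielib
import time
import urllib2

def fetch_with_cookies(url, cookie_jar, retries=10, delay=10):
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    while True:
        try:
            html = opener.open(url, timeout=60).read()
            if len(html) < 100 or len(html) > 1024 * 1024 * 10:
                raise Exception('page suspiciously small or large')
            return html
        except Exception:
            retries -= 1
            if retries <= 0:
                raise
            time.sleep(delay)

cookieJar = cookielib.MozillaCookieJar()
home_html = fetch_with_cookies('http://www.example.com/', cookieJar)  # placeholder URL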
Example #2
 def get_result_page(self, search_url, hy = None, city = None, post_data = {}, cookieJar = None, ua = None, proxy = None):
      try:
         html_src = webutil.request(lagou_crawler_data.get_lagou_position_ajax_url(hy, city), headers = lagou_crawler_data.get_lagou_position_ajax_header(search_url),
                                    data = post_data, encoding = "utf-8", timeout = 60, retry = 5, cookie = cookieJar, method = webutil.POST, ua = ua, proxy = proxy)
         if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
             raise Exception(u'请求结果首页内容太大或太小')
         return html_src
      except Exception as e:
         print u'请求结果首页异常%s' % e
         raise Exception(u'下载结果首页异常')
Example #3
    def get_result_page_by_page_num(self, search_url, cookieJar = None, ua = None, proxy = None):
        search_header = zhilian_crawler_data.get_search_url_header()

        try:
            html_src = webutil.request(search_url, headers = search_header, cookie = cookieJar, ua = ua, encoding = 'utf-8', retry = 5, timeout = 60, proxy = proxy)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'下载结果页太大或太小')
            return html_src
        except Exception as e:
            print u'下载结果页异常 %s' % e
            raise Exception(u'下载结果页异常')
    def crawl(self, url):
        if url is None:
            return

        ua = webutil.get_user_agent()
        cookieJar = cookielib.MozillaCookieJar()

        try:
            html_src = webutil.request(url, headers = lagou_crawler_data.get_jobs_url_header(), ua = ua, cookie = cookieJar, timeout = 60, encoding = 'utf-8', retry = 5, savefile=None)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'下载的页面太大或太小')
        except Exception as e:
            print u'抓取%s异常 %s' % (url, e)
            raise Exception(u'抓取数据异常')

        save_data = {}
        # extract the fields with XPath
        tree = etree.HTML(html_src)
        try:
            job_title_list = tree.xpath('.//*[@class="clearfix join_tc_icon"]/h1')
            if job_title_list != None and len(job_title_list) > 0:
                job_title = job_title_list[0].get('title')
                save_data['job_title'] = str(job_title)
            else:
                save_data['job_title'] = ''
            work_place_xpath = tree.xpath('.//*[@class="job_request"]/span[2]/text()')
            if work_place_xpath != None and len(work_place_xpath) > 0:
                work_place = work_place_xpath[0]
                save_data['work_place'] = str(work_place)
            else:
                save_data['work_place'] = ''

            publish_time_xpath = tree.xpath('.//*[@class="job_request"]/div[1]/text()')
            if publish_time_xpath != None and len(publish_time_xpath) > 0:
                publish_time = publish_time_xpath[0]
                save_data['publish_time'] = str(publish_time)
            else:
                save_data['publish_time'] = ''

            work_request_xpath = tree.xpath('.//*[@class="job_bt"]/p/text()')
            if work_request_xpath != None and len(work_request_xpath) > 0:
                work_request = work_request_xpath[0]
                save_data['work_request'] = str(work_request)
            else:
                save_data['work_request'] = ''
        except Exception as e:
            print u'解析页面异常%s' % e

        # persist the extracted data
        try:
            self.save_data(url, save_data)
        except Exception as e:
            print u'存储数据异常%s' % e
            raise Exception(u'存储数据异常')
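
The XPath expressions above are tied to the page layout at the time of crawling. The extract-first-match-or-default pattern itself, shown standalone on a made-up HTML fragment:

# -*- coding: utf-8 -*-
# Sketch: take the first XPath match, fall back to '' when the node is missing.
from lxml import etree

html = u'<div class="job_request"><span>15k-25k</span><span>Beijing</span></div>'
tree = etree.HTML(html)
save_data = {}
matches = tree.xpath('.//*[@class="job_request"]/span[2]/text()')
save_data['work_place'] = matches[0] if matches else ''
print save_data['work_place']   # -> Beijing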
Example #5
def inter_ip():
    """
    获取外网IP地址
    1.访问 http://www.whereismyip.com 网站
    2。解析页面内容,抽取ip地址
    :return: (str)外网IP地址
    """

    html_src = webutil.request("http://www.whereismyip.com",
                               timeout=30,
                               retry=2,
                               encoding="iso8859-1")
    return re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_src).group(0)
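
webutil.request only wraps the HTTP fetch here; the same lookup with the Python 2 standard library (whereismyip.com and its iso8859-1 encoding are taken from the example and may no longer be valid) is roughly:

import re
import urllib2

def inter_ip_stdlib():
    # Fetch the page and return the first dotted-quad found, or None if there is none.
    html = urllib2.urlopen("http://www.whereismyip.com", timeout=30).read()
    match = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html)
    return match.group(0) if match else None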
Example #6
 def get_gdxx_detail(self,company_url,detail_url):
     """
     根据公司详情url和公司股东详情相对url计算出绝对的url地址,访问并返回内容
     :param company_url: (str) 公司详情url
     :param detail_url:  (str) 公司股东详情相对url
     :return: (str) 公司股东详情详情html内容
     """
     detail_ab_url = urlparse.urljoin(company_url, detail_url)
     urlpret = urlparse.urlparse(detail_ab_url)
     head = {
         "Referer": company_url,
         "Host": urlpret.netloc
     }
     return webutil.request(detail_ab_url, headers=head, encoding=webutil.detect_encoding)
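
urlparse.urljoin does the actual URL resolution; a self-contained illustration of the URL and header construction (made-up URLs, no network call):

import urlparse

company_url = 'http://example.com/company/12345'    # hypothetical company detail URL
detail_url = '../gdxx/67890'                        # hypothetical relative shareholder link
detail_ab_url = urlparse.urljoin(company_url, detail_url)
head = {
    "Referer": company_url,
    "Host": urlparse.urlparse(detail_ab_url).netloc,
}
print detail_ab_url     # -> http://example.com/gdxx/67890
print head["Host"]      # -> example.com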
    def Get_href(self,url):

        f1 = open('href_zph.txt', 'a+')
        f1.seek(0)
        seen = f1.read()   # 'a+' positions at end of file, so rewind and read the history once
        proxy=None
        count=10
        while True:
            try:
                #proxy = proxyutils.choice_proxy(is_debug=False,host="master1",port=8880)
                #proxy=None
                myPage=webutil.request(url,timeout=10,proxy=proxy,encoding="gbk")
                break
            except Exception as e3:
                print e3
                if count<=0:
                    raise  Exception(u"连续10次失败,放弃")
                count-=1
                time.sleep(1) 
        try:
            href_list=re.findall('] <a href="(.*?)" target="_blank"',myPage,re.S)
            address_list=re.findall('<td width="220" class="left">(.*?)</td>',myPage)
            city_list=re.findall('class="city">(.*?)</a>]',myPage)
            date_list=[]
            name_list=[]
            for i in href_list:
                if i.find('http://') == -1:
                    href = 'http://zph.yingjiesheng.com' + i
                else:
                    href = i
                # compare against the history read once above; f1 itself is in append mode
                if href not in seen:
                    print href
                    f1.write(href)
                    f1.write('\r\n')
                    seen += href + '\r\n'
            f1.close()
        except Exception as e2:
            logging.error("product id:%s" %e2)
            print e2
            pass
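
The deduplication in Get_href only works if the history file is actually read; a small standalone sketch of the same keep-a-seen-file idea (the file name is arbitrary):

def append_unseen(path, urls):
    # Load previously saved URLs once, then append only the ones not seen before.
    try:
        with open(path, 'r') as f:
            seen = set(line.strip() for line in f)
    except IOError:
        seen = set()
    with open(path, 'a') as f:
        for url in urls:
            if url not in seen:
                f.write(url + '\r\n')
                seen.add(url)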
    def Get_href(self,url):
        RedisQueue=RedisQueue_master1.getredisQueuev2('yingjiesheng_href1')
        f1 = open('href.txt', 'a+')
        f1.seek(0)
        seen = f1.read()   # read the history once; repeated f1.read() in append mode returns ''
        proxy=None
        count=10
        while True:
            try:
                
                #proxy=None
                myPage=webutil.request(url,timeout=10,proxy=proxy).decode('gbk').encode('utf-8')
                break
            except Exception as e3:
                print e3
                proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)

                if count<=0:
                    raise  Exception(u"连续10次失败,放弃")
                count-=1
                time.sleep(1) 
        try:
            pp=re.findall('<a href="(.*?)" target="_blank"><span style="color:(.*?);">(.*?)</a>(.*?)</td>(.*?)<td class="date">(.*?)</td>',myPage,re.S)
            for i in pp:
                m=i[0]
                p=m.split('href="')[-1]
                if p.find('http://')==-1:
                    href='http://www.yingjiesheng.com'+p
                else:
                    href=p
                if href not in seen:
                    print href, i[-1]
                    hh = href + 'BBD' + i[-1]
                    RedisQueue.put(hh)
                    f1.write(href)
                    f1.write('\r\n')
                    f1.write(i[-1])
                    f1.write('\r\n')
                    seen += href + '\r\n'
            
            print len(pp)
            f1.close()
        except Exception as e2:
            logging.error("product id:%s" %e2)
            print e2
            pass
 def Get_message(self,url):
     db_yjs=mongoutil.getmondbv2(db.mongo_host,db.mongo_port,db.yjs_db_name,db.yjs_table_name,username=db.mongo_user,password=db.mongo_pwd)
     now=timeutil.format("%Y-%m-%d",time.time())
     proxy=None
     count=10
     while True:
         try:
             #proxy = proxyutils.choice_proxy(is_debug=False,host="master1",port=8880)
             #proxy=None
             myPage=webutil.request(url,timeout=10,proxy=proxy,encoding="gbk")
             break
         except Exception as e3:
             print e3
             if count<=0:
                 raise  Exception(u"连续10次失败,放弃")
             count-=1
             time.sleep(1) 
     tree=etree.HTML(myPage)
     title=xpathutil.get_all_text(tree,".//*[@id='mainNav']/div[2]/table/caption/h1",num=0,split=u" ")
     #address=xpathutil.get_all_text(tree,".//*[@id='mainNav']/div[2]/table/tbody/tr[3]/td",num=0,split=u" ")
     #pp=re.findall('<td>汉阳郭茨口香格里都3楼腾飞人才市场</td>')
     print title
     myPage=myPage.encode('utf-8')
     address1=re.findall('<th width="90">(.*?)</th>(.*?)<td>(.*?)</td>',myPage,re.S)
     # defaults in case fewer than three <th>/<td> rows are found
     city = date = address = ''
     j = 0
     for i in address1:
         if j==0:
             city1=re.findall('">(.*?)</a>',i[-1])
             city=city1[0]
         elif j==1:
             date=i[-1]
         elif j==2:
             address=i[-1]
         j=j+1
     
     print len(address)
     print city
     print date
     print address
     key=url+now
     mongoutil.updatev3(db_yjs,key,{"标题":title,"城市":city,"招聘会时间":date,'招聘会地点':address,"页面链接":url,"dotime":now,"uptime":time.time(),"source":"yingjiesheng","type":"2"})
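
The j counter above maps the first three <th>/<td> rows to city, date and address; the same mapping written with enumerate over made-up matches:

# -*- coding: utf-8 -*-
# Sketch on fabricated HTML; the regexes are copied from the example above.
import re

myPage = ('<th width="90">city</th> <td><a href="#">Wuhan</a></td>'
          '<th width="90">date</th> <td>2015-06-01</td>'
          '<th width="90">addr</th> <td>Some exhibition hall</td>')
rows = re.findall('<th width="90">(.*?)</th>(.*?)<td>(.*?)</td>', myPage, re.S)

city = date = address = ''
for j, row in enumerate(rows):
    if j == 0:
        city = re.findall('">(.*?)</a>', row[-1])[0]
    elif j == 1:
        date = row[-1]
    elif j == 2:
        address = row[-1]
print city, date, address   # -> Wuhan 2015-06-01 Some exhibition hall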
    def crawl(self, url):
        if not url:
            return

        ua = webutil.get_user_agent()
        cookieJar = cookielib.MozillaCookieJar()

        data_dict = {}
        data_dict['type'] = 'zhilian'
        data_dict['version'] = 1
        data_dict['url'] = url

        try:
            html_src = webutil.request(url, headers = zhilian_crawler_data.get_search_url_header(), ua = ua, cookie = cookieJar, timeout = 60, retry = 5, encoding = 'utf-8', proxy = None)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'下载详情页异常')
            data_dict['html'] = html_src
            self.parse_html(html_src, data_dict)
            self.save_data(url, data_dict)
        except Exception as e:
            print u'下载详情页异常%s' % e
            raise Exception(u'下载详情页异常')
Example #11
    def crawler(self, key_words = None, hy = None, city = None):
        ua = webutil.get_user_agent()
        cookieJar = cookielib.MozillaCookieJar()

        # Step 1: request the lagou home page to obtain the cookies
        retry_count = 10
        while True:
            try:
                html_src = webutil.request(lagou_crawler_data.lagou_url, headers = lagou_crawler_data.get_lagou_header(), timeout = 60, encoding = 'utf-8', cookie = cookieJar, ua = ua)
                if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                    raise Exception(u'请求的页面太大或太小,异常')
                break
            except Exception as e:
                print u'获取首页异常%s' % e
                retry_count -= 1
                if retry_count > 0:
                    time.sleep(5)
                    continue
                raise Exception(u'获取首页异常,需要加换代理或其它手段')


        # Step 2: submit the search request

        search_url, query_data = lagou_crawler_data.get_lagou_search_url(key_words, hy, city)
        if search_url is None:
            raise Exception(u'搜索关键字为空,异常')

        try:
            html_src = webutil.request(search_url, headers = lagou_crawler_data.get_lagou_search_header(), data = query_data, cookie = cookieJar, ua = ua, proxy = None, encoding = 'utf-8', retry = 5, timeout = 60)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'搜索%s异常' % search_url)
        except Exception as e:
            print u'下载搜索首页异常 %s' % e
            raise Exception(u'下载搜索首页异常')

        # Step 3: POST to the position ajax endpoint for the actual result data
        # fetch the first result page
        post_data = lagou_crawler_data.get_lagou_position_post_data(first = 'true', keyword = key_words, page_num = 1)

        position_id_list = []
        total_page_count = 0   # default in case the response carries no "content"
        try:
            html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy = None)
            data_dict = self.json_to_dict(html_src)
            if "success" in data_dict:
                if data_dict["success"] != "true" and data_dict["success"] != True:
                    return

            if "content" in data_dict:
                content = data_dict["content"]
                total_page_count = content["totalPageCount"]
                if int(total_page_count) == 0:
                    return

                seach_results = content["result"]
                if seach_results != None and len(seach_results) > 0:
                    [self.product(str(result['positionId'])) for result in seach_results]
                    position_id_list.extend([result['positionId'] for result in seach_results])
        except Exception as e:
            print u'请求结果首页异常%s' % e
            raise Exception(u"请求结果首页异常")

        total_page_count = int(total_page_count)
        if total_page_count > 1:
            post_data['first'] = 'false'
            # page 1 was fetched above, so walk pages 2 .. total_page_count inclusive
            for i in xrange(2, total_page_count + 1):
                post_data['pn'] = i
                try:
                    html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy = None)
                    data_dict = self.json_to_dict(html_src)
                    if "content" in data_dict:
                        content = data_dict["content"]
                        seach_results = content["result"]
                        if seach_results != None and len(seach_results) > 0:
                            [self.product(str(result['positionId'])) for result in seach_results]
                            position_id_list.extend([result['positionId'] for result in seach_results])
                            # for result in seach_results:
                            #     position_id = result['positionId']
                            #     print position_id
                except Exception as e:
                    print u'请求结果页异常'
                    time.sleep(2)
                    continue
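
Step 3 pages through the position ajax endpoint using totalPageCount from the JSON response; a stripped-down sketch of that paging loop, with a hypothetical fetch_page(page_num) callable standing in for get_result_page:

# Sketch only: iterate positionId values across all result pages of a lagou-style
# JSON payload; fetch_page(page_num) is assumed to return the raw JSON text of one page.
import json

def iter_position_ids(fetch_page):
    data = json.loads(fetch_page(1))
    content = data.get('content', {})
    total_pages = int(content.get('totalPageCount', 0))
    for result in content.get('result') or []:
        yield result['positionId']
    for page in xrange(2, total_pages + 1):
        data = json.loads(fetch_page(page))
        for result in data.get('content', {}).get('result') or []:
            yield result['positionId']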
    def Get_message(self,url,date):
        db_yjs=mongoutil.getmondbv2(db.mongo_host,db.mongo_port,db.yjs_db_name,db.yjs_table_name,username=db.mongo_user,password=db.mongo_pwd,timeout=30)
        now=timeutil.format("%Y-%m-%d",time.time())
        proxy=None
        count=10
        while True:
            try:
               
                #proxy=None
                myPage=webutil.request(url,timeout=10,proxy=proxy,encoding="gbk")
                break
            except Exception as e3:
                print e3
                proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
                if count<=0:
                    raise  Exception(u"连续10次失败,放弃")
                count-=1
                time.sleep(1) 
        tree=etree.HTML(myPage)
        
        jiben=xpathutil.get_all_text(tree,".//*[@id='container']/div[3]/div[2]/div/ul",num=0,split=u" ")#.//*[@id='container']/div[3]/div[2]/div/ul/li[2]
        text=xpathutil.get_all_text(tree,".//*[@id='wordDiv']/div/div",num=0,split=u" ")
        print len(text)
        if len(text) <= 10:
            text = xpathutil.get_all_text(tree, ".//*[@id='container']/div[3]", num=0, split=u" ")
        p = functions.remove_all_space_char(text)
        p = functions.remove_all_space_char(p)
        email_pattern = re.compile('[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
        e = email_pattern.findall(p)
        a = []
        phone_number = re.compile('^(?:\+86)?(\d{3})\d{8}$|^(?:\+86)?(0\d{2,3})\d{7,8}$')
        pn = phone_number.findall(p)
        posdict = dict()
        for key in self.key_word:
            found = False
            for j in key:
                index = p.find(j)
                if index >= 0:
                    if found:
                        print "error"
                    else:
                        posdict[j] = index
                        found = True
        for key in posdict:
            a.append(posdict[key])
        a.sort()
        save_data = dict()
        for i in range(0, len(a)):
            if i + 1 < len(a):
                text3 = p[int(a[i]):int(a[i+1])].replace(':',':',1).replace(":",":",1).replace(":",":",1)
                text3 = text3.split(':')
                if len(text3) > 1:
                    if len(text3) == 2:
                        save_data[text3[0]] = text3[1]
                    else:
                        save_data[text3[0]] = text3[1] + text3[2]
                elif len(text3) == 1:
                    save_data[text3[0]] = 'None'

        if e:
            save_data["邮箱"]=e[0]
        else:
            save_data["邮箱"]="无"
        if pn:
            save_data['电话'] = pn[0]
        else:
            save_data["电话"] = "无"
        myPage=myPage.encode('utf-8')
        title=re.findall('<title>(.*?)</title>',myPage)
        if not title:
            title=xpathutil.get_all_text(tree,".//*[@id='container']/div[3]/div[1]/h1/a",num=0,split=u" ")
            name= title
        else:
            name=title[0]
        print name

        p=''
        if not jiben:
            pp=re.findall('<div class="info clearfix"><ol><li>\xe5\x8f\x91\xe5\xb8\x83\xe6\x97\xb6\xe9\x97\xb4:<u>(.*?)</u></li><li>\xe5\xb7\xa5\xe4\xbd\x9c\xe5\x9c\xb0\xe7\x82\xb9:<u>(.*?) </u></li><li>\xe8\x81\x8c\xe4\xbd\x8d\xe7\xb1\xbb\xe5\x9e\x8b:<u>(.*?)</u></li><li>\xe6\x9d\xa5\xe6\xba\x90:<a href="#" onclick="window.open(.*?)">(.*?)</a></li>',myPage)
            for i in pp:
                for j in i:
                    if j.find('(')==-1:
                        p=p+j+'\r\n'
        else:
            jiben=jiben.replace('\t','')
            jiben=jiben.split('\r\n')
            for i in jiben:
                p=p+i.split(':')[-1].replace('\n','')+'\r\n' 
        print p 
        myPage=myPage.decode('utf-8')
        keys=url+now
	save_data["公司名称"]=name
	save_data["发布时间"]=date
	save_data["文本1"]=p
	save_data['文本2']=text
	save_data["页面链接"]=url
	save_data["页面源码"]=myPage
	save_data['dotime']=now
	save_data['uptime']=time.time()
	save_data['source']="yingjiesheng"
	save_data["type"]="1"
        mongoutil.updatev3(db_yjs,keys,save_data)
	print("数据入库成功!")