예제 #1
0
파일: ganji.py 프로젝트: ptphp/PyLib
    def rent(self,url):
        """Scrape a ganji.com rental listing page into ``self.fd``.

        Downloads ``url`` (Python 2 ``urllib2``) and fills ``self.fd`` with the
        fields found on the page: city, owner name, phone-image URL, area,
        price, post time, title, room/hall/toilet/veranda counts, floors,
        description, community name and address, region/section, building
        age, orientation, fitment and deposit. ``house_flag`` is set to 2 and
        ``house_type`` to 6.

        The listing is abandoned with a bare ``raise`` (kept from the original
        so callers observe the same kind of failure) when ``self.mayGetIt``
        flags the page, the city link is missing, the page is not marked as a
        private-owner listing, or no phone image is present.

        NOTE(review): ``self.fd`` is not cleared on entry, and ``house_addr`` /
        ``borough_name`` are not written on every path, so values from an
        earlier call may survive -- confirm the caller resets ``self.fd``.

        :param url: absolute URL of the listing page.
        :returns: ``None``; results are stored in ``self.fd``.
        """
        host = urlparse(url)[1]
        city_key = host.replace('.ganji.com',"")
        # Prefer the mapped city name; fall back to the raw sub-domain.
        self.fd['house_city'] = citynameDict_sf.get(city_key) or city_key

        request = urllib2.Request(url, None, self.header)
        connection = urllib2.urlopen(request)
        try:
            html = connection.read()
        finally:
            # Release the connection even when the read fails.
            connection.close()

        if self.mayGetIt(html):
            self.fd={}
            # Bare raise outside an except block: the interpreter itself
            # errors out, which is how this method aborts a listing.
            raise
        tree = etree.HTML(html)
        city_match = re.search("<span class=\"city\"><a .*?>(.*?)</a>", html)
        if not city_match:
            raise
        self.fd['cityname'] = city_match.group(1)

        self.fd['house_flag'] = 2
        self.fd['house_type'] = 6
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0

        soup = BeautifulSoup(html)
        detail_mer = soup.find('div',{'class':'detail_mer'})

        # Only private-owner listings are kept; anything else aborts.
        if u"个人房源" not in str(detail_mer):
            raise

        owner_tag = detail_mer.find('span',{'class':'Dname'})
        if owner_tag:
            self.fd['owner_name'] = str(owner_tag.string)
        else:
            self.fd['owner_name'] = ""

        phone_span = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        # The original tested str.find() for truthiness, but find() returns -1
        # (truthy) when 'src=' is absent; use a membership test instead.
        if phone_span and 'src=' in str(phone_span):
            self.fd['owner_phone_pic'] = 'http://'+host+phone_span.img['src']
        else:
            self.fd['owner_phone_pic'] = None

        # No contact information: abort.
        if not self.fd['owner_phone_pic']:
            raise

        area_match = re.search(self.house_totalarea_regex, html)
        if area_match:
            self.fd['house_area'] = area_match.group(1)
        else:
            self.fd['house_area'] = None

        price_match = re.search(self.house_price_regex_2, html)
        if price_match:
            house_price = price_match.group(1)
            # "面议" is stored as a price of 0.
            if house_price=="面议":
                house_price = 0
            self.fd['house_price'] = int(house_price)
        else:
            self.fd['house_price'] = 0

        # Post time: the page supplies month-day only; year, hour and minute
        # come from the current local time.
        # NOTE(review): a listing dated late December scraped in January gets
        # a timestamp in the future -- confirm whether that matters downstream.
        now = time.localtime()
        pub_nodes = CSSSelector('span.pub_time')(tree)
        posttime = None
        # The selector returns a (possibly empty) list, never None, so test
        # for emptiness before indexing.
        if pub_nodes and pub_nodes[0].text:
            posttime = pub_nodes[0].text.strip() or None
        if posttime:
            month_day = posttime.split(' ')[0].split('-')
            stamp = datetime.datetime(now.tm_year, int(month_day[0]),
                                      int(month_day[1]), now.tm_hour, now.tm_min)
            self.fd['house_posttime'] = str(int(time.mktime(stamp.timetuple())))
        else:
            self.fd['house_posttime'] = str(int(time.mktime(now)))

        # Title: a page without the heading still fails here, as before.
        title_node = CSSSelector("div.detail_title h1")(tree)[0]
        house_title = title_node.text.strip()
        self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")

        # Layout counts are stored as the matched text, or 0 when absent.
        layout_fields = (
            ('house_room', self.house_room_regex),
            ('house_hall', self.house_hall_regex),
            ('house_toilet', self.house_toilet_regex),
            ('house_veranda', self.house_veranda_regex),
        )
        for key, pattern in layout_fields:
            layout_match = re.search(pattern, html)
            if layout_match:
                self.fd[key] = layout_match.group(1)
            else:
                self.fd[key] = 0

        floor_match = re.search(self.house_floor_regex, html)
        if floor_match:
            self.fd['house_floor'] = int(floor_match.group(1))
            self.fd['house_topfloor'] = int(floor_match.group(2))
        else:
            self.fd['house_floor'] = 0
            self.fd['house_topfloor'] = 0

        # Description: second <p> of the detail box with markup, line breaks
        # and the site's boilerplate sentence removed.
        detail_box = soup.find('div',{'class':'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = None

        d_i = soup.find('ul',{'class':'d_i'})

        # Community name: try the JS-embedded value first, then the detail
        # list, then a plain regex over the page.
        xiaoqu_match = re.search(self.xiaoqu_regex, html)
        if xiaoqu_match:
            self.fd['borough_name'] = xiaoqu_match.group(1)
            address_match = re.search(self.address_regex, html)
            if address_match:
                self.fd['house_addr'] = address_match.group(1)
        else:
            borough_label = d_i.find(text="小区: ")
            if borough_label:
                borough_link = borough_label.parent.find("a")
                if borough_link:
                    self.fd['borough_name'] = borough_link.string
                else:
                    self.fd['borough_name'] = None
                # Address: the text following the community link.
                if borough_link and borough_link.nextSibling:
                    house_addr = borough_link.nextSibling.string
                    self.fd['house_addr'] = re.sub(r"\(|\)| ","",house_addr)
                else:
                    self.fd['house_addr'] = None
            else:
                borough_match = re.search(self.borough_name_regex, html)
                if borough_match:
                    self.fd['borough_name'] = re.sub(r"\(.*\)| ","",borough_match.group(1))

        # Region / section: a missing detail list or label now yields the
        # same empty values as a label without links, instead of failing on
        # None.
        area_label = None
        if d_i is not None:
            area_label = d_i.find(text="区域: ")
        if area_label:
            area_a = area_label.parent('a')
        else:
            area_a = []
        if len(area_a) > 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif len(area_a) == 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""

        # Age: current year minus the year captured from the page.
        age_match = re.search(self.house_age_regex, html)
        if age_match:
            self.fd['house_age'] = now.tm_year - int(age_match.group(1))
        else:
            self.fd['house_age'] = 0

        # Orientation
        toward_match = re.search(self.house_toward_regex, html)
        if toward_match:
            self.fd['house_toward'] = toward(toward_match.group(1))
        else:
            self.fd['house_toward'] = 0

        fitment_match = re.search(self.house_fitment_regex, html)
        if fitment_match:
            self.fd['house_fitment'] = fitment(fitment_match.group(1))
        else:
            self.fd['house_fitment'] = 2

        deposit_match = re.search(self.house_deposit_regex, html)
        if deposit_match:
            self.fd['house_deposit'] = deposit(deposit_match.group(1))
        else:
            self.fd['house_deposit'] = None
예제 #2
0
파일: ganji.py 프로젝트: ptphp/PyLib
    def require(self,url):
        """Scrape a ganji.com "wanted to rent" page into ``self.fd``.

        Downloads ``url`` (Python 2 ``urllib2``) and fills ``self.fd`` with the
        fields found on the page: city, owner name, phone-image URL, price
        range, post time, title, room/hall/toilet/veranda counts,
        description, community name and address, and region/section.
        ``house_flag`` is set to 4 and ``house_type`` to 6; floor, area, age,
        orientation, fitment and deposit are set to 0.

        The listing is abandoned with a bare ``raise`` (kept from the original
        so callers observe the same kind of failure) when ``self.mayGetIt``
        flags the page, the city link is missing, the page is not marked as a
        private-owner listing, or no phone image is present.

        NOTE(review): ``self.fd`` is not cleared on entry and ``borough_name``
        is not written on every path, so a value from an earlier call may
        survive -- confirm the caller resets ``self.fd``.

        :param url: absolute URL of the listing page.
        :returns: ``None``; results are stored in ``self.fd``.
        """
        host = urlparse(url)[1]
        city_key = host.replace('.ganji.com',"")
        # Prefer the mapped city name; fall back to the raw sub-domain.
        self.fd['house_city'] = citynameDict_sf.get(city_key) or city_key

        request = urllib2.Request(url, None, self.header)
        connection = urllib2.urlopen(request)
        try:
            html = connection.read()
        finally:
            # Release the connection even when the read fails.
            connection.close()

        if self.mayGetIt(html):
            self.fd={}
            # Bare raise outside an except block: the interpreter itself
            # errors out, which is how this method aborts a listing.
            raise
        tree = etree.HTML(html)
        city_match = re.search("<span class=\"city\"><a .*?>(.*?)</a>", html)
        if not city_match:
            raise
        self.fd['cityname'] = city_match.group(1)

        self.fd['house_flag'] = 4
        self.fd['house_type'] = 6
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0
        self.fd['house_area']=0
        self.fd['house_age'] = 0
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
        self.fd['house_deposit'] = 0

        soup = BeautifulSoup(html)
        detail_mer = soup.find('div',{'class':'detail_mer'})

        # Only private-owner listings are kept; anything else aborts.
        if u"个人房源" not in str(detail_mer):
            raise

        owner_tag = detail_mer.find('span',{'class':'Dname'})
        if owner_tag:
            self.fd['owner_name'] = owner_tag.string
        else:
            self.fd['owner_name'] = None

        phone_span = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        # The original tested str.find() for truthiness, but find() returns -1
        # (truthy) when 'src=' is absent; use a membership test instead.
        if phone_span and 'src=' in str(phone_span):
            self.fd['owner_phone_pic'] = 'http://'+host+phone_span.img['src']
        else:
            self.fd['owner_phone_pic'] = None

        # No contact information: abort.
        if not self.fd['owner_phone_pic']:
            raise

        # Price: "<n>以上" is a lower bound, "<n>以下" an upper bound and
        # "<a>-<b>" a range; any other text leaves both values at 0.
        price_match = re.search(self.house_price_regex_zu, html)
        if price_match:
            house_price_zu = price_match.group(1).replace('元/月','')
            if "以上" in house_price_zu:
                self.fd['house_price_max'] = 0
                self.fd['house_price'] = int(house_price_zu.replace('以上',''))
            elif "以下" in house_price_zu:
                self.fd['house_price_max'] = int(house_price_zu.replace('以下',''))
                self.fd['house_price'] = 0
            elif "-" in house_price_zu:
                bounds = house_price_zu.split('-')
                self.fd['house_price_max'] = int(bounds[1])
                self.fd['house_price'] = int(bounds[0])
            else:
                self.fd['house_price_max'] = 0
                self.fd['house_price'] = 0
        else:
            self.fd['house_price_max'] = 0
            self.fd['house_price'] = 0

        # Post time: the page supplies month-day only; year, hour and minute
        # come from the current local time.
        # NOTE(review): a listing dated late December scraped in January gets
        # a timestamp in the future -- confirm whether that matters downstream.
        now = time.localtime()
        pub_nodes = CSSSelector('span.pub_time')(tree)
        posttime = None
        # The selector returns a (possibly empty) list, never None, so test
        # for emptiness before indexing.
        if pub_nodes and pub_nodes[0].text:
            posttime = pub_nodes[0].text.strip() or None
        if posttime:
            month_day = posttime.split(' ')[0].split('-')
            stamp = datetime.datetime(now.tm_year, int(month_day[0]),
                                      int(month_day[1]), now.tm_hour, now.tm_min)
            self.fd['house_posttime'] = str(int(time.mktime(stamp.timetuple())))
        else:
            self.fd['house_posttime'] = str(int(time.mktime(now)))

        # Title: a page without the heading still fails here, as before.
        title_node = CSSSelector("div.detail_title h1")(tree)[0]
        house_title = title_node.text.strip()
        self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")

        # Layout counts are stored as integers, or 0 when absent.
        layout_fields = (
            ('house_room', self.house_room_regex),
            ('house_hall', self.house_hall_regex),
            ('house_toilet', self.house_toilet_regex),
            ('house_veranda', self.house_veranda_regex),
        )
        for key, pattern in layout_fields:
            layout_match = re.search(pattern, html)
            if layout_match:
                self.fd[key] = int(layout_match.group(1))
            else:
                self.fd[key] = 0

        # Description: second <p> of the detail box with markup, line breaks
        # and the site's boilerplate sentence removed.
        detail_box = soup.find('div',{'class':'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = ""

        d_i = soup.find('ul',{'class':'d_i'})

        # Community name: try the JS-embedded value first, then the
        # page-level regexes.
        xiaoqu_match = re.search(self.xiaoqu_regex, html)
        if xiaoqu_match:
            self.fd['borough_name'] = xiaoqu_match.group(1)
            address_match = re.search(self.address_regex, html)
            if address_match:
                self.fd['house_addr'] = address_match.group(1)
        else:
            borough_match = re.search(self.borough_name_regex_reg, html)
            if borough_match:
                self.fd['borough_name'] = borough_match.group(1)
            address_match = re.search(self.house_addr_regex_reg, html)
            if address_match:
                self.fd['house_addr'] = address_match.group(1)
            else:
                self.fd['house_addr'] = ''

        # Region / section: a missing detail list or label now yields the
        # same empty values as a label without links, instead of failing on
        # None.
        area_label = None
        if d_i is not None:
            area_label = d_i.find(text="区域: ")
        if area_label:
            area_a = area_label.parent('a')
        else:
            area_a = []
        if len(area_a) > 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif len(area_a) == 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
예제 #3
0
    def sell(self,url):
        """Scrape a ganji.com for-sale listing page into ``self.fd``.

        Downloads ``url`` (Python 2 ``urllib2``) and fills ``self.fd`` with the
        fields found on the page: owner name, phone-image URL (key
        ``owner_phone``), city name, floors, total area, house type, price,
        post time, room/hall/toilet counts, title, description, community
        name and address, city area/section, building age, orientation and
        fitment. ``house_flag`` is set to 1 and ``belong`` to 0.

        Returns early, leaving ``self.fd`` partially filled, when the page is
        not marked as a private-owner listing, no phone image is present, or
        the city link is missing. When ``self.mayGetIt`` flags the page,
        ``self.fd`` is reset to an empty dict before returning.

        NOTE(review): ``self.fd`` is not cleared on entry, and ``house_addr`` /
        ``borough_name`` are not written on every path, so values from an
        earlier call may survive -- confirm the caller resets ``self.fd``.

        :param url: absolute URL of the listing page.
        :returns: ``None``; results are stored in ``self.fd``.
        """
        request = urllib2.Request(url, None, self.header)
        connection = urllib2.urlopen(request)
        try:
            html = connection.read()
        finally:
            # Release the connection even when the read fails.
            connection.close()

        if self.mayGetIt(html):
            self.fd={}
            return
        tree = etree.HTML(html)
        soup = BeautifulSoup(html)

        self.fd['house_flag'] = 1
        self.fd['belong']=0

        detail_mer = soup.find('div',{'class':'detail_mer'})
        # Only private-owner listings are kept; anything else stops here.
        if u"个人房源" not in str(detail_mer):
            return

        owner_tag = detail_mer.find('span',{'class':'Dname'})
        if owner_tag:
            self.fd['owner_name'] = owner_tag.string
        else:
            self.fd['owner_name'] = None

        phone_span = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        # The original tested str.find() for truthiness, but find() returns -1
        # (truthy) when 'src=' is absent, so a span without an image crashed
        # instead of taking the "no contact" exit below.
        if phone_span and 'src=' in str(phone_span):
            self.fd['owner_phone'] = 'http://'+urlparse(url)[1]+phone_span.img['src']
        else:
            self.fd['owner_phone'] = None

        # No contact information: stop.
        if not self.fd['owner_phone']:
            return

        city_match = re.search("<span class=\"city\"><a .*?>(.*?)</a>", html)
        if not city_match:
            return
        self.fd['cityname'] = city_match.group(1)

        floor_match = re.search(self.house_floor_regex, html)
        if floor_match:
            self.fd['house_floor'] = floor_match.group(1)
            self.fd['house_topfloor'] = floor_match.group(2)
        else:
            self.fd['house_floor'] = None
            self.fd['house_topfloor'] = None

        area_match = re.search(self.house_totalarea_regex, html)
        if area_match:
            self.fd['house_totalarea'] = area_match.group(1)
        else:
            self.fd['house_totalarea'] = None

        # House type
        type_match = re.search(self.house_type_regex, html)
        if type_match:
            self.fd['house_type'] = housetype(type_match.group(1))
        else:
            self.fd['house_type'] = None

        price_match = re.search(self.house_price_regex, html)
        if price_match:
            house_price = price_match.group(1)
            # "面议" is stored as a price of "0".
            if house_price=="面议":
                house_price = "0"
            self.fd['house_price'] = house_price
        else:
            self.fd['house_price'] = None

        # Post time: the page supplies month-day only; the year comes from the
        # current local time and the time of day is midnight.
        # NOTE(review): a listing dated late December scraped in January gets
        # a timestamp in the future -- confirm whether that matters downstream.
        pub_nodes = CSSSelector('span.pub_time')(tree)
        posttime = None
        # The selector returns a (possibly empty) list, never None, so test
        # for emptiness before indexing.
        if pub_nodes and pub_nodes[0].text:
            posttime = pub_nodes[0].text.strip() or None
        if posttime:
            month_day = posttime.split(' ')[0].split('-')
            stamp = datetime.datetime(time.localtime().tm_year,
                                      int(month_day[0]), int(month_day[1]), 0, 0)
            self.fd['posttime'] = int(time.mktime(stamp.timetuple()))
        else:
            self.fd['posttime'] = None

        # Layout counts are stored as the matched text, or '0' when absent.
        layout_fields = (
            ('house_room', self.house_room_regex),
            ('house_hall', self.house_hall_regex),
            ('house_toilet', self.house_toilet_regex),
        )
        for key, pattern in layout_fields:
            layout_match = re.search(pattern, html)
            if layout_match:
                self.fd[key] = layout_match.group(1)
            else:
                self.fd[key] = '0'

        # Title: a page without the heading still fails here, as before.
        title_node = CSSSelector("div.detail_title h1")(tree)[0]
        house_title = title_node.text.strip()
        self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")

        # Description: second <p> of the detail box with markup, line breaks
        # and the site's boilerplate sentence removed.
        detail_box = soup.find('div',{'class':'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = None

        d_i = soup.find('ul',{'class':'d_i'})

        # Community name: try the JS-embedded value first, then the detail
        # list, then a plain regex over the page.
        xiaoqu_match = re.search(self.xiaoqu_regex, html)
        if xiaoqu_match:
            self.fd['borough_name'] = xiaoqu_match.group(1)
            address_match = re.search(self.address_regex, html)
            if address_match:
                self.fd['house_addr'] = address_match.group(1)
        else:
            borough_label = d_i.find(text="小区: ")
            if borough_label:
                borough_link = borough_label.parent.find("a")
                if borough_link:
                    self.fd['borough_name'] = borough_link.string
                else:
                    self.fd['borough_name'] = None
                # Address: the text following the community link.
                if borough_link and borough_link.nextSibling:
                    house_addr = borough_link.nextSibling.string
                    self.fd['house_addr'] = re.sub(r"\(|\)| ","",house_addr)
                else:
                    self.fd['house_addr'] = None
            else:
                borough_match = re.search(self.borough_name_regex, html)
                if borough_match:
                    self.fd['borough_name'] = re.sub(r"\(.*\)| ","",borough_match.group(1))

        # City area / section: a missing detail list or label now yields the
        # same None values as a label without links, instead of failing on
        # None.
        area_label = None
        if d_i is not None:
            area_label = d_i.find(text="区域: ")
        if area_label:
            area_a = area_label.parent('a')
        else:
            area_a = []
        if len(area_a) > 1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = area_a[1].string
        elif len(area_a) == 1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = None
        else:
            self.fd['cityarea'] = None
            self.fd['section'] = None

        age_match = re.search(self.house_age_regex, html)
        if age_match:
            self.fd['house_age'] = age_match.group(1)
        else:
            self.fd['house_age'] = None

        # Orientation
        toward_match = re.search(self.house_toward_regex, html)
        if toward_match:
            self.fd['house_toward'] = toward(toward_match.group(1))
        else:
            self.fd['house_toward'] = None

        fitment_match = re.search(self.house_fitment_regex, html)
        if fitment_match:
            self.fd['house_fitment'] = fitment(fitment_match.group(1))
        else:
            self.fd['house_fitment'] = 2