def rent(self, url):
    """Scrape a Ganji rental-offer listing page into ``self.fd``.

    The page at *url* is downloaded using ``self.header`` and parsed with
    lxml and BeautifulSoup.  Only listings whose contact block contains the
    marker text "个人房源" and which expose a phone-number image are kept.

    Keys written to ``self.fd``: ``house_city``, ``cityname``,
    ``house_flag`` (2), ``house_type`` (6), ``owner_name``,
    ``owner_phone_pic``, ``house_area``, ``house_price``,
    ``house_posttime``, ``house_title``, ``house_room``, ``house_hall``,
    ``house_toilet``, ``house_veranda``, ``house_floor``,
    ``house_topfloor``, ``house_desc``, ``house_region``,
    ``house_section``, ``house_age``, ``house_toward``, ``house_fitment``,
    ``house_deposit`` and, when they can be found, ``borough_name`` and
    ``house_addr``.

    Args:
        url: Absolute URL of the listing.  Its host, minus ``.ganji.com``,
            is looked up in ``citynameDict_sf`` to fill ``house_city``.

    Raises:
        ValueError: if ``self.mayGetIt(response)`` is truthy (``self.fd``
            is reset to ``{}`` first), if the city name is missing, if the
            listing is not marked "个人房源", or if no phone image exists.
            (These paths used to execute a bare ``raise`` with no active
            exception, which surfaced as an unrelated error.)
    """
    netloc = urlparse(url)[1]
    city_key = netloc.replace('.ganji.com', "")
    # Prefer the mapped city name; fall back to the raw sub-domain.
    self.fd['house_city'] = citynameDict_sf.get(city_key) or city_key

    request = urllib2.Request(url, None, self.header)
    connection = urllib2.urlopen(request)
    try:
        response = connection.read()
    finally:
        # Release the socket now instead of waiting for garbage collection.
        connection.close()

    if self.mayGetIt(response):
        self.fd = {}
        raise ValueError("page rejected by mayGetIt: %s" % url)

    tree = etree.HTML(response)

    city_match = re.search('<span class="city"><a .*?>(.*?)</a>', response)
    if not city_match:
        raise ValueError("city name not found: %s" % url)
    self.fd['cityname'] = city_match.group(1)

    self.fd['house_flag'] = 2
    self.fd['house_type'] = 6
    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0

    soup = BeautifulSoup(response)
    detail_mer = soup.find('div', {'class': 'detail_mer'})
    # Skip listings that are not marked as posted by a private owner.
    if detail_mer is None or u"个人房源" not in str(detail_mer):
        raise ValueError("not a personal listing: %s" % url)

    owner_tag = detail_mer.find('span', {'class': 'Dname'})
    self.fd['owner_name'] = str(owner_tag.string) if owner_tag else ""

    # The phone number is published as an image; build its absolute URL.
    # (The old ``str(tag).find('src=')`` test was truthy for -1, so a span
    # without an image crashed instead of falling through to None.)
    phone_span = detail_mer.find('span', {'class': 'ganji_phone_call_class'})
    phone_img = phone_span.img if phone_span is not None else None
    if phone_img is not None and phone_img.get('src'):
        self.fd['owner_phone_pic'] = 'http://' + netloc + phone_img['src']
    else:
        self.fd['owner_phone_pic'] = None
    # No contact information: the listing is useless.
    if not self.fd['owner_phone_pic']:
        raise ValueError("no contact phone image: %s" % url)

    area_match = re.search(self.house_totalarea_regex, response)
    self.fd['house_area'] = area_match.group(1) if area_match else None

    price_match = re.search(self.house_price_regex_2, response)
    if price_match:
        price_text = price_match.group(1)
        # "面议" is stored as price 0.
        self.fd['house_price'] = 0 if price_text == "面议" else int(price_text)
    else:
        self.fd['house_price'] = 0

    # The page is parsed as "<month>-<day> ..."; year, hour and minute are
    # taken from the local clock.
    now = time.localtime()
    pub_nodes = CSSSelector('span.pub_time')(tree)
    posttime = None
    if pub_nodes and pub_nodes[0].text:
        posttime = pub_nodes[0].text.strip() or None
    if posttime:
        date_parts = posttime.split(' ')[0].split('-')
        stamp = datetime.datetime(now.tm_year, int(date_parts[0]),
                                  int(date_parts[1]), now.tm_hour, now.tm_min)
        self.fd['house_posttime'] = str(int(time.mktime(stamp.timetuple())))
    else:
        self.fd['house_posttime'] = str(int(time.mktime(now)))

    # Title with the listing-kind suffixes stripped; None when the heading
    # is missing (previously an IndexError/AttributeError).
    title_nodes = CSSSelector("div.detail_title h1")(tree)
    house_title = None
    if title_nodes and title_nodes[0].text:
        house_title = title_nodes[0].text.strip() or None
    if house_title is not None:
        house_title = (house_title.replace("(求购)", "")
                       .replace("(求租)", "")
                       .replace("(出售)", ""))
    self.fd['house_title'] = house_title

    # Room counts are stored as the matched text, defaulting to integer 0.
    for key, pattern in (('house_room', self.house_room_regex),
                         ('house_hall', self.house_hall_regex),
                         ('house_toilet', self.house_toilet_regex),
                         ('house_veranda', self.house_veranda_regex)):
        count_match = re.search(pattern, response)
        self.fd[key] = count_match.group(1) if count_match else 0

    floor_match = re.search(self.house_floor_regex, response)
    if floor_match:
        self.fd['house_floor'] = int(floor_match.group(1))
        self.fd['house_topfloor'] = int(floor_match.group(2))
    else:
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0

    # Description: second <p> of the detail box with markup, line breaks
    # and the site's boilerplate sentence removed.
    detail_box = soup.find('div', {'class': 'detail_box'})
    paragraphs = detail_box('p') if detail_box else []
    if len(paragraphs) > 1:
        self.fd['house_desc'] = re.sub(
            "<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的", "", str(paragraphs[1]))
    else:
        self.fd['house_desc'] = None

    info_list = soup.find('ul', {'class': 'd_i'})

    # Borough name / address: try the regexes named xiaoqu/address first,
    # then the labelled entry in the info list, then a plain regex.
    xiaoqu_match = re.search(self.xiaoqu_regex, response)
    if xiaoqu_match:
        self.fd['borough_name'] = xiaoqu_match.group(1)
        address_match = re.search(self.address_regex, response)
        if address_match:
            self.fd['house_addr'] = address_match.group(1)
    else:
        borough_label = None
        if info_list is not None:
            borough_label = info_list.find(text="小区: ")
        if borough_label:
            borough_link = borough_label.parent.find("a")
            self.fd['borough_name'] = borough_link.string if borough_link else None
            # The address is the node that follows the borough link.
            addr_text = None
            if borough_link and borough_link.nextSibling:
                addr_text = borough_link.nextSibling.string
            if addr_text is not None:
                self.fd['house_addr'] = re.sub(r"\(|\)| ", "", addr_text)
            else:
                self.fd['house_addr'] = None
        else:
            borough_match = re.search(self.borough_name_regex, response)
            if borough_match:
                self.fd['borough_name'] = re.sub(
                    r"\(.*\)| ", "", borough_match.group(1))

    # Region / section: the first two links of the "区域" entry.
    area_label = None
    if info_list is not None:
        area_label = info_list.find(text="区域: ")
    area_links = area_label.parent('a') if area_label else []
    self.fd['house_region'] = str(area_links[0].string) if len(area_links) > 0 else ""
    self.fd['house_section'] = str(area_links[1].string) if len(area_links) > 1 else ""

    # house_age is stored as (current year - matched value).
    age_match = re.search(self.house_age_regex, response)
    if age_match:
        self.fd['house_age'] = now.tm_year - int(age_match.group(1))
    else:
        self.fd['house_age'] = 0

    toward_match = re.search(self.house_toward_regex, response)
    self.fd['house_toward'] = toward(toward_match.group(1)) if toward_match else 0

    fitment_match = re.search(self.house_fitment_regex, response)
    self.fd['house_fitment'] = fitment(fitment_match.group(1)) if fitment_match else 2

    deposit_match = re.search(self.house_deposit_regex, response)
    self.fd['house_deposit'] = deposit(deposit_match.group(1)) if deposit_match else None
def require(self, url):
    """Scrape a Ganji rental-wanted listing page into ``self.fd``.

    The page at *url* is downloaded using ``self.header`` and parsed with
    lxml and BeautifulSoup.  Only listings whose contact block contains the
    marker text "个人房源" and which expose a phone-number image are kept.

    Keys written to ``self.fd``: ``house_city``, ``cityname``,
    ``house_flag`` (4), ``house_type`` (6), ``house_floor``,
    ``house_topfloor``, ``house_area``, ``house_age``, ``house_toward``,
    ``house_fitment``, ``house_deposit`` (all fixed at 0), ``owner_name``,
    ``owner_phone_pic``, ``house_price``, ``house_price_max``,
    ``house_posttime``, ``house_title``, ``house_room``, ``house_hall``,
    ``house_toilet``, ``house_veranda``, ``house_desc``, ``house_region``,
    ``house_section`` and, when they can be found, ``borough_name`` and
    ``house_addr``.

    Args:
        url: Absolute URL of the listing.  Its host, minus ``.ganji.com``,
            is looked up in ``citynameDict_sf`` to fill ``house_city``.

    Raises:
        ValueError: if ``self.mayGetIt(response)`` is truthy (``self.fd``
            is reset to ``{}`` first), if the city name is missing, if the
            listing is not marked "个人房源", or if no phone image exists.
            (These paths used to execute a bare ``raise`` with no active
            exception, which surfaced as an unrelated error.)
    """
    netloc = urlparse(url)[1]
    city_key = netloc.replace('.ganji.com', "")
    # Prefer the mapped city name; fall back to the raw sub-domain.
    self.fd['house_city'] = citynameDict_sf.get(city_key) or city_key

    request = urllib2.Request(url, None, self.header)
    connection = urllib2.urlopen(request)
    try:
        response = connection.read()
    finally:
        # Release the socket now instead of waiting for garbage collection.
        connection.close()

    if self.mayGetIt(response):
        self.fd = {}
        raise ValueError("page rejected by mayGetIt: %s" % url)

    tree = etree.HTML(response)

    city_match = re.search('<span class="city"><a .*?>(.*?)</a>', response)
    if not city_match:
        raise ValueError("city name not found: %s" % url)
    self.fd['cityname'] = city_match.group(1)

    self.fd['house_flag'] = 4
    self.fd['house_type'] = 6
    # These fields are never extracted for this listing kind.
    for fixed_key in ('house_floor', 'house_topfloor', 'house_area',
                      'house_age', 'house_toward', 'house_fitment',
                      'house_deposit'):
        self.fd[fixed_key] = 0

    soup = BeautifulSoup(response)
    detail_mer = soup.find('div', {'class': 'detail_mer'})
    # Skip listings that are not marked as posted by a private owner.
    if detail_mer is None or u"个人房源" not in str(detail_mer):
        raise ValueError("not a personal listing: %s" % url)

    owner_tag = detail_mer.find('span', {'class': 'Dname'})
    self.fd['owner_name'] = owner_tag.string if owner_tag else None

    # The phone number is published as an image; build its absolute URL.
    # (The old ``str(tag).find('src=')`` test was truthy for -1, so a span
    # without an image crashed instead of falling through to None.)
    phone_span = detail_mer.find('span', {'class': 'ganji_phone_call_class'})
    phone_img = phone_span.img if phone_span is not None else None
    if phone_img is not None and phone_img.get('src'):
        self.fd['owner_phone_pic'] = 'http://' + netloc + phone_img['src']
    else:
        self.fd['owner_phone_pic'] = None
    # No contact information: the listing is useless.
    if not self.fd['owner_phone_pic']:
        raise ValueError("no contact phone image: %s" % url)

    # Price: "N以上" sets only the lower value, "N以下" only the upper
    # value, "A-B" sets both; anything else leaves both at 0.
    price_low = 0
    price_high = 0
    price_match = re.search(self.house_price_regex_zu, response)
    if price_match:
        price_text = price_match.group(1).replace('元/月', '')
        if "以上" in price_text:
            price_low = int(price_text.replace('以上', ''))
        elif "以下" in price_text:
            price_high = int(price_text.replace('以下', ''))
        elif "-" in price_text:
            bounds = price_text.split('-')
            price_low = int(bounds[0])
            price_high = int(bounds[1])
    self.fd['house_price_max'] = price_high
    self.fd['house_price'] = price_low

    # The page is parsed as "<month>-<day> ..."; year, hour and minute are
    # taken from the local clock.
    now = time.localtime()
    pub_nodes = CSSSelector('span.pub_time')(tree)
    posttime = None
    if pub_nodes and pub_nodes[0].text:
        posttime = pub_nodes[0].text.strip() or None
    if posttime:
        date_parts = posttime.split(' ')[0].split('-')
        stamp = datetime.datetime(now.tm_year, int(date_parts[0]),
                                  int(date_parts[1]), now.tm_hour, now.tm_min)
        self.fd['house_posttime'] = str(int(time.mktime(stamp.timetuple())))
    else:
        self.fd['house_posttime'] = str(int(time.mktime(now)))

    # Title with the listing-kind suffixes stripped; None when the heading
    # is missing (previously an IndexError/AttributeError).
    title_nodes = CSSSelector("div.detail_title h1")(tree)
    house_title = None
    if title_nodes and title_nodes[0].text:
        house_title = title_nodes[0].text.strip() or None
    if house_title is not None:
        house_title = (house_title.replace("(求购)", "")
                       .replace("(求租)", "")
                       .replace("(出售)", ""))
    self.fd['house_title'] = house_title

    # Room counts are stored as integers, defaulting to 0.
    for key, pattern in (('house_room', self.house_room_regex),
                         ('house_hall', self.house_hall_regex),
                         ('house_toilet', self.house_toilet_regex),
                         ('house_veranda', self.house_veranda_regex)):
        count_match = re.search(pattern, response)
        self.fd[key] = int(count_match.group(1)) if count_match else 0

    # Description: second <p> of the detail box with markup, line breaks
    # and the site's boilerplate sentence removed.
    detail_box = soup.find('div', {'class': 'detail_box'})
    paragraphs = detail_box('p') if detail_box else []
    if len(paragraphs) > 1:
        self.fd['house_desc'] = re.sub(
            "<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的", "", str(paragraphs[1]))
    else:
        self.fd['house_desc'] = ""

    info_list = soup.find('ul', {'class': 'd_i'})

    # Borough name / address: try the regexes named xiaoqu/address first,
    # otherwise fall back to the "_reg" regexes.
    # NOTE(review): the nesting of the fallback branch was reconstructed
    # from whitespace-collapsed source -- confirm against the original.
    xiaoqu_match = re.search(self.xiaoqu_regex, response)
    if xiaoqu_match:
        self.fd['borough_name'] = xiaoqu_match.group(1)
        address_match = re.search(self.address_regex, response)
        if address_match:
            self.fd['house_addr'] = address_match.group(1)
    else:
        borough_match = re.search(self.borough_name_regex_reg, response)
        if borough_match:
            self.fd['borough_name'] = borough_match.group(1)
        address_match = re.search(self.house_addr_regex_reg, response)
        self.fd['house_addr'] = address_match.group(1) if address_match else ''

    # Region / section: the first two links of the "区域" entry.
    area_label = None
    if info_list is not None:
        area_label = info_list.find(text="区域: ")
    area_links = area_label.parent('a') if area_label else []
    self.fd['house_region'] = str(area_links[0].string) if len(area_links) > 0 else ""
    self.fd['house_section'] = str(area_links[1].string) if len(area_links) > 1 else ""
def sell(self, url):
    """Scrape a Ganji for-sale listing page into ``self.fd``.

    The page at *url* is downloaded using ``self.header`` and parsed with
    lxml and BeautifulSoup.  Unlike ``rent``/``require`` this method does
    not raise on unusable pages: it returns early, leaving whatever keys
    were already written in ``self.fd``.

    Keys written to ``self.fd`` on a full pass: ``house_flag`` (1),
    ``belong`` (0), ``owner_name``, ``owner_phone``, ``cityname``,
    ``house_floor``, ``house_topfloor``, ``house_totalarea``,
    ``house_type``, ``house_price``, ``posttime``, ``house_room``,
    ``house_hall``, ``house_toilet``, ``house_title``, ``house_desc``,
    ``cityarea``, ``section``, ``house_age``, ``house_toward``,
    ``house_fitment`` and, when they can be found, ``borough_name`` and
    ``house_addr``.

    Args:
        url: Absolute URL of the listing; its host is used to build the
            absolute URL of the phone-number image.

    Returns:
        None.  Returns early when ``self.mayGetIt(response)`` is truthy
        (``self.fd`` is reset to ``{}`` first), when the listing is not
        marked "个人房源", when no phone image exists, or when the city
        name is missing.
    """
    request = urllib2.Request(url, None, self.header)
    connection = urllib2.urlopen(request)
    try:
        response = connection.read()
    finally:
        # Release the socket now instead of waiting for garbage collection.
        connection.close()

    if self.mayGetIt(response):
        self.fd = {}
        return

    tree = etree.HTML(response)
    soup = BeautifulSoup(response)

    self.fd['house_flag'] = 1
    self.fd['belong'] = 0

    detail_mer = soup.find('div', {'class': 'detail_mer'})
    # Skip listings that are not marked as posted by a private owner.
    if detail_mer is None or u"个人房源" not in str(detail_mer):
        return

    owner_tag = detail_mer.find('span', {'class': 'Dname'})
    self.fd['owner_name'] = owner_tag.string if owner_tag else None

    # The phone number is published as an image; build its absolute URL.
    # (The old ``str(tag).find('src=')`` test was truthy for -1, so a span
    # without an image crashed instead of falling through to None.)
    phone_span = detail_mer.find('span', {'class': 'ganji_phone_call_class'})
    phone_img = phone_span.img if phone_span is not None else None
    if phone_img is not None and phone_img.get('src'):
        self.fd['owner_phone'] = 'http://' + urlparse(url)[1] + phone_img['src']
    else:
        self.fd['owner_phone'] = None
    # No contact information: the listing is useless.
    if not self.fd['owner_phone']:
        return

    city_match = re.search('<span class="city"><a .*?>(.*?)</a>', response)
    if not city_match:
        return
    self.fd['cityname'] = city_match.group(1)

    # Floor values are kept as the matched text here (not converted).
    floor_match = re.search(self.house_floor_regex, response)
    if floor_match:
        self.fd['house_floor'] = floor_match.group(1)
        self.fd['house_topfloor'] = floor_match.group(2)
    else:
        self.fd['house_floor'] = None
        self.fd['house_topfloor'] = None

    area_match = re.search(self.house_totalarea_regex, response)
    self.fd['house_totalarea'] = area_match.group(1) if area_match else None

    type_match = re.search(self.house_type_regex, response)
    self.fd['house_type'] = housetype(type_match.group(1)) if type_match else None

    price_match = re.search(self.house_price_regex, response)
    if price_match:
        price_text = price_match.group(1)
        # "面议" is stored as the string "0".
        self.fd['house_price'] = "0" if price_text == "面议" else price_text
    else:
        self.fd['house_price'] = None

    # The page is parsed as "<month>-<day> ..."; the year comes from the
    # local clock and the time of day is fixed at midnight.
    pub_nodes = CSSSelector('span.pub_time')(tree)
    posttime = None
    if pub_nodes and pub_nodes[0].text:
        posttime = pub_nodes[0].text.strip() or None
    if posttime:
        date_parts = posttime.split(' ')[0].split('-')
        stamp = datetime.datetime(time.localtime().tm_year,
                                  int(date_parts[0]), int(date_parts[1]), 0, 0)
        self.fd['posttime'] = int(time.mktime(stamp.timetuple()))
    else:
        self.fd['posttime'] = None

    # Room counts are stored as the matched text, defaulting to '0'.
    for key, pattern in (('house_room', self.house_room_regex),
                         ('house_hall', self.house_hall_regex),
                         ('house_toilet', self.house_toilet_regex)):
        count_match = re.search(pattern, response)
        self.fd[key] = count_match.group(1) if count_match else '0'

    # Title with the listing-kind suffixes stripped; None when the heading
    # is missing (previously an IndexError/AttributeError).
    title_nodes = CSSSelector("div.detail_title h1")(tree)
    house_title = None
    if title_nodes and title_nodes[0].text:
        house_title = title_nodes[0].text.strip() or None
    if house_title is not None:
        house_title = (house_title.replace("(求购)", "")
                       .replace("(求租)", "")
                       .replace("(出售)", ""))
    self.fd['house_title'] = house_title

    # Description: second <p> of the detail box with markup, line breaks
    # and the site's boilerplate sentence removed.
    detail_box = soup.find('div', {'class': 'detail_box'})
    paragraphs = detail_box('p') if detail_box else []
    if len(paragraphs) > 1:
        self.fd['house_desc'] = re.sub(
            "<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的", "", str(paragraphs[1]))
    else:
        self.fd['house_desc'] = None

    info_list = soup.find('ul', {'class': 'd_i'})

    # Borough name / address: try the regexes named xiaoqu/address first,
    # then the labelled entry in the info list, then a plain regex.
    xiaoqu_match = re.search(self.xiaoqu_regex, response)
    if xiaoqu_match:
        self.fd['borough_name'] = xiaoqu_match.group(1)
        address_match = re.search(self.address_regex, response)
        if address_match:
            self.fd['house_addr'] = address_match.group(1)
    else:
        borough_label = None
        if info_list is not None:
            borough_label = info_list.find(text="小区: ")
        if borough_label:
            borough_link = borough_label.parent.find("a")
            self.fd['borough_name'] = borough_link.string if borough_link else None
            # The address is the node that follows the borough link.
            addr_text = None
            if borough_link and borough_link.nextSibling:
                addr_text = borough_link.nextSibling.string
            if addr_text is not None:
                self.fd['house_addr'] = re.sub(r"\(|\)| ", "", addr_text)
            else:
                self.fd['house_addr'] = None
        else:
            borough_match = re.search(self.borough_name_regex, response)
            if borough_match:
                self.fd['borough_name'] = re.sub(
                    r"\(.*\)| ", "", borough_match.group(1))

    # City area / section: the first two links of the "区域" entry.
    area_label = None
    if info_list is not None:
        area_label = info_list.find(text="区域: ")
    area_links = area_label.parent('a') if area_label else []
    self.fd['cityarea'] = area_links[0].string if len(area_links) > 0 else None
    self.fd['section'] = area_links[1].string if len(area_links) > 1 else None

    age_match = re.search(self.house_age_regex, response)
    self.fd['house_age'] = age_match.group(1) if age_match else None

    toward_match = re.search(self.house_toward_regex, response)
    self.fd['house_toward'] = toward(toward_match.group(1)) if toward_match else None

    fitment_match = re.search(self.house_fitment_regex, response)
    self.fd['house_fitment'] = fitment(fitment_match.group(1)) if fitment_match else 2