def extractDict(self): self.fd["citycode"]=self.citycode for url in self.urls: if checkPath(homepath,self.folder,url): continue req=urllib2.Request(url, None, self.header) page=self.br.open(req).read() if re.search(self.ht_r, page): if "商铺"==re.search(self.ht_r, page).group(1): continue else: ht=housetype(re.search(self.ht_r, page).group(1)) self.fd["house_type"]=ht #lambda a: a and self.fd["borough_section"]=a.group(1) or self.fd["borough_section"]="" self.fd["borough_section"]=re.search(self.ad_r, page)!=None and re.search(self.ad_r, page).group(1) or "" self.fd["cityarea"]=re.search(self.ca_r, page)!=None and re.search(self.ca_r, page).group(1) or "" self.fd["house_fitment"]=re.search(self.fm_r, page)!=None and re.search(self.fm_r, page).group(1) or "" self.fd["house_kind"]=self.kind self.fd["belong"]=re.search(self.bl_r, page)!=None and re.search(self.bl_r, page).group(1) or "" self.fd["house_price"]=re.search(self.hp_r, page)!=None and re.search(self.hp_r, page).group(1) or "" self.fd["house_totalarea"]=re.search(self.hta_r, page)!=None and re.search(self.hta_r, page).group(1) or "" house_type=re.search(self.hrht_r, page)!=None and re.search(self.hrht_r, page).group(1) or "" blank=0 if house_type.find("室")!= -1: self.fd["house_room"]=house_type[blank:house_type.find("室")] blank=house_type.find("室")+3 else: self.fd["house_room"]="" if house_type.find("厅")!=-1: self.fd["house_hall"]=house_type[blank:house_type.find("厅")] blank=house_type.find("厅")+3 else: self.fd["house_hall"]="" if house_type.find("卫")!=-1: self.fd["house_toilet"]=house_type[blank:house_type.find("卫")] else: self.fd["house_toilet"]="" self.fd["house_floor"]=re.search(self.hf_r, page)!=None and re.search(self.hf_r, page).group(1) or "" self.fd["house_topfloor"]=re.search(self.hf_r, page)!=None and re.search(self.hf_r, page).group(2) or "" self.fd["house_age"]=re.search(self.ha_r, page)!=None and re.search(self.ha_r, page).group(1) or "" self.fd["house_sup"]=re.search(self.hs_r, page)!=None and 
re.search(self.hs_r, page).group(1) or "" self.fd["house_desc"]=re.search(self.hd_r, page)!=None and re.search(self.hd_r, page).group(1) or "" self.fd["borough_name"]=re.search(self.nm_r, page)!=None and re.search(self.nm_r, page).group(1) or "" makePath(homepath,self.folder,url) for ddd in self.fd.items(): print ddd[0],ddd[1] print "="*60
def QiuZu(self,url): self.fd['house_flag'] = 3 self.fd['house_floor'] = 0 self.fd['house_topfloor'] = 0 self.fd['house_age'] = 0 self.fd['house_toward'] = 0 self.fd['house_fitment'] = 0 self.fd['house_deposit'] = 0 self.fd['house_totalarea_max'] = 0 self.fd['house_totalarea_min'] = 0 self.fd['house_totalarea'] = 0 request = urllib2.Request(url, None, self.header) response = urllib2.urlopen(request).read() tree = etree.HTML(response) soup =BeautifulSoup(response) detail_mer = soup.find('ul',{'class':'info'}) detail_mer_str =str(detail_mer).replace(" ", "") #非个人房源 return #print re.search(self.agencyname_regex, response).group(1) if re.search(self.agencyname_regex, response): agencyname=re.search(self.agencyname_regex, response).group(1) if agencyname == '经纪人':return else: return if re.search(self.username_regex, response): username=re.search(self.username_regex, response).group(1) self.fd['owner_name'] = username else: self.fd['owner_name'] = None owner_phone = soup('img') self.fd['owner_phone'] = '' for phone in owner_phone: if phone['src'].find('http://image.58.com/showphone.aspx') != -1: self.fd['owner_phone'] = phone['src'] #没有联系方式 return if not self.fd['owner_phone']:return if soup.find('div',{"class":'other'}): posttime = soup.find('div',{"class":'other'}).contents[0] posttime = re.sub('\n|\r| |\t','',posttime.replace(" ", " ")) posttime = posttime.replace('发布时间:','').replace(' 浏览','') else: posttime = '' print posttime if not posttime: return elif posttime.find('-') !=-1: s = datetime.datetime(int(posttime.split('-')[0]),int(posttime.split('-')[1],),int(posttime.split('-')[2])) posttime = int(time.mktime(s.timetuple())) elif posttime.find('分钟') !=-1: n = int(posttime.replace('分钟前',''))*60 posttime = int(time.time() - n) elif posttime.find('小时') !=-1: n = int(posttime.replace('小时前',''))*60*60 posttime = int(time.time() - n) self.fd['posttime'] = posttime if (time.time() - self.fd['posttime']) > 3600*24*7: return print "++++++++++++++++" print time.strftime('%Y 
%m %d', time.localtime(self.fd['posttime'])) self.fd['house_floor'] = 0 self.fd['house_topfloor'] = 0 if re.search(self.house_totalarea_req_regex, detail_mer_str): house_totalarea_min=re.search(self.house_totalarea_req_regex, detail_mer_str).group(1) house_totalarea_max=re.search(self.house_totalarea_req_regex, detail_mer_str).group(2) self.fd['house_totalarea'] = house_totalarea_min self.fd['house_totalarea_max'] = house_totalarea_max self.fd['house_totalarea_min'] = house_totalarea_min else: if re.search(self.house_totalarea_regex, detail_mer_str): house_totalarea=re.search(self.house_totalarea_regex, detail_mer_str).group(1) self.fd['house_totalarea'] = house_totalarea self.fd['house_totalarea_max'] = house_totalarea self.fd['house_totalarea_min'] = house_totalarea else: self.fd['house_totalarea'] = 0 self.fd['house_totalarea_max'] = 0 self.fd['house_totalarea_min'] = 0 #类型 self.fd['house_type'] = housetype(detail_mer_str) house_price = detail_mer.em.string if house_price: house_price = house_price.replace('元','') if house_price.find("以上") != -1: self.fd['house_price_max'] = 0 self.fd['house_price_min'] = house_price.replace('以上','') self.fd['house_price'] = house_price.replace('以上','') elif house_price.find("以下") != -1: self.fd['house_price_max'] = house_price.replace('以下','') self.fd['house_price_min'] = 0 self.fd['house_price'] = house_price.replace('以下','') elif house_price.find("-") != -1: self.fd['house_price_max'] = house_price.split('-')[1] self.fd['house_price_min'] = house_price.split('-')[0] self.fd['house_price'] = house_price.split('-')[0] else: self.fd['house_price_max'] = 0 self.fd['house_price_min'] = 0 self.fd['house_price'] = 0 else: self.fd['house_price_max'] = 0 self.fd['house_price_min'] = 0 self.fd['house_price'] = 0 if re.search(self.house_room_regex, detail_mer_str): house_room=re.search(self.house_room_regex, detail_mer_str).group(1) self.fd['house_room'] = house_room self.fd['house_room1'] = house_room else: self.fd['house_room'] = '0' 
self.fd['house_room1'] = '0' self.fd['house_hall'] = '0' self.fd['house_toilet'] = '0' self.fd['house_toilet'] = '0' if re.search(self.house_title_regex, response): house_title=re.search(self.house_title_regex, response).group(1) self.fd['house_title'] = house_title else: self.fd['house_title'] = '' #描述 detail_box = soup.find('div',{'class':'maincon'}) if detail_box: house_desc = str(detail_box) self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc) else: self.fd['house_desc'] = None #小区名 if re.search(self.house_addr_regex, detail_mer_str): house_addr = re.search(self.house_addr_regex, detail_mer_str).group(1) self.fd['house_addr'] = house_addr self.fd['borough_name'] = house_addr else: self.fd['house_addr'] = '' self.fd['borough_name'] = '' #区域 #print detail_mer area_box = detail_mer.find(text="地段:").parent.parent area_a = area_box('a') if area_a and len(area_a)>1: self.fd['cityarea'] = area_a[0].string self.fd['section'] = area_a[1].string elif area_a and len(area_a)==1: self.fd['cityarea'] = area_a[0].string self.fd['section'] = None else: self.fd['cityarea'] = None self.fd['section'] = None self.fd['house_age'] = 0 #朝向 self.fd['house_toward'] = 0 self.fd['house_fitment'] = 0
def sell(self,url):
    """Download the listing at *url* and fill ``self.fd`` (``house_flag`` 1).

    Aborts with a bare ``raise`` when ``self.mayGetIt(response)`` is true
    (after resetting ``self.fd`` to ``{}``), when the detail block does not
    contain "个人房源", when no phone image is available, or when the city
    name cannot be found.

    NOTE(review): a bare ``raise`` with no active exception does not raise a
    well-defined exception type; it is kept unchanged because callers
    presumably depend on an exception to skip the page -- confirm and
    replace with a dedicated exception class.
    """
    def int_field(pattern, default=0):
        # First capture of *pattern* in the page as an int, else *default*.
        found = re.search(pattern, response)
        return int(found.group(1)) if found else default

    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd={}
        raise
    tree = etree.HTML(response)
    soup = BeautifulSoup(response)
    self.fd['house_flag'] = 1
    self.fd['house_belong'] = 0
    self.fd['owner_phone'] = ''
    self.fd['house_area_max'] = 0
    self.fd['house_price_max'] = ''

    detail_mer = soup.find('div',{'class':'detail_mer'})
    # Not a private listing -> abort.
    if u"个人房源" not in str(detail_mer):raise

    Dname = detail_mer.find('span',{'class':'Dname'})
    self.fd['owner_name'] = str(Dname.string) if Dname else None

    # Phone number image.  The previous test `if str(span).find('src='):`
    # was truthy even for -1 ("not found"), so the "no image" branch never
    # ran and span.img['src'] crashed when the span had no <img>.
    phone_span = detail_mer.find('span',{'class':'ganji_phone_call_class'})
    phone_pic = None
    if phone_span:
        phone_img = phone_span.img
        phone_src = phone_img.get('src') if phone_img is not None else None
        if phone_src:
            phone_pic = 'http://'+urlparse(url)[1]+phone_src
    self.fd['owner_phone_pic'] = phone_pic
    # No contact information -> abort.
    if not self.fd['owner_phone_pic']:raise

    city = re.search("<span class=\"city\"><a .*?>(.*?)</a>", response)
    if city:
        self.fd['cityname'] = city.group(1)
    else:
        raise

    floors = re.search(self.house_floor_regex, response)
    if floors:
        self.fd['house_floor'] = int(floors.group(1))
        self.fd['house_topfloor'] = int(floors.group(2))
    else:
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0

    self.fd['house_area'] = int_field(self.house_totalarea_regex)

    # Type (6 when the page does not state one).
    kind = re.search(self.house_type_regex, response)
    self.fd['house_type'] = housetype(kind.group(1)) if kind else 6

    price = re.search(self.house_price_regex, response)
    if price:
        house_price = price.group(1)
        if house_price=="面议":
            # "Negotiable" is stored as 0.
            house_price = 0
        self.fd['house_price'] = int(house_price)
    else:
        self.fd['house_price'] = 0

    self.fd['house_room'] = int_field(self.house_room_regex)
    self.fd['house_hall'] = int_field(self.house_hall_regex)
    self.fd['house_toilet'] = int_field(self.house_toilet_regex)
    self.fd['house_veranda'] = int_field(self.house_veranda_regex)

    # Title: a missing or empty <h1> used to raise IndexError/AttributeError;
    # it now yields ''.
    title_nodes = CSSSelector("div.detail_title h1")(tree)
    house_title = (title_nodes[0].text or '').strip() if title_nodes else ''
    self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")

    # Description: second <p> of the detail box, markup and boilerplate removed.
    detail_box = soup.find('div',{'class':'detail_box'})
    paragraphs = detail_box('p') if detail_box else []
    if len(paragraphs) > 1:
        house_desc = str(paragraphs[1])
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
    else:
        self.fd['house_desc'] = ""

    d_i = soup.find('ul',{'class':'d_i'})
    # Residential-complex name: try the regex on the raw page first, then
    # fall back to the "小区: " row of the detail list.
    # NOTE(review): the source indentation was lost; the address lookup is
    # assumed to be nested under the xiaoqu match.
    xiaoqu = re.search(self.xiaoqu_regex, response)
    if xiaoqu:
        self.fd['borough_name'] = xiaoqu.group(1)
        address = re.search(self.address_regex, response)
        if address:
            self.fd['house_addr'] = address.group(1)
    else:
        borough_label = d_i.find(text="小区: ") if d_i is not None else None
        if borough_label:
            borough_name = borough_label.parent.find("a")
            if borough_name:
                self.fd['borough_name'] = str(borough_name.string)
            else:
                self.fd['borough_name'] = None
            # Address: the node following the complex link.
            if borough_name and borough_name.nextSibling:
                house_addr = borough_name.nextSibling.string
                self.fd['house_addr'] = re.sub("\(|\)| ","",house_addr)
            else:
                self.fd['house_addr'] = ""
        else:
            fallback = re.search(self.borough_name_regex, response)
            if fallback:
                self.fd['borough_name'] = re.sub("\(.*\)| ","",fallback.group(1))

    # Region: links in the "区域: " row; a missing row yields empty strings
    # instead of raising AttributeError.
    region_label = d_i.find(text="区域: ") if d_i is not None else None
    area_a = region_label.parent('a') if region_label else []
    self.fd['house_region'] = str(area_a[0].string) if len(area_a) >= 1 else ""
    self.fd['house_section'] = str(area_a[1].string) if len(area_a) > 1 else ""

    # The captured value is subtracted from the current year.
    age = re.search(self.house_age_regex, response)
    if age:
        this_year = int(time.strftime('%Y', time.localtime()))
        self.fd['house_age'] = this_year - int(age.group(1))
    else:
        self.fd['house_age'] = 0

    # Orientation
    facing = re.search(self.house_toward_regex, response)
    self.fd['house_toward'] = toward(facing.group(1)) if facing else 0

    decoration = re.search(self.house_fitment_regex, response)
    self.fd['house_fitment'] = fitment(decoration.group(1)) if decoration else 2
def rent(self,url):
    """Download the listing at *url* and fill ``self.fd`` (``house_flag`` 2).

    Returns early when ``self.mayGetIt(response)`` is true (``self.fd`` is
    reset to ``{}``), when the agency regex does not capture '个人房源', when
    no phone image URL is found, or when the post time is missing,
    unparseable, or older than 7 days.
    """
    def first_group(pattern, text, default):
        # First capture of *pattern* in *text*, or *default* on no match.
        found = re.search(pattern, text)
        return found.group(1) if found else default

    self.fd['house_flag'] = 2
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd={}
        return
    soup = BeautifulSoup(response)
    detail_mer = soup.find('ul',{'class':'info'})
    # NOTE(review): '\t\r' only matches a tab immediately followed by CR;
    # possibly '\t|\r' was intended -- left unchanged.
    detail_mer_str = re.sub("\n|\t\r| ","",str(detail_mer))

    # Not a private listing -> return.
    agency = re.search(self.agencyname_regex, response)
    if agency is None or agency.group(1) != '个人房源':
        return

    self.fd['owner_name'] = first_group(self.username_regex, response, None)

    # The phone number is served as an image; keep the last matching URL.
    # .get() avoids a KeyError on <img> tags without a src attribute.
    self.fd['owner_phone'] = ''
    for img in soup('img'):
        src = img.get('src') or ''
        if src.find('58.com/showphone.aspx') != -1:
            self.fd['owner_phone'] = src
    # No contact information -> return.
    if not self.fd['owner_phone']:
        return

    other_box = soup.find('div',{"class":'other'})
    if other_box and other_box.contents:
        posttime = other_box.contents[0]
        posttime = re.sub('\n|\r| |\t','',posttime)
        posttime = posttime.replace('发布时间:','').replace(' 浏览','')
    else:
        posttime = ''
    if not posttime:
        return
    if '-' in posttime:
        # Absolute date: year-month-day.
        parts = posttime.split('-')
        stamp = datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
        posttime = int(time.mktime(stamp.timetuple()))
    elif '分钟' in posttime:
        # "N minutes ago".
        posttime = int(time.time() - int(posttime.replace('分钟前','')) * 60)
    elif '小时' in posttime:
        # "N hours ago".
        posttime = int(time.time() - int(posttime.replace('小时前','')) * 60 * 60)
    else:
        # Unknown format: previously the string fell through and the age
        # check below raised TypeError.  Treat it like a missing post time.
        return
    self.fd['posttime'] = posttime
    if (time.time() - self.fd['posttime']) > 3600*24*7:
        return

    self.fd['house_floor'] = first_group(self.house_floor_regex, detail_mer_str, None)
    self.fd['house_topfloor'] = first_group(self.house_topfloor_regex, detail_mer_str, None)
    self.fd['house_totalarea'] = first_group(self.house_totalarea_regex, detail_mer_str, None)

    # Type
    self.fd['house_type'] = housetype(detail_mer_str)
    # A missing <em> used to raise AttributeError; the price is now None.
    price_tag = detail_mer.em
    self.fd['house_price'] = price_tag.string if price_tag is not None else None

    self.fd['house_room'] = first_group(self.house_room_regex, detail_mer_str, '0')
    self.fd['house_hall'] = first_group(self.house_hall_regex, detail_mer_str, '0')
    self.fd['house_toilet'] = first_group(self.house_toilet_regex, detail_mer_str, '0')

    title = re.search(self.house_title_regex, response)
    if title:
        self.fd['house_title'] = title.group(1).replace("(求购)","").replace("(求租)","").replace("(出售)","")
    else:
        self.fd['house_title'] = ''

    # Description: markup, line breaks and the site boilerplate are stripped.
    detail_box = soup.find('div',{'class':'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
    else:
        self.fd['house_desc'] = None

    # Residential-complex name, with parenthesised text and tags removed.
    borough = re.search(self.borough_name_regex, detail_mer_str)
    if borough:
        borough_name = borough.group(1)
        try:
            self.fd['borough_name'] = re.sub("\(.*\)|<.*?>","",borough_name)
        except Exception:
            # Best effort: keep the raw capture if the clean-up fails.
            self.fd['borough_name'] = borough_name
    else:
        self.fd['borough_name'] = ''

    # Region: links next to the "区域:" label.
    area = detail_mer.find(text=u"区域:")
    area_a = area.parent.parent('a') if area else []
    self.fd['cityarea'] = area_a[0].string if len(area_a) >= 1 else ""
    self.fd['section'] = area_a[1].string if len(area_a) > 1 else ""

    self.fd['house_age'] = first_group(self.house_age_regex, response, None)

    # Orientation, decoration and deposit are derived from the whole info block.
    self.fd['house_toward'] = toward(detail_mer_str)
    self.fd['house_fitment'] = fitment(detail_mer_str)
    self.fd['house_deposit'] = deposit(detail_mer_str)
def ChuShou(self,url): self.fd['city'] = '' self.fd['house_flag'] = 1 self.fd['belong']="" request = urllib2.Request(url, None, self.header) response = urllib2.urlopen(request).read() tree = etree.HTML(response) soup =BeautifulSoup(response) detail_mer = soup.find('div',{'class':'detail_mer'}) #非个人房源 return if u"个人房源" not in str(detail_mer):return Dname = detail_mer.find('span',{'class':'Dname'}) if Dname: self.fd['owner_name'] = Dname.string else: self.fd['owner_name'] = None ganji_phone_call_class = detail_mer.find('span',{'class':'ganji_phone_call_class'}) if ganji_phone_call_class: self.fd['owner_phone'] = ganji_phone_call_class.contents[0] if str(ganji_phone_call_class).find('src='): self.fd['owner_phone'] = 'http://'+urlparse(url)[1]+ganji_phone_call_class.img['src'] else: self.fd['owner_phone'] = None else: self.fd['owner_phone'] = None #没有联系方式 return if not self.fd['owner_phone']:return if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response): cityname=re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1) self.fd['cityname'] = cityname else: return if re.search(self.house_floor_regex, response): house_floor=re.search(self.house_floor_regex, response).group(1) house_topfloor=re.search(self.house_floor_regex, response).group(2) self.fd['house_floor'] = house_floor self.fd['house_topfloor'] = house_topfloor else: self.fd['house_floor'] = None self.fd['house_topfloor'] = None if re.search(self.house_totalarea_regex, response): house_totalarea=re.search(self.house_totalarea_regex, response).group(1) self.fd['house_totalarea'] = house_totalarea else: self.fd['house_totalarea'] = None #类型 if re.search(self.house_type_regex, response): house_type=re.search(self.house_type_regex, response).group(1) self.fd['house_type'] = housetype(house_type) else: self.fd['house_type'] = None if re.search(self.house_price_regex, response): house_price=re.search(self.house_price_regex, response).group(1) self.fd['house_price'] = house_price else: 
self.fd['house_price'] = None posttime=CSSSelector('span.pub_time')(tree)!=None and CSSSelector('span.pub_time')(tree)[0].text.strip() or None if posttime: Y=int(time.strftime('%Y', time.localtime())) M=int(posttime.split(' ')[0].split('-')[0]) D=int(posttime.split(' ')[0].split('-')[1]) s = datetime.datetime(Y,M,D,0,0) posttime=int(time.mktime(s.timetuple())) self.fd['posttime'] =posttime else: self.fd['posttime'] =None if re.search(self.house_room_regex, response): house_room=re.search(self.house_room_regex, response).group(1) self.fd['house_room'] = house_room else: self.fd['house_room'] = '0' if re.search(self.house_hall_regex, response): house_hall=re.search(self.house_hall_regex, response).group(1) self.fd['house_hall'] = house_hall else: self.fd['house_hall'] = '0' if re.search(self.house_toilet_regex, response): house_toilet=re.search(self.house_toilet_regex, response).group(1) self.fd['house_toilet'] = house_toilet else: self.fd['house_toilet'] = '0' house_title=CSSSelector("div.detail_title h1")(tree)[0] !=None and CSSSelector("div.detail_title h1")(tree)[0].text.strip() or None self.fd['house_title'] = house_title #描述 detail_box = soup.find('div',{'class':'detail_box'}) if detail_box: house_desc = str(detail_box('p')[1]) self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc) else: self.fd['house_desc'] = None d_i = soup.find('ul',{'class':'d_i'}) #小区名 #先处理JS if re.search(self.xiaoqu_regex, response): borough_name=re.search(self.xiaoqu_regex, response).group(1) self.fd['borough_name'] = borough_name print borough_name if re.search(self.address_regex, response): house_addr=re.search(self.address_regex, response).group(1) self.fd['house_addr'] = house_addr else: if d_i.find(text="小区: "): borough_box = d_i.find(text="小区: ").parent borough_name = borough_box.find("a") if borough_name: self.fd['borough_name'] = borough_name.string else: self.fd['borough_name'] = None #地址 if borough_name and borough_name.nextSibling: house_addr = 
borough_name.nextSibling.string self.fd['house_addr'] = re.sub("\(|\)| ","",house_addr) else: self.fd['house_addr'] = None else: if re.search(self.borough_name_regex, response): borough_name=re.search(self.borough_name_regex, response).group(1) self.fd['borough_name'] = re.sub("\(.*\)| ","",borough_name) #区域 area_box = d_i.find(text="区域: ").parent area_a = area_box('a') if area_a and len(area_a)>1: self.fd['cityarea'] = area_a[0].string self.fd['section'] = area_a[1].string elif area_a and len(area_a)==1: self.fd['cityarea'] = area_a[0].string self.fd['section'] = None else: self.fd['cityarea'] = None self.fd['section'] = None if re.search(self.house_age_regex, response): house_age=re.search(self.house_age_regex, response).group(1) self.fd['house_age'] = house_age else: self.fd['house_age'] = None #朝向 if re.search(self.house_toward_regex, response): house_toward=re.search(self.house_toward_regex, response).group(1) self.fd['house_toward'] = toward(house_toward) else: self.fd['house_toward'] = None if re.search(self.house_fitment_regex, response): house_fitment=re.search(self.house_fitment_regex, response).group(1) self.fd['house_fitment'] = fitment(house_fitment) else: self.fd['house_fitment'] = 2
def require(self,url):
    """Download the listing at *url* and fill ``self.fd`` (``house_flag`` 4).

    ``house_city`` is the host of *url* with '.58.com' removed, mapped
    through ``citynameDict_sf`` when an entry exists.  Returns early when
    ``self.mayGetIt(response)`` is true (``self.fd`` is reset to ``{}``),
    when the agency regex does not match or captures '经纪人' (in which case
    ``is_ok`` is set to False), when no phone image URL is found, or when
    the post time is missing, unparseable, or older than 7 days.
    """
    self.fd['house_flag'] = 4
    for key in ('house_floor', 'house_topfloor', 'house_age', 'house_toward',
                'house_fitment', 'house_deposit', 'house_totalarea_max',
                'house_totalarea_min', 'house_totalarea'):
        self.fd[key] = 0

    hc = urlparse(url)[1].replace('.58.com',"")
    self.fd['house_city'] = citynameDict_sf.get(hc) or hc

    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd={}
        return
    soup = BeautifulSoup(response)
    detail_mer = soup.find('ul',{'class':'info'})
    detail_mer_str = str(detail_mer).replace(" ", "")

    # Not a private listing -> return.
    agency = re.search(self.agencyname_regex, response)
    if agency is None:
        return
    if agency.group(1) == '经纪人':
        self.fd['is_ok'] = False
        return

    owner = re.search(self.username_regex, response)
    self.fd['owner_name'] = owner.group(1) if owner else ""

    # The phone number is served as an image; keep the last matching URL.
    # .get() avoids a KeyError on <img> tags without a src attribute.
    self.fd['owner_phone_pic'] = ''
    for img in soup('img'):
        src = img.get('src') or ''
        if src.find('http://image.58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = src
    # No contact information -> return.
    if not self.fd['owner_phone_pic']:
        return

    other_box = soup.find('div',{"class":'other'})
    if other_box and other_box.contents:
        posttime = other_box.contents[0]
        posttime = re.sub('\n|\r| |\t','',posttime.replace(" ", " "))
        posttime = posttime.replace('发布时间:','').replace(' 浏览','')
    else:
        posttime = ''
    if not posttime:
        return
    if '-' in posttime:
        # Absolute date: year-month-day.
        parts = posttime.split('-')
        stamp = datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
        posttime = int(time.mktime(stamp.timetuple()))
    elif '分钟' in posttime:
        # "N minutes ago".
        posttime = int(time.time() - int(posttime.replace('分钟前','')) * 60)
    elif '小时' in posttime:
        # "N hours ago".
        posttime = int(time.time() - int(posttime.replace('小时前','')) * 60 * 60)
    else:
        # Unknown format: previously the string fell through and the age
        # check below raised TypeError.  Treat it like a missing post time.
        return
    self.fd['house_posttime'] = posttime
    if (time.time() - self.fd['house_posttime']) > 3600*24*7:
        return

    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0

    # Requested area: a min/max range if present, else a single value, else 0.
    area_range = re.search(self.house_totalarea_req_regex, detail_mer_str)
    if area_range:
        self.fd['house_area'] = int(area_range.group(1))
        self.fd['house_area_max'] = int(area_range.group(2))
    else:
        area_single = re.search(self.house_totalarea_regex, detail_mer_str)
        area_value = int(area_single.group(1)) if area_single else 0
        self.fd['house_area'] = area_value
        self.fd['house_area_max'] = area_value

    # Type
    self.fd['house_type'] = housetype(detail_mer_str)

    # Price: "面议" (negotiable) counts as 0; otherwise "X以上", "X以下" or
    # "A-B"; anything else -> 0.  A missing <em> used to raise
    # AttributeError and is now treated as no price.
    price_tag = detail_mer.em
    house_price = price_tag.string if price_tag is not None else None
    if house_price=="面议":
        house_price = "0"
    price_min = price_max = price = 0
    if house_price:
        house_price = house_price.replace('元','')
        if house_price.find("以上") != -1:
            price_min = price = house_price.replace('以上','')
        elif house_price.find("以下") != -1:
            price_max = price = house_price.replace('以下','')
        elif house_price.find("-") != -1:
            bounds = house_price.split('-')
            price_min = price = bounds[0]
            price_max = bounds[1]
    self.fd['house_price_max'] = price_max
    self.fd['house_price_min'] = price_min
    self.fd['house_price'] = price

    rooms = re.search(self.house_room_regex, detail_mer_str)
    room_count = int(rooms.group(1)) if rooms else 0
    self.fd['house_room'] = room_count
    self.fd['house_room1'] = room_count
    # Hall and toilet counts are not parsed here.
    # NOTE(review): the source indentation was lost; these two defaults may
    # originally have been set only when the room regex failed.
    self.fd['house_hall'] = 0
    self.fd['house_toilet'] = 0

    title = re.search(self.house_title_regex, response)
    if title:
        self.fd['house_title'] = title.group(1).replace("(求购)","").replace("(求租)","").replace("(出售)","")
    else:
        self.fd['house_title'] = ''

    # Description: markup, line breaks and the site boilerplate are stripped.
    detail_box = soup.find('div',{'class':'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
    else:
        self.fd['house_desc'] = None

    # Region / section from the two links captured by house_region_regex.
    region = re.search(self.house_region_regex, detail_mer_str)
    if region:
        house_addr = region.group(1)
        ha = re.search("<a.*>(.*)</a> <a.*>(.*)</a>",house_addr)
        # Previously a bare try/except swallowed the AttributeError raised
        # when the two-link pattern did not match; nothing is stored then.
        if ha is not None:
            self.fd['house_region'] = ha.group(1)
            self.fd['house_section'] = ha.group(2)
    else:
        self.fd['house_addr'] = ''
        self.fd['borough_name'] = ''
        self.fd['house_region'] = ""

    # Region links next to the "地段:" label.
    area = detail_mer.find(text="地段:")
    area_a = area.parent.parent('a') if area else []
    self.fd['cityarea'] = area_a[0].string if len(area_a) >= 1 else None
    self.fd['section'] = area_a[1].string if len(area_a) > 1 else None

    self.fd['house_age'] = 0
    # Orientation
    self.fd['house_toward'] = 0
    self.fd['house_fitment'] = 0
def buy(self,url):
    """Download the listing at *url* and fill ``self.fd`` (``house_flag`` 3).

    ``house_city`` is the host of *url* with '.58.com' removed, mapped
    through ``citynameDict_sf`` when an entry exists.  Returns early when
    ``self.mayGetIt(response)`` is true (``self.fd`` is reset to ``{}``),
    when the agency regex does not capture '个人房源', when no phone image
    URL is found, or when the post time is missing, unparseable, or older
    than 7 days.
    """
    self.fd['house_flag'] = 3
    hc = urlparse(url)[1].replace('.58.com',"")
    self.fd['house_city'] = citynameDict_sf.get(hc) or hc

    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd={}
        return
    soup = BeautifulSoup(response)
    detail_mer = soup.find('ul',{'class':'info'})
    detail_mer_str = str(detail_mer).replace(" ", "")

    # Not a private listing -> return.
    agency = re.search(self.agencyname_regex, response)
    if agency is None or agency.group(1) != '个人房源':
        return

    owner = re.search(self.username_regex, response)
    self.fd['owner_name'] = owner.group(1) if owner else ""

    # The phone number is served as an image; keep the last matching URL.
    # .get() avoids a KeyError on <img> tags without a src attribute.
    self.fd['owner_phone_pic'] = ''
    for img in soup('img'):
        src = img.get('src') or ''
        if src.find('http://image.58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = src
    # No contact information -> return.
    if not self.fd['owner_phone_pic']:
        return

    other_box = soup.find('div',{"class":'other'})
    if other_box and other_box.contents:
        posttime = other_box.contents[0]
        posttime = re.sub('\n|\r| |\t','',posttime)
        posttime = posttime.replace('发布时间:','').replace(' 浏览','')
    else:
        posttime = ''
    if not posttime:
        return
    if '-' in posttime:
        # Absolute date: year-month-day.
        parts = posttime.split('-')
        stamp = datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
        posttime = int(time.mktime(stamp.timetuple()))
    elif '分钟' in posttime:
        # "N minutes ago".
        posttime = int(time.time() - int(posttime.replace('分钟前','')) * 60)
    elif '小时' in posttime:
        # "N hours ago".
        posttime = int(time.time() - int(posttime.replace('小时前','')) * 60 * 60)
    else:
        # Unknown format: previously the string fell through and the age
        # check below raised TypeError.  Treat it like a missing post time.
        return
    self.fd['house_posttime'] = posttime
    if (time.time() - self.fd['house_posttime']) > 3600*24*7:
        return

    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0

    # Requested area: a min/max range if present, else a single value, else 0.
    area_range = re.search(self.house_totalarea_req_regex, detail_mer_str)
    if area_range:
        self.fd['house_area'] = int(area_range.group(1))
        self.fd['house_area_max'] = int(area_range.group(2))
    else:
        area_single = re.search(self.house_totalarea_regex, detail_mer_str)
        area_value = int(area_single.group(1)) if area_single else 0
        self.fd['house_area'] = area_value
        self.fd['house_area_max'] = area_value

    # Type
    self.fd['house_type'] = housetype(detail_mer_str)

    # Price.  "面议" (negotiable) or a missing price is stored as 0.
    price_tag = detail_mer.em
    house_price = price_tag.string if price_tag is not None else None
    if not house_price or house_price=="面议":
        house_price = "0"
    if house_price.find('-') != -1:
        bounds = house_price.split('-')
        first, second = int(bounds[0]), int(bounds[1])
        # The old code stored bounds[0] as max and bounds[1] as min, the
        # opposite of the sibling methods; order the two values so min/max
        # are right whichever way the page writes the range.
        self.fd['house_price_max'] = max(first, second)
        self.fd['house_price_min'] = min(first, second)
        self.fd['house_price'] = first
    else:
        # The old code assigned house_price_min twice and never set
        # house_price_max for a single price.
        single_price = int(house_price)
        self.fd['house_price_max'] = single_price
        self.fd['house_price_min'] = single_price
        self.fd['house_price'] = single_price

    rooms = re.search(self.house_room_regex, detail_mer_str)
    room_count = int(rooms.group(1)) if rooms else 0
    self.fd['house_room'] = room_count
    self.fd['house_room1'] = room_count
    # Hall and toilet counts are not parsed here.
    # NOTE(review): the source indentation was lost; these two defaults may
    # originally have been set only when the room regex failed.
    self.fd['house_hall'] = 0
    self.fd['house_toilet'] = 0

    title = re.search(self.house_title_regex, response)
    if title:
        self.fd['house_title'] = title.group(1).replace("(求购)","").replace("(求租)","").replace("(出售)","")
    else:
        self.fd['house_title'] = ''

    # Description: markup, line breaks and the site boilerplate are stripped.
    detail_box = soup.find('div',{'class':'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
    else:
        self.fd['house_desc'] = ""

    # Address
    addr = re.search(self.house_addr_regex, detail_mer_str)
    self.fd['house_addr'] = addr.group(1) if addr else ''

    # Region: links of the first <li> whose text contains "区域:".  Both keys
    # default to "" so they are always present, even when that <li> has an
    # unexpected number of links or is missing altogether.
    self.fd['house_region'] = ""
    self.fd['house_section'] = ""
    for li in PyQuery(unicode(repr(detail_mer),"UTF-8"))("li"):
        if "区域:" in PyQuery(li).text():
            ls = PyQuery(li)("a")
            if len(ls)==1:
                self.fd['house_region'] = PyQuery(ls.eq(0)).text()
            elif len(ls)==2:
                self.fd['house_region'] = PyQuery(ls.eq(0)).text()
                self.fd['house_section'] = PyQuery(ls.eq(1)).text()
            break

    self.fd['house_age'] = 0
    # Orientation
    self.fd['house_toward'] = 0
    self.fd['house_fitment'] = 0
def sell(self, url):
    """Scrape one 58.com for-sale listing page into ``self.fd``.

    The page at *url* is downloaded and parsed; city, owner, post time,
    floor, area, type, price, room counts, title, description, estate
    name, address, region, building age, orientation and fitment are
    stored under the ``house_*`` / ``owner_*`` / ``borough_name`` keys of
    ``self.fd``.

    Returns ``None`` in every case.  The method stops early when:
      * ``self.mayGetIt(response)`` is true (``self.fd`` is reset to ``{}``);
      * no agency label can be found on the page;
      * the agency label is not the personal-listing label
        (``self.fd['is_ok']`` is set to ``False``);
      * no phone-number image is present on the page.

    Network and parsing errors are not caught here and propagate to the
    caller.
    """

    def first_int(pattern, text):
        """Return int(group(1)) of the first match of pattern, else 0."""
        found = re.search(pattern, text)
        return int(found.group(1)) if found else 0

    # City: sub-domain of the URL, mapped through citynameDict_sf when known.
    hc = urlparse(url)[1].replace('.58.com', "")
    self.fd['house_city'] = citynameDict_sf.get(hc) or hc

    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    # NOTE(review): mayGetIt() is defined elsewhere; a true result discards
    # everything collected so far -- confirm its exact meaning.
    if self.mayGetIt(response):
        self.fd = {}
        return

    soup = BeautifulSoup(response)
    self.fd['house_flag'] = 1
    detail_mer = soup.find('ul', {'class': 'info'})
    detail_mer_str = str(detail_mer).replace(" ", "")

    # Only personal (non-agency) listings are kept.
    agency = re.search(self.agencyname_regex, response)
    if not agency:
        return
    if agency.group(1) != '个人房源':
        self.fd['is_ok'] = False
        return

    owner = re.search(self.username_regex, response)
    self.fd['owner_name'] = owner.group(1) if owner else None

    # The phone number is published as an image; keep the last matching URL.
    # .get() is used because an <img> without a src attribute would
    # otherwise raise KeyError.
    self.fd['owner_phone_pic'] = ''
    for img in soup('img'):
        src = img.get('src') or ''
        if src.find('http://image.58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = src
    # No contact information: give up on this listing.
    if not self.fd['owner_phone_pic']:
        return

    # Post time: absolute date, "N minutes ago" or "N hours ago".
    other = soup.find('div', {"class": 'other'})
    if other:
        posttime = other.contents[0]
        posttime = re.sub('\n|\r| |\t', '', posttime)
        posttime = posttime.replace('发布时间:', '').replace(' 浏览', '')
    else:
        posttime = str(int(time.mktime(time.localtime(time.time()))))

    if not posttime:
        # Previously the current time was stored and then immediately
        # overwritten with the empty string by the assignment below.
        # Whole seconds are used to match the other branches.
        posttime = int(time.time())
    elif posttime.find('-') != -1:
        parts = posttime.split('-')
        day = datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
        posttime = int(time.mktime(day.timetuple()))
    elif posttime.find('分钟') != -1:
        posttime = int(time.time() - int(posttime.replace('分钟前', '')) * 60)
    elif posttime.find('小时') != -1:
        posttime = int(time.time() - int(posttime.replace('小时前', '')) * 60 * 60)
    # Any other format is stored unparsed, as before.
    self.fd['house_posttime'] = posttime

    self.fd['house_floor'] = first_int(self.house_floor_regex, detail_mer_str)
    self.fd['house_topfloor'] = first_int(self.house_topfloor_regex, detail_mer_str)
    self.fd['house_area'] = first_int(self.house_totalarea_regex, detail_mer_str)

    # Type
    self.fd['house_type'] = housetype(detail_mer_str)
    self.fd['house_price'] = int(detail_mer.em.string) if detail_mer.em else 0

    self.fd['house_room'] = first_int(self.house_room_regex, detail_mer_str)
    self.fd['house_hall'] = first_int(self.house_hall_regex, detail_mer_str)
    self.fd['house_toilet'] = first_int(self.house_toilet_regex, detail_mer_str)
    self.fd['house_veranda'] = first_int(self.house_veranda_regex, response)

    title = re.search(self.house_title_regex, response)
    if title:
        self.fd['house_title'] = title.group(1).replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    else:
        self.fd['house_title'] = ''

    # Description: markup, line breaks and the site's boilerplate sentence
    # are stripped.
    detail_box = soup.find('div', {'class': 'maincon'})
    if detail_box:
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", str(detail_box))
    else:
        self.fd['house_desc'] = ""

    # Estate name and address come from the <li> rows of the info list.
    # Both keys default to '' so they exist even when the row is missing.
    lis = PyQuery(unicode(detail_mer_str, "UTF-8"))("li")

    self.fd['borough_name'] = ''
    for li in lis:
        text = PyQuery(li).text()
        if "小区:" in text:
            xq = text.replace("小区:", "")
            if u"二手房信息" in xq:
                # Cut the trailing "(...)" link text; leave the name intact
                # when there is no "(" (a -1 index used to drop a character).
                cut = xq.find("(")
                if cut != -1:
                    xq = xq[:cut]
            self.fd['borough_name'] = xq
            break

    self.fd['house_addr'] = ''
    for li in lis:
        text = PyQuery(li).text()
        if "地址:" in text:
            end = text.find(u"(")
            if end == -1:
                end = len(text)
            self.fd['house_addr'] = text[text.find(":") + 1:end]
            break

    # Region / section: first and second link next to the region label.
    # Best effort: any failure while walking the tree yields empty strings.
    region = ""
    section = ""
    try:
        area_box = detail_mer.find(text="区域:").parent.parent
        area_a = area_box('a')
        if area_a:
            region = str(area_a[0].string)
            if len(area_a) > 1:
                section = str(area_a[1].string)
    except Exception:
        region = ""
        section = ""
    self.fd['house_region'] = region
    # The empty-links case used to write the misspelled key 'section'.
    self.fd['house_section'] = section

    # Age in years, computed from the matched year.
    built = re.search(self.house_age_regex, response)
    if built:
        self.fd['house_age'] = time.localtime().tm_year - int(built.group(1))
    else:
        self.fd['house_age'] = 0

    # Orientation and fitment
    self.fd['house_toward'] = toward(detail_mer_str)
    self.fd['house_fitment'] = fitment(detail_mer_str)
def QiuZu(self, url):
    """Scrape one 58.com listing page into ``self.fd`` with house_flag 3.

    NOTE(review): the name suggests a "wanted to rent" listing -- confirm.

    The page at *url* is downloaded and parsed; owner, post time, requested
    area range, type, price range, room count, title, description, address
    and region are stored in ``self.fd``.  Fields this listing kind does not
    carry (floor, age, orientation, fitment, deposit, hall, toilet) are
    stored as zero.

    Returns ``None`` in every case.  The method stops early when:
      * no agency label is found, or the label equals the agent label;
      * no phone-number image is present on the page;
      * the post time is missing or in an unrecognised format;
      * the listing is older than seven days.

    Network and parsing errors are not caught here and propagate to the
    caller.
    """
    self.fd["house_flag"] = 3
    # Defaults for fields that are never read from the page.
    for key in ("house_floor", "house_topfloor", "house_age", "house_toward",
                "house_fitment", "house_deposit", "house_totalarea_max",
                "house_totalarea_min", "house_totalarea"):
        self.fd[key] = 0

    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    soup = BeautifulSoup(response)
    detail_mer = soup.find("ul", {"class": "info"})
    detail_mer_str = str(detail_mer).replace(" ", "")

    # Listings without an agency label, or posted by an agent, are skipped.
    agency = re.search(self.agencyname_regex, response)
    if not agency or agency.group(1) == "经纪人":
        return

    owner = re.search(self.username_regex, response)
    self.fd["owner_name"] = owner.group(1) if owner else None

    # The phone number is published as an image; keep the last matching URL.
    # .get() is used because an <img> without a src attribute would
    # otherwise raise KeyError.
    self.fd["owner_phone"] = ""
    for img in soup("img"):
        src = img.get("src") or ""
        if src.find("http://image.58.com/showphone.aspx") != -1:
            self.fd["owner_phone"] = src
    # No contact information: give up on this listing.
    if not self.fd["owner_phone"]:
        return

    other = soup.find("div", {"class": "other"})
    if other:
        posttime = other.contents[0]
        posttime = re.sub("\n|\r| |\t", "", posttime.replace(" ", " "))
        posttime = posttime.replace("发布时间:", "").replace(" 浏览", "")
    else:
        posttime = ""
    print(posttime)

    if not posttime:
        return
    if posttime.find("-") != -1:
        parts = posttime.split("-")
        day = datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
        posttime = int(time.mktime(day.timetuple()))
    elif posttime.find("分钟") != -1:
        posttime = int(time.time() - int(posttime.replace("分钟前", "")) * 60)
    elif posttime.find("小时") != -1:
        posttime = int(time.time() - int(posttime.replace("小时前", "")) * 60 * 60)
    else:
        # Unrecognised format: the age check below needs a number and used
        # to fail with TypeError on the raw text.  Treat it like a missing
        # post time.
        return
    self.fd["posttime"] = posttime

    # Listings older than one week are ignored.
    if (time.time() - posttime) > 3600 * 24 * 7:
        return
    print("++++++++++++++++")
    print(time.strftime("%Y %m %d", time.localtime(posttime)))

    # Requested area: a min/max range when given, else a single value used
    # for both bounds, else zero.
    area_range = re.search(self.house_totalarea_req_regex, detail_mer_str)
    if area_range:
        area_min = area_range.group(1)
        area_max = area_range.group(2)
    else:
        single = re.search(self.house_totalarea_regex, detail_mer_str)
        area_min = area_max = single.group(1) if single else 0
    self.fd["house_totalarea"] = area_min
    self.fd["house_totalarea_max"] = area_max
    self.fd["house_totalarea_min"] = area_min

    # Type
    self.fd["house_type"] = housetype(detail_mer_str)

    # Price: "N and above", "N and below" or "A-B"; anything else is zero.
    # A missing <em> is handled like an empty price instead of raising.
    em = detail_mer.em
    house_price = em.string if em is not None else None
    price = price_min = price_max = 0
    if house_price:
        house_price = house_price.replace("元", "")
        if house_price.find("以上") != -1:
            price = price_min = house_price.replace("以上", "")
        elif house_price.find("以下") != -1:
            price = price_max = house_price.replace("以下", "")
        elif house_price.find("-") != -1:
            bounds = house_price.split("-")
            price = price_min = bounds[0]
            price_max = bounds[1]
    self.fd["house_price_max"] = price_max
    self.fd["house_price_min"] = price_min
    self.fd["house_price"] = price

    room = re.search(self.house_room_regex, detail_mer_str)
    rooms = room.group(1) if room else "0"
    self.fd["house_room"] = rooms
    self.fd["house_room1"] = rooms
    self.fd["house_hall"] = "0"
    self.fd["house_toilet"] = "0"

    title = re.search(self.house_title_regex, response)
    self.fd["house_title"] = title.group(1) if title else ""

    # Description: markup, line breaks and the site's boilerplate sentence
    # are stripped.
    detail_box = soup.find("div", {"class": "maincon"})
    if detail_box:
        self.fd["house_desc"] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", str(detail_box))
    else:
        self.fd["house_desc"] = None

    # Estate name: the address match is stored under both keys.
    addr = re.search(self.house_addr_regex, detail_mer_str)
    house_addr = addr.group(1) if addr else ""
    self.fd["house_addr"] = house_addr
    self.fd["borough_name"] = house_addr

    # Region / section: first and second link next to the section label.
    # A missing label used to raise AttributeError; it now yields None for
    # both, the same as a label with no links.
    label = detail_mer.find(text="地段:")
    area_a = label.parent.parent("a") if label is not None else []
    cityarea = None
    section = None
    if area_a:
        cityarea = area_a[0].string
        if len(area_a) > 1:
            section = area_a[1].string
    self.fd["cityarea"] = cityarea
    self.fd["section"] = section