def buy(self, url):
    """Scrape one 58.com listing page and store its fields in ``self.fd``.

    The method fetches ``url``, keeps only listings whose agency name is
    '个人房源' (posted by an individual), that expose a phone-number image
    and that were posted within the last seven days, then fills ``self.fd``
    with owner, time, area, price, room, title, description, address and
    region fields.

    Args:
        url: Address of the listing page. The part of the host preceding
            ``.58.com`` is used as the city key.

    Returns:
        None. Results are written to ``self.fd``. The method returns early,
        leaving ``self.fd`` only partially filled, when the listing is not
        from an individual, has no phone image, has a missing or
        unrecognised post time, or is older than seven days. When
        ``self.mayGetIt(response)`` is truthy ``self.fd`` is reset to ``{}``.
    """
    self.fd['house_flag'] = 3

    # City key: element 1 of the parsed URL with the ".58.com" suffix
    # removed, translated through citynameDict_sf when a mapping exists.
    hc = urlparse(url)[1].replace('.58.com', "")
    self.fd['house_city'] = citynameDict_sf.get(hc) or hc

    request = urllib2.Request(url, None, self.header)
    conn = urllib2.urlopen(request)
    try:
        response = conn.read()
    finally:
        # Release the connection even if read() fails.
        conn.close()

    # NOTE(review): mayGetIt presumably detects pages that must be skipped;
    # its semantics are defined outside this block.
    if self.mayGetIt(response):
        self.fd = {}
        return

    soup = BeautifulSoup(response)
    detail_mer = soup.find('ul', {'class': 'info'})
    detail_mer_str = str(detail_mer).replace(" ", "")

    # Listings not posted by an individual are skipped.
    agency_match = re.search(self.agencyname_regex, response)
    if not agency_match or agency_match.group(1) != '个人房源':
        return

    username_match = re.search(self.username_regex, response)
    self.fd['owner_name'] = username_match.group(1) if username_match else ""

    # The phone number is published as an image; the last matching <img>
    # wins. Tags without a src attribute are ignored instead of raising.
    self.fd['owner_phone_pic'] = ''
    for img in soup('img'):
        src = img.get('src') or ''
        if src.find('http://image.58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = src
    # No contact information: skip the listing.
    if not self.fd['owner_phone_pic']:
        return

    other_box = soup.find('div', {"class": 'other'})
    if other_box:
        posttime = other_box.contents[0]
        posttime = re.sub('\n|\r| |\t', '', posttime)
        posttime = posttime.replace('发布时间:', '').replace(' 浏览', '')
    else:
        posttime = ''

    # Convert the displayed post time to a Unix timestamp. Three textual
    # forms are understood: "Y-M-D", "<n>分钟前" (minutes ago) and
    # "<n>小时前" (hours ago).
    if not posttime:
        return
    elif posttime.find('-') != -1:
        date_parts = posttime.split('-')
        posted = datetime.datetime(int(date_parts[0]),
                                   int(date_parts[1]),
                                   int(date_parts[2]))
        posttime = int(time.mktime(posted.timetuple()))
    elif posttime.find('分钟') != -1:
        seconds_ago = int(posttime.replace('分钟前', '')) * 60
        posttime = int(time.time() - seconds_ago)
    elif posttime.find('小时') != -1:
        seconds_ago = int(posttime.replace('小时前', '')) * 60 * 60
        posttime = int(time.time() - seconds_ago)
    else:
        # Unrecognised format: treat it like a missing post time rather
        # than failing on the age arithmetic below with a string operand.
        return
    self.fd['house_posttime'] = posttime

    # Ignore listings older than seven days.
    if (time.time() - self.fd['house_posttime']) > 3600 * 24 * 7:
        return

    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0

    # Area: prefer the two-number range pattern, fall back to the
    # single-number pattern, else 0.
    area_range = re.search(self.house_totalarea_req_regex, detail_mer_str)
    if area_range:
        self.fd['house_area'] = int(area_range.group(1))
        self.fd['house_area_max'] = int(area_range.group(2))
    else:
        area_single = re.search(self.house_totalarea_regex, detail_mer_str)
        if area_single:
            self.fd['house_area'] = int(area_single.group(1))
            self.fd['house_area_max'] = int(area_single.group(1))
        else:
            self.fd['house_area'] = 0
            self.fd['house_area_max'] = 0

    # Type
    self.fd['house_type'] = housetype(detail_mer_str)

    # NOTE(review): raises AttributeError when the page has no
    # <ul class="info"> or that list contains no <em>; confirm whether the
    # caller relies on that.
    house_price = detail_mer.em.string
    if house_price == "面议":
        # '面议' is stored as price 0.
        house_price = "0"
    if house_price.find('-') != -1:
        # NOTE(review): the first number of the range is stored as the max
        # and the second as the min; this looks swapped but is kept as-is
        # until confirmed against the consumers of self.fd.
        price_parts = house_price.split('-')
        self.fd['house_price_max'] = int(price_parts[0])
        self.fd['house_price_min'] = int(price_parts[1])
        self.fd['house_price'] = int(price_parts[0])
    else:
        # A single price fills min, max and the headline price alike.
        self.fd['house_price_min'] = int(house_price)
        self.fd['house_price_max'] = int(house_price)
        self.fd['house_price'] = int(house_price)

    room_match = re.search(self.house_room_regex, detail_mer_str)
    room_count = int(room_match.group(1)) if room_match else 0
    self.fd['house_room'] = room_count
    self.fd['house_room1'] = room_count

    self.fd['house_hall'] = 0
    self.fd['house_toilet'] = 0

    title_match = re.search(self.house_title_regex, response)
    if title_match:
        self.fd['house_title'] = (title_match.group(1)
                                  .replace("(求购)", "")
                                  .replace("(求租)", "")
                                  .replace("(出售)", ""))
    else:
        self.fd['house_title'] = ''

    # Description: strip tags, line breaks, tabs and the site's boilerplate
    # sentence from the main content box.
    detail_box = soup.find('div', {'class': 'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", house_desc)
    else:
        self.fd['house_desc'] = ""

    # Community name
    addr_match = re.search(self.house_addr_regex, detail_mer_str)
    self.fd['house_addr'] = addr_match.group(1) if addr_match else ''

    # Region: default both keys to "" so they always exist, then take the
    # anchors of the first <li> whose text contains "区域:".
    self.fd['house_region'] = ""
    self.fd['house_section'] = ""
    lis = PyQuery(unicode(repr(detail_mer), "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li).text()
        if "区域:" in lit:
            ls = PyQuery(li)("a")
            if len(ls) == 1:
                self.fd['house_region'] = PyQuery(ls.eq(0)).text()
            elif len(ls) == 2:
                self.fd['house_region'] = PyQuery(ls.eq(0)).text()
                self.fd['house_section'] = PyQuery(ls.eq(1)).text()
            break

    self.fd['house_age'] = 0
    # Orientation
    self.fd['house_toward'] = 0
    self.fd['house_fitment'] = 0