def deal_price(self):
    """Copy flagged deal records from ``collection`` into ``Comm`` rows.

    Iterates every document of the module-level ``collection``. For each
    document whose ``fj_flag`` equals 1, a ``Comm(self.source)`` object is
    filled from the document and its ``insert_db()`` is called. Documents
    without ``fj_flag`` (or with another value) are ignored.

    Raises:
        KeyError: if a flagged document lacks ``fj_city``, ``CJ_CX``,
            ``CJ_CJDJ``, ``CJ_JZMJ``, ``CJ_CJRQ``, ``fj_name`` or
            ``fj_region``.
        ValueError, TypeError: if ``CJ_CJDJ`` or ``CJ_JZMJ`` cannot be
            converted to ``float``.
    """
    cursor = collection.find(no_cursor_timeout=True)
    # A cursor opened with no_cursor_timeout=True is not reaped by the
    # server, so close it explicitly even when a record raises part-way.
    try:
        for data in cursor:
            if data.get('fj_flag') != 1:
                continue

            second_price = Comm(self.source)
            second_price.city = data['fj_city']
            second_price.direction = data['CJ_CX']

            # Convert once and reuse for the total.
            unit_price = float(data['CJ_CJDJ'])
            area = float(data['CJ_JZMJ'])
            second_price.avg_price = unit_price
            second_price.area = area
            second_price.trade_date = data['CJ_CJRQ']
            second_price.total_price = unit_price * area
            second_price.district_name = data['fj_name']

            # Optional fields: only copied when present in the document.
            if 'CJ_ZH' in data:
                second_price.house_num = data['CJ_ZH']
            if 'CJ_SHBW' in data:
                second_price.room_num = data['CJ_SHBW']

            # The floor is best-effort: a missing or non-numeric value is
            # reported and the record is still inserted without it.
            try:
                second_price.floor = int(data['CJ_CS'])
            except Exception as e:
                print('楼层error', e)

            second_price.region = data['fj_region']
            second_price.insert_db()
    finally:
        cursor.close()
def comm_info(self, comm_url_list, city_url):
    """Crawl the deal-history pages of each community and store every deal.

    For each entry of ``comm_url_list`` the community URL is built by
    substituting the entry for ``'/esf/'`` in ``city_url`` and replacing
    ``'xq'`` with ``'fangjia'``. The first page supplies the community name
    and the page count (1 when no last-page link is found). Every row of
    every page is then parsed into a ``Comm(source)`` object and inserted
    with ``insert_db()``.

    A row that fails to parse is logged and skipped; it does not stop the
    crawl.

    Args:
        comm_url_list: iterable of community URL fragments.
        city_url: city base URL containing ``'/esf/'``.

    Raises:
        AttributeError: if the community-name marker is absent from the
            first page.
        IndexError: if a result page has no ``change-city`` span.
        Exception: whatever ``requests.get`` raises; requests are not
            retried or guarded here.
    """
    html_suffix = '.html'
    for comm_url in comm_url_list:
        url = city_url.replace('/esf/', comm_url)
        re_url = url.replace('xq', 'fangjia')
        res = requests.get(re_url, headers=self.headers)
        con = res.text

        co_name = re.search(r'wrap-head-name">(.*?)</div', con,
                            re.S | re.M).group(1)
        co_name = co_name.strip()

        # No last-page link means the community has a single page.
        page_match = re.search(r'(\d+)">尾页', con)
        page = int(page_match.group(1)) if page_match else 1

        # str.rstrip('.html') strips any trailing '.', 'h', 't', 'm', 'l'
        # characters and can eat into the URL slug; remove the exact
        # suffix instead.
        if re_url.endswith(html_suffix):
            base_url = re_url[:-len(html_suffix)]
        else:
            base_url = re_url

        for i in range(1, page + 1):
            page_url = base_url + "/?n=" + str(i)
            co_res = requests.get(page_url, headers=self.headers)
            co_html = etree.HTML(co_res.text)
            city = co_html.xpath("//span[@class='change-city']/text()")[0]
            romm_info_list = co_html.xpath("//div[@class='list-cont']/div")

            for room_info in romm_info_list:
                try:
                    room = Comm(source)
                    room.city = city
                    room.district_name = co_name

                    floor = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[1]/text()")[0]
                    room.floor = int(re.search(r'\d+', floor).group(0))

                    trade_date = room_info.xpath(
                        ".//span[@class='cj-data-num']/text()")[0]
                    if trade_date:
                        room.trade_date = datetime.datetime.strptime(
                            trade_date, "%Y-%m-%d")

                    total_price = room_info.xpath(
                        ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                    )[0]
                    # The leading integer of the displayed total is scaled
                    # by 10000 before storing.
                    room.total_price = int(
                        re.search(r'(\d+)', total_price,
                                  re.S | re.M).group(1)) * 10000

                    avg_price = room_info.xpath(
                        ".//span[@class='cj-data-num']/em/text()")[0]
                    room.avg_price = int(
                        re.search(r'(\d+)', avg_price,
                                  re.S | re.M).group(1))

                    room.direction = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[2]/text()")[0]

                    # NOTE(review): the whole text node is stored as the
                    # region; confirm whether only part of it is wanted.
                    area_text = room_info.xpath(".//p[1]/text()")[1]
                    room.region = area_text

                    size = re.search('建筑面积(.*?)平', area_text).group(1)
                    if size:
                        room.area = round(float(size), 2)

                    room.insert_db()
                except Exception as e:
                    log.error("{}解析房屋错误{}".format(page_url, e))
def into_mongo(coll):
    """Convert every document of ``coll`` into a ``Comm('澜斯')`` row.

    Each document is mapped onto a fresh ``Comm`` object, which is then
    passed to ``insert_db()``. A fresh object per document prevents values
    from an earlier document leaking into a later one when one of the
    best-effort sections below fails.

    Args:
        coll: collection object whose ``find(no_cursor_timeout=True)``
            returns an iterable, closable cursor of dict-like documents.

    Raises:
        KeyError: if a document lacks ``updatedate``, ``newdiskdecoration``,
            ``flevel`` or ``houseaddress``; these reads are not guarded.
    """
    results = coll.find(no_cursor_timeout=True)
    # A no_cursor_timeout cursor must be closed explicitly, including when
    # a document raises in the middle of the loop.
    try:
        for result in results:
            com = Comm('澜斯')

            # Some documents were observed without fj_city, so city and
            # region are best-effort.
            try:
                com.city = result['fj_city']        # city
                com.region = result['fj_region']    # region
            except Exception:
                log.error('城市或者区域没有')

            com.m_date = result['updatedate']             # source update date
            com.create_date = datetime.datetime.now()     # row creation time
            com.fitment = result['newdiskdecoration']     # decoration
            com.floor = result['flevel']                  # floor

            # Per the original author's note, a record missing any of
            # these fields is not inserted by insert_db(), so a failure
            # here is only logged and the insert is still attempted.
            try:
                com.district_name = result['fj_name']   # community name
                com.avg_price = result['unitprice']     # unit price
                com.total_price = result['usd']         # total price
                com.area = result['acreage']            # building area
                # Only the date part before 'T' is kept.
                com.trade_date = datetime.datetime.strptime(
                    result['signingdate'].split('T')[0], "%Y-%m-%d")
            except Exception as e:
                log.error(e)

            # Best-effort extraction of the unit and room numbers from the
            # address text ("<digits>号<digits>").
            houseaddress = result['houseaddress']
            try:
                res = re.search(r'(\d+)号(\d+)', houseaddress)
                com.unit_num = res.group(1)   # unit number
                com.room_num = res.group(2)   # room number
            except Exception:
                # Was ``.find(houseaddress)``, which printed -1 or raised
                # TypeError instead of formatting the message.
                print('无法匹配大盘单元号和室号,houseaddress={}'.format(
                    houseaddress))

            # The source data has nothing to map onto: direction, room,
            # hall, toilet, height, house_num.

            com.insert_db()
    finally:
        results.close()
def room(self, co_list, city_name):
    """Crawl the deal records of every community element in ``co_list``.

    For each community element the name, detail URL and region are read
    from the listing node. The detail page is fetched to find the
    deal-list URL, the page count is read from the ``共N页`` marker, and
    every record on every page is parsed into a ``Comm(source)`` object and
    inserted with ``insert_db()``.

    A community whose listing node or detail page cannot be read is
    skipped silently. A community with no page-count marker is logged and
    skipped. A record that fails to parse is logged and skipped.

    Args:
        co_list: iterable of listing nodes supporting ``xpath``.
        city_name: value stored in each record's ``city``.

    Note:
        Page requests are retried without limit or delay until one
        succeeds.
    """
    for co in co_list:
        try:
            co_name = co.xpath("./div[1]/a/text()")[0]
            co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
            region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
            # addr is not stored, but a listing without it is skipped by
            # the handler below; the lookup is kept to preserve that.
            addr = co.xpath("./div[3]/span[3]/@title")[0]
            detail = requests.get(co_url, headers=self.headers)
            html = etree.HTML(detail.text)
            room_url = "http:" + html.xpath(
                "//div[@class='tab-toolbar pr']//li/a/@href")[-1]
            page_index = requests.get(room_url, headers=self.headers)
        except Exception:
            continue

        page_match = re.search(r'共(\d+)页', page_index.text)
        if not page_match:
            log.info('小区无相关数据')
            continue
        page_num = int(page_match.group(1))

        # Everything from '#' onward is replaced by 'n'; the page number is
        # appended to the result.
        page_url_prefix = re.sub('#.*', 'n', room_url)

        for i in range(1, page_num + 1):
            url = page_url_prefix + str(i)

            # Retry until the request succeeds. ``except Exception`` (not
            # a bare except) lets Ctrl-C / SystemExit break out.
            while True:
                try:
                    res = requests.get(url, headers=self.headers)
                    break
                except Exception:
                    continue

            room_html = etree.HTML(res.text)
            room_list = room_html.xpath("//div[@class='right-information']")

            for item in room_list:
                try:
                    room = Comm(source)
                    room.district_name = co_name
                    room.city = city_name
                    room.region = region

                    # Layout text: a single digit before '室' and '厅'.
                    room_type = item.xpath("./h3/span[2]/text()")[0]
                    try:
                        room.room = int(re.search(
                            r'(\d)室', room_type, re.S | re.M).group(1))
                    except Exception:
                        room.room = None
                    try:
                        room.hall = int(re.search(
                            r'(\d)厅', room_type, re.S | re.M).group(1))
                    except Exception:
                        room.hall = None

                    size = item.xpath("./h3/span[3]/text()")[0]
                    area = size.replace('平米', '')
                    if area:
                        room.area = round(float(area), 2)

                    total_price = item.xpath(
                        ".//div[@class='price fs14 ']/em/text()")[0]
                    # The leading integer of the displayed total is scaled
                    # by 10000 before storing.
                    room.total_price = int(re.search(
                        r'(\d+)', total_price, re.S | re.M).group(1)) * 10000

                    avg_price = item.xpath(
                        ".//div[@class='size fs14']/text()")[0]
                    room.avg_price = int(re.search(
                        r'(\d+)', avg_price, re.S | re.M).group(1))

                    # Fitment and direction come from the same node; if
                    # either lookup fails both are cleared.
                    try:
                        room.fitment = item.xpath(
                            ".//div[@class='t1 fs14']/text()[3]")[0]
                        room.direction = item.xpath(
                            ".//div[@class='t1 fs14']/text()[2]")[0]
                    except Exception:
                        room.fitment = None
                        room.direction = None

                    # Text before '/' holds the floor; the digits between
                    # '/' and '层' hold the building height.
                    floor_info = item.xpath(
                        ".//div[@class='fs14']/text()[1]")[0]
                    try:
                        floor = re.search('(.*?)/', floor_info).group(1)
                        room.floor = int(re.search(r'\d+', floor).group(0))
                    except Exception:
                        room.floor = None
                    try:
                        room.height = int(re.search(
                            r'.*?/(\d+)层', floor_info).group(1))
                    except Exception:
                        room.height = None

                    trade_date = item.xpath(".//div[@class='date']/text()")[0]
                    if trade_date:
                        room.trade_date = datetime.datetime.strptime(
                            trade_date, "%Y-%m-%d")

                    room.insert_db()
                except Exception as e:
                    log.error('房屋信息提取失败{}'.format(e))
def comm_info(self, comm_url_list, city_url):
    """Crawl each community's deal pages through a proxy and store every deal.

    For each entry of ``comm_url_list`` the community URL is built by
    substituting the entry for ``'/esf/'`` in ``city_url`` and replacing
    ``'xq'`` with ``'fangjia'``. Requests use ``next(p)`` as proxies; a
    failed request is logged and that community or page is skipped. Each
    row becomes a ``Comm(source)`` object; every field is parsed
    independently and set to ``None`` when it cannot be read.

    A row is skipped (not inserted) when its total or average price
    contains ``'*'``, or when its region/area text is missing.

    Args:
        comm_url_list: iterable of community URL fragments.
        city_url: city base URL containing ``'/esf/'``.

    Raises:
        AttributeError: if the community-name marker is absent from the
            first page.
        IndexError: if a result page has no ``change-city`` span.
    """
    html_suffix = '.html'
    for comm_url in comm_url_list:
        url = city_url.replace('/esf/', comm_url)
        re_url = url.replace('xq', 'fangjia')
        try:
            res = requests.get(url=re_url, headers=self.headers,
                               proxies=next(p))
        except Exception as e:
            log.error('请求失败, source={}, url={}, e={}'.format(
                '乐有家', re_url, e))
            continue
        con = res.text

        co_name = re.search(r'wrap-head-name">(.*?)</div', con,
                            re.S | re.M).group(1)
        co_name = co_name.strip()

        # No last-page link means the community has a single page.
        page_match = re.search(r'(\d+)">尾页', con)
        page = int(page_match.group(1)) if page_match else 1

        # str.rstrip('.html') strips any trailing '.', 'h', 't', 'm', 'l'
        # characters and can eat into the URL slug; remove the exact
        # suffix instead.
        if re_url.endswith(html_suffix):
            base_url = re_url[:-len(html_suffix)]
        else:
            base_url = re_url

        for i in range(1, page + 1):
            page_url = base_url + "/?n=" + str(i)
            print(page_url)
            try:
                co_res = requests.get(url=page_url, headers=self.headers,
                                      proxies=next(p))
            except Exception as e:
                log.error('请求失败, source={}, url={}, e={}'.format(
                    '乐有家', page_url, e))
                continue

            co_html = etree.HTML(co_res.text)
            city = co_html.xpath(
                "//span[@class='change-city']/text()")[0].replace(
                '\t', '').replace('[', '')
            romm_info_list = co_html.xpath("//div[@class='list-cont']/div")

            for room_info in romm_info_list:
                room = Comm(source)
                room.city = city                # city
                room.district_name = co_name    # community name

                # Floor: digits in the text before '/'.
                try:
                    floor = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[1]/text()")[0]
                    floor = re.search('(.*?)/', floor).group(1)
                    room.floor = int(re.search(r'\d+', floor).group(0))
                except Exception:
                    room.floor = None

                # Building height: digits between '/' and '层'.
                try:
                    height = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[1]/text()")[0]
                    room.height = int(
                        re.search(r'/(\d+)层', height).group(1))
                except Exception:
                    room.height = None

                # Trade date.
                try:
                    trade_date = room_info.xpath(
                        ".//span[@class='cj-data-num']/text()")[0]
                    room.trade_date = datetime.datetime.strptime(
                        trade_date, "%Y-%m-%d")
                except Exception:
                    room.trade_date = None

                # Total price; a masked value ('*') disqualifies the row.
                try:
                    total_price = room_info.xpath(
                        ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                    )[0]
                    if '*' in total_price:
                        log.error('source={}, 总价有问题 带*号'.format('乐有家'))
                        continue
                    room.total_price = int(
                        re.search(r'(\d+)', total_price,
                                  re.S | re.M).group(1)) * 10000
                except Exception:
                    room.total_price = None

                # Average price; a masked value ('*') disqualifies the row.
                try:
                    avg_price = room_info.xpath(
                        ".//span[@class='cj-data-num']/em/text()")[0]
                    if '*' in avg_price:
                        log.error('source={}, 均价有问题 带*号'.format('乐有家'))
                        continue
                    room.avg_price = int(
                        re.search(r'(\d+)', avg_price,
                                  re.S | re.M).group(1))
                except Exception:
                    room.avg_price = None

                # Direction, with the '朝' prefix removed.
                try:
                    room.direction = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[2]/text()"
                    )[0].replace('朝', '')
                except Exception:
                    room.direction = None

                # Without the region/area text the row is not inserted.
                # This used to ``return``, which ended the whole crawl on
                # the first such row; only the row is skipped now.
                try:
                    region_area_info = room_info.xpath(
                        "./div[@class='text']/p[1]/text()")[1]
                except Exception:
                    continue

                # Region: second space-separated token of the text.
                try:
                    room.region = region_area_info.split(' ')[1]
                except Exception:
                    room.region = None

                # Building area, rounded to two decimals.
                try:
                    size = re.search('建筑面积(.*?)平',
                                     region_area_info).group(1)
                    if size:
                        room.area = round(float(size), 2)
                except Exception:
                    room.area = None

                room.insert_db()
def crawler(self, city_url, city):
    """Crawl all listing pages of a city and store each community's deals.

    The page count is read from the second ``down_page`` link of the first
    page. For every community link on every listing page the community
    page is fetched (retried until it succeeds, 10 s timeout per attempt),
    followed by a 2 s pause. Each table row is then parsed into its own
    ``Comm(source)`` object and inserted with ``insert_db()``.

    Any failure while processing a community is logged with its URL and
    the crawl moves on to the next community.

    Args:
        city_url: base URL of the city's listing; ``"/PG<n>"`` is appended
            for page ``n``.
        city: prefix prepended to each community href to form its URL.

    Raises:
        IndexError, AttributeError: if the first page has no usable
            last-page link.
        Exception: whatever ``requests.get`` raises for the listing pages;
            those requests are not guarded.
    """
    res = requests.get(city_url, headers=self.headers)
    con = etree.HTML(res.text)
    last_page = con.xpath("//a[@class='down_page']/@href")[1]
    page_num = int(re.search(r'\d+', last_page).group(0))

    for i in range(1, page_num + 1):
        page_url = city_url + "/PG" + str(i)
        page_res = requests.get(page_url, headers=self.headers)
        page_con = etree.HTML(page_res.text)

        for temp_url in page_con.xpath("//h1/a/@href"):
            # Built outside the try so the error handler can always
            # reference it.
            comm_url = city + temp_url
            try:
                # Retry until the request succeeds. ``except Exception``
                # (not a bare except) lets Ctrl-C / SystemExit break out.
                while True:
                    try:
                        co_res = requests.get(comm_url,
                                              headers=self.headers,
                                              timeout=10)
                        break
                    except Exception:
                        continue
                time.sleep(2)

                co_con = etree.HTML(co_res.text)
                city_name = co_con.xpath("//div/a[@class='show']/text()")[0]
                region = co_con.xpath("//section/p/a/text()")[-1]
                district_name = co_con.xpath("//cite/span/text()")[0]

                for tag in co_con.xpath("//table/tbody/tr"):
                    # One object per row, so a field a row does not set
                    # (e.g. an empty trade date) cannot inherit the
                    # previous row's value.
                    com = Comm(source)
                    com.city = city_name
                    com.region = region
                    com.district_name = district_name

                    size = tag.xpath("./td[2]/text()")[0]
                    com.area = round(float(size.replace('㎡', '')), 2)

                    avg_price = tag.xpath("./td[3]/text()")[0]
                    com.avg_price = int(re.search(
                        r'(\d+)', avg_price, re.S | re.M).group(1))

                    total_price = tag.xpath("./td/span/text()")[0]
                    # The leading integer of the displayed total is scaled
                    # by 10000 before storing.
                    com.total_price = int(re.search(
                        r'(\d+)', total_price, re.S | re.M).group(1)) * 10000

                    trade_date = tag.xpath("./td/text()")[-2]
                    if trade_date:
                        com.trade_date = datetime.datetime.strptime(
                            trade_date, "%Y-%m-%d")

                    # Layout text: a single digit before '室' and '厅'.
                    room_type = tag.xpath("./td//p/a/text()")[0]
                    try:
                        com.room = int(re.search(
                            r'(\d)室', room_type, re.S | re.M).group(1))
                    except Exception:
                        com.room = None
                    try:
                        com.hall = int(re.search(
                            r'(\d)厅', room_type, re.S | re.M).group(1))
                    except Exception:
                        com.hall = None

                    floor = tag.xpath("./td//p/span/text()")[0]
                    com.floor = int(re.search(r'(\d+)层', floor).group(1))

                    # The old pattern '层 (.*?)' ended in a lazy group,
                    # which always matches the empty string; capture the
                    # text that follows "层 " instead.
                    direction_match = re.search(r'层 (.+)', floor)
                    if direction_match:
                        com.direction = direction_match.group(1).strip()
                    else:
                        com.direction = None

                    com.insert_db()
            except Exception:
                log.error("{}小区信息提取错误".format(comm_url))