示例#1
0
    def city_info(self, index_url, city):
        """Crawl one city's deal-record pages and store every listing found.

        Up to 100 result pages are requested: page 1 is ``index_url`` itself and
        page ``n`` is ``index_url + 'i3' + str(n) + '/'``. Paging stops as soon
        as a page reports a different city than ``city`` or a house count of
        ``'0'``. Each listing is parsed into a ``Comm('房天下')`` record and
        saved with ``insert_db()``.

        Request failures and parse failures are logged and never propagate.
        A parse failure skips the remainder of the current page; crawling then
        continues with the next page.

        :param index_url: URL of the first result page for the city.
        :param city: city name expected in the page's ``city = "..."`` marker.
        """
        flags = re.S | re.M
        for page_no in range(1, 101):
            # Page 1 lives at the bare index URL; later pages use 'i3<n>/'.
            if page_no == 1:
                index_url_ = index_url
            else:
                index_url_ = index_url + 'i3' + str(page_no) + '/'

            try:
                response = requests.get(index_url_, headers=self.headers)
                html = response.text
            except Exception as e:
                log.error('请求错误,source="{}",url="{}",e="{}"'.format('房天下', index_url_, e))
                continue

            try:
                # A different city in the page means we ran past the last page.
                city_real = re.search('city = "(.*?)"', html, flags).group(1)
                if city != city_real:
                    break
                house_num = re.search('class="org">(.*?)</b>', html, flags).group(1)
                if house_num == '0':
                    break

                comm_info_paper_list = re.findall('class="info rel floatr".*?</dd>', html, flags)
                for comm_info_paper in comm_info_paper_list:
                    comm = Comm('房天下')
                    comm.city = city
                    comm.district_name = re.search('<a.*?>(.*?)<', comm_info_paper, flags).group(1).strip()

                    # U+FFFD (replacement character) marks text that failed to
                    # decode; such listings are skipped.
                    if '\ufffd' in comm.district_name:
                        log.error('网页出现繁体字, url={}'.format(index_url_))
                        continue

                    comm.direction = re.search('class="mt18">(.*?)<', comm_info_paper, flags).group(1)
                    try:
                        comm.height = int(re.search('共(.*?)层', comm_info_paper, flags).group(1))
                    except Exception:
                        comm.height = None
                    comm.region = re.search('class="mt15">.*?<a.*?chengjiao.*?>(.*?)<',
                                            comm_info_paper, flags).group(1)

                    total_price = re.search('class="price">(.*?)<', comm_info_paper, flags).group(1)
                    # Masked prices (containing '*') cannot be converted.
                    if '*' in total_price:
                        continue
                    comm.total_price = int(total_price) * 10000

                    # Room/hall/area are embedded in the listing title.
                    comm.room = int(re.search(r'(\d+)室', comm.district_name, flags).group(1))
                    comm.hall = int(re.search(r'(\d+)厅', comm.district_name, flags).group(1))
                    try:
                        # Accept both whole-number ("89平米") and decimal
                        # ("89.5平米") areas.
                        comm.area = float(
                            re.search(r'(\d+(?:\.\d+)?)平米', comm.district_name, flags).group(1))
                    except Exception:
                        comm.area = None

                    trade_date = re.search('class="time".*?>(.*?)<', comm_info_paper, flags).group(1)
                    comm.trade_date = datetime.datetime.strptime(trade_date, "%Y-%m-%d")

                    try:
                        comm.avg_price = int(comm.total_price / comm.area)
                    except Exception:
                        # area is None or zero
                        comm.avg_price = None
                    comm.insert_db()
            except Exception as e:
                log.error('解析错误,source="{}",html="{}",e="{}"'.format('房天下', html, e))
示例#2
0
    def room(self, co_list, city_name):
        """Collect the deal records of every community in ``co_list``.

        For each community element the detail page is fetched, the last link
        in its tab toolbar is followed to the deal-record list, and every page
        of that list is parsed. Each listing becomes a ``Comm(source)`` record
        saved with ``insert_db()``.

        Failures are logged and skipped at the smallest possible scope
        (community, page or single listing). A page request is retried a
        bounded number of times instead of indefinitely.

        :param co_list: iterable of community elements supporting ``.xpath()``.
        :param city_name: city name stored on every record.
        """
        max_attempts = 3  # upper bound for retrying one page request

        for co in co_list:
            try:
                co_name = co.xpath("./div[1]/a/text()")[0]
                co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
                region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
                # Not stored anywhere, but a community without an address
                # title is still skipped, as before.
                addr = co.xpath("./div[3]/span[3]/@title")[0]
                detail = requests.get(co_url, headers=self.headers)
                html = etree.HTML(detail.text)
                room_url = "http:" + html.xpath("//div[@class='tab-toolbar pr']//li/a/@href")[-1]
                page_index = requests.get(room_url, headers=self.headers)
            except Exception as e:
                log.error('小区信息提取失败{}'.format(e))
                continue

            page_match = re.search(r'共(\d+)页', page_index.text)
            if not page_match:
                log.info('小区无相关数据')
                continue
            page_count = int(page_match.group(1))

            # Everything from '#' on is replaced by 'n', then the page number
            # is appended.
            page_url_base = re.sub('#.*', 'n', room_url)
            for page_no in range(1, page_count + 1):
                url = page_url_base + str(page_no)

                res = None
                last_error = None
                for _ in range(max_attempts):
                    try:
                        res = requests.get(url, headers=self.headers)
                        break
                    except Exception as e:
                        last_error = e
                if res is None:
                    log.error('请求失败, url={}, e={}'.format(url, last_error))
                    continue

                room_html = etree.HTML(res.text)
                room_list = room_html.xpath("//div[@class='right-information']")
                for item in room_list:
                    try:
                        room = Comm(source)
                        room.district_name = co_name
                        room.city = city_name
                        room.region = region

                        room_type = item.xpath("./h3/span[2]/text()")[0]
                        try:
                            room.room = int(re.search(r'(\d)室', room_type, re.S | re.M).group(1))
                        except Exception:
                            room.room = None
                        try:
                            room.hall = int(re.search(r'(\d)厅', room_type, re.S | re.M).group(1))
                        except Exception:
                            room.hall = None

                        size = item.xpath("./h3/span[3]/text()")[0]
                        area = size.replace('平米', '')
                        if area:
                            room.area = round(float(area), 2)

                        total_price = item.xpath(".//div[@class='price fs14 ']/em/text()")[0]
                        room.total_price = int(
                            re.search(r'(\d+)', total_price, re.S | re.M).group(1)) * 10000
                        avg_price = item.xpath(".//div[@class='size  fs14']/text()")[0]
                        room.avg_price = int(re.search(r'(\d+)', avg_price, re.S | re.M).group(1))

                        try:
                            room.fitment = item.xpath(".//div[@class='t1 fs14']/text()[3]")[0]
                            room.direction = item.xpath(".//div[@class='t1 fs14']/text()[2]")[0]
                        except Exception:
                            # If either text node is missing, both are cleared.
                            room.fitment = None
                            room.direction = None

                        floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
                        try:
                            floor = re.search('(.*?)/', floor_info).group(1)
                            room.floor = int(re.search(r'\d+', floor).group(0))
                        except Exception:
                            room.floor = None
                        try:
                            room.height = int(re.search(r'.*?/(\d+)层', floor_info).group(1))
                        except Exception:
                            room.height = None

                        trade_date = item.xpath(".//div[@class='date']/text()")[0]
                        if trade_date:
                            room.trade_date = datetime.datetime.strptime(trade_date, "%Y-%m-%d")
                        room.insert_db()
                    except Exception as e:
                        log.error('房屋信息提取失败{}'.format(e))
示例#3
0
 def comm_info(self, comm_url_list, city_url):
     """Crawl the deal-record pages of each community and store the listings.

     For every entry of ``comm_url_list`` the community URL is built by
     substituting it for ``'/esf/'`` in ``city_url`` and replacing ``'xq'``
     with ``'fangjia'``. The page count is read from the "尾页" link
     (default 1) and each page ``<base>/?n=<i>`` is parsed into
     ``Comm(source)`` records saved with ``insert_db()``.

     Failed requests are logged and skipped. Individual fields that cannot
     be parsed are stored as ``None``. Listings whose total or average
     price contains ``'*'`` are skipped.

     :param comm_url_list: community URL fragments.
     :param city_url: city base URL containing ``'/esf/'``.
     :raises AttributeError: if the community name is not found in a page.
     :raises IndexError: if the city marker is not found in a result page.
     """
     html_suffix = '.html'
     for comm_url in comm_url_list:
         url = city_url.replace('/esf/', comm_url)
         re_url = url.replace('xq', 'fangjia')
         try:
             res = requests.get(url=re_url,
                                headers=self.headers,
                                proxies=next(p))
         except Exception as e:
             log.error('请求失败, source={}, url={}, e={}'.format(
                 '乐有家', re_url, e))
             continue
         con = res.text
         co_name = re.search('wrap-head-name">(.*?)</div', con,
                             re.S | re.M).group(1)
         co_name = co_name.strip()
         try:
             page = re.search(r'(\d+)">尾页', con).group(1)
         except Exception:
             # No "last page" link: treat as a single page.
             page = 1

         # Remove only a literal '.html' suffix. str.rstrip('.html') strips
         # any trailing run of the characters '.', 'h', 't', 'm', 'l' and
         # would corrupt ids ending in one of those characters.
         if re_url.endswith(html_suffix):
             page_base = re_url[:-len(html_suffix)]
         else:
             page_base = re_url

         for i in range(1, int(page) + 1):
             page_url = page_base + "/?n=" + str(i)
             print(page_url)
             try:
                 co_res = requests.get(url=page_url,
                                       headers=self.headers,
                                       proxies=next(p))
             except Exception as e:
                 log.error('请求失败, source={}, url={}, e={}'.format(
                     '乐有家', page_url, e))
                 continue
             co_html = etree.HTML(co_res.text)
             city = co_html.xpath(
                 "//span[@class='change-city']/text()")[0].replace(
                     '\t', '').replace('[', '')
             room_info_list = co_html.xpath("//div[@class='list-cont']/div")
             for room_info in room_info_list:
                 room = Comm(source)
                 # City
                 room.city = city
                 # Community name
                 room.district_name = co_name
                 try:
                     # Floor of the unit (text before the '/')
                     floor = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[1]/text()")[0]
                     floor = re.search('(.*?)/', floor).group(1)
                     room.floor = int(re.search(r'\d+', floor).group(0))
                 except Exception:
                     room.floor = None
                 try:
                     # Total number of floors (digits after the '/')
                     height = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[1]/text()")[0]
                     room.height = int(
                         re.search(r'/(\d+)层', height).group(1))
                 except Exception:
                     room.height = None
                 try:
                     # Deal date
                     trade_date = room_info.xpath(
                         ".//span[@class='cj-data-num']/text()")[0]
                     room.trade_date = datetime.datetime.strptime(
                         trade_date, "%Y-%m-%d")
                 except Exception:
                     room.trade_date = None
                 try:
                     # Total price; the page figure is multiplied by 10000
                     total_price = room_info.xpath(
                         ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                     )[0]
                     if '*' in total_price:
                         log.error('source={}, 总价有问题 带*号'.format('乐有家'))
                         continue
                     room.total_price = int(
                         re.search(r'(\d+)', total_price,
                                   re.S | re.M).group(1)) * 10000
                 except Exception:
                     room.total_price = None
                 try:
                     # Average price
                     avg_price = room_info.xpath(
                         ".//span[@class='cj-data-num']/em/text()")[0]
                     if '*' in avg_price:
                         log.error('source={}, 均价有问题 带*号'.format('乐有家'))
                         continue
                     room.avg_price = int(
                         re.search(r'(\d+)', avg_price,
                                   re.S | re.M).group(1))
                 except Exception:
                     room.avg_price = None
                 try:
                     # Orientation, with the leading '朝' removed
                     room.direction = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[2]/text()"
                     )[0].replace('朝', '')
                 except Exception:
                     room.direction = None
                 try:
                     region_area_info = room_info.xpath(
                         "./div[@class='text']/p[1]/text()")[1]
                 except Exception:
                     # NOTE(review): this aborts the whole crawl, not just
                     # this listing; kept as-is because the intent is
                     # unclear - confirm whether `continue` was meant.
                     return
                 try:
                     # Region: second space-separated token
                     room.region = region_area_info.split(' ')[1]
                 except Exception:
                     room.region = None
                 try:
                     # Floor area
                     size = re.search('建筑面积(.*?)平',
                                      region_area_info).group(1)
                     if size:
                         room.area = round(float(size), 2)
                 except Exception:
                     room.area = None
                 room.insert_db()
示例#4
0
 def get_page_url(self, page_url, city, area_):
     """Parse one result page and store every listing it contains.

     The page is fetched with the instance's headers and proxy, split into
     ``<li class=" clearfix">`` fragments, and each fragment is turned into
     a ``Comm('Q房网')`` record that is saved with ``insert_db()``.

     Any error while handling a fragment is logged together with the
     fragment, and processing moves on to the next one. Height, room count
     and hall count fall back to ``None`` when they cannot be parsed.

     :param page_url: URL of the result page.
     :param city: city name; stored stripped.
     :param area_: region name; stored stripped.
     """
     flags = re.S | re.M
     page = requests.get(page_url,
                         headers=self.headers,
                         proxies=self.proxy)
     fragments = re.findall('<li class=" clearfix">.*?</li>', page.text,
                            flags)
     for fragment in fragments:
         try:
             record = Comm('Q房网')
             record.city = city.strip()
             record.region = area_.strip()

             name_match = re.search('house-title">.*?<a.*?>(.*?)<',
                                    fragment, flags)
             record.district_name = name_match.group(1).strip()

             direction_match = re.search(
                 'class="house-about clearfix".*?showKeyword">(.*?)<',
                 fragment, flags)
             record.direction = direction_match.group(1).strip()

             # Total floors: the text after '/' in the second <span>.
             try:
                 height_match = re.search(
                     'class="house-about clearfix".*?showKeyword">.*?<span.*?<span>.*?/(.*?)<',
                     fragment, flags)
                 record.height = int(height_match.group(1).strip())
             except Exception:
                 record.height = None

             price_match = re.search('class="show-price".*?span.*?>(.*?)<',
                                     fragment, flags)
             record.total_price = int(price_match.group(1).strip()) * 10000

             avg_match = re.search('class="show-price".*?<p.*?>(.*?)<',
                                   fragment, flags)
             avg_text = avg_match.group(1).strip()
             record.avg_price = int(re.search(r'(\d+)', avg_text).group(1))

             date_match = re.search(
                 'class="show-price concluded".*?span.*?>(.*?)<', fragment,
                 flags)
             date_text = date_match.group(1).strip()
             if date_text:
                 parsed = time.strptime(date_text, "%Y.%m.%d")
                 record.trade_date = datetime.datetime(
                     parsed.tm_year, parsed.tm_mon, parsed.tm_mday)

             layout_match = re.search('house-title">.*?<a.*?>.*? (.*?) ',
                                      fragment, flags)
             layout = layout_match.group(1).strip()
             try:
                 record.room = int(re.search(r'(\d)室', layout, flags).group(1))
             except Exception:
                 record.room = None
             try:
                 record.hall = int(re.search(r'(\d)厅', layout, flags).group(1))
             except Exception:
                 record.hall = None

             area_match = re.search('house-title">.*?<a.*?>.*? .*? (.*?平米)',
                                    fragment, flags)
             area_text = area_match.group(1).strip()
             area_text = area_text.replace('㎡', '').replace('平米', '')
             if area_text:
                 record.area = round(float(area_text), 2)

             record.insert_db()
         except Exception as e:
             log.error('解析错误,source="{}",html="{}",e="{}"'.format(
                 'Q房网', fragment, e))
示例#5
0
 def get_comm_detail(self, comm_url, region, city):
     """Fetch a community page and store each of its deal-history rows.

     A single ``Comm('购房网')`` object is populated with the community-level
     fields and then re-filled and saved with ``insert_db()`` once per
     ``<li>`` row of the ``<ul class="lscjlist">`` list. Because the object
     is reused, every per-row field is assigned on every iteration so that
     a blank value never inherits the previous row's data.

     A failed request is logged and the method returns without storing
     anything.

     :param comm_url: URL of the community page.
     :param region: region name; stored stripped.
     :param city: city name.
     :raises AttributeError: if the page or a row lacks an expected element
         (title, deal list, or one of the mandatory ``<span>`` fields).
     :raises ValueError: if a date or area string cannot be converted.
     """
     flags = re.S | re.M
     comm = Comm('购房网')
     comm.url = comm_url
     comm.region = region.strip()
     comm.city = city
     try:
         response = requests.get(url=comm_url,
                                 headers=self.headers,
                                 proxies=next(p))
     except Exception as e:
         log.error('请求错误,source="{}",url="{}",e="{}"'.format(
             '购房网', comm_url, e))
         return
     html = response.text
     comm.district_name = re.search('title fl.*?<h1>(.*?)</h1>', html,
                                    flags).group(1).strip()
     comm_info_html = re.search('<ul class="lscjlist">.*?</ul>', html,
                                flags).group()
     comm_info_list = re.findall('<li>(.*?)</li>', comm_info_html, flags)
     if not comm_info_list:
         log.info('source={}, 此小区没有数据,url="{}"'.format('购房网', comm_url))

     # Each row is a sequence of <span> cells; the Nth cell is reached by
     # skipping N-1 opening tags.
     for row in comm_info_list:
         trade_date = re.search('<span>(.*?)</span>', row,
                                flags).group(1).strip()
         if trade_date:
             comm.trade_date = datetime.datetime.strptime(
                 trade_date, "%Y-%m-%d")
         else:
             # Reset so the previous row's date is not stored again.
             comm.trade_date = None

         room_type = re.search('<span>.*?<span>(.*?)</span>', row,
                               flags).group(1).strip()
         try:
             comm.room = int(re.search(r'(\d)室', room_type, flags).group(1))
             comm.hall = int(re.search(r'(\d)厅', room_type, flags).group(1))
         except Exception:
             # If either count is missing, both are cleared.
             comm.room = None
             comm.hall = None

         area = re.search('<span>.*?<span>.*?<span>(.*?)</span>', row,
                          flags).group(1).strip()
         area = area.replace('㎡', '').replace('平', '')
         if area:
             comm.area = round(float(area), 2)
         else:
             # Reset so the previous row's area is not stored again.
             comm.area = None

         try:
             height = re.search(
                 '<span>.*?<span>.*?<span>.*?<span>.*?/(.*?)</span>', row,
                 flags).group(1).strip()
             comm.height = int(re.search(r'(\d+)', height).group(1))
         except Exception:
             comm.height = None

         comm.fitment = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>(.*?)</span>', row,
             flags).group(1).strip()
         comm.direction = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>(.*?)</span>',
             row, flags).group(1).strip()

         avg_price = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>(.*?)</span>',
             row, flags).group(1)
         comm.avg_price = int(re.search(r'(\d+)', avg_price, flags).group(1))

         total_price = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span.*?>(.*?)</span>',
             row, flags).group(1)
         comm.total_price = int(
             re.search(r'(\d+)', total_price, flags).group(1)) * 10000
         comm.insert_db()
示例#6
0
 def get_city_info(self, city_dict):
     """Crawl deal records for every city and district and store them.

     For each city the ``chengjiao/`` index page is fetched and the district
     links between ``data-role="ershoufang"`` and ``地铁`` are extracted
     (links containing ``'ershoufang'`` are ignored). For every district,
     pages ``pg1`` .. ``pg100`` are requested and each listing is parsed
     into a ``Comm('链家在线')`` record saved with ``insert_db()``.

     Errors while fetching or parsing a district page are logged and the
     next page is tried. Errors at city level (index request, district
     extraction) are logged and the next city is processed. Listings whose
     total price contains ``'*'`` are skipped.

     :param city_dict: mapping of city name to the city's base URL.
     """
     flags = re.S | re.M
     for city in city_dict:
         city_url = city_dict[city] + 'chengjiao/'
         try:
             response = requests.get(city_url, headers=self.headers)
             html = response.text
             area_html = re.search('data-role="ershoufang".*?地铁', html,
                                   flags).group()
             area_list_str = re.findall('<a.*?</a>', area_html, flags)
             for area_i in area_list_str:
                 if 'ershoufang' in area_i:
                     continue
                 area_url = re.search('href="(.*?)"', area_i,
                                      flags).group(1)
                 area = re.search('<a.*?>(.*?)<', area_i, flags).group(1)
                 for page_no in range(1, 101):
                     city_url_ = city_url.replace(
                         '/chengjiao/', '') + area_url + 'pg' + str(page_no)
                     # Kept for the error log; stays empty if the request
                     # itself fails.
                     content = ''
                     try:
                         result = requests.get(city_url_,
                                               headers=self.headers)
                         content = result.text
                         comm_str_list = re.findall(
                             'class="info".*?</div></div></li>', content,
                             flags)
                         for comm_str in comm_str_list:
                             comm = Comm('链家在线')
                             comm.region = area.strip()
                             comm.city = city.strip()
                             comm.district_name = re.search(
                                 'target="_blank">(.*?)<', comm_str,
                                 flags).group(1).strip()
                             comm.direction = re.search(
                                 r'class="houseIcon"></span>(.*?) \|',
                                 comm_str, flags).group(1).strip()
                             try:
                                 comm.fitment = re.search(
                                     r'class="houseIcon"></span>.*? \|(.*?)\| ',
                                     comm_str, flags).group(1).strip()
                             except Exception:
                                 comm.fitment = None
                             try:
                                 # First number inside the parentheses
                                 # after the position icon.
                                 height = re.search(
                                     r'class="positionIcon"></span>.*?\((.*?)\)',
                                     comm_str, flags).group(1).strip()
                                 comm.height = int(
                                     re.search(r'(\d+)', height,
                                               flags).group(1))
                             except Exception:
                                 comm.height = None
                             total_price = re.search(
                                 "class='number'>(.*?)<", comm_str,
                                 flags).group(1).strip()
                             # Masked prices cannot be converted.
                             if "*" in total_price:
                                 continue
                             comm.total_price = int(
                                 re.search(r'(\d+)', total_price,
                                           flags).group(1)) * 10000
                             room_type = re.search(
                                 'arget="_blank">.*? (.*?) ', comm_str,
                                 flags).group(1).strip()
                             try:
                                 comm.room = int(
                                     re.search(r'(\d)室', room_type,
                                               flags).group(1))
                             except Exception:
                                 comm.room = 0
                             try:
                                 comm.hall = int(
                                     re.search(r'(\d)厅', room_type,
                                               flags).group(1))
                             except Exception:
                                 comm.hall = None
                             area_ = re.search(
                                 'target="_blank">.*? .*? (.*?平米)',
                                 comm_str, flags).group(1).strip()
                             if area_:
                                 area_ = area_.replace('㎡', '').replace(
                                     '平米', '')
                                 try:
                                     comm.area = round(float(area_), 2)
                                 except Exception:
                                     comm.area = None
                             trade_date = re.search(
                                 'dealDate">(.*?)<', comm_str,
                                 flags).group(1).strip()
                             if trade_date:
                                 comm.trade_date = datetime.datetime.strptime(
                                     trade_date, "%Y.%m.%d")
                             try:
                                 # Derived from the record's own fields;
                                 # falls back to None when the area is
                                 # missing or zero.
                                 comm.avg_price = int(comm.total_price /
                                                      comm.area)
                             except Exception:
                                 comm.avg_price = None
                             comm.insert_db()
                     except Exception as e:
                         log.error(
                             '解析错误,source="{}",html="{}",e="{}"'.format(
                                 '链家在线', content, e))
         except Exception as e:
             log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                 '链家在线', city_url, e))