예제 #1
0
 def get_comm_info(self, comm_list, all_page_url):
     """Parse 中安房 deal-list fragments and store one record per fragment.

     Args:
         comm_list: iterable of HTML fragment strings, one per deal.
         all_page_url: URL the fragments came from; only used in the error
             log message.

     Every fragment is parsed into a fresh ``Comm`` (city fixed to 合肥) and
     saved with ``insert_db()``.  A fragment that cannot be parsed is logged
     and skipped so that the remaining fragments are still processed.
     """
     flags = re.S | re.M

     def first_int(pattern, text, default=None):
         """Return the first integer captured by ``pattern``, else ``default``."""
         found = re.search(pattern, text) if text else None
         return int(found.group(1)) if found else default

     for fragment in comm_list:
         try:
             comm = Comm('中安房')
             comm.city = '合肥'
             comm.district_name = re.search(
                 r'zaf-nowrap.*?>(.*?)<', fragment, flags).group(1).strip()

             trade_date = re.search(
                 r'zaf-fblue">(.*?)<', fragment, flags).group(1).strip()
             if trade_date:
                 comm.trade_date = datetime.datetime.strptime(
                     trade_date, "%Y-%m-%d")

             price_text = re.search(
                 r'list-right-data.*?<span.*?>(.*?)<', fragment,
                 flags).group(1).strip()
             # The listed figure is scaled by 10,000 before storing
             # (presumably the site quotes prices in 万 -- TODO confirm).
             comm.total_price = int(
                 re.search(r'(\d+)', price_text).group(1)) * 10000

             # First detail span, space separated:
             # "<area>㎡ <room type> <unit price>"; the last two may be absent.
             info = re.search(
                 r'list-details-area.*?<span.*?>(.*?)<', fragment,
                 flags).group(1).strip()
             info_parts = info.split(' ')
             area_text = info_parts[0].replace('㎡', '')
             if area_text:
                 comm.area = round(float(area_text), 2)

             room_type = info_parts[1] if len(info_parts) > 1 else None
             # A missing room count is stored as 0, unlike hall/toilet (None).
             comm.room = first_int(r'(\d)室', room_type, 0)
             comm.hall = first_int(r'(\d)厅', room_type)
             comm.toilet = first_int(r'(\d)卫', room_type)

             avg_price_text = info_parts[2] if len(info_parts) > 2 else None
             comm.avg_price = first_int(r'(\d+)', avg_price_text)

             # Second detail span: "<direction> <fitment>", fitment optional.
             info_2 = re.search(
                 r'list-details-area.*?<span.*?<span>(.*?)<', fragment,
                 flags).group(1).strip()
             detail_parts = info_2.split(' ')
             comm.direction = detail_parts[0]
             comm.fitment = detail_parts[1] if len(detail_parts) > 1 else None

             info_3 = re.search(
                 r'list-details-address1.*?<span>(.*?)<', fragment,
                 flags).group(1).strip()
             comm.region = info_3.split(' ')[0].strip()
             comm.insert_db()
         except Exception as e:
             # Per-record boundary: log and move on to the next fragment.
             log.error('解析错误,source={},url="{}",e="{}"'.format('中安房', all_page_url, e))
예제 #2
0
def into_mongo(coll):
    """Copy every document of ``coll`` into the deal store as ``Comm`` records.

    Args:
        coll: MongoDB collection; ``coll.find(no_cursor_timeout=True)`` must
            return a closable iterable of dict-like documents.

    One ``Comm('澜斯')`` is built per document and saved with ``insert_db()``.
    A fresh ``Comm`` is created for each document so that a field which is
    missing or invalid in one document cannot silently inherit the value of
    the previous document.

    Raises:
        KeyError: if a document lacks one of the unguarded keys
            (``updatedate``, ``newdiskdecoration``, ``flevel``,
            ``houseaddress``).
    """
    results = coll.find(no_cursor_timeout=True)
    try:
        for result in results:
            com = Comm('澜斯')

            # Guarded because some documents were observed without fj_city.
            try:
                com.city = result['fj_city']  # city
                com.region = result['fj_region']  # region
            except Exception:
                log.error('城市或者区域没有')

            com.m_date = result['updatedate']  # last update date
            com.create_date = datetime.datetime.now()  # creation time
            com.fitment = result['newdiskdecoration']  # decoration
            com.floor = result['flevel']  # floor the unit is on

            # Guarded because, per the original author, a record whose values
            # here do not conform is not inserted into the database.
            try:
                com.district_name = result['fj_name']  # community name
                com.avg_price = result['unitprice']  # unit price
                com.total_price = result['usd']  # total price
                com.area = result['acreage']  # floor area
                # signingdate looks like "YYYY-MM-DDT..."; keep the date part.
                com.trade_date = datetime.datetime.strptime(
                    result['signingdate'].split('T')[0], "%Y-%m-%d")
            except Exception as e:
                log.error(e)

            # Best-effort extraction of unit and room number from the address
            # ("<unit>号<room>"); remove this part if it proves unreliable.
            houseaddress = result['houseaddress']
            try:
                res = re.search(r'(\d+)号(\d+)', houseaddress)
                com.unit_num = res.group(1)  # unit number
                com.room_num = res.group(2)  # room number
            except Exception:
                print('无法匹配大盘单元号和室号,houseaddress={}'.format(houseaddress))

            # The source data has nothing to fill direction, room/hall/toilet
            # counts, total floors (height) or building number (house_num),
            # so those fields are left unset.

            com.insert_db()
    finally:
        # Cursors opened with no_cursor_timeout must be closed explicitly.
        results.close()
예제 #3
0
    def room(self, co_list, city_name):
        """Scrape and store the deal records of every community in ``co_list``.

        Args:
            co_list: iterable of parsed HTML elements (objects exposing
                ``xpath``), one per community.
            city_name: value stored as ``city`` on every record.

        For each community the detail page is fetched, the last toolbar link
        is followed to the deal list, and each result page is parsed into
        ``Comm`` records saved with ``insert_db()``.  Failures are logged and
        the affected community, page or record is skipped.

        NOTE(review): ``source`` is not defined in this method; it is
        presumably a module-level name -- verify.
        """
        max_attempts = 3  # per result page; avoids retrying forever

        def first_int(pattern, text, default=None):
            """Return the first integer captured by ``pattern``, else ``default``."""
            found = re.search(pattern, text) if text else None
            return int(found.group(1)) if found else default

        for co in co_list:
            try:
                co_name = co.xpath("./div[1]/a/text()")[0]
                co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
                region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
                # Not stored anywhere; the lookup is kept so that a community
                # without this attribute is still skipped, as before.
                addr = co.xpath("./div[3]/span[3]/@title")[0]  # noqa: F841
                detail = requests.get(co_url, headers=self.headers)
                html = etree.HTML(detail.text)
                room_url = "http:" + html.xpath(
                    "//div[@class='tab-toolbar pr']//li/a/@href")[-1]
                page_index = requests.get(room_url, headers=self.headers)
            except Exception as e:
                log.error('小区信息提取失败{}'.format(e))
                continue

            page_match = re.search(r'共(\d+)页', page_index.text)
            if not page_match:
                log.info('小区无相关数据')
                continue
            page_count = int(page_match.group(1))

            # Page URLs are the list URL with its "#..." fragment replaced by
            # "n<page number>".
            url_prefix = re.sub('#.*', 'n', room_url)
            for page in range(1, page_count + 1):
                url = url_prefix + str(page)
                res = None
                for _ in range(max_attempts):
                    try:
                        res = requests.get(url, headers=self.headers)
                        break
                    except Exception as e:
                        log.error('请求失败,url="{}",e="{}"'.format(url, e))
                if res is None:
                    continue

                room_html = etree.HTML(res.text)
                for item in room_html.xpath("//div[@class='right-information']"):
                    try:
                        record = Comm(source)
                        record.district_name = co_name
                        record.city = city_name
                        record.region = region

                        room_type = item.xpath("./h3/span[2]/text()")[0]
                        record.room = first_int(r'(\d)室', room_type)
                        record.hall = first_int(r'(\d)厅', room_type)

                        size = item.xpath("./h3/span[3]/text()")[0]
                        area_text = size.replace('平米', '')
                        if area_text:
                            record.area = round(float(area_text), 2)

                        total_price = item.xpath(
                            ".//div[@class='price fs14 ']/em/text()")[0]
                        # Listed figure is scaled by 10,000 before storing.
                        record.total_price = int(
                            re.search(r'(\d+)', total_price).group(1)) * 10000
                        avg_price = item.xpath(
                            ".//div[@class='size  fs14']/text()")[0]
                        record.avg_price = int(
                            re.search(r'(\d+)', avg_price).group(1))

                        # Fitment and direction are all-or-nothing: if either
                        # text node is missing both are stored as None.
                        try:
                            record.fitment = item.xpath(
                                ".//div[@class='t1 fs14']/text()[3]")[0]
                            record.direction = item.xpath(
                                ".//div[@class='t1 fs14']/text()[2]")[0]
                        except Exception:
                            record.fitment = None
                            record.direction = None

                        # floor_info is read as "<floor>/<total>层"; either
                        # number may be absent.
                        floor_info = item.xpath(
                            ".//div[@class='fs14']/text()[1]")[0]
                        floor_match = re.search(r'(.*?)/', floor_info)
                        record.floor = (
                            first_int(r'(\d+)', floor_match.group(1))
                            if floor_match else None)
                        record.height = first_int(r'.*?/(\d+)层', floor_info)

                        trade_date = item.xpath(
                            ".//div[@class='date']/text()")[0]
                        if trade_date:
                            record.trade_date = datetime.datetime.strptime(
                                trade_date, "%Y-%m-%d")
                        record.insert_db()
                    except Exception as e:
                        # Per-record boundary: log and continue with the next.
                        log.error('房屋信息提取失败{}'.format(e))
예제 #4
0
 def get_comm_detail(self, comm_url, region, city):
     """Fetch one 购房网 community page and store each listed deal.

     Args:
         comm_url: URL of the community page; also stored on every record.
         region: region name; stripped and stored on every record.
         city: city name stored on every record.

     Each ``<li>`` of the ``lscjlist`` list becomes one ``Comm`` saved with
     ``insert_db()``.  Request failures, a page without the expected
     structure, and rows that cannot be parsed are logged instead of raised,
     so one bad row does not abort the remaining rows.  A fresh ``Comm`` is
     built per row so that an empty date or area cannot inherit the previous
     row's value.
     """
     flags = re.S | re.M
     region_name = region.strip()

     def span_pattern(index, tail='<span>(.*?)</span>'):
         """Regex skipping ``index`` opening ``<span>`` tags, then ``tail``."""
         return '<span>.*?' * index + tail

     try:
         response = requests.get(url=comm_url,
                                 headers=self.headers,
                                 proxies=next(p))
     except Exception as e:
         log.error('请求错误,source="{}",url="{}",e="{}"'.format(
             '购房网', comm_url, e))
         return
     html = response.text

     try:
         district_name = re.search(r'title fl.*?<h1>(.*?)</h1>', html,
                                   flags).group(1).strip()
         comm_info_html = re.search(r'<ul class="lscjlist">.*?</ul>', html,
                                    flags).group()
     except AttributeError as e:
         # re.search returned None: the page lacks the expected markup.
         log.error('解析错误,source="{}",url="{}",e="{}"'.format(
             '购房网', comm_url, e))
         return

     comm_info_list = re.findall(r'<li>(.*?)</li>', comm_info_html, flags)
     if not comm_info_list:
         log.info('source={}, 此小区没有数据,url="{}"'.format('购房网', comm_url))

     # Fields are located by the position of their <span> inside the row:
     # 0 date, 1 room type, 2 area, 3 ".../<height>", 4 fitment,
     # 5 direction, 6 unit price, 7 total price.
     for row in comm_info_list:
         try:
             comm = Comm('购房网')
             comm.url = comm_url
             comm.region = region_name
             comm.city = city
             comm.district_name = district_name

             trade_date = re.search(span_pattern(0), row,
                                    flags).group(1).strip()
             if trade_date:
                 comm.trade_date = datetime.datetime.strptime(
                     trade_date, "%Y-%m-%d")

             room_type = re.search(span_pattern(1), row,
                                   flags).group(1).strip()
             # Room and hall are all-or-nothing: both None unless both match.
             room_match = re.search(r'(\d)室', room_type)
             hall_match = re.search(r'(\d)厅', room_type)
             if room_match and hall_match:
                 comm.room = int(room_match.group(1))
                 comm.hall = int(hall_match.group(1))
             else:
                 comm.room = None
                 comm.hall = None

             area_text = re.search(span_pattern(2), row, flags).group(1)
             area_text = area_text.strip().replace('㎡', '').replace('平', '')
             if area_text:
                 comm.area = round(float(area_text), 2)

             height_match = re.search(span_pattern(3, '/(.*?)</span>'), row,
                                      flags)
             height_digits = (re.search(r'(\d+)', height_match.group(1))
                              if height_match else None)
             comm.height = (int(height_digits.group(1))
                            if height_digits else None)

             comm.fitment = re.search(span_pattern(4), row,
                                      flags).group(1).strip()
             comm.direction = re.search(span_pattern(5), row,
                                        flags).group(1).strip()

             avg_price = re.search(span_pattern(6), row, flags).group(1)
             comm.avg_price = int(re.search(r'(\d+)', avg_price).group(1))

             # The last span may carry attributes, hence "<span.*?>".
             total_price = re.search(
                 span_pattern(7, '<span.*?>(.*?)</span>'), row,
                 flags).group(1)
             # Listed figure is scaled by 10,000 before storing.
             comm.total_price = int(
                 re.search(r'(\d+)', total_price).group(1)) * 10000
             comm.insert_db()
         except Exception as e:
             # Per-row boundary: log and continue with the next row.
             log.error('解析错误,source="{}",url="{}",e="{}"'.format(
                 '购房网', comm_url, e))
예제 #5
0
 def get_city_info(self, city_dict):
     """Scrape 链家在线 deal records for every city in ``city_dict``.

     Args:
         city_dict: mapping of city name to that city's base URL; the deal
             index is fetched from ``<base URL>chengjiao/``.

     For each city the area links are read from the deal index, then pages
     ``pg1`` .. ``pg100`` of every area are fetched and each deal fragment is
     stored with ``insert_db()``.  A failure while handling one page is
     logged with that page's URL; a failure while handling the city index is
     logged with the city URL and the next city is processed.
     """
     flags = re.S | re.M
     for city in city_dict:
         city_url = city_dict[city] + 'chengjiao/'
         try:
             response = requests.get(city_url, headers=self.headers)
             html = response.text
             area_html = re.search(r'data-role="ershoufang".*?地铁', html,
                                   flags).group()
             # Area hrefs are appended to the city URL without its
             # "/chengjiao/" part.
             site_root = city_url.replace('/chengjiao/', '')
             for area_link in re.findall(r'<a.*?</a>', area_html, flags):
                 if 'ershoufang' in area_link:
                     continue
                 area_url = re.search(r'href="(.*?)"', area_link,
                                      flags).group(1)
                 area = re.search(r'<a.*?>(.*?)<', area_link,
                                  flags).group(1)
                 for page in range(1, 101):
                     page_url = site_root + area_url + 'pg' + str(page)
                     try:
                         result = requests.get(page_url,
                                               headers=self.headers)
                         fragments = re.findall(
                             r'class="info".*?</div></div></li>',
                             result.text, flags)
                         for fragment in fragments:
                             comm = self._parse_deal(fragment, city, area)
                             if comm is not None:
                                 comm.insert_db()
                     except Exception as e:
                         # Log the failing page URL rather than dumping the
                         # whole city index HTML into the log.
                         log.error(
                             '解析错误,source="{}",url="{}",e="{}"'.format(
                                 '链家在线', page_url, e))
         except Exception as e:
             log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                 '链家在线', city_url, e))

 def _parse_deal(self, fragment, city, area):
     """Build a ``Comm`` from one deal fragment.

     Returns ``None`` when the total price is masked with ``*`` (such deals
     are skipped).  Raises ``AttributeError`` if a mandatory field's pattern
     does not match and ``ValueError`` if the deal date is malformed.
     """
     flags = re.S | re.M
     comm = Comm('链家在线')
     comm.region = area.strip()
     comm.city = city.strip()
     comm.district_name = re.search(r'target="_blank">(.*?)<', fragment,
                                    flags).group(1).strip()
     comm.direction = re.search(r'class="houseIcon"></span>(.*?) \|',
                                fragment, flags).group(1).strip()

     fitment = re.search(r'class="houseIcon"></span>.*? \|(.*?)\| ',
                         fragment, flags)
     comm.fitment = fitment.group(1).strip() if fitment else None

     # Total floors: the first number inside the parentheses that follow the
     # position icon.
     height = re.search(r'class="positionIcon"></span>.*?\((.*?)\)',
                        fragment, flags)
     height_digits = re.search(r'(\d+)', height.group(1)) if height else None
     comm.height = int(height_digits.group(1)) if height_digits else None

     price_text = re.search(r"class='number'>(.*?)<", fragment,
                            flags).group(1).strip()
     if "*" in price_text:
         return None
     # Listed figure is scaled by 10,000 before storing.
     total_price = int(re.search(r'(\d+)', price_text).group(1)) * 10000
     comm.total_price = total_price

     # The title text is read as "<name> <room type> <area>平米".
     room_type = re.search(r'arget="_blank">.*? (.*?) ', fragment,
                           flags).group(1).strip()
     room = re.search(r'(\d)室', room_type)
     # A missing room count is stored as 0, a missing hall count as None.
     comm.room = int(room.group(1)) if room else 0
     hall = re.search(r'(\d)厅', room_type)
     comm.hall = int(hall.group(1)) if hall else None

     area_text = re.search(r'target="_blank">.*? .*? (.*?平米)', fragment,
                           flags).group(1).strip()
     area_text = area_text.replace('㎡', '').replace('平米', '')
     try:
         area_value = round(float(area_text), 2)
     except ValueError:
         area_value = None
     comm.area = area_value

     trade_date = re.search(r'dealDate">(.*?)<', fragment,
                            flags).group(1).strip()
     if trade_date:
         comm.trade_date = datetime.datetime.strptime(trade_date, "%Y.%m.%d")

     # Unit price = total price / area.  The previous code indexed the HTML
     # string like a dict, which always failed and left avg_price as None.
     comm.avg_price = int(total_price / area_value) if area_value else None
     return comm