示例#1
0
    def deal_price(self):
        """Store every flagged document of ``collection`` as a ``Comm`` record.

        Walks all documents of the module-level ``collection`` and handles
        only those whose ``fj_flag`` equals 1.  Each of them is mapped field
        by field onto a ``Comm(self.source)`` and written with
        ``insert_db()``.  ``total_price`` is computed as ``CJ_CJDJ`` times
        ``CJ_JZMJ``.

        ``CJ_ZH`` and ``CJ_SHBW`` are optional; an unusable ``CJ_CS`` is
        reported on stdout and the record is stored without a floor.  Any
        other missing key, or a non-numeric price/area, raises and aborts
        the run.
        """
        # A cursor opened with no_cursor_timeout=True is not reaped by the
        # server, so it must be closed explicitly, also when a record raises.
        cursor = collection.find(no_cursor_timeout=True)
        try:
            for data in cursor:
                if 'fj_flag' not in data or data['fj_flag'] != 1:
                    continue

                second_price = Comm(self.source)
                second_price.city = data['fj_city']
                second_price.direction = data['CJ_CX']
                avg_price = float(data['CJ_CJDJ'])
                area = float(data['CJ_JZMJ'])
                second_price.avg_price = avg_price
                second_price.area = area
                second_price.trade_date = data['CJ_CJRQ']
                second_price.total_price = avg_price * area
                second_price.district_name = data['fj_name']

                if 'CJ_ZH' in data:
                    second_price.house_num = data['CJ_ZH']
                if 'CJ_SHBW' in data:
                    second_price.room_num = data['CJ_SHBW']

                try:
                    second_price.floor = int(data['CJ_CS'])
                except (KeyError, TypeError, ValueError) as e:
                    print('楼层error', e)

                second_price.region = data['fj_region']
                second_price.insert_db()
        finally:
            cursor.close()
示例#2
0
    def comm_info(self, comm_url_list, city_url):
        """Scrape the ``fangjia`` pages of each community and store the rows.

        For every entry of ``comm_url_list`` the page URL is built by
        substituting the entry for ``/esf/`` in ``city_url`` and replacing
        ``xq`` with ``fangjia``.  Every result page (``/?n=<page>``) is
        fetched and each element under ``div.list-cont`` is parsed into a
        ``Comm(source)`` and written with ``insert_db()``.  A listing that
        cannot be parsed is logged and skipped.

        :param comm_url_list: iterable of community URL fragments.
        :param city_url: city base URL containing ``/esf/``.
        """
        suffix = '.html'
        for comm_url in comm_url_list:
            url = city_url.replace('/esf/', comm_url)
            re_url = url.replace('xq', 'fangjia')
            res = requests.get(re_url, headers=self.headers)
            con = res.text
            co_name = re.search(r'wrap-head-name">(.*?)</div', con,
                                re.S | re.M).group(1)
            co_name = co_name.strip()

            # The number in front of the "尾页" link is the page count; a
            # page without that link is treated as a single page.
            last_page = re.search(r'(\d+)">尾页', con)
            page = int(last_page.group(1)) if last_page else 1

            # str.rstrip('.html') removes any trailing run of the characters
            # '.', 'h', 't', 'm', 'l' and can eat into the path itself, so
            # only the literal extension is dropped here.
            if re_url.endswith(suffix):
                base_url = re_url[:-len(suffix)]
            else:
                base_url = re_url

            for i in range(1, page + 1):
                page_url = base_url + "/?n=" + str(i)
                co_res = requests.get(page_url, headers=self.headers)
                co_html = etree.HTML(co_res.text)
                city = co_html.xpath("//span[@class='change-city']/text()")[0]
                room_info_list = co_html.xpath("//div[@class='list-cont']/div")
                for room_info in room_info_list:
                    try:
                        room = Comm(source)
                        room.city = city
                        room.district_name = co_name

                        floor = room_info.xpath(
                            ".//div[@class='text']/p[2]/span[1]/text()")[0]
                        room.floor = int(re.search(r'\d+', floor).group(0))

                        trade_date = room_info.xpath(
                            ".//span[@class='cj-data-num']/text()")[0]
                        if trade_date:
                            room.trade_date = datetime.datetime.strptime(
                                trade_date, "%Y-%m-%d")

                        total_price = room_info.xpath(
                            ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                        )[0]
                        room.total_price = int(
                            re.search(r'(\d+)', total_price).group(1)) * 10000

                        avg_price = room_info.xpath(
                            ".//span[@class='cj-data-num']/em/text()")[0]
                        room.avg_price = int(
                            re.search(r'(\d+)', avg_price).group(1))

                        room.direction = room_info.xpath(
                            ".//div[@class='text']/p[2]/span[2]/text()")[0]

                        # The same text node supplies both the region value
                        # and the "建筑面积…平" area figure.
                        region_text = room_info.xpath(".//p[1]/text()")[1]
                        room.region = region_text
                        size = re.search(r'建筑面积(.*?)平',
                                         region_text).group(1)
                        if size:
                            room.area = round(float(size), 2)

                        room.insert_db()
                    except Exception as e:
                        # Per-listing boundary: one bad row must not stop
                        # the rest of the page.
                        log.error("{}解析房屋错误{}".format(page_url, e))
示例#3
0
def into_mongo(coll):
    """Copy every document of ``coll`` into the database as a ``Comm`` record.

    Each document yielded by ``coll.find()`` is mapped onto a fresh
    ``Comm('澜斯')`` and written with ``insert_db()``.  Missing city/region
    keys and failures while filling the name/price/area/date fields are
    logged and the record is still passed to ``insert_db()``.  The keys
    ``updatedate``, ``newdiskdecoration``, ``flevel`` and ``houseaddress``
    are mandatory; their absence raises ``KeyError``.

    :param coll: collection object whose ``find()`` returns a closable
        cursor of dict-like documents.
    """
    # A cursor opened with no_cursor_timeout=True is not reaped by the
    # server, so it must be closed explicitly, also when a document raises.
    cursor = coll.find(no_cursor_timeout=True)
    try:
        for result in cursor:
            # One Comm per document: with a shared instance, a field that is
            # missing from this document would silently keep the value set
            # by the previous one.
            com = Comm('澜斯')

            # Guarded because some documents were seen without fj_city
            # during testing.
            try:
                com.city = result['fj_city']  # city
                com.region = result['fj_region']  # region
            except Exception:
                log.error('城市或者区域没有')

            com.m_date = result['updatedate']  # last update date
            com.create_date = datetime.datetime.now()  # creation time
            com.fitment = result['newdiskdecoration']  # decoration
            com.floor = result['flevel']  # floor

            # Guarded because, per the original author, a record whose
            # values here are unsuitable is not inserted by insert_db().
            try:
                com.district_name = result['fj_name']  # community name
                com.avg_price = result['unitprice']  # unit price
                com.total_price = result['usd']  # total price
                com.area = result['acreage']  # area
                # Only the part before 'T' of signingdate is parsed.
                com.trade_date = datetime.datetime.strptime(
                    result['signingdate'].split('T')[0], "%Y-%m-%d")
            except Exception as e:
                log.error(e)

            # Best-effort extraction of "<unit>号<room>" from the address.
            houseaddress = result['houseaddress']
            try:
                res = re.search(r'(\d+)号(\d+)', houseaddress)
                com.unit_num = res.group(1)  # unit number
                com.room_num = res.group(2)  # room number
            except Exception:
                print('无法匹配大盘单元号和室号,houseaddress={}'.format(
                    houseaddress))

            # Fields left unset because the source data has no counterpart:
            # direction, room, hall, toilet, height, house_num.

            com.insert_db()
    finally:
        cursor.close()
示例#4
0
    def room(self, co_list, city_name):
        """Scrape the listing pages of each community and store every row.

        For each element of ``co_list`` the community detail page is fetched,
        the last link of its tab toolbar is followed, and every result page
        (count taken from "共N页") is parsed.  Each
        ``div.right-information`` element becomes a ``Comm(source)`` written
        with ``insert_db()``.

        A community whose pages cannot be resolved is logged and skipped, a
        result page that still fails after a few attempts is logged and
        skipped, and a listing that cannot be parsed is logged and skipped.

        :param co_list: iterable of lxml elements, one per community.
        :param city_name: value stored as ``city`` on every record.
        """
        max_attempts = 5
        for co in co_list:
            try:
                co_name = co.xpath("./div[1]/a/text()")[0]
                co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
                region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
                detail = requests.get(co_url, headers=self.headers)
                html = etree.HTML(detail.text)
                room_url = "http:" + html.xpath(
                    "//div[@class='tab-toolbar pr']//li/a/@href")[-1]
                page_index = requests.get(room_url, headers=self.headers)
            except Exception as e:
                # `Exception` (not a bare except) so Ctrl-C still works.
                log.error('小区信息请求失败{}'.format(e))
                continue

            page_match = re.search(r'共(\d+)页', page_index.text)
            if page_match is None:
                log.info('小区无相关数据')
                continue
            page_num = int(page_match.group(1))

            # Everything from '#' on is replaced by 'n'; the page number is
            # appended to that base.
            page_base = re.sub('#.*', 'n', room_url)
            for i in range(1, page_num + 1):
                url = page_base + str(i)

                # Bounded retry: an unbounded loop would spin forever on a
                # permanently failing URL.
                res = None
                for _ in range(max_attempts):
                    try:
                        res = requests.get(url, headers=self.headers)
                        break
                    except Exception as e:
                        log.error('请求失败, url={}, e={}'.format(url, e))
                        time.sleep(1)
                if res is None:
                    continue

                room_html = etree.HTML(res.text)
                room_list = room_html.xpath("//div[@class='right-information']")
                for item in room_list:
                    try:
                        room = Comm(source)
                        room.district_name = co_name
                        room.city = city_name
                        room.region = region

                        room_type = item.xpath("./h3/span[2]/text()")[0]
                        rooms = re.search(r'(\d)室', room_type)
                        room.room = int(rooms.group(1)) if rooms else None
                        halls = re.search(r'(\d)厅', room_type)
                        room.hall = int(halls.group(1)) if halls else None

                        size = item.xpath("./h3/span[3]/text()")[0]
                        area = size.replace('平米', '')
                        if area:
                            room.area = round(float(area), 2)

                        total_price = item.xpath(
                            ".//div[@class='price fs14 ']/em/text()")[0]
                        room.total_price = int(
                            re.search(r'(\d+)', total_price).group(1)) * 10000
                        avg_price = item.xpath(
                            ".//div[@class='size  fs14']/text()")[0]
                        room.avg_price = int(
                            re.search(r'(\d+)', avg_price).group(1))

                        try:
                            room.fitment = item.xpath(
                                ".//div[@class='t1 fs14']/text()[3]")[0]
                            room.direction = item.xpath(
                                ".//div[@class='t1 fs14']/text()[2]")[0]
                        except IndexError:
                            room.fitment = None
                            room.direction = None

                        # floor_info holds "<floor>/<total>层"; either half
                        # may be missing or non-numeric.
                        floor_info = item.xpath(
                            ".//div[@class='fs14']/text()[1]")[0]
                        try:
                            floor = re.search(r'(.*?)/', floor_info).group(1)
                            room.floor = int(re.search(r'\d+', floor).group(0))
                        except AttributeError:
                            room.floor = None
                        try:
                            room.height = int(
                                re.search(r'.*?/(\d+)层', floor_info).group(1))
                        except AttributeError:
                            room.height = None

                        trade_date = item.xpath(
                            ".//div[@class='date']/text()")[0]
                        if trade_date:
                            room.trade_date = datetime.datetime.strptime(
                                trade_date, "%Y-%m-%d")

                        room.insert_db()
                    except Exception as e:
                        # Per-listing boundary: one bad row must not stop
                        # the rest of the page.
                        log.error('房屋信息提取失败{}'.format(e))
示例#5
0
 def comm_info(self, comm_url_list, city_url):
     """Scrape the ``fangjia`` pages of each community through a proxy.

     For every entry of ``comm_url_list`` the page URL is built by
     substituting the entry for ``/esf/`` in ``city_url`` and replacing
     ``xq`` with ``fangjia``.  Every result page (``/?n=<page>``) is fetched
     with ``next(p)`` as proxy, and each element under ``div.list-cont`` is
     parsed into a ``Comm(source)`` and written with ``insert_db()``.

     Failed requests are logged and skipped.  A field that cannot be parsed
     is stored as ``None``.  A listing whose total or average price contains
     ``*``, or that lacks the region/area text, is skipped.

     :param comm_url_list: iterable of community URL fragments.
     :param city_url: city base URL containing ``/esf/``.
     """
     suffix = '.html'
     for comm_url in comm_url_list:
         url = city_url.replace('/esf/', comm_url)
         re_url = url.replace('xq', 'fangjia')
         try:
             res = requests.get(url=re_url,
                                headers=self.headers,
                                proxies=next(p))
         except Exception as e:
             log.error('请求失败, source={}, url={}, e={}'.format(
                 '乐有家', re_url, e))
             continue
         con = res.text
         co_name = re.search(r'wrap-head-name">(.*?)</div', con,
                             re.S | re.M).group(1)
         co_name = co_name.strip()

         # The number in front of the "尾页" link is the page count; a page
         # without that link is treated as a single page.
         last_page = re.search(r'(\d+)">尾页', con)
         page = int(last_page.group(1)) if last_page else 1

         # str.rstrip('.html') removes any trailing run of the characters
         # '.', 'h', 't', 'm', 'l' and can eat into the path itself, so only
         # the literal extension is dropped here.
         if re_url.endswith(suffix):
             base_url = re_url[:-len(suffix)]
         else:
             base_url = re_url

         for i in range(1, page + 1):
             page_url = base_url + "/?n=" + str(i)
             print(page_url)
             try:
                 co_res = requests.get(url=page_url,
                                       headers=self.headers,
                                       proxies=next(p))
             except Exception as e:
                 log.error('请求失败, source={}, url={}, e={}'.format(
                     '乐有家', page_url, e))
                 continue
             co_html = etree.HTML(co_res.text)
             city = co_html.xpath(
                 "//span[@class='change-city']/text()")[0].replace(
                     '\t', '').replace('[', '')
             room_info_list = co_html.xpath("//div[@class='list-cont']/div")
             for room_info in room_info_list:
                 room = Comm(source)
                 # city
                 room.city = city
                 # community name
                 room.district_name = co_name
                 try:
                     # floor: digits in the part before '/'
                     floor = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[1]/text()")[0]
                     floor = re.search(r'(.*?)/', floor).group(1)
                     room.floor = int(re.search(r'\d+', floor).group(0))
                 except (IndexError, AttributeError):
                     room.floor = None
                 try:
                     # total floors: digits between '/' and '层'
                     height = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[1]/text()")[0]
                     room.height = int(
                         re.search(r'/(\d+)层', height).group(1))
                 except (IndexError, AttributeError):
                     room.height = None
                 try:
                     # trade date
                     trade_date = room_info.xpath(
                         ".//span[@class='cj-data-num']/text()")[0]
                     room.trade_date = datetime.datetime.strptime(
                         trade_date, "%Y-%m-%d")
                 except (IndexError, ValueError):
                     room.trade_date = None
                 try:
                     # total price
                     total_price = room_info.xpath(
                         ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                     )[0]
                     if '*' in total_price:
                         log.error('source={}, 总价有问题 带*号'.format('乐有家'))
                         continue
                     room.total_price = int(
                         re.search(r'(\d+)', total_price).group(1)) * 10000
                 except (IndexError, AttributeError):
                     room.total_price = None
                 try:
                     # average price
                     avg_price = room_info.xpath(
                         ".//span[@class='cj-data-num']/em/text()")[0]
                     if '*' in avg_price:
                         log.error('source={}, 均价有问题 带*号'.format('乐有家'))
                         continue
                     room.avg_price = int(
                         re.search(r'(\d+)', avg_price).group(1))
                 except (IndexError, AttributeError):
                     room.avg_price = None
                 try:
                     # direction, without the leading '朝'
                     room.direction = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[2]/text()"
                     )[0].replace('朝', '')
                 except IndexError:
                     room.direction = None
                 try:
                     region_area_info = room_info.xpath(
                         "./div[@class='text']/p[1]/text()")[1]
                 except IndexError:
                     # Skip only this listing; returning here would drop
                     # every remaining listing, page and community.
                     continue
                 try:
                     # region: second space-separated token
                     room.region = region_area_info.split(' ')[1]
                 except IndexError:
                     room.region = None
                 try:
                     # area: number between "建筑面积" and "平"
                     size = re.search(r'建筑面积(.*?)平',
                                      region_area_info).group(1)
                     if size:
                         room.area = round(float(size), 2)
                 except (AttributeError, ValueError):
                     room.area = None
                 room.insert_db()
示例#6
0
    def crawler(self, city_url, city):
        """Walk all list pages of a city and store every community table row.

        The page count is read from the second ``a.down_page`` link of
        ``city_url``.  For every list page (``<city_url>/PG<n>``) each
        ``h1/a`` link is fetched, and every ``table/tbody/tr`` row of that
        page becomes a ``Comm(source)`` written with ``insert_db()``.

        A page that still fails after a few attempts is logged and skipped.
        Any parsing error inside a community is logged and ends the
        processing of that community's remaining rows.

        :param city_url: URL of the first list page of the city.
        :param city: prefix prepended to each community link.
        """
        max_attempts = 5
        res = requests.get(city_url, headers=self.headers)
        con = etree.HTML(res.text)
        last_page = con.xpath("//a[@class='down_page']/@href")[1]
        page_num = int(re.search(r'\d+', last_page).group(0))
        for i in range(1, page_num + 1):
            page_url = city_url + "/PG" + str(i)
            page_res = requests.get(page_url, headers=self.headers)
            page_con = etree.HTML(page_res.text)
            for temp_url in page_con.xpath("//h1/a/@href"):
                # Built before the try so the error log can always use it.
                comm_url = city + temp_url
                try:
                    # Bounded retry: an unbounded loop would spin forever on
                    # a permanently failing URL.
                    co_res = None
                    for _ in range(max_attempts):
                        try:
                            co_res = requests.get(comm_url,
                                                  headers=self.headers,
                                                  timeout=10)
                            break
                        except Exception as e:
                            log.error('请求失败, url={}, e={}'.format(
                                comm_url, e))
                    if co_res is None:
                        continue
                    time.sleep(2)

                    co_con = etree.HTML(co_res.text)
                    city_name = co_con.xpath(
                        "//div/a[@class='show']/text()")[0]
                    region = co_con.xpath("//section/p/a/text()")[-1]
                    district_name = co_con.xpath("//cite/span/text()")[0]

                    for tag in co_con.xpath("//table/tbody/tr"):
                        # One Comm per row: with a shared instance a row
                        # with an empty date would inherit the trade_date of
                        # the previous row.
                        com = Comm(source)
                        com.city = city_name
                        com.region = region
                        com.district_name = district_name

                        size = tag.xpath("./td[2]/text()")[0]
                        com.area = round(float(size.replace('㎡', '')), 2)

                        avg_price = tag.xpath("./td[3]/text()")[0]
                        com.avg_price = int(
                            re.search(r'(\d+)', avg_price).group(1))
                        total_price = tag.xpath("./td/span/text()")[0]
                        com.total_price = int(
                            re.search(r'(\d+)', total_price).group(1)) * 10000

                        trade_date = tag.xpath("./td/text()")[-2]
                        if trade_date:
                            com.trade_date = datetime.datetime.strptime(
                                trade_date, "%Y-%m-%d")

                        room_type = tag.xpath("./td//p/a/text()")[0]
                        rooms = re.search(r'(\d)室', room_type)
                        com.room = int(rooms.group(1)) if rooms else None
                        halls = re.search(r'(\d)厅', room_type)
                        com.hall = int(halls.group(1)) if halls else None

                        floor = tag.xpath("./td//p/span/text()")[0]
                        com.floor = int(re.search(r'(\d+)层', floor).group(1))
                        # A trailing lazy group '(.*?)' always matches the
                        # empty string; capture the rest of the text.
                        # NOTE(review): assumes everything after "层 " is the
                        # direction; confirm against the page markup.
                        com.direction = re.search(
                            r'层 (.*)', floor).group(1).strip()

                        com.insert_db()
                except Exception as e:
                    log.error("{}小区信息提取错误{}".format(comm_url, e))