示例#1
0
    def city_info(self, index_url, city):
        """Crawl the paginated result list of one city and store each listing.

        Page 1 is ``index_url`` itself; page ``i`` (2..100) is
        ``index_url + 'i3' + str(i) + '/'``. Paging stops early when a page
        reports a different city than ``city`` or a result count of ``'0'``.

        Args:
            index_url: URL of the first result page.
            city: City name, compared with the ``city = "..."`` value found in
                each page.

        Every parsed listing is saved through ``Comm.insert_db()``. Request and
        parse errors are logged and never propagate to the caller. A listing
        that cannot be parsed is logged and skipped without discarding the
        remaining listings of the same page.
        """
        flags = re.S | re.M
        for i in range(1, 101):
            index_url_ = index_url if i == 1 else index_url + 'i3' + str(i) + '/'
            try:
                html = requests.get(index_url_, headers=self.headers).text
            except Exception as e:
                log.error('请求错误,source="{}",url="{}",e="{}"'.format('房天下', index_url_, e))
                continue

            try:
                city_real = re.search(r'city = "(.*?)"', html, flags).group(1)
                if city != city_real:
                    break
                house_num = re.search(r'class="org">(.*?)</b>', html, flags).group(1)
                if house_num == '0':
                    break
            except Exception as e:
                log.error('解析错误,source="{}",html="{}",e="{}"'.format('房天下', html, e))
                continue

            comm_info_paper_list = re.findall(r'class="info rel floatr".*?</dd>', html, flags)
            for comm_info_paper in comm_info_paper_list:
                # Each listing is parsed in isolation so that one malformed
                # entry does not drop the rest of the page.
                try:
                    comm = Comm('房天下')
                    comm.city = city
                    title = re.search(r'<a.*?>(.*?)<', comm_info_paper, flags).group(1).strip()
                    comm.district_name = title

                    if '�' in title:
                        log.error('网页出现繁体字, url={}'.format(index_url_))
                        continue

                    comm.direction = re.search(r'class="mt18">(.*?)<', comm_info_paper, flags).group(1)
                    try:
                        comm.height = int(re.search(r'共(.*?)层', comm_info_paper, flags).group(1))
                    except Exception:
                        comm.height = None
                    comm.region = re.search(r'class="mt15">.*?<a.*?chengjiao.*?>(.*?)<',
                                            comm_info_paper, flags).group(1)

                    total_price = re.search(r'class="price">(.*?)<', comm_info_paper, flags).group(1)
                    if '*' in total_price:
                        # Masked price: nothing useful to store.
                        continue
                    total_price_value = int(total_price) * 10000
                    comm.total_price = total_price_value

                    # Room/hall counts and the area are read from the title text.
                    comm.room = int(re.search(r'(\d+)室', title, flags).group(1))
                    comm.hall = int(re.search(r'(\d+)厅', title, flags).group(1))
                    # Accept both "89平米" and "89.5平米".
                    area_match = re.search(r'(\d+(?:\.\d+)?)平米', title, flags)
                    area_value = float(area_match.group(1)) if area_match else None
                    comm.area = area_value

                    trade_date = re.search(r'class="time".*?>(.*?)<', comm_info_paper, flags).group(1)
                    comm.trade_date = datetime.datetime.strptime(trade_date, "%Y-%m-%d")

                    # Average price is only defined for a known, non-zero area.
                    comm.avg_price = int(total_price_value / area_value) if area_value else None
                    comm.insert_db()
                except Exception as e:
                    log.error('解析错误,source="{}",html="{}",e="{}"'.format('房天下', comm_info_paper, e))
示例#2
0
    def deal_price(self):
        """Copy every flagged document of ``collection`` into a ``Comm`` record.

        Only documents whose ``fj_flag`` equals 1 are processed. The total
        price is computed as unit price (``CJ_CJDJ``) times area (``CJ_JZMJ``).
        ``CJ_ZH`` and ``CJ_SHBW`` are optional; a non-integer ``CJ_CS`` is
        reported with ``print`` and the floor is left unset.

        The cursor is opened with ``no_cursor_timeout=True``, so it is closed
        explicitly even when processing a document raises.
        """
        cursor = collection.find(no_cursor_timeout=True)
        try:
            for data in cursor:
                if data.get('fj_flag') != 1:
                    continue

                unit_price = float(data['CJ_CJDJ'])
                area = float(data['CJ_JZMJ'])

                second_price = Comm(self.source)
                second_price.city = data['fj_city']
                second_price.direction = data['CJ_CX']
                second_price.avg_price = unit_price
                second_price.area = area
                second_price.trade_date = data['CJ_CJRQ']
                second_price.total_price = unit_price * area
                second_price.district_name = data['fj_name']
                if 'CJ_ZH' in data:
                    second_price.house_num = data['CJ_ZH']
                if 'CJ_SHBW' in data:
                    second_price.room_num = data['CJ_SHBW']
                try:
                    second_price.floor = int(data['CJ_CS'])
                except Exception as e:
                    print('楼层error', e)
                second_price.region = data['fj_region']
                second_price.insert_db()
        finally:
            cursor.close()
示例#3
0
    def new_deal_price(self):
        """Copy every flagged document of ``collection_new`` into a ``Comm`` record.

        Only documents whose ``fj_flag`` equals 1 are processed. The total
        price is computed as area (``JZMJ``) times unit price (``CJDJ``). Room
        and hall counts are taken from the character preceding ``室`` / ``厅``
        in the ``FX`` field and converted with ``check_room``; when that fails
        the problem is printed and the field is left unset.

        The cursor is opened with ``no_cursor_timeout=True``, so it is closed
        explicitly even when processing a document raises.
        """
        flags = re.S | re.M
        cursor = collection_new.find(no_cursor_timeout=True)
        try:
            for i in cursor:
                print(collection_new.database.client.address[0])
                if i.get('fj_flag') != 1:
                    continue

                unit_price = float(i["CJDJ"])

                deal_price = Comm(self.new_source)
                deal_price.city = i['fj_city']
                deal_price.region = i['fj_region']
                deal_price.district_name = i['fj_name']
                deal_price.avg_price = unit_price
                deal_price.trade_date = i['CJRQ']
                area = float(i['JZMJ'])
                deal_price.area = area
                deal_price.room_num = i['SH']
                deal_price.total_price = area * unit_price

                # Read FX once with .get(): indexing i['FX'] again inside the
                # except handlers would raise out of them when the key is absent.
                layout = i.get('FX')
                try:
                    room = re.search('(.)室', layout, flags).group(1)
                    deal_price.room = check_room(room)
                except Exception as e:
                    print('找不到室,FX={}, e={}'.format(layout, e))
                try:
                    hall = re.search('(.)厅', layout, flags).group(1)
                    deal_price.hall = check_room(hall)
                except Exception as e:
                    print('找不到厅,FX={}, e={}'.format(layout, e))
                deal_price.insert_db()
        finally:
            cursor.close()
示例#4
0
    def comm_info(self, comm_url_list, city_url):
        """Crawl the paginated listing pages of each community and store them.

        For every entry of ``comm_url_list`` the page URL is built by replacing
        ``'/esf/'`` in ``city_url`` with the entry and then ``'xq'`` with
        ``'fangjia'``. The number of pages is read from the "尾页" link
        (1 when absent); page ``n`` is ``<url without .html>/?n=<n>``.

        Args:
            comm_url_list: Iterable of URL fragments, one per community.
            city_url: City base URL containing ``'/esf/'``.

        Each listing is saved through ``Comm.insert_db()``; a listing that
        cannot be parsed is logged and skipped.
        """
        flags = re.S | re.M
        suffix = '.html'
        for comm_url in comm_url_list:
            url = city_url.replace('/esf/', comm_url)
            re_url = url.replace('xq', 'fangjia')
            res = requests.get(re_url, headers=self.headers)
            con = res.text
            co_name = re.search(r'wrap-head-name">(.*?)</div', con, flags).group(1).strip()

            page_match = re.search(r'(\d+)">尾页', con)
            page = int(page_match.group(1)) if page_match else 1

            # Remove the literal ".html" suffix only. str.rstrip('.html') would
            # strip any trailing run of the characters '.', 'h', 't', 'm', 'l'
            # and could eat part of the path.
            base_url = re_url[:-len(suffix)] if re_url.endswith(suffix) else re_url

            for i in range(1, page + 1):
                page_url = base_url + "/?n=" + str(i)
                co_res = requests.get(page_url, headers=self.headers)
                co_html = etree.HTML(co_res.text)
                city = co_html.xpath("//span[@class='change-city']/text()")[0]
                room_info_list = co_html.xpath("//div[@class='list-cont']/div")
                for room_info in room_info_list:
                    try:
                        room = Comm(source)
                        room.city = city
                        room.district_name = co_name

                        floor = room_info.xpath(
                            ".//div[@class='text']/p[2]/span[1]/text()")[0]
                        room.floor = int(re.search(r'\d+', floor).group(0))

                        trade_date = room_info.xpath(
                            ".//span[@class='cj-data-num']/text()")[0]
                        if trade_date:
                            room.trade_date = datetime.datetime.strptime(
                                trade_date, "%Y-%m-%d")

                        total_price = room_info.xpath(
                            ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                        )[0]
                        room.total_price = int(
                            re.search(r'(\d+)', total_price, flags).group(1)) * 10000

                        avg_price = room_info.xpath(
                            ".//span[@class='cj-data-num']/em/text()")[0]
                        room.avg_price = int(
                            re.search(r'(\d+)', avg_price, flags).group(1))

                        room.direction = room_info.xpath(
                            ".//div[@class='text']/p[2]/span[2]/text()")[0]

                        # The same text node carries both the region and the
                        # "建筑面积...平" size.
                        region_text = room_info.xpath(".//p[1]/text()")[1]
                        room.region = region_text
                        size = re.search(r'建筑面积(.*?)平', region_text).group(1)
                        if size:
                            room.area = round(float(size), 2)
                        room.insert_db()
                    except Exception as e:
                        log.error("{}解析房屋错误{}".format(page_url, e))
 def get_comm_info(self, comm_list, all_page_url):
     """Parse each HTML fragment of ``comm_list`` into a ``Comm`` and save it.

     Args:
         comm_list: Iterable of HTML snippets, one per listing.
         all_page_url: URL used only in the error log message.

     The city is fixed to '合肥' and the source to '中安房'. Optional pieces
     (layout counts, average price, fitment) fall back to a default when
     they cannot be extracted; any other failure is logged and the fragment
     is skipped.
     """
     flags = re.S | re.M

     def first_group(pattern, text):
         # Raises (AttributeError/TypeError) when there is no match or no text.
         return re.search(pattern, text, flags).group(1)

     # (attribute, pattern, value used when the count cannot be read)
     layout_fields = (
         ('room', '(\d)室', 0),
         ('hall', '(\d)厅', None),
         ('toilet', '(\d)卫', None),
     )

     for fragment in comm_list:
         try:
             record = Comm('中安房')
             record.city = '合肥'
             record.district_name = first_group('zaf-nowrap.*?>(.*?)<', fragment).strip()

             deal_day = first_group('zaf-fblue">(.*?)<', fragment).strip()
             if deal_day:
                 record.trade_date = datetime.datetime.strptime(deal_day, "%Y-%m-%d")

             price_text = first_group('list-right-data.*?<span.*?>(.*?)<', fragment).strip()
             record.total_price = int(first_group('(\d+)', price_text)) * 10000

             # First span: "<area>㎡ <layout> <average price>", space separated.
             info_parts = first_group('list-details-area.*?<span.*?>(.*?)<', fragment).strip().split(' ')
             area_text = info_parts[0].replace('㎡', '')
             if area_text:
                 record.area = round(float(area_text), 2)

             layout = info_parts[1] if len(info_parts) > 1 else None
             for attr, pattern, fallback in layout_fields:
                 try:
                     setattr(record, attr, int(first_group(pattern, layout)))
                 except Exception:
                     setattr(record, attr, fallback)

             try:
                 record.avg_price = int(first_group('(\d+)', info_parts[2]))
             except Exception:
                 record.avg_price = None

             # Second span: "<direction> <fitment>".
             detail_parts = first_group('list-details-area.*?<span.*?<span>(.*?)<', fragment).strip().split(' ')
             record.direction = detail_parts[0]
             record.fitment = detail_parts[1] if len(detail_parts) > 1 else None

             address = first_group('list-details-address1.*?<span>(.*?)<', fragment).strip()
             record.region = address.split(' ')[0].strip()
             record.insert_db()
         except Exception as e:
             log.error('解析错误,source={},url="{}",e="{}"'.format('中安房', all_page_url, e))
示例#6
0
def into_mongo(coll):
    """Convert every document of ``coll`` into a ``Comm`` record and insert it.

    Args:
        coll: Collection-like object whose ``find(no_cursor_timeout=True)``
            returns an iterable cursor of dict documents.

    A fresh ``Comm`` is built per document so that values from a previous
    document never leak into the next one. Missing city/region and invalid
    core fields are logged; the record is still passed to ``insert_db()``.
    The cursor is closed explicitly because it was opened without a timeout.
    """
    results = coll.find(no_cursor_timeout=True)
    try:
        for result in results:
            com = Comm('澜斯')

            # Some documents were observed without fj_city during testing.
            try:
                com.city = result['fj_city']
                com.region = result['fj_region']
            except Exception:
                log.error('城市或者区域没有')

            com.m_date = result['updatedate']          # last update date
            com.create_date = datetime.datetime.now()  # creation time
            com.fitment = result['newdiskdecoration']  # decoration
            com.floor = result['flevel']               # floor of the unit

            # Per the original author's note, a record whose core fields are
            # invalid is not inserted, so failures here are only logged.
            try:
                com.district_name = result['fj_name']  # community name
                com.avg_price = result['unitprice']    # unit price
                com.total_price = result['usd']        # total price
                com.area = result['acreage']           # building area
                # signingdate looks like "<date>T<time>"; only the date is kept.
                signing_day = result['signingdate'].split('T')[0]
                com.trade_date = datetime.datetime.strptime(signing_day, "%Y-%m-%d")
            except Exception as e:
                log.error(e)

            # Best effort: pull "<unit>号<room>" out of the free-text address.
            houseaddress = result['houseaddress']
            try:
                res = re.search(r'(\d+)号(\d+)', houseaddress)
                com.unit_num = res.group(1)  # unit number
                com.room_num = res.group(2)  # room number
            except Exception:
                print('无法匹配大盘单元号和室号,houseaddress={}'.format(houseaddress))

            # direction, room, hall, toilet, height and house_num are not
            # available from this data source and are left unset.
            com.insert_db()
    finally:
        results.close()
示例#7
0
    def start_crawler(self):
        """Walk list pages 1..345 and store the trade records of every building.

        Each list page is ``self.start_url + "cp" + str(i)``. For every
        building found, its numeric id is taken from the detail link and the
        trade history is requested from the ``GetTradeExchange`` endpoint
        (first 500 records). Every trade becomes a ``Comm`` saved through
        ``insert_db()``.

        The trade request is retried a bounded number of times; a building
        whose trades cannot be fetched is logged and skipped. A trade that
        cannot be parsed is logged and skipped.
        """
        max_attempts = 5
        trade_url = 'http://hangzhou.fangtoo.com/Building/GetTradeExchange/'
        for i in range(1, 346):
            url = self.start_url + "cp" + str(i)
            res = requests.get(url, headers=self.headers)
            html = etree.HTML(res.text)
            comm_info_list = html.xpath("//li//div[@class='fang-info ml20 z']")
            for comm_info in comm_info_list:
                comm_url = comm_info.xpath("./div[@class='title']/a/@href")[0]
                region = comm_info.xpath(".//a[@class='ml20']/text()")[0]
                bu_id = re.search(r'\d+', comm_url).group(0)
                data = {
                    "buildingId": bu_id,
                    'pageIndex': 1,
                    'pageSize': 500,
                }

                # Bounded retry: an endless "while True / except: continue"
                # would spin forever on a persistent failure and would also
                # swallow KeyboardInterrupt.
                deal_dict = None
                for _ in range(max_attempts):
                    try:
                        deal_res = requests.post(trade_url,
                                                 data=data,
                                                 headers=self.headers)
                        deal_dict = json.loads(deal_res.text)
                        break
                    except Exception as e:
                        log.error('请求失败, url={}, e={}'.format(trade_url, e))
                        time.sleep(1)
                if deal_dict is None:
                    continue

                for n in deal_dict['data']:
                    try:
                        co = Comm(source)
                        co.city = '杭州'
                        area = n['Area'].replace('㎡', '')
                        if area:
                            co.area = round(float(area), 2)
                        co.district_name = n['ExName']
                        co.total_price = int(
                            re.search(r'(\d+)', n['Price'],
                                      re.S | re.M).group(1))
                        trade_date = n['ExDate']
                        if trade_date:
                            # Only the calendar day is kept.
                            t = time.strptime(trade_date, "%Y/%m/%d %H:%M:%S")
                            co.trade_date = datetime.datetime(
                                t.tm_year, t.tm_mon, t.tm_mday)
                        co.region = region
                        co.insert_db()
                    except Exception as e:
                        log.error("解析错误{}".format(e))
示例#8
0
    def info_parse(self,ro_html,co_name,region,city_name):
        """Turn each ``<li>`` of the ``<ul class='pList zu'>`` list into a ``Comm``.

        Args:
            ro_html: Parsed HTML tree that is queried with XPath.
            co_name: Community name stored as ``district_name``.
            region: Region stored on every record.
            city_name: City stored on every record.

        Room/hall counts fall back to ``None`` when absent. Any other
        extraction failure is logged and the entry is skipped.
        """
        flags = re.S | re.M

        def first_int(pattern, text):
            # Raises when the pattern does not match.
            return int(re.search(pattern, text, flags).group(1))

        for entry in ro_html.xpath("//ul[@class='pList zu']/li"):
            try:
                record = Comm(source)
                record.city = city_name
                record.district_name = co_name
                record.region = region

                layout = entry.xpath(".//p[@class='sTit']/strong/text()")[0]
                try:
                    record.room = first_int('(\d)室', layout)
                except Exception:
                    record.room = None
                try:
                    record.hall = first_int('(\d)厅', layout)
                except Exception:
                    record.hall = None

                price_text = entry.xpath(".//div[@class='jiage']/strong/text()")[0]
                record.total_price = first_int('(\d+)', price_text) * 10000
                unit_price_text = entry.xpath(".//div[@class='jiage']/p/text()")[0]
                record.avg_price = first_int('(\d+)', unit_price_text)

                # The second <p> holds "... ·<area>平米 · <direction>".
                info = entry.xpath(".//div/p[2]/text()")[0]
                record.area = round(float(re.search('·(.*?)平米',info).group(1)), 2)
                record.direction = re.search('平米 · (.*)',info).group(1).strip()

                deal_day = entry.xpath(".//div/p[3]/text()")[0].strip()
                if deal_day:
                    record.trade_date = datetime.datetime.strptime(
                        deal_day, "成交日期:%Y-%m-%d")
                record.insert_db()
            except Exception as e:
                log.error("房间信息提取错误{}".format(e))
示例#9
0
    def room(self, co_list, city_name):
        """Crawl the listing pages of every community element and store the units.

        Args:
            co_list: Iterable of parsed community elements (queried with XPath).
            city_name: City stored on every record.

        For each community the detail page is fetched, the last tab link is
        followed, and the page count is read from "共N页". Every listing on
        every page becomes a ``Comm`` saved through ``insert_db()``.

        Communities whose header cannot be read or that expose no page count
        are skipped. Page requests are retried a bounded number of times and a
        page that still fails is logged and skipped. A listing that cannot be
        parsed is logged and skipped.
        """
        flags = re.S | re.M
        max_attempts = 5
        for co in co_list:
            try:
                co_name = co.xpath("./div[1]/a/text()")[0]
                co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
                region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
                detail = requests.get(co_url, headers=self.headers)
                html = etree.HTML(detail.text)
                room_url = "http:" + html.xpath("//div[@class='tab-toolbar pr']//li/a/@href")[-1]
                page_index = requests.get(room_url, headers=self.headers)
            except Exception as e:
                log.error('小区信息提取失败{}'.format(e))
                continue

            page_match = re.search(r'共(\d+)页', page_index.text)
            if not page_match:
                log.info('小区无相关数据')
                continue
            page_num = int(page_match.group(1))

            for i in range(1, page_num + 1):
                url = re.sub('#.*', 'n', room_url) + str(i)

                # Bounded retry instead of an endless bare-except loop.
                res = None
                for _ in range(max_attempts):
                    try:
                        res = requests.get(url, headers=self.headers)
                        break
                    except Exception as e:
                        log.error('请求失败, url={}, e={}'.format(url, e))
                        time.sleep(1)
                if res is None:
                    continue

                room_html = etree.HTML(res.text)
                for item in room_html.xpath("//div[@class='right-information']"):
                    try:
                        record = Comm(source)
                        record.district_name = co_name
                        record.city = city_name
                        record.region = region

                        room_type = item.xpath("./h3/span[2]/text()")[0]
                        try:
                            record.room = int(re.search(r'(\d)室', room_type, flags).group(1))
                        except Exception:
                            record.room = None
                        try:
                            record.hall = int(re.search(r'(\d)厅', room_type, flags).group(1))
                        except Exception:
                            record.hall = None

                        area = item.xpath("./h3/span[3]/text()")[0].replace('平米', '')
                        if area:
                            record.area = round(float(area), 2)

                        total_price = item.xpath(".//div[@class='price fs14 ']/em/text()")[0]
                        record.total_price = int(re.search(r'(\d+)', total_price, flags).group(1)) * 10000
                        avg_price = item.xpath(".//div[@class='size  fs14']/text()")[0]
                        record.avg_price = int(re.search(r'(\d+)', avg_price, flags).group(1))

                        # Fitment and direction are set together or not at all.
                        try:
                            record.fitment = item.xpath(".//div[@class='t1 fs14']/text()[3]")[0]
                            record.direction = item.xpath(".//div[@class='t1 fs14']/text()[2]")[0]
                        except Exception:
                            record.fitment = None
                            record.direction = None

                        # floor_info looks like "<floor>/<total>层".
                        floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
                        try:
                            floor = re.search(r'(.*?)/', floor_info).group(1)
                            record.floor = int(re.search(r'\d+', floor).group(0))
                        except Exception:
                            record.floor = None
                        try:
                            record.height = int(re.search(r'.*?/(\d+)层', floor_info).group(1))
                        except Exception:
                            record.height = None

                        trade_date = item.xpath(".//div[@class='date']/text()")[0]
                        if trade_date:
                            record.trade_date = datetime.datetime.strptime(trade_date, "%Y-%m-%d")
                        record.insert_db()
                    except Exception as e:
                        log.error('房屋信息提取失败{}'.format(e))
示例#10
0
    def start_crawler(self):
        """Crawl the building list pages and store each building's trade records.

        Pages ``1 .. self.get_all_page() - 1`` of
        ``http://www.taiwu.com/building/cp<i>/`` are fetched. For every
        ``<li>`` of the ``fang-list`` block the region and building id are
        extracted and the ``GetHouseExchange`` endpoint is queried with a
        hand-built multipart body (page 1, page size 5000). Each returned
        trade becomes a ``Comm`` saved through ``insert_db()``.

        Both requests are retried a bounded number of times. A page that cannot
        be fetched or has no ``fang-list`` block is skipped, as is a building
        whose response is not usable JSON.

        NOTE(review): ``range(1, page)`` never requests page number ``page``;
        confirm whether ``get_all_page()`` returns the last page or one past it.
        """
        max_attempts = 5
        flags = re.S | re.M
        source = '太屋网'
        city = '上海'
        detail_url = "http://www.taiwu.com/Building/GetHouseExchange/"
        headers = {
            'content-type':
            "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
            'Cache-Control': "no-cache",
        }

        page = self.get_all_page()
        for i in range(1, page):
            url = 'http://www.taiwu.com/building/cp' + str(i) + '/'

            res = None
            for _ in range(max_attempts):
                try:
                    candidate = requests.get(url)
                    if candidate.status_code == 200:
                        res = candidate
                        break
                except Exception as e:
                    print('请求出错', e)
                time.sleep(1)
            if res is None:
                continue

            list_match = re.search(r'<ul class="fang-list">.*?</ul>',
                                   res.content.decode(), flags)
            if list_match is None:
                print('请求出错', url)
                continue

            for k in re.findall(r'<li>.*?</li>', list_match.group(0), flags):
                area = re.search(
                    r'<div class="adds">.*?<a href="/building/.*?/">(.*?)</a>',
                    k, flags).group(1)  # region name
                building_id = re.search(r'<a href="/building/(.*?)/', k,
                                        flags).group(1)
                payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"buildingId\"\r\n\r\n" + building_id + "\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"pageIndex\"\r\n\r\n1\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"pageSize\"\r\n\r\n5000\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"

                # The status check must look at this POST's own response, not
                # at the list-page response fetched earlier.
                response = None
                for _ in range(max_attempts):
                    try:
                        response = requests.request("POST",
                                                    detail_url,
                                                    data=payload,
                                                    headers=headers)
                        if response.status_code == 200:
                            break
                    except Exception as e:
                        print('请求出错', e)
                    time.sleep(1)
                if response is None:
                    continue

                try:
                    data_list = response.json()['data']
                except Exception as e:
                    print(e)
                    continue

                for j in data_list:
                    c = Comm(source)
                    c.city = city
                    c.region = area
                    c.room = j['RoomCount']
                    c.hall = j['HollCount']
                    c.district_name = j['BuildingName']
                    c.area = j['BldArea']
                    # ExDate embeds a millisecond timestamp; keep the local day.
                    trade_date_ = int(re.search(r'(\d+)', j['ExDate']).group(1))
                    if trade_date_:
                        t = time.localtime(trade_date_ // 1000)
                        c.trade_date = datetime.datetime(
                            t.tm_year, t.tm_mon, t.tm_mday)
                    c.total_price = j['ExPrice']
                    c.insert_db()
示例#11
0
 def comm_info(self, comm_url_list, city_url):
     """Crawl the paginated listing pages of each community through proxies.

     For every entry of ``comm_url_list`` the page URL is built by replacing
     ``'/esf/'`` in ``city_url`` with the entry and then ``'xq'`` with
     ``'fangjia'``. The number of pages is read from the "尾页" link
     (1 when absent); page ``n`` is ``<url without .html>/?n=<n>``. Each
     request takes its proxy from ``next(p)``.

     Args:
         comm_url_list: Iterable of URL fragments, one per community.
         city_url: City base URL containing ``'/esf/'``.

     Failed requests are logged and skipped. Individual fields that cannot
     be parsed are stored as ``None``. Listings with a masked ("*") price,
     or without the region/area text, are logged and skipped.
     """
     flags = re.S | re.M
     suffix = '.html'
     for comm_url in comm_url_list:
         url = city_url.replace('/esf/', comm_url)
         re_url = url.replace('xq', 'fangjia')
         try:
             res = requests.get(url=re_url,
                                headers=self.headers,
                                proxies=next(p))
         except Exception as e:
             log.error('请求失败, source={}, url={}, e={}'.format(
                 '乐有家', re_url, e))
             continue
         con = res.text
         co_name = re.search(r'wrap-head-name">(.*?)</div', con,
                             flags).group(1).strip()

         page_match = re.search(r'(\d+)">尾页', con)
         page = int(page_match.group(1)) if page_match else 1

         # Remove the literal ".html" suffix only. str.rstrip('.html') would
         # strip any trailing run of the characters '.', 'h', 't', 'm', 'l'.
         base_url = re_url[:-len(suffix)] if re_url.endswith(suffix) else re_url

         for i in range(1, page + 1):
             page_url = base_url + "/?n=" + str(i)
             print(page_url)
             try:
                 co_res = requests.get(url=page_url,
                                       headers=self.headers,
                                       proxies=next(p))
             except Exception as e:
                 log.error('请求失败, source={}, url={}, e={}'.format(
                     '乐有家', page_url, e))
                 continue
             co_html = etree.HTML(co_res.text)
             city = co_html.xpath(
                 "//span[@class='change-city']/text()")[0].replace(
                     '\t', '').replace('[', '')
             room_info_list = co_html.xpath("//div[@class='list-cont']/div")
             for room_info in room_info_list:
                 room = Comm(source)
                 room.city = city                # city
                 room.district_name = co_name    # community name

                 # Floor and total height come from the same text node.
                 try:
                     floor_text = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[1]/text()")[0]
                 except Exception:
                     floor_text = None
                 try:
                     floor = re.search(r'(.*?)/', floor_text).group(1)
                     room.floor = int(re.search(r'\d+', floor).group(0))
                 except Exception:
                     room.floor = None
                 try:
                     room.height = int(
                         re.search(r'/(\d+)层', floor_text).group(1))
                 except Exception:
                     room.height = None

                 # Trade date
                 try:
                     trade_date = room_info.xpath(
                         ".//span[@class='cj-data-num']/text()")[0]
                     room.trade_date = datetime.datetime.strptime(
                         trade_date, "%Y-%m-%d")
                 except Exception:
                     room.trade_date = None

                 # Total price; a masked value makes the listing useless.
                 try:
                     total_price = room_info.xpath(
                         ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                     )[0]
                     if '*' in total_price:
                         log.error('source={}, 总价有问题 带*号'.format('乐有家'))
                         continue
                     room.total_price = int(
                         re.search(r'(\d+)', total_price,
                                   flags).group(1)) * 10000
                 except Exception:
                     room.total_price = None

                 # Average price
                 try:
                     avg_price = room_info.xpath(
                         ".//span[@class='cj-data-num']/em/text()")[0]
                     if '*' in avg_price:
                         log.error('source={}, 均价有问题 带*号'.format('乐有家'))
                         continue
                     room.avg_price = int(
                         re.search(r'(\d+)', avg_price, flags).group(1))
                 except Exception:
                     room.avg_price = None

                 # Direction
                 try:
                     room.direction = room_info.xpath(
                         ".//div[@class='text']/p[2]/span[2]/text()"
                     )[0].replace('朝', '')
                 except Exception:
                     room.direction = None

                 try:
                     region_area_info = room_info.xpath(
                         "./div[@class='text']/p[1]/text()")[1]
                 except Exception:
                     # Skip only this listing; returning here would abandon
                     # every remaining listing, page and community.
                     log.error('source={}, 缺少区域面积信息, url={}'.format(
                         '乐有家', page_url))
                     continue

                 # Region
                 try:
                     room.region = region_area_info.split(' ')[1]
                 except Exception:
                     room.region = None

                 # Area
                 try:
                     size = re.search(r'建筑面积(.*?)平',
                                      region_area_info).group(1)
                     if size:
                         room.area = round(float(size), 2)
                 except Exception:
                     room.area = None
                 room.insert_db()
示例#12
0
    def crawler(self, city_url, city):
        """Crawl every community of a city and store one record per table row.

        The last page number is read from the second ``a.down_page`` link of
        ``city_url``; list page ``i`` is ``city_url + "/PG" + str(i)``. Every
        ``//h1/a`` link on a list page is a community whose URL is
        ``city + href``.

        Args:
            city_url: URL of the city's first list page.
            city: Prefix that is prepended to each community href.

        A fresh ``Comm`` is built for each table row so that values of one row
        never leak into the next. The community request is retried a bounded
        number of times. Any failure while handling a community is logged and
        the crawl moves on to the next one.
        """
        flags = re.S | re.M
        max_attempts = 5
        res = requests.get(city_url, headers=self.headers)
        con = etree.HTML(res.text)
        last_page = con.xpath("//a[@class='down_page']/@href")[1]
        page_num = int(re.search(r'\d+', last_page).group(0))
        for i in range(1, page_num + 1):
            page_url = city_url + "/PG" + str(i)
            page_res = requests.get(page_url, headers=self.headers)
            page_con = etree.HTML(page_res.text)
            for temp_url in page_con.xpath("//h1/a/@href"):
                # Built before the try so the error log can always name it.
                comm_url = city + temp_url
                try:
                    # Bounded retry instead of an endless bare-except loop.
                    co_res = None
                    for _ in range(max_attempts):
                        try:
                            co_res = requests.get(comm_url,
                                                  headers=self.headers,
                                                  timeout=10)
                            break
                        except Exception as e:
                            log.error('请求失败, url={}, e={}'.format(comm_url, e))
                            time.sleep(1)
                    if co_res is None:
                        continue
                    time.sleep(2)

                    co_con = etree.HTML(co_res.text)
                    city_name = co_con.xpath("//div/a[@class='show']/text()")[0]
                    region = co_con.xpath("//section/p/a/text()")[-1]
                    district_name = co_con.xpath("//cite/span/text()")[0]
                    for tag in co_con.xpath("//table/tbody/tr"):
                        com = Comm(source)
                        com.city = city_name
                        com.region = region
                        com.district_name = district_name

                        area = tag.xpath("./td[2]/text()")[0].replace('㎡', '')
                        com.area = round(float(area), 2)
                        avg_price = tag.xpath("./td[3]/text()")[0]
                        com.avg_price = int(
                            re.search(r'(\d+)', avg_price, flags).group(1))
                        total_price = tag.xpath("./td/span/text()")[0]
                        com.total_price = int(
                            re.search(r'(\d+)', total_price, flags).group(1)) * 10000
                        trade_date = tag.xpath("./td/text()")[-2]
                        if trade_date:
                            com.trade_date = datetime.datetime.strptime(
                                trade_date, "%Y-%m-%d")

                        room_type = tag.xpath("./td//p/a/text()")[0]
                        try:
                            com.room = int(
                                re.search(r'(\d)室', room_type, flags).group(1))
                        except Exception:
                            com.room = None
                        try:
                            com.hall = int(
                                re.search(r'(\d)厅', room_type, flags).group(1))
                        except Exception:
                            com.hall = None

                        floor = tag.xpath("./td//p/span/text()")[0]
                        com.floor = int(re.search(r'(\d+)层', floor).group(1))
                        # Greedy group: a trailing lazy "(.*?)" always matches
                        # the empty string and would lose the direction.
                        com.direction = re.search(r'层 (.*)', floor).group(1).strip()

                        com.insert_db()
                except Exception as e:
                    log.error("{}小区信息提取错误{}".format(comm_url, e))
示例#13
0
 def get_page_url(self, page_url, city, area_):
     """Parse one deal-list page and insert a ``Comm('Q房网')`` per item.

     The page is fetched with ``self.headers`` and ``self.proxy``; every
     ``<li class=" clearfix">`` fragment is parsed on its own, so a
     fragment that fails to parse is logged and skipped without affecting
     the others.

     :param page_url: URL of the list page to fetch.
     :param city: city name stored (stripped) on each record.
     :param area_: region name stored (stripped) on each record.
     """
     response = requests.get(page_url,
                             headers=self.headers,
                             proxies=self.proxy)
     html = response.text
     comm_html_list = re.findall('<li class=" clearfix">.*?</li>', html,
                                 re.S | re.M)
     for item_html in comm_html_list:
         try:
             comm = Comm('Q房网')
             comm.city = city.strip()
             comm.region = area_.strip()
             comm.district_name = re.search(
                 'house-title">.*?<a.*?>(.*?)<', item_html,
                 re.S | re.M).group(1).strip()
             comm.direction = re.search(
                 'class="house-about clearfix".*?showKeyword">(.*?)<',
                 item_html, re.S | re.M).group(1).strip()

             # Height is optional: a missing or non-numeric value is
             # stored as None instead of failing the whole item.
             try:
                 comm.height = int(
                     re.search(
                         'class="house-about clearfix".*?showKeyword">.*?<span.*?<span>.*?/(.*?)<',
                         item_html, re.S | re.M).group(1).strip())
             except (AttributeError, ValueError):
                 comm.height = None

             total_price = re.search(
                 'class="show-price".*?span.*?>(.*?)<', item_html,
                 re.S | re.M).group(1).strip()
             # The page value is multiplied by 10000 before being stored.
             comm.total_price = int(total_price) * 10000

             avg_price = re.search('class="show-price".*?<p.*?>(.*?)<',
                                   item_html, re.S | re.M).group(1).strip()
             comm.avg_price = int(re.search(r'(\d+)', avg_price).group(1))

             trade_date = re.search(
                 'class="show-price concluded".*?span.*?>(.*?)<', item_html,
                 re.S | re.M).group(1).strip()
             if trade_date:
                 comm.trade_date = datetime.datetime.strptime(
                     trade_date, "%Y.%m.%d")

             # The title anchor holds space-separated fields; the second
             # one is the room layout and the third the floor area.
             room_type = re.search('house-title">.*?<a.*?>.*? (.*?) ',
                                   item_html, re.S | re.M).group(1).strip()
             room_match = re.search(r'(\d)室', room_type)
             comm.room = int(room_match.group(1)) if room_match else None
             hall_match = re.search(r'(\d)厅', room_type)
             comm.hall = int(hall_match.group(1)) if hall_match else None

             area = re.search('house-title">.*?<a.*?>.*? .*? (.*?平米)',
                              item_html, re.S | re.M).group(1).strip()
             area = area.replace('㎡', '').replace('平米', '')
             if area:
                 comm.area = round(float(area), 2)
             comm.insert_db()
         except Exception as e:
             log.error('解析错误,source="{}",html="{}",e="{}"'.format(
                 'Q房网', item_html, e))
示例#14
0
 def get_comm_detail(self, comm_url, region, city):
     """Fetch one community page and insert every deal row listed on it.

     A single ``Comm('购房网')`` instance is created up front; for each
     ``<li>`` inside ``<ul class="lscjlist">`` its per-deal fields are
     overwritten and ``insert_db()`` is called.  A request failure is
     logged and the method returns without inserting anything.

     :param comm_url: URL of the community page; also stored as ``url``.
     :param region: region name, stored stripped.
     :param city: city name, stored as given.
     """
     comm = Comm('购房网')
     comm.url = comm_url
     comm.region = region.strip()
     comm.city = city
     try:
         response = requests.get(url=comm_url,
                                 headers=self.headers,
                                 proxies=next(p))
     except Exception as e:
         log.error('请求错误,source="{}",url="{}",e="{}"'.format(
             '购房网', comm_url, e))
         return
     html = response.text
     comm.district_name = re.search('title fl.*?<h1>(.*?)</h1>', html,
                                    re.S | re.M).group(1).strip()
     comm_info_html = re.search('<ul class="lscjlist">.*?</ul>', html,
                                re.S | re.M).group()
     comm_info_list = re.findall('<li>(.*?)</li>', comm_info_html,
                                 re.S | re.M)
     if not comm_info_list:
         log.info('source={}, 此小区没有数据,url="{}"'.format('购房网', comm_url))

     # Each row is a sequence of <span> cells; the patterns below select
     # the n-th cell by skipping n-1 opening tags.
     for row_html in comm_info_list:
         # Cell 1: deal date.
         trade_date = re.search('<span>(.*?)</span>', row_html,
                                re.S | re.M).group(1).strip()
         if trade_date:
             comm.trade_date = datetime.datetime.strptime(
                 trade_date, "%Y-%m-%d")

         # Cell 2: layout.  Room and hall are parsed independently so a
         # missing hall count no longer discards a valid room count.
         room_type = re.search('<span>.*?<span>(.*?)</span>', row_html,
                               re.S | re.M).group(1).strip()
         room_match = re.search(r'(\d)室', room_type)
         comm.room = int(room_match.group(1)) if room_match else None
         hall_match = re.search(r'(\d)厅', room_type)
         comm.hall = int(hall_match.group(1)) if hall_match else None

         # Cell 3: floor area with its unit suffix removed.
         area = re.search('<span>.*?<span>.*?<span>(.*?)</span>', row_html,
                          re.S | re.M).group(1).strip()
         area = area.replace('㎡', '').replace('平', '')
         if area:
             comm.area = round(float(area), 2)

         # Cell 4: text after the "/" is taken as the height; optional.
         height_match = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?/(.*?)</span>', row_html,
             re.S | re.M)
         height_digits = (re.search(r'(\d+)', height_match.group(1).strip())
                          if height_match else None)
         comm.height = (int(height_digits.group(1))
                        if height_digits else None)

         # Cell 5: fitment.
         comm.fitment = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>(.*?)</span>',
             row_html, re.S | re.M).group(1).strip()
         # Cell 6: direction.
         comm.direction = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>(.*?)</span>',
             row_html, re.S | re.M).group(1).strip()
         # Cell 7: average price.
         avg_price = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>(.*?)</span>',
             row_html, re.S | re.M).group(1)
         comm.avg_price = int(re.search(r'(\d+)', avg_price).group(1))
         # Cell 8: total price, multiplied by 10000 before being stored.
         total_price = re.search(
             '<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span>.*?<span.*?>(.*?)</span>',
             row_html, re.S | re.M).group(1)
         comm.total_price = int(
             re.search(r'(\d+)', total_price).group(1)) * 10000
         comm.insert_db()
示例#15
0
    def comm_detail(self, comm_url_list, city):
        """Fetch deal records for each community link and insert them.

        The first entry of ``comm_url_list`` is skipped.  For every other
        entry the community page (for name, city and region) and the
        ``GetDealRecord`` JSON endpoint are requested, and one ``Comm`` is
        inserted per element of the JSON ``"result"`` list.  Errors are
        logged per record and per community so one failure does not stop
        the rest.

        :param comm_url_list: community paths containing ``xq-<code>``.
        :param city: URL in which ``/xiaoqu/`` is replaced by each path.
        """
        for comm_url in comm_url_list[1:]:
            try:
                com_url = city.replace('/xiaoqu/', comm_url)
                code = re.search('xq-(.*)', comm_url).group(1).upper()
                comm_detail_url = 'http://sh.centanet.com/apipost/GetDealRecord?estateCode=' + code + '&posttype=S&pageindex=1&pagesize=10000'
                com_res = requests.get(com_url, headers=self.headers)
                res = requests.get(comm_detail_url, headers=self.headers)
                time.sleep(2)
                html = etree.HTML(com_res.text)
                data_dict = json.loads(res.text)
                district_name = html.xpath("//div/h3/text()")[0]
                city_name = html.xpath("//div[@class='idx-city']/text()")[0]
                region = html.xpath("//a[@class='f000']/text()")[0]

                for data in data_dict["result"]:
                    try:
                        co = Comm(source)
                        co.district_name = district_name.strip()
                        co.region = region
                        co.city = city_name

                        # Layout is best-effort: a failure is logged and
                        # the record is still inserted.
                        try:
                            room_type = data["houseType"]
                            co.room = int(
                                re.search(r'(\d)室', room_type).group(1))
                            co.hall = int(
                                re.search(r'(\d)厅', room_type).group(1))
                        except Exception as e:
                            # The original message had no placeholder, so
                            # the exception text never reached the log.
                            log.error('roomtype为空{}'.format(e))

                        area = data['areaSize'].replace('平', '')
                        if area:
                            co.area = round(float(area), 2)
                        co.direction = data['direction']

                        # Test the raw field: once '20' is prepended the
                        # string is never empty, and an empty dealTime
                        # used to make strptime fail and drop the record.
                        deal_time = data['dealTime']
                        if deal_time:
                            co.trade_date = datetime.datetime.strptime(
                                '20' + deal_time, "%Y-%m-%d")

                        total_price = data['dealPrice']
                        # Multiplied by 10000 before being stored.
                        co.total_price = int(
                            re.search(r'(\d+)', total_price).group(1)) * 10000

                        avg_price = data['unitPrice']
                        try:
                            co.avg_price = int(
                                re.search(r'(\d+)', avg_price).group(1))
                        except (AttributeError, TypeError):
                            co.avg_price = None
                        co.insert_db()
                    except Exception as e:
                        log.error('解析失败{}'.format(e))
            except Exception as e:
                log.error("小区成交信息错误{}".format(e))
示例#16
0
 def get_city_info(self, city_dict):
     """Crawl deal listings for every city and insert one record per deal.

     For each city the ``chengjiao/`` index page is fetched and the
     district links are taken from the markup between
     ``data-role="ershoufang"`` and ``地铁`` (anchors containing
     ``ershoufang`` are skipped).  Pages ``pg1`` to ``pg100`` of each
     district are then fetched and every ``class="info"`` fragment is
     stored as a ``Comm('链家在线')``.

     A failure while fetching or parsing a district page is logged and
     that page is abandoned; a failure on the city index page is logged
     and the city is skipped.

     :param city_dict: mapping of city name to that city's base URL, to
         which ``chengjiao/`` is appended.
     """
     for city in city_dict:
         city_url = city_dict[city] + 'chengjiao/'
         try:
             response = requests.get(city_url, headers=self.headers)
             html = response.text
             area_html = re.search('data-role="ershoufang".*?地铁', html,
                                   re.S | re.M).group()
             area_list_str = re.findall('<a.*?</a>', area_html, re.S | re.M)
             for area_i in area_list_str:
                 if 'ershoufang' in area_i:
                     continue
                 area_url = re.search('href="(.*?)"', area_i,
                                      re.S | re.M).group(1)
                 area = re.search('<a.*?>(.*?)<', area_i,
                                  re.S | re.M).group(1)
                 for page_no in range(1, 101):
                     city_url_ = city_url.replace(
                         '/chengjiao/', '') + area_url + 'pg' + str(page_no)
                     # Pre-bound so the error log can show the page being
                     # parsed even when the request itself fails.
                     content = ''
                     try:
                         result = requests.get(city_url_,
                                               headers=self.headers)
                         content = result.text
                         comm_str_list = re.findall(
                             'class="info".*?</div></div></li>', content,
                             re.S | re.M)
                         for item_html in comm_str_list:
                             comm = Comm('链家在线')
                             comm.region = area.strip()
                             comm.city = city.strip()
                             comm.district_name = re.search(
                                 'target="_blank">(.*?)<', item_html,
                                 re.S | re.M).group(1).strip()
                             comm.direction = re.search(
                                 r'class="houseIcon"></span>(.*?) \|',
                                 item_html, re.S | re.M).group(1).strip()

                             fitment_match = re.search(
                                 r'class="houseIcon"></span>.*? \|(.*?)\| ',
                                 item_html, re.S | re.M)
                             comm.fitment = (
                                 fitment_match.group(1).strip()
                                 if fitment_match else None)

                             # Height: first number inside the brackets
                             # after the position icon; optional.
                             position_match = re.search(
                                 r'class="positionIcon"></span>.*?\((.*?)\)',
                                 item_html, re.S | re.M)
                             height_digits = (
                                 re.search(r'(\d+)',
                                           position_match.group(1).strip())
                                 if position_match else None)
                             comm.height = (
                                 int(height_digits.group(1))
                                 if height_digits else None)

                             total_price = re.search(
                                 "class='number'>(.*?)<", item_html,
                                 re.S | re.M).group(1).strip()
                             # Items whose price contains "*" are skipped.
                             if "*" in total_price:
                                 continue
                             # Multiplied by 10000 before being stored.
                             total_value = int(
                                 re.search(r'(\d+)',
                                           total_price).group(1)) * 10000
                             comm.total_price = total_value

                             room_type = re.search(
                                 'arget="_blank">.*? (.*?) ', item_html,
                                 re.S | re.M).group(1).strip()
                             room_match = re.search(r'(\d)室', room_type)
                             # Missing room count is stored as 0 here, as
                             # in the original code.
                             comm.room = (int(room_match.group(1))
                                          if room_match else 0)
                             hall_match = re.search(r'(\d)厅', room_type)
                             comm.hall = (int(hall_match.group(1))
                                          if hall_match else None)

                             area_text = re.search(
                                 'target="_blank">.*? .*? (.*?平米)',
                                 item_html, re.S | re.M).group(1).strip()
                             area_value = None
                             if area_text:
                                 area_text = area_text.replace(
                                     '㎡', '').replace('平米', '')
                                 try:
                                     area_value = round(float(area_text), 2)
                                 except ValueError:
                                     area_value = None
                                 comm.area = area_value

                             trade_date = re.search(
                                 'dealDate">(.*?)<', item_html,
                                 re.S | re.M).group(1).strip()
                             if trade_date:
                                 comm.trade_date = (
                                     datetime.datetime.strptime(
                                         trade_date, "%Y.%m.%d"))

                             # Derive the unit price from the parsed
                             # values.  The original indexed the HTML
                             # string with str keys, which always raised
                             # and left avg_price as None.
                             if area_value:
                                 comm.avg_price = int(
                                     total_value / area_value)
                             else:
                                 comm.avg_price = None
                             comm.insert_db()
                     except Exception as e:
                         log.error(
                             '解析错误,source="{}",html="{}",e="{}"'.format(
                                 '链家在线', content, e))
         except Exception as e:
             log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                 '链家在线', city_url, e))