Example #1
def write_to_csv(city, area, random_delay):
    """
    :param city: city name
    :param area: area name
    :param random_delay: whether to insert a random delay between page requests
    :return: writes the crawled data to ershoufang-city-area.csv
    """
    city_ch = cities[city]
    area_ch = get_city_area(city)[area]
    print('Now writing {0}|{1} to csv'.format(city_ch, area_ch))
    with open('ershoufang-{0}-{1}.csv'.format(city_ch, area_ch),
              'w', encoding='utf-8') as csvfile:  # utf-8 for the Chinese text
        for info in spider(city, area, random_delay):
            print("Now wrting:", '|'.join(info[0:5]))
            csvfile.write('|'.join(info))
            csvfile.write("\n")
Example #2
def spider(city, area, random_delay):
    """
    返回指定城市,指定区域的房产列表信息
    :param city: 指定城市
    :param area: 指定区域
    :return: 房产信息列表,列表格式为[{city:XXX, area:XXX, title:XXX, url:XXX, ...} ]
    """
    # determine the total number of pages for this city/area
    total_pg = total_page(city, area)
    # total_pg = 1  # uncomment to limit a test run to a single page
    # crawl the listings page by page
    for pgnum in range(1, total_pg + 1):
        if random_delay:
            # sleep 5-12 s between pages to reduce the chance of being blocked
            time.sleep(random.randint(5, 12))
        print("now crawling:", cities[city], get_city_area(city)[area])
        print("current page/total page:", pgnum, '/', total_pg)
        for item in pg_ana_re(city, area, pgnum):
            yield item
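
total_page is defined elsewhere as well. On Lianjia listing pages the page count is commonly read from the page-data attribute of the pagination div; a sketch under that assumption, reusing create_headers from these examples:

import json


def total_page(city, area):
    """Return how many result pages the city/area listing has.

    Assumes the pagination div carries a page-data attribute like
    '{"totalPage":100,"curPage":1}'; adjust the selector if the markup differs.
    """
    page = 'http://{0}.lianjia.com/ershoufang/{1}/'.format(city, area)
    response = requests.get(page, timeout=10, headers=create_headers())
    soup = BeautifulSoup(response.content, 'lxml')
    page_box = soup.find('div', class_='house-lst-page-box')
    if page_box is None:
        return 1  # no pagination widget found: assume a single page
    return json.loads(page_box['page-data'])['totalPage']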
Example #3
def pg_ana(city, area, pgnum):
    """
    处理单个页面的信息
    return:信息列表
    """
    start_time = time.time()
    data_list = []
    page = 'http://{0}.lianjia.com/ershoufang/{1}/pg{2}'.format(
        city, area, pgnum)
    print(page)
    # build randomized request headers to look less like a bot
    headers = create_headers()
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content
    soup = BeautifulSoup(html, "lxml")
    # 使用bs4解析页面
    house_elements = soup.find_all('li', class_="clear")
    for house_elem in house_elements:
        # locate the raw element for each field
        price = house_elem.find('div', class_="totalPrice")
        unit_price = house_elem.find('div', class_="unitPrice")
        name = house_elem.find('div', class_='title')
        addr = house_elem.find('div', class_='positionInfo')
        # the listing URL sits in the positionInfo block's href attribute
        url = re.search(r'.*?href="(.*?)".*', str(addr))
        desc = house_elem.find('div', class_="houseInfo")
        # clean up the extracted text
        price = price.text.strip()
        unit_price = unit_price.text.strip()
        name = name.text.replace("\n", "")
        addr = addr.text.strip().replace(" ", "")
        url = url.group(1)
        desc = desc.text.replace("\n", "").strip()
        data = [
            cities.get(city),
            get_city_area(city)[area], addr, name, price, unit_price, desc, url
        ]
        data_list.append(data)
    end_time = time.time()
    print("For this page %s , used %s s" % (page, end_time - start_time))
    return data_list
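
create_headers is also defined elsewhere. A minimal sketch, assuming it simply rotates the User-Agent over a small pool (the strings below are placeholders, not the project's actual list):

USER_AGENTS = [
    # illustrative pool; replace with your own list
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/119.0 Safari/537.36',
]


def create_headers():
    """Return request headers with a randomly chosen User-Agent."""
    return {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': 'http://www.lianjia.com',
    }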
Example #4
def main(city, random_delay):
    """
    主程序
    :return:主程序
    """

    area_list = list(get_city_area(city).keys())
    print(area_list)
    t_list = []
    # spawn one crawler thread per area
    for area in area_list:
        t = threading.Thread(target=write_to_csv,
                             args=(city, area, random_delay))
        t_list.append(t)
    for t in t_list:
        t.start()
    for t in t_list:
        # wait for every area thread to finish
        t.join()
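
A possible entry point tying the pieces together; the city code 'bj' is only an example:

if __name__ == '__main__':
    # crawl every area of one city, with polite random delays between pages
    main('bj', random_delay=True)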
Example #5
def pg_ana_re(city, area, pgnum):
    """
    处理单个页面的信息
    return:信息列表
    """
    start_time = time.time()
    data_list = []
    page = 'http://{0}.lianjia.com/ershoufang/{1}/pg{2}'.format(
        city, area, pgnum)
    print(page)
    # build randomized request headers to look less like a bot
    headers = create_headers()
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content.decode('utf-8')

    # parse the page with a single compiled regular expression
    pattern = re.compile(
        r'<div class="info clear"><div class="title"><a class="" href="(.*?)".*?>(.*?)</a>.*?data-el="region">(.*?)</a>.*?target="_blank">(.*?)</a>.*?<div class="houseInfo"><span .*?></span>(.*?)</div>.*?<div class="totalPrice"><span>(.*?)</span>万</div>.*?单价(.*?)元/平米</span>',
        re.S)
    data = re.findall(pattern, html)
    for i in data:
        # unpack: url, title, complex (xiaoqu), street (jiedao),
        # description (miaosu), total price (万), unit price (元/平米)
        url, title, xiaoqu, jiedao, miaosu, price, unitprice = i
        format_data = [
            cities.get(city),
            get_city_area(city)[area], jiedao, xiaoqu, title, miaosu, price,
            unitprice, url
        ]
        data_list.append(format_data)
    end_time = time.time()
    print('For this page %s, used %.2f s' % (page, end_time - start_time))
    return data_list
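
pg_ana_re is an alternative to pg_ana from Example #3: same inputs and a similar record list, but it trades the BeautifulSoup DOM walk for one compiled regex, which is faster yet more brittle when Lianjia changes its markup. A quick way to spot-check it on a single page (the city and area slugs are examples):

# print the first few records from page 1
for row in pg_ana_re('bj', 'chaoyang', 1)[:3]:
    print('|'.join(row))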