import random
import re
import threading
import time

import requests
from bs4 import BeautifulSoup


def write_to_csv(city, area, random_delay):
    """
    Write the crawled listings to the file ershoufang-<city>-<area>.csv.
    :param city: city name (Lianjia subdomain key)
    :param area: district name (pinyin slug)
    :param random_delay: whether to sleep between page requests
    """
    city_ch = cities[city]
    area_ch = get_city_area(city)[area]
    print('Now writing {0}|{1}'.format(city_ch, area_ch), 'to csv')
    with open('ershoufang-{0}-{1}.csv'.format(city_ch, area_ch), 'w',
              encoding='utf-8') as csvfile:
        for info in spider(city, area, random_delay):
            print("Now writing:", '|'.join(info[0:5]))
            csvfile.write('|'.join(info))
            csvfile.write("\n")
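# The functions in this module rely on helpers defined elsewhere in the
# project: `cities` (a dict mapping Lianjia subdomain keys to Chinese
# city names), `get_city_area`, `total_page`, and `create_headers`. As a
# reading aid, a minimal sketch of the first two might look like the
# commented code below; the keys and values shown are illustrative
# assumptions, not the project's real data.
#
# cities = {'bj': '北京', 'sh': '上海'}
#
# def get_city_area(city):
#     """Map district pinyin slugs to Chinese district names."""
#     areas = {
#         'bj': {'haidian': '海淀', 'chaoyang': '朝阳'},
#         'sh': {'pudong': '浦东', 'xuhui': '徐汇'},
#     }
#     return areas[city]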
def spider(city, area, random_delay):
    """
    Yield the housing listings for the given city and district.
    :param city: target city
    :param area: target district
    :return: generator of listing records; each record is a list of the
             form [city, district, street, community, title, description,
             total price, unit price, url]
    """
    # Find the total number of result pages: int
    total_pg = total_page(city, area)
    # total_pg = 1  # uncomment to limit a test run to a single page
    # Crawl the listings page by page
    for pgnum in range(1, total_pg + 1):
        if random_delay:
            # Random pause between pages to avoid hammering the site
            time.sleep(random.randint(5, 12))
        print("now crawling:", cities[city], get_city_area(city)[area])
        print("current page/total page:", pgnum, '/', total_pg)
        for item in pg_ana_re(city, area, pgnum):
            yield item
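# `total_page` is defined elsewhere in the project. A plausible sketch,
# assuming Lianjia's listing pages expose the page count as a JSON
# `page-data` attribute on the pagination <div> (an assumption about the
# site's markup, not a confirmed detail of this project):
#
# import json
#
# def total_page(city, area):
#     page = 'http://{0}.lianjia.com/ershoufang/{1}/'.format(city, area)
#     response = requests.get(page, timeout=10, headers=create_headers())
#     soup = BeautifulSoup(response.content, "lxml")
#     page_box = soup.find('div', class_="page-box house-lst-page-box")
#     return json.loads(page_box['page-data'])['totalPage']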
def pg_ana(city, area, pgnum):
    """
    Parse a single listings page with BeautifulSoup
    (regex-based alternative: pg_ana_re).
    :return: list of listing records
    """
    start_time = time.time()
    data_list = []
    page = 'http://{0}.lianjia.com/ershoufang/{1}/pg{2}'.format(
        city, area, pgnum)
    print(page)
    headers = create_headers()  # get randomized request headers
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content
    soup = BeautifulSoup(html, "lxml")  # parse the page with bs4
    house_elements = soup.find_all('li', class_="clear")
    for house_elem in house_elements:
        # Locate the fields of interest
        price = house_elem.find('div', class_="totalPrice")
        unit_price = house_elem.find('div', class_="unitPrice")
        name = house_elem.find('div', class_='title')
        addr = house_elem.find('div', class_='positionInfo')
        url = re.search(r'.*?href="(.*?)".*', str(addr))
        desc = house_elem.find('div', class_="houseInfo")
        # Clean up the extracted data
        price = price.text.strip()
        unit_price = unit_price.text.strip()
        name = name.text.replace("\n", "")
        addr = addr.text.strip().replace(" ", "")
        url = url.group(1)
        desc = desc.text.replace("\n", "").strip()
        # Note: the field order differs from pg_ana_re's records
        data = [
            cities.get(city), get_city_area(city)[area],
            addr, name, price, unit_price, desc, url
        ]
        data_list.append(data)
    end_time = time.time()
    print("For this page %s , used %s s" % (page, end_time - start_time))
    return data_list
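# `create_headers` is also defined elsewhere; judging by the comment
# above, it returns randomized request headers so consecutive requests
# look less uniform. A minimal sketch (the agent strings below are
# placeholders, not the project's actual list):
#
# USER_AGENTS = [
#     'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
#     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)',
# ]
#
# def create_headers():
#     return {
#         'User-Agent': random.choice(USER_AGENTS),
#         'Referer': 'http://www.lianjia.com/',
#     }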
def main(city, random_delay):
    """
    Entry point: crawl every district of the given city.
    """
    area_list = list(get_city_area(city).keys())
    print(area_list)
    t_list = []
    # Crawl each district in its own thread
    for area in area_list:
        t = threading.Thread(target=write_to_csv,
                             args=(city, area, random_delay))
        t_list.append(t)
    for t in t_list:
        t.start()
    # Block the main thread until every district thread finishes
    for t in t_list:
        t.join()
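# An alternative fan-out, sketched here for comparison: a thread pool
# caps the number of simultaneous requests instead of starting one
# thread per district at once. `main_pooled` and max_workers=4 are
# hypothetical additions, not part of the original project.
def main_pooled(city, random_delay, max_workers=4):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(write_to_csv, city, area, random_delay)
                   for area in get_city_area(city)]
        for f in futures:
            f.result()  # re-raise any exception from a worker thread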
def pg_ana_re(city, area, pgnum):
    """
    Parse a single listings page with a regular expression.
    :return: list of listing records
    """
    start_time = time.time()
    data_list = []
    page = 'http://{0}.lianjia.com/ershoufang/{1}/pg{2}'.format(
        city, area, pgnum)
    print(page)
    headers = create_headers()  # get randomized request headers
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content.decode('utf-8')
    # Parse the page with a regular expression instead of bs4
    pattern = re.compile(
        r'<div class="info clear"><div class="title"><a class="" href="(.*?)".*?>(.*?)</a>'
        r'.*?data-el="region">(.*?)</a>.*?target="_blank">(.*?)</a>'
        r'.*?<div class="houseInfo"><span .*?></span>(.*?)</div>'
        r'.*?<div class="totalPrice"><span>(.*?)</span>万</div>'
        r'.*?单价(.*?)元/平米</span>',
        re.S)
    for url, title, xiaoqu, jiedao, miaosu, price, unitprice in \
            re.findall(pattern, html):
        format_data = [
            cities.get(city), get_city_area(city)[area],
            jiedao, xiaoqu, title, miaosu, price, unitprice, url
        ]
        data_list.append(format_data)
    end_time = time.time()
    print('For this page %s, used %s s' % (page, end_time - start_time))
    return data_list
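# Example invocation; assumes 'bj' is a key present in `cities` and that
# random_delay=True is wanted to throttle requests.
if __name__ == '__main__':
    main('bj', random_delay=True)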