import re

import requests
from bs4 import BeautifulSoup

# create_headers, logger, BaseSpider, proxys_src, POOL, get_ch_city,
# get_ch_area and getlnglat are provided by the project's own modules.


def get_xiaoqu_page(get_url):
    """
    Get the total number of listing pages to crawl.
    :param get_url: URL to crawl
    :return: total page count (0 if the request fails, 1 if parsing fails)
    """
    total_page = 0
    headers = create_headers()
    try:
        response = requests.get(get_url, timeout=10, headers=headers)
    except Exception as e:
        logger.error("Request failed: {}".format(repr(e)))
        return total_page
    html = response.content
    soup = BeautifulSoup(html, "lxml")
    # Read the total page count out of the pagination widget's data attribute
    try:
        page_box = soup.find_all('div', class_='page-box house-lst-page-box')[0]
        matches = re.search(r'.*"totalPage":(\d+),.*', str(page_box))
        total_page = int(matches.group(1))
    except Exception as e:
        # Fall back to a single page when the pagination widget is missing
        total_page = 1
        logger.warning("Failed to parse total page count: {}".format(repr(e)))
    return total_page
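
# A driver sketch for get_xiaoqu_page: fetch the page count once, then walk the
# numbered pages. The "pg{n}/" suffix is an assumption about how Lianjia/Ke
# paginates its listing URLs; adjust it if the real pattern differs.
def crawl_all_xiaoqu_pages(spider, base_url):
    total_page = get_xiaoqu_page(base_url)
    for page_num in range(1, total_page + 1):
        page_url = "{0}pg{1}/".format(base_url, page_num)  # assumed URL pattern
        spider.get_xiaoqu_info(page_url)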
def get_xiaoqu_info(self, get_url):
    """Crawl one xiaoqu listing page and store every community found on it."""
    headers = create_headers()
    try:
        BaseSpider.random_delay()
        response = requests.get(get_url, timeout=10, headers=headers)
    except Exception as e:
        logger.error("Request failed: {}".format(repr(e)))
        return
    html = response.text
    soup = BeautifulSoup(html, "lxml")
    # Panels that carry the xiaoqu (residential community) information
    house_elems = soup.find_all('li', class_="xiaoquListItem")
    for house_elem in house_elems:
        try:
            xiaoqu_detail_url = house_elem.find('div', class_='title').find('a')['href']
            name = house_elem.find('div', class_='title').find('a')['title']
            houseinfo = house_elem.find('div', class_='houseInfo').text
            positioninfo = house_elem.find('div', class_='positionInfo').text
            taglist = house_elem.find('div', class_='tagList').text
            price = house_elem.find('div', class_="xiaoquListItemPrice").text
            on_sale = house_elem.find('div', class_="xiaoquListItemSellCount").text
        except Exception as e:
            logger.error("Failed to parse listing element: {}".format(repr(e)))
            continue
        # Clean the extracted text: collapse newlines, strip padding characters
        name = str(name).strip()
        houseinfo = str(houseinfo).strip().replace("\n", ",")
        positioninfo = str(positioninfo).strip().replace("\n", ",").replace("\xa0", "").replace(" ", "")
        taglist = str(taglist).strip()
        price = str(price).strip().replace("\n", ",")
        on_sale = str(on_sale).strip().replace("\n", ",")
        # Resolve the community's detail record (and database id) first
        xiaoqu_id = get_info_spider(xiaoqu_detail_url, self.area, self.city, 'xiaoqu')
        if xiaoqu_id:
            # Insert into the database; empty fields are stored as NULL
            self.data_db([name or None,
                          houseinfo or None,
                          positioninfo or None,
                          taglist or None,
                          price or None,
                          on_sale or None,
                          xiaoqu_id])
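
# BaseSpider.random_delay() is called above but defined elsewhere in the
# project. A minimal sketch of what such a method likely does -- sleep a random
# interval between requests to avoid the site's anti-crawling checks. The
# 1-5 second range is an assumed value, not the project's actual setting.
import random
import time

def random_delay_sketch():
    # Pause 1-5 seconds (assumed range) before firing the next request
    time.sleep(random.randint(1, 5))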
def get_area_ershou_info(self, get_url):
    """Crawl second-hand housing (ershou) listings for a given district of the city."""
    headers = create_headers()
    try:
        BaseSpider.random_delay()
        response = requests.get(get_url, timeout=10, headers=headers)
    except Exception as e:
        logger.error("Request failed: {}".format(repr(e)))
        return
    html = response.content
    soup = BeautifulSoup(html, "lxml")
    # Panels that carry the listing information
    house_elements = soup.find_all('li', class_="clear")
    for house_elem in house_elements:
        try:
            xiaoqu_detail_url = house_elem.find('div', class_="positionInfo").find('a')['href']
            name = house_elem.find('div', class_='title').find('a')['title']
            positioninfo = house_elem.find('div', class_="positionInfo").text
            houseinfo = house_elem.find('div', class_="houseInfo").text
            followinfo = house_elem.find('div', class_="followInfo").text
            tag = house_elem.find('div', class_="tag").text
            priceinfo = house_elem.find('div', class_="priceInfo").text
        except Exception as e:
            logger.error("Failed to parse listing element: {}".format(repr(e)))
            continue
        # Clean the extracted text: collapse newlines, drop spaces
        name = str(name).strip()
        positioninfo = str(positioninfo).strip()
        houseinfo = str(houseinfo).strip().replace("\n", ",").replace(" ", "")
        followinfo = str(followinfo).strip().replace("\n", ",").replace(" ", "")
        tag = str(tag).strip().replace("\n", ",")
        priceinfo = str(priceinfo).strip().replace("\n\n", ",").replace("\n", ",")
        # Resolve the parent community's database id before saving the listing
        xiaoqu_id = get_info_spider(xiaoqu_detail_url, self.area, self.city, 'ershou')
        if xiaoqu_id:
            # Insert into the database; empty fields are stored as NULL
            self.data_db([name or None,
                          positioninfo or None,
                          houseinfo or None,
                          followinfo or None,
                          tag or None,
                          priceinfo or None,
                          xiaoqu_id])
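
# self.data_db(...) above is defined elsewhere in the project. A minimal
# sketch, assuming it writes one cleaned row through the same POOL connection
# pool used by get_info_spider below; the table name "ershou_listing_ke" and
# its column list are hypothetical placeholders, not the project's schema.
def data_db_sketch(row):
    coon = POOL.connection()
    cur = coon.cursor()
    try:
        cur.execute(
            """insert into ershou_listing_ke(
                   name, positioninfo, houseinfo, followinfo, tag, priceinfo, xiaoqu_id)
               VALUES (%s, %s, %s, %s, %s, %s, %s)""",
            row)
        coon.commit()
    except Exception as e:
        logger.error(repr(e))
        coon.rollback()
    finally:
        cur.close()
        coon.close()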
def spider_proxyip(num=10):
    """Scrape up to `num` free proxies from xicidaili.com into proxys_src."""
    try:
        url = 'http://www.xicidaili.com/nt/1'
        req = requests.get(url, headers=create_headers())
        source_code = req.content
        soup = BeautifulSoup(source_code, 'lxml')
        # Each table row after the header holds one proxy entry
        ips = soup.find_all('tr')
        for x in range(1, len(ips)):
            ip = ips[x]
            tds = ip.find_all("td")
            # Columns: td[1]=host, td[2]=port, td[5]=scheme (HTTP/HTTPS)
            scheme = tds[5].contents[0].lower()  # requests expects lowercase proxy keys
            proxy_host = "{0}://{1}:{2}".format(scheme, tds[1].contents[0], tds[2].contents[0])
            proxy_temp = {scheme: proxy_host}
            proxys_src.append(proxy_temp)
            if x >= num:
                break
    except Exception as e:
        logger.error("spider_proxyip exception: {}".format(repr(e)))
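
# Usage sketch for the proxies collected above: pick a random entry from
# proxys_src and hand it to requests. The {'http': 'http://ip:port'} dict shape
# matches what spider_proxyip builds; the helper itself is illustrative only.
import random

def get_with_random_proxy(url):
    proxy = random.choice(proxys_src) if proxys_src else None
    return requests.get(url, headers=create_headers(), proxies=proxy, timeout=10)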
def get_info_spider(url, area, city, houses_type):
    """
    Spider that fetches detail information for a single xiaoqu (community).
    :param url: detail page URL
    :param area: district the community belongs to
    :param city: city the community belongs to
    :param houses_type: 'xiaoqu' or 'ershou', used to resolve the area name
    :return: the community's database id, or None on failure
    """
    headers = create_headers()
    try:
        BaseSpider.random_delay()
        response = requests.get(url, timeout=10, headers=headers)
    except Exception as e:
        logger.error("Request failed: {}".format(repr(e)))
        return
    html = response.text
    soup = BeautifulSoup(html, "lxml")
    try:
        xiaoqu_title = soup.find('h1', class_='main')['title']
        # The detail page lists the attributes as a fixed-order set of info items
        xiaoquinfo = soup.find_all('div', class_='xiaoquInfoItem')
        building_type = xiaoquinfo[0].text
        property_expenses = xiaoquinfo[1].text
        property_company = xiaoquinfo[2].text
        developer = xiaoquinfo[3].text
        total_number_of_buildings = xiaoquinfo[4].text
        total_number_of_houses = xiaoquinfo[5].text
        nearby_stores = xiaoquinfo[6].text
    except Exception as e:
        logger.error(repr(e))
        return
    if xiaoqu_title:
        # Clean the attribute text once the community name is confirmed
        building_type = str(building_type).strip().replace("\n", ",")
        property_expenses = str(property_expenses).strip().replace("\n\n", ",").replace(" ", "")
        property_company = str(property_company).strip().replace("\n", ",")
        developer = str(developer).strip().replace("\n", ",")
        total_number_of_buildings = str(total_number_of_buildings).strip().replace("\n", ",")
        total_number_of_houses = str(total_number_of_houses).strip().replace("\n", ",")
        nearby_stores = str(nearby_stores).strip().replace("\n\n", ",")
        # Geocode the community: Chinese city + area + name -> latitude/longitude
        city = get_ch_city(city)
        area = get_ch_area(area, houses_type)
        if city and area:
            address = city[0] + area[0] + xiaoqu_title
            lat, lng = getlnglat(address, city[0])
            # Get a MySQL connection from the pool
            coon = POOL.connection()
            cur = coon.cursor()
            sql = """select * from xiaoqu_detail_ke where xiaoqu_title=%s"""
            cur.execute(sql, (xiaoqu_title,))
            result = cur.fetchone()
            if result:
                # Already stored: reuse the existing id
                xiaoqu_id = result[0]
            else:
                try:
                    cur.execute(
                        """
                        insert into xiaoqu_detail_ke(
                            xiaoqu_title, building_type, property_expenses,
                            property_company, developer, total_number_of_buildings,
                            total_number_of_houses, nearby_stores, lat, lng)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """,
                        [
                            xiaoqu_title,
                            building_type or '',
                            property_expenses or '',
                            property_company or '',
                            developer or '',
                            total_number_of_buildings or '',
                            total_number_of_houses or '',
                            nearby_stores or '',
                            lat or '',
                            lng or '',
                        ])
                    coon.commit()
                except Exception as e:
                    logger.error(repr(e))
                    coon.rollback()
                    cur.close()
                    coon.close()
                    return
                else:
                    # Re-query to fetch the id of the row just inserted
                    cur.execute(sql, (xiaoqu_title,))
                    result = cur.fetchone()
                    xiaoqu_id = result[0]
            cur.close()
            coon.close()
            return xiaoqu_id
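
# create_headers() is used by every function above but not defined in this
# section. A minimal sketch, assuming it returns request headers with a
# rotating User-Agent; the UA strings below are ordinary browser UAs chosen
# for illustration, and the Referer value is an assumption.
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15",
]

def create_headers_sketch():
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "http://www.ke.com",
    }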