Example #1
import re

import requests
from bs4 import BeautifulSoup

# create_headers() and logger are assumed to be project-level helpers.


def get_xiaoqu_page(get_url):
    """
    Get the total number of pages to crawl.
    :param get_url: URL to crawl
    :return: total page count
    """
    total_page = 0
    headers = create_headers()
    try:
        response = requests.get(get_url, timeout=10, headers=headers)
    except Exception as e:
        logger.error("Request failed: {}".format(repr(e)))
        return total_page
    html = response.content
    soup = BeautifulSoup(html, "lxml")
    # Extract the total page count from the pagination box
    try:
        page_box = soup.find_all('div',
                                 class_='page-box house-lst-page-box')[0]
        matches = re.search(r'.*"totalPage":(\d+),.*', str(page_box))
        total_page = int(matches.group(1))
    except Exception as e:
        total_page = 1
        logger.warning("Failed to parse total page count: {}".format(repr(e)))
    return total_page
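
A quick way to exercise get_xiaoqu_page is to fetch the page count once and then walk the list pages. A minimal sketch; the /pg{n}/ suffix is an assumption about Lianjia-style list URLs, and base_url is a placeholder:

base_url = 'https://bj.lianjia.com/xiaoqu/'  # placeholder list URL
total_page = get_xiaoqu_page(base_url)
for page in range(1, total_page + 1):
    # Assumed paging scheme: list pages addressed with a /pg{n}/ suffix.
    page_url = '{}pg{}/'.format(base_url, page)
    print(page_url)  # or hand it to a per-page parser such as get_xiaoqu_info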
Example #2
    def get_xiaoqu_info(self, get_url):
        """
        Crawl one list page and extract the xiaoqu entries on it.
        """
        headers = create_headers()
        try:
            BaseSpider.random_delay()
            response = requests.get(get_url, timeout=10, headers=headers)
        except Exception as e:
            logger.error("Request failed: {}".format(repr(e)))
            return

        html = response.text
        soup = BeautifulSoup(html, "lxml")

        # Find the panels that contain xiaoqu (residential community) info
        house_elems = soup.find_all('li', class_="xiaoquListItem")
        for house_elem in house_elems:
            try:
                xiaoqu_detail_url = house_elem.find(
                    'div', class_='title').find('a')['href']
                name = house_elem.find('div',
                                       class_='title').find('a')['title']
                houseinfo = house_elem.find('div', class_='houseInfo').text
                positioninfo = house_elem.find('div',
                                               class_='positionInfo').text
                taglist = house_elem.find('div', class_='tagList').text
                price = house_elem.find('div',
                                        class_="xiaoquListItemPrice").text
                on_sale = house_elem.find(
                    'div', class_="xiaoquListItemSellCount").text
            except Exception as e:
                logger.error("解析程序出错!!! {}".format(repr(e)))
                continue
            # Clean up the extracted fields
            name = str(name).strip()
            houseinfo = str(houseinfo).strip().replace("\n", ",")
            positioninfo = str(positioninfo).strip().replace(
                "\n", ",").replace("\xa0", "").replace(" ", "")
            taglist = str(taglist).strip()
            price = str(price).strip().replace("\n", ",")
            on_sale = str(on_sale).strip().replace("\n", ",")

            xiaoqu_id = get_info_spider(xiaoqu_detail_url, self.area,
                                        self.city, 'xiaoqu')
            if xiaoqu_id:
                # Insert into the db; (None, x)[bool(x)] stores x when it is
                # non-empty and None otherwise
                self.data_db([(None, name)[bool(name)],
                              (None, houseinfo)[bool(houseinfo)],
                              (None, positioninfo)[bool(positioninfo)],
                              (None, taglist)[bool(taglist)],
                              (None, price)[bool(price)],
                              (None, on_sale)[bool(on_sale)], xiaoqu_id])
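
The (None, x)[bool(x)] construct above indexes a two-element tuple with a boolean: bool(x) is 0 for an empty string and 1 otherwise, so the expression yields None for empty fields and the field itself for non-empty ones. A standalone illustration:

name = ''
print((None, name)[bool(name)])   # None, since bool('') is False (index 0)
name = 'some xiaoqu'
print((None, name)[bool(name)])   # 'some xiaoqu' (index 1)
# For strings this is equivalent to the more idiomatic `name or None`.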
Example #3
    def get_area_ershou_info(self, get_url):
        """
        通过爬取页面获得城市指定版块的二手房信息
        """
        headers = create_headers()
        try:
            BaseSpider.random_delay()
            response = requests.get(get_url, timeout=10, headers=headers)
        except Exception as e:
            logger.error("Have a Error {}".format(repr(e)))
            return
        html = response.content
        soup = BeautifulSoup(html, "lxml")
        # Find the panels that contain the listing info
        house_elements = soup.find_all('li', class_="clear")
        for house_elem in house_elements:
            try:
                xiaoqu_detail_url = house_elem.find(
                    'div', class_="positionInfo").find('a')['href']
                name = house_elem.find('div',
                                       class_='title').find('a')['title']
                positioninfo = house_elem.find('div',
                                               class_="positionInfo").text
                houseinfo = house_elem.find('div', class_="houseInfo").text
                followinfo = house_elem.find('div', class_="followInfo").text
                tag = house_elem.find('div', class_="tag").text
                priceinfo = house_elem.find('div', class_="priceInfo").text
            except Exception as e:
                logger.error("解析程序出错!!! {}".format(repr(e)))
                continue
            # Clean up the extracted fields
            name = str(name).strip()
            positioninfo = str(positioninfo).strip()
            houseinfo = str(houseinfo).strip().replace("\n",
                                                       ",").replace(" ", "")
            followinfo = str(followinfo).strip().replace("\n",
                                                         ",").replace(" ", "")
            tag = str(tag).strip().replace("\n", ",")
            priceinfo = str(priceinfo).strip().replace("\n\n",
                                                       ",").replace("\n", ",")

            xiaoqu_id = get_info_spider(xiaoqu_detail_url, self.area,
                                        self.city, 'ershou')
            if xiaoqu_id:
                # Insert into the db, mapping empty fields to None as in
                # example #2
                self.data_db([(None, name)[bool(name)],
                              (None, positioninfo)[bool(positioninfo)],
                              (None, houseinfo)[bool(houseinfo)],
                              (None, followinfo)[bool(followinfo)],
                              (None, tag)[bool(tag)],
                              (None, priceinfo)[bool(priceinfo)], xiaoqu_id])
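
Examples #2 and #3 repeat the same strip/replace chains on every field; they could be collapsed into one helper. A minimal sketch (clean_text is a hypothetical name, not part of the project):

def clean_text(text, drop_spaces=False):
    """Collapse the repeated strip/replace cleanup used above."""
    text = str(text).strip().replace("\n\n", ",").replace("\n", ",")
    text = text.replace("\xa0", "")
    if drop_spaces:
        text = text.replace(" ", "")
    return text

# e.g. houseinfo = clean_text(houseinfo, drop_spaces=True)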
Example #4
def spider_proxyip(num=10):
    """Scrape up to num proxies from xicidaili into the module-level
    proxys_src list (assumed to be defined elsewhere in the module)."""
    try:
        url = 'http://www.xicidaili.com/nt/1'
        req = requests.get(url, headers=create_headers())
        source_code = req.content
        soup = BeautifulSoup(source_code, 'lxml')
        ips = soup.find_all('tr')

        # Start at 1 to skip the table's header row.
        for x in range(1, len(ips)):
            ip = ips[x]
            tds = ip.find_all("td")
            # Per xicidaili's table layout: tds[1] is the host, tds[2] the
            # port, and tds[5] the protocol (HTTP/HTTPS).
            proxy_host = "{0}://".format(
                tds[5].contents[0]
            ) + tds[1].contents[0] + ":" + tds[2].contents[0]
            proxy_temp = {tds[5].contents[0]: proxy_host}
            proxys_src.append(proxy_temp)
            if x >= num:
                break
    except Exception as e:
        print("spider_proxyip exception:")
        print(e)
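
The scraped entries can be passed straight to requests via its proxies parameter. One caveat, shown in the sketch below: requests matches proxies by the lowercase URL scheme, while xicidaili reports the protocol in uppercase, so the keys should be normalized first (the target URL here is a placeholder):

spider_proxyip(num=5)
if proxys_src:
    # requests expects lowercase scheme keys such as 'http'/'https'.
    proxy = {k.lower(): v for k, v in proxys_src[0].items()}
    response = requests.get('http://example.com', proxies=proxy, timeout=10)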
def get_info_spider(url, area, city, houses_type):
    """
    获取小区具体信息的spider
    :param url:
    :param area:
    :param city:
    :return:
    """
    headers = create_headers()
    try:
        BaseSpider.random_delay()
        response = requests.get(url, timeout=10, headers=headers)
    except Exception as e:
        logger.error("Have a Error {}".format(repr(e)))
        return

    html = response.text
    soup = BeautifulSoup(html, "lxml")
    try:
        xiaoqu_title = soup.find('h1', class_='main')['title']
        xiaoquinfo = soup.find_all('div', class_='xiaoquInfoItem')
        building_type = xiaoquinfo[0].text
        property_expenses = xiaoquinfo[1].text
        property_company = xiaoquinfo[2].text
        developer = xiaoquinfo[3].text
        total_number_of_buildings = xiaoquinfo[4].text
        total_number_of_houses = xiaoquinfo[5].text
        nearby_stores = xiaoquinfo[6].text
    except Exception as e:
        logger.error(repr(e))
        return
    if xiaoqu_title:  # Once the xiaoqu name is found, clean the other fields
        building_type = str(building_type).strip().replace("\n", ",")
        property_expenses = str(property_expenses).strip().replace(
            "\n\n", ",").replace(" ", "")
        property_company = str(property_company).strip().replace("\n", ",")
        developer = str(developer).strip().replace("\n", ",")
        total_number_of_buildings = str(
            total_number_of_buildings).strip().replace("\n", ",")
        total_number_of_houses = str(total_number_of_houses).strip().replace(
            "\n", ",")
        nearby_stores = str(nearby_stores).strip().replace("\n\n", ",")
        # Get latitude and longitude for the address
        city = get_ch_city(city)
        area = get_ch_area(area, houses_type)
        if city and area:
            address = city[0] + area[0] + xiaoqu_title
            lat, lng = getlnglat(address, city[0])

            # Get a MySQL connection from the pool
            coon = POOL.connection()
            cur = coon.cursor()
            sql = """select * from xiaoqu_detail_ke where xiaoqu_title=%s"""
            cur.execute(sql, (xiaoqu_title,))
            result = cur.fetchone()
            if result:
                xiaoqu_id = result[0]
            else:
                try:
                    cur.execute(
                        """
                        insert into xiaoqu_detail_ke(
                        xiaoqu_title,
                        building_type,
                        property_expenses,
                        property_company,
                        developer,
                        total_number_of_buildings,
                        total_number_of_houses,
                        nearby_stores,
                        lat,
                        lng) 
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """, [
                            xiaoqu_title,
                            ('', building_type)[bool(building_type)],
                            ('', property_expenses)[bool(property_expenses)],
                            ('', property_company)[bool(property_company)],
                            ('', developer)[bool(developer)],
                            ('', total_number_of_buildings
                             )[bool(total_number_of_buildings)],
                            ('', total_number_of_houses
                             )[bool(total_number_of_houses)],
                            ('', nearby_stores)[bool(nearby_stores)],
                            ('', lat)[bool(lat)],
                            ('', lng)[bool(lng)],
                        ])
                    coon.commit()
                except Exception as e:
                    logger.error(repr(e))
                    coon.rollback()
                    cur.close()
                    coon.close()
                    return
                else:
                    cur.execute(sql, (xiaoqu_title,))
                    result = cur.fetchone()
                    xiaoqu_id = result[0]
            cur.close()
            coon.close()
            return xiaoqu_id
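
get_info_spider resolves the row id with a select-then-insert round trip, which is fine for a single crawler process. If xiaoqu_title carried a UNIQUE index (the schema is not shown, so this is an assumption), MySQL's INSERT ... ON DUPLICATE KEY UPDATE could fold the two steps together. A minimal sketch of that variant, with only a few columns for brevity:

# Sketch only: assumes a UNIQUE index on xiaoqu_title, which the
# original schema is not shown to have.
cur.execute(
    """
    insert into xiaoqu_detail_ke(xiaoqu_title, lat, lng)
    values (%s, %s, %s)
    on duplicate key update lat = values(lat), lng = values(lng)
    """, (xiaoqu_title, lat, lng))
coon.commit()
cur.execute("select * from xiaoqu_detail_ke where xiaoqu_title=%s",
            (xiaoqu_title,))
xiaoqu_id = cur.fetchone()[0]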