Example #1
    def crawl_one(cls, task):
        """

        **中文文档**

        抓取一个Url页面。
        """
        url = task.target_url
        cls.logger.info("crawl %s, %s left ..." % (url, task.left_counter))

        try:
            html = spider.get_html(url)
        except Exception as e:
            # the trailing 1 appears to be an indent level for this
            # project's custom logger, not a standard-logging argument
            cls.logger.info("http error: %s" % e, 1)
            return

        try:
            data = task.fetch_data(html)
            # persist the parsed data as JSON or as a pickle blob,
            # depending on the task class's use_json flag
            if task.__class__.use_json:
                cls.objects(_id=task._id).update(json=json.dumps(data),
                                                 status=1)
            else:
                cls.objects(_id=task._id).update(pickle=pickle.dumps(data),
                                                 status=1)
            cls.logger.info("success! data: %s" % data, 1)
        except Exception as e:
            cls.logger.info("parse error: %s" % e, 1)
            return
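
crawl_one expects each task to expose a target_url, a class-level use_json flag, and a fetch_data(html) parser. A minimal sketch of such a task, assuming only that interface (the class name, fields, and parsed output here are illustrative, not crawlib's actual API):

    from bs4 import BeautifulSoup

    class TitleTask:
        use_json = True  # crawl_one would store the parsed dict as JSON

        def __init__(self, _id, target_url, left_counter=0):
            self._id = _id
            self.target_url = target_url
            self.left_counter = left_counter

        def fetch_data(self, html):
            # parse the raw html into the data this task cares about
            soup = BeautifulSoup(html, "html.parser")
            return {"title": soup.title.string if soup.title else None}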
Example #2
    def get_html(self):
        """

        **中文文档**

        从Url上获得Html。默认使用自动检测文本编码的crawlib中的
        spider.get_html(url)方法。不过有些网站需要用到cookie登陆。可以通过重写
        该方法来实现。
        """
        return spider.get_html(self.target_url)
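
The docstring invites overriding get_html for sites that sit behind a cookie login. A minimal sketch of such an override using the requests library (the cookie name and value are made-up placeholders, and the standalone class stands in for the task class above):

    import requests

    class LoggedInTask:
        def __init__(self, target_url):
            self.target_url = target_url

        def get_html(self):
            # reuse a session cookie obtained from a prior login;
            # "sessionid" and its value here are illustrative only
            session = requests.Session()
            session.cookies.set("sessionid", "YOUR_SESSION_COOKIE")
            response = session.get(self.target_url)
            response.raise_for_status()
            return response.text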
Example #3
    def get_testdata():
        """

        **中文文档**

        下载测试数据。
        """
        for page, state, county, zipcode, street in testdata:
            url = urlencoder.browse_home_listpage_url(state, county, zipcode,
                                                      street)
            filepath = Path("testdata", "%s.html" % page)
            # only download pages that are not already cached on disk
            if not filepath.exists():
                html = spider.get_html(url, encoding="utf-8")
                textfile.write(html, filepath.abspath)

        for href in zillow_house_url_list:
            url = urlencoder.url_join(href)
            # the house id is the second-to-last segment of the href path
            zid = href.split("/")[-2]
            filepath = Path("testdata", "%s.html" % zid)
            if not filepath.exists():
                html = spider.get_html(url, encoding="utf-8")
                textfile.write(html, filepath.abspath)
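
Both loops follow the same download-once, cache-on-disk pattern. A generic standard-library helper capturing that pattern (the helper name is illustrative, not part of this project):

    from pathlib import Path
    from urllib.request import urlopen

    def fetch_cached(url, filepath, encoding="utf-8"):
        # download url to filepath unless a cached copy already exists,
        # then return the cached text either way
        filepath = Path(filepath)
        if not filepath.exists():
            filepath.parent.mkdir(parents=True, exist_ok=True)
            html = urlopen(url).read().decode(encoding)
            filepath.write_text(html, encoding=encoding)
        return filepath.read_text(encoding=encoding)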
Example #4
    def fill_state():
        """Put 51 states as entry points from
        http://www.cvs.com/store-locator/cvs-pharmacy-locations.
        """
        data = list()

        url = "http://www.cvs.com/store-locator/cvs-pharmacy-locations"
        html = spider.get_html(url)
        # pass an explicit parser to avoid BeautifulSoup's no-parser warning
        soup = BeautifulSoup(html, "html.parser")
        # each state links out of the <div class="states"> container
        div = soup.find("div", class_="states")
        for a in div.find_all("a"):
            state = State(_id=a["href"], name=a.text.strip(), status=0)
            data.append(state)

        State.smart_insert(data)
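
The extraction above is easy to verify offline against a small inline document (this HTML snippet is made up to mirror the page's structure):

    from bs4 import BeautifulSoup

    sample = '''
    <div class="states">
      <a href="/store-locator/cvs-pharmacy-locations/Virginia">Virginia</a>
      <a href="/store-locator/cvs-pharmacy-locations/Maryland">Maryland</a>
    </div>
    '''
    soup = BeautifulSoup(sample, "html.parser")
    for a in soup.find("div", class_="states").find_all("a"):
        print(a["href"], a.text.strip())
    # /store-locator/cvs-pharmacy-locations/Virginia Virginia
    # /store-locator/cvs-pharmacy-locations/Maryland Maryland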
Example #5
    def test_Store_fetch_data():
        url = "http://www.cvs.com/store-locator/cvs-pharmacy-address/415+Monroe+Avenue-Alexandria-VA-22301/storeid=1410"
        html = spider.get_html(url)
        data = Store.fetch_data(html)
        js.pprint(data)
Example #6
    def test_City_fetch_data():
        url = "http://www.cvs.com/store-locator/cvs-pharmacy-locations/Virginia/Alexandria"
        html = spider.get_html(url)
        data = City.fetch_data(html)
        js.pprint(data)
Example #7
    def test_State_fetch_data():
        url = "http://www.cvs.com/store-locator/cvs-pharmacy-locations/Virginia"
        html = spider.get_html(url)
        data = State.fetch_data(html)
        js.pprint(data)