Пример #1
0
    def __init__(self, *args, **kwargs):
        """
            Initialise the base scraper and store the html schema of the
            template page.

            Every argument is handed over untouched to BaseScraper.__init__.
            Raises ValueError when template_url is None once the base
            initialiser has run.
        """
        BaseScraper.__init__(self, *args, **kwargs)

        if self.template_url is None:
            scraper_name = self.__class__.__name__
            raise ValueError(
                "%s must have a template_url attribute" % scraper_name)

        # The template is requested once, here, and only its schema is kept.
        self.request_manager = FastRequestManager()
        template_page = self.request_manager.make_request(self.template_url)
        template_markup = template_page.raw_html
        self.template_html_schema = self._get_html_schema(template_markup)
Пример #2
0
 def __init__(self, *args, **kwargs):
     """
         Run the BaseScraper initialiser, then record the html schema of
         the page found at template_url.

         Raises ValueError if template_url is None.
     """
     BaseScraper.__init__(self, *args, **kwargs)

     if self.template_url is None:
         message = "%s must have a template_url attribute" % (
             self.__class__.__name__,)
         raise ValueError(message)

     # Keep the manager on the instance; the template request goes through it.
     self.request_manager = FastRequestManager()
     template_response = self.request_manager.make_request(self.template_url)
     self.template_html_schema = self._get_html_schema(
         template_response.raw_html)
Пример #3
0
class SmartScraper(BaseScraper):
    """
        Scraper that only accepts responses whose html structure is similar
        to the structure of a template page.

        Subclasses have to define ``template_url``. The page behind it is
        requested when the scraper is instantiated and its html schema is
        stored; every validated response is compared against that schema.
    """

    # Url of the template page. Leaving it as None makes __init__ raise.
    template_url = None
    # A response is rejected when its similarity ratio is <= this value.
    ratio = SIMILARITY_RATIO

    def __init__(self, *args, **kwargs):
        """
            Initialise the base scraper and store the template html schema.

            All arguments are forwarded to BaseScraper.__init__.
            Raises ValueError when ``template_url`` is None.
        """

        BaseScraper.__init__(self, *args, **kwargs)

        if self.template_url is None:
            raise ValueError("%s must have a template_url attribute" %
                             self.__class__.__name__)

        self.request_manager = FastRequestManager()
        response = self.request_manager.make_request(self.template_url)
        self.template_html_schema = self._get_html_schema(response.raw_html)

    def _validate(self, response):
        """
            Return a truthy value only when the base validation passes and
            the response is similar enough to the template.
        """

        return BaseScraper._validate(
            self, response) and self._compare_with_template(response)

    def _compare_with_template(self, response):
        """
            Compare the html schema of ``response`` with the template schema.

            Returns True when the similarity ratio is greater than
            ``self.ratio``. Otherwise ``on_cannot_scrape(response)`` is called
            and False is returned.
        """

        if self.debug:
            # Parenthesised single argument: prints the same text with the
            # Python 2 print statement and with the print function.
            print("Evaluating similar html structure of %s" % response.url)

        html_schema = self._get_html_schema(response.raw_html)

        evaluated_ratio = difflib.SequenceMatcher(
            None, html_schema, self.template_html_schema).ratio()

        if evaluated_ratio <= self.ratio:
            self.on_cannot_scrape(response)
            return False

        # An explicit result is required: _validate combines this value with
        # ``and``, so an implicit None would make every response look invalid.
        return True

    def _get_html_schema(self, html):
        """
            Feed ``html`` to a new HtmlSchema and return its get_schema()
            result.
        """

        html_schema = HtmlSchema()
        html_schema.feed(html)
        return html_schema.get_schema()
Пример #4
0
class SmartScraper(BaseScraper):
    """
        Scraper that only parses responses whose html structure is similar
        to the structure of a template page.

        Subclasses have to define ``template_url``. The page behind it is
        requested when the scraper is instantiated and its html schema is
        stored; every validated response is compared against that schema.
    """

    # Url of the template page. Leaving it as None makes __init__ raise.
    template_url = None
    # A response is rejected when its similarity ratio is <= this value.
    ratio = SIMILARITY_RATIO

    def __init__(self, *args, **kwargs):
        """
            Initialise the base scraper and store the template html schema.

            All arguments are forwarded to BaseScraper.__init__.
            Raises ValueError when ``template_url`` is None.
        """

        BaseScraper.__init__(self, *args, **kwargs)

        if self.template_url is None:
            raise ValueError(
                "%s must have a template_url attribute"
                % self.__class__.__name__)

        self.request_manager = FastRequestManager()
        response = self.request_manager.make_request(self.template_url)
        self.template_html_schema = self._get_html_schema(response.raw_html)

    def _validate(self, response):
        """
            Return a truthy value only when the base validation passes and
            the response is similar enough to the template.
        """

        return (BaseScraper._validate(self, response)
                and self._compare_with_template(response))

    def _compare_with_template(self, response):
        """
            Compare the html schema of ``response`` with the template schema.

            Returns True when the similarity ratio is greater than
            ``self.ratio``.
            Raises ScraperCantParseError otherwise.
        """

        if self.debug:
            # Parenthesised single argument: prints the same text with the
            # Python 2 print statement and with the print function.
            print("Evaluating similar html structure of %s" % response.url)

        html_schema = self._get_html_schema(response.raw_html)

        evaluated_ratio = difflib.SequenceMatcher(
            None, html_schema, self.template_html_schema).ratio()

        if evaluated_ratio <= self.ratio:
            raise ScraperCantParseError(
                "The Scraper %s can't parse the html from %s"
                % (self.__class__.__name__, response.url))

        # An explicit result is required: _validate combines this value with
        # ``and``, so an implicit None would make every response look invalid.
        return True

    def _get_html_schema(self, html):
        """
            Feed ``html`` to a new HtmlSchema and return its get_schema()
            result.
        """

        html_schema = HtmlSchema()
        html_schema.feed(html)
        return html_schema.get_schema()
Пример #5
0
    def __init__(self, *args, **kwargs):
        """
            Initialise the base crawler and attach a request manager.

            All arguments are forwarded unchanged to BaseCrawler.__init__.
        """

        BaseCrawler.__init__(self, *args, **kwargs)
        # Assigned after the base initialiser has run; a new
        # FastRequestManager is created for every instance.
        self.request_manager = FastRequestManager()