Example #1
    def test_extraction(self):

        samples_encoding = 'latin1'
        [(html1, data1), (html2, data2)] = list(iter_samples(
            'scraper_loadstore', html_encoding=samples_encoding))
        sc = Scraper()
        page1 = HtmlPage(body=html1, encoding=samples_encoding)
        sc.train_from_htmlpage(page1, data1)

        page2 = HtmlPage(body=html2, encoding=samples_encoding)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)

        # check still works after serialize/deserialize 
        f = StringIO()
        sc.tofile(f)
        f.seek(0)
        sc = Scraper.fromfile(f)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)
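
The test above exercises the core scrapely workflow: train on one page with known data, scrape a second page, then confirm the scraper still works after a serialize/deserialize round trip. Below is a minimal standalone sketch of the same round trip, outside the test harness; the toy HTML and the 'price' field are made-up placeholders, and real pages need richer markup for reliable extraction.

from io import StringIO

from scrapely import Scraper
from scrapely.htmlpage import HtmlPage

train_html = u'<html><body><p class="price">42.00</p></body></html>'
target_html = u'<html><body><p class="price">13.37</p></body></html>'

scraper = Scraper()
# Learn a template from the training page and the value we expect to find.
scraper.train_from_htmlpage(HtmlPage(body=train_html), {'price': '42.00'})

# Persist the trained templates and restore them into a fresh Scraper.
buf = StringIO()
scraper.tofile(buf)
buf.seek(0)
restored = Scraper.fromfile(buf)

print(restored.scrape_page(HtmlPage(body=target_html)))
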
Example #2
def main():
    if len(sys.argv) < len(CLI_ARGS)+1:
        print "Usage:", sys.argv[0], " ".join(CLI_ARGS)
        exit()
    try:
        with open(sys.argv[1], 'r') as f:
            data_to_match = sys.argv[2]
            body = f.read()
            scraper = Scraper()
            from scrapely.template import FragmentNotFound
            try:
                decoded_body = univ_encode(body)
                scraper.train_from_htmlpage(HtmlPage(body=decoded_body), {'score': data_to_match})
                print 0
            except FragmentNotFound:
                print -1
                return
    except IOError:
        print -2
        return
Example #3
class HTMLParser(BaseParser):
    '''
    A parser that parses HTML, including HTML embedded in JSON responses.
    '''
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.scrapely_parser = None
        for key, value in kwargs.items():
            setattr(self, key, value)

    def _prepare_data(self, source):
        json_key = source.json_key
        data = source.data.decode('utf8')
        if json_key:  # if the data is json, pull the html payload out of it
            json_raw = json.loads(data)
            if isinstance(json_key, str) and json_key in json_raw:
                data = json_raw[json_key]
            elif hasattr(json_key, '__iter__') and json_key[0] in json_raw:
                # json_key may be a sequence of keys pointing into nested json
                data = reduce(dict.get, json_key, json_raw)
            else:
                return False
        try:  # Create an HTML object from the returned text.
            data = lxhtml.fromstring(data)
        except ValueError:  # This happens when xml is declared in html.
            data = lxhtml.fromstring('\n'.join(data.split('\n')[1:]))
        except TypeError:
            print(data)
            print('Something weird has been returned by the server.')
        data.make_links_absolute(self.domain)
        return data

    def _get_selector(self, model):
        # assert len(model.selector) == 1, "Only one selector can be used."
        if model.selector:
            if type(model.selector) in (CSSSelector, XPath):
                return model.selector
            else:
                try:
                    return CSSSelector(model.selector[0])
                except SelectorSyntaxError:
                    return XPath(model.selector[0])
                except Exception:
                    raise Exception('Not a valid css or xpath selector',
                                    model.selector)
        return None

    def _apply_selector(self, selector, data):
        if selector:
            return selector(data)
        else:
            return (data, )

    def _extract(self, html, template):
        # We have normal html
        if not template.js_regex:
            if html is not None:
                extracted = self._apply_selector(template.selector, html)
            else:
                extracted = []
        # We want to extract a json_variable from the server
        else:
            regex = re.compile(template.js_regex)
            extracted = []
            # Find all the scripts that match the regex.
            scripts = (regex.findall(s.text_content())[0]
                       for s in html.cssselect('script')
                       if regex.search(s.text_content()))

            # Set selected to the scripts
            for script in scripts:
                extracted.extend(json.loads(script))
        return extracted

    def _source_from_object(self, objct, source):
        # TODO fix that the source object can determine for itself where data
        # or params should be placed in the object.
        new_source = objct.source._replicate()
        attrs = {
            attr.name: attr.value
            for attr in objct.attrs.values() if attr.name != 'url'
        }

        if not getattr(new_source, 'url', None):
            url = objct.attrs.get('url')

            if url and not isinstance(url, list):
                new_source.url = self.parent._apply_src_template(
                    source, url.value)
            else:
                new_source.url = self.parent._apply_src_template(
                    source, source.url)

        if new_source.copy_attrs:
            new_source = self._copy_attrs(objct, new_source)

        if new_source.parent:
            new_source.attrs['_parent'] = objct.attrs['url']._replicate()

        if new_source.method == 'post':
            new_source.data = {**new_source.data, **attrs}  # noqa
        else:
            new_source.params = attrs

        self.parent._add_source(new_source)

    def _fallback(self, template, html, source):
        if not self.scrapely_parser:
            self.scrapely_parser = Scraper()

        html = HtmlPage(body=html)  # scrapely's HtmlPage (from scrapely.htmlpage)
        # Train from a previously stored object for this source, if one exists.
        db_objct = self.db.read(source.url, template)
        if db_objct:
            data = db_objct.attrs_to_dict()

            self.scrapely_parser.train_from_htmlpage(html, data)
            attr_dicts = self.scrapely_parser.scrape_page(html)

            for attr_dict in attr_dicts:
                objct = template._replicate(name=template.name, url=source.url)
                # Add the parsed values.
                objct.attrs_from_dict(attr_dict)
                yield objct
        return []

    def _convert_to_element(self, parsed):
        # Wrap plain strings returned by a selector into <p> elements.
        elements = []
        for p in parsed:
            if not isinstance(p, lxhtml.HtmlElement):
                elem = lxhtml.Element('p')
                elem.text = p
                elements.append(elem)
        return elements

    @add_other_doc(BaseParser.modify_text)
    def sel_text(self, elements, all_text=True, **kwargs):  # noqa
        '''
        Select all text for a given selector.
        '''
        if all_text:
            text = [el.text_content() for el in elements]
        else:
            text = [el.text for el in elements]
        return self._sel_text(text, **kwargs)

    def sel_table(self, elements, columns: int = 2, offset: int = 0):
        '''
        Parses a table into a dictionary, pairing each key cell with the
        value cell that immediately follows it.
        Works best when the input is a td selector.
        Specify the number of columns with the columns parameter.
        example:
            parse a 2x2 table
            {'func': sel_table,
            'params': {
                'selector': CSSSelector('table td'),
                'columns': 2,
                'offset': 0,
                }
            }
            leads to:
            sel_table(html=lxml.etree, selector=CSSSelector('table td'),
                    columns=2, offset=0)
        A standalone sketch of this pairing appears after the class.
        '''
        keys = [el.text for el in elements[offset::columns]]
        values = [el.text for el in elements[offset + 1::columns]]
        return dict(zip(keys, values))

    def sel_row(self,
                elements,
                row_selector=None,
                value: str = '',
                attr=None,
                index=None):
        # Keep only the rows whose text contains the given value.
        rows = [row for row in elements if value in row.text_content()]
        # row_selector, if given, is applied within each matching row.
        if attr:
            selected = [sel for row in rows for sel in self.sel_attr(
                self._apply_selector(row_selector, row), attr=attr)]
        else:
            selected = [sel for row in rows for sel in self.sel_text(
                self._apply_selector(row_selector, row))]
        return self._value(selected, index)

    def sel_attr(self, elements, attr: str = '', **kwargs):
        '''
        Extract an attribute of an HTML element. Will return
        a list of attributes if multiple tags match the
        selector.

        The **kwargs are passed through to the BaseParser.modify_text
        method.
        '''

        attrs = (el.attrib.get(attr) for el in elements)
        return self._sel_text(attrs, **kwargs)

    def sel_url(self, elements, index: int = None, **kwargs):
        return self.sel_attr(elements, attr='href', index=index, **kwargs)

    def sel_date(self,
                 elements,
                 fmt: str = '%Y%m%d',
                 attr: str = None,
                 index: int = None):
        '''
        Returns a python datetime object parsed with the given strptime format.
        '''
        if attr:
            date = self.sel_attr(elements, attr=attr, index=index)
        else:
            date = self.sel_text(elements, index=index)
        if date:
            return datetime.strptime(date, fmt)

    def sel_exists(self, elements, key: str = '', index: int = None):
        '''
        Return True if a keyword is in the selector text.
        '''
        text = self.sel_text(elements)
        if text:
            if key in text:
                return True
            return False

    def sel_raw_html(self, elements):
        # Serialize each selected element back to an html string.
        return [lxhtml.tostring(el) for el in elements]

    def sel_json(self, obj, selector, key=''):
        return obj.get(key)

    def sel_js_array(self, elements, var_name='', var_type=None):
        var_regex = (r'var\s*' + var_name +
                     r'\s*=\s*(?:new Array\(|\[)(.*)(?:\)|\]);')
        array_string = self.sel_text(elements, regex=var_regex)
        if array_string:
            if var_type:
                return list(map(var_type, array_string.split(',')))
            return array_string.split(',')

    def fill_form(self, elements, fields={}, attrs=[]):
        for form in elements:
            data = {**dict(form.form_values()), **fields}
            source = Source(url=form.action,
                            method=form.method,
                            duplicate=True,
                            attrs=attrs)
            if source.method == 'GET':
                source.params = data
            else:
                source.data = data
            self._add_source(source)
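
The sel_table helper above pairs alternating cells as keys and values. The following is a minimal standalone sketch of that pairing, independent of the class and assuming lxml and the cssselect package are installed; the two-row table is made up for illustration.

from lxml import html as lxhtml
from lxml.cssselect import CSSSelector

doc = lxhtml.fromstring(
    '<table>'
    '<tr><td>Name</td><td>Widget</td></tr>'
    '<tr><td>Price</td><td>9.99</td></tr>'
    '</table>')
cells = CSSSelector('table td')(doc)

# With two columns per row, even cells are keys and odd cells are values.
columns, offset = 2, 0
keys = [el.text for el in cells[offset::columns]]
values = [el.text for el in cells[offset + 1::columns]]
print(dict(zip(keys, values)))  # {'Name': 'Widget', 'Price': '9.99'}
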
Example #4
class TrendingSpider(Spider):
    """Spider that monitors a scrapely-trained 'score' field across a site's pages."""
    name = "trending_monitor"
    start_urls = []
    PRINT_STATS_EVERY_X_CRAWLED_PAGES = 100
    links_rule = None
    urls_seen = set()
    aborted = False
    crawled_all_pages = 0
    score_field_text_negative_matches = []
    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True, meta={'start_url': url, 'metadepth': 0})

    # rules = (
        # Rule(SgmlLinkExtractor(allow=r'.+', deny=(r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|microsoft\.com|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)')), follow=False, callback='parse_item'),
    # )

    def __init__(self, db_path, pid):
        print "\n===============================" * 2
        print "Starting TrendingSpider... FOR PID=", pid
        print "\n===============================" * 2
        self.project_id = int(pid)
        self.db_path = db_path
        self.fetch_project_data()
        if self.aborted:
            return
        print "Loaded", len(self.start_urls), "starting urls"
        self.start_time = time()
        self.crawled_pages = 0
        # This has to be set after we run fetch_project_data()
        self.links_rule = Rule(
            SgmlLinkExtractor(
                allow='.+',
                deny=(r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)')
            ),
            follow=False,
            callback='parse_item'
        )
        self.links_rule_targeted = Rule(
            SgmlLinkExtractor(
                allow=self.allow_regexp,
                deny=(r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)')
            ),
            follow=False,
            callback='parse_item'
        )
        super(TrendingSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self):
        print "Closing spider, crawled", self.crawled_pages
        if self.db is not None:
            self.db.commit()

    def done(self):
        return self.urls_limit != 0 and (self.crawled_pages > self.urls_limit or self.crawled_all_pages > (self.urls_limit * 10))

    def parse(self, response):
        self.crawled_all_pages += 1
        # This condition is here because even if we stopped adding new requests, we might still end up with more
        # requests done in total than self.urls_limit.
        # Why? Because we stop when we reach self.urls_limit in terms of _crawled_ urls and not in terms of URLs
        # added to the queue. This ensures we always crawl _at least_ self.urls_limit URLs, but in return
        # we will most likely crawl more than self.urls_limit, because we will likely add new URLs before some
        # URLs in the queue (the queue having already reached the limit) have been fetched.
        if self.done():
            return
        if (self.crawled_pages % self.PRINT_STATS_EVERY_X_CRAWLED_PAGES) == 0:
            delta = time() - self.start_time
            print "Current crawl speed:", self.crawled_pages, "urls crawled,", delta, "seconds,", self.crawled_pages / delta, "pages/second"
        if self.links_rule_targeted.link_extractor.matches(response.url):
            print "page targeted", response.url
            self.crawled_pages += 1
            html_p = htmlpage_from_response(response)
            scraped_result = self.scraper.scrape_page(html_p)
            score = scraped_result[0]['score'][0]
            if self.score_field_text_negative_matches:
                for to_strip_off in self.score_field_text_negative_matches:
                    score = score.replace(to_strip_off, '')
            print "\n===============================" * 2
            print "score=", score
            print "\n===============================" * 2
            item = (
                response.url,
                score,
                int(time())
            )
            self.save_to_db(item)
        if self.done():  # wasting a little bit of resources here because of ">" instead of ">="
            return  # We do not scrape the links this time
        unique_new_links = set(
            [
                l for l in self.links_rule.link_extractor.extract_links(response) 
                if len(l.url) <= 255 and TrendingSpider.extract_domain(l.url) == self.our_domain
            ]) - self.urls_seen

        print "Got", len(unique_new_links), "new links"
        self.urls_seen |= unique_new_links
        return [Request(link.url) for link in unique_new_links]

    def save_to_db(self, item):
        self.db.execute('INSERT INTO result(TIMESTAMP, SCORE, PAGE, SEARCH_ID) VALUES(?, ?, ?, ?)',
            (
                item[2],
                item[1],
                item[0],
                self.project_id
            )
        )
        self.db.commit()

    def init_db(self):
        import sqlite3
        self.db = sqlite3.connect(self.db_path)

    def abort(self):
        sys.stderr.write("\n===============================" * 2)
        sys.stderr.write("\nSomething went wrong, aborting.")
        sys.stderr.write("\n===============================" * 2)
        self.start_urls = []
        self.aborted = True

    def fetch_project_data(self):
        self.init_db()
        # Fetch data from DB
        c = self.db.execute('SELECT * FROM search WHERE id=?', (str(self.project_id),))
        d = c.fetchone()
        if d is None:
            perr("No project found in DB")
            return self.abort()
        data_to_match = {'score': d[1]}
        body = d[2]
        url = d[3]
        self.our_domain = TrendingSpider.extract_domain(url)
        self.start_urls = [url]  # This is one of the improvements we could implement
        from scrapely.template import FragmentNotFound
        try:
            self.setup_scraper(body, url, data_to_match)
        except FragmentNotFound:
            perr("Unable to learn from data")
            # We were not able to learn, cancel the crawl by having no start urls
            return self.abort()
        self.allow_regexp = d[5]
        self.urls_limit = int(d[6])
        if d[7] != '' and d[7] is not None:
            self.score_field_text_negative_matches = d[7].split(d[8])
        print "urls_limit=", self.urls_limit

    def setup_scraper(self, body, url, data_to_scrape):
        self.scraper = Scraper()
        decoded_body = univ_encode(body)
        self.scraper.train_from_htmlpage(HtmlPage(url=url, body=decoded_body), data_to_scrape)
    
    @staticmethod
    def extract_domain(url):
        try:
            url = url[url.index("//")+2:] # getting rid of protocol://
        except ValueError:
            # There was no protocol specified
            pass
        try:
            url = url[:url.index("/")] # getting rid of everything after the first "/"
        except ValueError:
            # Maybe it was a domain-only url, with no "/"
            pass
        return url
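
extract_domain above trims the scheme and path by hand; for comparison, here is a short sketch, not used by the spider, of the same host extraction with the standard library's urlparse.

# A comparison sketch: host extraction via urlparse instead of manual slicing.
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse       # Python 2

def extract_domain_stdlib(url):
    # urlparse only fills netloc when the url has a scheme or starts with
    # "//", so prepend "//" for scheme-less inputs like "example.com/page".
    if '://' not in url and not url.startswith('//'):
        url = '//' + url
    return urlparse(url).netloc

assert extract_domain_stdlib('http://example.com/a/b') == 'example.com'
assert extract_domain_stdlib('example.com/a/b') == 'example.com'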