Example #1
File: spider.py  Project: amcclosky/slybot
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)
        
        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), 
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates'] if t.get('page_type', 'item') == 'form'
        ]
        
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)
        
        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }
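
For orientation, here is a minimal sketch (not taken from the project) of the spec and item_schemas arguments this constructor reads. Only keys actually accessed above are shown; every concrete value, including the template url field assumed to be needed by dict_to_page, is hypothetical.

# Hypothetical inputs; the keys mirror those read in __init__ above,
# the concrete values are illustrative only.
spec = {
    'scrapes': 'default',                 # name of the default item schema
    'start_urls': ['http://example.com/'],
    'templates': [
        {
            'page_type': 'item',          # defaults to 'item' when missing
            'scrapes': 'default',
            'url': 'http://example.com/product/1',   # assumed to be required by dict_to_page
            'annotated_body': '<html>...</html>',
            'extractors': [],
        },
        {
            'page_type': 'links',         # feeds the _links_ibl_extractor
            'url': 'http://example.com/category/1',
            'annotated_body': '<html>...</html>',
        },
    ],
}
item_schemas = {'default': {'id': 'default', 'properties': ()}}   # shape assumed from the '_links' descriptor above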
Example #2
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)

        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({
            'id': "_links",
            'properties': ()
        })
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(
            self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(
                self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(
                schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(
                    schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"),
                                  meta=rdata,
                                  callback=self.parse_login_page)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                request = Request(url=rdata.pop("form_url"),
                                  meta=rdata,
                                  callback=self.parse_form_page)
                self.form_requests.append(request)
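
This variant additionally consumes spec['init_requests']. A sketch of the entry shapes the loop above expects; only the keys it reads (type, plus the popped loginurl / form_url, with the rest of each dict travelling as request meta) come from the code, the values are placeholders.

# Illustrative init_requests entries; only the keys consumed above are real.
spec_init_requests = [
    {
        'type': 'login',
        'loginurl': 'http://example.com/login',   # popped and used as the request URL
        'username': 'user',                       # assumed extra fields, carried in request.meta
        'password': 'secret',
    },
    {
        'type': 'form',
        'form_url': 'http://example.com/search',  # popped and used as the request URL
    },
]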
Example #3
File: spider.py  Project: amcclosky/slybot
class IblSpider(BaseSpider):

    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)
        
        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), 
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates'] if t.get('page_type', 'item') == 'form'
        ]
        
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)
        
        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

    def _get_allowed_domains(self, templates):
        urls = [x.url for x in templates]
        urls += self.start_urls
        return [x[1] for x in iter_unique_scheme_netloc(urls)]

    def _get_form_requests(self, templates):
        reqs = []
        # TODO: filter unique schema netlocs?
        for t in templates:
            # assume all templates are html and unicode
            response = HtmlResponse(t.url, encoding='utf-8',
                                    body=t.body, headers=t.headers)
            request = FormRequest.from_response(response,
                                                formname='SLYBOT-FORM',
                                                callback=self.parse,
                                                dont_filter=True)
            reqs.append(request)
        return reqs

    def _get_item_requests(self, templates):
        reqs = []
        urls = [x.url for x in templates]
        for scheme, netloc in iter_unique_scheme_netloc(urls):
            r = Request("%s://%s/" % (scheme, netloc), callback=self.parse, \
                dont_filter=True)
            reqs.append(r)
        return reqs
    
    def _requests_to_follow(self, htmlpage):
        requests = []
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, region, encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        requests.append(request)
        else:
            requests = self._request_to_follow_from_region(htmlpage)

        return requests
            
    def _request_to_follow_from_region(self, htmlregion):
        requests = []
        seen = set()

        for link in self.link_extractor.links_to_follow(htmlregion):
            url = link.url
            if self.url_filterf(link):
                # filter out duplicate urls, later we should handle link text
                if url in seen:
                    continue
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                requests.append(request)
        return requests

    def start_requests(self):
        if self.start_urls:
            return [Request(r, callback=self.parse, dont_filter=True) \
                for r in self.start_urls]
        if self._fpages:
            return self._get_form_requests(self._fpages)
        return self._get_item_requests(self._ipages)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        if isinstance(response, HtmlResponse):
            return self.handle_html(response)
        else:
            content_type = response.headers.get('Content-Type')
            self.log("Ignoring page with content-type=%r: %s" % (content_type, \
                response.url), level=log.DEBUG)

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        requests_to_follow = []
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, \
                        link_region, encoding=htmlpage.encoding)
                requests_to_follow.extend(self._requests_to_follow(htmlregion))
        else:
            requests_to_follow = self._requests_to_follow(htmlpage)
        return requests_to_follow

    def handle_html(self, response):
        htmlpage = HtmlPage(response.url, response.headers, \
                    response.body_as_unicode(), encoding=response.encoding)
        items, link_regions = self.extract_items(htmlpage)
        requests_to_follow = self._process_link_regions(htmlpage, link_regions)
        return requests_to_follow + items
        
    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        items = []
        link_regions = []
        for item_cls_name, info in self.itemcls_info.iteritems():
            item_descriptor = info['descriptor']
            extractor = info['extractor']
            extracted, _link_regions = self._do_extract_items_from(
                    htmlpage,
                    item_descriptor,
                    extractor,
                    item_cls_name,
            )
            items.extend(extracted)
            link_regions.extend(_link_regions)
        return items, link_regions

    def _do_extract_items_from(self, htmlpage, item_descriptor, extractor, item_cls_name):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        processed_data = _process_extracted_data(extracted_data, item_descriptor, htmlpage)
        items = []
        item_cls = self.itemcls_info[item_cls_name]['class']
        for processed_attributes in processed_data:
            item = item_cls(processed_attributes)
            item['url'] = htmlpage.url
            item['_type'] = item_cls_name
            item['_template'] = template.id
            items.append(item)

        return items, link_regions

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif patterns:
            pattern = patterns[0] if len(patterns) == 1 else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        exclude_patterns = spec.get('exclude_patterns')
        if exclude_patterns:
            pattern = exclude_patterns[0] if len(exclude_patterns) == 1 else "(?:%s)" % '|'.join(exclude_patterns)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) and url_filterf(x)
        else:
            self.url_filterf = url_filterf
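
The build_url_filter method composes three concerns: an optional follow_patterns whitelist, an optional exclude_patterns blacklist that always takes precedence, and rel=nofollow handling. Below is a self-contained sketch of the same composition, with a stand-in Link tuple in place of the objects returned by LinkExtractor (assumed here to expose url, text and nofollow, which is all the filter touches).

import re
from collections import namedtuple

# Stand-in for the link objects produced by LinkExtractor.
Link = namedtuple('Link', 'url text nofollow')

def make_url_filter(follow_patterns=None, exclude_patterns=None,
                    respect_nofollow=True, links_to_follow='patterns'):
    """Sketch mirroring build_url_filter above, without the spider instance."""
    if links_to_follow == 'none':
        base = lambda link: False
    elif follow_patterns:
        follow_re = re.compile("(?:%s)" % '|'.join(follow_patterns))
        if respect_nofollow:
            base = lambda link: bool(follow_re.search(link.url)) and not link.nofollow
        else:
            base = lambda link: bool(follow_re.search(link.url))
    elif respect_nofollow:
        base = lambda link: not link.nofollow
    else:
        base = bool
    if exclude_patterns:
        exclude_re = re.compile("(?:%s)" % '|'.join(exclude_patterns))
        return lambda link: not exclude_re.search(link.url) and base(link)
    return base

url_filterf = make_url_filter(follow_patterns=[r'/product/'],
                              exclude_patterns=[r'\?sort='])
print(url_filterf(Link('http://example.com/product/1', 'p1', False)))          # True
print(url_filterf(Link('http://example.com/product/1?sort=asc', 'p1', False))) # False: exclude wins
print(url_filterf(Link('http://example.com/product/2', 'p2', True)))           # False: nofollow respected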
Example #4
File: spider.py  Project: canozden/slybot
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec["scrapes"]
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, log.WARNING)

        self._item_template_pages = sorted(
            (
                [t.get("scrapes", default_item), dict_to_page(t, "annotated_body"), t.get("extractors", [])]
                for t in spec["templates"]
                if t.get("page_type", "item") == "item"
            ),
            key=lambda pair: pair[0],
        )

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type") == "links"]
        _links_item_descriptor = create_slybot_item_descriptor({"id": "_links", "properties": ()})
        self._links_ibl_extractor = (
            InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages])
            if _links_pages
            else None
        )

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type", "item") == "form"
        ]

        self.start_urls = self.start_urls or spec.get("start_urls")
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                "class": item_cls,
                "descriptor": item_descriptor,
                "extractor": extractor,
            }

        self._itemversion_cache = {}
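
This variant matches Example #1 except for the trailing _itemversion_cache, whose use appears in Example #5. All variants group the pre-sorted template triplets with itertools.groupby; the short sketch below, with made-up data, shows why the earlier sorted(..., key=lambda pair: pair[0]) is required: groupby only merges adjacent keys.

import itertools
import operator

# Hypothetical (item_class_name, page, extractors) triplets, already sorted by
# name exactly as self._item_template_pages is; without the sort, the two
# 'product' templates would land in separate groups.
triplets = [
    ('product', '<page A>', []),
    ('product', '<page B>', ['strip-whitespace']),
    ('review',  '<page C>', []),
]
for name, group in itertools.groupby(triplets, operator.itemgetter(0)):
    pairs = list(map(operator.itemgetter(1, 2), group))
    print(name, pairs)
# product [('<page A>', []), ('<page B>', ['strip-whitespace'])]
# review [('<page C>', [])]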
Example #5
File: spider.py  Project: canozden/slybot
class IblSpider(BaseSpider):
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec["scrapes"]
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, log.WARNING)

        self._item_template_pages = sorted(
            (
                [t.get("scrapes", default_item), dict_to_page(t, "annotated_body"), t.get("extractors", [])]
                for t in spec["templates"]
                if t.get("page_type", "item") == "item"
            ),
            key=lambda pair: pair[0],
        )

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type") == "links"]
        _links_item_descriptor = create_slybot_item_descriptor({"id": "_links", "properties": ()})
        self._links_ibl_extractor = (
            InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages])
            if _links_pages
            else None
        )

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type", "item") == "form"
        ]

        self.start_urls = self.start_urls or spec.get("start_urls")
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                "class": item_cls,
                "descriptor": item_descriptor,
                "extractor": extractor,
            }

        self._itemversion_cache = {}

    def _get_allowed_domains(self, templates):
        urls = [x.url for x in templates]
        urls += self.start_urls
        return [x[1] for x in iter_unique_scheme_netloc(urls)]

    def _get_form_requests(self, templates):
        reqs = []
        # TODO: filter unique schema netlocs?
        for t in templates:
            # assume all templates are html and unicode
            response = HtmlResponse(t.url, encoding="utf-8", body=t.body, headers=t.headers)
            request = FormRequest.from_response(response, formname="SLYBOT-FORM", callback=self.parse, dont_filter=True)
            reqs.append(request)
        return reqs

    def _get_item_requests(self, templates):
        reqs = []
        urls = [x.url for x in templates]
        for scheme, netloc in iter_unique_scheme_netloc(urls):
            r = Request("%s://%s/" % (scheme, netloc), callback=self.parse, dont_filter=True)
            reqs.append(r)
        return reqs

    def _requests_to_follow(self, htmlpage):
        requests = []
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get("_links", [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, region, encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        requests.append(request)
        else:
            requests = self._request_to_follow_from_region(htmlpage)

        return requests

    def _request_to_follow_from_region(self, htmlregion):
        requests = []
        seen = set()

        for link in self.link_extractor.links_to_follow(htmlregion):
            url = link.url
            if self.url_filterf(link):
                # filter out duplicate urls, later we should handle link text
                if url in seen:
                    continue
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta["link_text"] = link.text
                requests.append(request)
        return requests

    def start_requests(self):
        if self.start_urls:
            return [Request(r, callback=self.parse, dont_filter=True) for r in self.start_urls]
        if self._fpages:
            return self._get_form_requests(self._fpages)
        return self._get_item_requests(self._ipages)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        if isinstance(response, HtmlResponse):
            return self.handle_html(response)
        else:
            content_type = response.headers.get("Content-Type")
            self.log("Ignoring page with content-type=%r: %s" % (content_type, response.url), level=log.DEBUG)

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        requests_to_follow = []
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, link_region, encoding=htmlpage.encoding)
                requests_to_follow.extend(self._requests_to_follow(htmlregion))
        else:
            requests_to_follow = self._requests_to_follow(htmlpage)
        return requests_to_follow

    def handle_html(self, response):
        htmlpage = HtmlPage(response.url, response.headers, response.body_as_unicode(), encoding=response.encoding)
        items, link_regions = self.extract_items(htmlpage)
        requests_to_follow = self._process_link_regions(htmlpage, link_regions)
        return requests_to_follow + items

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        items = []
        link_regions = []
        for item_cls_name, info in self.itemcls_info.iteritems():
            item_cls = info["class"]
            item_descriptor = info["descriptor"]
            extractor = info["extractor"]
            extracted, _link_regions = self._do_extract_items_from(
                htmlpage, item_cls, item_descriptor, extractor, item_cls_name
            )
            items.extend(extracted)
            link_regions.extend(_link_regions)
        return items, link_regions

    def _do_extract_items_from(self, htmlpage, item_cls, item_descriptor, extractor, item_cls_name):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        processed_data = _process_extracted_data(extracted_data, item_descriptor, htmlpage)
        items = []
        for processed_attributes in processed_data:
            item = item_cls(processed_attributes)
            item["url"] = htmlpage.url
            item["_type"] = item_cls_name
            item["_template"] = template.id
            if self._check_not_dupe(item_cls, item):
                items.append(item)

        return items, link_regions

    def _check_not_dupe(self, item_cls, item):
        """Checks whether a scrapy item is a dupe, based on version (not vary)
        fields of the item class"""
        if not item_cls.version_fields:
            return True

        version = create_item_version(item_cls, item)
        if version in self._itemversion_cache:
            old_url = self._itemversion_cache[version]
            self.log(
                "Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], old_url), log.WARNING
            )
            return False
        self._itemversion_cache[version] = item["url"]
        return True

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get("respect_nofollow", True)
        patterns = spec.get("follow_patterns")
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif patterns:
            pattern = patterns[0] if len(patterns) == 1 else "(?:%s)" % "|".join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        exclude_patterns = spec.get("exclude_patterns")
        if exclude_patterns:
            pattern = exclude_patterns[0] if len(exclude_patterns) == 1 else "(?:%s)" % "|".join(exclude_patterns)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) and url_filterf(x)
        else:
            self.url_filterf = url_filterf
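
This fork adds duplicate suppression: _check_not_dupe derives a version key from the item class's version_fields via create_item_version and remembers the URL at which each key was first seen. The sketch below reproduces that caching idea; _version_key is only an assumed stand-in for create_item_version, whose real implementation is not part of this listing.

# Stand-in dedup cache keyed on the item's "version" fields.
def _version_key(version_fields, item):
    # assumed equivalent of create_item_version: a hashable key over the fields
    return tuple(item.get(f) for f in version_fields)

class DupeFilter(object):
    def __init__(self, version_fields):
        self.version_fields = version_fields
        self._itemversion_cache = {}

    def check_not_dupe(self, item):
        if not self.version_fields:
            return True
        version = _version_key(self.version_fields, item)
        if version in self._itemversion_cache:
            return False        # first seen at self._itemversion_cache[version]
        self._itemversion_cache[version] = item['url']
        return True

df = DupeFilter(version_fields=['name', 'price'])
print(df.check_not_dupe({'url': 'http://example.com/a', 'name': 'x', 'price': '9'}))  # True
print(df.check_not_dupe({'url': 'http://example.com/b', 'name': 'x', 'price': '9'}))  # False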
Example #6
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)
        
        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), 
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == '_links']
        _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates'] if t.get('page_type', 'item') == 'form'
        ]
        
        self._start_urls = spec.get('start_urls')

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)
        
        # make a filter for links 
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if patterns:
            pattern = patterns[0] if len(patterns) == 1 else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        exclude_patterns = spec.get('exclude_patterns')
        if exclude_patterns:
            pattern = exclude_patterns[0] if len(exclude_patterns) == 1 else "(?:%s)" % '|'.join(exclude_patterns)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) and url_filterf(x)
        else:
            self.url_filterf = url_filterf

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self._itemversion_cache = {}
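
In this older variant the link filter is assembled inline in __init__ rather than in a build_url_filter method, but the pattern handling is identical: multiple follow or exclude patterns are merged into one alternation before compiling. A brief illustration with made-up patterns:

import re

patterns = [r'/product/\d+', r'/category/\w+']
pattern = patterns[0] if len(patterns) == 1 else "(?:%s)" % '|'.join(patterns)
follow_pattern = re.compile(pattern)

print(pattern)                                                        # (?:/product/\d+|/category/\w+)
print(bool(follow_pattern.search('http://example.com/product/42')))  # True
print(bool(follow_pattern.search('http://example.com/about')))       # False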