Example #1
    def __init__(
        self,
        allow=(),
        deny=(),
        allow_domains=(),
        deny_domains=(),
        restrict_xpaths=(),
        tags=("a", "area"),
        attrs=("href",),
        canonicalize=True,
        unique=True,
        process_value=None,
        deny_extensions=None,
        restrict_css=(),
    ):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func, unique=unique, process=process_value)

        super(LxmlLinkExtractor, self).__init__(
            lx,
            allow=allow,
            deny=deny,
            allow_domains=allow_domains,
            deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths,
            restrict_css=restrict_css,
            canonicalize=canonicalize,
            deny_extensions=deny_extensions,
        )
Example #2
 def __init__(
     self,
     allow=(),
     deny=(),
     allow_domains=(),
     deny_domains=(),
     restrict_xpaths=(),
     tags=("a", "area"),
     attrs=("href"),
     canonicalize=True,
     unique=True,
     process_value=None,
     deny_extensions=None,
 ):
     self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
     self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
     self.allow_domains = set(arg_to_iter(allow_domains))
     self.deny_domains = set(arg_to_iter(deny_domains))
     self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
     self.canonicalize = canonicalize
     if deny_extensions is None:
         deny_extensions = IGNORED_EXTENSIONS
     self.deny_extensions = set(["." + e for e in deny_extensions])
     tag_func = lambda x: x in tags
     attr_func = lambda x: x in attrs
     BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func, unique=unique, process_value=process_value)
Example #3
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
                 process_value=None, deny_extensions=None, restrict_css=()):

        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings(record=True):
            lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                unique=unique, process_value=process_value)

        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
            allow_domains=allow_domains, deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
            canonicalize=canonicalize, deny_extensions=deny_extensions)

        # FIXME: was added to fix a RegexLinkExtractor testcase
        self.base_url = None
Example #4
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
                 process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True, restrict_text=()):
        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning, stacklevel=2,
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings():
            warnings.simplefilter('ignore', ScrapyDeprecationWarning)
            lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                                       unique=unique, process_value=process_value, strip=strip,
                                       canonicalized=canonicalize)

        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
                                                allow_domains=allow_domains, deny_domains=deny_domains,
                                                restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
                                                canonicalize=canonicalize, deny_extensions=deny_extensions,
                                                restrict_text=restrict_text)
Example #5
    def __init__(self, allow=(), deny=()):
        """Initialize allow/deny attributes"""
        _re_type = type(re.compile('', 0))

        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) 
                          for x in arg_to_iter(allow)]
        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) 
                         for x in arg_to_iter(deny)]
Example #6
        def wrapper(self, response):
            if not hasattr(self, 'debug'):
                self.debug = False
            if self.debug:
                if not hasattr(self, 'failed_urls'):
                    self.failed_urls = []
                if not hasattr(self, 'missing_fields'):
                    self.missing_fields = {}
                self.crawler.stats.set_value('DEBUG MODE', 'TRUE')

            # iterate over everything the wrapped callback returns (item, request, or iterable)
            for result in arg_to_iter(func(self, response)):
                if isinstance(result, Item):
                    exception = DropItem
                else:
                    exception = CloseSpider

                if response.status >= 400 or any(error in response.body for error in errorstrings):
                    if self.debug:
                        self.crawler.stats.inc_value('DEBUG: failed_urls_count')
                        self.failed_urls.append(response.url)
                        self.crawler.stats.set_value(
                            'DEBUG: failed_urls', self.failed_urls)
                    if exception == DropItem:
                        self.crawler.stats.inc_value('items_dropped_count')
                    if response.status >= 400:
                        raise exception(
                            'Status Code Error: %s\nURL: %s'
                            % (response.status, response.url))
                    else:
                        errors = [error for error in errorstrings if error in response.body]
                        raise exception(
                            'Response Body Error: %s\nURL: %s'
                            % (', '.join(errors), response.url))

                if isinstance(result, Item):
                    if self.debug:
                        job_misses_a_required_field = False
                        for field in arg_to_iter(fields_to_check):
                            if not result.get(field):
                                if field not in self.missing_fields:
                                    self.missing_fields[field] = []
                                self.missing_fields[field].append(response.url)
                                job_misses_a_required_field = True
                        for key in self.missing_fields.keys():
                            self.crawler.stats.set_value('DEBUG: missing_%s_field' % key, self.missing_fields[key])

                        if job_misses_a_required_field:
                            self.crawler.stats.inc_value(
                                'DEBUG: jobs_missing_required_field_count')

                    else:
                        if not result.get('referencenumber'):
                            self.crawler.stats.inc_value('items_dropped_count')
                            raise MissingJobField('referencenumber', response.url)

                yield result
Example #7
    def test_arg_to_iter(self):
        assert hasattr(arg_to_iter(None), '__iter__')
        assert hasattr(arg_to_iter(100), '__iter__')
        assert hasattr(arg_to_iter('lala'), '__iter__')
        assert hasattr(arg_to_iter([1,2,3]), '__iter__')
        assert hasattr(arg_to_iter(l for l in 'abcd'), '__iter__')

        self.assertEqual(list(arg_to_iter(None)), [])
        self.assertEqual(list(arg_to_iter('lala')), ['lala'])
        self.assertEqual(list(arg_to_iter(100)), [100])
        self.assertEqual(list(arg_to_iter(l for l in 'abc')), ['a', 'b', 'c'])
        self.assertEqual(list(arg_to_iter([1,2,3])), [1,2,3])
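
The assertions above pin down the contract of arg_to_iter: None becomes an empty iterable, strings and other scalar values are wrapped in a single-element list, and genuine iterables (lists, generators) pass through unchanged. A minimal sketch consistent with those assertions, not the exact Scrapy implementation (the helper name and the exact set of "single value" types are assumptions), might look like this:

def arg_to_iter_sketch(arg):
    # None -> empty list, matching list(arg_to_iter(None)) == []
    if arg is None:
        return []
    # strings (and dict-like objects) are iterable but are treated as single values
    if hasattr(arg, '__iter__') and not isinstance(arg, (str, bytes, dict)):
        return arg
    # everything else is wrapped in a single-element list
    return [arg]

Under these assumptions, list(arg_to_iter_sketch('lala')) == ['lala'] and list(arg_to_iter_sketch(c for c in 'abc')) == ['a', 'b', 'c'], matching the test above.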
Example #8
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area', 'script', 'link'), attrs=('href', 'src'), canonicalize=True,
                 unique=True, process_value=None, deny_extensions=None):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = SayParserLinkExtractor(tag=tag_func, attr=attr_func,
            unique=unique, process=process_value)

        super(SayLinkExtractor, self).__init__(lx, allow, deny,
            allow_domains, deny_domains, restrict_xpaths, canonicalize,
            deny_extensions)
Example #9
 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), 
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None):
     self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
     self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
     self.allow_domains = set(arg_to_iter(allow_domains))
     self.deny_domains = set(arg_to_iter(deny_domains))
     self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
     self.canonicalize = canonicalize
     tag_func = lambda x: x in tags
     attr_func = lambda x: x in attrs
     BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func, 
         unique=unique, process_value=process_value)
Example #10
 def __call__(self, value, loader_context=None):
     values = arg_to_iter(value)
     if loader_context:
         context = MergeDict(loader_context, self.default_loader_context)
     else:
         context = self.default_loader_context
     wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
     for func in wrapped_funcs:
         next_values = []
         for v in values:
             next_values += arg_to_iter(func(v))
         values = next_values
     return values
Example #11
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True,
                 unique=True, process_value=None, deny_extensions=None, restrict_css=()):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func, unique=unique, process=process_value)
        self.crawledPagesPerSite = {}
        self.maximumPagesPerSite = 10000  # maximum number of pages to crawl per site
        super(CustomLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
            allow_domains=allow_domains, deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
            canonicalize=canonicalize, deny_extensions=deny_extensions)
Example #12
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                 deny_extensions=None):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
            unique=unique, process_value=process_value)
        super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
            allow_domains, deny_domains, restrict_xpaths, canonicalize,
            deny_extensions)

        # FIXME: was added to fix a RegexLinkExtractor testcase
        self.base_url = None
Example #13
    def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
        if attachs:
            msg = MIMEMultipart()
        else:
            msg = MIMENonMultipart(*mimetype.split('/', 1))

        to = list(arg_to_iter(to))
        cc = list(arg_to_iter(cc))

        msg['From'] = self.mailfrom
        msg['To'] = COMMASPACE.join(to)
        msg['Date'] = formatdate(localtime=True)
        msg['Subject'] = subject
        rcpts = to[:]
        if cc:
            rcpts.extend(cc)
            msg['Cc'] = COMMASPACE.join(cc)

        if charset:
            msg.set_charset(charset)

        if attachs:
            msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
            for attach_name, mimetype, f in attachs:
                part = MIMEBase(*mimetype.split('/'))
                part.set_payload(f.read())
                Encoders.encode_base64(part)
                part.add_header('Content-Disposition', 'attachment; filename="%s"' \
                    % attach_name)
                msg.attach(part)
        else:
            msg.set_payload(body)

        if _callback:
            _callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)

        if self.debug:
            logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
                         'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
                         {'mailto': to, 'mailcc': cc, 'mailsubject': subject,
                          'mailattachs': len(attachs)})
            return

        dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
        dfd.addCallbacks(self._sent_ok, self._sent_failed,
            callbackArgs=[to, cc, subject, len(attachs)],
            errbackArgs=[to, cc, subject, len(attachs)])
        reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
        return dfd
Example #14
 def _do_extract_items_from(self, htmlpage, extractor, response=None):
     # Try to predict template to use
     template_cluster, pref_template_id = self._cluster_page(htmlpage)
     extracted, template = extractor.extract(htmlpage, pref_template_id)
     extracted = extracted or []
     link_regions = []
     for ddict in extracted:
         link_regions.extend(arg_to_iter(ddict.pop("_links", [])))
     descriptor = None
     unprocessed = False
     if template is not None and hasattr(template, 'descriptor'):
         descriptor = template.descriptor()
         if hasattr(descriptor, 'name'):
             item_cls_name = descriptor.name
         elif hasattr(descriptor, 'get'):
             item_cls_name = descriptor.get('name',
                                            descriptor.get('display_name'))
         else:
             item_cls_name = ''
     else:
         unprocessed = True
         try:
             descriptor = self.schema_descriptors[template.id]
             item_cls_name = self.template_scrapes[template.id]
         except (AttributeError, KeyError):
             try:
                 descriptor = sorted(self.schema_descriptors.items())[0][1]
                 item_cls_name = sorted(self.template_scrapes.items())[0][1]
             except IndexError:
                 descriptor, item_cls_name = None, None
     item_cls = self.item_classes.get(item_cls_name)
     items = []
     for processed_attributes in extracted:
         if processed_attributes.get('_type') in self.item_classes:
             _type = processed_attributes['_type']
             item = self.item_classes[_type](processed_attributes)
             item['_type'] = item.display_name()
         elif unprocessed:
             item = self._process_attributes(processed_attributes,
                                             descriptor, htmlpage)
             if item_cls:
                 item = item_cls(item)
         elif item_cls:
             item = item_cls(processed_attributes)
         else:
             item = dict(processed_attributes)
         item[u'url'] = htmlpage.url
         item[u'_template'] = str(template.id)
         item.setdefault('_type', item_cls_name)
         if not isinstance(item, SlybotItem):
             default_meta = {'type': 'text', 'required': False,
                             'vary': False}
             item_cls = SlybotItem.create_iblitem_class(
                 {'fields': {k: default_meta for k in item}}
             )
             item = item_cls(**item)
         if self.clustering:
             item['_template_cluster'] = template_cluster
         items.append(item)
     return items, link_regions
Example #15
 def _compose(values, wrapped_funcs):
     for func in wrapped_funcs:
         next_values = []
         for v in values:
             next_values += arg_to_iter(func(v))
         values = next_values
     return values
Example #16
def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            spider_requests[spider] += arg_to_iter(request)
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests
Example #17
    def process_item(self, item, spider):
        obj_map = self.spider_objs[spider]
        rel_fields = item._model_rel_fields

        # If there are no related fields to resolve just save and return.
        if not rel_fields:
            return self.save_item(None, item, spider)

        # Build a list of outstanding requests.
        req_ids = sum([arg_to_iter(item.get(f.name, [])) for f in rel_fields], [])
        req_ids = [u for u in req_ids if (u not in obj_map or isinstance(obj_map[u], Deferred))]

        # If there are no requests to perform, fill, save and return.
        if not req_ids:
            return self.save_item(None, item, spider)

        # Defer?
        dlist = []
        for id in req_ids:
            if id not in obj_map:
                obj_map[id] = Deferred()
            dfd = obj_map[id]
            assert dfd is not None
            dlist.append(dfd)
        return DeferredList(dlist, consumeErrors=1).addCallback(self.save_item, item, spider)
Example #18
    def _request_callback(self, spider, original_callback, response):
        """

        Close the page (lose the reference to it so it is garbage collected)
        when the callback returns.

        The original callback may prevent page closing by setting the
        should_close_webpage attribute in responses. This is useful for
        example if the page is stored somewhere else (e.g. request meta) to be
        used later. The page then needs to be closed manually at some point by
        calling its close_page() function, which is created here.

        """

        if isinstance(original_callback, basestring):
            original_callback = getattr(spider, original_callback)

        webpage = response.webpage
        response.should_close_webpage = True
        try:
            returnValue(arg_to_iter((yield maybeDeferred(original_callback,
                                                         response))))
        finally:
            # FIXME: sometimes this section is reached before the wrapped
            # callback finishes, when it returns a Deferred.
            if response.should_close_webpage:
                self._close_page(webpage)
            else:
                webpage.close_page = partial(self._close_page, webpage)
                webpage.close_page.__doc__ = ("Lose the reference to the "
                                              "webpage object and allow it "
                                              "to be garbage collected.")
Example #19
	def __call__(self, value, loader_context=None):
		value = arg_to_iter(value)[0]
		if 'http://' not in value:
			#import pdb; pdb.set_trace()
			value = urljoin_rfc(get_base_url(loader_context['response']), value)
		
		return value
Example #20
 def __call__(self, values):
     out = []
     for value in [x for x in arg_to_iter(values) if isinstance(x, basestring)]:
         for m in self.latin_html_map:
             value = value.replace(m, self.latin_html_map[m])
         out.append(value)
     return out
Example #21
 def __call__(self, values):
     new_values = []
     for v in arg_to_iter(values):
         if isinstance(v, (str, unicode)):
             v = remove_entities(v).strip()
         new_values.append(int(v))
     return new_values
Example #22
	def __call__(self, value, loader_context=None):
		#value = TakeFirst(value)
		#value = value.Clean()
		values = arg_to_iter(value)
		for i,value in enumerate(values):
			if value:
				return value.replace(',','.')
Example #23
    def __call__(self, values):
        # first we convert the strings to Element objects
        list_of_elements = []
        for x in arg_to_iter(values):
            if not x.isspace():
                try:
                    list_of_elements.append(html.fromstring(x))
                except:
                    # invalid html(eg:comment tags)
                    pass
        # then, we go ahead and remove every higher-level 'bad' tag from the list
        processed_list = [x for x in list_of_elements if x.tag not in self.bad]
        # then we check each element for sub-elements, and
        # remove the 'bad' tags under each. this is because
        # sometimes the strings we get are full-on trees in their
        # own right, so we need to go and pick off the children
        # one by one.
        for elem in processed_list:
            # if the top level element is bad
            # we just remove it and all the children
            # should follow
            if elem.tag in self.bad:
                elem.getparent().remove(elem)
            else:
                # otherwise, we use the xpath
                # method to check if there are any
                # bad children using xpath.
                self.remove_children(elem, self.bad)

        return [html.tostring(x, encoding='unicode') for x in processed_list]
Example #24
    def remove_extra_words(self, value, remove_words=[]):
        """Removing all the extra words"""
        for temp in remove_words:
            temp = temp.lower()
            value = [v.lower().replace(temp, '') for v in arg_to_iter(value)]

        return value
Example #25
 def __call__(self, value, loader_context=None):
     values = arg_to_iter(value)
     if loader_context:
         context = MergeDict(loader_context, self.default_loader_context)
     else:
         context = self.default_loader_context
     wrapped_funcs = (wrap_loader_context(f, context) for f in self.functions)
     return self._compose(values, wrapped_funcs)
Example #26
    def _get_jmes_values(self, jmes_paths):
        if self.json_obj is None:
            raise RuntimeError("no JSON object found")

        jmes_paths = arg_to_iter(jmes_paths)
        return flatten(
            jmespath.search(jmes_path, self.json_obj)
            for jmes_path in jmes_paths)
Example #27
 def test_url_has_any_extension(self):
     deny_extensions = {'.' + e for e in arg_to_iter(IGNORED_EXTENSIONS)}
     self.assertTrue(url_has_any_extension("http://www.example.com/archive.tar.gz", deny_extensions))
     self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", deny_extensions))
     self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", deny_extensions))
Example #28
 def map_final_data(self, value):
     industries_codes = []
     for x in arg_to_iter(value):
         if x.lower() in self.map:
             industries_codes.append(self.map[x.lower()])
         else:
             industries_codes.append(x)
     return industries_codes
Example #29
 def __call__(self, values):
     rvals = []
     for v in arg_to_iter(values):
         try:
             rvals.append(datetime.strptime(v.strip(), self.informat).strftime(self.outformat))
         except ValueError:
             pass
     return rvals
Example #30
 def _get_cssvalues(self, csss, **kw):
     self._check_selector_method()
     csss = arg_to_iter(csss)
     ret = self._extract_hier_csss(self.selector, csss, **kw)
     if ret is None or not flatten(ret):
         return None
     else:
         return ret
Example #31
 def __call__(self, value, loader_context=None):
     if not value:
         value.append(" ")
     values = arg_to_iter(value)
     if loader_context:
         context = MergeDict(loader_context, self.default_loader_context)
     else:
         context = self.default_loader_context
     wrapped_funcs = [
         wrap_loader_context(f, context) for f in self.functions
     ]
     for func in wrapped_funcs:
         next_values = []
         for v in values:
             next_values += arg_to_iter(func(v))
         values = next_values
     return values
Example #32
 def process_item(self, item, spider):
     if 'meta' not in spider.name:
         return item
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     return dfd.addCallback(self.item_completed, item, info)
Example #33
 def __init__(self, settings):
     if not settings.getbool('PROXYMESH_ENABLED', True):
         raise NotConfigured
     self.proxies = itertools.cycle(
         arg_to_iter(
             settings.get('PROXYMESH_URL',
                          'http://us-il.proxymesh.com:31280')))
     self.timeout = settings.getint('PROXYMESH_TIMEOUT', 0)
Example #34
 def assertReMatch(self, regex, actual, msg=None):  # {{{
     actuals = arg_to_iter(actual)
     for actual in actuals:
         match = re.search(regex, actual)
         errmsg = "%s not match %s" % (actual, regex)
         if msg:
             errmsg = "%s\n%s" % (msg, errmsg)
         self.assertTrue(match, errmsg)
Example #35
    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 restrict_xpaths=(),
                 tags=('a', 'area'),
                 attrs=('href', ),
                 canonicalize=False,
                 unique=True,
                 process_value=None,
                 deny_extensions=None,
                 restrict_css=(),
                 strip=True,
                 restrict_text=()):
        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning,
            stacklevel=2,
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings():
            warnings.simplefilter('ignore', ScrapyDeprecationWarning)
            lx = BaseSgmlLinkExtractor(tag=tag_func,
                                       attr=attr_func,
                                       unique=unique,
                                       process_value=process_value,
                                       strip=strip,
                                       canonicalized=canonicalize)

        super(SgmlLinkExtractor,
              self).__init__(lx,
                             allow=allow,
                             deny=deny,
                             allow_domains=allow_domains,
                             deny_domains=deny_domains,
                             restrict_xpaths=restrict_xpaths,
                             restrict_css=restrict_css,
                             canonicalize=canonicalize,
                             deny_extensions=deny_extensions,
                             restrict_text=restrict_text)
Example #36
    def process_item(self, item, spider):
        info = self.spiderinfo
        requests = arg_to_iter(self.get_media_requests(item, info))
        dlist = [self._process_request(r, info) for r in requests]
        dfd = DeferredList(dlist, consumeErrors=1)
		
        # https://github.com/scrapy/scrapy/issues/4228
        item_copied = deepcopy(item)
        return dfd.addCallback(self.item_completed, item_copied, info)
Example #37
 def __init__(self, url, fields, lang_priorities=None):
     self.url = url
     self.fields = fields
     self.lang_priorities = {
         lang: prio for prio, lang in enumerate(arg_to_iter(lang_priorities))
     }
     self.labels = {}
     self.logger = LOGGER
Example #38
 def process_item(self, item, spider):
     # add names
     self.file_paths.update(item["file_paths"])
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     return dfd.addCallback(self.item_completed, item, info)
Example #39
 def _get_jsonpathvalues(self, jsonpaths, **kw):
     self._check_selector_method()
     jsonpaths = arg_to_iter(jsonpaths)
     ret = self._extract_hier_jsonpaths(self.selector.json, jsonpaths, **kw)
     if not flatten(ret):
         return None
     else:
         return ret
Example #40
    def __call__(self, values):
        output = []
        for val in arg_to_iter(values):
            val = self.remove_empty_spaces(val)
            val = self.remove_special_chars(val)
            output.extend(val)

        return output
Example #41
	def __call__(self, value, loader_context=None):
		values = arg_to_iter(value)
		values.reverse()
		secs=0
		for i, v in enumerate(values):
			secs += int(v.strip(':'))*(60**i)
		
		return secs
Example #42
    def __init__(self, link_extractor, allow, deny, allow_domains,
                 deny_domains, restrict_xpaths, canonicalize, deny_extensions,
                 restrict_css):

        self.link_extractor = link_extractor

        self.allow_res = [
            x if isinstance(x, _re_type) else re.compile(x)
            for x in arg_to_iter(allow)
        ]
        self.deny_res = [
            x if isinstance(x, _re_type) else re.compile(x)
            for x in arg_to_iter(deny)
        ]

        self.allow_domains = set(arg_to_iter(allow_domains))
        self.deny_domains = set(arg_to_iter(deny_domains))

        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
        self.restrict_xpaths += tuple(
            map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css)))

        self.canonicalize = canonicalize
        if deny_extensions is None:
            deny_extensions = IGNORED_EXTENSIONS
        self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
Example #43
 def process_item(self, item, spider):
     """
         custom process_item func,so it will manage the Request result.
     """
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     return dfd.addCallback(self.item_completed, item, info)
Example #44
def iterate_spider_output(result):
    if inspect.isasyncgen(result):
        return result
    elif inspect.iscoroutine(result):
        d = deferred_from_coro(result)
        d.addCallback(iterate_spider_output)
        return d
    else:
        return arg_to_iter(deferred_from_coro(result))
Example #45
 def process_item(self, item, spider):
     if item.get('image_urls'):
         info = self.spiderinfo
         requests = arg_to_iter(self.get_media_requests(item, info))
         dlist = [self._process_request(r, info) for r in requests]
         dfd = DeferredList(dlist, consumeErrors=True)
         return dfd.addCallback(self.item_completed, item, info)
     else:
         return item
Example #46
 def start_requests(self):
     print('self.queries: ', self.queries)
     for query in arg_to_iter(self.queries):
         url = self.make_google_search_request(COUNTRIES[self.region],
                                               query)
         print('url: ', url)
         yield scrapy.Request(url=url,
                              meta={'query': query},
                              callback=self.parse)
Example #47
    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 restrict_xpaths=(),
                 tags=('a', 'area'),
                 attrs=('href', ),
                 canonicalize=True,
                 unique=True,
                 process_value=None,
                 deny_extensions=None,
                 restrict_css=()):

        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning,
            stacklevel=2,
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings(record=True):
            lx = BaseSgmlLinkExtractor(tag=tag_func,
                                       attr=attr_func,
                                       unique=unique,
                                       process_value=process_value)

        super(SgmlLinkExtractor,
              self).__init__(lx,
                             allow=allow,
                             deny=deny,
                             allow_domains=allow_domains,
                             deny_domains=deny_domains,
                             restrict_xpaths=restrict_xpaths,
                             restrict_css=restrict_css,
                             canonicalize=canonicalize,
                             deny_extensions=deny_extensions)

        # FIXME: was added to fix a RegexLinkExtractor testcase
        self.base_url = None
Example #48
 def __call__(self, values):
     new_values = []
     for v in arg_to_iter(values):
         if isinstance(v, (str, unicode)):
             v = remove_entities(v).strip()
             v = (v.lower() == 'true')
         else:
             v = bool(v)
         new_values.append(v)
     return new_values
Example #49
 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None):
     self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
     self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
     self.allow_domains = set(arg_to_iter(allow_domains))
     self.deny_domains = set(arg_to_iter(deny_domains))
     self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
     self.canonicalize = canonicalize
     if deny_extensions is None:
         deny_extensions = IGNORED_EXTENSIONS
     self.deny_extensions = set(['.' + e for e in deny_extensions])
     tag_func = lambda x: x in tags
     attr_func = lambda x: x in attrs
     BaseSgmlLinkExtractor.__init__(self,
                                    tag=tag_func,
                                    attr=attr_func,
                                    unique=unique,
                                    process_value=process_value)
Example #50
 def add_values(self, values, keys=None):
     if not keys:
         keys = self.default_keys
     elif isinstance(keys, basestring):
         keys = self.keys[keys]
     
     for k, v in zip(keys, values):
         if k:
             for k in arg_to_iter(k):
                 self.add_value(k, v)
Example #51
 def _extract(self, response, option):
     """
         1. extract links/items
         2. filter desired output
     """
     conds = option.get('conds', [])
     ref = option.get('ref')
     if not ref:
         extracts = self.__extract(response, option)
         _ITEM_REFS[response] = list(arg_to_iter(extracts))
     return self.__filter(_ITEM_REFS[response], conds)
Example #52
 def __call__(self, values):
     out_values = []
     values = arg_to_iter(values)
     while values:
         val = values.pop(0)
         if values and val == 'R' and values[0] == 'B':
             values.pop(0)
             out_values.append('R&B')
         elif val:
             out_values.append(val)
     return out_values
Example #53
 def _deferred_field(self, field, item, spider):
     deferreds = [
         self._deferred_value(value, spider)
         for value in arg_to_iter(item.get(field))
     ]
     if not deferreds:
         item[field] = None
         return defer_result(item)
     deferred = DeferredList(deferreds, consumeErrors=True)
     deferred.addBoth(self._add_value, field, item)
     return deferred
Example #54
 def _extract_hier_csss(self, node, csss, **kw):
     csss = arg_to_iter(csss)
     if len(csss) > 1:
         child_csss = csss[1:]
         return [
             self._extract_hier_csss(Selector(text=child_node_html),
                                     child_csss, **kw)
             for child_node_html in node.css(csss[0])
         ]
     else:
         return filter_regex(kw.get('regex'), node.css(csss[0]))
Example #55
def iterate_spider_output(result):
    if collect_asyncgen and hasattr(
            inspect, 'isasyncgen') and inspect.isasyncgen(result):
        d = deferred_from_coro(collect_asyncgen(result))
        d.addCallback(iterate_spider_output)
        return d
    elif inspect.iscoroutine(result):
        d = deferred_from_coro(result)
        d.addCallback(iterate_spider_output)
        return d
    return arg_to_iter(result)
Example #56
    def process_item(self, item, spider):
        """Copy a limited number of image URLs to be downloaded from source to target."""

        # adding target field would result in error; return item as-is
        if hasattr(item, "fields") and self.target_field not in item.fields:
            return item

        if self.limit is None or self.limit < 0:  # copy through everything
            item[self.target_field] = list(
                arg_to_iter(item.get(self.source_field)))
            return item

        if not self.limit:  # limit is zero
            item[self.target_field] = []
            return item

        # actual limit
        item[self.target_field] = list(
            islice(arg_to_iter(item.get(self.source_field)), self.limit))
        return item
Example #57
 def process_item(self, item, spider):
     # ensure
     if abs(spider.count) + 1 > spider.max:
         spider.close_down = True
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     # only update when item is passed to pipeline, ensuring count consistency
     spider.count -= 1
     return dfd.addCallback(self.item_completed, item, info)
Example #58
def sticky_passthrough(spider, response, func, sticky_args, *args, **kwargs):
    meta_keys = set(
        list(sticky_args) + list(getattr(spider, 'sticky_meta', [])))
    sticky = {k: v for k, v in response.meta.items() if k in meta_keys}
    f = func(spider, response, *args, **kwargs)
    for r in arg_to_iter(f):
        if sticky and isinstance(r, Request):
            r.meta.update(
                {k: v
                 for k, v in sticky.items() if k not in r.meta.keys()})
        yield r
Example #59
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False,
                 unique=True, process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = LxmlParserLinkExtractor(
            tag=tag_func,
            attr=attr_func,
            unique=unique,
            process=process_value,
            strip=strip,
            canonicalized=canonicalize
        )

        super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
            allow_domains=allow_domains, deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
            canonicalize=canonicalize, deny_extensions=deny_extensions)
Example #60
    def get_value(self, value, *processors, **kw):
        regex = kw.get('re', None)
        if regex:
            value = arg_to_iter(value)
            value = flatten(extract_regex(regex, x) for x in value)

        for proc in processors:
            if value is None:
                break
            proc = wrap_loader_context(proc, self.context)
            value = proc(value)
        return value