Example #1
    def __init__(
        self,
        allow=(),
        deny=(),
        allow_domains=(),
        deny_domains=(),
        restrict_xpaths=(),
        tags=("a", "area"),
        attrs=("href",),
        canonicalize=True,
        unique=True,
        process_value=None,
        deny_extensions=None,
        restrict_css=(),
    ):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func, unique=unique, process=process_value)

        super(LxmlLinkExtractor, self).__init__(
            lx,
            allow=allow,
            deny=deny,
            allow_domains=allow_domains,
            deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths,
            restrict_css=restrict_css,
            canonicalize=canonicalize,
            deny_extensions=deny_extensions,
        )
Example #2
 def __init__(
     self,
     allow=(),
     deny=(),
     allow_domains=(),
     deny_domains=(),
     restrict_xpaths=(),
     tags=("a", "area"),
     attrs=("href"),
     canonicalize=True,
     unique=True,
     process_value=None,
     deny_extensions=None,
 ):
     self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
     self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
     self.allow_domains = set(arg_to_iter(allow_domains))
     self.deny_domains = set(arg_to_iter(deny_domains))
     self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
     self.canonicalize = canonicalize
     if deny_extensions is None:
         deny_extensions = IGNORED_EXTENSIONS
     self.deny_extensions = set(["." + e for e in deny_extensions])
     tag_func = lambda x: x in tags
     attr_func = lambda x: x in attrs
     BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func, unique=unique, process_value=process_value)
Example #3
File: sgml.py  Project: Tepira/scrapy
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
                 process_value=None, deny_extensions=None, restrict_css=()):

        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings(record=True):
            lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                unique=unique, process_value=process_value)

        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
            allow_domains=allow_domains, deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
            canonicalize=canonicalize, deny_extensions=deny_extensions)

        # FIXME: was added to fix a RegexLinkExtractor testcase
        self.base_url = None
Example #4
File: sgml.py  Project: elacuesta/scrapy
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
                 process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True, restrict_text=()):
        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning, stacklevel=2,
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings():
            warnings.simplefilter('ignore', ScrapyDeprecationWarning)
            lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                                       unique=unique, process_value=process_value, strip=strip,
                                       canonicalized=canonicalize)

        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
                                                allow_domains=allow_domains, deny_domains=deny_domains,
                                                restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
                                                canonicalize=canonicalize, deny_extensions=deny_extensions,
                                                restrict_text=restrict_text)
Example #5
    def __init__(self, allow=(), deny=()):
        """Initialize allow/deny attributes"""
        _re_type = type(re.compile('', 0))

        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) 
                          for x in arg_to_iter(allow)]
        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) 
                         for x in arg_to_iter(deny)]
Example #6
        def wrapper(self, response):
            if not hasattr(self, 'debug'):
                self.debug = False
            if self.debug:
                if not hasattr(self, 'failed_urls'):
                    self.failed_urls = []
                if not hasattr(self, 'missing_fields'):
                    self.missing_fields = {}
                self.crawler.stats.set_value('DEBUG MODE', 'TRUE')

            for result in arg_to_iter(func(self, response)):
                if isinstance(result, Item):
                    exception = DropItem
                else:
                    exception = CloseSpider

                if response.status >= 400 or any(error in response.body for error in errorstrings):
                    if self.debug:
                        self.crawler.stats.inc_value('DEBUG: failed_urls_count')
                        self.failed_urls.append(response.url)
                        self.crawler.stats.set_value(
                            'DEBUG: failed_urls', self.failed_urls)
                    if exception == DropItem:
                        self.crawler.stats.inc_value('items_dropped_count')
                    if response.status >= 400:
                        raise exception(
                            'Status Code Error: %s\nURL: %s'
                            % (response.status, response.url))
                    else:
                        errors = [error for error in errorstrings if error in response.body]
                        raise exception(
                            'Response Body Error: %s\nURL: %s'
                            % (', '.join(errors), response.url))

                if isinstance(result, Item):
                    if self.debug:
                        job_misses_a_required_field = False
                        for field in arg_to_iter(fields_to_check):
                            if not result.get(field):
                                if field not in self.missing_fields:
                                    self.missing_fields[field] = []
                                self.missing_fields[field].append(response.url)
                                job_misses_a_required_field = True
                        for key in self.missing_fields.keys():
                            self.crawler.stats.set_value('DEBUG: missing_%s_field' % key, self.missing_fields[key])

                        if job_misses_a_required_field:
                            self.crawler.stats.inc_value(
                                'DEBUG: jobs_missing_required_field_count')

                    else:
                        if not result.get('referencenumber'):
                            self.crawler.stats.inc_value('items_dropped_count')
                            raise MissingJobField('referencenumber', response.url)

                yield result
Example #7
    def test_arg_to_iter(self):
        assert hasattr(arg_to_iter(None), '__iter__')
        assert hasattr(arg_to_iter(100), '__iter__')
        assert hasattr(arg_to_iter('lala'), '__iter__')
        assert hasattr(arg_to_iter([1,2,3]), '__iter__')
        assert hasattr(arg_to_iter(l for l in 'abcd'), '__iter__')

        self.assertEqual(list(arg_to_iter(None)), [])
        self.assertEqual(list(arg_to_iter('lala')), ['lala'])
        self.assertEqual(list(arg_to_iter(100)), [100])
        self.assertEqual(list(arg_to_iter(l for l in 'abc')), ['a', 'b', 'c'])
        self.assertEqual(list(arg_to_iter([1,2,3])), [1,2,3])
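
The assertions above pin down the contract of arg_to_iter: None becomes an empty iterable, non-string iterables pass through unchanged, and anything else is wrapped in a single-element list. A minimal sketch consistent with those assertions (the shipped scrapy.utils.misc.arg_to_iter also special-cases dicts and Item objects; this is an approximation, not the real implementation):

def arg_to_iter(arg):
    # None -> empty iterable
    if arg is None:
        return []
    # strings/bytes iterate over characters, so treat them as single values
    if hasattr(arg, '__iter__') and not isinstance(arg, (str, bytes, dict)):
        return arg
    # anything else: wrap it in a one-element list
    return [arg]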
Example #8
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area', 'script', 'link'), attrs=('href', 'src'), canonicalize=True,
                 unique=True, process_value=None, deny_extensions=None):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = SayParserLinkExtractor(tag=tag_func, attr=attr_func,
            unique=unique, process=process_value)

        super(SayLinkExtractor, self).__init__(lx, allow, deny,
            allow_domains, deny_domains, restrict_xpaths, canonicalize,
            deny_extensions)
Example #9
File: sgml.py  Project: pkufranky/scrapy
 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), 
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None):
     self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
     self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
     self.allow_domains = set(arg_to_iter(allow_domains))
     self.deny_domains = set(arg_to_iter(deny_domains))
     self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
     self.canonicalize = canonicalize
     tag_func = lambda x: x in tags
     attr_func = lambda x: x in attrs
     BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func, 
         unique=unique, process_value=process_value)
Example #10
 def __call__(self, value, loader_context=None):
     values = arg_to_iter(value)
     if loader_context:
         context = MergeDict(loader_context, self.default_loader_context)
     else:
         context = self.default_loader_context
     wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
     for func in wrapped_funcs:
         next_values = []
         for v in values:
             next_values += arg_to_iter(func(v))
         values = next_values
     return values
Example #11
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True,
                 unique=True, process_value=None, deny_extensions=None, restrict_css=()):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func, unique=unique, process=process_value)
        self.crawledPagesPerSite = {}
        self.maximumPagesPerSite = 10000  # maximum number of pages to crawl per site
        super(CustomLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
            allow_domains=allow_domains, deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
            canonicalize=canonicalize, deny_extensions=deny_extensions)
Example #12
File: sgml.py  Project: amogh14/fintra
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                 deny_extensions=None):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
            unique=unique, process_value=process_value)
        super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
            allow_domains, deny_domains, restrict_xpaths, canonicalize,
            deny_extensions)

        # FIXME: was added to fix a RegexLinkExtractor testcase
        self.base_url = None
Example #13
    def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
        if attachs:
            msg = MIMEMultipart()
        else:
            msg = MIMENonMultipart(*mimetype.split('/', 1))

        to = list(arg_to_iter(to))
        cc = list(arg_to_iter(cc))

        msg['From'] = self.mailfrom
        msg['To'] = COMMASPACE.join(to)
        msg['Date'] = formatdate(localtime=True)
        msg['Subject'] = subject
        rcpts = to[:]
        if cc:
            rcpts.extend(cc)
            msg['Cc'] = COMMASPACE.join(cc)

        if charset:
            msg.set_charset(charset)

        if attachs:
            msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
            for attach_name, mimetype, f in attachs:
                part = MIMEBase(*mimetype.split('/'))
                part.set_payload(f.read())
                Encoders.encode_base64(part)
                part.add_header('Content-Disposition', 'attachment; filename="%s"' \
                    % attach_name)
                msg.attach(part)
        else:
            msg.set_payload(body)

        if _callback:
            _callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)

        if self.debug:
            logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
                         'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
                         {'mailto': to, 'mailcc': cc, 'mailsubject': subject,
                          'mailattachs': len(attachs)})
            return

        dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
        dfd.addCallbacks(self._sent_ok, self._sent_failed,
            callbackArgs=[to, cc, subject, len(attachs)],
            errbackArgs=[to, cc, subject, len(attachs)])
        reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
        return dfd
Example #14
 def _do_extract_items_from(self, htmlpage, extractor, response=None):
     # Try to predict template to use
     template_cluster, pref_template_id = self._cluster_page(htmlpage)
     extracted, template = extractor.extract(htmlpage, pref_template_id)
     extracted = extracted or []
     link_regions = []
     for ddict in extracted:
         link_regions.extend(arg_to_iter(ddict.pop("_links", [])))
     descriptor = None
     unprocessed = False
     if template is not None and hasattr(template, 'descriptor'):
         descriptor = template.descriptor()
         if hasattr(descriptor, 'name'):
             item_cls_name = descriptor.name
         elif hasattr(descriptor, 'get'):
             item_cls_name = descriptor.get('name',
                                            descriptor.get('display_name'))
         else:
             item_cls_name = ''
     else:
         unprocessed = True
         try:
             descriptor = self.schema_descriptors[template.id]
             item_cls_name = self.template_scrapes[template.id]
         except (AttributeError, KeyError):
             try:
                 descriptor = sorted(self.schema_descriptors.items())[0][1]
                 item_cls_name = sorted(self.template_scrapes.items())[0][1]
             except IndexError:
                 descriptor, item_cls_name = None, None
     item_cls = self.item_classes.get(item_cls_name)
     items = []
     for processed_attributes in extracted:
         if processed_attributes.get('_type') in self.item_classes:
             _type = processed_attributes['_type']
             item = self.item_classes[_type](processed_attributes)
             item['_type'] = item.display_name()
         elif unprocessed:
             item = self._process_attributes(processed_attributes,
                                             descriptor, htmlpage)
             if item_cls:
                 item = item_cls(item)
         elif item_cls:
             item = item_cls(processed_attributes)
         else:
             item = dict(processed_attributes)
         item[u'url'] = htmlpage.url
         item[u'_template'] = str(template.id)
         item.setdefault('_type', item_cls_name)
         if not isinstance(item, SlybotItem):
             default_meta = {'type': 'text', 'required': False,
                             'vary': False}
             item_cls = SlybotItem.create_iblitem_class(
                 {'fields': {k: default_meta for k in item}}
             )
             item = item_cls(**item)
         if self.clustering:
             item['_template_cluster'] = template_cluster
         items.append(item)
     return items, link_regions
Example #15
 def _compose(values, wrapped_funcs):
     for func in wrapped_funcs:
         next_values = []
         for v in values:
             next_values += arg_to_iter(func(v))
         values = next_values
     return values
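
The helper above feeds every value through each function in turn and re-flattens each result with arg_to_iter, so a function may return either a single value or a list of values. A self-contained illustration with plain callables and made-up inputs (only arg_to_iter is the real scrapy helper):

from scrapy.utils.misc import arg_to_iter

def compose(values, funcs):
    # same chaining logic as _compose above, shown standalone
    for func in funcs:
        next_values = []
        for v in values:
            next_values += arg_to_iter(func(v))
        values = next_values
    return values

print(compose(['  alpha ', ' beta '], [str.strip, str.upper]))  # ['ALPHA', 'BETA']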
Example #16
File: manager.py  Project: serkanh/scrapy
def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            spider_requests[spider] += arg_to_iter(request)
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests
Example #17
    def process_item(self, item, spider):
        obj_map = self.spider_objs[spider]
        rel_fields = item._model_rel_fields

        # If there are no related fields to resolve just save and return.
        if not rel_fields:
            return self.save_item(None, item, spider)

        # Build a list of outstanding requests.
        req_ids = sum([arg_to_iter(item.get(f.name, [])) for f in rel_fields], [])
        req_ids = [u for u in req_ids if (u not in obj_map or isinstance(obj_map[u], Deferred))]

        # If there are no requests to perform, fill, save and return.
        if not req_ids:
            return self.save_item(None, item, spider)

        # Defer?
        dlist = []
        for id in req_ids:
            if id not in obj_map:
                obj_map[id] = Deferred()
            dfd = obj_map[id]
            assert dfd is not None
            dlist.append(dfd)
        return DeferredList(dlist, consumeErrors=1).addCallback(self.save_item, item, spider)
Example #18
    def _request_callback(self, spider, original_callback, response):
        """

        Close the page (lose the reference to it so it is garbage collected)
        when the callback returns.

        The original callback may prevent page closing by setting the
        should_close_webpage attribute in responses. This is useful for
        example if the page is stored somewhere else (e.g. request meta) to be
        used later. The page then needs to be closed manually at some point by
        calling its close_page() function, which is created here.

        """

        if isinstance(original_callback, basestring):
            original_callback = getattr(spider, original_callback)

        webpage = response.webpage
        response.should_close_webpage = True
        try:
            returnValue(arg_to_iter((yield maybeDeferred(original_callback,
                                                         response))))
        finally:
            # FIXME: sometimes this section is reached before the wrapped
            # callback finishes, when it returns a Deferred.
            if response.should_close_webpage:
                self._close_page(webpage)
            else:
                webpage.close_page = partial(self._close_page, webpage)
                webpage.close_page.__doc__ = ("Lose the reference to the "
                                              "webpage object and allow it "
                                              "to be garbage collected.")
Example #19
	def __call__(self, value, loader_context=None):
		value = arg_to_iter(value)[0]
		if 'http://' not in value:
			#import pdb; pdb.set_trace()
			value = urljoin_rfc(get_base_url(loader_context['response']), value)
		
		return value
Example #20
 def __call__(self, values):
     out = []
     for value in [x for x in arg_to_iter(values) if isinstance(x, basestring)]:
         for m in self.latin_html_map:
             value = value.replace(m, self.latin_html_map[m])
         out.append(value)
     return out
Example #21
 def __call__(self, values):
     new_values = []
     for v in arg_to_iter(values):
         if isinstance(v, (str, unicode)):
             v = remove_entities(v).strip()
         new_values.append(int(v))
     return new_values
Example #22
	def __call__(self, value, loader_context=None):
		#value = TakeFirst(value)
		#value = value.Clean()
		values = arg_to_iter(value)
		for i,value in enumerate(values):
			if value:
				return value.replace(',','.')
Example #23
    def __call__(self, values):
        # first we convert the strings to Element objects
        list_of_elements = []
        for x in arg_to_iter(values):
            if not x.isspace():
                try:
                    list_of_elements.append(html.fromstring(x))
                except Exception:
                    # invalid html (e.g. comment tags)
                    pass
        # then, we go ahead and remove every higher-level 'bad' tag from the list
        processed_list = [x for x in list_of_elements if x.tag not in self.bad]
        # then we check each element for sub-elements, and
        # remove the 'bad' tags under each. this is because
        # sometimes the strings we get are full-on trees in their
        # own right, so we need to go and pick off the children
        # one by one.
        for elem in processed_list:
            # if the top level element is bad
            # we just remove it and all the children
            # should follow
            if elem.tag in self.bad:
                elem.getparent().remove(elem)
            else:
                # otherwise, we use the xpath
                # method to check if there are any
                # bad children using xpath.
                self.remove_children(elem, self.bad)

        return [html.tostring(x, encoding='unicode') for x in processed_list]
Example #24
    def remove_extra_words(self, value, remove_words=[]):
        """Removing all the extra words"""
        for temp in remove_words:
            temp = temp.lower()
            value = [v.lower().replace(temp, '') for v in arg_to_iter(value)]

        return value
Example #25
 def __call__(self, value, loader_context=None):
     values = arg_to_iter(value)
     if loader_context:
         context = MergeDict(loader_context, self.default_loader_context)
     else:
         context = self.default_loader_context
     wrapped_funcs = (wrap_loader_context(f, context) for f in self.functions)
     return self._compose(values, wrapped_funcs)
Example #26
    def _get_jmes_values(self, jmes_paths):
        if self.json_obj is None:
            raise RuntimeError("no JSON object found")

        jmes_paths = arg_to_iter(jmes_paths)
        return flatten(
            jmespath.search(jmes_path, self.json_obj)
            for jmes_path in jmes_paths)
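
The loader method above only flattens results across several JMESPath expressions; the per-expression extraction is plain jmespath.search. A quick standalone illustration with a made-up JSON object (not part of the loader code above):

import jmespath

data = {"shop": {"items": [{"name": "pen", "price": 2}, {"name": "ink", "price": 5}]}}
# project the name of every item with a single JMESPath expression
print(jmespath.search("shop.items[].name", data))  # ['pen', 'ink']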
Example #27
 def test_url_has_any_extension(self):
     deny_extensions = {'.' + e for e in arg_to_iter(IGNORED_EXTENSIONS)}
     self.assertTrue(url_has_any_extension("http://www.example.com/archive.tar.gz", deny_extensions))
     self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", deny_extensions))
     self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", deny_extensions))
Example #28
 def map_final_data(self, value):
     industries_codes = []
     for x in arg_to_iter(value):
         if x.lower() in self.map:
             industries_codes.append(self.map[x.lower()])
         else:
             industries_codes.append(x)
     return industries_codes
Example #29
 def __call__(self, values):
     rvals = []
     for v in arg_to_iter(values):
         try:
             rvals.append(datetime.strptime(v.strip(), self.informat).strftime(self.outformat))
         except ValueError:
             pass
     return rvals
Example #30
 def _get_cssvalues(self, csss, **kw):
     self._check_selector_method()
     csss = arg_to_iter(csss)
     ret = self._extract_hier_csss(self.selector, csss, **kw)
     if ret is None or not flatten(ret):
         return None
     else:
         return ret
Example #31
 def __call__(self, value, loader_context=None):
     if not value:
         value.append(" ")
     values = arg_to_iter(value)
     if loader_context:
         context = MergeDict(loader_context, self.default_loader_context)
     else:
         context = self.default_loader_context
     wrapped_funcs = [
         wrap_loader_context(f, context) for f in self.functions
     ]
     for func in wrapped_funcs:
         next_values = []
         for v in values:
             next_values += arg_to_iter(func(v))
         values = next_values
     return values
Example #32
 def process_item(self, item, spider):
     if 'meta' not in spider.name:
         return item
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     return dfd.addCallback(self.item_completed, item, info)
Example #33
 def __init__(self, settings):
     if not settings.getbool('PROXYMESH_ENABLED', True):
         raise NotConfigured
     self.proxies = itertools.cycle(
         arg_to_iter(
             settings.get('PROXYMESH_URL',
                          'http://us-il.proxymesh.com:31280')))
     self.timeout = settings.getint('PROXYMESH_TIMEOUT', 0)
Example #34
 def assertReMatch(self, regex, actual, msg=None):  # {{{
     actuals = arg_to_iter(actual)
     for actual in actuals:
         match = re.search(regex, actual)
         errmsg = "%s not match %s" % (actual, regex)
         if msg:
             errmsg = "%s\n%s" % (msg, errmsg)
         self.assertTrue(match, errmsg)
Example #35
    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 restrict_xpaths=(),
                 tags=('a', 'area'),
                 attrs=('href', ),
                 canonicalize=False,
                 unique=True,
                 process_value=None,
                 deny_extensions=None,
                 restrict_css=(),
                 strip=True,
                 restrict_text=()):
        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning,
            stacklevel=2,
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings():
            warnings.simplefilter('ignore', ScrapyDeprecationWarning)
            lx = BaseSgmlLinkExtractor(tag=tag_func,
                                       attr=attr_func,
                                       unique=unique,
                                       process_value=process_value,
                                       strip=strip,
                                       canonicalized=canonicalize)

        super(SgmlLinkExtractor,
              self).__init__(lx,
                             allow=allow,
                             deny=deny,
                             allow_domains=allow_domains,
                             deny_domains=deny_domains,
                             restrict_xpaths=restrict_xpaths,
                             restrict_css=restrict_css,
                             canonicalize=canonicalize,
                             deny_extensions=deny_extensions,
                             restrict_text=restrict_text)
Example #36
    def process_item(self, item, spider):
        info = self.spiderinfo
        requests = arg_to_iter(self.get_media_requests(item, info))
        dlist = [self._process_request(r, info) for r in requests]
        dfd = DeferredList(dlist, consumeErrors=1)

        # https://github.com/scrapy/scrapy/issues/4228
        item_copied = deepcopy(item)
        return dfd.addCallback(self.item_completed, item_copied, info)
Example #37
 def __init__(self, url, fields, lang_priorities=None):
     self.url = url
     self.fields = fields
     self.lang_priorities = {
         lang: prio for prio, lang in enumerate(arg_to_iter(lang_priorities))
     }
     self.labels = {}
     self.logger = LOGGER
Example #38
 def process_item(self, item, spider):
     # add names
     self.file_paths.update(item["file_paths"])
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     return dfd.addCallback(self.item_completed, item, info)
Example #39
 def _get_jsonpathvalues(self, jsonpaths, **kw):
     self._check_selector_method()
     jsonpaths = arg_to_iter(jsonpaths)
     ret = self._extract_hier_jsonpaths(self.selector.json, jsonpaths, **kw)
     if not flatten(ret):
         return None
     else:
         return ret
Example #40
    def __call__(self, values):
        output = []
        for val in arg_to_iter(values):
            val = self.remove_empty_spaces(val)
            val = self.remove_special_chars(val)
            output.extend(val)

        return output
Example #41
	def __call__(self, value, loader_context=None):
		values = list(arg_to_iter(value))
		values.reverse()
		secs = 0
		for i, v in enumerate(values):
			secs += int(v.strip(':')) * (60 ** i)
		
		return secs
Example #42
    def __init__(self, link_extractor, allow, deny, allow_domains,
                 deny_domains, restrict_xpaths, canonicalize, deny_extensions,
                 restrict_css):

        self.link_extractor = link_extractor

        self.allow_res = [
            x if isinstance(x, _re_type) else re.compile(x)
            for x in arg_to_iter(allow)
        ]
        self.deny_res = [
            x if isinstance(x, _re_type) else re.compile(x)
            for x in arg_to_iter(deny)
        ]

        self.allow_domains = set(arg_to_iter(allow_domains))
        self.deny_domains = set(arg_to_iter(deny_domains))

        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
        self.restrict_xpaths += tuple(
            map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css)))

        self.canonicalize = canonicalize
        if deny_extensions is None:
            deny_extensions = IGNORED_EXTENSIONS
        self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
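
Note how restrict_css is handled in the constructor above: each CSS selector is translated to an equivalent XPath expression and appended to restrict_xpaths, so downstream code only ever deals with XPath. The _csstranslator is typically parsel's HTMLTranslator; a minimal standalone illustration with a hypothetical selector:

from parsel.csstranslator import HTMLTranslator

# translate a CSS selector into the XPath expression the extractor will evaluate
print(HTMLTranslator().css_to_xpath("div.nav a"))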
Example #43
 def process_item(self, item, spider):
     """
         custom process_item func,so it will manage the Request result.
     """
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     return dfd.addCallback(self.item_completed, item, info)
Example #44
def iterate_spider_output(result):
    if inspect.isasyncgen(result):
        return result
    elif inspect.iscoroutine(result):
        d = deferred_from_coro(result)
        d.addCallback(iterate_spider_output)
        return d
    else:
        return arg_to_iter(deferred_from_coro(result))
Example #45
 def process_item(self, item, spider):
     if item.get('image_urls'):
         info = self.spiderinfo
         requests = arg_to_iter(self.get_media_requests(item, info))
         dlist = [self._process_request(r, info) for r in requests]
         dfd = DeferredList(dlist, consumeErrors=True)
         return dfd.addCallback(self.item_completed, item, info)
     else:
         return item
Example #46
 def start_requests(self):
     print('self.queries: ', self.queries)
     for query in arg_to_iter(self.queries):
         url = self.make_google_search_request(COUNTRIES[self.region],
                                               query)
         print('url: ', url)
         yield scrapy.Request(url=url,
                              meta={'query': query},
                              callback=self.parse)
Example #47
File: sgml.py  Project: vinchu/scrapy
    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 restrict_xpaths=(),
                 tags=('a', 'area'),
                 attrs=('href', ),
                 canonicalize=True,
                 unique=True,
                 process_value=None,
                 deny_extensions=None,
                 restrict_css=()):

        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning,
            stacklevel=2,
        )

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs

        with warnings.catch_warnings(record=True):
            lx = BaseSgmlLinkExtractor(tag=tag_func,
                                       attr=attr_func,
                                       unique=unique,
                                       process_value=process_value)

        super(SgmlLinkExtractor,
              self).__init__(lx,
                             allow=allow,
                             deny=deny,
                             allow_domains=allow_domains,
                             deny_domains=deny_domains,
                             restrict_xpaths=restrict_xpaths,
                             restrict_css=restrict_css,
                             canonicalize=canonicalize,
                             deny_extensions=deny_extensions)

        # FIXME: was added to fix a RegexLinkExtractor testcase
        self.base_url = None
Example #48
 def __call__(self, values):
     new_values = []
     for v in arg_to_iter(values):
         if isinstance(v, (str, unicode)):
             v = remove_entities(v).strip()
             v = (v.lower() == 'true')
         else:
             v = bool(v)
         new_values.append(v)
     return new_values
Example #49
 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None):
     self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
     self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
     self.allow_domains = set(arg_to_iter(allow_domains))
     self.deny_domains = set(arg_to_iter(deny_domains))
     self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
     self.canonicalize = canonicalize
     if deny_extensions is None:
         deny_extensions = IGNORED_EXTENSIONS
     self.deny_extensions = set(['.' + e for e in deny_extensions])
     tag_func = lambda x: x in tags
     attr_func = lambda x: x in attrs
     BaseSgmlLinkExtractor.__init__(self,
                                    tag=tag_func,
                                    attr=attr_func,
                                    unique=unique,
                                    process_value=process_value)
Example #50
 def add_values(self, values, keys=None):
     if not keys:
         keys = self.default_keys
     elif isinstance(keys, basestring):
         keys = self.keys[keys]
     
     for k, v in zip(keys, values):
         if k:
             for k in arg_to_iter(k):
                 self.add_value(k, v)
Example #51
 def _extract(self, response, option):
     """
         1. extract links/items
         2. filter desired output
     """
     conds = option.get('conds', [])
     ref = option.get('ref')
     if not ref:
         extracts = self.__extract(response, option)
         _ITEM_REFS[response] = list(arg_to_iter(extracts))
     return self.__filter(_ITEM_REFS[response], conds)
Example #52
 def __call__(self, values):
     out_values = []
     values = list(arg_to_iter(values))
     while values:
         val = values.pop(0)
         if values and val == 'R' and values[0] == 'B':
             values.pop(0)
             out_values.append('R&B')
         elif val:
             out_values.append(val)
     return out_values
Example #53
 def _deferred_field(self, field, item, spider):
     deferreds = [
         self._deferred_value(value, spider)
         for value in arg_to_iter(item.get(field))
     ]
     if not deferreds:
         item[field] = None
         return defer_result(item)
     deferred = DeferredList(deferreds, consumeErrors=True)
     deferred.addBoth(self._add_value, field, item)
     return deferred
Example #54
 def _extract_hier_csss(self, node, csss, **kw):
     csss = arg_to_iter(csss)
     if len(csss) > 1:
         child_csss = csss[1:]
         return [
             self._extract_hier_csss(Selector(text=child_node_html),
                                     child_csss, **kw)
             for child_node_html in node.css(csss[0])
         ]
     else:
         return filter_regex(kw.get('regex'), node.css(csss[0]))
Example #55
def iterate_spider_output(result):
    if collect_asyncgen and hasattr(
            inspect, 'isasyncgen') and inspect.isasyncgen(result):
        d = deferred_from_coro(collect_asyncgen(result))
        d.addCallback(iterate_spider_output)
        return d
    elif inspect.iscoroutine(result):
        d = deferred_from_coro(result)
        d.addCallback(iterate_spider_output)
        return d
    return arg_to_iter(result)
Example #56
    def process_item(self, item, spider):
        """Copy a limited number of image URLs to be downloaded from source to target."""

        # adding target field would result in error; return item as-is
        if hasattr(item, "fields") and self.target_field not in item.fields:
            return item

        if self.limit is None or self.limit < 0:  # copy through everything
            item[self.target_field] = list(
                arg_to_iter(item.get(self.source_field)))
            return item

        if not self.limit:  # limit is zero
            item[self.target_field] = []
            return item

        # actual limit
        item[self.target_field] = list(
            islice(arg_to_iter(item.get(self.source_field)), self.limit))
        return item
Example #57
 def process_item(self, item, spider):
     # ensure
     if abs(spider.count) + 1 > spider.max:
         spider.close_down = True
     info = self.spiderinfo
     requests = arg_to_iter(self.get_media_requests(item, info))
     dlist = [self._process_request(r, info) for r in requests]
     dfd = DeferredList(dlist, consumeErrors=1)
     # only update when item is passed to pipeline, ensuring count consistency
     spider.count -= 1
     return dfd.addCallback(self.item_completed, item, info)
Example #58
def sticky_passthrough(spider, response, func, sticky_args, *args, **kwargs):
    meta_keys = set(
        list(sticky_args) + list(getattr(spider, 'sticky_meta', [])))
    sticky = {k: v for k, v in response.meta.items() if k in meta_keys}
    f = func(spider, response, *args, **kwargs)
    for r in arg_to_iter(f):
        if sticky and isinstance(r, Request):
            r.meta.update(
                {k: v
                 for k, v in sticky.items() if k not in r.meta.keys()})
        yield r
Example #59
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False,
                 unique=True, process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
        lx = LxmlParserLinkExtractor(
            tag=tag_func,
            attr=attr_func,
            unique=unique,
            process=process_value,
            strip=strip,
            canonicalized=canonicalize
        )

        super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
            allow_domains=allow_domains, deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
            canonicalize=canonicalize, deny_extensions=deny_extensions)
Example #60
    def get_value(self, value, *processors, **kw):
        regex = kw.get('re', None)
        if regex:
            value = arg_to_iter(value)
            value = flatten(extract_regex(regex, x) for x in value)

        for proc in processors:
            if value is None:
                break
            proc = wrap_loader_context(proc, self.context)
            value = proc(value)
        return value