def __init__(self, crawler):
    """Configure page storage for a crawl.

    Resolves the hubstorage reference, reads the PAGE_STORAGE_* settings,
    creates a gzip batch writer pointed at the page collection, and hooks
    ``spider_closed`` for cleanup.

    :param crawler: the Scrapy crawler whose settings and signals are used.
    :raises NotConfigured: when ``sh_scrapy.hsref`` is unavailable, which
        disables this component.
    """
    # FIXME move sh_scrapy.hsref to python-hubstorage and drop it
    try:
        from sh_scrapy.hsref import hsref
        self.hsref = hsref
    except ImportError:
        raise NotConfigured

    settings = crawler.settings
    # 'cs' is the plain collection store; 'vcs' the versioned cache store.
    mode = 'vcs' if settings.get('PAGE_STORAGE_MODE') == 'VERSIONED_CACHE' else 'cs'
    self.trim_html = settings.getbool('PAGE_STORAGE_TRIM_HTML')
    self.enabled, self.on_error_enabled = _get_enabled_status(settings)
    # Per-category page limits and running counters ('all' pages vs 'error' pages).
    self.limits = {
        'all': settings.getint('PAGE_STORAGE_LIMIT'),
        'error': settings.getint('PAGE_STORAGE_ON_ERROR_LIMIT'),
    }
    self.counters = {
        'all': 0,
        'error': 0,
    }
    self.cookies_seen = set()
    endpoint = urlpathjoin(hsref.project.collections.url, mode, _COLLECTION_NAME)
    logger.info("HubStorage: writing pages to %s", endpoint)
    # Advertise the collection on the job metadata so it is discoverable.
    hsref.job.metadata.apipost('collection', jl=urlpathjoin(mode, _COLLECTION_NAME))
    self._writer = hsref.client.batchuploader.create_writer(
        endpoint, content_encoding='gzip', size=20)
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
def unset_testbotgroup(hsproject):
    """Remove the test botgroup assignment from *hsproject*."""
    hsproject.settings.apidelete('botgroups')
    hsproject.settings.expire()
    # JobQ keeps its own botgroups table; clear the entry there as well.
    botgroup_url = urlpathjoin(TEST_ENDPOINT, 'botgroups', TEST_BOTGROUP)
    requests.delete(botgroup_url, auth=hsproject.auth)
def set_testbotgroup(hsproject):
    """Assign the test botgroup to *hsproject* and register it in JobQ."""
    hsproject.settings.apipost(jl={'botgroups': [TEST_BOTGROUP]})
    # JobQ tracks botgroups separately; posting max_running seeds its table.
    max_running_url = urlpathjoin(
        TEST_ENDPOINT, 'botgroups', TEST_BOTGROUP, 'max_running')
    requests.post(max_running_url, auth=hsproject.auth, data='null')
    hsproject.settings.expire()