from scrapinghub import ScrapinghubClient

# resolve_project_key() is defined elsewhere in this package.


class SHConnection:
    '''
    Wrapper for the scrapinghub client, project and API calls to simplify use.
    '''

    def __init__(self, api_key, default_project_key=None):
        self.api_key = api_key
        self.project_key = resolve_project_key(
            default_project_key=default_project_key
        )

    def __enter__(self):
        self.client = ScrapinghubClient(self.api_key)
        self.project = self.client.get_project(self.project_key)
        return self

    def __exit__(self, *args):
        self.client.close()

    def jobs_iter(self, **kwargs):
        return self.project.jobs.iter(**kwargs)

    def get_job(self, job_id):
        return self.client.get_job(job_id)
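
# A minimal usage sketch, not part of the class above. The API key and
# project key below are placeholders, and it assumes resolve_project_key()
# falls back to the default_project_key passed through SHConnection.
if __name__ == '__main__':
    with SHConnection('<APIKEY>', default_project_key='12345') as conn:
        # Iterate over up to five finished jobs in the project.
        for job_summary in conn.jobs_iter(state='finished', count=5):
            print(job_summary['key'])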
import logging
import time
from collections import defaultdict

import requests as requests_lib
from scrapinghub import ScrapinghubClient

LOG = logging.getLogger(__name__)


class HCFManager(object):
    """
    Helper around the Hubstorage Crawl Frontier (HCF): tracks link counts
    per slot, flushes in batches, and retries failed reads and deletes.
    """

    def __init__(self, auth, project_id, frontier, batch_size=0):
        self._client = ScrapinghubClient(auth=auth)
        self._hcf = self._client.get_project(project_id).frontiers
        self._frontier = self._hcf.get(frontier)
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._batch_size = batch_size
        self._hcf_retries = 10

    def add_request(self, slot, request):
        self._frontier.get(slot).q.add([request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        if self._batch_size and \
                self._links_to_flush_count[slot] >= self._batch_size:
            return self.flush(slot)
        return 0

    def flush(self, slot=None):
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                for flushed_slot in self._links_to_flush_count:
                    self._links_to_flush_count[flushed_slot] = 0
                LOG.info('Flushed %d link(s).', n_links_to_flush)
            else:
                slot_obj = self._frontier.get(slot)
                slot_obj.flush()
                self._links_to_flush_count[slot] = 0
        return n_links_to_flush

    def read(self, slot, mincount=None):
        slot_obj = self._frontier.get(slot)
        for i in range(self._hcf_retries):
            try:
                return slot_obj.q.iter(mincount=mincount)
            except requests_lib.exceptions.ReadTimeout:
                LOG.error("Could not read from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.ConnectionError:
                LOG.error("Connection error while reading from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.RequestException:
                LOG.error("Error while reading from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            time.sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        slot_obj = self._frontier.get(slot)
        for i in range(self._hcf_retries):
            try:
                slot_obj.q.delete(ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                LOG.error("Could not delete ids from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.ConnectionError:
                LOG.error("Connection error while deleting ids from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.RequestException:
                LOG.error("Error deleting ids from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            time.sleep(60 * (i + 1))

    def delete_slot(self, slot):
        slot_obj = self._frontier.get(slot)
        slot_obj.delete()

    def close(self):
        self._hcf.close()
        self._client.close()

    def get_number_of_links(self, slot=None):
        if slot is None:
            return sum(self._links_count.values())
        return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        if slot is None:
            return sum(self._links_to_flush_count.values())
        return self._links_to_flush_count[slot]
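
# A minimal usage sketch, with placeholder auth token, project id, frontier
# and slot names. It assumes the HCF fingerprint request format ({'fp': ...})
# and that read() yields batch dicts with 'id' and 'requests' keys, as the
# frontier queue API returns them.
if __name__ == '__main__':
    manager = HCFManager('<APIKEY>', '12345', 'test-frontier', batch_size=100)
    manager.add_request('example.com', {'fp': '/page-1.html'})
    manager.flush()  # push any buffered links for all slots
    for batch in manager.read('example.com'):
        print(batch['id'], len(batch['requests']))
        manager.delete('example.com', [batch['id']])
    manager.close()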
import logging
import time
from collections import defaultdict

import dateparser
from scrapinghub import ScrapinghubClient

log = logging.getLogger(__name__)

# DEFAULT_BATCHSIZE, LIMIT_KEY_CHAR, get_project_id, get_num_partitions,
# filter_collections_exist and _CachedBlocksCollection are defined elsewhere
# in this package.


class CollectionScanner(object):
    """
    Base class for all collection scanners
    """
    # A list of names of complementary collections that share the same keys
    # as the principal collection; their data is merged into the output.
    # For optimization purposes it is assumed that secondary collections do
    # not have keys that are absent from the principal. That is, the key set
    # of each secondary collection is always a subset of the principal's.
    # TODO: this logic does not work with startts
    secondary_collections = []

    def __init__(self, collection_name, project_id=None, apikey=None,
                 batchsize=DEFAULT_BATCHSIZE, count=0, max_next_records=1000,
                 startafter=None, stopbefore=None, exclude_prefixes=None,
                 secondary_collections=None, autodetect_partitions=True,
                 **kwargs):
        """
        collection_name - target collection
        project_id - target project id. If None, autodetect from the
                SHUB_JOBKEY environment variable.
        apikey - hubstorage apikey with access to the given project. If None,
                read from the SH_APIKEY environment variable (delegated to
                the scrapinghub library).
        batchsize - size of each batch, in number of records
        count - total count of records to retrieve
        max_next_records - how many records to fetch on each call to the
                hubstorage server
        startafter - start to scan after the given hs key prefix
        stopbefore - stop once the given hs key prefix is found
        exclude_prefixes - a list of key prefixes to exclude from scanning
        secondary_collections - a list of secondary collections that extends
                the class default.
        autodetect_partitions - if True (the default), autodetect partitioned
                collections. Pass False to force reading a non-partitioned
                collection when a partitioned version also exists under the
                same name.
        **kwargs - extra arguments passed on to the hubstorage collection,
                e.g.:
            - prefix (list of key prefixes to include in the scan)
            - startts and endts, either in epoch millisecs (as accepted by
              hubstorage) or a date string (support is added here)
            - meta (a list with either '_ts' and/or '_key')
            etc. (see hubstorage documentation)
        """
        self.hsc = ScrapinghubClient(apikey)._hsclient
        project_id = project_id or get_project_id()
        self.hsp = self.hsc.get_project(project_id)
        num_partitions = None
        if autodetect_partitions:
            num_partitions = get_num_partitions(self.hsp, collection_name)
            if num_partitions:
                log.info("Partitioned collection detected: %d total partitions.",
                         num_partitions)
        self.col = _CachedBlocksCollection(self.hsp, collection_name,
                                           num_partitions)
        self.__scanned_count = 0
        self.__totalcount = count
        self.lastkey = None
        self.__startafter = startafter
        self.__stopbefore = stopbefore
        self.__exclude_prefixes = exclude_prefixes or []
        # Copy the class attribute before extending, so instances do not
        # mutate the shared class-level list.
        self.secondary_collections = list(self.secondary_collections)
        self.secondary_collections.extend(secondary_collections or [])
        self.secondary = [
            _CachedBlocksCollection(self.hsp, name)
            for name in filter_collections_exist(self.hsp,
                                                 self.secondary_collections)
        ]
        self.__secondary_is_empty = defaultdict(bool)
        self.__batchsize = batchsize
        self.__max_next_records = max_next_records
        self.__enabled = True
        self.__start = kwargs.pop('start', '')
        kwargs = kwargs.copy()
        self.__endts = self.convert_ts(kwargs.get('endts', None))
        kwargs['endts'] = self.__endts
        kwargs['startts'] = self.convert_ts(kwargs.get('startts', None))
        self.__get_kwargs = kwargs

    def reset(self):
        """
        Resets the scanner state variables in order to start scanning the
        collection again
        """
        self.__scanned_count = 0
        self.__totalcount = 0
        self.lastkey = None
        self.__startafter = None
        self.__secondary_is_empty = defaultdict(bool)
        self.__enabled = True
    def get_secondary_data(self, start, meta):
        secondary_data = defaultdict(dict)
        last = None
        max_next_records = self._get_max_next_records(self.__batchsize)
        for col in self.secondary:
            if not self.__secondary_is_empty[col.colname]:
                count = 0
                try:
                    for r in col.get(count=[max_next_records], start=start,
                                     meta=meta):
                        count += 1
                        last = key = r.pop('_key')
                        ts = r.pop('_ts')
                        secondary_data[key].update(r)
                        # Keep the newest _ts seen for this key.
                        if ('_ts' not in secondary_data[key]
                                or ts > secondary_data[key]['_ts']):
                            secondary_data[key]['_ts'] = ts
                except KeyError:
                    pass
                if count < max_next_records:
                    self.__secondary_is_empty[col.colname] = True
                    log.info('Secondary collection %s is depleted',
                             col.colname)
        return last, dict(secondary_data)

    def convert_ts(self, timestamp):
        """
        Read a timestamp in diverse formats and return millisecs epoch
        """
        if isinstance(timestamp, (list, tuple)):
            timestamp = timestamp[0]
        if isinstance(timestamp, str):
            timestamp = self.str_to_msecs(timestamp)
        return timestamp

    def get_new_batch(self, random_mode=False):
        """
        Convenience method for scanning a collection in batches
        """
        kwargs = self.__get_kwargs.copy()
        original_meta = kwargs.pop('meta', [])
        meta = {'_key', '_ts'}.union(original_meta)
        last_secondary_key = None
        batchcount = self.__batchsize
        max_next_records = self._get_max_next_records(batchcount)
        # start is used only once, as HS nullifies startafter if start is
        # given
        start = self.__start
        self.__start = ''
        while max_next_records and self.__enabled:
            count = 0
            jump_prefix = False
            for r in self.col.get(random_mode, count=[max_next_records],
                                  startafter=[self.__startafter], start=start,
                                  meta=meta, **kwargs):
                if self.__stopbefore is not None and \
                        r['_key'].startswith(self.__stopbefore):
                    self.__enabled = False
                    break
                count += 1
                for exclude in self.__exclude_prefixes:
                    if r['_key'].startswith(exclude):
                        self.__startafter = exclude + LIMIT_KEY_CHAR
                        jump_prefix = True
                        break
                if jump_prefix:
                    break
                self.__startafter = self.lastkey = r['_key']
                if (last_secondary_key is None
                        or self.__startafter > last_secondary_key):
                    last_secondary_key, secondary_data = \
                        self.get_secondary_data(start=self.__startafter,
                                                meta=meta)
                srecord = secondary_data.pop(r['_key'], None)
                if srecord is not None:
                    # Merge the secondary record and keep the newest _ts
                    # between the principal and the secondary record.
                    ts = r['_ts']
                    r.update(srecord)
                    if ts > r['_ts']:
                        r['_ts'] = ts
                if self.__endts and r['_ts'] > self.__endts:
                    continue
                for m in ['_key', '_ts']:
                    if m not in original_meta:
                        r.pop(m)
                self.__scanned_count += 1
                batchcount -= 1
                if self.__scanned_count % 10000 == 0:
                    log.info("Last key: %s, Scanned %d", self.lastkey,
                             self.__scanned_count)
                yield r
            self.__enabled = (count >= max_next_records and
                              (not self.__totalcount or
                               self.__scanned_count < self.__totalcount)
                              ) or jump_prefix
            max_next_records = self._get_max_next_records(batchcount)

    def _get_max_next_records(self, batchcount):
        max_next_records = min(self.__max_next_records, batchcount)
        if self.__totalcount:
            max_next_records = min(max_next_records,
                                   self.__totalcount - self.__scanned_count)
        return max_next_records

    def scan_collection_batches(self):
        while self.__enabled:
            batch = list(self.get_new_batch())
            if batch:
                yield batch

    def close(self):
        log.info("Total scanned: %d", self.__scanned_count)
        self.hsc.close()

    def set_startafter(self, startafter):
        self.__startafter = startafter

    @staticmethod
    def str_to_msecs(strtime):
        """
        Converts any format supported by dateparser to epoch millisecs,
        which is the time representation used by hubstorage
        """
        if isinstance(strtime, int):
            return strtime
        if isinstance(strtime, str):
            d = dateparser.parse(strtime)
            return int(time.mktime(d.timetuple()) - time.timezone) * 1000
        return 0

    @property
    def scanned_count(self):
        return self.__scanned_count
    @property
    def is_enabled(self):
        return self.__enabled
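
# A minimal usage sketch, with placeholder collection name and project id.
# It assumes SH_APIKEY is set in the environment (apikey=None delegates to
# the scrapinghub library, per the constructor docstring).
if __name__ == '__main__':
    scanner = CollectionScanner('my_collection', project_id='12345',
                                batchsize=1000, meta=['_key'])
    for batch in scanner.scan_collection_batches():
        for record in batch:
            print(record['_key'])
    scanner.close()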