Example #1
from scrapinghub import ScrapinghubClient

# NOTE: resolve_project_key is a helper from the surrounding codebase; it is
# not part of the scrapinghub library.


class SHConnection:
    ''' Wrapper for scrapinghub client, project and api calls
    to simplify use.
    '''

    def __init__(self, api_key, default_project_key=None):
        self.api_key = api_key
        self.project_key = resolve_project_key(
            default_project_key=default_project_key
        )

    def __enter__(self):
        self.client = ScrapinghubClient(self.api_key)
        self.project = self.client.get_project(self.project_key)
        return self

    def __exit__(self, *args):
        self.client.close()

    def jobs_iter(self, **kwargs):
        return self.project.jobs.iter(**kwargs)

    def get_job(self, job_id):
        return self.client.get_job(job_id)
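
A minimal usage sketch of the wrapper above, assuming a valid Scrapy Cloud API key and project key (both values below are placeholders); entering the context manager opens the client and leaving it closes the connection:

# Usage sketch: the API key and project key are placeholders.
with SHConnection('YOUR_API_KEY', default_project_key='123456') as conn:
    # Iterate over the last 10 finished jobs of the resolved project.
    for job_summary in conn.jobs_iter(state='finished', count=10):
        print(job_summary['key'])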
Example #2
import logging
import time
from collections import defaultdict

import requests as requests_lib  # alias assumed by the exception handlers below
from scrapinghub import ScrapinghubClient

LOG = logging.getLogger(__name__)


class HCFManager(object):
    def __init__(self, auth, project_id, frontier, batch_size=0):
        self._client = ScrapinghubClient(auth=auth)
        self._hcf = self._client.get_project(project_id).frontiers
        self._frontier = self._hcf.get(frontier)
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._batch_size = batch_size
        self._hcf_retries = 10

    def add_request(self, slot, request):
        self._frontier.get(slot).q.add([request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        if self._batch_size and self._links_to_flush_count[slot] >= self._batch_size:
            return self.flush(slot)
        return 0

    def flush(self, slot=None):
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                for slot in self._links_to_flush_count.keys():
                    self._links_to_flush_count[slot] = 0
                LOG.info('Flushed %d link(s).', n_links_to_flush)
            else:
                slot_obj = self._frontier.get(slot)
                slot_obj.flush()
                self._links_to_flush_count[slot] = 0

        return n_links_to_flush

    def read(self, slot, mincount=None):
        slot_obj = self._frontier.get(slot)
        for i in range(self._hcf_retries):
            try:
                return slot_obj.q.iter(mincount=mincount)
            except requests_lib.exceptions.ReadTimeout:
                LOG.error("Could not read from {0}/{1} try {2}/{3}".format(
                    self._frontier.key, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                LOG.error(
                    "Connection error while reading from {0}/{1} try {2}/{3}".
                    format(self._frontier.key, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                LOG.error(
                    "Error while reading from {0}/{1} try {2}/{3}".format(
                        self._frontier.key, slot, i + 1, self._hcf_retries))
            time.sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        slot_obj = self._frontier.get(slot)

        for i in range(self._hcf_retries):
            try:
                slot_obj.q.delete(ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                LOG.error(
                    "Could not delete ids from {0}/{1} try {2}/{3}".format(
                        self._frontier.key, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                LOG.error(
                    "Connection error while deleting ids from {0}/{1} try {2}/{3}"
                    .format(self._frontier.key, slot, i + 1,
                            self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                LOG.error("Error deleting ids from {0}/{1} try {2}/{3}".format(
                    self._frontier.key, slot, i + 1, self._hcf_retries))
            time.sleep(60 * (i + 1))

    def delete_slot(self, slot):
        slot_obj = self._frontier.get(slot)
        slot_obj.delete()

    def close(self):
        self._hcf.close()
        self._client.close()

    def get_number_of_links(self, slot=None):
        if slot is None:
            return sum(self._links_count.values())
        else:
            return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        if slot is None:
            return sum(self._links_to_flush_count.values())
        else:
            return self._links_to_flush_count[slot]
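
A minimal usage sketch of HCFManager, assuming a valid API key, project id, frontier and slot (all names below are placeholders, and process() is a hypothetical consumer of the batches read back from the frontier):

# Usage sketch: auth, project id, frontier and slot names are placeholders.
manager = HCFManager(auth='YOUR_API_KEY', project_id='123456',
                     frontier='test-frontier', batch_size=100)
manager.add_request('slot0', {'fp': 'https://example.com/page-1'})
manager.flush()  # push any buffered links to the frontier
for batch in manager.read('slot0'):
    process(batch['requests'])          # process() is a hypothetical callback
    manager.delete('slot0', [batch['id']])
manager.close()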
Example #3
import logging
import time
from collections import defaultdict

import dateparser
from scrapinghub import ScrapinghubClient

# DEFAULT_BATCHSIZE, LIMIT_KEY_CHAR, get_project_id, get_num_partitions,
# filter_collections_exist and _CachedBlocksCollection are helpers defined
# elsewhere in the package this class comes from.

log = logging.getLogger(__name__)


class CollectionScanner(object):
    """
    Base class for all collection scanners
    """
    # A list of names of complementary collections which share their keys with the
    # principal one; their data is merged into the output.
    # For optimization purposes it is assumed that secondary collections do not
    # have keys that are absent from the principal one, i.e. the key set of each
    # secondary collection is always a subset of the principal's key set.
    # TODO: logic does not work with startts
    secondary_collections = []

    def __init__(self,
                 collection_name,
                 project_id=None,
                 apikey=None,
                 batchsize=DEFAULT_BATCHSIZE,
                 count=0,
                 max_next_records=1000,
                 startafter=None,
                 stopbefore=None,
                 exclude_prefixes=None,
                 secondary_collections=None,
                 autodetect_partitions=True,
                 **kwargs):
        """
        collection_name - target collection
        project_id - target project id. If None, it is autodetected from the SHUB_JOBKEY environment variable.
        apikey - hubstorage apikey with access to the given project. If None, it is read from the SH_APIKEY
                 environment variable (delegated to the scrapinghub library).
        batchsize - size of each batch in number of records
        count - total count of records to retrieve
        max_next_records - how many records to get on each call to the hubstorage server
        startafter - start to scan after the given hs key prefix
        stopbefore - stop once the given hs key prefix is found
        exclude_prefixes - a list of key prefixes to exclude from scanning
        secondary_collections - a list of secondary collections that extends the class default one.
        autodetect_partitions - If True (the default), autodetect partitioned collections. Set to False to force
                reading a non-partitioned collection when a partitioned version also exists under the same name.
        **kwargs - other extra arguments you want to pass to the hubstorage collection, e.g.:
                - prefix (a list of key prefixes to include in the scan)
                - startts and endts, either in epoch millisecs (as accepted by hubstorage) or a date string (support is added here)
                - meta (a list with either '_ts' and/or '_key')
                etc. (see the hubstorage documentation)
        """
        self.hsc = ScrapinghubClient(apikey)._hsclient
        project_id = project_id or get_project_id()
        self.hsp = self.hsc.get_project(project_id)

        num_partitions = None
        if autodetect_partitions:
            num_partitions = get_num_partitions(self.hsp, collection_name)
            if num_partitions:
                log.info(
                    "Partitioned collection detected: %d total partitions.",
                    num_partitions)

        self.col = _CachedBlocksCollection(self.hsp, collection_name,
                                           num_partitions)
        self.__scanned_count = 0
        self.__totalcount = count
        self.lastkey = None
        self.__startafter = startafter
        self.__stopbefore = stopbefore
        self.__exclude_prefixes = exclude_prefixes or []
        self.secondary_collections.extend(secondary_collections or [])
        self.secondary = [
            _CachedBlocksCollection(self.hsp,
                                    name) for name in filter_collections_exist(
                                        self.hsp, self.secondary_collections)
        ]
        self.__secondary_is_empty = defaultdict(bool)
        self.__batchsize = batchsize
        self.__max_next_records = max_next_records
        self.__enabled = True

        self.__start = kwargs.pop('start', '')
        kwargs = kwargs.copy()
        self.__endts = self.convert_ts(kwargs.get('endts', None))
        kwargs['endts'] = self.__endts
        kwargs['startts'] = self.convert_ts(kwargs.get('startts', None))
        self.__get_kwargs = kwargs

    def reset(self):
        """
        Resets the scanner state variables in order to start scanning the collection again
        """
        self.__scanned_count = 0
        self.__totalcount = 0
        self.lastkey = None
        self.__startafter = None
        self.__secondary_is_empty = defaultdict(bool)
        self.__enabled = True

    def get_secondary_data(self, start, meta):
        secondary_data = defaultdict(dict)
        last = None
        max_next_records = self._get_max_next_records(self.__batchsize)
        for col in self.secondary:
            if not self.__secondary_is_empty[col.colname]:
                count = 0
                try:
                    for r in col.get(count=[max_next_records],
                                     start=start,
                                     meta=meta):
                        count += 1
                        last = key = r.pop('_key')
                        ts = r.pop('_ts')
                        secondary_data[key].update(r)
                        if '_ts' not in secondary_data[key] or ts > secondary_data[key]['_ts']:
                            secondary_data[key]['_ts'] = ts
                except KeyError:
                    pass
                if count < max_next_records:
                    self.__secondary_is_empty[col.colname] = True
                    log.info('Secondary collection %s is depleted',
                             col.colname)
        return last, dict(secondary_data)

    def convert_ts(self, timestamp):
        """
        Read a timestamp in diverse formats and return epoch milliseconds
        """
        if isinstance(timestamp, (list, tuple)):
            timestamp = timestamp[0]
        if isinstance(timestamp, str):
            timestamp = self.str_to_msecs(timestamp)
        return timestamp

    def get_new_batch(self, random_mode=False):
        """
        Convenient way for scanning a collection in batches
        """
        kwargs = self.__get_kwargs.copy()
        original_meta = kwargs.pop('meta', [])
        meta = {'_key', '_ts'}.union(original_meta)
        last_secondary_key = None
        batchcount = self.__batchsize
        max_next_records = self._get_max_next_records(batchcount)
        # start is used only once, as HS nullifies startafter if start is given
        start = self.__start
        self.__start = ''

        while max_next_records and self.__enabled:
            count = 0
            jump_prefix = False
            for r in self.col.get(random_mode,
                                  count=[max_next_records],
                                  startafter=[self.__startafter],
                                  start=start,
                                  meta=meta,
                                  **kwargs):
                if self.__stopbefore is not None and r['_key'].startswith(
                        self.__stopbefore):
                    self.__enabled = False
                    break
                count += 1
                for exclude in self.__exclude_prefixes:
                    if r['_key'].startswith(exclude):
                        self.__startafter = exclude + LIMIT_KEY_CHAR
                        jump_prefix = True
                        break
                if jump_prefix:
                    break
                self.__startafter = self.lastkey = r['_key']
                if last_secondary_key is None or self.__startafter > last_secondary_key:
                    last_secondary_key, secondary_data = self.get_secondary_data(
                        start=self.__startafter, meta=meta)
                srecord = secondary_data.pop(r['_key'], None)
                if srecord is not None:
                    ts = srecord['_ts']
                    r.update(srecord)
                    if ts > r['_ts']:
                        r['_ts'] = ts

                if self.__endts and r['_ts'] > self.__endts:
                    continue

                for m in ['_key', '_ts']:
                    if m not in original_meta:
                        r.pop(m)

                self.__scanned_count += 1
                batchcount -= 1
                if self.__scanned_count % 10000 == 0:
                    log.info("Last key: %s, Scanned %d", self.lastkey,
                             self.__scanned_count)
                yield r
            self.__enabled = count >= max_next_records and (
                not self.__totalcount
                or self.__scanned_count < self.__totalcount) or jump_prefix
            max_next_records = self._get_max_next_records(batchcount)

    def _get_max_next_records(self, batchcount):
        max_next_records = min(self.__max_next_records, batchcount)
        if self.__totalcount:
            max_next_records = min(max_next_records,
                                   self.__totalcount - self.__scanned_count)
        return max_next_records

    def scan_collection_batches(self):
        while self.__enabled:
            batch = list(self.get_new_batch())
            if batch:
                yield batch

    def close(self):
        log.info("Total scanned: %d", self.__scanned_count)
        self.hsc.close()

    def set_startafter(self, startafter):
        self.__startafter = startafter

    @staticmethod
    def str_to_msecs(strtime):
        """
        Converts from any format supported by dateparser to epoch milliseconds,
        which is the time representation used by hubstorage
        """
        if isinstance(strtime, int):
            return strtime
        if isinstance(strtime, str):
            d = dateparser.parse(strtime)
            return int(time.mktime(d.timetuple()) - time.timezone) * 1000
        return 0

    @property
    def scanned_count(self):
        return self.__scanned_count

    @property
    def is_enabled(self):
        return self.__enabled
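
A minimal usage sketch of the scanner, assuming a valid apikey and an existing collection (the collection name, project id and apikey below are placeholders):

# Usage sketch: collection name, project id and apikey are placeholders.
scanner = CollectionScanner('my_collection', project_id='123456',
                            apikey='YOUR_API_KEY', batchsize=500,
                            meta=['_key', '_ts'])
for batch in scanner.scan_collection_batches():
    for record in batch:
        print(record['_key'], record['_ts'])
scanner.close()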