from scrapinghub import ScrapinghubClient

# resolve_project_key() is defined elsewhere in this package.


class SHConnection:
    '''
    Wrapper for the scrapinghub client, project and API calls to simplify use.
    '''

    def __init__(self, api_key, default_project_key=None):
        self.api_key = api_key
        self.project_key = resolve_project_key(
            default_project_key=default_project_key
        )

    def __enter__(self):
        self.client = ScrapinghubClient(self.api_key)
        self.project = self.client.get_project(self.project_key)
        return self

    def __exit__(self, *args):
        self.client.close()

    def jobs_iter(self, **kwargs):
        return self.project.jobs.iter(**kwargs)

    def get_job(self, job_id):
        return self.client.get_job(job_id)
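
# A minimal usage sketch, not part of the class above. The API key and
# project key below are placeholders, and it assumes resolve_project_key()
# falls back to the default_project_key passed through SHConnection.
if __name__ == '__main__':
    with SHConnection('<APIKEY>', default_project_key='12345') as conn:
        # Iterate over up to five finished jobs in the project.
        for job_summary in conn.jobs_iter(state='finished', count=5):
            print(job_summary['key'])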
import logging
import time
from collections import defaultdict

import requests as requests_lib
from scrapinghub import ScrapinghubClient

LOG = logging.getLogger(__name__)


class HCFManager(object):
    """
    Helper around the Hubstorage Crawl Frontier (HCF): tracks link counts
    per slot, flushes in batches, and retries failed reads and deletes.
    """

    def __init__(self, auth, project_id, frontier, batch_size=0):
        self._client = ScrapinghubClient(auth=auth)
        self._hcf = self._client.get_project(project_id).frontiers
        self._frontier = self._hcf.get(frontier)
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._batch_size = batch_size
        self._hcf_retries = 10

    def add_request(self, slot, request):
        self._frontier.get(slot).q.add([request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        if self._batch_size and \
                self._links_to_flush_count[slot] >= self._batch_size:
            return self.flush(slot)
        return 0

    def flush(self, slot=None):
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                for flushed_slot in self._links_to_flush_count:
                    self._links_to_flush_count[flushed_slot] = 0
                LOG.info('Flushed %d link(s).', n_links_to_flush)
            else:
                slot_obj = self._frontier.get(slot)
                slot_obj.flush()
                self._links_to_flush_count[slot] = 0
        return n_links_to_flush

    def read(self, slot, mincount=None):
        slot_obj = self._frontier.get(slot)
        for i in range(self._hcf_retries):
            try:
                return slot_obj.q.iter(mincount=mincount)
            except requests_lib.exceptions.ReadTimeout:
                LOG.error("Could not read from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.ConnectionError:
                LOG.error("Connection error while reading from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.RequestException:
                LOG.error("Error while reading from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            time.sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        slot_obj = self._frontier.get(slot)
        for i in range(self._hcf_retries):
            try:
                slot_obj.q.delete(ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                LOG.error("Could not delete ids from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.ConnectionError:
                LOG.error("Connection error while deleting ids from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            except requests_lib.exceptions.RequestException:
                LOG.error("Error deleting ids from %s/%s try %d/%d",
                          self._frontier.key, slot, i + 1, self._hcf_retries)
            time.sleep(60 * (i + 1))

    def delete_slot(self, slot):
        slot_obj = self._frontier.get(slot)
        slot_obj.delete()

    def close(self):
        self._hcf.close()
        self._client.close()

    def get_number_of_links(self, slot=None):
        if slot is None:
            return sum(self._links_count.values())
        return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        if slot is None:
            return sum(self._links_to_flush_count.values())
        return self._links_to_flush_count[slot]
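
# A minimal usage sketch, with placeholder auth token, project id, frontier
# and slot names. It assumes the HCF fingerprint request format ({'fp': ...})
# and that read() yields batch dicts with 'id' and 'requests' keys, as the
# frontier queue API returns them.
if __name__ == '__main__':
    manager = HCFManager('<APIKEY>', '12345', 'test-frontier', batch_size=100)
    manager.add_request('example.com', {'fp': '/page-1.html'})
    manager.flush()  # push any buffered links for all slots
    for batch in manager.read('example.com'):
        print(batch['id'], len(batch['requests']))
        manager.delete('example.com', [batch['id']])
    manager.close()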
import logging
import time
from collections import defaultdict

import dateparser
from scrapinghub import ScrapinghubClient

log = logging.getLogger(__name__)

# DEFAULT_BATCHSIZE, LIMIT_KEY_CHAR, get_project_id, get_num_partitions,
# filter_collections_exist and _CachedBlocksCollection are defined elsewhere
# in this package.


class CollectionScanner(object):
    """
    Base class for all collection scanners
    """
    # A list of names of complementary collections that share the same keys
    # as the principal collection; their data is merged into the output.
    # For optimization purposes it is assumed that secondary collections do
    # not have keys that are absent from the principal. That is, the key set
    # of each secondary collection is always a subset of the principal's.
    # TODO: this logic does not work with startts
    secondary_collections = []

    def __init__(self, collection_name, project_id=None, apikey=None,
                 batchsize=DEFAULT_BATCHSIZE, count=0, max_next_records=1000,
                 startafter=None, stopbefore=None, exclude_prefixes=None,
                 secondary_collections=None, autodetect_partitions=True,
                 **kwargs):
        """
        collection_name - target collection
        project_id - target project id. If None, autodetect from the
                SHUB_JOBKEY environment variable.
        apikey - hubstorage apikey with access to the given project. If None,
                read from the SH_APIKEY environment variable (delegated to
                the scrapinghub library).
        batchsize - size of each batch, in number of records
        count - total count of records to retrieve
        max_next_records - how many records to fetch on each call to the
                hubstorage server
        startafter - start to scan after the given hs key prefix
        stopbefore - stop once the given hs key prefix is found
        exclude_prefixes - a list of key prefixes to exclude from scanning
        secondary_collections - a list of secondary collections that extends
                the class default.
        autodetect_partitions - if True (the default), autodetect partitioned
                collections. Pass False to force reading a non-partitioned
                collection when a partitioned version also exists under the
                same name.
        **kwargs - extra arguments passed on to the hubstorage collection,
                e.g.:
            - prefix (list of key prefixes to include in the scan)
            - startts and endts, either in epoch millisecs (as accepted by
              hubstorage) or a date string (support is added here)
            - meta (a list with either '_ts' and/or '_key')
            etc. (see hubstorage documentation)
        """
        self.hsc = ScrapinghubClient(apikey)._hsclient
        project_id = project_id or get_project_id()
        self.hsp = self.hsc.get_project(project_id)
        num_partitions = None
        if autodetect_partitions:
            num_partitions = get_num_partitions(self.hsp, collection_name)
            if num_partitions:
                log.info("Partitioned collection detected: %d total partitions.",
                         num_partitions)
        self.col = _CachedBlocksCollection(self.hsp, collection_name,
                                           num_partitions)
        self.__scanned_count = 0
        self.__totalcount = count
        self.lastkey = None
        self.__startafter = startafter
        self.__stopbefore = stopbefore
        self.__exclude_prefixes = exclude_prefixes or []
        # Copy the class attribute before extending, so instances do not
        # mutate the shared class-level list.
        self.secondary_collections = list(self.secondary_collections)
        self.secondary_collections.extend(secondary_collections or [])
        self.secondary = [
            _CachedBlocksCollection(self.hsp, name)
            for name in filter_collections_exist(self.hsp,
                                                 self.secondary_collections)
        ]
        self.__secondary_is_empty = defaultdict(bool)
        self.__batchsize = batchsize
        self.__max_next_records = max_next_records
        self.__enabled = True
        self.__start = kwargs.pop('start', '')
        kwargs = kwargs.copy()
        self.__endts = self.convert_ts(kwargs.get('endts', None))
        kwargs['endts'] = self.__endts
        kwargs['startts'] = self.convert_ts(kwargs.get('startts', None))
        self.__get_kwargs = kwargs

    def reset(self):
        """
        Resets the scanner state variables in order to start scanning the
        collection again
        """
        self.__scanned_count = 0
        self.__totalcount = 0
        self.lastkey = None
        self.__startafter = None
        self.__secondary_is_empty = defaultdict(bool)
        self.__enabled = True
    def get_secondary_data(self, start, meta):
        secondary_data = defaultdict(dict)
        last = None
        max_next_records = self._get_max_next_records(self.__batchsize)
        for col in self.secondary:
            if not self.__secondary_is_empty[col.colname]:
                count = 0
                try:
                    for r in col.get(count=[max_next_records], start=start,
                                     meta=meta):
                        count += 1
                        last = key = r.pop('_key')
                        ts = r.pop('_ts')
                        secondary_data[key].update(r)
                        # Keep the newest _ts seen for this key.
                        if ('_ts' not in secondary_data[key]
                                or ts > secondary_data[key]['_ts']):
                            secondary_data[key]['_ts'] = ts
                except KeyError:
                    pass
                if count < max_next_records:
                    self.__secondary_is_empty[col.colname] = True
                    log.info('Secondary collection %s is depleted',
                             col.colname)
        return last, dict(secondary_data)

    def convert_ts(self, timestamp):
        """
        Read a timestamp in diverse formats and return millisecs epoch
        """
        if isinstance(timestamp, (list, tuple)):
            timestamp = timestamp[0]
        if isinstance(timestamp, str):
            timestamp = self.str_to_msecs(timestamp)
        return timestamp

    def get_new_batch(self, random_mode=False):
        """
        Convenience method for scanning a collection in batches
        """
        kwargs = self.__get_kwargs.copy()
        original_meta = kwargs.pop('meta', [])
        meta = {'_key', '_ts'}.union(original_meta)
        last_secondary_key = None
        batchcount = self.__batchsize
        max_next_records = self._get_max_next_records(batchcount)
        # start is used only once, as HS nullifies startafter if start is
        # given
        start = self.__start
        self.__start = ''
        while max_next_records and self.__enabled:
            count = 0
            jump_prefix = False
            for r in self.col.get(random_mode, count=[max_next_records],
                                  startafter=[self.__startafter], start=start,
                                  meta=meta, **kwargs):
                if self.__stopbefore is not None and \
                        r['_key'].startswith(self.__stopbefore):
                    self.__enabled = False
                    break
                count += 1
                for exclude in self.__exclude_prefixes:
                    if r['_key'].startswith(exclude):
                        self.__startafter = exclude + LIMIT_KEY_CHAR
                        jump_prefix = True
                        break
                if jump_prefix:
                    break
                self.__startafter = self.lastkey = r['_key']
                if (last_secondary_key is None
                        or self.__startafter > last_secondary_key):
                    last_secondary_key, secondary_data = \
                        self.get_secondary_data(start=self.__startafter,
                                                meta=meta)
                srecord = secondary_data.pop(r['_key'], None)
                if srecord is not None:
                    # Merge the secondary record and keep the newest _ts
                    # between the principal and the secondary record.
                    ts = r['_ts']
                    r.update(srecord)
                    if ts > r['_ts']:
                        r['_ts'] = ts
                if self.__endts and r['_ts'] > self.__endts:
                    continue
                for m in ['_key', '_ts']:
                    if m not in original_meta:
                        r.pop(m)
                self.__scanned_count += 1
                batchcount -= 1
                if self.__scanned_count % 10000 == 0:
                    log.info("Last key: %s, Scanned %d", self.lastkey,
                             self.__scanned_count)
                yield r
            self.__enabled = (count >= max_next_records and
                              (not self.__totalcount or
                               self.__scanned_count < self.__totalcount)
                              ) or jump_prefix
            max_next_records = self._get_max_next_records(batchcount)

    def _get_max_next_records(self, batchcount):
        max_next_records = min(self.__max_next_records, batchcount)
        if self.__totalcount:
            max_next_records = min(max_next_records,
                                   self.__totalcount - self.__scanned_count)
        return max_next_records

    def scan_collection_batches(self):
        while self.__enabled:
            batch = list(self.get_new_batch())
            if batch:
                yield batch

    def close(self):
        log.info("Total scanned: %d", self.__scanned_count)
        self.hsc.close()

    def set_startafter(self, startafter):
        self.__startafter = startafter

    @staticmethod
    def str_to_msecs(strtime):
        """
        Converts any format supported by dateparser to epoch millisecs,
        which is the time representation used by hubstorage
        """
        if isinstance(strtime, int):
            return strtime
        if isinstance(strtime, str):
            d = dateparser.parse(strtime)
            return int(time.mktime(d.timetuple()) - time.timezone) * 1000
        return 0

    @property
    def scanned_count(self):
        return self.__scanned_count
    @property
    def is_enabled(self):
        return self.__enabled
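
# A minimal usage sketch, with placeholder collection name and project id.
# It assumes SH_APIKEY is set in the environment (apikey=None delegates to
# the scrapinghub library, per the constructor docstring).
if __name__ == '__main__':
    scanner = CollectionScanner('my_collection', project_id='12345',
                                batchsize=1000, meta=['_key'])
    for batch in scanner.scan_collection_batches():
        for record in batch:
            print(record['_key'])
    scanner.close()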