def _do_transform(self, verbose=0):
    # setup mdf and parameters
    opts = self._transform_options()
    n_jobs = opts['n_jobs']
    chunksize = opts['chunksize']
    applyfn = opts['applyfn']
    chunkfn = opts['chunkfn'] or self._chunker
    maxobs = opts['maxobs']
    mdf = opts['mdf']
    outname = opts['outname']
    append = opts['append']
    resolve = opts['resolve']
    backend = opts['backend']
    # all workers write to the same output collection
    outcoll = PickableCollection(mdf.collection.database[outname])
    if not append:
        outcoll.drop()
    # work on a clone of the frame for chunking
    non_transforming = lambda mdf: mdf._clone()
    with Parallel(n_jobs=n_jobs, backend=backend, verbose=verbose) as p:
        # prepare for serialization to remote worker
        chunks = chunkfn(non_transforming(mdf), chunksize, maxobs)
        runner = delayed(pyapply_process_chunk)
        worker_resolves_mdf = resolve in ('worker', 'w')
        # run in parallel, one task per chunk
        jobs = [runner(mdf, i, chunksize, applyfn, outcoll, worker_resolves_mdf)
                for i, mdf in enumerate(chunks)]
        p._backend._job_count = len(jobs)
        if verbose:
            print("Submitting {} tasks".format(len(jobs)))
        p(jobs)
    return outcoll
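
# `_chunker` and `pyapply_process_chunk` are defined elsewhere; the contract
# visible above is that chunkfn(mdf, chunksize, maxobs) yields one windowed
# frame per parallel task. A minimal sketch of a compatible chunk function,
# assuming a hypothetical _clone(skip=..., limit=...) signature for
# re-windowing the frame (not confirmed by this excerpt):
def simple_chunker(mdf, chunksize, maxobs):
    # cap the number of rows to process at maxobs, if given
    nobs = min(len(mdf), maxobs) if maxobs else len(mdf)
    for start in range(0, nobs, chunksize):
        # assumes _clone() accepts skip/limit overrides (an assumption)
        yield mdf._clone(skip=start, limit=min(chunksize, nobs - start))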
def __init__(self, collection, query=None, projection=None, **kwargs):
    # unwrap any wrapping collection (e.g. a FilteredCollection) until
    # we hold an actual pymongo Collection, then make it pickable
    is_real_collection = isinstance(collection, Collection)
    while not is_real_collection:
        collection = collection.collection
        is_real_collection = isinstance(collection, Collection)
    collection = PickableCollection(collection)
    query = query or {}
    self._fixed_query = query
    self.projection = projection
    self.collection = collection
def __init__(self, collection, columns=None, query=None, limit=None,
             skip=None, sort_order=None, force_columns=None,
             immediate_loc=False, auto_inspect=False, preparefn=None,
             **kwargs):
    self.collection = PickableCollection(collection)
    # columns in frame
    self.columns = make_tuple(columns) if columns else self._get_fields()
    self.columns = [str(col) for col in self.columns]
    # columns to sort by, defaults to not sorted
    self.sort_order = sort_order
    # top n documents to fetch
    self.head_limit = limit
    # top n documents to skip before returning
    self.skip_topn = skip
    # filter criteria
    self.filter_criteria = query or {}
    # force columns -- on output add columns not present
    self.force_columns = force_columns or []
    # was this created from the loc indexer?
    self.from_loc_indexer = kwargs.get('from_loc_indexer', False)
    # was the loc index used a range? else a single value
    self.from_loc_range = None
    # set up the query for the filter criteria, if provided
    if self.filter_criteria:
        # make sure we have a filtered collection with the criteria given
        if isinstance(self.filter_criteria, dict):
            self.query_inplace(**self.filter_criteria)
        elif isinstance(self.filter_criteria, Filter):
            self.query_inplace(self.filter_criteria)
        else:
            raise ValueError(
                'Invalid query specification of type {}'.format(
                    type(self.filter_criteria)))
    # if immediate_loc is True, .loc and .iloc always evaluate
    self.immediate_loc = immediate_loc
    # __array__ returns this value if set, otherwise computes and caches it
    self._evaluated = None
    # set True to automatically capture inspects on .value;
    # retrieve using .inspect(cached=True)
    self.auto_inspect = auto_inspect
    self._inspect_cache = INSPECT_CACHE
    # apply mixins
    self._applyto = str(self.__class__)
    self._apply_mixins()
    # prepare function applied just before returning from .value
    self._preparefn = preparefn
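
# The constructor above reads like omegaml's MDataFrame; a minimal usage
# sketch, assuming the class is named MDataFrame and `collection` is a plain
# pymongo collection (both assumptions, not confirmed by this excerpt):
#
#   from pymongo import MongoClient
#
#   collection = MongoClient()['mydb']['sales']   # hypothetical db/collection
#   mdf = MDataFrame(collection,
#                    columns=['region', 'amount'],  # restrict frame to these fields
#                    query={'year': 2023},          # stored as filter criteria
#                    sort_order='amount',           # sort key, defaults to unsorted
#                    limit=1000,                    # top-n documents to fetch
#                    skip=10)                       # documents to skip first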
def fast_insert(df, omstore, name, chunksize=default_chunksize):
    """ fast insert of a dataframe to mongodb

    Depending on size, uses single-process or multiprocessing inserts.
    Typically multiprocessing is faster on datasets with > 10'000 data
    elements (rows x columns). Note this may max out your CPU and may use
    processor count * chunksize of additional memory. The chunksize is set
    to 10'000. The processor count is the default used by multiprocessing,
    typically the number of CPUs reported by the operating system.

    :param df: the dataframe
    :param omstore: the OmegaStore to use. will be used to get the mongo_url
    :param name: the dataset name in OmegaStore to use. will be used to get
       the collection name from the omstore
    """
    # this is the fastest implementation (pool)
    #
    # #records   pool   thread/wo copy   thread/w copy   pool w=0   pool dict   no chunking
    # 0.1m       1.47   2.06             2.17            1.59       2.11        2.28
    # 1m         17.4   19.8             20.6            16         17.8        22.2
    # 10m        149    193              183             177        213         256
    #
    # based on
    #   df = pd.DataFrame({'x': range(rows)})
    #   om.datasets.put(df, 'test', replace=True)
    #
    # - pool            mp Pool, passes copy of df chunks, to_dict in pool processes
    # - thread/wo copy  uses ThreadPool, shared memory on df
    # - thread/w copy   uses ThreadPool, copy of chunks
    # - pool w=0        disables the mongo write concern
    # - pool dict       performs to_dict on chunking, passes list of json docs,
    #                   pools just insert
    # - no chunking     sets chunksize=False
    if chunksize and len(df) * len(df.columns) > chunksize:
        # we crossed the upper limit of single-threaded processing, use a Pool
        # use the cached pool
        # use at least 2 processes for parallelism, at most half of available cores
        collection = PickableCollection(omstore.collection(name))
        n_jobs = max(2, math.ceil(os.cpu_count() / 2))
        jobs = zip(dfchunker(df, size=chunksize),
                   repeat(collection),
                   repeat(id(collection)))
        approx_jobs = int(len(df) / chunksize)
        # we use multiprocessing backend because
        with Parallel(n_jobs=n_jobs, backend='omegaml', verbose=False) as p:
            runner = delayed(insert_chunk)
            p_jobs = (runner(job) for job in jobs)
            p._job_count = approx_jobs
            p(p_jobs)
    else:
        # still within bounds for single threaded inserts
        omstore.collection(name).insert_many(df.to_dict(orient='records'))
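
# Usage sketch based on the docstring's own example; assumes `om` is an
# initialized omegaml instance (e.g. `import omegaml as om`) so that
# om.datasets is an OmegaStore (an assumption, not shown in this excerpt):
#
#   import pandas as pd
#   import omegaml as om
#
#   # ~100k rows x 1 column crosses the 10'000 element threshold, so this
#   # takes the pooled multiprocessing path
#   df = pd.DataFrame({'x': range(100_000)})
#   fast_insert(df, om.datasets, 'test')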
class FilteredCollection:
    """ A permanently filtered collection

    Supports all methods a Collection does, however any filter or query
    argument is permanently set at instantiation::

        fcoll = FilteredCollection(collection, query={expression})

    Any subsequent operation will automatically apply the query expression.

    Note that, compared to a plain Collection, all methods that accept a
    filter as their first argument have a changed signature - the filter
    argument is optional with all FilteredCollection methods, as the
    filter is already set at instantiation.

    Example::

        # in pymongo
        filter = {expression}
        coll.find_one_and_replace(filter, replace)

        # FilteredCollection
        coll = FilteredCollection(query={expression})
        coll.find_one_and_replace(replace, filter=None)

    This is so that calls to a FilteredCollection feel more natural, as
    opposed to specifying an empty filter argument on every call. Still,
    an additional filter can be specified on every method that accepts
    the filter= optional argument::

        # temporarily add another filter
        coll.find_one_and_replace(replace, filter={expression})

    Here expression will only apply to this particular method call. The
    global filter set by query= is unchanged. If no expression is given,
    the empty expression {} is assumed. To change the global expression,
    set::

        fcoll.query = {expression}
    """

    def __init__(self, collection, query=None, projection=None, **kwargs):
        if isinstance(collection, FilteredCollection):
            # avoid cascading of FilteredCollections
            query = query or collection._fixed_query
            projection = projection or collection.projection
            collection = ensure_base_collection(collection)
        else:
            query = query or {}
        self._fixed_query = query
        self.projection = projection
        self.collection = PickableCollection(collection)

    @property
    def _Collection__database(self):
        # support pymongo internals that access the name-mangled
        # Collection__database attribute
        return self.collection.database

    @property
    def name(self):
        return self.collection.name

    @property
    def database(self):
        return self.collection.database

    @property
    def query(self):
        return Filter(self.collection, **self._fixed_query).query

    def aggregate(self, pipeline, filter=None, **kwargs):
        # prepend a $match stage combining the fixed query and the ad-hoc filter
        query = dict(self.query)
        query.update(filter or {})
        pipeline.insert(0, qops.MATCH(query))
        kwargs.update(allowDiskUse=True)
        return self.collection.aggregate(pipeline, **kwargs)

    def find(self, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find(filter=query, **kwargs)

    def find_one(self, filter=None, *args, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one(query, *args, **kwargs)

    def find_one_and_delete(self, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one_and_delete(query, **kwargs)

    def find_one_and_replace(self, replacement, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one_and_replace(query, replacement, **kwargs)

    def find_one_and_update(self, update, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one_and_update(query, update, **kwargs)

    def estimated_document_count(self, **kwargs):
        return self.collection.estimated_document_count(**kwargs)

    def count_documents(self, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.count_documents(query, **kwargs)

    def distinct(self, key, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.distinct(key, filter=query, **kwargs)
    def create_index(self, keys, **kwargs):
        return self.collection.create_index(keys, **kwargs)

    def list_indexes(self, **kwargs):
        return self.collection.list_indexes(**kwargs)

    def insert(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def update(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def remove(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def find_and_modify(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def ensure_index(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def save(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")
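
# Usage sketch following the class docstring; the database and collection
# names are made up for illustration, and the query keyword form assumes
# the Filter helper accepts the specification shown:
#
#   from pymongo import MongoClient
#
#   coll = MongoClient()['mydb']['sales']
#   fcoll = FilteredCollection(coll, query={'year': 2023})
#   n = fcoll.count_documents()             # fixed query applied implicitly
#   docs = fcoll.find({'region': 'EMEA'})   # ad-hoc filter AND-ed for this call only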