Example #1
0
 def _do_transform(self, verbose=0):
     """Run the transform over chunks in parallel, writing to a new collection.

     All runtime parameters come from ``self._transform_options()``. The
     output collection is dropped first unless appending; the mdf is then
     cloned, chunked, and each chunk processed by a joblib worker.

     :param verbose: joblib verbosity level; 0 silences progress output
     :return: the output ``PickableCollection``
     """
     options = self._transform_options()
     n_jobs = options['n_jobs']
     chunksize = options['chunksize']
     applyfn = options['applyfn']
     chunkfn = options['chunkfn'] or self._chunker
     maxobs = options['maxobs']
     mdf = options['mdf']
     append = options['append']
     # workers resolve the mdf themselves when resolve is 'worker'/'w'
     worker_resolves_mdf = options['resolve'] in ('worker', 'w')
     outcoll = PickableCollection(
         mdf.collection.database[options['outname']])
     if not append:
         outcoll.drop()

     def non_transforming(frame):
         # clone so the remote worker receives a clean, serializable mdf
         return frame._clone()

     with Parallel(n_jobs=n_jobs, backend=options['backend'],
                   verbose=verbose) as p:
         # prepare chunks for serialization to the remote workers
         chunks = chunkfn(non_transforming(mdf), chunksize, maxobs)
         runner = delayed(pyapply_process_chunk)
         # submit all chunk jobs in parallel
         tasks = [runner(chunk_mdf, i, chunksize, applyfn, outcoll,
                         worker_resolves_mdf)
                  for i, chunk_mdf in enumerate(chunks)]
         # let the backend report an accurate total -- TODO confirm backend API
         p._backend._job_count = len(tasks)
         if verbose:
             print("Submitting {} tasks".format(len(tasks)))
         p(tasks)
     return outcoll
Example #2
0
 def __init__(self, collection, query=None, projection=None, **kwargs):
     """Wrap a collection with a permanent query and optional projection.

     If *collection* is itself a FilteredCollection, its query and
     projection are inherited (unless explicitly given) and the base
     collection is unwrapped, so FilteredCollections never cascade.
     """
     if not isinstance(collection, FilteredCollection):
         query = query or {}
     else:
         # inherit filter state instead of nesting FilteredCollections
         query = query or collection._fixed_query
         projection = projection or collection.projection
         collection = ensure_base_collection(collection)
     self._fixed_query = query
     self.projection = projection
     self.collection = PickableCollection(collection)
Example #3
0
 def __init__(self, collection, query=None, projection=None, **kwargs):
     """Hold a pickable handle to the base collection plus a fixed query.

     Unwraps wrapper objects (each exposing ``.collection``) until an
     actual pymongo ``Collection`` is reached.
     """
     # drill down through wrappers until we hold a real Collection
     while not isinstance(collection, Collection):
         collection = collection.collection
     self._fixed_query = query or {}
     self.projection = projection
     self.collection = PickableCollection(collection)
Example #4
0
 def __init__(self,
              collection,
              columns=None,
              query=None,
              limit=None,
              skip=None,
              sort_order=None,
              force_columns=None,
              immediate_loc=False,
              auto_inspect=False,
              preparefn=None,
              **kwargs):
     """Initialize the frame over a MongoDB collection.

     :param collection: the backing collection
     :param columns: columns in the frame; defaults to the stored fields
     :param query: permanent filter criteria, a dict or a Filter
     :param limit: top n documents to fetch
     :param skip: top n documents to skip before returning
     :param sort_order: columns to sort by; None means unsorted
     :param force_columns: columns to add on output even if not present
     :param immediate_loc: if True, .loc and .iloc always evaluate
     :param auto_inspect: if True, capture inspects on .value
     :param preparefn: applied just before returning from .value
     :raises ValueError: if query is neither a dict nor a Filter
     """
     self.collection = PickableCollection(collection)
     # resolve the frame's columns, falling back to the stored fields
     selected = make_tuple(columns) if columns else self._get_fields()
     self.columns = [str(col) for col in selected]
     self.sort_order = sort_order
     self.head_limit = limit
     self.skip_topn = skip
     self.filter_criteria = query or {}
     self.force_columns = force_columns or []
     # was this created from the loc indexer?
     self.from_loc_indexer = kwargs.get('from_loc_indexer', False)
     # set when the loc index used a range rather than a single value
     self.from_loc_range = None
     # apply the filter criteria, if provided, as a permanent query
     if self.filter_criteria:
         if isinstance(self.filter_criteria, dict):
             self.query_inplace(**self.filter_criteria)
         elif isinstance(self.filter_criteria, Filter):
             self.query_inplace(self.filter_criteria)
         else:
             raise ValueError(
                 'Invalid query specification of type {}'.format(
                     type(self.filter_criteria)))
     self.immediate_loc = immediate_loc
     # __array__ returns this cached value once set
     self._evaluated = None
     # inspects captured on .value; retrieve via .inspect(cached=True)
     self.auto_inspect = auto_inspect
     self._inspect_cache = INSPECT_CACHE
     # apply mixins registered for this class
     self._applyto = str(self.__class__)
     self._apply_mixins()
     self._preparefn = preparefn
Example #5
0
def fast_insert(df, omstore, name, chunksize=default_chunksize):
    """
    fast insert of dataframe to mongodb

    Depending on size use single-process or multiprocessing. Typically
    multiprocessing is faster on datasets with > 10'000 data elements
    (rows x columns). Note this may max out your CPU and may use
    processor count * chunksize of additional memory. The chunksize is
    set to 10'000. The processor count is the default used by multiprocessing,
    typically the number of CPUs reported by the operating system.

    :param df: dataframe
    :param omstore: the OmegaStore to use. will be used to get the mongo_url
    :param name: the dataset name in OmegaStore to use. will be used to get the
    collection name from the omstore
    """
    # pool-based chunked insert benchmarked fastest against the alternatives:
    # #records  pool   thread/wo copy  thread/w copy  pool w=0  pool dict  no chunking
    # 0.1m       1.47   2.06            2.17           1.59      2.11       2.28
    # 1m        17.4   19.8            20.6           16        17.8       22.2
    # 10m      149    193             183            177       213        256
    # (measured with df = pd.DataFrame({'x': range(rows)}) and
    #  om.datasets.put(df, 'test', replace=True); "no chunking" is chunksize=False,
    #  "pool w=0" disables the mongo write concern, "pool dict" passes json docs)
    small_enough = not chunksize or len(df) * len(df.columns) <= chunksize
    if small_enough:
        # still within bounds for single threaded inserts
        omstore.collection(name).insert_many(df.to_dict(orient='records'))
        return
    # we crossed upper limits of single threaded processing, use a Pool;
    # at least 2 processes for parallelism, at most half of available cores
    collection = PickableCollection(omstore.collection(name))
    n_jobs = max(2, math.ceil(os.cpu_count() / 2))
    jobs = zip(dfchunker(df, size=chunksize), repeat(collection),
               repeat(id(collection)))
    with Parallel(n_jobs=n_jobs, backend='omegaml', verbose=False) as p:
        insert = delayed(insert_chunk)
        # approximate total, used for progress reporting only
        p._job_count = int(len(df) / chunksize)
        p(insert(job) for job in jobs)
Example #6
0
class FilteredCollection:
    """
    A permanently filtered collection

    Supports all methods as a Collection does, however any filter or query
    argument is permanently set at instantiation

        fcoll = FilteredCollection(collection, query={ expression })

    Any subsequent operation will automatically apply the query expression.

    Note that v.v. a Collection and all methods that accept a filter as their first
    argument have a changed signature - the filter argument is optional
    with all FilteredCollection methods, as the filter is set at instantiation.

        Example:

            # in pymongo

            filter = {expression}
            coll.find_one_and_replace(filter, replace)

            # FilteredCollection

            coll = FilteredCollection(query={expression})
            coll.find_one_and_replace(replace, filter=None)

    This is so that calls to a FilteredCollection feel more natural, as opposed
    to specifying an empty filter argument on every call. Still, an additional
    filter can be specified on every method that accepts the filter= optional
    argument:

            # temporarily add another filter

            coll.find_one_and_replace(replace, filter={expression})

    Here expression will only apply to this particular method call. The
    global filter set by query= is unchanged.

    If no expression is given, the empty expression {} is assumed. To change
    the expression, set fcoll._fixed_query = { expression }
    """

    def __init__(self, collection, query=None, projection=None, **kwargs):
        """Wrap *collection* with a permanent *query* and *projection*.

        If *collection* is itself a FilteredCollection, its query and
        projection are inherited (unless given) and the base collection is
        unwrapped so FilteredCollections never cascade.
        """
        if isinstance(collection, FilteredCollection):
            # avoid cascading of FilteredCollections
            query = query or collection._fixed_query
            projection = projection or collection.projection
            collection = ensure_base_collection(collection)
        else:
            query = query or {}
        self._fixed_query = query
        self.projection = projection
        self.collection = PickableCollection(collection)

    def _merged_filter(self, filter=None):
        # combine the permanent query with a per-call filter;
        # per-call keys take precedence on conflicts
        query = dict(self.query)
        query.update(filter or {})
        return query

    @property
    def _Collection__database(self):
        # expose pymongo Collection's name-mangled private attribute so code
        # reaching into Collection internals keeps working -- TODO confirm callers
        return self.collection.database

    @property
    def name(self):
        # name of the underlying collection
        return self.collection.name

    @property
    def database(self):
        # database of the underlying collection
        return self.collection.database

    @property
    def query(self):
        # the permanent query, resolved through Filter on every access
        return Filter(self.collection, **self._fixed_query).query

    def aggregate(self, pipeline, filter=None, **kwargs):
        """Aggregate with the permanent query prepended as a $match stage."""
        # build a new pipeline instead of mutating the caller's list
        # (previously pipeline.insert(0, ...) modified the argument in place)
        pipeline = [qops.MATCH(self._merged_filter(filter))] + list(pipeline)
        # allow disk use by default, but honor an explicit caller setting
        # (previously a caller's allowDiskUse=False was silently overridden)
        kwargs.setdefault('allowDiskUse', True)
        return self.collection.aggregate(pipeline, **kwargs)

    def find(self, filter=None, **kwargs):
        """Find documents matching the permanent query plus *filter*."""
        return self.collection.find(filter=self._merged_filter(filter),
                                    **kwargs)

    def find_one(self, filter=None, *args, **kwargs):
        """Find one document matching the permanent query plus *filter*."""
        return self.collection.find_one(self._merged_filter(filter),
                                        *args, **kwargs)

    def find_one_and_delete(self, filter=None, **kwargs):
        """Find and delete one matching document."""
        return self.collection.find_one_and_delete(
            self._merged_filter(filter), **kwargs)

    def find_one_and_replace(self, replacement, filter=None, **kwargs):
        """Find one matching document and replace it with *replacement*."""
        return self.collection.find_one_and_replace(
            self._merged_filter(filter), replacement, **kwargs)

    def find_one_and_update(self, update, filter=None, **kwargs):
        """Find one matching document and apply *update* to it."""
        return self.collection.find_one_and_update(
            self._merged_filter(filter), update, **kwargs)

    def estimated_document_count(self, **kwargs):
        """Estimate the total count; note the permanent query is NOT applied."""
        return self.collection.estimated_document_count(**kwargs)

    def count_documents(self, filter=None, **kwargs):
        """Count documents matching the permanent query plus *filter*."""
        return self.collection.count_documents(self._merged_filter(filter),
                                               **kwargs)

    def distinct(self, key, filter=None, **kwargs):
        """Distinct values of *key* among matching documents."""
        return self.collection.distinct(key, filter=self._merged_filter(filter),
                                        **kwargs)

    def create_index(self, keys, **kwargs):
        """Create an index on the underlying collection."""
        return self.collection.create_index(keys, **kwargs)

    def list_indexes(self, **kwargs):
        """List indexes of the underlying collection."""
        return self.collection.list_indexes(**kwargs)

    def insert(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection"
        )

    def update(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection"
        )

    def remove(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection"
        )

    def find_and_modify(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection"
        )

    def ensure_index(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection"
        )

    def save(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection"
        )