def _do_transform(self, verbose=0):
    # setup mdf and parameters
    opts = self._transform_options()
    n_jobs = opts['n_jobs']
    chunksize = opts['chunksize']
    applyfn = opts['applyfn']
    chunkfn = opts['chunkfn'] or self._chunker
    maxobs = opts['maxobs']
    mdf = opts['mdf']
    outname = opts['outname']
    append = opts['append']
    resolve = opts['resolve']
    backend = opts['backend']
    # all workers write to the same output collection
    outcoll = PickableCollection(mdf.collection.database[outname])
    if not append:
        outcoll.drop()
    # work on a clone of the frame for chunking
    non_transforming = lambda mdf: mdf._clone()
    with Parallel(n_jobs=n_jobs, backend=backend, verbose=verbose) as p:
        # prepare for serialization to remote worker
        chunks = chunkfn(non_transforming(mdf), chunksize, maxobs)
        runner = delayed(pyapply_process_chunk)
        worker_resolves_mdf = resolve in ('worker', 'w')
        # run in parallel, one task per chunk
        jobs = [runner(mdf, i, chunksize, applyfn, outcoll, worker_resolves_mdf)
                for i, mdf in enumerate(chunks)]
        p._backend._job_count = len(jobs)
        if verbose:
            print("Submitting {} tasks".format(len(jobs)))
        p(jobs)
    return outcoll
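
# `_chunker` and `pyapply_process_chunk` are defined elsewhere; the contract
# visible above is that chunkfn(mdf, chunksize, maxobs) yields one windowed
# frame per parallel task. A minimal sketch of a compatible chunk function,
# assuming a hypothetical _clone(skip=..., limit=...) signature for
# re-windowing the frame (not confirmed by this excerpt):
def simple_chunker(mdf, chunksize, maxobs):
    # cap the number of rows to process at maxobs, if given
    nobs = min(len(mdf), maxobs) if maxobs else len(mdf)
    for start in range(0, nobs, chunksize):
        # assumes _clone() accepts skip/limit overrides (an assumption)
        yield mdf._clone(skip=start, limit=min(chunksize, nobs - start))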
def __init__(self, collection, query=None, projection=None, **kwargs):
    # unwrap any wrapping collection (e.g. a FilteredCollection) until
    # we hold an actual pymongo Collection, then make it pickable
    is_real_collection = isinstance(collection, Collection)
    while not is_real_collection:
        collection = collection.collection
        is_real_collection = isinstance(collection, Collection)
    collection = PickableCollection(collection)
    query = query or {}
    self._fixed_query = query
    self.projection = projection
    self.collection = collection
def __init__(self, collection, columns=None, query=None, limit=None,
             skip=None, sort_order=None, force_columns=None,
             immediate_loc=False, auto_inspect=False, preparefn=None,
             **kwargs):
    self.collection = PickableCollection(collection)
    # columns in frame
    self.columns = make_tuple(columns) if columns else self._get_fields()
    self.columns = [str(col) for col in self.columns]
    # columns to sort by, defaults to not sorted
    self.sort_order = sort_order
    # top n documents to fetch
    self.head_limit = limit
    # top n documents to skip before returning
    self.skip_topn = skip
    # filter criteria
    self.filter_criteria = query or {}
    # force columns -- on output add columns not present
    self.force_columns = force_columns or []
    # was this created from the loc indexer?
    self.from_loc_indexer = kwargs.get('from_loc_indexer', False)
    # was the loc index used a range? else a single value
    self.from_loc_range = None
    # set up the query for the filter criteria, if provided
    if self.filter_criteria:
        # make sure we have a filtered collection with the criteria given
        if isinstance(self.filter_criteria, dict):
            self.query_inplace(**self.filter_criteria)
        elif isinstance(self.filter_criteria, Filter):
            self.query_inplace(self.filter_criteria)
        else:
            raise ValueError(
                'Invalid query specification of type {}'.format(
                    type(self.filter_criteria)))
    # if immediate_loc is True, .loc and .iloc always evaluate
    self.immediate_loc = immediate_loc
    # __array__ returns this value if set, otherwise computes and caches it
    self._evaluated = None
    # set True to automatically capture inspects on .value;
    # retrieve using .inspect(cached=True)
    self.auto_inspect = auto_inspect
    self._inspect_cache = INSPECT_CACHE
    # apply mixins
    self._applyto = str(self.__class__)
    self._apply_mixins()
    # prepare function applied just before returning from .value
    self._preparefn = preparefn
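
# The constructor above reads like omegaml's MDataFrame; a minimal usage
# sketch, assuming the class is named MDataFrame and `collection` is a plain
# pymongo collection (both assumptions, not confirmed by this excerpt):
#
#   from pymongo import MongoClient
#
#   collection = MongoClient()['mydb']['sales']   # hypothetical db/collection
#   mdf = MDataFrame(collection,
#                    columns=['region', 'amount'],  # restrict frame to these fields
#                    query={'year': 2023},          # stored as filter criteria
#                    sort_order='amount',           # sort key, defaults to unsorted
#                    limit=1000,                    # top-n documents to fetch
#                    skip=10)                       # documents to skip first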
def fast_insert(df, omstore, name, chunksize=default_chunksize):
    """ fast insert of a dataframe to mongodb

    Depending on size, uses single-process or multiprocessing inserts.
    Typically multiprocessing is faster on datasets with > 10'000 data
    elements (rows x columns). Note this may max out your CPU and may use
    processor count * chunksize of additional memory. The chunksize is set
    to 10'000. The processor count is the default used by multiprocessing,
    typically the number of CPUs reported by the operating system.

    :param df: the dataframe
    :param omstore: the OmegaStore to use. will be used to get the mongo_url
    :param name: the dataset name in OmegaStore to use. will be used to get
       the collection name from the omstore
    """
    # this is the fastest implementation (pool)
    #
    # #records   pool   thread/wo copy   thread/w copy   pool w=0   pool dict   no chunking
    # 0.1m       1.47   2.06             2.17            1.59       2.11        2.28
    # 1m         17.4   19.8             20.6            16         17.8        22.2
    # 10m        149    193              183             177        213         256
    #
    # based on
    #   df = pd.DataFrame({'x': range(rows)})
    #   om.datasets.put(df, 'test', replace=True)
    #
    # - pool            mp Pool, passes copy of df chunks, to_dict in pool processes
    # - thread/wo copy  uses ThreadPool, shared memory on df
    # - thread/w copy   uses ThreadPool, copy of chunks
    # - pool w=0        disables the mongo write concern
    # - pool dict       performs to_dict on chunking, passes list of json docs,
    #                   pools just insert
    # - no chunking     sets chunksize=False
    if chunksize and len(df) * len(df.columns) > chunksize:
        # we crossed the upper limit of single-threaded processing, use a Pool
        # use the cached pool
        # use at least 2 processes for parallelism, at most half of available cores
        collection = PickableCollection(omstore.collection(name))
        n_jobs = max(2, math.ceil(os.cpu_count() / 2))
        jobs = zip(dfchunker(df, size=chunksize),
                   repeat(collection),
                   repeat(id(collection)))
        approx_jobs = int(len(df) / chunksize)
        # we use multiprocessing backend because
        with Parallel(n_jobs=n_jobs, backend='omegaml', verbose=False) as p:
            runner = delayed(insert_chunk)
            p_jobs = (runner(job) for job in jobs)
            p._job_count = approx_jobs
            p(p_jobs)
    else:
        # still within bounds for single threaded inserts
        omstore.collection(name).insert_many(df.to_dict(orient='records'))
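
# Usage sketch based on the docstring's own example; assumes `om` is an
# initialized omegaml instance (e.g. `import omegaml as om`) so that
# om.datasets is an OmegaStore (an assumption, not shown in this excerpt):
#
#   import pandas as pd
#   import omegaml as om
#
#   # ~100k rows x 1 column crosses the 10'000 element threshold, so this
#   # takes the pooled multiprocessing path
#   df = pd.DataFrame({'x': range(100_000)})
#   fast_insert(df, om.datasets, 'test')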
class FilteredCollection:
    """ A permanently filtered collection

    Supports all methods a Collection does, however any filter or query
    argument is permanently set at instantiation::

        fcoll = FilteredCollection(collection, query={expression})

    Any subsequent operation will automatically apply the query expression.

    Note that, compared to a plain Collection, all methods that accept a
    filter as their first argument have a changed signature - the filter
    argument is optional with all FilteredCollection methods, as the
    filter is already set at instantiation.

    Example::

        # in pymongo
        filter = {expression}
        coll.find_one_and_replace(filter, replace)

        # FilteredCollection
        coll = FilteredCollection(query={expression})
        coll.find_one_and_replace(replace, filter=None)

    This is so that calls to a FilteredCollection feel more natural, as
    opposed to specifying an empty filter argument on every call. Still,
    an additional filter can be specified on every method that accepts
    the filter= optional argument::

        # temporarily add another filter
        coll.find_one_and_replace(replace, filter={expression})

    Here expression will only apply to this particular method call. The
    global filter set by query= is unchanged. If no expression is given,
    the empty expression {} is assumed. To change the global expression,
    set::

        fcoll.query = {expression}
    """

    def __init__(self, collection, query=None, projection=None, **kwargs):
        if isinstance(collection, FilteredCollection):
            # avoid cascading of FilteredCollections
            query = query or collection._fixed_query
            projection = projection or collection.projection
            collection = ensure_base_collection(collection)
        else:
            query = query or {}
        self._fixed_query = query
        self.projection = projection
        self.collection = PickableCollection(collection)

    @property
    def _Collection__database(self):
        # support pymongo internals that access the name-mangled
        # Collection__database attribute
        return self.collection.database

    @property
    def name(self):
        return self.collection.name

    @property
    def database(self):
        return self.collection.database

    @property
    def query(self):
        return Filter(self.collection, **self._fixed_query).query

    def aggregate(self, pipeline, filter=None, **kwargs):
        # prepend a $match stage combining the fixed query and the ad-hoc filter
        query = dict(self.query)
        query.update(filter or {})
        pipeline.insert(0, qops.MATCH(query))
        kwargs.update(allowDiskUse=True)
        return self.collection.aggregate(pipeline, **kwargs)

    def find(self, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find(filter=query, **kwargs)

    def find_one(self, filter=None, *args, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one(query, *args, **kwargs)

    def find_one_and_delete(self, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one_and_delete(query, **kwargs)

    def find_one_and_replace(self, replacement, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one_and_replace(query, replacement, **kwargs)

    def find_one_and_update(self, update, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.find_one_and_update(query, update, **kwargs)

    def estimated_document_count(self, **kwargs):
        return self.collection.estimated_document_count(**kwargs)

    def count_documents(self, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.count_documents(query, **kwargs)

    def distinct(self, key, filter=None, **kwargs):
        query = dict(self.query)
        query.update(filter or {})
        return self.collection.distinct(key, filter=query, **kwargs)
    def create_index(self, keys, **kwargs):
        return self.collection.create_index(keys, **kwargs)

    def list_indexes(self, **kwargs):
        return self.collection.list_indexes(**kwargs)

    def insert(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def update(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def remove(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def find_and_modify(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def ensure_index(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")

    def save(self, *args, **kwargs):
        raise NotImplementedError(
            "deprecated in Collection and not implemented in FilteredCollection")
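
# Usage sketch following the class docstring; the database and collection
# names are made up for illustration, and the query keyword form assumes
# the Filter helper accepts the specification shown:
#
#   from pymongo import MongoClient
#
#   coll = MongoClient()['mydb']['sales']
#   fcoll = FilteredCollection(coll, query={'year': 2023})
#   n = fcoll.count_documents()             # fixed query applied implicitly
#   docs = fcoll.find({'region': 'EMEA'})   # ad-hoc filter AND-ed for this call only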