def row_to_doc(obj):
    """Yield one document per group of *obj*, grouped by ``groupby``.

    Each document carries the groupby values as top-level keys and the
    remaining (non-groupby) columns as a list of records under ``_data``.

    NOTE: ``groupby`` and ``make_tuple`` come from the enclosing scope.
    """
    for group_value, group_df in obj.groupby(groupby):
        # numpy scalars expose .astype; coerce to plain python objects first
        if hasattr(group_value, 'astype'):
            group_key = make_tuple(group_value.astype('O'))
        else:
            group_key = make_tuple(group_value)
        document = dict(zip(groupby, group_key))
        data_columns = list(set(group_df.columns) - set(groupby))
        document['_data'] = group_df[data_columns].astype('O').to_dict('records')
        yield document
def __setitem__(self, sel, val):
    """ add a projection to a sub context

        ctx['col'] = value-expression
    """
    cols = make_tuple(sel)
    exprs = make_tuple(val)
    self.project(dict(zip(cols, exprs)))
def _get_cursor(self):
    """ build the pymongo cursor honoring projection, sort, limit and skip """
    fields = make_tuple(self.columns)
    fields += make_tuple(self._get_frame_index())
    if not self.sort_order:
        # no explicit ordering requested => include the om fields (implicit sort)
        fields += make_tuple(self._get_frame_om_fields())
    cursor = self.collection.find(projection=fields)
    if self.sort_order:
        cursor.sort(qops.make_sortkey(make_tuple(self.sort_order)))
    if self.head_limit:
        cursor.limit(self.head_limit)
    if self.skip_topn:
        cursor.skip(self.skip_topn)
    return cursor
def value(self):
    """ return the value of the series

    this is a Series unless unique() was called. If unique()
    only distinct values are returned as an array, matching
    the behavior of a Series

    :return: pandas.Series
    """
    cursor = self._get_cursor()
    column = make_tuple(self.columns)[0]
    if self.is_unique:
        # the .distinct() cursor returns a list of values
        # this is to make sure we return the same thing as pandas
        val = [v for v in cursor]
    else:
        val = self._get_dataframe_from_cursor(cursor)
        val = val[column]
        val.name = self.name
        # a single-row result from a loc indexer collapses to a scalar
        if len(val) == 1 and self.from_loc_indexer:
            val = val.iloc[0]
    if self.auto_inspect:
        self._inspect_cache.append(self.inspect(explain=True, cursor=cursor,
                                                raw=True))
    if self._preparefn:
        # BUG FIX: the prepared result was previously assigned to an unused
        # local (df) and silently dropped; apply preparefn to the value
        # actually returned
        val = self._preparefn(val)
    return val
def groupby(self, by, expr=None, append=None, **kwargs):
    """ add a groupby accumulation using $group

    :param by: the groupby columns, if provided as a list will be transformed
    :param expr: accumulator expressions; defaults to the kwargs mapping
    :param append: unused, kept for interface compatibility
    :param kwargs: column=accumulator-expression pairs
    :return: self
    """
    group_cols = make_tuple(by)
    self.index_columns = self.index_columns + list(group_cols)
    # map each groupby column onto its field reference
    id_spec = {col: '$' + col for col in group_cols}
    group_stage = self._getGroupBy(id_spec)['$group']
    # add accumulators; an explicit expr wins over kwargs
    accumulators = expr or dict(six.iteritems(kwargs))
    group_stage.update(accumulators)
    # project the accumulator columns as-is, and extract the groupby
    # values back out of _id
    projection = {col: 1 for col in accumulators}
    projection.update({col: '$_id.' + col for col in id_spec})
    self.project(projection, append=True)
    # sort by the groupby keys
    self.add({'$sort': {col: 1 for col in id_spec}})
    return self
def inner(self, other, *args):
    """project the operator with the given terms onto every column

    NOTE: ``op``, ``base``, ``max_terms`` and ``unwind`` come from the
    enclosing scope.
    """
    def as_term(value):
        if isinstance(value, six.string_types):
            # known column names become field references
            if value in self.columns:
                value = '$' + value
            # allow to specify values explicitely by $$<value> => <value>
            value = value.replace('$$', '')
        return value

    # collect all values passed and convert them to terms
    terms = [as_term(v) for v in make_tuple(other) + args]
    # limit number of terms if requested
    if max_terms:
        del terms[max_terms:]
    mapping = {}
    for col in self.columns:
        operands = terms if base is None else ['$' + col] + terms
        mapping[col] = {op: operands}
    self.project(mapping)
    # unwind all columns if requested
    if unwind:
        self.stages.extend({'$unwind': {'path': '$' + col}}
                           for col in self.columns)
    return self
def _get_cursor(self):
    """ return a distinct-values cursor if unique, else the default cursor """
    if not self.is_unique:
        return super(MSeries, self)._get_cursor()
    # .distinct() lets mongodb apply indexes
    return self.collection.distinct(make_tuple(self.columns)[0])
def inner(self, columns=None):
    """project the operator onto the given columns, defaulting to all

    NOTE: ``op`` comes from the enclosing scope.
    """
    selected = make_tuple(columns or self.columns)
    self.project({col: {op: '$' + col} for col in selected})
    return self
def __getitem__(self, sel):
    """ return a stage subset on a column """
    child = ApplyContext(self.caller,
                         columns=make_tuple(sel),
                         index=self.index_columns)
    self.add(child)
    return child
def __init__(self, collection, columns=None, query=None, limit=None,
             skip=None, sort_order=None, force_columns=None,
             immediate_loc=False, auto_inspect=False, preparefn=None,
             **kwargs):
    """wrap a mongodb collection as a dataframe-like object"""
    self.collection = PickableCollection(collection)
    # frame columns; default to all fields found on the collection
    self.columns = [str(col)
                    for col in (make_tuple(columns) if columns
                                else self._get_fields())]
    # sort specification; None/empty means unsorted
    self.sort_order = sort_order
    # top n documents to fetch / to skip before returning
    self.head_limit = limit
    self.skip_topn = skip
    # filter criteria, applied below via query_inplace
    self.filter_criteria = query or {}
    # columns to force onto the output even if not present
    self.force_columns = force_columns or []
    # was this created from the loc indexer? was the loc index a range?
    self.from_loc_indexer = kwargs.get('from_loc_indexer', False)
    self.from_loc_range = None
    # apply the filter criteria, if any, to get a filtered collection
    if self.filter_criteria:
        if isinstance(self.filter_criteria, dict):
            self.query_inplace(**self.filter_criteria)
        elif isinstance(self.filter_criteria, Filter):
            self.query_inplace(self.filter_criteria)
        else:
            raise ValueError(
                'Invalid query specification of type {}'.format(
                    type(self.filter_criteria)))
    # if True, .loc and .iloc always evaluate immediately
    self.immediate_loc = immediate_loc
    # __array__ returns this value once evaluated, else sets it
    self._evaluated = None
    # when enabled, capture inspects on .value; read via .inspect(cached=True)
    self.auto_inspect = auto_inspect
    self._inspect_cache = INSPECT_CACHE
    # apply mixins
    self._applyto = str(self.__class__)
    self._apply_mixins()
    # function applied to the result just before .value returns
    self._preparefn = preparefn
def inner(self, columns=None):
    """project the operator onto the given columns, defaulting to all

    :param columns: optional columns, defaults to all columns
    :return: self

    NOTE: ``op`` and ``unwind`` come from the enclosing scope.
    """
    columns = make_tuple(columns or self.columns)
    mapping = {col: {
        op: '$' + col,
    } for col in columns}
    self.project(mapping)
    if unwind:
        # BUG FIX: previously appended {'$unwind': {''}} -- a set literal,
        # which is not a valid $unwind specification. Unwind each projected
        # column instead, matching the other unwind stages in this module.
        self.stages.extend({'$unwind': {'path': '$' + col}}
                           for col in columns)
    return self
inner.__doc__ = op.replace('$', '')
def inner(self, columns=None):
    """add a $group accumulator over the given columns, defaulting to all

    NOTE: ``op`` and ``opname`` come from the enclosing scope.
    """
    selected = make_tuple(columns or self.columns)
    group_stage = self._getGroupBy(by='$$last')['$group']
    # one accumulator column per input column, named <col>_<opname>
    accumulators = {}
    for col in selected:
        accumulators['{}_{}'.format(col, opname)] = {op: '$' + col}
    group_stage.update(accumulators)
    self.computed.extend(group_stage.keys())
    self.project_keeper_columns()
    return self
def _getcopy_kwargs(self, without=None):
    """ return all parameters required on a copy of this MDataFrame """
    kwargs = dict(columns=self.columns,
                  sort_order=self.sort_order,
                  limit=self.head_limit,
                  skip=self.skip_topn,
                  from_loc_indexer=self.from_loc_indexer,
                  immediate_loc=self.immediate_loc,
                  query=self.filter_criteria,
                  auto_inspect=self.auto_inspect,
                  preparefn=self._preparefn)
    # drop any parameters the caller does not want copied
    for key in make_tuple(without or []):
        kwargs.pop(key)
    return kwargs
def sort(self, columns):
    """ sort by specified columns

    :param columns: str of single column or a list of columns. Sort order
       is given as the + (ascending) or - (descending) prefix to the
       column name; ascending is the default.
    :return: the MDataFrame
    """
    # invalidate any cached evaluation before changing the sort order
    self._evaluated = None
    self.sort_order = make_tuple(columns)
    return self
def make_index(self, columns, **kwargs):
    """ return an index specification suitable for collection.create_index()

    Translates column specs like ['+A', '-B'] into (key, direction) pairs
    for create_index, and derives an index name from the columns and their
    ordering. Use '@coord' to create a geospatial index; the coord column
    must be in GeoJSON format.

    :param columns: a single index column, or a list of columns
    :param kwargs: optional kwargs to merge; an existing 'name' key
       is preserved
    :return: (idx, kwargs) tuple, pass as create_index(idx, **kwargs)
    """
    PREFIXES = '-+@'
    DIRECTIONS = {
        '-': pymongo.DESCENDING,
        '+': pymongo.ASCENDING,
        '@': pymongo.GEOSPHERE,
    }
    default_direction = pymongo.ASCENDING
    # normalize: every column carries an explicit prefix, + by default
    specs = [col if col[0] in PREFIXES else '+' + col
             for col in make_tuple(columns)]

    def strip_prefix(col):
        return col.replace('+', '').replace('-', '').replace('@', '')

    idx = [(strip_prefix(col), DIRECTIONS.get(col[0], default_direction))
           for col in specs]
    # derive a readable index name from the prefixes
    name = '__'.join(col.replace('-', 'desc_')
                        .replace('+', 'asc_')
                        .replace('@', 'geo_')
                     for col in specs)
    kwargs.setdefault('name', name)
    return idx, kwargs
def aggregate(self, specs, **kwargs):
    """ aggregate by given specs

    See the following link for a list of supported operations.
    https://docs.mongodb.com/manual/reference/operator/aggregation/group/

    :param specs: a dictionary of { column : function | list[functions] } pairs.
    """
    # build the $group specification from the requested stats
    group_specs = {}
    for column, stats in six.iteritems(specs):
        for stat in make_tuple(stats):
            key = '%s_%s' % (column, stat)
            mongo_op = '$%s' % MGrouper.STATS_MAP.get(stat, stat)
            group_specs[key] = {mongo_op: '$%s' % column}
    groupby = qops.GROUP(columns=self.columns, **group_specs)
    # execute and return a dataframe
    pipeline = self._amend_pipeline([groupby])
    data = self.collection.aggregate(pipeline, allowDiskUse=True)

    def records():
        # unpack the group _id into each record; kept lazy so from_records
        # consumes the cursor exactly once
        for group in data:
            _id = group.pop('_id')
            if isinstance(_id, dict):
                group.update(_id)
            yield group

    df = pd.DataFrame.from_records(records())
    index_cols = make_list(self.columns)
    if index_cols:
        df = df.set_index(index_cols, drop=True)
    return df
def inner(self, other):
    """project the binary operator against the given terms

    NOTE: ``op`` and ``wrap_op`` come from the enclosing scope.
    """
    # strings become field references, other values are used literally
    terms = ['$' + t if isinstance(t, six.string_types) else t
             for t in make_tuple(other)]
    mapping = {}
    for col in self.columns:
        expr = {op: ['$' + col] + terms}
        # optionally wrap the expression in an outer operator
        if wrap_op is not None:
            expr = {wrap_op: expr}
        mapping[col] = expr
    # keep the index columns on the output
    mapping.update({col: '$' + col for col in self.index_columns})
    self.project(mapping)
    return self
def _get_filter(self, specs):
    """Translate loc/iloc selector *specs* into a (query, projection) pair.

    :param specs: the selector passed to .loc/.iloc -- a scalar, a list of
       scalars, a (tuple of) slice(s), or a MultiIndex-style tuple
    :return: (finalq, projection) where finalq is a MongoQ (or None when no
       filter applies) and projection is a list of column projections
    """
    filterq = []
    projection = []
    # positional (iloc) access filters on the synthetic row id column;
    # label (loc) access filters on the frame's index columns
    if self.positional:
        idx_cols = ['_om#rowid']
    else:
        idx_cols = self.mdataframe._get_frame_index()
    flt_kwargs = {}
    enumerable_types = (list, tuple, np.ndarray)
    # normalize numpy arrays for PyMongo serialization
    if isinstance(specs, np.ndarray):
        specs = specs.tolist()
    if (isinstance(specs, enumerable_types)
            and isscalar(specs[0]) and len(idx_cols) == 1
            and not any(isinstance(s, slice) for s in specs)):
        # single column index with list of scalar values
        if (self.positional and isinstance(specs, tuple) and len(specs) == 2
                and all(isscalar(v) for v in specs)):
            # iloc[int, int] is a cell access
            flt_kwargs[idx_cols[0]] = specs[0]
            projection.extend(self._get_projection(specs[1]))
        else:
            # membership filter on the single index column
            flt_kwargs['{}__in'.format(idx_cols[0])] = specs
            self._from_range = True
    elif isinstance(specs, (int, str)):
        # single scalar selector => equality on the first index column
        flt_kwargs[idx_cols[0]] = specs
    else:
        specs = make_tuple(specs)
        # list/tuple of slices or scalar values, or MultiIndex
        for i, spec in enumerate(specs):
            if i < len(idx_cols):
                col = idx_cols[i]
                if isinstance(spec, slice):
                    self._from_range = True
                    start, stop = spec.start, spec.stop
                    if start is not None:
                        flt_kwargs['{}__gte'.format(col)] = start
                    if stop is not None:
                        # iloc slices exclude the stop row; shift by one to
                        # express it as an inclusive __lte bound
                        if isinstance(stop, int):
                            stop -= int(self.positional)
                        flt_kwargs['{}__lte'.format(col)] = stop
                elif isinstance(spec, enumerable_types) and isscalar(spec[0]):
                    self._from_range = True
                    # single column index with list of scalar values
                    # -- convert to list for PyMongo serialization
                    if isinstance(spec, np.ndarray):
                        spec = spec.tolist()
                    flt_kwargs['{}__in'.format(col)] = spec
                elif isscalar(col):
                    # NOTE(review): this tests isscalar(col) -- col is a
                    # column name and thus always scalar, so this acts as a
                    # plain else. Possibly isscalar(spec) was intended;
                    # confirm before changing, as that would drop non-scalar
                    # specs that currently fall through to an equality filter.
                    flt_kwargs[col] = spec
            else:
                # we're out of index columns, let's look at columns
                projection.extend(self._get_projection(spec))
    if flt_kwargs:
        filterq.append(MongoQ(**flt_kwargs))
    # OR-combine all collected query objects (None when no filter applies)
    finalq = None
    for q in filterq:
        if finalq:
            finalq |= q
        else:
            finalq = q
    return finalq, projection
def __init__(self, mdataframe, collection, columns, sort=True):
    """remember the frame, collection and columns; optionally request sorting"""
    self.mdataframe = mdataframe
    self.collection = collection
    # normalize columns to a tuple
    self.columns = make_tuple(columns)
    self.should_sort = sort
def _as_mseries(self, column):
    """return an MSeries on the given column, copying this frame's settings"""
    kwargs = dict(self._getcopy_kwargs(), columns=make_tuple(column))
    return MSeries(self.collection, **kwargs)
def set_index(self, columns):
    """set the index columns; returns self for chaining"""
    self.index_columns = make_tuple(columns)
    return self
def PROJECT(self, fields, include=True):
    """return a $project stage including (1) or excluding (0) the fields"""
    flag = 1 if include else 0
    return {'$project': {key: flag for key in make_tuple(fields)}}