예제 #1
 def test_find_one_and_delete(self):
     """find_one_and_delete returns a matching dict and one match remains."""
     criteria = {'x': 9}
     filtered = FilteredCollection(self.coll, query=criteria)
     removed = filtered.find_one_and_delete()
     self.assertIsInstance(removed, dict)
     self.assertEqual(removed.get('x'), 9)
     # the test expects exactly one matching document afterwards
     remaining = list(filtered.find())
     self.assertEqual(len(remaining), 1)
예제 #2
 def test_map_reduce(self):
     """map_reduce on the filtered collection matches a Python-side sum of y."""
     filtered = FilteredCollection(self.coll, query={'x': 9})
     # reference value computed in Python (the y values are random)
     expected_total = sum(doc.get('y') for doc in filtered.find())
     # the same sum, computed by map reduce
     map_js = 'function() { emit(this.x, this.y); }'
     reduce_js = 'function(x, values) { return Array.sum(values); }'
     out_coll = filtered.map_reduce(map_js, reduce_js, 'mr_out')
     rows = list(out_coll.find())
     self.assertIsInstance(rows, list)
     self.assertEqual(rows[0].get('value'), expected_total)
예제 #3
class MDataFrame(object):
    """
    A DataFrame for mongodb

    Performs out-of-core, lazy computation on a mongodb cluster.
    Behaves like a pandas DataFrame. Actual results are returned
    as pandas DataFrames.
    """

    STATFUNCS = ['mean', 'std', 'min', 'max', 'sum', 'var']

    def __init__(self, collection, columns=None, query=None,
                 limit=None, skip=None, sort_order=None,
                 force_columns=None, immediate_loc=False, auto_inspect=False,
                 normalize=False, raw=False,
                 preparefn=None, **kwargs):
        """
        Set up a lazy frame on top of a mongodb collection

        :param collection: the collection to read from; it is wrapped in a
           PickableCollection
        :param columns: the columns to project. If not given, the fields of
           one document of the collection are used
        :param query: filter criteria, either a dict or a Filter
        :param limit: top n documents to fetch
        :param skip: top n documents to skip before returning
        :param sort_order: columns to sort by, defaults to not sorted
        :param force_columns: on output add these columns if not present
        :param immediate_loc: if True, .loc and .iloc always evaluate
        :param auto_inspect: if True, .value appends its inspect() result
           to the inspect cache
        :param normalize: if True, documents are parsed with json_normalize
        :param raw: if True, keep technical fields like _id, _idx etc
        :param preparefn: function applied just before returning from .value
        :param kwargs: from_loc_indexer is read from here
        :raises ValueError: if query is neither a dict nor a Filter
        """
        self.collection = PickableCollection(collection)
        # columns in frame
        self.columns = make_tuple(columns) if columns else self._get_fields(raw=raw)
        self.columns = [str(col) for col in self.columns]
        # columns to sort by, defaults to not sorted
        self.sort_order = sort_order
        # top n documents to fetch
        self.head_limit = limit
        # top n documents to skip before returning
        self.skip_topn = skip
        # filter criteria
        self.filter_criteria = query or {}
        # force columns -- on output add columns not present
        self.force_columns = force_columns or []
        # was this created from the loc indexer?
        self.from_loc_indexer = kwargs.get('from_loc_indexer', False)
        # was the loc index used a range? Else a single value
        self.from_loc_range = None
        # setup query for filter criteria, if provided
        if self.filter_criteria:
            # make sure we have a filtered collection with the criteria given
            # NOTE(review): the two branch bodies and the else were missing
            # in this copy; restored as calls to query_inplace, which accepts
            # keyword criteria or a Filter -- confirm against upstream
            if isinstance(self.filter_criteria, dict):
                self.query_inplace(**self.filter_criteria)
            elif isinstance(self.filter_criteria, Filter):
                self.query_inplace(self.filter_criteria)
            else:
                raise ValueError('Invalid query specification of type {}'.format(type(self.filter_criteria)))
        # if immediate_loc is True, .loc and .iloc always evaluate
        self.immediate_loc = immediate_loc
        # __array__ will return this value if it is set, set it otherwise
        self._evaluated = None
        # set true to automatically capture inspects on .value. retrieve using .inspect(cached=True)
        self.auto_inspect = auto_inspect
        self._inspect_cache = INSPECT_CACHE
        # apply mixins
        # NOTE(review): only the selector string is set here; a call to
        # self._apply_mixins() may have been lost in this copy -- confirm
        self._applyto = str(self.__class__)
        # parser to parse documents to dataframe
        # NOTE(review): `parser` is not a parameter of this method, so it has
        # to resolve to a module-level name -- confirm
        self._parser = json_normalize if normalize else parser
        # prepare function to be applied just before returning from .value
        self._preparefn = preparefn
        # keep technical fields like _id, _idx etc
        self._raw = raw

    def _apply_mixins(self, *args, **kwargs):
        """
        apply mixins in defaults.OMEGA_MDF_MIXINS

        Each entry is a (mixin, applyto) pair. The mixin is applied to this
        instance if any of the comma-separated values in applyto occurs in
        self._applyto.

        :param args: passed on to extend_instance
        :param kwargs: passed on to extend_instance
        """
        from omegaml import settings
        defaults = settings()
        for mixin, applyto in defaults.OMEGA_MDF_MIXINS:
            targets = applyto.split(',')
            if any(target in self._applyto for target in targets):
                extend_instance(self, mixin, *args, **kwargs)

    def __getstate__(self):
        """return a shallow copy of the instance dict as the pickle state"""
        # pickle support. the collection is handled by PickableCollection
        return dict(self.__dict__)

    def __setstate__(self, state):
        """
        restore the instance attributes from the pickle state

        :param state: the dict of attributes, as returned by __getstate__
        """
        # pickle support. note that the hard work is done in PickableCollection
        # NOTE(review): the body was missing in this copy; restoring the
        # attributes mirrors __getstate__, which returns a copy of __dict__
        self.__dict__.update(state)

    def _getcopy_kwargs(self, without=None):
        """
        return all parameters required on a copy of this MDataFrame

        :param without: name or list of names of parameters to leave out
        :return: dict of keyword arguments for the constructor
        :raises KeyError: if a name in without is not a copy parameter
        """
        # NOTE(review): only the columns= entry survived in this copy. The
        # other entries are reconstructed by pairing constructor parameters
        # with the attributes __init__ stores them in -- confirm the exact
        # set against upstream
        kwargs = dict(columns=self.columns,
                      sort_order=self.sort_order,
                      limit=self.head_limit,
                      skip=self.skip_topn,
                      from_loc_indexer=self.from_loc_indexer,
                      immediate_loc=self.immediate_loc,
                      query=self.filter_criteria,
                      auto_inspect=self.auto_inspect,
                      preparefn=self._preparefn)
        # a plain loop instead of a list comprehension used for side effects
        for key in make_tuple(without or []):
            kwargs.pop(key)
        return kwargs

    def __array__(self, dtype=None):
        """
        numpy array protocol; evaluates the frame on first use

        The evaluated values are kept in self._evaluated and reused until
        that attribute is set back to None.

        :param dtype: optional dtype to convert the array to
        :return: the values of .value as a numpy array
        """
        # FIXME inefficient. make MDataFrame a drop-in replacement for any numpy ndarray
        if self._evaluated is None:
            # .values replaces DataFrame.as_matrix(), which was removed in
            # pandas 1.0
            self._evaluated = self.value.values
        array = self._evaluated
        if dtype is not None:
            # honor the dtype requested through the array protocol
            array = np.asarray(array, dtype=dtype)
        return array

    def __getattr__(self, attr):
        """
        resolve statistics functions and column names as attributes

        :param attr: a name in STATFUNCS or a column name
        :return: the statistics method for a STATFUNCS name, an MSeries for
           a column name
        :raises AttributeError: for any other name
        """
        if attr in MDataFrame.STATFUNCS:
            return self.statfunc(attr)
        # read columns from __dict__ so that a lookup on an instance without
        # attributes (e.g. while unpickling) cannot recurse into __getattr__
        if attr in self.__dict__.get('columns', ()):
            kwargs = self._getcopy_kwargs()
            # NOTE(review): the copy was made without selecting the column;
            # presumably MSeries takes the column name as columns= -- confirm
            kwargs.update(columns=attr)
            return MSeries(self.collection, **kwargs)
        raise AttributeError(attr)

    def __getitem__(self, cols_or_slice):
        """
        select and project by column, columns, slice, masked-style filter

        Masked-style filters work similar to pd.DataFrame/Series masks
        but do not actually return masks but an instance of Filter. A
        Filter is a delayed evaluation on the data frame.

            # select all rows where any column is == 5
            mdf = MDataFrame(coll)
            flt = mdf == 5

        :param cols_or_slice: single column (str), column number (int),
          multi-columns (list or tuple) or a masked-style Filter
        :return: filtered MDataFrame or MSeries
        :raises NotImplementedError: if cols_or_slice is a numpy array
        :raises ValueError: for any other type of cols_or_slice
        """
        # NOTE(review): the original text also mentions slices, but there is
        # no branch for slice objects; they end in the ValueError below
        if isinstance(cols_or_slice, six.string_types):
            # column name => MSeries
            return self._as_mseries(cols_or_slice)
        elif isinstance(cols_or_slice, int):
            # column number => MSeries
            column = self.columns[cols_or_slice]
            return self._as_mseries(column)
        elif isinstance(cols_or_slice, (tuple, list)):
            # list of column names => MDataFrame subset on columns
            kwargs = self._getcopy_kwargs()
            # without this the copy carried all columns of this frame
            kwargs.update(columns=cols_or_slice)
            return MDataFrame(self.collection, **kwargs)
        elif isinstance(cols_or_slice, Filter):
            kwargs = self._getcopy_kwargs()
            # NOTE(review): the filter was not applied in this copy; using
            # its .query as the new criteria is a reconstruction -- confirm
            kwargs.update(query=cols_or_slice.query)
            return MDataFrame(self.collection, **kwargs)
        elif isinstance(cols_or_slice, np.ndarray):
            # NotImplemented is a constant, not an exception class
            raise NotImplementedError
        raise ValueError('unknown accessor type %s' % type(cols_or_slice))

    def __setitem__(self, column, value):
        """
        set a column to a scalar value on all documents matching the filter

        :param column: the name of the column to set
        :param value: a scalar value. Non-scalar values are ignored.
        """
        # True for any scalar type, numeric, bool, string
        if np.isscalar(value):
            self.collection.update_many(filter=self.filter_criteria,
                                        update=qops.SET(column, value))
            # the data changed, so a cached evaluation is no longer valid
            self._evaluated = None
        return self

    def _clone(self, collection=None, **kwargs):
        """
        convenience method to clone itself with updates

        :param collection: the collection for the clone, defaults to the
           collection of this frame
        :param kwargs: constructor arguments that replace those of this frame
        :return: a new instance of this frame's class
        """
        # compare with None rather than testing truthiness; pymongo's
        # Collection raises on bool() -- presumably relevant for the wrappers
        # used here as well
        target = self.collection if collection is None else collection
        # NOTE(review): the end of the call was cut off in this copy; the
        # parameters of this frame are restored as defaults and the given
        # kwargs take precedence -- confirm against upstream
        clone_kwargs = self._getcopy_kwargs()
        clone_kwargs.update(kwargs)
        return self.__class__(target, **clone_kwargs)

    def statfunc(self, stat):
        """
        return the attribute named stat of an MGrouper on this frame

        The MGrouper is created with an empty list of group columns and
        sort=False.
        """
        grouper = MGrouper(self, self.collection, [], sort=False)
        return getattr(grouper, stat)

    def groupby(self, columns, sort=True):
        """
        Group by a given set of columns

        :param columns: the list of columns
        :param sort: if True sort by group key
        :return: MGrouper
        """
        return MGrouper(self, self.collection, columns, sort=sort)

    def _get_fields(self, raw=False):
        """
        return the field names of one document of the collection

        :param raw: if True return all keys of the document, otherwise
           leave out _id and keys starting with _idx or _om#
        :return: list of field names, empty if find_one() returns None
        """
        doc = self.collection.find_one()
        if doc is None:
            return []
        if raw:
            # without the early return the raw result was overwritten by
            # the filtered list below
            return list(doc.keys())
        return [str(col) for col in doc.keys()
                if col != '_id'
                and not col.startswith('_idx')
                and not col.startswith('_om#')]

    def _get_frame_index(self):
        """
        return the dataframe's index columns

        :return: the result of restore_index_columns_order for the keys of
           one document, an empty list if find_one() returns None
        """
        doc = self.collection.find_one()
        if doc is None:
            # no document: doc.keys() below would raise on None
            return []
        return restore_index_columns_order(doc.keys())

    def _get_frame_om_fields(self):
        """
        return the dataframe's omega special fields columns

        :return: the keys of one document that start with _om#, an empty
           list if find_one() returns None
        """
        doc = self.collection.find_one()
        if doc is None:
            # no document: doc.keys() below would raise on None
            return []
        return [key for key in doc.keys() if key.startswith('_om#')]

    def _as_mseries(self, column):
        """
        return an MSeries on the given column

        :param column: the name of the column
        :return: MSeries with the parameters of this frame
        """
        kwargs = self._getcopy_kwargs()
        # NOTE(review): column was not used in this copy, so the series was
        # built on all columns; presumably MSeries takes the column name as
        # columns= -- confirm
        kwargs.update(columns=column)
        return MSeries(self.collection, **kwargs)

    def inspect(self, explain=False, cached=False, cursor=None, raw=False):
        """
        inspect this dataframe's actual mongodb query

        :param explain: if True explains access path
        :param cached: if True return the inspect cache instead of
           inspecting the current query
        :param cursor: the cursor to call explain() on, defaults to a new
           cursor from _get_cursor()
        :param raw: if True return the data as is, otherwise it is
           converted to a DataFrame unless explain is set
        :return: dict of projection, query and explain, the inspect cache
           if cached is True, or a DataFrame of either
        """
        # NOTE(review): the else branches and the closing brace were missing
        # in this copy and are restored from the indentation
        if not cached:
            if isinstance(self.collection, FilteredCollection):
                query = self.collection.query
            else:
                # note the trailing comma: this is the 1-tuple ('*',)
                query = '*',
            if explain:
                cursor = cursor or self._get_cursor()
                explain = cursor.explain()
            data = {
                'projection': self.columns,
                'query': query,
                'explain': explain or 'specify explain=True'
            }
        else:
            data = self._inspect_cache
        if not (raw or explain):
            data = pd.DataFrame(pd.io.json.json_normalize(data))
        return data

    def count(self):
        """
        projected number of rows when resolving

        :return: pd.Series with len(self) for every column
        """
        # len(self) goes through the cursor, so evaluate it once instead of
        # once per column
        nrows = len(self)
        counts = pd.Series({col: nrows for col in self.columns},
                           index=self.columns)
        return counts

    def __len__(self):
        """
        the projected number of rows when resolving

        :return: the result of count() on the cursor of this frame
        """
        return self._get_cursor().count()

    @property
    def shape(self):
        """
        return shape of dataframe

        :return: tuple of (number of rows, number of columns)
        """
        # a property because ndim reads self.shape as an attribute
        return len(self), len(self.columns)

    def ndim(self):
        """return the number of dimensions, that is the length of shape"""
        # NOTE(review): reads shape as an attribute, not as a call
        dims = self.shape
        return len(dims)

    @property
    def value(self):
        """
        resolve the query and return a Pandas DataFrame

        If the frame was created from the loc indexer on a single value,
        the result is reduced to a Series or a single value. If preparefn
        is set, it is applied to the result last.

        :return: the result of the query as a pandas DataFrame
        """
        # a property because __array__ reads self.value as an attribute
        cursor = self._get_cursor()
        df = self._get_dataframe_from_cursor(cursor)
        if self.auto_inspect:
            self._inspect_cache.append(self.inspect(explain=True, cursor=cursor, raw=True))
        # this ensures the equiv. of pandas df.loc[n] is a Series
        if self.from_loc_indexer:
            if len(df) == 1 and not self.from_loc_range:
                idx = df.index
                df = df.T
                df = df[df.columns[0]]
                if df.ndim == 1 and len(df) == 1 and not isinstance(idx, pd.MultiIndex):
                    # single row single dimension, numeric index only
                    df = df.iloc[0]
            elif (df.ndim == 1 or df.shape[1] == 1) and not self.from_loc_range:
                df = df[df.columns[0]]
        if self._preparefn:
            df = self._preparefn(df)
        return df

    def reset(self):
        """
        clear limit, skip, filter criteria, forced columns, sort order and
        the loc indexer flag

        :return: self
        """
        # TODO if head(), tail(), query(), .loc/iloc should return a new MDataFrame instance to avoid having a reset need
        # the tuple is built on every call, so {} and [] are fresh objects
        defaults = (
            ('head_limit', None),
            ('skip_topn', None),
            ('filter_criteria', {}),
            ('force_columns', []),
            ('sort_order', None),
            ('from_loc_indexer', False),
        )
        for name, default in defaults:
            setattr(self, name, default)
        return self

    def _get_dataframe_from_cursor(self, cursor):
        """
        from the given cursor return a DataFrame

        :param cursor: the cursor to read the documents from
        :return: the DataFrame, parsed with the parser of this frame and
           passed through _restore_dataframe_proper
        """
        df = cursor_to_dataframe(cursor, parser=self._parser)
        df = self._restore_dataframe_proper(df)
        return df

    def _restore_dataframe_proper(self, df):
        """
        restore the index, drop _id unless raw, and add forced columns

        :param df: the DataFrame as created from the cursor
        :return: the DataFrame. Columns in force_columns that are not in
           self.columns are added with NaN values.
        """
        df = restore_index(df, dict())
        if '_id' in df.columns and not self._raw:
            df.drop('_id', axis=1, inplace=True)
        if self.force_columns:
            known = set(self.columns)
            # iterate force_columns itself so the added columns keep a
            # deterministic order
            for col in self.force_columns:
                if col not in known:
                    # np.nan instead of np.NaN: the alias was removed in
                    # NumPy 2.0
                    df[col] = np.nan
        return df

    def _get_cursor(self):
        """
        return a cursor on the collection for the current parameters

        The projection is made up of the columns and the index columns,
        plus the omega special fields if no sort order is set.

        :return: the cursor
        """
        projection = make_tuple(self.columns)
        projection += make_tuple(self._get_frame_index())
        if not self.sort_order:
            # implicit sort
            projection += make_tuple(self._get_frame_om_fields())
        cursor = self.collection.find(projection=projection)
        # NOTE(review): the bodies of the following three conditionals were
        # missing in this copy; they are restored with the pymongo cursor
        # methods for sort, limit and skip -- confirm against upstream
        if self.sort_order:
            cursor = cursor.sort(qops.make_sortkey(make_tuple(self.sort_order)))
        if self.head_limit:
            cursor = cursor.limit(self.head_limit)
        if self.skip_topn:
            cursor = cursor.skip(self.skip_topn)
        return cursor

    def sort(self, columns):
        """
        sort by specified columns

        :param columns: str of single column or a list of columns. Sort order
                        is specified as the + (ascending) or - (descending)
                        prefix to the column name. Default sort order is
                        presumably ascending (the sentence is cut off in
                        this copy -- TODO confirm)
        :return: the MDataFrame
        """
        # a new sort order invalidates the cached evaluation
        self._evaluated = None
        self.sort_order = make_tuple(columns)
        return self

    def head(self, limit=10):
        """
        return up to limit numbers of rows

        :param limit: the number of rows to return. Defaults to 10
        :return: the MDataFrame
        """
        return self._clone(limit=limit)

    def tail(self, limit=10):
        """
        return up to limit number of rows from last inserted values

        :param limit: the number of rows to return. Defaults to 10
        :return: the MDataFrame
        """
        # skip takes a number of rows. The original passed the frame
        # returned by self.skip() instead. Clamp at 0 so a frame with fewer
        # than limit rows does not ask for a negative skip.
        skip_n = max(len(self) - limit, 0)
        return self._clone(skip=skip_n)

    def skip(self, topn):
        """
        skip the topn number of rows

        :param topn: the number of rows to skip.
        :return: the MDataFrame
        """
        return self._clone(skip=topn)

    def merge(self, right, on=None, left_on=None, right_on=None,
              how='inner', target=None, suffixes=('_x', '_y'),
              sort=False, inspect=False):
        # NOTE(review): this copy of merge() has lost lines: the docstring
        # delimiters, a call's arguments, several branch bodies and else
        # lines. The notes below mark each place; restore from upstream.
        merge this dataframe with another dataframe. only left outer joins
        are currently supported. the output is saved as a new collection,
        target name (defaults to a generated name if not specified).
        # NOTE(review): the text above says only left outer joins are
        # supported, but supported_how below accepts left, inner and right

        :param right: the other MDataFrame
        :param on: the list of key columns to merge by
        :param left_on: the list of the key columns to merge on this dataframe
        :param right_on: the list of the key columns to merge on the other 
        :param how: the method to merge. supported are left, inner, right. 
           Defaults to inner
        :param target: the name of the collection to store the merge results
           in. If not provided a temporary name will be created.
        :param suffixes: the suffixes to apply to identical left and right 
        :param sort: if True the merge results will be sorted. If False the
           MongoDB natural order is implied.
        :returns: the MDataFrame to the target MDataFrame
        # validate input
        supported_how = ["left", 'inner', 'right']
        assert how in supported_how, "only %s merges are currently supported" % supported_how
        # only single string keys pass the check below, not lists of columns
        for key in [on, left_on, right_on]:
            if key:
                assert isinstance(
                    key, six.string_types), "only single column merge keys are supported (%s)" % key
        if isinstance(right, Collection):
            right = MDataFrame(right)
        # NOTE(review): "right=%" is an incomplete format spec, so building
        # this message raises ValueError; presumably "%s" is intended
        assert isinstance(
            right, MDataFrame), "both must be MDataFrames, got right=%" % type(right)
        if how == 'right':
            # A right B == B left A
            return right.merge(self, on=on, left_on=right_on, right_on=left_on,
                               how='left', target=target, suffixes=suffixes)
        # generate lookup parameters
        on = on or '_id'
        right_name = self._get_collection_name_of(right, right)
        target_name = self._get_collection_name_of(
            target, '_temp.merge.%s' % uuid4().hex)
        target_field = (
                "%s_%s" % (right_name.replace('.', '_'), right_on or on))
        # NOTE(review): the remaining arguments and the closing parenthesis
        # of this call are missing in this copy
        lookup = qops.LOOKUP(right_name,
        # unwind merged documents from arrays to top-level document fields
        unwind = qops.UNWIND(target_field, preserve=how != 'inner')
        # get all fields from left, right
        project = {}
        for left_col in self.columns:
            source_left_col = left_col
            if left_col == '_id':
                project[left_col] = 1
            # NOTE(review): the next two conditionals have no body in this
            # copy; presumably these columns are skipped -- confirm
            if left_col.startswith('_idx'):
            if left_col.startswith('_om#'):
            if left_col != (on or left_on) and left_col in right.columns:
                left_col = '%s%s' % (left_col, suffixes[0])
            project[left_col] = "$%s" % source_left_col
        for right_col in right.columns:
            # NOTE(review): the next four conditionals have no body in this
            # copy; presumably these columns are skipped -- confirm
            if right_col == '_id':
            if right_col.startswith('_idx'):
            if right_col.startswith('_om#'):
            if right_col == (on or right_on) and right_col == (on or left_on):
                # if the merge field is the same in both frames, we already
                # have it from left
            if right_col in self.columns:
                left_col = '%s%s' % (right_col, suffixes[1])
                # NOTE(review): as shown this overwrites the suffixed name
                # from the line above; an else: line looks to be missing
                left_col = '%s' % right_col
            project[left_col] = '$%s.%s' % (target_field, right_col)
        # NOTE(review): expected_columns is not used in the visible code
        expected_columns = list(project.keys())
        project = {"$project": project}
        # store merged documents and return an MDataFrame to it
        # NOTE(review): neither out nor the sort stage below is added to
        # the pipeline in the visible code
        out = qops.OUT(target_name)
        pipeline = [lookup, unwind, project]
        if sort:
            sort_cols = make_list(on or [left_on, right_on])
            sort_key = qops.make_sortkey(sort_cols)
            sort = qops.SORT(**dict(sort_key))
        if inspect:
            result = pipeline
            # NOTE(review): as shown the pipeline is overwritten right away;
            # an else: line looks to be missing, and the MDataFrame( call
            # below is cut off
            result = self.collection.aggregate(pipeline, allowDiskUse=True)
            result = MDataFrame(self.collection.database[target_name],
        return result

    def append(self, other):
        """
        append the documents of another frame to this frame's collection

        The documents of other are written to this frame's collection by a
        map reduce job with a merge output. Each document gets a new _id
        and, if it has an _om#rowid, that value is increased by the length
        of this frame. An aggregation with $replaceRoot then replaces each
        document that has a value field by that value.

        :param other: the MDataFrame or Collection to append
        :return: self
        :raises AssertionError: if other is neither of the two
        """
        if isinstance(other, Collection):
            # the original bound the new frame to an unused name, so a
            # Collection always failed the assertion below
            other = MDataFrame(other)
        assert isinstance(
            other, MDataFrame), "both must be MDataFrames, got other={}".format(type(other))
        outname = self.collection.name
        # the closing braces and quotes below were missing in this copy and
        # are restored by balancing the opening ones
        mrout = {
            'merge': outname,
            'nonAtomic': True,
        }
        mapfn = Code("""
        function() {
           this._id = ObjectId();
           if(this['_om#rowid']) {
              this['_om#rowid'] += %s;
           }
           emit(this._id, this);
        }
        """ % len(self))
        reducefn = Code("""
        function(key, value) {
           return value;
        }
        """)
        finfn = Code("""
        function(key, value) {
           return value;
        }
        """)
        other.collection.map_reduce(mapfn, reducefn, mrout, finalize=finfn, jsMode=True)
        unwind = {
            "$replaceRoot": {
                "newRoot": {
                    "$ifNull": ["$value", "$$CURRENT"],
                }
            }
        }
        output = qops.OUT(outname)
        pipeline = [unwind, output]
        self.collection.aggregate(pipeline, allowDiskUse=True)
        return self

    def _get_collection_name_of(self, some, default=None):
        """
        determine the collection name of the given parameter

        returns the collection name if some is a MDataFrame, a Collection
        or a string_type. Otherwise returns default

        :param some: MDataFrame, Collection or the name as a string
        :param default: the value to return for any other type
        :return: the collection name or default
        """
        if isinstance(some, MDataFrame):
            name = some.collection.name
        elif isinstance(some, Collection):
            name = some.name
        elif isinstance(some, six.string_types):
            # the string case is documented above but had no branch
            name = some
        else:
            # the else was missing, so default replaced every name
            name = default
        return name

    def _get_filter_criteria(self, *args, **kwargs):
        """
        return mongo query from filter specs

        this uses a Filter to produce the query from the kwargs.

        :param args: a Q object or logical combination of Q objects
        :param kwargs: all AND filter criteria
        :return: the query of the Filter. If args are given, the first one
           is used and kwargs are not.
        :raises ValueError: if the first of args is neither a MongoQ nor a
           Filter
        """
        if len(args) > 0:
            q = args[0]
            if isinstance(q, MongoQ):
                filter_criteria = Filter(self.collection, q).query
            elif isinstance(q, Filter):
                filter_criteria = Filter(self.collection, q.q).query
            else:
                # previously this left filter_criteria unassigned
                raise ValueError('Invalid query specification of type {}'.format(type(q)))
        else:
            # the else was missing, so the kwargs query replaced the one
            # built from args
            filter_criteria = Filter(self.collection, **kwargs).query
        return filter_criteria

    def query_inplace(self, *args, **kwargs):
        """
        filters this MDataFrame and returns it.

        Any subsequent operation on the dataframe will have the filter
        applied. To reset the filter call .reset() without arguments.

        :param args: a Q object or logical combination of Q objects
        :param kwargs: all AND filter criteria
        :return: self
        """
        # a new filter invalidates the cached evaluation
        self._evaluated = None
        self.filter_criteria = self._get_filter_criteria(*args, **kwargs)
        self.collection = FilteredCollection(
            self.collection, query=self.filter_criteria)
        return self

    def query(self, *args, **kwargs):
        """
        return a new MDataFrame with a filter criteria

        Any subsequent operation on the new dataframe will have the filter
        applied. To reset the filter call .reset() without arguments.

        Note: Unlike pandas DataFrames, a filtered MDataFrame operates
        on the same collection as the original DataFrame

        :param args: a Q object or logical combination of Q objects
        :param kwargs: all AND filter criteria
        :return: a new MDataFrame with the filter applied
        """
        effective_filter = dict(self.filter_criteria)
        filter_criteria = self._get_filter_criteria(*args, **kwargs)
        # NOTE(review): in this copy the new criteria were computed but not
        # merged and the if had no body. The merge below is a reconstruction
        # based on MongoDB's $and operator -- confirm against upstream
        if '$and' in effective_filter:
            # build a new list so the filter of this frame is not changed
            effective_filter['$and'] = list(effective_filter['$and']) + [filter_criteria]
        else:
            effective_filter.update(filter_criteria)
        coll = FilteredCollection(self.collection, query=effective_filter)
        return self._clone(collection=coll, query=effective_filter)

    def create_index(self, keys, **kwargs):
        """
        create and index the easy way

        :param keys: the index specification, passed to
           MongoQueryOps.make_index
        :param kwargs: options passed on to the collection's create_index
        :return: the result of the collection's create_index
        """
        idx_keys, idx_kwargs = MongoQueryOps().make_index(keys)
        # the original rebound kwargs here and so dropped the caller's
        # options; options given by the caller take precedence
        idx_kwargs = dict(idx_kwargs or {}, **kwargs)
        result = self.collection.create_index(idx_keys, **idx_kwargs)
        return result

    def list_indexes(self):
        """
        list all indices of this frame's collection

        :return: the result of the collection's list_indexes() as a
           DataFrame
        """
        return cursor_to_dataframe(self.collection.list_indexes())

    @property
    def loc(self):
        """
        Access by index

        Use as mdf.loc[index_value]

        :return: MLocIndexer
        """
        # a property, as the usage above subscripts the attribute directly
        self._evaluated = None
        indexer = MLocIndexer(self)
        return indexer

    @property
    def iloc(self):
        """
        Access through an MPosIndexer

        :return: MPosIndexer
        """
        # NOTE(review): made a property in parallel to loc -- confirm
        self._evaluated = None
        indexer = MPosIndexer(self)
        return indexer

    def __repr__(self):
        """return the collection name and the copy parameters as text"""
        kwargs = ', '.join('{}={}'.format(k, v) for k, v in six.iteritems(self._getcopy_kwargs()))
        # the kwargs= argument and the closing parenthesis were cut off
        return "MDataFrame(collection={collection.name}, {kwargs})".format(collection=self.collection,
                                                                           kwargs=kwargs)
예제 #4
 def test_find(self):
     """find on a collection filtered by x == 1 is expected to yield two documents."""
     query = {'x': 1}
     fcoll = FilteredCollection(self.coll, query=query)
     result = list(fcoll.find())
     # assertEqual reports both values on failure, assertTrue only "False"
     self.assertEqual(len(result), 2)