示例#1
0
 def _count(self):
     """
     count the documents in each group of the group columns

     Runs a ``$group`` aggregation keyed by ``self.columns``. Every column
     returned by ``self._non_group_columns()`` receives a ``{"$sum": 1}``
     counter; if there is no such column a single counter named
     ``<joined group columns>_count`` is used instead. The pipeline is
     passed through ``self._amend_pipeline()`` and, if ``self.should_sort``
     is set, a sort stage on ``_id`` is appended.

     :returns: the list of documents returned by the aggregation
     """
     targets = self._non_group_columns()
     if not targets:
         # nothing to count explicitly, fall back to one synthetic counter
         targets.append('_'.join(self.columns) + '_count')
     # the group key comes first, followed by one counter per target column
     group_spec = {"_id": {col: "$%s" % col for col in self.columns}}
     group_spec.update(('%s' % name, {"$sum": 1}) for name in targets)
     stages = self._amend_pipeline([{"$group": group_spec}])
     if self.should_sort:
         sortkey = dict(qops.make_sortkey('_id'))
         stages.append(qops.SORT(**sortkey))
     cursor = self.collection.aggregate(stages, allowDiskUse=True)
     return list(cursor)
示例#2
0
 def _amend_pipeline(self, pipeline):
     """
     add the default stages to a pipeline used in coll.aggregate() calls

     If ``self.sort_order`` is set, a sort stage built from it is appended
     to the given pipeline. The list is modified in place.

     :param pipeline: the list of aggregation stages
     :returns: the same pipeline list
     """
     if not self.sort_order:
         # no default sort configured, leave the pipeline untouched
         return pipeline
     sortkey = dict(qops.make_sortkey(self.sort_order))
     pipeline.append(qops.SORT(**sortkey))
     return pipeline
示例#3
0
    def merge(self, right, on=None, left_on=None, right_on=None,
              how='inner', target=None, suffixes=('_x', '_y'),
              sort=False, inspect=False):
        """
        merge this dataframe with another dataframe. inner, left and right
        merges are supported. the merge is built as an aggregation pipeline
        (lookup, unwind, project, optional sort, out) and the output is
        saved as a new collection, target name (defaults to a generated name
        if not specified).

        :param right: the other MDataFrame. A Collection is wrapped into
           an MDataFrame.
        :param on: the key column to merge by, defaults to ``_id``. Only a
           single column name is supported.
        :param left_on: the key column to merge on this dataframe
        :param right_on: the key column to merge on the other dataframe
        :param how: the method to merge. supported are left, inner, right.
           Defaults to inner. A right merge is executed as a left merge
           of ``right`` with this dataframe.
        :param target: the name of the collection to store the merge results
           in. If not provided a temporary name will be created.
        :param suffixes: the suffixes to apply to identical left and right
           columns
        :param sort: if True the merge results will be sorted. If False the
           MongoDB natural order is implied.
        :param inspect: if True the pipeline is returned instead of being
           executed
        :returns: the MDataFrame to the target collection, or the pipeline
           (list of stages) if inspect is True
        :raises AssertionError: if how is not supported, a merge key is not
           a single column name, or right is not an MDataFrame
        """
        # validate input
        supported_how = ["left", 'inner', 'right']
        assert how in supported_how, "only %s merges are currently supported" % supported_how
        for key in [on, left_on, right_on]:
            if key:
                # wrap key in a tuple so that a tuple of keys is reported
                # by the assertion instead of breaking the % formatting
                assert isinstance(key, six.string_types), (
                    "only single column merge keys are supported (%s)" % (key,))
        if isinstance(right, Collection):
            right = MDataFrame(right)
        assert isinstance(right, MDataFrame), (
            "both must be MDataFrames, got right=%s" % (type(right),))
        if how == 'right':
            # A right B == B left A
            # NOTE(review): suffixes are passed on unchanged, so suffixes[0]
            # is applied to the columns of `right` in this case -- confirm
            # whether they should be swapped
            return right.merge(self, on=on, left_on=right_on, right_on=left_on,
                               how='left', target=target, suffixes=suffixes,
                               sort=sort, inspect=inspect)
        # generate lookup parameters
        on = on or '_id'
        right_name = self._get_collection_name_of(right, right)
        target_name = self._get_collection_name_of(
            target, '_temp.merge.%s' % uuid4().hex)
        target_field = (
                "%s_%s" % (right_name.replace('.', '_'), right_on or on))
        lookup = qops.LOOKUP(right_name,
                             key=on,
                             left_key=left_on,
                             right_key=right_on,
                             target=target_field)
        # unwind merged documents from arrays to top-level document fields
        unwind = qops.UNWIND(target_field, preserve=how != 'inner')
        # get all fields from left, right
        # NOTE(review): `on` is always set at this point, so it alone
        # decides which column is treated as the shared merge key below;
        # left_on/right_on were never consulted by the original
        # `(on or left_on)` / `(on or right_on)` expressions -- confirm
        # against qops.LOOKUP before changing this
        left_columns = self.columns
        right_columns = right.columns
        hidden_prefixes = ('_idx', '_om#')
        project = {}
        for left_col in left_columns:
            if left_col == '_id':
                project[left_col] = 1
                continue
            if left_col.startswith(hidden_prefixes):
                continue
            if left_col != on and left_col in right_columns:
                # same name on both sides, disambiguate the left column
                out_col = '%s%s' % (left_col, suffixes[0])
            else:
                out_col = left_col
            project[out_col] = "$%s" % left_col
        for right_col in right_columns:
            if right_col == '_id' or right_col.startswith(hidden_prefixes):
                continue
            if right_col == on:
                # if the merge field is the same in both frames, we already
                # have it from left
                continue
            if right_col in left_columns:
                out_col = '%s%s' % (right_col, suffixes[1])
            else:
                out_col = '%s' % right_col
            # right columns live in the looked-up sub document
            project[out_col] = '$%s.%s' % (target_field, right_col)
        expected_columns = list(project.keys())
        project = {"$project": project}
        # store merged documents and return an MDataFrame to it
        out = qops.OUT(target_name)
        pipeline = [lookup, unwind, project]
        if sort:
            sort_cols = make_list(on)
            sort_key = qops.make_sortkey(sort_cols)
            pipeline.append(qops.SORT(**dict(sort_key)))
        pipeline.append(out)
        if inspect:
            return pipeline
        # the results are written to the target collection by the out
        # stage, the return value of aggregate is not needed
        self.collection.aggregate(pipeline, allowDiskUse=True)
        return MDataFrame(self.collection.database[target_name],
                          force_columns=expected_columns)