def _count(self): count_columns = self._non_group_columns() if len(count_columns) == 0: count_columns.append('_'.join(self.columns) + '_count') groupby = { "$group": { "_id": {k: "$%s" % k for k in self.columns}, } } for k in count_columns: groupby['$group']['%s' % k] = {"$sum": 1} pipeline = self._amend_pipeline([groupby]) if self.should_sort: sort = qops.SORT(**dict(qops.make_sortkey('_id'))) pipeline.append(sort) return list(self.collection.aggregate(pipeline, allowDiskUse=True))
def _amend_pipeline(self, pipeline): """ amend pipeline with default ops on coll.aggregate() calls """ if self.sort_order: sort = qops.SORT(**dict(qops.make_sortkey(self.sort_order))) pipeline.append(sort) return pipeline
def merge(self, right, on=None, left_on=None, right_on=None, how='inner',
          target=None, suffixes=('_x', '_y'), sort=False, inspect=False):
    """
    merge this dataframe with another dataframe

    Left, inner and right merges are supported. A right merge is run as
    a left merge with the two frames swapped. The merge is built as an
    aggregation pipeline (lookup, unwind, project, optional sort, out)
    and its output is saved as a new collection, target name (defaults
    to a generated name if not specified).

    :param right: the other MDataFrame. A Collection is wrapped into an
       MDataFrame first
    :param on: the name of the key column to merge by. Only a single
       column is supported. Defaults to '_id'
    :param left_on: the name of the key column to merge on this
       dataframe (single column)
    :param right_on: the name of the key column to merge on the other
       dataframe (single column)
    :param how: the method to merge. supported are left, inner, right.
       Defaults to inner
    :param target: the name of the collection to store the merge results
       in. If not provided a temporary name will be created.
    :param suffixes: the suffixes to apply to identical left and right
       columns
    :param sort: if True the merge results will be sorted. If False the
       MongoDB natural order is implied.
    :param inspect: if True the pipeline is returned and not executed
    :returns: the MDataFrame to the target MDataFrame, or the list of
       pipeline stages if inspect is True
    :raises AssertionError: if how is not supported, a key is not a
       string, or right is neither an MDataFrame nor a Collection
    """
    # validate input
    supported_how = ["left", 'inner', 'right']
    assert how in supported_how, (
        "only %s merges are currently supported" % (supported_how,))
    for key in [on, left_on, right_on]:
        if key:
            # format with a 1-tuple so that a tuple-valued key is reported
            # in the message instead of breaking the % formatting
            assert isinstance(key, six.string_types), (
                "only single column merge keys are supported (%s)" % (key,))
    if isinstance(right, Collection):
        right = MDataFrame(right)
    assert isinstance(right, MDataFrame), (
        "both must be MDataFrames, got right=%s" % (type(right),))
    if how == 'right':
        # A right B == B left A
        # NOTE(review): suffixes are passed through unswapped, so after the
        # swap suffixes[0] applies to the columns of `right` -- confirm this
        # is the intended naming for right merges
        return right.merge(self, on=on, left_on=right_on, right_on=left_on,
                           how='left', target=target, suffixes=suffixes,
                           sort=sort, inspect=inspect)
    # generate lookup parameters
    # NOTE(review): since `on` always has a value from here on, the
    # `on or left_on` / `on or right_on` expressions below never fall back
    # to left_on / right_on -- confirm the intended precedence against
    # qops.LOOKUP before changing them
    on = on or '_id'
    right_name = self._get_collection_name_of(right, right)
    target_name = self._get_collection_name_of(
        target, '_temp.merge.%s' % uuid4().hex)
    target_field = (
        "%s_%s" % (right_name.replace('.', '_'), right_on or on))
    lookup = qops.LOOKUP(right_name,
                         key=on,
                         left_key=left_on,
                         right_key=right_on,
                         target=target_field)
    # unwind merged documents from arrays to top-level document fields
    unwind = qops.UNWIND(target_field, preserve=how != 'inner')
    # get all fields from left, right
    hidden_prefixes = ('_idx', '_om#')
    project = {}
    for left_col in self.columns:
        if left_col == '_id':
            project[left_col] = 1
            continue
        if left_col.startswith(hidden_prefixes):
            continue
        target_col = left_col
        if left_col != (on or left_on) and left_col in right.columns:
            # same column name on both sides, disambiguate the left one
            target_col = '%s%s' % (left_col, suffixes[0])
        project[target_col] = "$%s" % left_col
    for right_col in right.columns:
        if right_col == '_id' or right_col.startswith(hidden_prefixes):
            continue
        if right_col == (on or right_on) and right_col == (on or left_on):
            # if the merge field is the same in both frames, we already
            # have it from left
            continue
        if right_col in self.columns:
            target_col = '%s%s' % (right_col, suffixes[1])
        else:
            target_col = '%s' % right_col
        # right columns are read from the unwound lookup field
        project[target_col] = '$%s.%s' % (target_field, right_col)
    expected_columns = list(project.keys())
    project = {"$project": project}
    # store merged documents and return an MDataFrame to it
    out = qops.OUT(target_name)
    pipeline = [lookup, unwind, project]
    if sort:
        sort_cols = make_list(on or [left_on, right_on])
        sort_key = qops.make_sortkey(sort_cols)
        # named sort_stage so the `sort` parameter is not shadowed
        sort_stage = qops.SORT(**dict(sort_key))
        pipeline.append(sort_stage)
    pipeline.append(out)
    if inspect:
        return pipeline
    self.collection.aggregate(pipeline, allowDiskUse=True)
    return MDataFrame(self.collection.database[target_name],
                      force_columns=expected_columns)