def test_remove_reserved_keys(self):
    """Check that `remove_reserved_keys` drops every reserved column.

    The fixture dframe is first seeded through
    `_add_bamboo_reserved_keys`; the test asserts each key in
    `BAMBOO_RESERVED_KEYS` is a column beforehand, so the removal
    assertions afterwards cannot pass vacuously.
    """
    self._add_bamboo_reserved_keys()

    # Precondition: every reserved key is a column before removal.
    for key in BAMBOO_RESERVED_KEYS:
        self.assertIn(key, self.dframe.columns)

    dframe = remove_reserved_keys(self.dframe)

    # assertNotIn reports the offending key and the columns on failure,
    # unlike assertFalse(key in ...), which only says "True is not false".
    for key in BAMBOO_RESERVED_KEYS:
        self.assertNotIn(key, dframe.columns)
def test_remove_reserved_keys_exclusion(self):
    """Check that keys passed as exclusions survive reserved-key removal.

    `PARENT_DATASET_ID` is passed as the only exclusion, so it must
    remain a column while every other key in `BAMBOO_RESERVED_KEYS`
    is removed.
    """
    self._add_bamboo_reserved_keys()

    # Precondition: every reserved key is a column before removal.
    for key in BAMBOO_RESERVED_KEYS:
        self.assertIn(key, self.dframe.columns)

    dframe = remove_reserved_keys(self.dframe, [PARENT_DATASET_ID])

    # assertIn/assertNotIn name the key and the columns on failure.
    for key in BAMBOO_RESERVED_KEYS:
        if key == PARENT_DATASET_ID:
            self.assertIn(key, dframe.columns)
        else:
            self.assertNotIn(key, dframe.columns)
def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
           index=False, reload_=False, keep_mongo_keys=False):
    """Fetch the dframe for this dataset.

    A call with all shape-affecting arguments at their defaults is served
    from, and stored in, a per-instance cache. Any argument that changes
    the contents of the result (`query_args`, `keep_parent_ids`,
    `padded`, `index`, `keep_mongo_keys`) bypasses the cache in both
    directions.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Used for joining, default False.
    :param index: Return the index with dframe (as a column named
        'index'), default False.
    :param reload_: Force refresh of data, default False.
    :param keep_mongo_keys: Used for updating documents, default False.

    :returns: Return DataFrame with contents based on query parameters
        passed to MongoDB. DataFrame will not have parent ids if
        `keep_parent_ids` is False. For a distinct query the raw
        observations are wrapped in a DataFrame and returned without
        decoding or reserved-key removal.
    """
    # Bypass the cache whenever a specific variant is requested. `index`
    # and `keep_mongo_keys` change the columns of the result, so they must
    # neither read a cached default frame nor overwrite it.
    cacheable = not (query_args or keep_parent_ids or padded or index or
                     keep_mongo_keys)

    # Use the cached copy if we have already fetched it.
    if cacheable and not reload_ and self.__is_cached:
        return self.__dframe

    query_args = query_args or QueryArgs()
    observations = self.observations(query_args, as_cursor=True)

    # Distinct queries short-circuit: no decoding, padding or caching.
    if query_args.distinct:
        return DataFrame(observations)

    # `query_args.distinct` is necessarily falsy here because of the
    # early return above; it is still forwarded to keep the call unchanged.
    dframe = Observation.batch_read_dframe_from_cursor(
        self, observations, query_args.distinct, query_args.limit)

    dframe = df_mongo_decode(dframe, keep_mongo_keys=keep_mongo_keys)

    # Reserved keys the caller asked to keep. Built as a real list rather
    # than `filter(...)`, which is a one-shot iterator on Python 3 and
    # would be exhausted after a single membership scan.
    candidates = (keep_parent_ids and PARENT_DATASET_ID, index and INDEX)
    excluded = [key for key in candidates if key]
    dframe = remove_reserved_keys(dframe, excluded)

    if index:
        dframe.rename(columns={INDEX: 'index'}, inplace=True)

    dframe = self.__maybe_pad(dframe, padded)

    if cacheable:
        self.__dframe = dframe

    return dframe