def __maybe_pad(self, dframe, pad):
    """Optionally pad `dframe` using this dataset's place-holder frame.

    :param dframe: The frame to pad.
    :param pad: When falsy, `dframe` is handed back untouched.

    :returns: `dframe` itself when `pad` is falsy.  Otherwise, if `dframe`
        has at least one column, a BambooFrame built from joining `dframe`
        with the place-holder frame on `dframe`'s first column; if it has
        no columns, whatever `place_holder_dframe()` returns.
    """
    if not pad:
        return dframe

    if not len(dframe.columns):
        # Nothing to join on, so the place holder is the whole result.
        return self.place_holder_dframe()

    # The first column doubles as the join key.
    join_column = dframe.columns[0]
    padding = self.place_holder_dframe(dframe).set_index(join_column)

    return BambooFrame(dframe.join(padding, on=join_column))
def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
           index=False):
    """Fetch the dframe for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
        When omitted (or None) a fresh default QueryArgs is created for
        this call.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Join the result with the dataset's place-holder dframe
        on the result's first column (or return the place-holder dframe
        alone when the result has no columns), default False.
    :param index: Return the index with dframe, default False.

    :returns: Return BambooFrame with contents based on query parameters
        passed to MongoDB. BambooFrame will not have parent ids if
        `keep_parent_ids` is False.
    """
    # Build the default per call: a default written in the signature is
    # evaluated once at definition time and would be shared by every call.
    if query_args is None:
        query_args = QueryArgs()

    observations = self.observations(query_args, as_cursor=True)
    dframe = self.__batch_read_dframe_from_cursor(
        observations, query_args.distinct, query_args.limit)

    dframe.decode_mongo_reserved_keys()

    # Reserved keys listed here are passed as exclusions to the removal.
    excluded = []

    if keep_parent_ids:
        excluded.append(PARENT_DATASET_ID)

    if index:
        excluded.append(INDEX)

    dframe.remove_bamboo_reserved_keys(excluded)

    if index:
        # Expose the retained index column under the name 'index'.
        dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

    if padded:
        if len(dframe.columns):
            # The first column is used as the join key for the padding.
            on = dframe.columns[0]
            place_holder = self.place_holder_dframe(dframe).set_index(on)
            dframe = BambooFrame(dframe.join(place_holder, on=on))
        else:
            # Nothing to join on: the place holder is the whole result.
            dframe = self.place_holder_dframe()

    return dframe
def dframe(self, query=None, select=None, distinct=None,
           keep_parent_ids=False, limit=0, order_by=None, padded=False):
    """Build the dframe for this dataset from its observations.

    :param query: An optional query, forwarded to `observations`.
    :param select: An optional select to limit the fields in the dframe.
    :param distinct: Forwarded to the batch reader together with `limit`.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param limit: Limit on the number of rows in the returned dframe.
    :param order_by: Sort resulting rows according to a column value and
        sign indicating ascending or descending.
    :param padded: When true, join the result with the place-holder dframe
        on the result's first column, or return the place-holder dframe
        alone if the result has no columns. Default False.

    Example of `order_by`:

      - ``order_by='mycolumn'``
      - ``order_by='-mycolumn'``

    :returns: Return BambooFrame with contents based on query parameters
        passed to MongoDB. BambooFrame will not have parent ids if
        `keep_parent_ids` is False.
    """
    cursor = self.observations(
        query=query, select=select, limit=limit, order_by=order_by,
        as_cursor=True)

    frame = self._batch_read_dframe_from_cursor(cursor, distinct, limit)
    frame.decode_mongo_reserved_keys()
    frame.remove_bamboo_reserved_keys(keep_parent_ids)

    if not padded:
        return frame

    if not len(frame.columns):
        # No column to join on, so the place holder stands in entirely.
        return self.place_holder_dframe()

    # The first column is the key the place holder is joined on.
    join_column = frame.columns[0]
    padding = self.place_holder_dframe(frame).set_index(join_column)

    return BambooFrame(frame.join(padding, on=join_column))
class TestFrame(TestBase):
    """Tests for BambooFrame, built from the 'good_eats.csv' fixture."""

    def setUp(self):
        TestBase.setUp(self)
        self.dframe = self.get_data('good_eats.csv')
        self.bframe = BambooFrame(self.dframe)

    def _add_bamboo_reserved_keys(self, value=1):
        """Join one constant-valued column per bamboo reserved key."""
        for key in BAMBOO_RESERVED_KEYS:
            column = Series([value] * len(self.bframe))
            column.name = key
            self.bframe = BambooFrame(self.bframe.join(column))

    def test_add_parent_column(self):
        """Every row of the added parent-id column holds the given value."""
        value = 1
        self._add_bamboo_reserved_keys(value)

        # Iterate the values directly; the index labels are not needed.
        for item in self.bframe[PARENT_DATASET_ID]:
            self.assertEqual(item, value)

    def test_decode_mongo_reserved_keys(self):
        """Mongo reserved keys are columns before decoding, not after."""
        for col in MONGO_RESERVED_KEYS:
            self.assertIn(col, self.bframe.columns)

        self.bframe.decode_mongo_reserved_keys()

        for col in MONGO_RESERVED_KEYS:
            self.assertNotIn(col, self.bframe.columns)

    def test_recognize_dates(self):
        """'submit_date' values are datetimes after recognize_dates()."""
        bframe_with_dates = self.bframe.recognize_dates()

        for field in bframe_with_dates['submit_date']:
            self.assertIsInstance(field, datetime)

    def test_recognize_dates_from_schema(self):
        """'submit_date' values are datetimes when the schema says so."""
        schema = Schema({
            'submit_date': {
                SIMPLETYPE: DATETIME
            }
        })
        bframe_with_dates = self.bframe.recognize_dates_from_schema(schema)

        for field in bframe_with_dates['submit_date']:
            self.assertIsInstance(field, datetime)

    def test_remove_bamboo_reserved_keys(self):
        """All bamboo reserved keys are dropped when nothing is excluded."""
        self._add_bamboo_reserved_keys()

        for key in BAMBOO_RESERVED_KEYS:
            self.assertIn(key, self.bframe.columns)

        self.bframe.remove_bamboo_reserved_keys()

        for key in BAMBOO_RESERVED_KEYS:
            self.assertNotIn(key, self.bframe.columns)

    def test_remove_bamboo_reserved_keys_exclusion(self):
        """An excluded reserved key is kept while the others are dropped."""
        self._add_bamboo_reserved_keys()

        for key in BAMBOO_RESERVED_KEYS:
            self.assertIn(key, self.bframe.columns)

        self.bframe.remove_bamboo_reserved_keys([PARENT_DATASET_ID])

        for key in BAMBOO_RESERVED_KEYS:
            if key == PARENT_DATASET_ID:
                self.assertIn(key, self.bframe.columns)
            else:
                self.assertNotIn(key, self.bframe.columns)

    def test_only_rows_for_parent_id(self):
        """Only rows tagged with the parent id are returned, sans id column."""
        parent_id = 1
        # Floor division keeps this an int under true division as well, so
        # it stays usable as a list repetition count.
        len_parent_rows = len(self.bframe) // 2

        # Tag only the first half of the rows with the parent id.
        column = Series([parent_id] * len_parent_rows)
        column.name = PARENT_DATASET_ID

        self.bframe = BambooFrame(self.bframe.join(column))
        bframe_only = self.bframe.only_rows_for_parent_id(parent_id)

        self.assertNotIn(PARENT_DATASET_ID, bframe_only.columns)
        self.assertEqual(len(bframe_only), len_parent_rows)

    def test_to_jsondict(self):
        """to_jsondict() has one entry per row, each as wide as the frame."""
        jsondict = self.bframe.to_jsondict()

        self.assertEqual(len(jsondict), len(self.bframe))

        # Each entry is expected to carry one item per frame column.
        for record in jsondict:
            self.assertEqual(len(record), len(self.bframe.columns))

    def test_to_json(self):
        """to_json() returns a str."""
        json_str = self.bframe.to_json()

        self.assertIsInstance(json_str, str)