def test_delete(self):
    self._save_observations()
    records = Observation.find(self.dataset)
    self.assertNotEqual(records, [])

    Observation.delete_all(self.dataset)
    records = [x for x in Observation.find(self.dataset)]

    self.assertEqual(records, [])
def remove_parent_observations(self, parent_id):
    """Remove observations for this dataset with the passed `parent_id`.

    :param parent_id: Remove observations with this ID as their parent
        dataset ID.
    """
    Observation.delete_all(self, {PARENT_DATASET_ID: parent_id})
def action(dataset, data=data):
    data = safe_json_loads(data)
    Observation.update(dataset, int(index), data)

    return {
        self.SUCCESS: 'Updated row with index "%s".' % index,
        Dataset.ID: dataset_id}
def _save_records(self):
    Observation.save(self.get_data('good_eats.csv'), self.dataset)
    records = Observation.find(self.dataset)
    self.assertTrue(isinstance(records, list))
    self.assertTrue(isinstance(records[0], dict))
    self.assertTrue('_id' in records[0].keys())

    return records
def test_delete_all(self):
    self.__save_records()
    records = Observation.find(self.dataset)
    self.assertNotEqual(records, [])

    Observation.delete_all(self.dataset)
    records = Observation.find(self.dataset)

    self.assertEqual(records, [])
def append_observations(self, dframe):
    Observation.append(dframe, self)
    self.update({self.NUM_ROWS: self.num_rows + len(dframe)})

    # to update cardinalities here we need to refetch the full DataFrame
    dframe = self.dframe(keep_parent_ids=True)
    self.build_schema(dframe)
    self.update_stats(dframe)
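# Hypothetical usage sketch for append_observations above. Assumes a saved
# Dataset instance (`dataset`) and pandas; column names are illustrative only.
from pandas import DataFrame

new_rows = DataFrame({'rating': ['delectible'], 'amount': [3.5]})
dataset.append_observations(new_rows)
# num_rows, the schema, and the column stats are refreshed as part of the call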
def delete_task(dataset, query=None):
    """Background task to delete dataset and its associated observations."""
    Observation.delete_all(dataset, query=query)

    if query is None:
        super(dataset.__class__, dataset).delete(
            {DATASET_ID: dataset.dataset_id})
        Observation.delete_encoding(dataset)
def __save_records(self):
    Observation.save(self.get_data('good_eats.csv'), self.dataset)
    records = Observation.find(self.dataset)
    self.assertTrue(isinstance(records, list))
    self.assertTrue(isinstance(records[0], dict))
    self.assertTrue('_id' in records[0].keys())

    return records
def remove_parent_observations(self, parent_id):
    """Remove observations for this dataset with the passed `parent_id`.

    :param parent_id: Remove observations with this ID as their parent
        dataset ID.
    """
    Observation.delete_all(self, {PARENT_DATASET_ID: parent_id})

    # clear the cached dframe
    self.__dframe = None
def test_delete_encoding(self):
    self.__save_records()
    encoding = Observation.encoding(self.dataset)
    self.assertTrue(isinstance(encoding, dict))

    Observation.delete_encoding(self.dataset)
    encoding = Observation.encoding(self.dataset)

    self.assertEqual(encoding, None)
def delete_observation(self, index):
    """Delete the observation at `index`.

    :param index: The index of an observation to delete.
    """
    Observation.delete(self, index)

    dframe = self.dframe()
    self.update({self.NUM_ROWS: len(dframe)})
    self.build_schema(dframe, overwrite=True)
def delete_observation(self, index):
    """Delete the observation at `index`.

    :param index: The index of an observation to delete.
    """
    Observation.delete(self, index)

    dframe = self.dframe()
    self.update({self.NUM_ROWS: len(dframe)})
    self.build_schema(dframe, overwrite=True)
    call_async(propagate, self, update={'delete': index})
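# Hypothetical usage sketch for delete_observation above. Assumes `dataset`
# is a saved Dataset with at least one row.
dataset.delete_observation(0)
# the row count and schema are rebuilt from the remaining rows, and the
# deletion is propagated asynchronously to any dependent child datasets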
def test_delete_one(self):
    self.__save_records()
    records = Observation.find(self.dataset)
    self.assertNotEqual(records, [])
    row = self.__decode(records[0])

    Observation.delete(self.dataset, row[INDEX])

    new_records = Observation.find(self.dataset)

    # Dump to avoid problems with nan != nan.
    self.assertEqual(dump_mongo_json(records[1:]),
                     dump_mongo_json(new_records))
def replace_observations(self, dframe, overwrite=False,
                         set_num_columns=True):
    """Remove all rows for this dataset and save the rows in `dframe`.

    :param dframe: Replace rows in this dataset with this DataFrame's rows.

    :returns: BambooFrame equivalent to the passed in `dframe`.
    """
    self.build_schema(dframe, overwrite=overwrite,
                      set_num_columns=set_num_columns)
    dframe = self.add_id_column_to_dframe(dframe)
    Observation.delete_all(self)

    return self.save_observations(dframe)
def observations(self, query_args=None, as_cursor=False):
    """Return observations for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param as_cursor: Return the observations as a cursor.
    """
    return Observation.find(self, query_args or QueryArgs(),
                            as_cursor=as_cursor)
def replace_observations(self, dframe, overwrite=False,
                         set_num_columns=True):
    """Remove all rows for this dataset and save the rows in `dframe`.

    :param dframe: Replace rows in this dataset with this DataFrame's rows.
    :param overwrite: If true replace the schema, otherwise update it.
        Default False.
    :param set_num_columns: If true update the dataset's stored number of
        columns. Default True.

    :returns: DataFrame equivalent to the passed in `dframe`.
    """
    self.build_schema(dframe, overwrite=overwrite,
                      set_num_columns=set_num_columns)
    Observation.delete_all(self)

    return self.save_observations(dframe)
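# Minimal sketch of replace_observations above, assuming a saved Dataset
# (`dataset`) and pandas; column names are illustrative, not from this repo.
from pandas import DataFrame

new_frame = DataFrame({'rating': ['epic_eat', 'delectible'], 'amount': [2, 4]})
# overwrite=True rebuilds the schema from new_frame instead of merging into it
dataset.replace_observations(new_frame, overwrite=True)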
def test_find_with_select_and_query(self):
    self.__save_records()
    self.query_args.select = {"rating": 1}
    rows = Observation.find(self.dataset, self.query_args)
    self.assertTrue(isinstance(rows, list))

    row = self.__decode(rows[0])

    self.assertEquals(sorted(row.keys()), ['_id', 'rating'])
def delete_columns(self, columns):
    """Delete the columns in `columns` from this dataset.

    :param columns: The columns to delete.
    """
    requested_columns = set(to_list(columns))
    columns = set(self.schema.keys()).intersection(requested_columns)

    if not len(columns):
        raise ArgumentError(
            "Columns: %s not in dataset." % list(requested_columns))

    Observation.delete_columns(self, columns)
    new_schema = self.schema

    [new_schema.pop(c) for c in columns]

    self.set_schema(new_schema, set_num_columns=True)

    return columns
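# Sketch of delete_columns above with hypothetical column names: names not in
# the schema are dropped from the request, and an ArgumentError is raised if
# nothing is left to delete.
deleted = dataset.delete_columns(['amount', 'not_a_column'])
# `deleted` contains only the columns that actually existed, e.g. {'amount'}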
def test_delete(self):
    for dataset_name in self.TEST_DATASETS:
        dataset = Dataset.create(self.test_dataset_ids[dataset_name])
        records = Dataset.find(self.test_dataset_ids[dataset_name])
        self.assertNotEqual(records, [])

        dataset.delete()

        records = Dataset.find(self.test_dataset_ids[dataset_name])
        self.assertEqual(records, [])
        self.assertEqual(Observation.encoding(dataset), None)
def __create_or_update(self, url=None, csv_file=None, json_file=None,
                       schema=None, na_values=[], perish=0, dataset_id=None):
    result = None
    error = 'url, csv_file, json_file or schema required'

    try:
        if schema or url or csv_file or json_file:
            if dataset_id is None:
                dataset = Dataset()
                dataset.save()
            else:
                dataset = Dataset.find_one(dataset_id)
                Observation.delete_all(dataset)

            if schema:
                dataset.import_schema(schema)

            na_values = safe_json_loads(na_values)

            if url:
                dataset.import_from_url(url, na_values=na_values)
            elif csv_file:
                dataset.import_from_csv(csv_file, na_values=na_values)
            elif json_file:
                dataset.import_from_json(json_file)

            result = {Dataset.ID: dataset.dataset_id}

            perish = parse_int(perish)
            perish and dataset.delete(countdown=perish)
    except urllib2.URLError:
        error = 'could not load: %s' % url
    except IOError:
        error = 'could not get a filehandle for: %s' % csv_file
    except JSONError as e:
        error = e.__str__()

    self.set_response_params(result, success_status_code=201)

    return self._dump_or_error(result, error)
def test_encoding(self):
    self.__save_records()
    encoding = Observation.encoding(self.dataset)

    for column in self.dataset.dframe().columns:
        if column == MONGO_ID:
            column = MONGO_ID_ENCODED

        self.assertTrue(column in encoding.keys())

    for v in encoding.values():
        self.assertTrue(isinstance(int(v), int))
def observations(self, query=None, select=None, limit=0, order_by=None,
                 as_cursor=False):
    """Return observations for this dataset.

    :param query: Optional query for MongoDB to limit rows returned.
    :param select: Optional select for MongoDB to limit columns.
    :param limit: If greater than 0, limit number of observations returned
        to this maximum.
    :param order_by: Order the returned observations.
    """
    return Observation.find(self, query, select, limit=limit,
                            order_by=order_by, as_cursor=as_cursor)
def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
           index=False, reload_=False, keep_mongo_keys=False):
    """Fetch the dframe for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Used for joining, default False.
    :param index: Return the index with dframe, default False.
    :param reload_: Force refresh of data, default False.
    :param keep_mongo_keys: Used for updating documents, default False.

    :returns: Return DataFrame with contents based on query parameters
        passed to MongoDB. DataFrame will not have parent ids if
        `keep_parent_ids` is False.
    """
    # bypass the cache if we need a specific version
    cacheable = not (query_args or keep_parent_ids or padded)

    # use the cached copy if we have already fetched it
    if cacheable and not reload_ and self.__is_cached:
        return self.__dframe

    query_args = query_args or QueryArgs()
    observations = self.observations(query_args, as_cursor=True)

    if query_args.distinct:
        return DataFrame(observations)

    dframe = Observation.batch_read_dframe_from_cursor(
        self, observations, query_args.distinct, query_args.limit)

    dframe = df_mongo_decode(dframe, keep_mongo_keys=keep_mongo_keys)

    excluded = [keep_parent_ids and PARENT_DATASET_ID, index and INDEX]
    dframe = remove_reserved_keys(dframe, filter(bool, excluded))

    if index:
        dframe.rename(columns={INDEX: 'index'}, inplace=True)

    dframe = self.__maybe_pad(dframe, padded)

    if cacheable:
        self.__dframe = dframe

    return dframe
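# Sketch of the caching behaviour in dframe() above. Assumes a saved Dataset
# (`dataset`); the QueryArgs usage mirrors how it appears in these snippets,
# and 'rating' is a hypothetical column name.
plain = dataset.dframe()               # cacheable: stored on the instance
plain_again = dataset.dframe()         # served from the cached copy
fresh = dataset.dframe(reload_=True)   # forces a refetch from MongoDB

# passing query_args (or keep_parent_ids/padded) bypasses the cache entirely
subset = dataset.dframe(query_args=QueryArgs(select={'rating': 1}))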
def observations(self, query_args=QueryArgs(), as_cursor=False):
    """Return observations for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param as_cursor: Return the observations as a cursor.
    """
    if query_args.distinct:
        as_cursor = True

    observations = Observation.find(self, query_args, as_cursor=as_cursor)

    if query_args.distinct:
        observations = observations.distinct(query_args.distinct)

    return observations
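# Sketch of the distinct handling in observations() above, assuming QueryArgs
# accepts a `distinct` keyword (only its attribute is read above, so this is
# an assumption); 'rating' is a hypothetical column name.
distinct_ratings = dataset.observations(QueryArgs(distinct='rating'))
documents = dataset.observations(QueryArgs(), as_cursor=True)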
def get_value(self, period):
    value = None

    if self.dataset:
        fields = Observation.encoding(self.dataset)
        fields["dataset"] = self.dataset
        fields['dataset_id_field'] = fields[DATASET_ID]
        fields['period'] = period
        query = json.loads(Template(self.final_str).render(fields))
        form_meta_timeend = '%(form_meta_timeend)s' % fields
        query[0]['$match'][form_meta_timeend]['$gte'] = period.start
        query[0]['$match'][form_meta_timeend]['$lte'] = period.end
        aggregate_value = self._db.observations.aggregate(query)

        if not aggregate_value['result']:
            return 0

        value = aggregate_value['result'][0]['total_num']

    return value
def test_dframe(self):
    dataset = Dataset.create(self.test_dataset_ids['good_eats.csv'])
    dataset.save_observations(
        recognize_dates(self.get_data('good_eats.csv')))
    records = [x for x in Observation.find(dataset)]
    dframe = dataset.dframe()

    self.assertTrue(isinstance(dframe, DataFrame))
    self.assertTrue(all(self.get_data('good_eats.csv').reindex(
        columns=dframe.columns).eq(dframe)))
    columns = dframe.columns

    # ensure no reserved keys
    for key in MONGO_RESERVED_KEY_STRS:
        self.assertFalse(key in columns)

    # ensure date is converted
    self.assertTrue(isinstance(dframe.submit_date[0], datetime))
def test_edit_row(self):
    dataset_id = self._post_file()
    index = 0
    update = {'amount': 10, 'food_type': 'breakfast'}
    expected_dframe = Dataset.find_one(dataset_id).dframe()
    expected_row = expected_dframe.ix[0].to_dict()
    expected_row.update(update)
    expected_dframe.ix[0] = Series(expected_row)

    results = json.loads(
        self.controller.row_update(dataset_id, index, json.dumps(update)))
    self.assertTrue(Datasets.SUCCESS in results.keys())

    dataset = Dataset.find_one(dataset_id)
    dframe = dataset.dframe()

    self.assertEqual(self.NUM_ROWS, len(dframe))
    self._check_dframes_are_equal(expected_dframe, dframe)

    # check that previous row exists
    all_observations = Observation.find(dataset, include_deleted=True)
    self.assertEqual(self.NUM_ROWS + 1, len(all_observations))
def get_value(self, period):
    value = None

    if self.dataset:
        fields = Observation.encoding(self.dataset)
        fields["dataset"] = self.dataset
        fields['dataset_id_field'] = fields[DATASET_ID]
        fields['period'] = period
        mapper = Code(Template(self.mapper_str).render(fields))
        reducer = Code(Template(self.reducer_str).render(fields))
        query = json.loads(Template(self.query_str).render(fields))
        query['%(form_meta_timeend)s' % fields]['$gte'] = period.start
        query['%(form_meta_timeend)s' % fields]['$lte'] = period.end
        aggregate = json.loads(Template(self.aggregate_str).render(fields))
        results = self._db.observations.map_reduce(
            mapper, reducer, 'myresults_malaria', query=query)
        value = None

        if results.count():
            aggregate_value = results.aggregate(aggregate)
            value = aggregate_value['result'][0]['total']

        self._db.myresults_malaria.drop()

    return value
def test_delete_row(self):
    dataset_id = self._post_file()
    dataset = Dataset.find_one(dataset_id)
    index = 0
    expected_dframe = Dataset.find_one(
        dataset_id).dframe()[index + 1:].reset_index()
    del expected_dframe['index']

    results = json.loads(self.controller.row_delete(dataset_id, index))
    self.assertTrue(Datasets.SUCCESS in results.keys())

    dataset = Dataset.find_one(dataset_id)
    dframe = dataset.dframe()

    self.assertEqual(self.NUM_ROWS - 1, len(dframe))
    self._check_dframes_are_equal(expected_dframe, dframe)

    # check info updated
    info = dataset.info()
    self.assertEqual(self.NUM_ROWS - 1, info[Dataset.NUM_ROWS])

    # check that the row is soft deleted
    all_observations = Observation.find(dataset, include_deleted=True)
    self.assertEqual(self.NUM_ROWS, len(all_observations))
def update_observations(self, dframe):
    return Observation.update_from_dframe(dframe, self)
def update_observation(self, index, data):
    # check that the update is valid
    dframe_from_update(self, [data])
    Observation.update(self, index, data)

    call_async(propagate, self, update={'edit': [index, data]})
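# Sketch of update_observation above: the update is validated by building a
# one-row frame from it, written at `index`, then propagated asynchronously
# to dependent datasets. Column names are illustrative.
dataset.update_observation(0, {'amount': 10, 'food_type': 'breakfast'})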
def action(dataset):
    row = Observation.find_one(dataset, parse_int(index))

    if row:
        return row.clean_record
def test_find_with_select(self):
    self._save_observations()
    query_args = QueryArgs(select={"rating": 1})
    rows = Observation.find(self.dataset, query_args)
    self.assertTrue(isinstance(rows, list))
    self.assertEquals(sorted(rows[0].keys()), ['_id', 'rating'])
def test_find_with_query(self):
    self._save_observations()
    rows = Observation.find(self.dataset, self.query_args)
    self.assertTrue(isinstance(rows, list))
def test_find(self):
    self._save_observations()
    rows = Observation.find(self.dataset)
    self.assertTrue(isinstance(rows, list))
def test_save_over_bulk(self):
    Observation.save(self.get_data('good_eats_large.csv'), self.dataset)
    records = Observation.find(self.dataset)

    self.assertEqual(len(records), 1001)
def __decode(self, row):
    return Observation.encode(
        row, encoding=Observation.decoding(self.dataset))
def test_encode_no_dataset(self):
    records = self.__save_records()

    for record in records:
        encoded = Observation.encode(record)
        self.assertEqual(dump_mongo_json(encoded), dump_mongo_json(record))
def save_observations(self, dframe):
    """Save rows in `dframe` for this dataset.

    :param dframe: DataFrame to save rows from.
    """
    return Observation.save(dframe, self)
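# Sketch of the create-then-save flow exercised by the tests above. The
# dataset id and column names are assumptions used only for illustration.
from pandas import DataFrame

dataset = Dataset.create('example-dataset-id')
dataset.save_observations(
    DataFrame({'rating': ['delectible'], 'amount': [3.5]}))
dframe = dataset.dframe()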
def test_find(self):
    self.__save_records()
    rows = Observation.find(self.dataset)
    self.assertTrue(isinstance(rows, list))
def test_find_with_query(self):
    self.__save_records()
    rows = Observation.find(self.dataset, self.query_args)
    self.assertTrue(isinstance(rows, list))
{"{{dataset_id_field}}": "{{dataset.dataset_id}}", "{{form_meta_timeend}}": { "$gte": "{{period.start}}", "$lte": "{{period.end}}" } } """ aggregate_str = """ {"$group": {"_id": 0, "total": {"$sum": "$value.{{num_using_fp}}"}}} """ dataset_id = "5791793ac29b4d77b20cf1a04d8e7161" dataset = Dataset.find_one(dataset_id) period = Period.month_period(2013, 3) if dataset: fields = Observation.encoding(dataset) fields["dataset"] = dataset fields['dataset_id_field'] = fields[DATASET_ID] fields['period'] = Period.month_period(2013, 3) mapper = Code(Template(mapper_str).render(fields)) reducer = Code(Template(reducer_str).render(fields)) query = json.loads(Template(query_str).render(fields)) query['%(form_meta_timeend)s' % fields]['$gte'] = period.start query['%(form_meta_timeend)s' % fields]['$lte'] = period.end aggregate = json.loads(Template(aggregate_str).render(fields)) results = db.observations.map_reduce(mapper, reducer, 'myresults_fp', query=query) if results.count(): value = results.aggregate(aggregate)