def import_dataset(_file, dataset):
    """Read a CSV source and persist its schema and rows for *dataset*."""
    frame = read_csv(_file)
    # record column dtypes before storing the rows themselves
    Dataset.build_schema(dataset, frame.dtypes)
    Observation.save(frame, dataset)
def test_delete(self):
    """Deleting a dataset's observations leaves nothing to find."""
    self._save_observations()
    before = list(Observation.find(self.dataset))
    self.assertNotEqual(before, [])
    Observation.delete(self.dataset)
    after = list(Observation.find(self.dataset))
    self.assertEqual(after, [])
def _save_records(self):
    """Save the fixture CSV, then fetch and sanity-check the stored records.

    Returns the records as a list of dicts (each carrying a mongo '_id').
    """
    Observation.save(self.test_data['good_eats.csv'], self.dataset)
    found = list(Observation.find(self.dataset))
    self.assertTrue(isinstance(found, list))
    self.assertTrue(isinstance(found[0], dict))
    self.assertTrue('_id' in found[0].keys())
    return found
def test_find_as_df(self):
    """find(as_df=True) returns a DataFrame matching the saved fixture."""
    self._save_observations()
    list(Observation.find(self.dataset))
    frame = Observation.find(self.dataset, as_df=True)
    self.assertTrue(isinstance(frame, DataFrame))
    # reindex the fixture to the stored column order before comparing
    self.assertEqual(
        self.test_data['good_eats.csv'].reindex(columns=frame.columns),
        frame)
    # reserved mongo keys must not leak into the frame's columns
    for reserved in MONGO_RESERVED_KEYS:
        self.assertFalse(prefix_reserved_key(reserved) in frame.columns)
def DELETE(self, dataset_id):
    """
    Delete observations (i.e. the dataset) with hash *dataset_id* from
    mongo.  Returns a success payload, or an error when no such dataset
    exists.
    """
    found = Dataset.find_one(dataset_id)
    outcome = None
    if found:
        # drop the dataset record first, then its stored observations
        Dataset.delete(dataset_id)
        Observation.delete(found)
        outcome = {SUCCESS: 'deleted dataset: %s' % dataset_id}
    return dump_or_error(outcome, 'id not found')
def test_find_with_select_and_query(self):
    """find() honors a query and a select projection together."""
    self._save_observations()
    cursor = Observation.find(self.dataset, '{"rating": "delectible"}',
                              '{"rating": 1}')
    self.assertTrue(isinstance(cursor, Cursor))
    rows = list(cursor)
    # only the projected field (plus _id) should come back
    self.assertEquals(sorted(rows[0].keys()), ['_id', 'rating'])
def observe(self, *, processes=1, period=1):
    """Build one Observation per configured observer and run each in turn."""
    log.info("Computing observations for campaign")
    self.observations = [
        Observation(self.dataset, obs, self.population, self.config)
        for obs in self.observers
    ]
    for current in self.observations:
        current.observe(processes=processes, period=period)
def save(cls, dataset, formula, name, **kwargs):
    """
    Attempt to parse formula, then save formula, and add a task to
    calculate formula.
    """
    # load every stored observation for this dataset as a DataFrame
    dframe = Observation.find(dataset, as_df=True)
    # attempt to get a row from the dataframe
    try:
        row = dframe.irow(0)
    # Python 2 except syntax; `err` is intentionally unused
    except IndexError, err:
        # empty dataset: parse/validate the formula against an empty row
        row = {}
def calculate_column(dataset, dframe, formula, name):
    """
    Evaluate *formula* over every row of *dframe*, attach the result as a
    new column called *name*, and persist the joined frame back to
    *dataset*.
    """
    parser = Parser()
    calc_func = parser.parse_formula(formula)
    # row-wise application of the parsed formula
    computed = dframe.apply(calc_func, axis=1, args=(parser, ))
    computed.name = name
    return Observation.update(dframe.join(computed), dataset)
def setUp(self):
    """Load the good_eats fixture and define the formulas under test."""
    TestBase.setUp(self)
    self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
    frame = self.test_data['good_eats.csv']
    Dataset.build_schema(self.dataset, frame.dtypes)
    Observation.save(frame, self.dataset)
    # one formula per supported construct: column refs, arithmetic,
    # comparisons, boolean logic, exponentiation, negation, membership
    self.calculations = [
        'rating',
        'gps',
        'amount + gps_alt',
        'amount - gps_alt',
        'amount + 5',
        'amount - gps_alt + 2.5',
        'amount * gps_alt',
        'amount / gps_alt',
        'amount * gps_alt / 2.5',
        'amount + gps_alt * gps_precision',
        '(amount + gps_alt) * gps_precision',
        'amount = 2',
        '10 < amount',
        '10 < amount + gps_alt',
        'not amount = 2',
        'not(amount = 2)',
        'amount = 2 and 10 < amount',
        'amount = 2 or 10 < amount',
        'not not amount = 2 or 10 < amount',
        'not amount = 2 or 10 < amount',
        '(not amount = 2) or 10 < amount',
        'not(amount = 2 or 10 < amount)',
        'amount ^ 3',
        '(amount + gps_alt) ^ 2 + 100',
        '-amount',
        '-amount < gps_alt - 100',
        'rating in ["delectible"]',
        'risk_factor in ["low_risk"]',
        'amount in ["9.0", "2.0", "20.0"]',
        '(risk_factor in ["low_risk"]) and (amount in ["9.0", "20.0"])',
    ]
    # decimal places used by assertAlmostEqual in the calculator tests
    self.places = 5
def monitor_observations(self):
    """Poll forever, storing one aggregated weather observation per minute.

    Each pass aggregates wind and rain readings for the previous minute
    and inserts an Observation stamped at the top of the current minute.
    Database failures are logged and the loop keeps running (best-effort).
    """
    import time  # local import: top-of-file import block is outside this view
    last_run = None
    while True:
        # If there is no previous run, just run through anyway
        curtime = datetime.datetime.now()
        if last_run is not None:
            if (curtime - last_run).total_seconds() < 60:
                # BUG FIX: previously this branch spun in a tight loop,
                # pegging a CPU core between runs; sleep briefly instead.
                time.sleep(1)
                continue
        last_run = curtime
        # Prepare observation data for the previous minute
        minute_obs = (curtime - datetime.timedelta(minutes=1)).replace(second=0)
        minute_store = minute_obs + datetime.timedelta(minutes=1)
        wind_df = self.get_wind_df(minute=minute_obs)
        wind_direction = self.average_wind_direction(wind_df=wind_df)
        wind_speed = self.average_wind_speed(wind_df=wind_df)
        wind_gust = self.max_wind_gust(wind_df=wind_df)
        rain = self.current_rain(minute=minute_obs)
        obs = Observation(dt=minute_store, wind_direction=wind_direction,
                          wind_speed=wind_speed, wind_gust=wind_gust,
                          rain=rain)
        try:
            session = db.get_session()
            session.add(obs)
            session.commit()
        except Exception as e:
            # deliberate best-effort: report and continue monitoring
            print("Unable to add observation due to exception %s" % e)
def GET(self, dataset_id, mode=False, query='{}', select=None, group=ALL): """ Return data set for hash *dataset_id*. Execute query *query* in mongo if passed. If summary is passed return summary statistics for data set. If group is passed group the summary, if summary is false group is ignored. """ dataset = Dataset.find_one(dataset_id) result = None try: if dataset: if mode == MODE_INFO: result = Dataset.schema(dataset) elif mode == MODE_SUMMARY: result = summarize(dataset, query, select, group) else: return mongo_to_json(Observation.find(dataset, query, select)) except JSONError, e: result = {ERROR: e.__str__()}
def summarize(dataset, query, select, group):
    """
    Summarize the rows matched by *query*/*select*, grouped by *group*,
    or return the overall summary when no group is given.  Computed
    summaries are cached in the dataset's stats and reused when present.
    """
    # narrow the observations via query/select
    frame = Observation.find(dataset, query, select, as_df=True)
    # grouping is only allowed on categorical (object-dtype) columns
    # TODO check schema for valid groupby columns once included
    dtype = frame.dtypes.get(group)
    if group != ALL and (dtype is None or dtype.type != np.object_):
        return {ERROR: "group: '%s' is not categorical." % group}
    # consult cached stats for this group, computing and caching on miss
    stats = dataset.get(STATS, {})
    if not stats.get(group):
        if group == ALL:
            stats = {ALL: summarize_df(frame)}
        else:
            stats = summarize_with_groups(frame, stats, group)
        Dataset.update(dataset, {STATS: stats})
    group_stats = stats.get(group)
    if group == ALL:
        return dict_from_mongo(group_stats)
    return dict_from_mongo({group: group_stats})
def _test_calculator(self, delay=True):
    """Run every formula in self.calculations through calculate_column.

    For each formula: create a column named 'test-<idx>', then verify
    that the updated dataframe persisted, the column count grew, the
    dataset schema tracks the slugified name and its label, and the
    computed values match the stored ones.  When *delay* is true the
    calculation runs as a (presumably celery) task via .delay();
    otherwise it is called synchronously.
    """
    dframe = Observation.find(self.dataset, as_df=True)
    columns = dframe.columns.tolist()
    start_num_cols = len(columns)
    added_num_cols = 0
    column_labels_to_slugs = build_labels_to_slugs(self.dataset)
    # parallel lists of labels and slugs, extended as columns are added
    label_list, slugified_key_list = [list(ary) for ary in
                                      zip(*column_labels_to_slugs.items())]
    for idx, formula in enumerate(self.calculations):
        name = 'test-%s' % idx
        if delay:
            task = calculate_column.delay(self.dataset, dframe, formula,
                                          name)
            # test that task has completed
            self.assertTrue(task.ready())
            self.assertTrue(task.successful())
        else:
            task = calculate_column(self.dataset, dframe, formula, name)
        # re-read the label->slug map: the new column changed the schema
        column_labels_to_slugs = build_labels_to_slugs(self.dataset)
        unslug_name = name
        name = column_labels_to_slugs[unslug_name]
        # test that updated dataframe persisted
        dframe = Observation.find(self.dataset, as_df=True)
        self.assertTrue(name in dframe.columns)
        # test new number of columns
        added_num_cols += 1
        self.assertEqual(start_num_cols + added_num_cols,
                         len(dframe.columns.tolist()))
        # test that the schema is up to date
        dataset = Dataset.find_one(self.dataset[DATASET_ID])
        self.assertTrue(SCHEMA in dataset.keys())
        self.assertTrue(isinstance(dataset[SCHEMA], dict))
        schema = dataset[SCHEMA]
        # test slugified column names
        slugified_key_list.append(name)
        self.assertEqual(sorted(schema.keys()), sorted(slugified_key_list))
        # test column labels
        label_list.append(unslug_name)
        labels = [schema[col][LABEL] for col in schema.keys()]
        self.assertEqual(sorted(labels), sorted(label_list))
        # test result of calculation: compare the computed column against
        # the column named by the (slugified) formula itself
        formula = column_labels_to_slugs[formula]
        for idx, row in dframe.iterrows():
            try:
                result = np.float64(row[name])
                stored = np.float64(row[formula])
                # np.nan != np.nan, continue if we have two nan values
                if np.isnan(result) and np.isnan(stored):
                    continue
                msg = self._equal_msg(result, stored, formula)
                self.assertAlmostEqual(result, stored, self.places, msg)
            except ValueError:
                # non-numeric values: fall back to exact equality
                msg = self._equal_msg(row[name], row[formula], formula)
                self.assertEqual(row[name], row[formula], msg)
def test_save_over_bulk(self):
    """Bulk-saving the large fixture stores all 1001 of its rows."""
    Observation.save(self.test_data['good_eats_large.csv'], self.dataset)
    stored = list(Observation.find(self.dataset))
    self.assertEqual(len(stored), 1001)
def test_find(self):
    """find() with no filters returns a mongo Cursor."""
    self._save_observations()
    result = Observation.find(self.dataset)
    self.assertTrue(isinstance(result, Cursor))
def test_find_with_query(self):
    """find() with a JSON query string still returns a Cursor."""
    self._save_observations()
    result = Observation.find(self.dataset, '{"rating": "delectible"}')
    self.assertTrue(isinstance(result, Cursor))
def _save_observations(self):
    """Persist the good_eats fixture rows for this test's dataset."""
    fixture = self.test_data['good_eats.csv']
    return Observation.save(fixture, self.dataset)
def _save_observations_and_calculation(self, formula=None):
    """Save fixture observations, then a calculation for *formula*.

    Falls back to self.formula when no formula is supplied.
    """
    chosen = formula if formula else self.formula
    Observation.save(self.test_data['good_eats.csv'], self.dataset)
    return Calculation.save(self.dataset, chosen, self.name)