def send(self, url, parsed_instance):
    xform = parsed_instance.instance.xform
    rows = [parsed_instance.to_dict_for_mongo()]

    # prefix meta columns names for bamboo
    prefix = (u'%(id_string)s_%(id)s' % {
        'id_string': xform.id_string,
        'id': xform.id})

    for row in rows:
        for col, value in row.items():
            if col.startswith('_') or col.startswith('meta_') \
                    or col.startswith('meta/'):
                new_col = (u'%(prefix)s%(col)s' % {
                    'prefix': prefix,
                    'col': col})
                row.update({new_col: value})
                del row[col]

    # create dataset on bamboo first (including current submission)
    if not xform.bamboo_dataset:
        dataset_id = get_new_bamboo_dataset(xform, force_last=True)
        xform.bamboo_dataset = dataset_id
        xform.save()
    else:
        dataset = Dataset(connection=Connection(url=get_bamboo_url(xform)),
                          dataset_id=xform.bamboo_dataset)
        dataset.update_data(rows=rows)

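# A minimal, self-contained illustration of the meta-column prefixing done
# in send() above (hypothetical row and prefix, not the original data):
row = {'_id': 42, 'meta/instanceID': 'uuid:abc', 'age': 30}
prefix = u'good_eats_1'  # result of u'%(id_string)s_%(id)s'

for col in list(row):
    if col.startswith('_') or col.startswith('meta_') \
            or col.startswith('meta/'):
        row[u'%s%s' % (prefix, col)] = row.pop(col)

# row is now {'age': 30, u'good_eats_1_id': 42,
#             u'good_eats_1meta/instanceID': 'uuid:abc'}
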
def convert_data(data_path):
    # note: the extension dot is escaped; an unescaped '.' would match
    # any character
    reg_string = (r"districts/([a-z_]+)/data/"
                  r"(education|health|water|lga_data)\.(csv|json)")
    reg_match = re.match(reg_string, data_path)
    if reg_match:
        state_lga, sector, ext = reg_match.groups()
        print "lga: %s, sector: %s, ext: %s" % (state_lga, sector, ext)
        if sector == "water":
            bamboo_id = bamboo_hash["Water_Facilities"]["bamboo_id"]
        if sector == "education":
            bamboo_id = bamboo_hash["Education_Facilities"]["bamboo_id"]
        if sector == "health":
            bamboo_id = bamboo_hash["Health_Facilities"]["bamboo_id"]
        if sector == "lga_data":
            bamboo_id = bamboo_hash["LGA_Data"]["bamboo_id"]
        begin = time.time()
        bamboo_ds = Dataset(dataset_id=bamboo_id)
        ffdata = bamboo_ds.get_data(query={"unique_lga": state_lga},
                                    format=ext)
        # retry once if bamboo returned nothing
        if len(ffdata) == 0:
            print "XXXbamboo fails, retry"
            ffdata = bamboo_ds.get_data(query={"unique_lga": state_lga},
                                        format=ext)
        if sector == "lga_data":
            ffdata = ffdata[0]
            ffdata = {"data": [{"id": str(key), "value": str(value)}
                               for key, value in ffdata.iteritems()]}
            ffdata = json.dumps(ffdata)
        print "saving data to %s" % data_path
        write_json(data_path, ffdata)
        end = time.time()
        time_delta = end - begin
        print "finished %s %s, used %s" % (state_lga, sector, time_delta)

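# The chained ifs above can be collapsed into one dict lookup (a sketch;
# behavior is the same because the regex restricts sector to these four
# values):
SECTOR_TO_DATASET = {
    "water": "Water_Facilities",
    "education": "Education_Facilities",
    "health": "Health_Facilities",
    "lga_data": "LGA_Data",
}
bamboo_id = bamboo_hash[SECTOR_TO_DATASET[sector]]["bamboo_id"]
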
def run_test_suite(dataset_file_path_list):
    print "running test suite for %s" % " ".join(dataset_file_path_list)
    alldata = []
    for dataset_name in dataset_file_path_list:
        d = {}
        d['hostname'] = os.uname()[1]
        d['bamboo_url'] = URL
        d['unix_time'] = time.time()
        conn = Connection(url=URL)
        d['commit'] = conn.version['commit']
        d['branch'] = conn.version['branch']
        dataset = Dataset(connection=conn, path=dataset_name)
        d['import_time'] = time_till_import_is_finished(dataset)
        info = dataset.get_info()
        d['row'] = info['num_rows']
        d['col'] = info['num_columns']
        d['add_1_calculations_time'] = time_to_add_1_calculations(dataset)
        d['add_5_calculations_1by1_time'] = \
            time_to_add_5_calculations_1by1(dataset)
        d['add_5_calculations_batch_time'] = \
            time_to_add_5_calculations_batch(dataset)
        d['update_1_time'] = time_to_add_1_update(dataset)
        d['update_5_1by1_time'] = time_to_add_5_update_1by1(dataset)
        d['update_5_batch_time'] = time_to_add_5_update_batch(dataset)
        dataset.delete()
        alldata.append(d)
    return alldata

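# Hypothetical driver for the benchmark above (the file names are
# examples, not from the original script):
if __name__ == '__main__':
    import json
    results = run_test_suite(['good_eats.csv', 'water_points.csv'])
    print json.dumps(results, indent=2)
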
def send(self, url, data):
    xform = XForm.objects.get(id=data.get("xform_id"))
    rows = [data.get("json")]

    # prefix meta columns names for bamboo
    prefix = (u'%(id_string)s_%(id)s' % {'id_string': xform.id_string,
                                         'id': xform.id})

    for row in rows:
        for col, value in row.items():
            if col.startswith('_') or col.startswith('meta_') \
                    or col.startswith('meta/'):
                new_col = (u'%(prefix)s%(col)s' % {'prefix': prefix,
                                                   'col': col})
                row.update({new_col: value})
                del row[col]

    # create dataset on bamboo first (including current submission)
    if not xform.bamboo_dataset:
        dataset_id = get_new_bamboo_dataset(xform, force_last=True)
        xform.bamboo_dataset = dataset_id
        xform.save()
    else:
        dataset = Dataset(connection=Connection(url=get_bamboo_url(xform)),
                          dataset_id=xform.bamboo_dataset)
        dataset.update_data(rows=rows)

def test_merge(self):
    # already have one dataset in self.dataset
    dataset = Dataset(path=self.CSV_FILE, connection=self.connection)
    result = Dataset.merge([self.dataset, dataset],
                           connection=self.connection)
    self.assertTrue(isinstance(result, Dataset))
    self._cleanup(dataset)
    self._cleanup(result)

def delete_bamboo_dataset(xform):
    if not xform.bamboo_dataset:
        return False
    try:
        dataset = Dataset(connection=Connection(url=get_bamboo_url(xform)),
                          dataset_id=xform.bamboo_dataset)
        return dataset.delete()
    except ErrorParsingBambooData:
        return False

def test_merge_default_connection(self):
    dataset = Dataset(path=self.CSV_FILE,
                      connection=self.default_connection)
    other_dataset = Dataset(path=self.CSV_FILE,
                            connection=self.default_connection)
    result = Dataset.merge([dataset, other_dataset])
    self.assertTrue(isinstance(result, Dataset))
    self._cleanup(dataset)
    self._cleanup(other_dataset)
    self._cleanup(result)

def test_join_default_connection(self):
    dataset = Dataset(path=self.CSV_FILE,
                      connection=self.default_connection)
    aux_dataset = Dataset(path=self.AUX_CSV_FILE,
                          connection=self.default_connection)
    self.wait()
    result = Dataset.join(dataset, aux_dataset, 'food_type')
    self.wait()
    self.assertTrue(isinstance(result, Dataset))
    self._cleanup(dataset)
    self._cleanup(aux_dataset)
    self._cleanup(result)

def test_create_dataset_from_schema(self):
    dataset = Dataset(schema_path=self.SCHEMA_FILE,
                      connection=self.connection)
    self.assertTrue(dataset.id is not None)
    self._cleanup(dataset)
    # schema string
    schema_str = open(self.SCHEMA_FILE).read()
    dataset = Dataset(schema_content=schema_str,
                      connection=self.connection)
    self.assertTrue(dataset.id is not None)
    self._cleanup(dataset)

def test_na_values(self):
    dataset = Dataset(path=self.CSV_FILE, connection=self.connection,
                      na_values=['n/a'])
    self.wait()
    first_row = dataset.get_data(query={'food_type': 'street_meat',
                                        'amount': 2,
                                        'rating': 'delectible',
                                        'risk_factor': 'low_risk'},
                                 limit=1)[-1]
    self.assertEqual(first_row.get('comments'), 'null')
    self._cleanup(dataset)

def send(self, url, parsed_instance):
    xform = parsed_instance.instance.xform
    rows = [parsed_instance.to_dict_for_mongo()]

    # create dataset on bamboo first (including current submission)
    if not xform.bamboo_dataset:
        dataset_id = get_new_bamboo_dataset(xform, force_last=True)
        xform.bamboo_dataset = dataset_id
        xform.save()
    else:
        dataset = Dataset(connection=Connection(url=get_bamboo_url(xform)),
                          dataset_id=xform.bamboo_dataset)
        dataset.update_data(rows=rows)

def test_bamboo_service(self):
    # comment out when we can test or mock it differently
    raise SkipTest

    service_url = 'http://bamboo.io/'
    service_name = 'bamboo'

    xml_submission1 = os.path.join(self.this_directory,
                                   u'fixtures',
                                   u'dhisform_submission1.xml')
    xml_submission2 = os.path.join(self.this_directory,
                                   u'fixtures',
                                   u'dhisform_submission2.xml')
    xml_submission3 = os.path.join(self.this_directory,
                                   u'fixtures',
                                   u'dhisform_submission3.xml')

    # make sure xform doesn't have a bamboo dataset
    self.xform.bamboo_dataset = ''
    self.xform.save()

    # make a first submission without the service
    self._make_submission(xml_submission1)
    self.assertEqual(self.response.status_code, 201)

    # add rest service AFTER 1st submission
    self._add_rest_service(service_url, service_name)

    # submit another one.
    self._make_submission(xml_submission2)
    self.assertEqual(self.response.status_code, 201)
    self.wait(5)

    # it should have created the whole dataset
    xform = XForm.objects.get(id=self.xform.id)
    self.assertTrue(xform.bamboo_dataset != ''
                    and xform.bamboo_dataset is not None)

    dataset = Dataset(connection=Connection(service_url),
                      dataset_id=xform.bamboo_dataset)
    self.assertEqual(dataset.get_info()['num_rows'], 2)

    # submit a third one. check that we have 3 records
    self._make_submission(xml_submission3)
    self.assertEqual(self.response.status_code, 201)
    self.wait(5)
    self.assertEqual(dataset.get_info()['num_rows'], 3)

    # test regeneration
    dsi = dataset.get_info()
    regen_url = reverse(link_to_bamboo, kwargs={
        'username': self.user.username,
        'id_string': self.xform.id_string,
    })
    response = self.client.post(regen_url, {})
    # deleting DS redirects to profile page
    self.assertEqual(response.status_code, 302)
    self.wait(5)

    xform = XForm.objects.get(id=self.xform.id)
    self.assertTrue(xform.bamboo_dataset)

    dataset = Dataset(connection=Connection(service_url),
                      dataset_id=xform.bamboo_dataset)
    new_dsi = dataset.get_info()
    self.assertEqual(new_dsi['num_rows'], dsi['num_rows'])
    self.assertNotEqual(new_dsi['id'], dsi['id'])

def test_join(self):
    self._create_aux_dataset_from_file()
    self.wait()
    result = Dataset.join(self.dataset, self.aux_dataset, 'food_type',
                          connection=self.connection)
    self.assertTrue(isinstance(result, Dataset))
    self._cleanup(result)

def update_sources(site):
    sources = 'sources.json'
    sources_dir = os.path.join(os.path.dirname(__file__), 'data')
    if isinstance(site, basestring):
        sources = os.path.join(os.path.dirname(__file__), 'data',
                               site.lower(), 'sources.json')
        sources_dir = os.path.join(sources_dir, site.lower())
    else:
        sources = os.path.join(os.path.dirname(__file__), 'sources.json')
    if not os.path.exists(sources):
        raise Exception(u"Please define a sources.json.")
    f = open(sources)
    sources_dict = json.loads(f.read())
    f.close()
    assert 'bamboo_server' in sources_dict
    assert 'sources' in sources_dict
    connection = Connection(sources_dict['bamboo_server'])
    for k, v in sources_dict['sources'].iteritems():
        if v == "":
            path = os.path.join(sources_dir, k)
            if not os.path.exists(path):
                raise Exception(u"%s does not exist." % path)
            try:
                dataset = Dataset(path=path, connection=connection,
                                  na_values=["---", "None"],
                                  data_format='csv')
            except Exception, e:
                print u"Exception: Publishing %s failed!\n\t%s" % (k, e)
            else:
                sources_dict['sources'][k] = dataset.id
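    # The loop above fills in sources_dict['sources'][k] but, as excerpted,
    # never writes the mapping back. A sketch of persisting it at the end
    # of update_sources(), assuming that is the intent:
    with open(sources, 'w') as f:
        f.write(json.dumps(sources_dict, indent=4))
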
def test_create_dataset_from_schema_with_data(self):
    # schema + JSON data
    dataset = Dataset(path=self.JSON_FILE, data_format='json',
                      schema_path=self.SCHEMA_FILE,
                      connection=self.connection)
    self.assertTrue(dataset.id is not None)
    self._cleanup(dataset)
    # schema + CSV data
    dataset = Dataset(path=self.CSV_FILE, data_format='csv',
                      schema_path=self.SCHEMA_FILE,
                      connection=self.connection)
    self.assertTrue(dataset.id is not None)
    self._cleanup(dataset)

def test_join_bad_on(self):
    self._create_aux_dataset_from_file()
    self.wait()
    result = Dataset.join(self.dataset, self.aux_dataset, 'BAD',
                          connection=self.connection)
    self.assertFalse(result)

def _get_sum(self, key, value, period):
    sum_value = 0
    for v in value[key]:
        if 'aggregations' in v:
            sum_value += self._get_aggregate('aggregations', v, period)
            continue
        dataset_id = v['dataset_id']
        # dataset_id from sources.json is most recent
        if dataset_id != self._sources[v['source']] \
                and self._sources[v['source']] != "":
            dataset_id = self._sources[v['source']]
        dataset = Dataset(dataset_id=dataset_id,
                          connection=self.connection)
        params = {}
        if 'calculation' in v:
            # check or create calculations
            if isinstance(v['calculation'], list):
                for calculation in v['calculation']:
                    self._add_calculation(calculation, dataset, period)
            if isinstance(v['calculation'], dict):
                self._add_calculation(v['calculation'], dataset, period)
        if 'query' in v:
            query_string = json.dumps(v['query'])
            template = env.from_string(query_string)
            query_string = template.render(period=period)
            v['query'] = json.loads(query_string)
            params['query'] = v['query']
        if 'count' in v and 'query' in v:
            params['count'] = v['count']
        if 'distinct' in v:
            params['distinct'] = v['distinct']
        val = dataset.get_data(**params)
        if isinstance(val, dict):
            raise Exception("Bamboo Error: %s" % val)
        sum_value += val
    return sum_value

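# The query templating above in isolation: Jinja2 renders {{ period }}
# placeholders inside the JSON-encoded query. A minimal sketch with a
# hypothetical query and period:
import json
from jinja2 import Environment

env = Environment()
query = {"submit_date": "{{ period }}"}
rendered = env.from_string(json.dumps(query)).render(period="2013-01")
print json.loads(rendered)  # {u'submit_date': u'2013-01'}
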
def get_new_bamboo_dataset(xform, force_last=False):
    dataset_id = u''
    try:
        content_data = get_csv_data(xform, force_last=force_last)
        dataset = Dataset(connection=Connection(url=get_bamboo_url(xform)),
                          content=content_data,
                          na_values=['n/a'])
    except (ErrorParsingBambooData, NoRecordsFoundError):
        return dataset_id
    if dataset.id:
        return dataset.id
    return dataset_id

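# Hypothetical caller: link an XForm to a fresh bamboo dataset, mirroring
# the first branch of the send() methods above (xform is assumed to be a
# saved XForm instance):
dataset_id = get_new_bamboo_dataset(xform, force_last=True)
if dataset_id:
    xform.bamboo_dataset = dataset_id
    xform.save()
else:
    print u"could not create a bamboo dataset for %s" % xform.id_string
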
def test_bamboo_service(self):
    service_url = 'http://bamboo.io/'
    service_name = 'bamboo'
    # self._add_rest_service(service_url, service_name)
    self.wait(2)

    xml_submission1 = os.path.join(self.this_directory,
                                   u'fixtures',
                                   u'dhisform_submission1.xml')
    xml_submission2 = os.path.join(self.this_directory,
                                   u'fixtures',
                                   u'dhisform_submission2.xml')
    xml_submission3 = os.path.join(self.this_directory,
                                   u'fixtures',
                                   u'dhisform_submission3.xml')

    # make a first submission without the service
    self._make_submission(xml_submission1)
    self.assertEqual(self.response.status_code, 201)

    # add rest service AFTER 1st submission
    self._add_rest_service(service_url, service_name)

    # submit another one.
    self._make_submission(xml_submission2)
    self.assertEqual(self.response.status_code, 201)
    self.wait(3)

    # it should have created the whole dataset
    xform = XForm.objects.get(id=self.xform.id)
    self.assertTrue(xform.bamboo_dataset)

    dataset = Dataset(connection=Connection(service_url),
                      dataset_id=xform.bamboo_dataset)
    self.assertEqual(dataset.get_info()['num_rows'], 2)

    # submit a third one. check that we have 3 records
    self._make_submission(xml_submission3)
    self.assertEqual(self.response.status_code, 201)
    self.wait(3)
    self.assertEqual(dataset.get_info()['num_rows'], 3)

    # test regeneration
    dsi = dataset.get_info()
    regen_url = reverse(link_to_bamboo, kwargs={
        'username': self.user.username,
        'id_string': self.xform.id_string,
    })
    response = self.client.post(regen_url, {})
    # deleting DS redirects to profile page
    self.assertEqual(response.status_code, 302)
    self.wait(3)

    xform = XForm.objects.get(id=self.xform.id)
    self.assertTrue(xform.bamboo_dataset)

    dataset = Dataset(connection=Connection(service_url),
                      dataset_id=xform.bamboo_dataset)
    new_dsi = dataset.get_info()
    self.assertEqual(new_dsi['num_rows'], dsi['num_rows'])
    self.assertNotEqual(new_dsi['id'], dsi['id'])

def test_merge_fail(self):
    other_dataset = Dataset('12345', connection=self.connection)
    result = Dataset.merge([self.dataset, other_dataset],
                           connection=self.connection)
    self.assertFalse(result)

def test_join_bad_other_dataset(self):
    with self.assertRaises(PyBambooException):
        Dataset.join(self.dataset, Exception(), 'food_type',
                     connection=self.connection)

def _create_dataset_from_file(self):
    self.dataset = Dataset(path=self.CSV_FILE,
                           connection=self.connection)
    self.wait()

def test_create_dataset_bad_data_format(self):
    with self.assertRaises(PyBambooException):
        Dataset(path=self.CSV_FILE, data_format='BAD',
                connection=self.connection)

def test_create_dataset_from_url(self):
    dataset = Dataset(
        url='http://formhub.org/mberg/forms/good_eats/data.csv',
        connection=self.connection)
    # check the dataset just created, not self.dataset from setUp()
    self.assertTrue(dataset.id is not None)
    self._cleanup(dataset)

def test_create_dataset_from_json(self):
    dataset = Dataset(path=self.JSON_FILE, data_format='json',
                      connection=self.connection)
    self.assertTrue(dataset.id is not None)
    self._cleanup(dataset)

#!/usr/bin/env python
import time

from pybamboo.dataset import Dataset

d = Dataset(path='data/water_points.csv')
print 'dataset id: %s' % d.id

time.sleep(2)

info = d.get_info()
print 'num_columns: %s' % info['num_columns']
print 'num_rows: %s' % info['num_rows']
print 'columns: %s' % info['schema'].keys()

water_points = d.get_data(select=['communities_villages',
                                  'water_functioning'])
print 'village and functioning status for all water points: %s' % \
    water_points

broken_water_points = d.get_data(select=['communities_villages'],
                                 query={'water_functioning': False})
# print 'Villages with broken water points %s' % len(broken_water_points)

summary = d.get_summary()
print 'Water point lift mechanism type summary %s' % \
    summary['water_lift_mechanism_type']
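# A narrower call one might add to the script above (a sketch; the select
# keyword is the same one exercised by the pybamboo tests further down):
lift_summary = d.get_summary(select=['water_lift_mechanism_type'])
print 'lift mechanism summary only: %s' % lift_summary
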
def _create_aux_dataset_from_file(self):
    self.aux_dataset = Dataset(path=self.AUX_CSV_FILE,
                               connection=self.connection)
    self.wait()

def main():
    dataset_url = "%sdatasets/%s.csv" % (BAMBOO_DEV_URL, args.dataset)
    dataset = Dataset(url=dataset_url)
    print dataset.id

import sys
import time

from pybamboo.dataset import Dataset
from pybamboo.connection import Connection

LOCAL_HOST = False
connect = (Connection(url='http://localhost:8080')
           if LOCAL_HOST else Connection())
DATASET_ID = sys.argv[-1]
CSV_URL = (u"https://github.com/modilabs/bamboo-examples"
           u"/raw/master/data/water_points.csv")
WAIT_INTERVAL = 10
WAIT_MAX = 180

if len(sys.argv) > 1 and DATASET_ID:
    print("Retrieving dataset from UUID %s" % DATASET_ID)
    dataset = Dataset(dataset_id=DATASET_ID, connection=connect)
else:
    print(u"Creating dataset from %s" % CSV_URL)
    dataset = Dataset(url=CSV_URL, connection=connect)

waited = 0
while dataset.get_info().get(u'state', 'ready') != 'ready':
    if waited >= WAIT_MAX:
        print(u"Unable to get dataset ready in time. Exiting.")
        sys.exit()
    print(u"Dataset not ready. Waiting %ds..." % WAIT_INTERVAL)
    time.sleep(WAIT_INTERVAL)
    waited += WAIT_INTERVAL

print(u"---")
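# A reusable variant of the polling loop above (a sketch; same assumptions
# about the 'state' field in get_info()):
def wait_until_ready(ds, interval=WAIT_INTERVAL, timeout=WAIT_MAX):
    """Block until ds reports state 'ready'; return False on timeout."""
    waited = 0
    while ds.get_info().get(u'state', 'ready') != 'ready':
        if waited >= timeout:
            return False
        time.sleep(interval)
        waited += interval
    return True
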
def main():
    # Download CSV for all forms.
    for form in FORMS:
        form_csv = path(u'%s.csv' % form)
        if not form_csv.isfile():
            print(u"Downloading CSV for %s" % form)
            url = u"https://www.formhub.org/atasoils/forms/%s/data.csv" % form
            form_csv_tmp = path(download_formhub(url, login=FH_LOGIN,
                                                 password=FH_PASSWORD))
            shutil.copy(form_csv_tmp, u'%s.csv' % form)
        print(form_csv, form_csv.isfile())

    # Parse EthioSIS and build a cleaned-up version
    cleanup_ethiosis(csv_in=u'%s.csv' % FORMS[0],
                     csv_out=u'%s_clean.csv' % FORMS[0])
    print(u"Cleanup done.")
    print(u"\n")

    # Generate FH submissions for each cleaned sample.
    submissions_done = path('submissions_done')
    if not submissions_done.isfile():
        print(u"Generating FH submissions")
        generate_fh_submission(csv_in=u'%s_clean.csv' % FORMS[0],
                               form=NEW_FORMS[0])
        submissions_done.touch()

    # flat list of available IDs to pop out
    for id_list in EXISTING.values():
        for soil_id in id_list:
            if soil_id not in AVAILABLES:
                AVAILABLES.append(soil_id)

    # Parse Steps 1-6, cleanup (duplicates), clean PC names
    for findex, form in enumerate(FORMS):
        if findex == 0:
            continue
        step = u'step%d' % findex
        step_done = path(u'%s_done' % step)
        if not step_done.isfile():
            print(u"Generating STEP %d submissions" % findex)
            generate_fh_steps(csv_in=u'%s.csv' % form, form=form, step=step)
            step_done.touch()

    # join the datasets
    print(u"Joining datasets")
    joined_dataset = None
    bamboo_conn = Connection(BAMBOO_URL)
    for form in NEW_FORMS:
        try:
            form_dataset = json.loads(
                requests.get(PUBLIC_API_URL
                             % {'form': form}).text)['bamboo_dataset']
        except:
            form_dataset = u''
        if not form_dataset:
            continue
        print(u"%s: %s" % (form, form_dataset))
        if not joined_dataset:
            joined_dataset = form_dataset
            continue
        print(u"Joined dataset: %s" % joined_dataset)
        ds_joined = Dataset(connection=bamboo_conn,
                            dataset_id=joined_dataset)
        ds_form = Dataset(connection=bamboo_conn, dataset_id=form_dataset)
        dataset = Dataset.join(left_dataset=ds_joined,
                               right_dataset=ds_form,
                               on=u'barcode',
                               connection=bamboo_conn)
        time.sleep(10)
        joined_dataset = dataset.id
        print(u"Merged dataset: %s" % dataset.id)

    print(u"Ultimate dataset: %s" % dataset.id)

def test_merge_bad_datasets(self):
    dataset = {}
    other_dataset = []
    with self.assertRaises(PyBambooException):
        Dataset.merge([dataset, other_dataset],
                      connection=self.connection)

class TestDataset(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self._create_dataset_from_file()

    def _create_dataset_from_file(self):
        self.dataset = Dataset(path=self.CSV_FILE,
                               connection=self.connection)
        self.wait()

    def _create_aux_dataset_from_file(self):
        self.aux_dataset = Dataset(path=self.AUX_CSV_FILE,
                                   connection=self.connection)
        self.wait()

    def _wait_for_dataset_ready(self):
        while self.dataset.state == 'pending':
            self.wait()

    def test_create_dataset_from_json(self):
        dataset = Dataset(path=self.JSON_FILE, data_format='json',
                          connection=self.connection)
        self.assertTrue(dataset.id is not None)
        self._cleanup(dataset)

    def test_create_dataset_from_schema(self):
        dataset = Dataset(schema_path=self.SCHEMA_FILE,
                          connection=self.connection)
        self.assertTrue(dataset.id is not None)
        self._cleanup(dataset)
        # schema string
        schema_str = open(self.SCHEMA_FILE).read()
        dataset = Dataset(schema_content=schema_str,
                          connection=self.connection)
        self.assertTrue(dataset.id is not None)
        self._cleanup(dataset)

    def test_create_dataset_from_schema_with_data(self):
        # schema + JSON data
        dataset = Dataset(path=self.JSON_FILE, data_format='json',
                          schema_path=self.SCHEMA_FILE,
                          connection=self.connection)
        self.assertTrue(dataset.id is not None)
        self._cleanup(dataset)
        # schema + CSV data
        dataset = Dataset(path=self.CSV_FILE, data_format='csv',
                          schema_path=self.SCHEMA_FILE,
                          connection=self.connection)
        self.assertTrue(dataset.id is not None)
        self._cleanup(dataset)

    def test_create_dataset_default_connection(self):
        dataset = Dataset(path=self.CSV_FILE,
                          connection=self.default_connection)
        self._cleanup(dataset)

    def test_create_dataset_no_info(self):
        with self.assertRaises(PyBambooException):
            Dataset()

    def test_create_dataset_bad_data_format(self):
        with self.assertRaises(PyBambooException):
            Dataset(path=self.CSV_FILE, data_format='BAD',
                    connection=self.connection)

    def test_create_dataset_from_file(self):
        # created in TestDataset.setUp()
        self.assertTrue(self.dataset.id is not None)

    def test_create_dataset_from_url(self):
        dataset = Dataset(
            url='http://formhub.org/mberg/forms/good_eats/data.csv',
            connection=self.connection)
        # check the dataset just created, not self.dataset from setUp()
        self.assertTrue(dataset.id is not None)
        self._cleanup(dataset)

    def test_reset_dataset(self):
        dataset_id = self.dataset._id
        self.dataset.reset(path=self.CSV_FILE,
                           connection=self.connection)
        self.assertEqual(self.dataset._id, dataset_id)

    def test_reset_dataset_no_dataset_id(self):
        self.dataset.delete()
        with self.assertRaises(PyBambooException):
            self.dataset.reset()

    def test_na_values(self):
        dataset = Dataset(path=self.CSV_FILE, connection=self.connection,
                          na_values=['n/a'])
        self.wait()
        first_row = dataset.get_data(query={
            'food_type': 'street_meat',
            'amount': 2,
            'rating': 'delectible',
            'risk_factor': 'low_risk',
        }, limit=1)[-1]
        self.assertEqual(first_row.get('comments'), 'null')
        self._cleanup(dataset)

    def test_resample(self):
        data = self.dataset.resample(date_column='submit_date',
                                     interval='D', how='mean')
        self.assertTrue(data)

    def test_resample_with_query(self):
        data = self.dataset.resample(date_column='submit_date',
                                     interval='D',
                                     query={"food_type": "street_meat"},
                                     how='sum')
        self.assertTrue(data)

    def test_rolling(self):
        data = self.dataset.rolling(win_type='boxcar', window=3)
        self.assertTrue(isinstance(data, list))

    def test_set_info(self):
        description = u"Meals rating worldwide"
        attribution = u"mberg"
        label = u"Good Eats"
        license = u"Public Domain"
        self.dataset.set_info(attribution=attribution,
                              description=description,
                              label=label, license=license)
        infos = self.dataset.get_info()
        self.assertEqual(infos['description'], description)
        self.assertEqual(infos['attribution'], attribution)
        self.assertEqual(infos['label'], label)
        self.assertEqual(infos['license'], license)

    def test_index_present(self):
        data = self.dataset.get_data(index=True)
        self.assertTrue('index' in data[-1].keys())

    def test_str(self):
        self.assertEqual(str(self.dataset), self.dataset.id)

    def test_version(self):
        self.assert_keys_in_dict(self.VERSION_KEYS, self.dataset.version)

    def test_columns(self):
        self.wait()  # have to wait, bamboo issue #284
        cols = self.dataset.columns
        keys = self.dataset.get_info()['schema'].keys()
        for key in keys:
            self.assertTrue(key in cols)
        for col in cols:
            self.assertTrue(col in keys)

    def test_state(self):
        self.assertEqual(self.dataset.state, 'ready')

    def test_num_columns(self):
        self.assertEqual(self.dataset.num_columns, 15)

    def test_num_rows(self):
        self.assertEqual(self.dataset.num_rows, 19)

    def test_count(self):
        self.wait()
        count = self.dataset.count(field='food_type', method='count')
        self.assertEqual(count, 19)

    def test_data_count(self):
        self._wait_for_dataset_ready()  # TODO: is this necessary?
        count = self.dataset.get_data(count=True)
        self.assertEqual(count, 19)

    def test_delete_dataset(self):
        self.dataset.delete()
        self.assertTrue(self.dataset._id is None)

    def test_invalid_dataset(self):
        self.dataset.delete()
        with self.assertRaises(PyBambooException):
            self.dataset.delete()

    def test_add_calculation(self):
        result = self.dataset.add_calculation(name='double_amount',
                                              formula='amount * 2')
        self.assertTrue(result)

    def test_add_calculations(self):
        formulae = [
            {'name': 'double_amount', 'formula': 'amount * 2'},
            {'name': 'triple_amount', 'formula': 'amount * 3'},
        ]
        result = self.dataset.add_calculations(json=formulae)
        self.assertTrue(result)

    def test_add_invalid_calculation_a_priori(self):
        bad_calcs = [
            {'name': None, 'formula': 'ok'},
            {'name': 'number', 'formula': 3},
            {'name': 'number', 'formula': 'ok', 'groups': 3},
        ]
        for calc in bad_calcs:
            with self.assertRaises(PyBambooException):
                self.dataset.add_calculation(**calc)
        with self.assertRaises(PyBambooException):
            self.dataset.add_calculations()

    def test_add_invalid_calculation_a_posteriori(self):
        result = self.dataset.add_calculation(name='double_amount',
                                              formula='BAD')
        self.assertEqual(result, False)

    def test_add_aggregation(self):
        result = self.dataset.add_calculation(name='sum_amount',
                                              formula='sum(amount)')
        self.assertTrue(result)
        self.dataset.has_aggs_to_remove = True

    def test_add_aggregation_with_groups(self):
        result = self.dataset.add_calculation(name='sum_amount',
                                              formula='sum(amount)',
                                              groups=['food_type'])
        self.assertTrue(result)
        result = self.dataset.add_calculation(
            name='sum_amount', formula='sum(amount)',
            groups=['food_type', 'rating'])
        self.assertTrue(result)
        self.dataset.has_aggs_to_remove = True

    def test_add_aggregation_invalid_groups(self):
        with self.assertRaises(PyBambooException):
            self.dataset.add_calculation(name='sum_amount',
                                         formula='sum(amount)',
                                         groups='BAD')

    def test_remove_calculation(self):
        name = 'double_amount'
        self.dataset.add_calculation(name=name, formula='amount * 2')
        result = self.dataset.remove_calculation(name)
        self.assertTrue(result)

    def test_remove_aggregation(self):
        name = 'sum_amount'
        result = self.dataset.add_calculation(name=name,
                                              formula='sum(amount)')
        self.assertTrue(result)
        result = self.dataset.remove_calculation(name)
        self.assertTrue(result)
        self.dataset.has_aggs_to_remove = True

    def test_remove_calculation_fail(self):
        result = self.dataset.remove_calculation('bad')
        self.assertFalse(result)

    def test_get_calculations(self):
        calc_keys = ['state', 'formula', 'group', 'name']
        result = self.dataset.add_calculation(name='double_amount',
                                              formula='amount * 2')
        self.assertEqual(result, True)
        result = self.dataset.get_calculations()
        self.assertTrue(isinstance(result, list))
        for calc in result:
            self.assertTrue(isinstance(calc, dict))
            keys = calc.keys()
            for key in calc_keys:
                self.assertTrue(key in keys)
        self.assertEqual(result[0]['state'], 'pending')
        self.wait()
        self.wait()
        result = self.dataset.get_calculations()
        self.assertEqual(result[0]['state'], 'ready')

    def test_get_aggregate_datasets(self):
        result = self.dataset.get_aggregate_datasets()
        self.assertTrue(isinstance(result, dict))
        self.assertEqual(len(result), 0)
        self.dataset.add_calculation(name='sum_amount',
                                     formula='sum(amount)')
        self.wait()
        self.wait()
        result = self.dataset.get_aggregate_datasets()
        self.assertTrue(isinstance(result, dict))
        self.assertEqual(len(result), 1)
        self.assertTrue('' in result.keys())
        self.assertTrue(isinstance(result[''], Dataset))
        self.dataset.add_calculation(name='sum_amount',
                                     formula='sum(amount)',
                                     groups=['food_type'])
        self.wait()
        self.wait()
        result = self.dataset.get_aggregate_datasets()
        self.assertTrue(isinstance(result, dict))
        self.assertEqual(len(result), 2)
        self.assertTrue('food_type' in result.keys())
        self.assertTrue(isinstance(result['food_type'], Dataset))
        self.dataset.has_aggs_to_remove = True

    def test_get_aggregate_datasets_no_aggregations(self):
        result = self.dataset.get_aggregate_datasets()
        self.assertTrue(isinstance(result, dict))
        self.assertEqual(len(result), 0)

    def test_get_summary(self):
        self.wait()  # TODO: remove (bamboo issue #276)
        result = self.dataset.get_summary()
        self.assertTrue(isinstance(result, dict))
        cols = self.dataset.columns
        keys = result.keys()
        for col in cols:
            self.assertTrue(col in keys)

    def test_get_summary_with_select(self):
        self.wait()  # TODO: remove (bamboo issue #276)
        result = self.dataset.get_summary(select=['food_type'])
        self.assertEqual(len(result), 1)
        self.assertTrue('food_type' in result.keys())
        result = self.dataset.get_summary(select=['food_type', 'rating'])
        self.assertEqual(len(result), 2)
        result_keys = result.keys()
        self.assertTrue('food_type' in result_keys)
        self.assertTrue('rating' in result_keys)

    def test_get_summary_bad_select(self):
        with self.assertRaises(PyBambooException):
            self.dataset.get_summary(select='BAD')

    def test_get_summary_with_query(self):
        self.wait()  # TODO: remove (bamboo issue #276)
        self.dataset.get_summary(query={'food_type': 'lunch'})

    def test_get_summary_bad_query(self):
        with self.assertRaises(PyBambooException):
            self.dataset.get_summary(query='BAD')

    def test_get_summary_with_groups(self):
        self.wait()  # TODO: remove (bamboo issue #276)
        result = self.dataset.get_summary(groups=['food_type'])
        self.assertEqual(len(result), 1)
        values = self.dataset.get_summary(
            select=['food_type'])['food_type']['summary'].keys()
        self.assertTrue('food_type' in result.keys())
        self.assertTrue(isinstance(result['food_type'], dict))
        keys = result['food_type'].keys()
        for val in values:
            self.assertTrue(val in keys)

    def test_get_summary_bad_groups(self):
        with self.assertRaises(PyBambooException):
            self.dataset.get_summary(groups='BAD')

    def test_get_info(self):
        info_keys = [
            'attribution',
            'description',
            'license',
            'created_at',
            'updated_at',
            'label',
            'num_columns',
            'num_rows',
            'id',
            'schema',
        ]
        schema_keys = [
            'simpletype',
            'olap_type',
            'label',
        ]
        self.wait()  # have to wait, bamboo issue #284
        result = self.dataset.get_info()
        self.assertTrue(isinstance(result, dict))
        for key in info_keys:
            self.assertTrue(key in result.keys())
        self.assertEqual(result['num_columns'], 15)
        self.assertEqual(result['num_rows'], 19)
        schema = result['schema']
        self.assertTrue(isinstance(schema, dict))
        self.assertEqual(len(schema.keys()), 15)
        for col_name, col_info in schema.iteritems():
            for key in schema_keys:
                self.assertTrue(key in col_info.keys())

    def test_get_data(self):
        self.wait()
        result = self.dataset.get_data()
        self.assertTrue(isinstance(result, list))
        self.assertEqual(len(result), 19)

    def test_get_data_with_select(self):
        self.wait()
        result = self.dataset.get_data(select=['food_type', 'amount'])
        self.assertEqual(len(result), 19)
        for row in result:
            self.assertEqual(len(row), 2)
            cols = row.keys()
            self.assertTrue('food_type' in cols)
            self.assertTrue('amount' in cols)

    def test_get_data_with_query(self):
        self.wait()  # TODO: remove (bamboo issue #285)
        result = self.dataset.get_data(query={'food_type': 'lunch'})
        self.assertEqual(len(result), 7)

    def test_get_data_with_select_and_query(self):
        self.wait()  # TODO: remove (bamboo issue #285)
        result = self.dataset.get_data(select=['food_type', 'amount'],
                                       query={'food_type': 'lunch'})
        self.assertEqual(len(result), 7)
        for row in result:
            self.assertEqual(len(row), 2)
            cols = row.keys()
            self.assertTrue('food_type' in cols)
            self.assertTrue('amount' in cols)

    def test_get_data_with_format(self):
        self.wait()  # TODO: remove (bamboo issue #285)
        result = self.dataset.get_data(format='csv')
        self.assertTrue(isinstance(result, basestring))

    def test_get_data_invalid_select(self):
        with self.assertRaises(PyBambooException):
            self.dataset.get_data(select='BAD')

    def test_get_data_invalid_query(self):
        with self.assertRaises(PyBambooException):
            self.dataset.get_data(query='BAD')

    def test_get_data_with_invalid_format(self):
        with self.assertRaises(PyBambooException):
            self.dataset.get_data(format='BAD')

    def test_get_data_bad_query(self):
        self.wait()  # TODO: remove (bamboo issue #285)
        result = self.dataset.get_data(query={'BAD': 'BAD'})
        self.assertFalse(result)

    def test_update_data(self):
        row = {
            'food_type': 'morning_food',
            'amount': 10.0,
            'risk_factor': 'high_risk',
            'rating': 'delectible',
        }
        result = self.dataset.update_data([row])
        self.wait(15)
        result = self.dataset.get_data()
        self.assertTrue(isinstance(result, list))
        self.assertEqual(len(result), 20)

    def test_update_data_no_data(self):
        with self.assertRaises(PyBambooException):
            self.dataset.update_data([])

    def test_update_data_bad_data(self):
        bad_rows = [{}, [[]], [{'exception': Exception()}]]
        for rows in bad_rows:
            with self.assertRaises(PyBambooException):
                self.dataset.update_data(rows)

    def test_merge(self):
        # already have one dataset in self.dataset
        dataset = Dataset(path=self.CSV_FILE, connection=self.connection)
        result = Dataset.merge([self.dataset, dataset],
                               connection=self.connection)
        self.assertTrue(isinstance(result, Dataset))
        self._cleanup(dataset)
        self._cleanup(result)

    def test_merge_default_connection(self):
        dataset = Dataset(path=self.CSV_FILE,
                          connection=self.default_connection)
        other_dataset = Dataset(path=self.CSV_FILE,
                                connection=self.default_connection)
        result = Dataset.merge([dataset, other_dataset])
        self.assertTrue(isinstance(result, Dataset))
        self._cleanup(dataset)
        self._cleanup(other_dataset)
        self._cleanup(result)

    def test_merge_bad_datasets(self):
        dataset = {}
        other_dataset = []
        with self.assertRaises(PyBambooException):
            Dataset.merge([dataset, other_dataset],
                          connection=self.connection)

    def test_merge_fail(self):
        other_dataset = Dataset('12345', connection=self.connection)
        result = Dataset.merge([self.dataset, other_dataset],
                               connection=self.connection)
        self.assertFalse(result)

    def test_join(self):
        self._create_aux_dataset_from_file()
        self.wait()
        result = Dataset.join(self.dataset, self.aux_dataset, 'food_type',
                              connection=self.connection)
        self.assertTrue(isinstance(result, Dataset))
        self._cleanup(result)

    def test_join_default_connection(self):
        dataset = Dataset(path=self.CSV_FILE,
                          connection=self.default_connection)
        aux_dataset = Dataset(path=self.AUX_CSV_FILE,
                              connection=self.default_connection)
        self.wait()
        result = Dataset.join(dataset, aux_dataset, 'food_type')
        self.wait()
        self.assertTrue(isinstance(result, Dataset))
        self._cleanup(dataset)
        self._cleanup(aux_dataset)
        self._cleanup(result)

    def test_join_bad_other_dataset(self):
        with self.assertRaises(PyBambooException):
            Dataset.join(self.dataset, Exception(), 'food_type',
                         connection=self.connection)

    def test_join_bad_on(self):
        self._create_aux_dataset_from_file()
        self.wait()
        result = Dataset.join(self.dataset, self.aux_dataset, 'BAD',
                              connection=self.connection)
        self.assertFalse(result)

    # /row/INDEX tests.
    def test_get_row(self):
        self.assertEqual(self.dataset.get_row(0)['comments'],
                         u"Try the yogurt drink")

    def test_update_row(self):
        index = 2
        comment = 'test'
        self.dataset.update_row(index, {'comments': comment})
        self.assertEqual(self.dataset.get_row(index)['comments'], comment)

    def test_delete_row(self):
        self._wait_for_dataset_ready()  # TODO: is this necessary?
        index = 10
        self.dataset.delete_row(index=index)
        result = self.dataset.get_row(index)
        self.assertTrue('error' in result)

def test_create_dataset_default_connection(self):
    dataset = Dataset(path=self.CSV_FILE,
                      connection=self.default_connection)
    self._cleanup(dataset)

def test_create_dataset_no_info(self):
    with self.assertRaises(PyBambooException):
        Dataset()

# get state of current datasets
with open(bamboo_id_file) as f:
    bamboo_ids = json.loads(f.read())

if not bamboo_ids:
    print '"%s" not found: exiting' % bamboo_id_file
    sys.exit(0)

print 'current dataset status:'
print json.dumps(bamboo_ids, indent=4, sort_keys=True)

# upload originals
for sector in bamboo_ids.keys():
    for name, id in bamboo_ids[sector]['originals'].iteritems():
        if not id:
            print 'dataset: %s not uploaded, uploading %s.csv' % (name, name)
            dataset = Dataset(connection=connection,
                              path='csvs/originals/%s.csv' % name)
            state = dataset.get_info()['state']
            while state != 'ready':
                time.sleep(1)
                state = dataset.get_info()['state']
                print state
            bamboo_ids[sector]['originals'][name] = dataset.id
            with open(bamboo_id_file, 'wb') as f:
                f.write(json.dumps(bamboo_ids))

# merge originals
for sector in bamboo_ids.keys():
    if not bamboo_ids[sector]['merged']:
        print 'no merged dataset for sector: %s' % sector
        datasets = [Dataset(connection=connection, dataset_id=id)
                    for name, id in
                    bamboo_ids[sector]['originals'].iteritems()]
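        # Presumably the next step, truncated from the snippet above:
        # merge the sector's datasets and persist the resulting id.
        # A sketch under that assumption, not the original code:
        merged = Dataset.merge(datasets, connection=connection)
        bamboo_ids[sector]['merged'] = merged.id
        with open(bamboo_id_file, 'wb') as f:
            f.write(json.dumps(bamboo_ids))
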
print 'Could not load %s' % BAMBOO_HASH_FILE
sys.exit(1)

# update the datasets
hash_updates = dict()
for name, content in bamboo_hash.iteritems():
    filename = content['filename']
    bamboo_id = content['bamboo_id']
    sector = content.get('sector')
    file_path = 'data/' + filename
    print '%s -> %s' % (filename, bamboo_id)
    if bamboo_id:
        print '%s has bamboo id: %s. Updating bamboo dataset.' % \
            (name, bamboo_id)
        try:
            dataset = Dataset(dataset_id=bamboo_id)
            dataset.remove_calculation('sector')
            dataset.reset(path=file_path)
            if sector:
                formula = '"%s"' % sector
                print 'Adding column for sector: %s, formula: %s' % \
                    (sector, formula)
                result = dataset.add_calculation('sector', formula)
                if result:
                    print 'Calculation added successfully!'
                else:
                    print 'Problem adding calculation!'
        except PyBambooException:
            print 'Error updating dataset for file: %s' % filename
    else:
        print '%s has no bamboo id. Adding file to bamboo.' % name

def _get_aggregate(self, key, value, period):
    sum_value = 0
    for v in value[key]:
        dataset_id = v['dataset_id']
        # dataset_id from sources.json is most recent
        if dataset_id != self._sources[v['source']] \
                and self._sources[v['source']] != "":
            dataset_id = self._sources[v['source']]
        dataset = Dataset(dataset_id=dataset_id,
                          connection=self.connection)
        params = {}
        if 'calculation' in v:
            # check or create calculations
            if isinstance(v['calculation'], list):
                for calculation in v['calculation']:
                    self._add_calculation(calculation, dataset, period)
            if isinstance(v['calculation'], dict):
                self._add_calculation(v['calculation'], dataset, period)
        if 'query' in v:
            query_string = json.dumps(v['query'])
            template = env.from_string(query_string)
            query_string = template.render(period=period)
            v['query'] = json.loads(query_string)
            params['query'] = v['query']
        # if 'count' in v and 'query' in v:
        #     params['count'] = v['count']
        if 'distinct' in v:
            params['distinct'] = v['distinct']
        data = dataset.get_data(format='csv', **params)
        if data.strip() == '':
            # no data to create a dataset - skip
            continue
        # create an aggregate dataset
        aggr_dataset = Dataset(content=data, data_format='csv',
                               connection=self.connection)
        if 'aggregate' in v:
            # check or create calculations
            if isinstance(v['aggregate'], list):
                for calculation in v['aggregate']:
                    calc = aggr_dataset.add_calculation(
                        name=calculation['name'],
                        formula=calculation['formula'])
                    if calc:
                        aggr_ds = aggr_dataset.get_aggregations()['']
                        k = aggr_ds.get_data()
                        val = k[0][calculation['name']]
                        if isinstance(val, basestring):
                            raise ValueError("Dataset %s returned %s"
                                             % (aggr_ds.id, val))
                        sum_value += val
                        aggr_ds.delete()
            if isinstance(v['aggregate'], dict):
                calculation = v['aggregate']
                calc = aggr_dataset.add_calculation(
                    name=calculation['name'],
                    formula=calculation['formula'])
                if calc:
                    aggr_ds = aggr_dataset.get_aggregations()['']
                    k = aggr_ds.get_data()
                    val = k[0][calculation['name']]
                    if isinstance(val, basestring):
                        raise ValueError("Dataset %s returned %s"
                                         % (aggr_ds.id, val))
                    sum_value += val
                    aggr_ds.delete()
        aggr_dataset.delete()
    return sum_value
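# The aggregation round-trip in _get_aggregate(), in miniature: pull a CSV
# slice, re-upload it, aggregate, read the single result back. A sketch;
# 'dataset', 'connection', the 'period' column and the 'amount' formula
# are hypothetical stand-ins:
csv_slice = dataset.get_data(format='csv', query={'period': '2013-01'})
tmp = Dataset(content=csv_slice, data_format='csv', connection=connection)
if tmp.add_calculation(name='total', formula='sum(amount)'):
    aggr_ds = tmp.get_aggregations()['']
    total = aggr_ds.get_data()[0]['total']
    aggr_ds.delete()
tmp.delete()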