def __get_dataset(self):
    name = 'Create Dataset Test'
    acc = self.user.account
    # test create
    dataset = Dataset.create_by_user(self.user, acc.id, name)
    dataset = Dataset.objects.find_one_by_user(self.user, id=dataset.id)
    self.assertTrue(dataset)
    self.assertEqual(dataset.parent_id, acc.id)
    self.assertEqual(dataset.name, name)
    self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)

    with open(CSV_SHORT_FILEPATH) as csv_file:
        filelen = len([1 for _ in csv_file])
        csv_file.seek(0)
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        discovered_schema = data_loader.read_schema()
        dataset.update(schema=discovered_schema)
        finish_data_load.async(self.user, dataset, data_loader)

    self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)
    self.assertEqual(dataset.load_progress, 100)
    self.assertEqual(dataset.data_coll.count(), filelen - 1)  # minus header
    self.assertEqual(dataset.data_coll.count(), dataset.rows)

    data_item = dataset.data_coll.find_one()
    self.assertTrue({c[KEY_NAME] for c in dataset.schema} <= set(data_item.keys()))
    # TODO: test full data integrity!!!
    return dataset
def test_compute_cardinalities(self):
    csv_file = tempfile.TemporaryFile('w+')
    writer = csv.writer(csv_file, delimiter=CsvDataLoader.TAB)
    writer.writerow(['FName', 'LName', 'SN'])
    writer.writerow(['ram', 'shakya', None])
    writer.writerow(['shyam', 'shrestha', 0])
    writer.writerow(['hari', 'shrestha', 1])
    writer.writerow(['shyam', 'shakya', 2])
    writer.writerow(['hari', 'shakya', 3])
    writer.writerow(['hari', 'shakya', 4])
    csv_file.flush()
    csv_file.seek(0)

    name = "cardinality_test"
    data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
    self.user.account.datasets.add_dataset(self.user, name, data_loader)
    dataset = self.user.account.datasets.get_dataset(self.user, name)
    dataset.reload()

    actual = dataset.cardinalities
    self.assertEqual(sorted(actual.keys()), ['FName', 'LName', 'SN'])
    self.assertEqual(actual['FName']['count'], 3)
    self.assertEqual(sorted(actual['FName']['values']), ['hari', 'ram', 'shyam'])
    self.assertEqual(actual['LName']['count'], 2)
    self.assertEqual(sorted(actual['LName']['values']), ['shakya', 'shrestha'])
    # None and NaN are dropped
    self.assertEqual(actual['SN']['count'], 5)
    self.assertEqual(sorted(actual['SN']['values']), [0, 1, 2, 3, 4])
def test_schema_parse_max_lines(self):
    import pandas

    acc = self.user.account
    with open(CSV_SHORT_FILEPATH) as csv_file:
        with patch('pandas.read_csv', wraps=pandas.read_csv) as parse_meth:
            name = 'test_schema_analysis'
            # dataset = Dataset.create(acc.id, self.user, 'test_schema_analysis')
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            data_loader.read_schema()

            input_file = parse_meth.call_args[0][0]
            input_file.seek(0)
            filelen = len([1 for _ in input_file])
            self.assertEqual(TEST_MAX_LINES + 1, filelen)
            acc.datasets.delete_dataset(self.user, name)
def test_1k_fields(self):
    from cStringIO import StringIO
    from bson.objectid import ObjectId
    import random

    COLUMNS = 1000
    LAST_COL_IDX = COLUMNS - 1
    csv = StringIO()
    for col in xrange(COLUMNS):
        csv.write('COLUMN%s' % col)
        csv.write('\t' if col != LAST_COL_IDX else '\n')

    _from = 10 * 1000 * 1000 * 1000
    _to = _from * 2
    for row in (1, 2):
        for col in xrange(COLUMNS):
            last = col == LAST_COL_IDX
            if col < COLUMNS / 2:
                csv.write('%s' % random.randint(_from, _to))
            else:
                csv.write(unicode(ObjectId()))
            csv.write('\t' if not last else '\n')
    csv.seek(0)

    name = 'Test1000Columns'
    data_loader = CsvDataLoader(csv, sep=CsvDataLoader.TAB)
    dataset = self.user.account.datasets.add_dataset(
        self.user, name, data_loader)
    data_cls = dataset.get_data_class()
    self.assertEqual(data_cls.objects.count(), 2)
def get_data_loader_by_ext(self, input_file, params):
    if input_file.filename.lower().endswith('.csv'):
        return CsvDataLoader(input_file.stream, sep=params['sep'])

    if input_file.filename.lower().endswith('.json'):
        def type_getter(data):
            return data.get('event_type')

        return JsonDataLoader(input_file.stream, data_type_getter=type_getter)

    raise WrongFileExtension('Only CSV and JSON are supported.')
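def _example_loader_dispatch(self, request_files):
    # A hedged usage sketch, not part of the original handlers: feeding an
    # uploaded file (a werkzeug FileStorage-like object with .filename and
    # .stream, as used above) through get_data_loader_by_ext. The
    # `request_files` mapping and its 'file' key are hypothetical; the '.json'
    # branch above ignores the separator parameter.
    upload = request_files['file']
    loader = self.get_data_loader_by_ext(upload, {'sep': CsvDataLoader.TAB})
    return loader.read_schema()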
def create(self, *args, **kwargs):
    csv_file = kwargs['csv_file']
    if not csv_file.filename.endswith('.csv'):
        raise WrongFileExtension(
            'Wrong file extension, only .csv is supported.')

    sep = kwargs['sep']
    data_loader = CsvDataLoader(csv_file.stream, sep=sep)
    manager = getattr(self.user.account, self.ACC_DYN_MANAGER_NAME)
    profile = manager.create(self.user, data_loader)
    return profile.to_dict()
def create(self, *args, **kwargs):
    '''The create endpoint creates a Dataset entity (:status=NEW), runs
    schema analysis on it (:status=ANALYZED), then starts an async task
    that loads the raw data into the db (:status=LOADING).

    Once ANALYZED, the user can invoke /dataset/update_schema/<name> to
    alter the schema config, and then /dataset/sync/<name> to sync the
    uploaded data.
    '''
    name = kwargs['name']
    csv_file = kwargs['csv_file']
    if not csv_file.filename.endswith('.csv'):
        raise WrongFileExtension(
            'Wrong file extension, only .csv is supported.')

    sep = kwargs['sep']
    data_loader = CsvDataLoader(csv_file.stream, sep=sep)
    acc = self.user.account
    dataset = acc.datasets.add_dataset(self.user, name, data_loader)
    return dataset.to_dict()
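def _example_create_then_sync(self):
    # A hedged sketch, not part of the original suite: walks the lifecycle the
    # create() docstring describes using only manager/model calls that appear
    # in the surrounding tests (add_dataset, update_schema, apply_sync,
    # accept_sync). The dataset name is illustrative.
    acc = self.user.account
    with open(CSV_SHORT_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        dataset = acc.datasets.add_dataset(self.user, 'ExampleDataset',
                                           data_loader)
    # after analysis/load, edit a column type, then sync and accept
    schema = [dict(col) for col in dataset.schema]
    schema[0][KEY_TYPE] = TYPE_STRING
    dataset.update_schema(schema)
    dataset.apply_sync()
    dataset.accept_sync()
    return dataset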
def update(self, *args, **kwargs):
    '''Append new data to an existing Dataset.

    The schema is locked at this point, so only data with the same
    structure may be uploaded. Status moves from LOADING to LOADED;
    :sync_status stays IN_SYNC.
    '''
    csv_file = kwargs['csv_file']
    sep = kwargs['sep']
    if not csv_file.filename.endswith('.csv'):
        raise WrongFileExtension(
            'Wrong file extension, only .csv is supported.')

    acc = self.user.account
    dataset = acc.datasets.get_dataset(self.user, kwargs['name'])
    if not dataset:
        return

    data_loader = CsvDataLoader(csv_file.stream, sep=sep)
    acc.datasets.update_dataset(self.user, dataset, data_loader)
    return dataset.to_dict()
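def _example_append_rows(self):
    # A hedged sketch, not part of the original suite: appends a second file
    # to an existing dataset, as the update() docstring describes. Both
    # manager calls (get_dataset / update_dataset) appear in the tests in this
    # section; the dataset name is illustrative.
    acc = self.user.account
    dataset = acc.datasets.get_dataset(self.user, 'ExampleDataset')
    with open(CSV_SHORT_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        acc.datasets.update_dataset(self.user, dataset, data_loader)
    dataset.reload()
    return dataset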
def test_change_id_column(self):
    from solariat_bottle.utils.predictor_events import translate_column

    manager = getattr(self.user.account, self.acc_attr_name)
    with open(CSV_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        profile = manager.create(self.user, data_loader)

    copy_cols = ('INTERACTION_ID', 'START_TS', 'END_TS')
    schema = [col for col in profile.discovered_schema
              if col[KEY_NAME] in copy_cols]
    id_col_name = 'INTERACTION_ID'
    for col in schema:
        if col[KEY_NAME] == id_col_name:
            col[KEY_IS_ID] = True
            break

    data = self._post('%s/update_schema' % self.ENDPOINT,
                      {'schema': schema},
                      expected_code=201)
    data = self._get('%s/get' % self.ENDPOINT, {})
    id_col = [col for col in data['data']['schema'] if KEY_IS_ID in col][0]
    self.assertEqual(id_col[KEY_NAME], id_col_name)
    self.assertEqual(data['data']['schema'], schema)

    data = self._post('%s/sync/apply' % self.ENDPOINT, {}, expected_code=201)
    profile.reload()
    raw_data = profile.data_sync_coll.find_one()
    self.assertEqual(raw_data['_id'], raw_data[id_col_name])

    data = self._post('%s/sync/accept' % self.ENDPOINT, {}, expected_code=201)
    profile.reload()
    data = profile.get_data()[0]
    self.assertEqual(data.id, getattr(data, id_col_name))
def test_expression_fields(self):
    name = "ExpressionsTest"
    dataset = self.create_and_load_dataset(name)

    COL_NAME = 'EXP_COLUMN'
    EXP_FIELD = {
        KEY_NAME: COL_NAME,
        KEY_TYPE: TYPE_INTEGER,
        KEY_EXPRESSION: 'INTERACTION_ID % 2',
    }
    schema = dataset.schema + [EXP_FIELD]
    for col in schema:
        if col[KEY_NAME] == 'INTERACTION_ID':
            col[KEY_TYPE] = TYPE_INTEGER
            break

    dataset.update_schema(schema)
    dataset.apply_sync()
    dataset.accept_sync()

    cnt = 0
    for data in dataset.get_data():
        if data.INTERACTION_ID is None:
            continue
        self.assertEqual(data.INTERACTION_ID % 2, data.EXP_COLUMN)
        cnt += 1

    with open(CSV_SHORT_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        self.user.account.datasets.update_dataset(self.user, dataset, data_loader)

    cnt2 = 0
    for data in dataset.get_data():
        if data.INTERACTION_ID is None:
            continue
        self.assertEqual(data.INTERACTION_ID % 2, data.EXP_COLUMN)
        cnt2 += 1
    self.assertGreater(cnt2, cnt)
def test_event_workflow(self):
    # TODO: check we cannot import data when channel is OUT_OF_SYNC
    from solariat_bottle.db.events.event import Event
    from solariat_bottle.schema_data_loaders.csv import CsvDataLoader
    from solariat_bottle.schema_data_loaders.json import JsonDataLoader

    # View code
    channel_type = ChannelType.objects.create_by_user(self.user,
                                                      name='TestType',
                                                      account=self.user.account)
    acc = self.user.account
    web_event_type = acc.event_types.create(self.user, channel_type, name='Web')
    mail_event_type = acc.event_types.create(self.user, channel_type, name='Mail')
    chat_event_type = acc.event_types.create(self.user, channel_type, name='Chat')

    channel_type.is_locked = True
    channel_type.save()
    with self.assertRaises(ChannelTypeIsLocked):
        acc.event_types.create(self.user, channel_type,
                               name='ChannelTypeIsLockedError')

    # discover from csv: event_type given as input
    rows_csv = 10
    csv_file = self.get_csv_input_file(size=rows_csv)
    csv_data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.COMMA)
    schema1 = csv_data_loader.read_schema()
    web_event_type.update(discovered_schema=schema1)
    self.assertEqual({col[KEY_NAME] for col in schema1},
                     set(self.TEST_ITEM_COLUMNS))

    # discover from json: no event_type input needed
    EVENT_TYPE_MAP = {
        ev.name: ev for ev in (web_event_type, mail_event_type, chat_event_type)
    }
    rows_json = 20
    json_file = self.get_json_input_file(size=rows_json,
                                         event_types=('Mail', 'Chat'))

    def event_type_getter(data_item):
        return data_item.get(self.EVENT_TYPE_DATA_FIELD)

    json_data_loader = JsonDataLoader(json_file, event_type_getter)
    schemas_map = json_data_loader.read_schema()

    discover_only_type = None
    for event_type_name, schema in schemas_map.iteritems():
        event_type = EVENT_TYPE_MAP.get(event_type_name)
        if not event_type:
            # TODO: log
            continue
        if discover_only_type and discover_only_type != event_type:
            continue
        event_type.update(discovered_schema=schema)
        self.assertEqual({col[KEY_NAME] for col in schema},
                         set(self.TEST_ITEM_COLUMNS) | {self.EVENT_TYPE_DATA_FIELD})

    # import data
    # case 1: event type has just a discovered schema (or no schema at all)
    #         sync_status: IN_SYNC; insert data into data_coll as-is (raw)
    # case 2: event type has a schema created
    #         sync_status: OUT_OF_SYNC; do not insert any data
    # case 3: event type has an applied schema
    #         sync_status: IN_SYNC; insert data with the schema applied
    ChClass = channel_type.get_channel_class()
    channel = ChClass.objects.create_by_user(self.user,
                                             title='SomeImportChannel',
                                             channel_type_id=channel_type.id)
    with self.assertRaises(ImproperStateError):
        web_event_type.import_data(self.user, csv_data_loader)

    web_event_type.channel_id = channel.id
    web_event_type.import_data(self.user, csv_data_loader)
    self.assertEqual(web_event_type.data_coll.count(), rows_csv)
    # TODO: test it also like: channel.import_data(event_type=web_event_type)

    # check we have data imported, check accordance to schema
    channel.import_data(self.user, json_data_loader)
    for ev_type in EVENT_TYPE_MAP.values():
        ev_type.reload()

    total_imported = rows_csv + rows_json
    self.assertNotEqual(web_event_type.data_coll.count(), total_imported)
    self.assertEqual(web_event_type.all_data_coll.count(), total_imported)
    self.assertEqual(Event.objects.coll.count(), total_imported)

    mail_events_count = mail_event_type.data_coll.count()
    chat_events_count = chat_event_type.data_coll.count()
    self.assertTrue(0 < mail_events_count <= rows_json)
    self.assertEqual(mail_event_type.rows, mail_events_count)

    schema = [dict(col) for col in mail_event_type.discovered_schema]
    bool_col = [col for col in schema if col[KEY_NAME] == 'BoolCol'][0]
    self.assertEqual(bool_col[KEY_TYPE], TYPE_BOOLEAN)
    raw_event = mail_event_type.data_coll.find_one()
    val = raw_event[bool_col[KEY_NAME]]
    self.assertIsInstance(val, bool)

    bool_col[KEY_TYPE] = TYPE_STRING
    mail_event_type.update_schema(schema)
    self.assertEqual(mail_event_type.sync_status, EventType.OUT_OF_SYNC)
    with self.assertRaises(ImproperStateError):
        mail_event_type.import_data(self.user, json_data_loader)

    mail_event_type.apply_sync()
    self.assertEqual(mail_event_type.sync_status, EventType.SYNCED)
    self.assertEqual(mail_event_type.items_synced, mail_events_count)  # should be no errors
    self.assertEqual(Event.objects.coll.count(), total_imported)
    self.assertEqual(mail_event_type.data_sync_coll.count(), mail_events_count)
    self.assertEqual(chat_event_type.data_sync_coll.count(), 0)
    raw_event = mail_event_type.data_sync_coll.find_one()
    val = raw_event[bool_col[KEY_NAME]]
    self.assertIsInstance(val, basestring)

    mail_event_type.accept_sync()
    self.assertEqual(Event.objects.coll.count(), total_imported)
    self.assertEqual(mail_event_type.data_sync_coll.count(), 0)
    raw_event = mail_event_type.data_coll.find_one()
    val = raw_event[bool_col[KEY_NAME]]
    self.assertIsInstance(val, basestring)
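def _example_import_state_machine(self, event_type, loader):
    # A hedged sketch, not part of the original suite, condensing the three
    # import cases listed in test_event_workflow. All calls used here
    # (import_data, update_schema, apply_sync, accept_sync) appear above;
    # `event_type` is assumed to be bound to a channel and `loader` to be any
    # schema data loader.
    # case 1: only a discovered schema -> IN_SYNC, raw rows land in data_coll
    event_type.import_data(self.user, loader)
    # case 2: schema edited -> OUT_OF_SYNC, further imports are rejected
    schema = [dict(col) for col in event_type.discovered_schema]
    event_type.update_schema(schema)
    with self.assertRaises(ImproperStateError):
        event_type.import_data(self.user, loader)
    # case 3: schema applied and accepted -> IN_SYNC again, imports resume
    event_type.apply_sync()
    event_type.accept_sync()
    event_type.import_data(self.user, loader)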
def test_events_view_workflow(self):
    from solariat_bottle.utils.predictor_events import translate_column

    acc = self.user.account
    # 1. create
    channel_type_resp = self._post('/channel_type/create',
                                   {KEY_NAME: 'TestChannelType'},
                                   expected_code=201)
    channel_type_name = channel_type_resp['data']['name']
    resp = self._post('/event_type/create',
                      {KEY_NAME: EVENT_TYPE_FLOW_NAME,
                       KEY_PLATFORM: channel_type_name},
                      expected_code=201)
    data = resp['data']
    self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
    self.assertFalse(data['schema'])
    self.assertFalse(data['discovered_schema'])
    self.assertFalse(data['is_locked'])

    # 2. discover schema
    with open(CSV_FILEPATH) as csv_file:
        # TODO: discover schema on json
        post_data = dict(file=(csv_file, CSV_FILENAME),
                         sep=CsvDataLoader.TAB,
                         name=EVENT_TYPE_FLOW_NAME)
        resp = self.client.post('/event_type/discover_schema',
                                buffered=True,
                                content_type='multipart/form-data',
                                data=post_data,
                                base_url='https://localhost')
    self.assertEqual(resp.status_code, 201)
    data = json.loads(resp.data)['data']
    self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
    self.assertFalse(data['schema'])
    self.assertTrue(data['discovered_schema'])

    # 3. load data: data can be loaded without a schema; in that case the
    # discovered schema is applied
    channel_type = ChannelType.objects.find_one_by_user(self.user,
                                                        name=channel_type_name)
    ChClass = channel_type.get_channel_class()
    channel = ChClass.objects.create_by_user(self.user,
                                             title='ImportingChannel #1',
                                             channel_type_id=channel_type.id)

    # import customer profile first
    with open(CSV_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        profile = self.user.account.customer_profile.create(self.user, data_loader)
    self.assertTrue(profile.discovered_schema)
    self.assertFalse(profile.schema)
    schema = [dict(col) for col in profile.discovered_schema]
    actor_id_col = [col for col in schema if col[KEY_NAME] == KEY_ACTOR_ID][0]
    actor_id_col[KEY_IS_ID] = True
    profile.update_schema(schema)
    profile.apply_sync()
    profile.accept_sync()

    with open(CSV_FILEPATH) as csv_file:
        # TODO: discover schema on json
        resp = self.client.post(
            '/event_type/import_data',
            buffered=True,
            content_type='multipart/form-data',
            data={
                KEY_FILE: (csv_file, CSV_FILENAME),
                'sep': CsvDataLoader.TAB,
                KEY_CHANNEL_ID: channel.id,
                KEY_NAME: EVENT_TYPE_FLOW_NAME,
            },
            base_url='https://localhost')
    self.assertEqual(resp.status_code, 201)
    data = json.loads(resp.data)['data']
    self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
    self.assertFalse(data['schema'])
    self.assertEqual(data['rows'], 50)  # TODO: replace with SIZE

    # 4. edit schema based on the discovered one
    resp = self._post('/event_type/update_schema/%s' % EVENT_TYPE_FLOW_NAME,
                      {'schema': data['discovered_schema']},
                      expected_code=201)
    data = resp['data']
    self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
    self.assertTrue(data['schema'])

    # 5. sync
    resp = self._post('/event_type/sync/apply/%s' % EVENT_TYPE_FLOW_NAME,
                      {}, expected_code=201)
    data = resp['data']
    self.assertEqual(data['sync_status'], EventType.SYNCED)
    resp = self._post('/event_type/sync/accept/%s' % EVENT_TYPE_FLOW_NAME,
                      {}, expected_code=201)
    data = resp['data']
    self.assertEqual(data['sync_status'], EventType.IN_SYNC)

    # 6. get, list
    resp = self._get('/event_type/get/%s' % EVENT_TYPE_FLOW_NAME, {})
    data = resp['data']
    self.assertEqual(data[KEY_NAME], EVENT_TYPE_FLOW_NAME)
    self.assertTrue(data['id'])

    resp = self._get('/event_type/list', {})
    items = resp['data']
    self.assertIsInstance(items, list)
    self.assertTrue(len(items) >= 1)
    item = [i for i in items if i[KEY_NAME] == EVENT_TYPE_FLOW_NAME]
    self.assertTrue(item)

    resp = self._get('/event_type/list', {KEY_PLATFORM: channel_type.name})
    items = resp['data']
    self.assertIsInstance(items, list)
    self.assertTrue(len(items) >= 1)
def load_dataset(self, dataset):
    with open(CSV_SHORT_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        finish_data_load.async(self.user, dataset, data_loader)
def create_and_load_dataset(self, name, set_schema_after_load=True):
    acc = self.user.account
    with open(CSV_SHORT_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        dataset = acc.datasets.add_dataset(self.user, name, data_loader)
    return dataset
def test_apply_sync_and_accept_sync(self):
    name = 'TestApplySchema'
    dataset = Dataset.create(self.user.account.id, name)
    self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)
    self.assertEqual(dataset.rows, 0)
    self.assertEqual(dataset.sync_progress, 0)
    self.assertFalse(dataset.schema)

    with open(CSV_FILEPATH) as csv_file:
        raw_items = len([1 for _ in csv_file]) - 1  # minus header
        csv_file.seek(0)
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        dataset.schema = data_loader.read_schema()
        dataset.save()

    # cannot just csv_file.seek(0) because pandas closes the file
    with open(CSV_FILEPATH) as csv_file:
        data_loader.csv_file = csv_file
        finish_data_load.async(self.user, dataset, data_loader)
    self.assertEqual(dataset.rows, raw_items)

    # include the case when not all data can be synced
    FAIL_COL_NAME = 'STAT_INI_1'
    # dataset.update_schema(dataset.discovered_schema)
    self.assertTrue(dataset.schema)
    col = [col for col in dataset.schema if col[KEY_NAME] == FAIL_COL_NAME][0]
    self.assertEqual(col[KEY_TYPE], TYPE_INTEGER)
    raw_data = dataset.data_coll.find_one()
    dataset.data_coll.update({'_id': raw_data['_id']},
                             {'$set': {FAIL_COL_NAME: 'fail'}})

    # start of sync
    self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)
    dataset.apply_sync()
    self.assertEqual(dataset.sync_status, Dataset.SYNCED)
    self.assertEqual(dataset.sync_progress, 100)
    self.assertEqual(dataset.rows, raw_items)
    self.assertEqual(dataset.data_coll.count(), raw_items)
    d = dataset
    self.assertTrue(d.sync_collection and d.sync_collection != d.mongo_collection)

    items_synced = dataset.items_synced
    self.assertEqual(dataset.data_sync_coll.count(), items_synced)
    self.assertNotEqual(raw_items, items_synced)  # because we fail 1 item manually
    self.assertTrue(FAIL_COL_NAME in dataset.sync_errors)
    self.assertEqual(len(dataset.sync_errors[FAIL_COL_NAME]), 1)

    # we cannot apply once more since no changes to the schema were made
    with self.assertRaises(ImproperStateError):
        dataset.apply_sync()

    dataset.accept_sync()
    self.assertEqual(dataset.sync_status, Dataset.IN_SYNC)
    self.assertEqual(dataset.rows, items_synced)
    self.assertEqual(dataset.data_coll.count(), items_synced)
    dataset.drop_data()
def create_dataset(self, name):
    acc = self.user.account
    with open(CSV_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        dataset = acc.datasets.add_dataset(self.user, name, data_loader)
    return dataset