Example #1
    def __get_dataset(self):
        name = 'Create Dataset Test'
        acc = self.user.account

        # test create
        dataset = Dataset.create_by_user(self.user, acc.id, name)
        dataset = Dataset.objects.find_one_by_user(self.user, id=dataset.id)
        self.assertTrue(dataset)
        self.assertEqual(dataset.parent_id, acc.id)
        self.assertEqual(dataset.name, name)
        self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)

        with open(CSV_SHORT_FILEPATH) as csv_file:
            filelen = len([1 for _ in csv_file])
            csv_file.seek(0)
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            discovered_schema = data_loader.read_schema()
            dataset.update(schema=discovered_schema)
            finish_data_load.async(self.user, dataset, data_loader)

        self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)
        self.assertEqual(dataset.load_progress, 100)
        self.assertEqual(dataset.data_coll.count(), filelen - 1)  # minus header row
        self.assertEqual(dataset.data_coll.count(), dataset.rows)

        data_item = dataset.data_coll.find_one()
        self.assertTrue({c[KEY_NAME]
                         for c in dataset.schema} <= set(data_item.keys()))

        # TODO: test full data integrity!!!
        return dataset
Example #2
    def test_compute_cardinalities(self):
        csv_file = tempfile.TemporaryFile('w+')
        writer = csv.writer(csv_file, delimiter=CsvDataLoader.TAB)
        writer.writerow(['FName', 'LName', 'SN'])
        writer.writerow(['ram', 'shakya', None])
        writer.writerow(['shyam', 'shrestha', 0])
        writer.writerow(['hari', 'shrestha', 1])
        writer.writerow(['shyam', 'shakya', 2])
        writer.writerow(['hari', 'shakya', 3])
        writer.writerow(['hari', 'shakya', 4])

        csv_file.flush()
        csv_file.seek(0)

        name = "cardinality_test"
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        self.user.account.datasets.add_dataset(self.user, name, data_loader)

        dataset = self.user.account.datasets.get_dataset(self.user, name)
        dataset.reload()

        actual = dataset.cardinalities
        self.assertEqual(sorted(actual.keys()), ['FName', 'LName', 'SN'])

        self.assertEqual(actual['FName']['count'], 3)
        self.assertEqual(sorted(actual['FName']['values']),
                         ['hari', 'ram', 'shyam'])

        self.assertEqual(actual['LName']['count'], 2)
        self.assertEqual(sorted(actual['LName']['values']),
                         ['shakya', 'shrestha'])

        # None and NaN are dropped
        self.assertEqual(actual['SN']['count'], 5)
        self.assertEqual(sorted(actual['SN']['values']), [0, 1, 2, 3, 4])
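The assertions above rely on null values being dropped before distinct values are counted, as the comment notes. A minimal standalone sketch of that behaviour with pandas; the helper name compute_cardinalities and its signature are hypothetical, not taken from the source:

import pandas as pd

def compute_cardinalities(csv_file, sep='\t', max_values=50):
    # Hypothetical helper: for each column, count the distinct non-null values
    # and keep up to max_values of them, mirroring the shape asserted above.
    frame = pd.read_csv(csv_file, sep=sep)
    result = {}
    for column in frame.columns:
        # dropna() removes both None (read as NaN) and NaN before counting
        values = frame[column].dropna().unique().tolist()
        result[column] = {'count': len(values),
                          'values': sorted(values)[:max_values]}
    return result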
Example #3
    def test_schema_parse_max_lines(self):
        import pandas

        acc = self.user.account
        with open(CSV_SHORT_FILEPATH) as csv_file:
            with patch('pandas.read_csv', wraps=pandas.read_csv) as parse_meth:
                name = 'test_schema_analysis'
                # dataset = Dataset.create(acc.id, self.user, 'test_schema_analysis')
                data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
                data_loader.read_schema()

                input_file = parse_meth.call_args[0][0]
                input_file.seek(0)
                filelen = len([1 for _ in input_file])
                self.assertEqual(TEST_MAX_LINES + 1, filelen)
                acc.datasets.delete_dataset(self.user, name)
Example #4
    def test_1k_fields(self):
        from cStringIO import StringIO
        from bson.objectid import ObjectId
        import random

        COLUMNS = 1000
        LAST_COL_IDX = COLUMNS - 1

        csv = StringIO()
        for col in xrange(COLUMNS):
            csv.write(('COLUMN%s' % col))
            csv.write('\t' if col != LAST_COL_IDX else '\n')

        _from = 10 * 1000 * 1000 * 1000
        _to = _from * 2
        for row in (1, 2):
            for col in xrange(COLUMNS):
                last = col == LAST_COL_IDX
                if col < COLUMNS / 2:
                    csv.write('%s' % random.randint(_from, _to))
                else:
                    csv.write(unicode(ObjectId()))
                if not last:
                    csv.write('\t')
                else:
                    csv.write('\n')

        csv.seek(0)
        name = 'Test1000Columns'
        data_loader = CsvDataLoader(csv, sep=CsvDataLoader.TAB)
        dataset = self.user.account.datasets.add_dataset(
            self.user, name, data_loader)
        data_cls = dataset.get_data_class()
        self.assertEqual(data_cls.objects.count(), 2)
Example #5
    def get_data_loader_by_ext(self, input_file, params):
        if input_file.filename.lower().endswith('.csv'):
            return CsvDataLoader(input_file.stream, sep=params['sep'])
        if input_file.filename.lower().endswith('.json'):

            def type_getter(data):
                return data.get('event_type')

            return JsonDataLoader(input_file.stream,
                                  data_type_getter=type_getter)
        raise WrongFileExtension('Only CSV and JSON are supported.')
Example #6
    def create(self, *args, **kwargs):
        csv_file = kwargs['csv_file']
        if not csv_file.filename.endswith('.csv'):
            raise WrongFileExtension(
                'Wrong file extension, only .csv is supported.')

        sep = kwargs['sep']
        data_loader = CsvDataLoader(csv_file.stream, sep=sep)

        manager = getattr(self.user.account, self.ACC_DYN_MANAGER_NAME)
        profile = manager.create(self.user, data_loader)
        return profile.to_dict()
Example #7
    def create(self, *args, **kwargs):
        '''The create endpoint creates a Dataset entity (:status=NEW),
        then performs schema analysis (:status=ANALYZED), then starts an async
        task to load the raw data into the db (:status=LOADING). After ANALYZED
        the user can invoke /dataset/update_schema/<name> to alter the schema
        configuration and then /dataset/sync/<name> to sync the uploaded data.
        '''

        name = kwargs['name']
        csv_file = kwargs['csv_file']
        if not csv_file.filename.endswith('.csv'):
            raise WrongFileExtension(
                'Wrong file extension, only .csv is supported.')

        sep = kwargs['sep']
        data_loader = CsvDataLoader(csv_file.stream, sep=sep)
        acc = self.user.account
        dataset = acc.datasets.add_dataset(self.user, name, data_loader)
        return dataset.to_dict()
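To make the lifecycle described in the docstring concrete, here is a hedged client-side sketch of the call sequence. Only /dataset/update_schema/<name> and /dataset/sync/<name> come from the docstring; the /dataset/create path, the field names, and the base URL are assumptions for illustration:

import requests

BASE_URL = 'https://localhost'   # assumed test server
NAME = 'MyDataset'               # hypothetical dataset name

# 1. create: Dataset entity is created (status=NEW), schema analysis runs
#    (status=ANALYZED), and an async raw-data load starts (status=LOADING)
with open('interactions.csv', 'rb') as csv_file:
    resp = requests.post(BASE_URL + '/dataset/create',   # assumed path
                         data={'name': NAME, 'sep': '\t'},
                         files={'csv_file': csv_file})
dataset = resp.json()['data']

# 2. after ANALYZED, alter the schema configuration if needed
requests.post(BASE_URL + '/dataset/update_schema/%s' % NAME,
              json={'schema': dataset['discovered_schema']})

# 3. sync the already-uploaded data against the edited schema
requests.post(BASE_URL + '/dataset/sync/%s' % NAME, json={})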
Example #8
    def update(self, *args, **kwargs):
        '''Append new data to an existing Dataset. The schema is locked at
        this point, so only data with the same structure may be uploaded.
        Status moves from LOADING to LOADED; :sync_status is
        left IN_SYNC.
        '''

        csv_file = kwargs['csv_file']
        sep = kwargs['sep']
        if not csv_file.filename.endswith('.csv'):
            raise WrongFileExtension(
                'Wrong file extension, only .csv is supported.')

        acc = self.user.account
        dataset = acc.datasets.get_dataset(self.user, kwargs['name'])
        if not dataset:
            return

        data_loader = CsvDataLoader(csv_file.stream, sep=sep)
        acc.datasets.update_dataset(self.user, dataset, data_loader)
        return dataset.to_dict()
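A similarly hedged sketch of the append flow this docstring describes; the /dataset/update/<name> path and field names are assumptions, only the locked-schema and status behaviour come from the docstring:

import requests

BASE_URL = 'https://localhost'   # assumed test server
NAME = 'MyDataset'               # hypothetical, already-synced dataset

# append rows with the same structure; the schema is locked, so a file with a
# different column layout would not fit the existing schema
with open('interactions_extra.csv', 'rb') as csv_file:
    resp = requests.post(BASE_URL + '/dataset/update/%s' % NAME,   # assumed path
                         data={'sep': '\t'},
                         files={'csv_file': csv_file})

# per the docstring, status moves LOADING -> LOADED and :sync_status stays IN_SYNC
updated = resp.json()['data']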
Example #9
    def test_change_id_column(self):
        from solariat_bottle.utils.predictor_events import translate_column

        manager = getattr(self.user.account, self.acc_attr_name)
        with open(CSV_FILEPATH) as csv_file:
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            profile = manager.create(self.user, data_loader)

        copy_cols = ('INTERACTION_ID', 'START_TS', 'END_TS')
        schema = [
            col for col in profile.discovered_schema
            if col[KEY_NAME] in copy_cols
        ]
        id_col_name = 'INTERACTION_ID'
        for col in schema:
            if col[KEY_NAME] == id_col_name:
                col[KEY_IS_ID] = True
                break

        data = self._post('%s/update_schema' % self.ENDPOINT,
                          {'schema': schema},
                          expected_code=201)
        data = self._get('%s/get' % self.ENDPOINT, {})
        id_col = [col for col in data['data']['schema'] if KEY_IS_ID in col][0]
        self.assertEqual(id_col[KEY_NAME], id_col_name)

        self.assertEqual(data['data']['schema'], schema)
        data = self._post('%s/sync/apply' % self.ENDPOINT, {},
                          expected_code=201)

        profile.reload()
        raw_data = profile.data_sync_coll.find_one()
        self.assertEqual(raw_data['_id'], raw_data[id_col_name])

        data = self._post('%s/sync/accept' % self.ENDPOINT, {},
                          expected_code=201)

        profile.reload()
        data = profile.get_data()[0]
        self.assertEqual(data.id, getattr(data, id_col_name))
Example #10
    def test_expression_fields(self):
        name = "ExpressionsTest"
        dataset = self.create_and_load_dataset(name)
        COL_NAME = 'EXP_COLUMN'
        EXP_FIELD = {
            KEY_NAME: COL_NAME,
            KEY_TYPE: TYPE_INTEGER,
            KEY_EXPRESSION: 'INTERACTION_ID % 2',
        }
        schema = dataset.schema + [EXP_FIELD]
        for col in schema:
            if col[KEY_NAME] == 'INTERACTION_ID':
                col[KEY_TYPE] = TYPE_INTEGER
                break

        dataset.update_schema(schema)
        dataset.apply_sync()
        dataset.accept_sync()

        cnt = 0
        for data in dataset.get_data():
            if data.INTERACTION_ID is None:
                continue
            self.assertEqual(data.INTERACTION_ID % 2, data.EXP_COLUMN)
            cnt += 1

        with open(CSV_SHORT_FILEPATH) as csv_file:
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            self.user.account.datasets.update_dataset(self.user, dataset,
                                                      data_loader)

        cnt2 = 0
        for data in dataset.get_data():
            if data.INTERACTION_ID is None:
                continue
            self.assertEqual(data.INTERACTION_ID % 2, data.EXP_COLUMN)
            cnt2 += 1

        self.assertGreater(cnt2, cnt)
Example #11
    def test_event_workflow(self):
        # TODO: check we cannot import data when channel OUT_OF_SYNC

        from solariat_bottle.db.events.event import Event
        from solariat_bottle.schema_data_loaders.csv import CsvDataLoader
        from solariat_bottle.schema_data_loaders.json import JsonDataLoader

        # View code
        channel_type = ChannelType.objects.create_by_user(self.user,
                                                          name='TestType',
                                                          account=self.user.account)
        acc = self.user.account
        web_event_type = acc.event_types.create(self.user, channel_type, name='Web')
        mail_event_type = acc.event_types.create(self.user, channel_type, name='Mail')
        chat_event_type = acc.event_types.create(self.user, channel_type, name='Chat')

        channel_type.is_locked = True
        channel_type.save()
        with self.assertRaises(ChannelTypeIsLocked):
            acc.event_types.create(self.user, channel_type, name='ChannelTypeIsLockedError')

        # discover from csv: event_type as input
        rows_csv = 10
        csv_file = self.get_csv_input_file(size=rows_csv)
        csv_data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.COMMA)
        schema1 = csv_data_loader.read_schema()
        web_event_type.update(discovered_schema=schema1)

        self.assertEqual({col[KEY_NAME] for col in schema1}, set(self.TEST_ITEM_COLUMNS))

        # discover from json: no event_type as input needed
        EVENT_TYPE_MAP = {
            ev.name: ev for ev in (web_event_type, mail_event_type, chat_event_type)
        }
        rows_json = 20
        json_file = self.get_json_input_file(size=rows_json, event_types=('Mail', 'Chat'))

        def event_type_getter(data_item):
            return data_item.get(self.EVENT_TYPE_DATA_FIELD)

        json_data_loader = JsonDataLoader(json_file, event_type_getter)
        schemas_map = json_data_loader.read_schema()

        discover_only_type = None
        for event_type_name, schema in schemas_map.iteritems():
            event_type = EVENT_TYPE_MAP.get(event_type_name)
            if not event_type:
                # TODO: log
                continue
            if discover_only_type and discover_only_type != event_type:
                continue
            event_type.update(discovered_schema=schema)
            self.assertEqual({col[KEY_NAME] for col in schema},
                             set(self.TEST_ITEM_COLUMNS) | {self.EVENT_TYPE_DATA_FIELD})

        # import data
        # case 1: event type has only a discovered schema (or no schema at all)
        #   sync_status: IN_SYNC; insert data into data_coll as-is (raw)
        #
        # case 2: event type has a schema created but not applied
        #   sync_status: OUT_OF_SYNC; do not insert any data
        #
        # case 3: event type has an applied schema
        #   sync_status: IN_SYNC; insert data with the schema
        # (a standalone sketch of these three cases follows this example)

        ChClass = channel_type.get_channel_class()
        channel = ChClass.objects.create_by_user(self.user,
                                                 title='SomeImportChannel',
                                                 channel_type_id=channel_type.id)

        with self.assertRaises(ImproperStateError):
            web_event_type.import_data(self.user, csv_data_loader)

        web_event_type.channel_id = channel.id
        web_event_type.import_data(self.user, csv_data_loader)
        self.assertEqual(web_event_type.data_coll.count(), rows_csv)
        # TODO: test it also like: channel.import_data(event_type=web_event_type)

        # check we have data imported, check accordance to schema

        channel.import_data(self.user, json_data_loader)
        for ev_type in EVENT_TYPE_MAP.values():
            ev_type.reload()

        # import pdb; pdb.set_trace()
        total_imported = rows_csv + rows_json
        self.assertNotEqual(web_event_type.data_coll.count(), total_imported)
        self.assertEqual(web_event_type.all_data_coll.count(), total_imported)
        self.assertEqual(Event.objects.coll.count(), total_imported)

        mail_events_count = mail_event_type.data_coll.count()
        chat_events_count = chat_event_type.data_coll.count()
        self.assertTrue(0 < mail_events_count <= rows_json)
        self.assertEqual(mail_event_type.rows, mail_events_count)

        schema = [dict(col) for col in mail_event_type.discovered_schema]
        bool_col = [col for col in schema if col[KEY_NAME] == 'BoolCol'][0]
        self.assertEqual(bool_col[KEY_TYPE], TYPE_BOOLEAN)
        raw_event = mail_event_type.data_coll.find_one()
        val = raw_event[bool_col[KEY_NAME]]
        self.assertIsInstance(val, bool)

        bool_col[KEY_TYPE] = TYPE_STRING
        mail_event_type.update_schema(schema)
        self.assertEqual(mail_event_type.sync_status, EventType.OUT_OF_SYNC)
        with self.assertRaises(ImproperStateError):
            mail_event_type.import_data(self.user, json_data_loader)

        mail_event_type.apply_sync()
        self.assertEqual(mail_event_type.sync_status, EventType.SYNCED)
        self.assertEqual(mail_event_type.items_synced, mail_events_count)  # should be no errors

        self.assertEqual(Event.objects.coll.count(), total_imported)
        self.assertEqual(mail_event_type.data_sync_coll.count(), mail_events_count)
        self.assertEqual(chat_event_type.data_sync_coll.count(), 0)

        raw_event = mail_event_type.data_sync_coll.find_one()
        val = raw_event[bool_col[KEY_NAME]]
        self.assertIsInstance(val, basestring)

        mail_event_type.accept_sync()
        self.assertEqual(Event.objects.coll.count(), total_imported)
        self.assertEqual(mail_event_type.data_sync_coll.count(), 0)

        raw_event = mail_event_type.data_coll.find_one()
        val = raw_event[bool_col[KEY_NAME]]
        self.assertIsInstance(val, basestring)
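The three import cases listed in the comments inside this test can be read as a small piece of dispatch logic. Below is a hedged, standalone illustration of that decision, not the library's implementation; the helper name and the string return values are hypothetical:

OUT_OF_SYNC = 'OUT_OF_SYNC'   # stand-in for EventType.OUT_OF_SYNC

def choose_import_behaviour(schema, sync_status):
    # case 2: a schema has been created/edited but not applied yet
    if schema and sync_status == OUT_OF_SYNC:
        return 'reject'               # do not insert any data
    # case 3: an applied (accepted) schema exists
    if schema:
        return 'insert_with_schema'   # typed insert into data_coll
    # case 1: only a discovered schema, or no schema at all
    return 'insert_raw'               # raw insert; event type stays IN_SYNC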
Example #12
    def test_events_view_workflow(self):
        from solariat_bottle.utils.predictor_events import translate_column

        acc = self.user.account

        # 1. create
        channel_type_resp = self._post('/channel_type/create',
                                       {KEY_NAME: 'TestChannelType'},
                                       expected_code=201)
        channel_type_name = channel_type_resp['data']['name']

        resp = self._post('/event_type/create',
                          {KEY_NAME: EVENT_TYPE_FLOW_NAME,
                           KEY_PLATFORM: channel_type_name},
                          expected_code=201)

        data = resp['data']
        self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
        self.assertFalse(data['schema'])
        self.assertFalse(data['discovered_schema'])
        self.assertFalse(data['is_locked'])

        # 2. discover schema
        with open(CSV_FILEPATH) as csv_file:
            # TODO: discover schema on json
            post_data = dict(file=(csv_file, CSV_FILENAME),
                             sep=CsvDataLoader.TAB,
                             name=EVENT_TYPE_FLOW_NAME)
            resp = self.client.post(
                '/event_type/discover_schema',
                buffered=True,
                content_type='multipart/form-data',
                data=post_data,
                base_url='https://localhost')

            self.assertEqual(resp.status_code, 201)
            data = json.loads(resp.data)['data']


        self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
        self.assertFalse(data['schema'])
        self.assertTrue(data['discovered_schema'])

        # 3. load data: data can be loaded without a schema; in that case the discovered schema is applied
        channel_type = ChannelType.objects.find_one_by_user(self.user, name=channel_type_name)
        ChClass = channel_type.get_channel_class()
        channel = ChClass.objects.create_by_user(self.user,
                                                 title='ImportingChannel #1',
                                                 channel_type_id=channel_type.id)

        # import customer profile first
        with open(CSV_FILEPATH) as csv_file:
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            profile = self.user.account.customer_profile.create(self.user, data_loader)

        self.assertTrue(profile.discovered_schema)
        self.assertFalse(profile.schema)
        schema = [dict(col) for col in profile.discovered_schema]
        actor_id_col = [col for col in schema if col[KEY_NAME] == KEY_ACTOR_ID][0]
        actor_id_col[KEY_IS_ID] = True
        profile.update_schema(schema)
        profile.apply_sync()
        profile.accept_sync()

        with open(CSV_FILEPATH) as csv_file:
            # TODO: discover schema on json
            resp = self.client.post(
                '/event_type/import_data',
                buffered=True,
                content_type='multipart/form-data',
                data={
                    KEY_FILE: (csv_file, CSV_FILENAME),
                    'sep': CsvDataLoader.TAB,
                    KEY_CHANNEL_ID: channel.id,
                    KEY_NAME: EVENT_TYPE_FLOW_NAME,
                },
                base_url='https://localhost')

            self.assertEqual(resp.status_code, 201)
            data = json.loads(resp.data)['data']

        self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
        self.assertFalse(data['schema'])
        self.assertEqual(data['rows'], 50)  # TODO: replace with SIZE

        # 4. edit schema based on discovered
        resp = self._post('/event_type/update_schema/%s' % EVENT_TYPE_FLOW_NAME,
                          {'schema': data['discovered_schema']},
                          expected_code=201)
        data = resp['data']
        self.assertEqual(data['sync_status'], EventType.OUT_OF_SYNC)
        self.assertTrue(data['schema'])

        # 5. sync
        resp = self._post('/event_type/sync/apply/%s' % EVENT_TYPE_FLOW_NAME, {},
                          expected_code=201)
        data = resp['data']
        self.assertEqual(data['sync_status'], EventType.SYNCED)

        resp = self._post('/event_type/sync/accept/%s' % EVENT_TYPE_FLOW_NAME, {},
                          expected_code=201)
        data = resp['data']
        self.assertEqual(data['sync_status'], EventType.IN_SYNC)

        # 6. get, list
        resp = self._get('/event_type/get/%s' % EVENT_TYPE_FLOW_NAME, {})
        data = resp['data']
        self.assertEqual(data[KEY_NAME], EVENT_TYPE_FLOW_NAME)
        self.assertTrue(data['id'])

        resp = self._get('/event_type/list', {})
        items = resp['data']
        self.assertIsInstance(items, list)
        self.assertTrue(len(items) >= 1)
        item = [i for i in items if i[KEY_NAME] == EVENT_TYPE_FLOW_NAME]
        self.assertTrue(item)

        resp = self._get('/event_type/list', {KEY_PLATFORM: channel_type.name})
        items = resp['data']
        self.assertIsInstance(items, list)
        self.assertTrue(len(items) >= 1)
Example #13
    def load_dataset(self, dataset):
        with open(CSV_SHORT_FILEPATH) as csv_file:
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            finish_data_load.async(self.user, dataset, data_loader)
Example #14
    def create_and_load_dataset(self, name, set_schema_after_load=True):
        acc = self.user.account
        with open(CSV_SHORT_FILEPATH) as csv_file:
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            dataset = acc.datasets.add_dataset(self.user, name, data_loader)
            return dataset
Example #15
    def test_apply_sync_and_accept_sync(self):
        name = 'TestApplySchema'
        dataset = Dataset.create(self.user.account.id, name)
        self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)
        self.assertEqual(dataset.rows, 0)
        self.assertEqual(dataset.sync_progress, 0)
        self.assertFalse(dataset.schema)

        with open(CSV_FILEPATH) as csv_file:
            raw_items = len([1 for _ in csv_file]) - 1  # minus header row
            csv_file.seek(0)
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            dataset.schema = data_loader.read_schema()
            dataset.save()

        # cannot do csv_file.seek() here because pandas closes the file
        with open(CSV_FILEPATH) as csv_file:
            data_loader.csv_file = csv_file
            finish_data_load.async(self.user, dataset, data_loader)

        self.assertEqual(dataset.rows, raw_items)

        # let's include the case when not all data could be synced
        FAIL_COL_NAME = 'STAT_INI_1'
        # dataset.update_schema(dataset.discovered_schema)
        self.assertTrue(dataset.schema)
        col = [
            col for col in dataset.schema if col[KEY_NAME] == FAIL_COL_NAME
        ][0]
        self.assertEqual(col[KEY_TYPE], TYPE_INTEGER)

        raw_data = dataset.data_coll.find_one()
        dataset.data_coll.update({'_id': raw_data['_id']},
                                 {'$set': {
                                     FAIL_COL_NAME: 'fail'
                                 }})

        # start of sync
        self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)
        dataset.apply_sync()
        self.assertEqual(dataset.sync_status, Dataset.SYNCED)
        self.assertEqual(dataset.sync_progress, 100)
        self.assertEqual(dataset.rows, raw_items)
        self.assertEqual(dataset.data_coll.count(), raw_items)
        d = dataset
        self.assertTrue(d.sync_collection
                        and d.sync_collection != d.mongo_collection)
        items_synced = dataset.items_synced
        self.assertEqual(dataset.data_sync_coll.count(), items_synced)
        self.assertNotEqual(raw_items,
                            items_synced)  # because we manually broke 1 item above
        self.assertTrue(FAIL_COL_NAME in dataset.sync_errors)
        self.assertEqual(len(dataset.sync_errors[FAIL_COL_NAME]), 1)

        # we cannot apply sync a second time since no schema changes were made
        with self.assertRaises(ImproperStateError):
            dataset.apply_sync()

        dataset.accept_sync()
        self.assertEqual(dataset.sync_status, Dataset.IN_SYNC)
        self.assertEqual(dataset.rows, items_synced)
        self.assertEqual(dataset.data_coll.count(), items_synced)
        dataset.drop_data()
Example #16
    def create_dataset(self, name):
        acc = self.user.account
        with open(CSV_FILEPATH) as csv_file:
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            dataset = acc.datasets.add_dataset(self.user, name, data_loader)
            return dataset