def setUp(self):
     """Create an empty work trails repository."""
     # Create fresh set of directories
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     os.mkdir(SERVER_DIR)
     self.filestore = FileSystemFilestore(FILESERVER_DIR)
 def test_download_dataset(self):
     """Test loading a dataset from Url. Note that this test depends on the
     accessed web service to be running. It will fail otherwise."""
     # Skip the test if DOWNLOAD_URL is not set
     if DOWNLOAD_URL is None:
         self.skipTest('DOWNLOAD_URL is not set')
     store = FileSystemDatastore(STORE_DIR)
     ds = store.download_dataset(url=DOWNLOAD_URL)
     dataset_dir = os.path.join(STORE_DIR, ds.identifier)
     self.assertTrue(os.path.isdir(dataset_dir))
     self.assertTrue(os.path.isfile(os.path.join(dataset_dir, DATA_FILE)))
     self.assertTrue(os.path.isfile(os.path.join(dataset_dir, DESCRIPTOR_FILE)))
     self.assertFalse(os.path.isfile(os.path.join(dataset_dir, METADATA_FILE)))
     self.validate_class_size_dataset(ds)
     # Download file into a given filestore
     fs = FileSystemFilestore(FSSTORE_DIR)
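     # When a filestore is passed, download_dataset returns a pair of the
     # dataset handle and a handle for the downloaded file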
     ds, fh = store.download_dataset(
         url=DOWNLOAD_URL,
         filestore=fs
     )
     self.validate_class_size_dataset(ds)
     self.assertEqual(len(fs.list_files()), 1)
     self.assertIsNotNone(fh)
     self.assertIsNotNone(fs.get_file(fh.identifier))
Example #3
 def setUp(self):
     """Create empty server directory."""
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     os.mkdir(SERVER_DIR)
     self.fileserver = FileSystemFilestore(FILESERVER_DIR)
     self.db = MimirDatastore(DATASTORE_DIRECTORY)
Example #4
 def setUp(self):
     """Create instances of the default datastore and filestore."""
     # Drop directory if it exists
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     os.makedirs(SERVER_DIR)
     self.datastore = FileSystemDatastore(DATASTORE_DIR)
     self.filestore = FileSystemFilestore(FILESTORE_DIR)
Example #5
 def test_upload_stream(self):
     """Test file upload from an open file object."""
     db = FileSystemFilestore(SERVER_DIR)
     file = FileStorage(filename=CSV_FILE)
     fh = db.upload_stream(file=file, file_name=os.path.basename(CSV_FILE))
     self.assertEqual(fh.file_name, os.path.basename(CSV_FILE))
     self.assertEqual(fh.mimetype, fs.FORMAT_CSV)
     self.assertTrue(os.path.isfile(fh.filepath))
     self.assertEqual(fh.identifier, db.get_file(fh.identifier).identifier)
Example #6
 def setUp(self):
     """Create an instance of the Mimir processor for an empty server
     directory.
     """
     # Drop directory if it exists
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     os.makedirs(SERVER_DIR)
     self.datastore = MimirDatastore(DATASTORE_DIR)
     self.filestore = FileSystemFilestore(FILESTORE_DIR)
Example #7
 def setUp(self):
     """Create an instance of the default vizier processor for an empty server
     directory.
     """
     # Drop directory if it exists
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     os.makedirs(SERVER_DIR)
     self.processor = VizualTaskProcessor(api=DefaultVizualApi())
     self.datastore = FileSystemDatastore(DATASTORE_DIR)
     self.filestore = FileSystemFilestore(FILESTORE_DIR)
Example #8
 def set_up(self, engine):
     """Create an empty file server repository."""
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     os.mkdir(SERVER_DIR)
     # Setup file server
     self.fs = FileSystemFilestore(FILESERVER_DIR)
     # Setup the respective datastore and Vizual engine
     if engine == ENGINEENV_DEFAULT:
         self.datastore = FileSystemDatastore(DATASTORE_DIR)
     elif engine == ENGINEENV_MIMIR:
         self.datastore = MimirDatastore(DATASTORE_DIR)
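Example #9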
 def setUp(self):
     """Create an instance of the Mimir processor for an empty server
     directory.
     """
     # Drop directory if it exists
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     os.makedirs(SERVER_DIR)
     self.processor = MimirProcessor()
     self.datastore = MimirDatastore(DATASTORE_DIR)
     self.filestore = FileSystemFilestore(FILESTORE_DIR)
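      # Record the lens types supported by this Mimir instance so individual
      # tests can skip lenses that are not available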
     self.available_lenses = set(mimir.getAvailableLensTypes())
Example #10
 def test_get_file(self):
     """Test file get method."""
     db = FileSystemFilestore(SERVER_DIR)
     fh1 = db.upload_file(CSV_FILE)
     fh2 = db.get_file(fh1.identifier)
     self.assertEqual(fh1.identifier, fh2.identifier)
     self.assertEqual(fh1.filepath, fh2.filepath)
     self.assertEqual(fh1.mimetype, fh2.mimetype)
     # Ensure that the file parses as a CSV file
     with fh1.open() as csvfile:
         rows = 0
         for row in csv.reader(csvfile, delimiter=fh1.delimiter):
             rows += 1
     self.assertEqual(rows, 3)
Example #11
class TestMimirDatasetAnnotations(unittest.TestCase):
    def setUp(self):
        """Create empty server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIRECTORY)

    def tearDown(self):
        """Delete server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_dataset_annotations(self):
        """Run test for Mimir datastore."""
        dh = self.db.load_dataset(
            f_handle=self.fileserver.upload_file(DATA_FILE))
        ds = self.db.get_dataset(dh.identifier)
        rows = ds.fetch_rows()
        print(ds.row_ids)
        for row in rows:
            print(str(row.identifier) + '\t' + str(row.values))
        for row_id in ds.row_ids:
            for anno in ds.get_annotations(column_id=1, row_id=row_id):
                print(str(row_id) + '\t' + anno.key + '=' + str(anno.value))
Example #12
class TestUnicodeHandling(unittest.TestCase):

    def setUp(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.filestore = FileSystemFilestore(FILESERVER_DIR)

    def tearDown(self):
        """Clean-up by dropping the MongoDB colelction used by the engine.
        """
        # Delete directories
        # if os.path.isdir(SERVER_DIR):
        #     shutil.rmtree(SERVER_DIR)

    @unittest.skip("FS Datastore doesn't like funky unicode")
    def test_default_config(self):
        """Run workflow with default configuration."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        self.run_workflow(FileSystemDatastore(DATASTORE_DIR))

    @unittest.skip("Blocked on https://github.com/UBOdin/mimir-api/issues/9")
    def test_mimir_config(self):
        """Run workflows for Mimir configurations."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        import vizier.mimir as mimir
        self.run_workflow(MimirDatastore(DATASTORE_DIR))

    def run_workflow(self, datastore):
        """Test functionality to execute a Python script that creates a dataset
        containing unicode characters."""
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = datastore.load_dataset(f_handle)
        # RUN Python Script
        cmd = python_cell(
            source=PYTHON_SCRIPT,
            validate=True
        )
        result = PyCellTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                datastore=datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}
            )
        )
        self.assertTrue(result.is_success)
        # print(wf.modules[-1].stdout[0]['data'])
        ds = datastore.get_dataset(result.provenance.write[DATASET_NAME].identifier)
        names = set(c.name.upper().replace('_', ' ') for c in ds.columns)
        self.assertEqual(len(names), 4)
        for name in ['DATE', 'IMO CODE', 'PORT OF DEPARTURE', 'PLACE OF RECEIPT']:
            self.assertTrue(name in names)
Example #13
class TestSQLProcessor(unittest.TestCase):
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_run_sql_query(self):
        """Test running a SQL query without materializing the result."""
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        cmd = sql_cell(source='SELECT grade_or_service_category FROM ' +
                       DATASET_NAME + ' WHERE program = \'GENERAL EDUCATION\'',
                       validate=True)
        result = SQLTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(datasets={DATASET_NAME: ds.identifier},
                                datastore=self.datastore,
                                filestore=self.filestore))
        self.assertTrue(result.is_success)
        self.assertIsNone(result.provenance.read)
        self.assertIsNone(result.provenance.write)
        self.assertTrue(len(result.outputs.stdout) > 0)
        self.assertEqual(len(result.outputs.stderr), 0)
        # Materialize result
        cmd = sql_cell(source='SELECT grade_or_service_category FROM ' +
                       DATASET_NAME + ' WHERE program = \'GENERAL EDUCATION\'',
                       output_dataset='ge',
                       validate=True)
        result = SQLTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(datasets={DATASET_NAME: ds.identifier},
                                datastore=self.datastore,
                                filestore=self.filestore))
        self.assertTrue(result.is_success)
        self.assertIsNone(result.provenance.read)
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue('ge' in result.provenance.write)
        self.assertTrue(len(result.outputs.stdout) > 0)
        self.assertEqual(len(result.outputs.stderr), 0)
Example #14
 def test_load_dataset(self):
     """Test validation of load dataset command."""
     db = FileSystemFilestore(SERVER_DIR)
     fh = db.upload_file(CSV_FILE)
     cmd = load_dataset(dataset_name='ds',
                        file={
                            pckg.FILE_ID: fh.identifier,
                            pckg.FILE_NAME: fh.file_name
                        },
                        validate=True).to_external_form(command=PACKAGE.get(
                            vizual.VIZUAL_LOAD),
                                                        datasets=DATASETS)
     self.assertEqual(cmd, 'LOAD DATASET ds FROM ' + fh.file_name)
     cmd = load_dataset(dataset_name='ds',
                        file={
                            pckg.FILE_URL: 'http://some.file.url'
                        },
                        validate=True).to_external_form(command=PACKAGE.get(
                            vizual.VIZUAL_LOAD),
                                                        datasets=DATASETS)
     self.assertEqual(cmd, 'LOAD DATASET ds FROM http://some.file.url')
     cmd = load_dataset(dataset_name='ds',
                        file={
                            pckg.FILE_ID: fh.identifier,
                            pckg.FILE_URL: 'http://some.file.url'
                        },
                        validate=True).to_external_form(command=PACKAGE.get(
                            vizual.VIZUAL_LOAD),
                                                        datasets=DATASETS)
     self.assertEqual(cmd, 'LOAD DATASET ds FROM http://some.file.url')
     cmd = load_dataset(dataset_name='ds',
                        file={
                            pckg.FILE_ID: 'Some File'
                        },
                        validate=True).to_external_form(command=PACKAGE.get(
                            vizual.VIZUAL_LOAD),
                                                        datasets=DATASETS)
     self.assertEqual(cmd, 'LOAD DATASET ds FROM \'Some File\'')
Example #15
    def get_filestore(self, identifier):
        """Get the filestore instance for the project with the given identifier.

        Parameters
        ----------
        identifier: string
            Unique identifier for filestore

        Returns
        -------
        vizier.filestore.fs.base.FileSystemFilestore
        """
        filestore_dir = os.path.join(self.base_path, identifier)
        return FileSystemFilestore(filestore_dir)
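
    # A minimal usage sketch as comments. The factory class name and base path
    # below are hypothetical assumptions, not part of the source above:
    #
    #   factory = FilestoreFactory(base_path='/tmp/vizier/fs')
    #   filestore = factory.get_filestore('project-1')
    #   # filestore is a FileSystemFilestore rooted at /tmp/vizier/fs/project-1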
Example #16
 def test_delete_file(self):
     """Test delete file method."""
     db = FileSystemFilestore(SERVER_DIR)
     f = db.upload_file(CSV_FILE)
     f = db.get_file(f.identifier)
     self.assertIsNotNone(f)
     self.assertTrue(db.delete_file(f.identifier))
     f = db.get_file(f.identifier)
     self.assertIsNone(f)
Example #17
class TestDefaultVizualProcessor(unittest.TestCase):
    def setUp(self):
        """Create an instance of the default vizier processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def count_non_null_values(self, data, column_index):
        """Return the number of values in a column that are not None."""
        count = 0
        for row in data:
            if row[column_index] is not None:
                count += 1
        return count

    def test_query(self):
        """Test running a query for simple chart plots."""
        ds = self.datastore.load_dataset(self.filestore.upload_file(LOAD_FILE))
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, range_start=25, range_end=30)
        view.add_series(0, range_start=25, range_end=30)
        view.add_series(3, range_start=25, range_end=30)
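        # The chart's data rows are the first element of the exec_query
        # result, hence the [0] below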
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 6)
        for row in data:
            self.assertEqual(len(row), 3)
        self.assertTrue(isinstance(data[0][0], int))
        self.assertTrue(isinstance(data[0][1], float))
        # Remove interval end for one series. This should return all rows
        # starting from index 25
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, range_start=25, range_end=30)
        view.add_series(0, range_start=25)
        view.add_series(3, range_start=25, range_end=30)
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 29)
        self.assertIsNone(data[28][0])
        self.assertIsNotNone(data[28][1])
        self.assertIsNone(data[28][2])
        for row in data:
            self.assertEqual(len(row), 3)
        # Remove interval start for another series. The first series will
        # contain 31 values, the second 29, and the third 6
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, range_end=30)
        view.add_series(0, range_start=25)
        view.add_series(3, range_start=25, range_end=30)
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 31)
        self.assertEqual(self.count_non_null_values(data, 0), 31)
        self.assertEqual(self.count_non_null_values(data, 1), 29)
        self.assertEqual(self.count_non_null_values(data, 2), 6)
        for row in data:
            self.assertEqual(len(row), 3)
        # Without any range constraints the result should contain all 54 rows
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, label='A')
        view.add_series(0, label='B')
        view.add_series(3)
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 54)
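Example #18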
class TestVizierClient(unittest.TestCase):
    def setUp(self):
        """Create instances of the default datastore and filestore."""
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_new_dataset(self):
        """Test creating and updating a new dataset via the client."""
        client = VizierDBClient(datastore=self.datastore, datasets=dict())
        ds = DatasetClient()
        ds.insert_column('Name')
        ds.insert_column('Age')
        ds.insert_row(['Alice', '23'])
        ds.insert_row(['Bob', '25'])
        rows = ds.rows
        ds = client.create_dataset('MyDataset', ds)
        # Ensure the returned dataset contains the input data
        self.assertEqual([c.name for c in ds.columns], ['Name', 'Age'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '25'])
        # Modify the reference to the original rows to ensure that the rows in
        # the loaded datasets are not affected
        self.assertEqual([str(v) for v in rows[0].values], ['Alice', '23'])
        rows[0].set_value(0, 'Jane')
        self.assertEqual([str(v) for v in rows[0].values], ['Jane', '23'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        # Update dataset
        ds.rows[1].set_value('Age', '26')
        client.update_dataset('MyDataset', ds)
        ds = client.get_dataset('MyDataset')
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '26'])
        # Value error when creating dataset with existing name
        with self.assertRaises(ValueError):
            client.create_dataset('MyDataset', ds)
        # Value error when retrieving unknown dataset
        with self.assertRaises(ValueError):
            client.get_dataset('SomeDataset')
        # Rename the dataset; it should then be retrievable under its new name
        client.rename_dataset('MyDataset', 'SomeDataset')
        ds = client.get_dataset('SomeDataset')
        # Ensure that access to unknown datasets is recorded
        with self.assertRaises(ValueError):
            client.get_dataset('ThisIsNotADataset')
        for name in ['somedataset', 'mydataset']:
            self.assertTrue(name in client.read)
            self.assertTrue(name in client.write)
        self.assertTrue('thisisnotadataset' in client.read)
        self.assertFalse('thisisnotadataset' in client.write)

    def test_dataset_annotations(self):
        """Test creating and updating an existing dataset via the client."""
        # Move columns around
        ds = self.datastore.load_dataset(self.filestore.upload_file(CSV_FILE))
        client = VizierDBClient(datastore=self.datastore,
                                datasets={DATASET_NAME: ds.identifier})
        ds = client.get_dataset(DATASET_NAME)
        annotations = ds.annotations
        annotations.add(key='comment', value='Good', column_id=0, row_id=1)
        annotations.add(key='comment', value='Good', column_id=1, row_id=1)
        annotations.add(key='quality', value='Nice', column_id=0, row_id=1)
        ds = client.update_dataset(name=DATASET_NAME, dataset=ds)
        self.assertEqual(len(ds.annotations.cells), 3)
        ds = client.get_dataset(DATASET_NAME)
        self.assertEqual(len(ds.annotations.cells), 3)
        row = ds.rows[1]
        annotations = row.annotations(0)
        for key in ['comment', 'quality']:
            self.assertTrue(key in list(annotations.keys()))
        annotations = row.annotations(1)
        self.assertTrue('comment' in list(annotations.keys()))
        self.assertFalse('quality' in list(annotations.keys()))
        row.set_value(0, 'New Value', clear_annotations=True)
        self.assertEqual(len(ds.annotations.cells), 1)
        ds = client.update_dataset(name=DATASET_NAME, dataset=ds)
        self.assertEqual(len(ds.annotations.cells), 1)
        ds = client.get_dataset(DATASET_NAME)
        self.assertEqual(len(ds.annotations.cells), 1)

    def test_update_existing_dataset(self):
        """Test creating and updating an existing dataset via the client."""
        # Move columns around
        ds = self.datastore.load_dataset(self.filestore.upload_file(CSV_FILE))
        client = VizierDBClient(datastore=self.datastore,
                                datasets={DATASET_NAME: ds.identifier})
        ds = client.get_dataset(DATASET_NAME)
        col_1 = [row.get_value(1) for row in ds.rows]
        ds.insert_column('empty', 2)
        ds = client.update_dataset(DATASET_NAME, ds)
        col_2 = [row.get_value(2) for row in ds.rows]
        ds.move_column('empty', 1)
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.values[1], col_2[i])
            self.assertEqual(row.values[2], col_1[i])
        # Rename
        ds.columns[1].name = 'allnone'
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.get_value('allnone'), col_2[i])
            self.assertEqual(row.values[2], col_1[i])
        # Insert row
        row = ds.insert_row()
        row.set_value('Name', 'Zoe')
        ds = client.create_dataset('upd', ds)
        self.assertEqual(len(ds.rows), 3)
        r2 = ds.rows[2]
        self.assertEqual(r2.identifier, 2)
        self.assertEqual(r2.values, ['Zoe', None, None, None])
        # Annotations
        ds = client.get_dataset(DATASET_NAME)
        col = ds.get_column('Age')
        row = ds.rows[0]
        ds.annotations.add(column_id=col.identifier,
                           row_id=row.identifier,
                           key='user:comment',
                           value='My Comment')
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 1)
        anno = annotations[0]
        self.assertEqual(anno.key, 'user:comment')
        self.assertEqual(anno.value, 'My Comment')
        ds.annotations.add(column_id=col.identifier,
                           row_id=row.identifier,
                           key='user:comment',
                           value='Another Comment')
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 2)
        self.assertEqual(list(ds.rows[0].annotations('Age').keys()),
                         ['user:comment'])
        values = [a.value for a in annotations]
        for val in ['My Comment', 'Another Comment']:
            self.assertTrue(val in values)
        anno = ds.rows[0].annotations('Age').find_one('user:comment')
        anno.key = 'user:issue'
        anno.value = 'Some Issue'
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 1)
        keys = list(ds.rows[0].annotations('Age').keys())
        for key in ['user:comment', 'user:issue']:
            self.assertTrue(key in keys)
        values = [
            a.value
            for a in ds.rows[0].annotations('Age').find_all('user:issue')
        ]
        for val in ['Some Issue']:
            self.assertTrue(val in values)
        ds.annotations.remove(
            column_id=col.identifier,
            row_id=row.identifier,
            key='user:issue',
        )
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:issue')
        self.assertEqual(len(annotations), 0)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 1)
        # Delete column
        ds = client.get_dataset(DATASET_NAME)
        ds.delete_column('Age')
        client.update_dataset(DATASET_NAME, ds)
        ds = client.get_dataset(DATASET_NAME)
        names = [col.name.upper() for col in ds.columns]
        self.assertTrue('NAME' in names)
        self.assertFalse('AGE' in names)
        self.assertTrue(DATASET_NAME in client.read)
        self.assertTrue(DATASET_NAME in client.write)
        self.assertFalse('upd' in client.read)
        self.assertTrue('upd' in client.write)
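Example #19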
class TestMimirProcessor(unittest.TestCase):
    """Individual test for Mimir lenses. Run separately since each test has to
    initialize and shout down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)
        self.available_lenses = set(mimir.getAvailableLensTypes())

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def compute_lens_result(self, ds, command):
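        """Helper: run the given lens command through the Mimir processor
        against dataset ds and return the task result."""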
        return self.processor.compute(command_id=command.command_id,
                                      arguments=command.arguments,
                                      context=TaskContext(
                                          project_id=1,
                                          datastore=self.datastore,
                                          filestore=self.filestore,
                                          artifacts={DATASET_NAME: ds}))

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        if lens_types.MIMIR_GEOCODE not in self.available_lenses:
            self.skipTest("Mimir Geocoding Lens not initialized.")
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode Lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)

        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)

        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_key_repair(DATASET_NAME,
                                       ds1.column_by_name('Empid').identifier)
        result = self.compute_lens_result(ds1, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(ds.row_count, 2)
        names = set()
        empids = set()
        for row in ds.fetch_rows():
            empids.add(int(row.values[0]))
            names.add(row.values[1])
        self.assertTrue(1 in empids)
        self.assertTrue('Alice' in names or 'Bob' in names)
        self.assertFalse('Alice' in names and 'Bob' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        with self.assertRaises(ValueError):
            command = cmd.mimir_key_repair('MY DS', 'MY COL')
            result = self.compute_lens_result(ds, command)

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier
            }])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # MISSING VALUE Lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        print(rows[2].values)
        # we shouldn't be imputing a value lower than the minimum value in the dataset
        self.assertTrue(rows[2].values[ds.column_index('Age')] >= 23)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        # Depending on implementation this could be either 22 or 24, as there are two rows
        # with missing values for the key column.  Currently, Mimir discards such rows, but
        # if this suddenly turns into a 24, that's not incorrect either.
        self.assertEqual(len(rows), 22)
        command = cmd.mimir_missing_key(DATASET_NAME,
                                        ds.column_by_name('Salary').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 31)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        # print(columns)
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('AGE_1' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }],
            pick_as='My_Column')
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('MY_COLUMN' in columns)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)
Example #20
 def setup_fileserver(self):
     """Create a fresh file server."""
     if os.path.isdir(FILESERVER_DIR):
         shutil.rmtree(FILESERVER_DIR)
     os.mkdir(FILESERVER_DIR)
     self.fileserver = FileSystemFilestore(FILESERVER_DIR)
Example #21
class TestMimirDatastore(unittest.TestCase):
    def setup_fileserver(self):
        """Create a fresh file server."""
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        os.mkdir(FILESERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)

    def set_up(self):
        """Create empty data store directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIR)

    def tear_down(self):
        """Delete data store directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_mimir_datastore(self):
        """Run test for Mimir datastore."""
        self.set_up()
        self.dataset_load()
        self.tear_down()
        self.set_up()
        self.datastore_init()
        self.tear_down()
        self.set_up()
        self.dataset_read()
        self.tear_down()
        self.set_up()
        self.dataset_column_index()
        self.tear_down()

    def datastore_init(self):
        """Test initalizing a datastore with existing datasets."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        self.db = MimirDatastore(DATASTORE_DIR)

    def dataset_column_index(self):
        """Test the column by id index of the dataset handle."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEqual(ds.column_by_id(0).name.upper(), 'NAME')
        self.assertEqual(ds.column_by_id(1).name.upper(), 'AGE')
        self.assertEqual(ds.column_by_id(2).name.upper(), 'SALARY')
        with self.assertRaises(ValueError):
            ds.column_by_id(5)
        ds.columns.append(DatasetColumn(identifier=5, name='NEWNAME'))
        self.assertEqual(ds.column_by_id(5).name.upper(), 'NEWNAME')
        with self.assertRaises(ValueError):
            ds.column_by_id(4)

    def dataset_load(self):
        """Test create and delete dataset."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEqual(len(ds.columns), 3)
        self.assertEqual(len(ds.fetch_rows()), 2)
        self.assertEqual(ds.row_count, 2)

    def dataset_read(self):
        """Test reading a dataset."""
        self.setup_fileserver()
        dh = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        ds = self.db.get_dataset(dh.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(dh.identifier, ds.identifier)
        self.assertEqual(len(dh.columns), len(ds.columns))
        self.assertEqual(len(dh.fetch_rows()), len(ds_rows))
        self.assertEqual(dh.row_count, len(ds_rows))
        # Name,Age,Salary
        # Alice,23,35K
        # Bob,32,30K
        self.assertEqual(ds.column_index('Name'), 0)
        self.assertEqual(ds.column_index('Age'), 1)
        self.assertEqual(ds.column_index('Salary'), 2)
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(int(row.values[1]), 23)
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(int(row.values[1]), 32)
        self.assertEqual(row.values[2], '30K')
Example #22
class TestMimirProcessor(unittest.TestCase):
    """Individual test for Mimir lenses. Run separately since each test has to
    initialize and shout down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_domain_lens(self):
        """Test DOMAIN lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        col_age = ds.column_by_name('Age')
        command = cmd.mimir_domain(DATASET_NAME, col_age.identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # Introduce an error. Make sure command formatting is correct
        command = cmd.mimir_domain('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            result = self.processor.compute(
                command_id=command.command_id,
                arguments=command.arguments,
                context=TaskContext(datastore=self.datastore,
                                    filestore=self.filestore,
                                    datasets={DATASET_NAME: ds.identifier}))

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode Lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_key_repair(DATASET_NAME,
                                       ds1.column_by_name('Empid').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds1.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(ds.row_count, 3)
        names = set()
        empids = set()
        rowids = set()
        for row in ds.fetch_rows():
            rowids.add(row.identifier)
            empids.add(int(row.get_value('empid')))
            names.add(row.get_value('name'))
        self.assertTrue(1 in empids)
        self.assertTrue(2 in rowids)
        self.assertTrue('Alice' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        command = cmd.mimir_key_repair('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            self.processor.compute(command_id=command.command_id,
                                   arguments=command.arguments,
                                   context=TaskContext(
                                       datastore=self.datastore,
                                       filestore=self.filestore,
                                       datasets={DATASET_NAME: ds.identifier}))

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier
            }])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # MISSING VALUE Lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertTrue(rows[2].values[ds.column_index('Age')] > 30)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 24)
        command = cmd.mimir_missing_key(DATASET_NAME,
                                        ds.column_by_name('Salary').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 55)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        print(columns)
        self.assertEqual(len(ds.columns), 5)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }],
            pick_as='My_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(ds.columns), 6)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        self.assertTrue('MY_COLUMN' in columns)
        # Pick from a picked column
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier
            }],
            pick_as='My_Next_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertTrue('MY_NEXT_COLUMN' in columns)

    def test_schema_matching_lens(self):
        """Test SCHEMA_MATCHING lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_schema_matching(DATASET_NAME, [{
            'column': 'BDate',
            'type': 'int'
        }, {
            'column': 'PName',
            'type': 'varchar'
        }], 'new_' + DATASET_NAME)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write['new_' + DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.row_count, 2)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)
Example #23
class TestDatasetPaginationReader(unittest.TestCase):

    def set_up(self, engine):
        """Create an empty file server repository."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        # Setup file server
        self.fs = FileSystemFilestore(FILESERVER_DIR)
        # Setup the respective datastore and Vizual engine
        if engine == ENGINEENV_DEFAULT:
            self.datastore = FileSystemDatastore(DATASTORE_DIR)
        elif engine == ENGINEENV_MIMIR:
            self.datastore = MimirDatastore(DATASTORE_DIR)

    def tear_down(self, engine):
        """Clean-up by dropping file server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_default_engine(self):
        """Test functionality for the default setup."""
        self.run_tests(ENGINEENV_DEFAULT)

    def test_mimir_engine(self):
        """Test functionality for the Mimir setup."""
        import vizier.mimir as mimir # noqa: F401
        self.run_tests(ENGINEENV_MIMIR)

    def run_tests(self, engine):
        """Run sequence of tests for given configuration."""
        self.set_up(engine)
        ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_1))
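        # fetch_rows pages through the dataset via its optional offset and
        # limit arguments; with no arguments it returns all rows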
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 7)
        rows = ds.fetch_rows(offset=1)
        self.assertEqual(len(rows), 6)
        self.assertEqual(rows[0].values[0], 'Bob')
        self.assertEqual(rows[5].values[0], 'Gertrud')
        rows = ds.fetch_rows(limit=2)
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[0].values[0], 'Alice')
        self.assertEqual(rows[1].values[0], 'Bob')
        rows = ds.fetch_rows(offset=4, limit=3)
        self.assertEqual(len(rows), 3)
        self.assertEqual(rows[0].values[0], 'Eileen')
        self.assertEqual(rows[2].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=5, limit=3)
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[0].values[0], 'Frank')
        self.assertEqual(rows[1].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=6, limit=3)
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0].values[0], 'Gertrud')
        # Test larger dataset with deletes
        ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_2))
        rows = ds.fetch_rows(offset=0, limit=10)
        self.assertEqual(len(rows), 10)
        rows = ds.fetch_rows(offset=10, limit=20)
        self.assertEqual(len(rows), 20)
        rows = ds.fetch_rows(offset=60, limit=10)
        self.assertEqual(len(rows), 3)
        self.tear_down(engine)
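
# The offset/limit semantics exercised above (fetch_rows clamps the window at
# the end of the dataset) support a simple pagination loop. A minimal sketch;
# the helper name iter_pages and its page_size parameter are illustrative and
# not part of the vizier API:
def iter_pages(ds, page_size=10):
    """Yield successive batches of rows from a dataset handle."""
    offset = 0
    while True:
        rows = ds.fetch_rows(offset=offset, limit=page_size)
        if not rows:
            break
        yield rows
        if len(rows) < page_size:
            # A short batch means the end of the dataset was reached
            break
        offset += page_size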
Example #24
 def test_upload_file(self):
     """Test file upload."""
     db = FileSystemFilestore(SERVER_DIR)
     fh = db.upload_file(CSV_FILE)
     self.assertEqual(fh.file_name, os.path.basename(CSV_FILE))
     self.assertEqual(fh.mimetype, fs.FORMAT_CSV)
     self.assertEqual(fh.identifier, db.get_file(fh.identifier).identifier)
     self.assertTrue(
         os.path.isfile(
             os.path.join(SERVER_DIR, fh.identifier, METADATA_FILENAME)))
     self.assertTrue(os.path.isfile(fh.filepath))
     self.assertTrue(fh.is_tabular)
     # Re-load the repository
     db = FileSystemFilestore(SERVER_DIR)
     fh = db.get_file(fh.identifier)
     self.assertEqual(fh.file_name, os.path.basename(CSV_FILE))
     self.assertEqual(fh.mimetype, fs.FORMAT_CSV)
     self.assertEqual(fh.identifier, db.get_file(fh.identifier).identifier)
     # Add files with other valid suffixes
     fh = db.upload_file(CSV_FILE)
     self.assertFalse(fh.compressed)
     self.assertEqual(fh.delimiter, ',')
     fh = db.upload_file(GZIP_CSV_FILE)
     self.assertTrue(fh.compressed)
     self.assertEqual(fh.delimiter, ',')
     fh = db.upload_file(TSV_FILE)
     self.assertFalse(fh.compressed)
     self.assertEqual(fh.delimiter, '\t')
     fh = db.upload_file(GZIP_TSV_FILE)
     self.assertTrue(fh.compressed)
     self.assertEqual(fh.delimiter, '\t')
     # Re-load the repository
     db = FileSystemFilestore(SERVER_DIR)
     self.assertEqual(len(db.list_files()), 5)
     fh = db.upload_file(TEXT_FILE)
     self.assertFalse(fh.is_tabular)
Example #25
 def test_list_file(self):
     """Test list files method."""
     db = FileSystemFilestore(SERVER_DIR)
     db.upload_file(CSV_FILE)
     db.upload_file(GZIP_CSV_FILE)
     db.upload_file(TSV_FILE)
     db.upload_file(GZIP_TSV_FILE)
     files = db.list_files()
     self.assertEqual(len(files), 4)
     db.upload_file(CSV_FILE)
     db.upload_file(GZIP_CSV_FILE)
     db.upload_file(TSV_FILE)
     db.upload_file(GZIP_TSV_FILE)
     files = db.list_files()
     self.assertEqual(len(files), 8)
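     # Note: upload_file does not deduplicate by file name, so uploading the
     # same four files a second time yields eight handles, each with its own
     # identifier.
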
class TestVizierClient(unittest.TestCase):

    def setUp(self):
        """Create instances of the default datastore and filestore."""
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_new_dataset(self):
        """Test creating and updating a new dataset via the client."""
        client = VizierDBClient(
            datastore=self.datastore,
            datasets=dict(),
            dataobjects=dict(),
            source="",
            project_id=7
        )
        ds = DatasetClient()
        ds.insert_column('Name')
        ds.insert_column('Age')
        ds.insert_row(['Alice', '23'])
        ds.insert_row(['Bob', '25'])
        rows = ds.rows
        ds = client.create_dataset('MyDataset', ds)
        # Ensure the returned dataset contains the input data
        self.assertEqual([c.name for c in ds.columns], ['Name', 'Age'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '25'])
        # Modify the reference to the original rows to ensure that the rows in
        # the loaded datasets are not affected
        self.assertEqual([str(v) for v in rows[0].values], ['Alice', '23'])
        rows[0].set_value(0, 'Jane')
        self.assertEqual([str(v) for v in rows[0].values], ['Jane', '23'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        # Update dataset
        ds.rows[1].set_value('Age', '26')
        ds.save()
        ds = client.get_dataset('MyDataset')
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '26'])
        # Value error when creating dataset with existing name
        with self.assertRaises(ValueError):
            client.create_dataset('MyDataset', ds)
        # Value error when retrieving unknown dataset
        with self.assertRaises(ValueError):
            client.get_dataset('SomeDataset')
        # Rename the dataset and retrieve it under the new name
        client.rename_dataset('MyDataset', 'SomeDataset')
        ds = client.get_dataset('SomeDataset')
        # Ensure that access to unknown datasets is recorded
        with self.assertRaises(ValueError):
            client.get_dataset('ThisIsNotADataset')
        self.assertTrue('mydataset' in client.write)
        self.assertTrue('somedataset' in client.write)
        self.assertTrue('thisisnotadataset' in client.read)
        self.assertFalse('thisisnotadataset' in client.write)
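        # Note: the client normalizes dataset names to lower case when
        # recording provenance, which is why 'MyDataset' appears above as
        # 'mydataset' in client.write.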

    def test_update_existing_dataset(self):
        """Test creating and updating an existing dataset via the client."""
        # Move columns around
        ds = self.datastore.load_dataset(self.filestore.upload_file(CSV_FILE))
        client = VizierDBClient(
            datastore=self.datastore,
            datasets={DATASET_NAME: ds},
            dataobjects=dict(),
            source="",
            project_id=7
        )
        ds = client.get_dataset(DATASET_NAME)
        col_1 = [row.get_value(1) for row in ds.rows]
        ds.insert_column('empty', 3)
        ds = client.update_dataset(DATASET_NAME, ds)
        col_2 = [row.get_value(2) for row in ds.rows]
        ds.move_column('empty', 1)
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.values[3], col_2[i])
            self.assertEqual(row.values[2], col_1[i])
        # Rename
        ds.columns[1].name = 'allnone'
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.get_value('allnone'), None)
            self.assertEqual(row.values[2], col_1[i])
        # Insert row
        row = ds.insert_row()
        row.set_value('Name', 'Zoe')
        ds = client.create_dataset('upd', ds)
        self.assertEqual(len(ds.rows), 3)
        r2 = ds.rows[2]
        self.assertEqual(r2.values, ['Zoe', None, None, None])
        # Delete column
        ds = client.get_dataset(DATASET_NAME)
        ds.delete_column('Age')
        client.update_dataset(DATASET_NAME, ds)
        ds = client.get_dataset(DATASET_NAME)
        names = [col.name.upper() for col in ds.columns]
        self.assertTrue('NAME' in names)
        self.assertFalse('AGE' in names)
        self.assertTrue(DATASET_NAME in client.read)
        self.assertTrue(DATASET_NAME in client.write)
        self.assertFalse('upd' in client.read)
        self.assertTrue('upd' in client.write)
class TestDefaultPlotProcessor(unittest.TestCase):

    def setUp(self):
        """Create instances of the default datastore and filestore."""
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_advanced_plot(self):
        """Test running the simple plot command with a more advanced chart
        definition.
        """
        fh = self.filestore.upload_file(TSV_FILE)
        ds = self.datastore.load_dataset(fh)
        cmd = create_plot(
            dataset_name=DATASET_NAME,
            chart_name='My Chart',
            series=[
                {'column': 1, 'range': '25:30', 'label': 'A'},
                {'column': 0, 'range': '25:30'}
            ],
            validate=True
        )
        result = PlotProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=0,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: ds}
            )
        )
        chart = result.outputs.stdout[0].value
        self.assertEqual(chart['data']['data'][0]['label'], 'A')
        self.assertEqual(chart['data']['data'][1]['label'], 'average_class_size')
        self.assertEqual(chart['result']['series'][0]['label'], 'A')
        self.assertEqual(chart['result']['series'][1]['label'], 'average_class_size')
        self.assertEqual(len(chart['result']['series'][0]['data']), 6)
        self.assertEqual(len(chart['result']['series'][1]['data']), 6)
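        # The chart payload mirrors the series definition: an explicit
        # 'label' overrides the default, which falls back to the column name
        # (here 'average_class_size').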

    def test_simple_plot(self):
        """Test running the simple plot command."""
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(fh)
        cmd = create_plot(
            dataset_name=DATASET_NAME,
            chart_name='My Chart',
            series=[{'column': 1}],
            validate=True
        )
        result = PlotProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=0,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: ds}
            )
        )
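        # No assertions follow: this test only verifies that the plot command
        # completes without raising an exception.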
Example #28
class TestDefaultVizualProcessor(unittest.TestCase):
    def setUp(self):
        """Create an instance of the default vizier processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = VizualTaskProcessor(api=DefaultVizualApi())
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_api_from_dictionary(self):
        """Test creating the processor instance with properties parameter
        instead of api.
        """
        processor = VizualTaskProcessor(
            properties={
                PROPERTY_API:
                ClassLoader.to_dict(
                    module_name='vizier.engine.packages.vizual.api.fs',
                    class_name='DefaultVizualApi')
            })
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(dataset_name=DATASET_NAME,
                                  file={pckg.FILE_ID: fh.identifier},
                                  validate=True)
        result = processor.compute(command_id=cmd.command_id,
                                   arguments=cmd.arguments,
                                   context=TaskContext(
                                       project_id=5,
                                       datastore=self.datastore,
                                       filestore=self.filestore,
                                       artifacts={}))
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue(DATASET_NAME in result.provenance.write)
        dataset_id = result.provenance.write[DATASET_NAME].identifier
        self.assertTrue(result.provenance.read is None
                        or len(result.provenance.read) == 0)
        self.assertIsNotNone(result.provenance.resources)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET],
                         dataset_id)

    def load_dataset(self):
        """Load a single dataset and return the resulting database state."""
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(dataset_name=DATASET_NAME,
                                  file={pckg.FILE_ID: fh.identifier},
                                  validate=True)
        result = self.processor.compute(command_id=cmd.command_id,
                                        arguments=cmd.arguments,
                                        context=TaskContext(
                                            project_id=5,
                                            datastore=self.datastore,
                                            filestore=self.filestore,
                                            artifacts={}))
        return result.provenance.write

    def test_delete_column(self):
        """Test functionality to delete a column."""
        cmd = vizual.delete_column(dataset_name=DATASET_NAME,
                                   column=1,
                                   validate=True)
        self.validate_command(cmd)

    def test_delete_row(self):
        """Test functionality to delete a row."""
        cmd = vizual.delete_row(dataset_name=DATASET_NAME,
                                row=1,
                                validate=True)
        self.validate_command(cmd)

    def test_drop_dataset(self):
        """Test functionality to drop a dataset."""
        cmd = vizual.drop_dataset(dataset_name=DATASET_NAME, validate=True)
        datasets = self.load_dataset()
        dataset_id = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(project_id=5,
                                datastore=self.datastore,
                                filestore=self.filestore,
                                artifacts={DATASET_NAME: dataset_id}))
        self.assertFalse(DATASET_NAME in result.provenance.read)
        self.assertTrue(DATASET_NAME in result.provenance.delete)
        self.assertFalse(DATASET_NAME in result.provenance.write)

    def test_filter_columns(self):
        """Test projection of a dataset."""
        # Create a new dataset
        cmd = vizual.projection(dataset_name=DATASET_NAME,
                                columns=[{
                                    'column': 1
                                }, {
                                    'column': 2,
                                    'name': 'MyName'
                                }],
                                validate=True)
        self.validate_command(cmd)

    def test_insert_column(self):
        """Test functionality to insert a columns."""
        cmd = vizual.insert_column(dataset_name=DATASET_NAME,
                                   position=1,
                                   name='My Col',
                                   validate=True)
        self.validate_command(cmd)

    def test_insert_row(self):
        """Test functionality to insert a row."""
        # Create a new dataset
        cmd = vizual.insert_row(dataset_name=DATASET_NAME,
                                position=1,
                                validate=True)
        self.validate_command(cmd)

    def test_load_dataset(self):
        """Test functionality to load a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(dataset_name='ABC',
                                  file={pckg.FILE_ID: fh.identifier},
                                  validate=True)
        result = self.processor.compute(command_id=cmd.command_id,
                                        arguments=cmd.arguments,
                                        context=TaskContext(
                                            project_id=5,
                                            datastore=self.datastore,
                                            filestore=self.filestore,
                                            artifacts={}))
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue('abc' in result.provenance.write)
        dataset_id = result.provenance.write['abc'].identifier
        self.assertTrue(result.provenance.read is None
                        or len(result.provenance.read) == 0)
        self.assertIsNotNone(result.provenance.resources)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET],
                         dataset_id)
        # Running load again will not change the dataset identifier
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(project_id=5,
                                datastore=self.datastore,
                                filestore=self.filestore,
                                artifacts={},
                                resources=result.provenance.resources))
        self.assertEqual(result.provenance.write['abc'].identifier, dataset_id)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET],
                         dataset_id)
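        # Passing the resources from the previous run lets the processor
        # reuse the already-loaded dataset instead of reloading the file,
        # which keeps the dataset identifier stable.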

    def test_move_column(self):
        """Test functionality to move a column."""
        cmd = vizual.move_column(dataset_name=DATASET_NAME,
                                 column=0,
                                 position=1,
                                 validate=True)
        self.validate_command(cmd)

    def test_move_row(self):
        """Test functionality to move a row."""
        cmd = vizual.move_row(dataset_name=DATASET_NAME,
                              row=0,
                              position=1,
                              validate=True)
        self.validate_command(cmd)

    def test_rename_column(self):
        """Test functionality to rename a column."""
        cmd = vizual.rename_column(dataset_name=DATASET_NAME,
                                   column=1,
                                   name='The col',
                                   validate=True)
        self.validate_command(cmd)

    def test_rename_dataset(self):
        """Test functionality to rename a dataset."""
        cmd = vizual.rename_dataset(dataset_name=DATASET_NAME,
                                    new_name='XYZ',
                                    validate=True)
        datasets = self.load_dataset()
        dataset_id = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(project_id=5,
                                datastore=self.datastore,
                                filestore=self.filestore,
                                artifacts={DATASET_NAME: dataset_id}))
        self.assertFalse(DATASET_NAME in result.provenance.write)
        self.assertFalse(DATASET_NAME in result.provenance.read)
        self.assertTrue(DATASET_NAME in result.provenance.delete)
        self.assertTrue('xyz' in result.provenance.write)

    def test_sort_dataset(self):
        """Test sorting a dataset."""
        cmd = vizual.sort_dataset(dataset_name=DATASET_NAME,
                                  columns=[{
                                      'column': 1,
                                      'order': 'Z-A'
                                  }, {
                                      'column': 2,
                                      'order': 'A-Z'
                                  }],
                                  validate=True)
        self.validate_command(cmd)

    def test_update_cell(self):
        """Test functionality to update a dataset cell."""
        # Create a new dataset
        datasets = self.load_dataset()
        dataset = self.datastore.get_dataset(datasets[DATASET_NAME].identifier)
        row_ids = [row.identifier for row in dataset.fetch_rows()]
        cmd = vizual.update_cell(dataset_name=DATASET_NAME,
                                 column=1,
                                 row=row_ids[0],
                                 value=9,
                                 validate=True)
        self.validate_command(cmd, dataset=dataset)

    def validate_command(self, cmd, dataset=None):
        """Validate execution of the given command."""
        if dataset is None:
            datasets = self.load_dataset()
            dataset = datasets[DATASET_NAME]
        result = self.processor.compute(command_id=cmd.command_id,
                                        arguments=cmd.arguments,
                                        context=TaskContext(
                                            project_id=5,
                                            datastore=self.datastore,
                                            filestore=self.filestore,
                                            artifacts={DATASET_NAME: dataset}))
        self.assertNotEqual(result.provenance.write[DATASET_NAME].identifier,
                            dataset.identifier)
        self.assertIsNotNone(result.provenance.read)
        self.assertEqual(result.provenance.read[DATASET_NAME],
                         dataset.identifier)
        self.assertIsNotNone(result.provenance.write)
        with self.assertRaises(ValueError):
            result = self.processor.compute(command_id=cmd.command_id,
                                            arguments=cmd.arguments,
                                            context=TaskContext(
                                                project_id=5,
                                                datastore=self.datastore,
                                                filestore=self.filestore,
                                                artifacts={}))
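        # The second compute call doubles as a negative test: re-running the
        # command with an empty artifacts mapping raises a ValueError because
        # the referenced dataset is no longer in scope.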
Example #29
class TestDefaultVizualApi(unittest.TestCase):

    api: MimirVizualApi

    def setUp(self):
        """Create an instance of the default vizier API for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.api = MimirVizualApi()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_api(self):
        """Run all tests after we initialize mimir. Make sure to create a
        fresh environment after each test.
        """
        self.delete_column()
        self.setUp()
        self.delete_row()
        self.setUp()
        self.filter_columns()
        self.setUp()
        self.insert_column()
        self.setUp()
        self.insert_row()
        self.setUp()
        self.load_dataset()
        self.setUp()
        self.move_column()
        self.setUp()
        self.move_row()
        self.setUp()
        self.rename_column()
        self.setUp()
        self.sequence_of_steps()
        self.setUp()
        self.sort_dataset()
        self.setUp()
        self.update_cell()
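        # setUp is re-invoked manually between steps so that every operation
        # starts from a fresh server directory while Mimir itself is
        # initialized only once for the whole sequence.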

    def delete_column(self):
        """Test functionality to delete a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of the row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Delete Age column
        col_id = ds.column_by_name('AGE').identifier
        result = self.api.delete_column(ds.identifier, col_id, self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve modified dataset and ensure that it contains the following
        #
        # Name, Salary
        # ------------
        # Alice, 35K
        # Bob, 30K
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Schema is Name, Salary
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.columns[0].name.upper(), 'NAME')
        self.assertEqual(ds.columns[1].name.upper(), 'SALARY')
        # Make sure that all rows only have two columns
        row = ds_rows[0]
        self.assertEqual(len(row.values), 2)
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], '35K')
        row = ds_rows[1]
        self.assertEqual(len(row.values), 2)
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], '30K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.delete_column('unknown:uri', 0, self.datastore)
        # Ensure exception is thrown if column identifier is unknown
        with self.assertRaises(ValueError):
            self.api.delete_column(ds.identifier, 100, self.datastore)

    def delete_row(self):
        """Test functionality to delete a row."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Delete second row
        result = self.api.delete_row(ds.identifier, row_ids[1], self.datastore)
        del row_ids[1]
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve modified dataset and ensure that it contains the following
        # data:
        #
        # Name, Age, Salary
        # ------------
        # Alice, 23, 35K
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Schema is Name, Age, Salary
        col_names = ['Name', 'Age', 'Salary']
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].name.upper(), col_names[i].upper())
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # There should only be one row
        self.assertEqual(len(ds_rows), 1)
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset is unknown
        with self.assertRaises(MimirError):
            self.api.delete_row('unknown:uri', 0, self.datastore)

    def filter_columns(self):
        """Test projection of a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        result = self.api.filter_columns(ds.identifier, [2, 0], ['BD', None],
                                         self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.columns[0].name.upper(), 'BD')
        self.assertEqual(ds.columns[1].name.upper(), 'NAME')
        rows = ds.fetch_rows()
        self.assertEqual(rows[0].values, ['35K', 'Alice'])
        self.assertEqual(rows[1].values, ['30K', 'Bob'])

    def insert_column(self):
        """Test functionality to insert a columns."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Insert columns at position 1
        col_ids.insert(1, ds.max_column_id() + 1)
        result = self.api.insert_column(ds.identifier, 1, 'Height',
                                        self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary
        ds = self.datastore.get_dataset(result.dataset.identifier)
        col_names = ['Name', 'Height', 'Age', 'Salary']
        # Ensure that there are four columns
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEqual(col.name.upper(), col_names[i].upper())
        # Insert columns at last position
        col_ids.append(ds.max_column_id() + 1)
        col_names.append('Weight')
        result = self.api.insert_column(ds.identifier, 4, 'Weight',
                                        self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary, Weight
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are five columns
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEqual(col.name.upper(), col_names[i].upper())
        # The cell values for the new columns are None; all other values are
        # not None
        for row in ds_rows:
            for i in range(len(ds.columns)):
                if i == 1 or i == 4:
                    self.assertIsNone(row.values[i])
                else:
                    self.assertTrue(row.values[i])
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.insert_column('unknown:uri', 1, 'Height', self.datastore)
        # A valid column name is accepted; an invalid one raises an exception
        self.api.insert_column(ds.identifier, 1, 'Height_from_ground',
                               self.datastore)
        with self.assertRaises(ValueError):
            self.api.insert_column(ds.identifier, 1, 'Height from ground!@#',
                                   self.datastore)
        # Ensure exception is thrown if column position is out of bounds
        with self.assertRaises(ValueError):
            self.api.insert_column(ds.identifier, 100, 'Height',
                                   self.datastore)

    def insert_row(self):
        """Test functionality to insert a row."""
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        # Keep track of the row identifiers
        ds_rows = ds.fetch_rows()
        row_ids = [row.identifier for row in ds_rows]
        # Insert row at index position 1
        row_ids.insert(1, None)
        # Result should indicate that one row was inserted. The identifier of
        # the resulting dataset should differ from the identifier of the
        # original dataset
        result = self.api.insert_row(ds.identifier, 1, self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve modified dataset
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are three rows
        self.assertEqual(len(ds_rows), 3)
        # The second row has empty values for each column
        row = ds_rows[1]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNone(row.values[i])
        # Append a row at the end of the current dataset
        row_ids.append(None)
        result = self.api.insert_row(ds.identifier, 3, self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are four rows
        self.assertEqual(len(ds_rows), 4)
        # The next to last row has non-empty values for each column
        row = ds_rows[2]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNotNone(row.values[i])
        # The last row has empty values for each column
        row = ds_rows[3]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNone(row.values[i])
        # Ensure that row ids haven't changed
        # ## July 16, 2020 by OK: Bug in mimir that is going to take a bunch of
        # ## heavy lifting to fix: https://github.com/UBOdin/mimir-api/issues/11
        # for i in range(len(ds_rows)):
        #     if row_ids[i] is not None:
        #         self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.insert_row('unknown:uri', 1, self.datastore)
        # Ensure no exception is raised
        self.api.insert_row(ds.identifier, 4, self.datastore)

    def load_dataset(self) -> None:
        """Test functionality to load a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        result = self.api.load_dataset(datastore=self.datastore,
                                       filestore=self.filestore,
                                       file_id=fh.identifier)
        ds = result.dataset
        resources = result.resources
        assert isinstance(ds, DatasetHandle)
        ds_rows = ds.fetch_rows()
        self.assertEqual(len(ds.columns), 3)
        self.assertEqual(len(ds_rows), 2)
        for row in ds_rows:
            self.assertTrue(isinstance(row.values[1], int))
        self.assertIsNotNone(resources)
        self.assertEqual(resources[RESOURCE_FILEID], fh.identifier)
        self.assertEqual(resources[RESOURCE_DATASET], ds.identifier)
        # Delete the file handle; attempting the same load again should raise
        # an exception
        self.filestore.delete_file(fh.identifier)
        with self.assertRaises(ValueError):
            self.api.load_dataset(datastore=self.datastore,
                                  filestore=self.filestore,
                                  file_id=fh.identifier)
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.load_dataset(datastore=self.datastore,
                                  filestore=self.filestore,
                                  file_id='unknown:uri')
        # Test loading file from external resource. Skip if DOWNLOAD_URL is None
        if DOWNLOAD_URL is None:
            print('Skipping download test')
            return
        result = self.api.load_dataset(datastore=self.datastore,
                                       filestore=self.filestore,
                                       url=DOWNLOAD_URL,
                                       options=[{
                                           'delimiter': '\t'
                                       }])
        ds = result.dataset
        resources = result.resources
        ds_rows = ds.fetch_rows()
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(len(ds_rows), 54)
        self.assertIsNotNone(resources)
        self.assertEqual(resources[RESOURCE_URL], DOWNLOAD_URL)
        self.assertEqual(resources[RESOURCE_DATASET], ds.identifier)
        # Simulate re-running without downloading again. Set the URL to a
        # fake value that would raise an exception if an attempt were made to
        # download it.
        url = 'some fake uri'
        resources[RESOURCE_URL] = url
        result = self.api.load_dataset(datastore=self.datastore,
                                       filestore=self.filestore,
                                       url=url,
                                       resources=resources)
        # The dataset from the first download is reused, so the identifier
        # remains unchanged
        self.assertEqual(result.dataset.identifier, ds.identifier)
        prev_id = result.dataset.identifier
        # If we re-run with reload flag true a new dataset should be returned
        resources[RESOURCE_URL] = DOWNLOAD_URL
        result = self.api.load_dataset(datastore=self.datastore,
                                       filestore=self.filestore,
                                       url=DOWNLOAD_URL,
                                       resources=resources,
                                       reload=True,
                                       options=[{
                                           'delimiter': '\t'
                                       }])
        self.assertNotEqual(result.dataset.identifier, prev_id)
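        # With reload=True the cached resources are ignored and the data is
        # fetched again, which is why a new dataset identifier is expected.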

    def move_column(self):
        """Test functionality to move a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Swap first two columns
        c = col_ids[0]
        del col_ids[0]
        col_ids.insert(1, c)
        result = self.api.move_column(ds.identifier,
                                      ds.column_by_name('Name').identifier, 1,
                                      self.datastore)
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Name'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 23)
        self.assertEqual(row.values[1], 'Alice')
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 32)
        self.assertEqual(row.values[1], 'Bob')
        self.assertEqual(row.values[2], '30K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Swap last two columns
        c = col_ids[1]
        del col_ids[1]
        col_ids.append(c)
        result = self.api.move_column(ds.identifier,
                                      ds.column_by_name('Salary').identifier,
                                      1, self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Salary'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Name'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 23)
        self.assertEqual(row.values[1], '35K')
        self.assertEqual(row.values[2], 'Alice')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 32)
        self.assertEqual(row.values[1], '30K')
        self.assertEqual(row.values[2], 'Bob')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # No changes if source and target position are the same
        result = self.api.move_column(ds.identifier, ds.columns[1].identifier,
                                      1, self.datastore)
        self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.move_column('unknown:uri', 0, 1, self.datastore)
        # Raise error if source column is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_column(ds.identifier, 40, 1, self.datastore)
        # Raise error if target position is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_column(ds.identifier,
                                 ds.column_by_name('Name').identifier, -1,
                                 self.datastore)
        with self.assertRaises(ValueError):
            self.api.move_column(ds.identifier,
                                 ds.column_by_name('Name').identifier, 4,
                                 self.datastore)

    def move_row(self):
        """Test functionality to move a row."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of the row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Swap first two rows
        result = self.api.move_row(ds.identifier, row_ids[0], 1,
                                   self.datastore)
        row_ids = list(reversed(row_ids))
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Name'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Swap last two rows
        result = self.api.move_row(ds.identifier, row_ids[1], 0,
                                   self.datastore)
        row_ids = list(reversed(row_ids))
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Name'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Move first row to the end
        result = self.api.move_row(ds.identifier, row_ids[0], 2,
                                   self.datastore)
        row_ids = list(reversed(row_ids))
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        # Ensure that row ids haven't changed

        # ## July 16, 2020 by OK: Bug in mimir that is going to take a bunch of
        # ## heavy lifting to fix: https://github.com/UBOdin/mimir-api/issues/11
        # for i in range(len(ds_rows)):
        #     self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # No changes if source and target position are the same

        result = self.api.move_row(ds.identifier, row_ids[1], 1,
                                   self.datastore)

        # ## July 21, 2020 by OK: It would be fantastic if we could easily detect
        # no-op vizual, but for now skip this check
        #self.assertEqual(ds.identifier, result.dataset.identifier)

        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.move_row('unknown:uri', 0, 1, self.datastore)
        # Raise error if target position is out of bounds
        # ## July 21, 2020 by OK: Skipping this check for now
        # with self.assertRaises(ValueError):
        #     self.api.move_row(ds.identifier, 0, -1, self.datastore)
        # with self.assertRaises(ValueError):
        #     self.api.move_row(ds.identifier, 1, 4, self.datastore)

    def rename_column(self):
        """Test functionality to rename a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of the row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Rename first column to Firstname
        result = self.api.rename_column(ds.identifier,
                                        ds.column_by_name('Name').identifier,
                                        'Firstname', self.datastore)
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        self.assertEqual(ds.columns[0].name.upper(), 'Firstname'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        result = self.api.rename_column(ds.identifier,
                                        ds.column_by_name('Age').identifier,
                                        'BDate', self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Firstname'.upper())
        self.assertEqual(ds.columns[1].name, 'BDate')
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # No changes if the old and new column name are the same (except for
        # differences in upper and lower case).
        result = self.api.rename_column(ds.identifier,
                                        ds.column_by_name('BDate').identifier,
                                        'BDate', self.datastore)
        # ## July 21, 2020 by OK: It would be fantastic if we could easily detect
        # no-op vizual, but for now skip this check
        # self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.rename_column('unknown:uri', 0, 'Firstname',
                                   self.datastore)
        # Ensure exception is thrown for invalid column id
        with self.assertRaises(ValueError):
            self.api.rename_column(ds.identifier, 500, 'BDate', self.datastore)

    def sequence_of_steps(self):
        """Test sequence of calls that modify a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds = self.api.insert_row(ds.identifier, 1, self.datastore).dataset
        row_ids = [row.identifier for row in ds.fetch_rows()]
        row0 = row_ids[0]
        row1 = row_ids[1]
        row2 = row_ids[2]
        ds = self.api.insert_column(ds.identifier, 3, 'HDate',
                                    self.datastore).dataset
        ds = self.api.update_cell(ds.identifier,
                                  ds.column_by_name('HDate').identifier, row0,
                                  '180', self.datastore).dataset
        ds = self.api.update_cell(ds.identifier,
                                  ds.column_by_name('HDate').identifier, row2,
                                  '160', self.datastore).dataset
        ds = self.api.rename_column(ds.identifier,
                                    ds.column_by_name('HDate').identifier,
                                    'Height', self.datastore).dataset
        ds = self.api.update_cell(ds.identifier,
                                  ds.column_by_name('Height').identifier, row1,
                                  '170', self.datastore).dataset
        ds = self.api.move_row(ds.identifier, row1, 2, self.datastore).dataset
        ds = self.api.update_cell(ds.identifier,
                                  ds.column_by_name('Name').identifier, row2,
                                  'Carla', self.datastore).dataset
        ds = self.api.update_cell(ds.identifier,
                                  ds.column_by_name('Age').identifier, row2,
                                  '45', self.datastore).dataset
        ds = self.api.update_cell(ds.identifier,
                                  ds.column_by_name('Salary').identifier, row2,
                                  '56K', self.datastore).dataset
        ds = self.api.move_column(ds.identifier,
                                  ds.column_by_name('Salary').identifier, 4,
                                  self.datastore).dataset
        ds = self.api.delete_column(ds.identifier,
                                    ds.column_by_name('Age').identifier,
                                    self.datastore).dataset
        ds = self.api.delete_row(ds.identifier, row0, self.datastore).dataset
        ds = self.api.delete_row(ds.identifier, row1, self.datastore).dataset
        ds = self.datastore.get_dataset(ds.identifier)
        ds_rows = ds.fetch_rows()
        names = ['Name', 'Height', 'Salary']
        self.assertEqual(len(ds.columns), len(names))
        for i in range(len(names)):
            col = ds.columns[i]
            self.assertEqual(col.name.upper(), names[i].upper())
        self.assertEqual(len(ds_rows), 1)
        self.assertEqual(ds_rows[0].values, ['Carla', '160', '56K'])

    def sort_dataset(self):
        """Test sorting a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(SORT_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        result = self.api.sort_dataset(ds.identifier, [1, 2, 0],
                                       [False, False, True], self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        rows = ds.fetch_rows()
        names = ['Alice', 'Bob', 'Dave', 'Gertrud', 'Frank']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEqual(names[i], result[i])
        result = self.api.sort_dataset(ds.identifier, [2, 1, 0],
                                       [True, False, True], self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        rows = ds.fetch_rows()
        names = ['Gertrud', 'Frank', 'Bob', 'Alice', 'Dave']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEqual(names[i], result[i])
        # Raises error for invalid column identifier
        with self.assertRaises(ValueError):
            self.api.sort_dataset(ds.identifier, [2, 10, 0],
                                  [True, False, True], self.datastore)

    def update_cell(self):
        """Test functionality to update a dataset cell."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(datastore=self.datastore,
                                   filestore=self.filestore,
                                   file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of the row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Update cell [0, 0]. Ensure that one row was updated and a new
        # identifier is generated. Also ensure that the resulting dataset
        # has the new value in cell [0, 0]
        row_id = row_ids[0]
        result = self.api.update_cell(ds.identifier, 0, row_id, 'MyValue',
                                      self.datastore)
        self.assertNotEqual(ds.identifier, result.dataset.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = None
        for r in ds_rows:
            if r.identifier == row_id:
                row = r
                break
        self.assertEqual(row.values[0], 'MyValue')
        result = self.api.update_cell(ds.identifier,
                                      ds.column_by_name('Name').identifier,
                                      row_id, 'AValue', self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = None
        for r in ds_rows:
            if r.identifier == row_id:
                row = r
                break
        self.assertEqual(row.values[0], 'AValue')
        self.assertEqual(row.values[ds.column_index('Name')], 'AValue')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Set value to None
        result = self.api.update_cell(ds.identifier,
                                      ds.column_by_name('Name').identifier,
                                      row_id, None, self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = None
        for r in ds_rows:
            if r.identifier == row_id:
                row = r
                break
        self.assertIsNone(row.values[0])
        self.assertIsNone(row.values[ds.column_index('Name')])
        # Ensure exception is thrown if dataset is unknown
        with self.assertRaises(MimirError):
            self.api.update_cell('unknown:uri', 0, 0, 'MyValue',
                                 self.datastore)
Example #30
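
# The tests below reference several Python-cell source strings that are
# defined elsewhere in the original file. The definitions that follow are a
# plausible sketch only: they assume the cell environment exposes a
# `vizierdb` client with `new_dataset`, `create_dataset`, and `get_dataset`
# methods, and that rows support `get_value`. None of this is shown in the
# snippet; adjust to the actual client API.

CREATE_DATASET_PY = """
ds = vizierdb.new_dataset()
ds.insert_column('Name')
ds.insert_column('Age')
ds.insert_row(['Alice', 23])
ds.insert_row(['Bob', 34])
ds = vizierdb.create_dataset('people', ds)
for row in ds.rows:
    print(row.get_value('Name'))
"""

PRINT_DATASET_PY = """
for row in vizierdb.get_dataset('people').rows:
    print(row.get_value('Name'))
"""

PRINT_UNKNOWN_DATASET_PY = """
for row in vizierdb.get_dataset('employees').rows:
    print(row.get_value('Name'))
"""

# The try/catch variant is expected to succeed while still recording the
# failed read of 'employees'; it prints a single line so that the test's
# stdout assertion holds.
PRINT_UNKNOWN_DATASET_PY_WITH_TRY_CATCH = """
try:
    for row in vizierdb.get_dataset('employees').rows:
        print(row.get_value('Name'))
except Exception:
    print('dataset employees is unknown')
"""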
class TestDefaultPyCellProcessor(unittest.TestCase):
    def setUp(self):
        """Create instances of the default datastore and filestore."""
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_dataset_script(self):
        """Test running a script that creates a new datasets."""
        cmd = python_cell(source=CREATE_DATASET_PY, validate=True)
        result = PyCellTaskProcessor().compute(command_id=cmd.command_id,
                                               arguments=cmd.arguments,
                                               context=TaskContext(
                                                   datastore=self.datastore,
                                                   filestore=self.filestore))
        self.assertTrue(result.is_success)
        self.assertIsNotNone(result.provenance.read)
        self.assertIsNotNone(result.provenance.write)
        self.assertEqual(len(result.provenance.read), 0)
        self.assertEqual(len(result.provenance.write), 1)
        self.assertTrue('people' in result.provenance.write)
        self.assertIsNotNone(result.provenance.write['people'])
        self.assertEqual(len(result.outputs.stdout), 1)
        self.assertEqual(len(result.outputs.stderr), 0)
        self.assertEqual(result.outputs.stdout[0].value, 'Alice\nBob')

    def test_print_dataset_script(self):
        """Test running a script that prints rows in an existing datasets."""
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(fh)
        cmd = python_cell(source=PRINT_DATASET_PY, validate=True)
        result = PyCellTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={'people': ds.identifier}))
        self.assertTrue(result.is_success)
        self.assertIsNotNone(result.provenance.read)
        self.assertIsNotNone(result.provenance.write)
        self.assertEqual(len(result.provenance.read), 1)
        self.assertEqual(len(result.provenance.write), 0)
        self.assertTrue('people' in result.provenance.read)
        self.assertIsNotNone(result.provenance.read['people'])
        self.assertEqual(len(result.outputs.stdout), 1)
        self.assertEqual(len(result.outputs.stderr), 0)
        self.assertEqual(result.outputs.stdout[0].value, 'Alice\nBob')

    def test_simple_script(self):
        """Test running the simple python script."""
        cmd = python_cell(source='print 2+2', validate=True)
        result = PyCellTaskProcessor().compute(command_id=cmd.command_id,
                                               arguments=cmd.arguments,
                                               context=TaskContext(
                                                   datastore=self.datastore,
                                                   filestore=self.filestore,
                                                   datasets=dict()))
        self.assertTrue(result.is_success)
        self.assertEqual(result.outputs.stdout[0].value, '4')

    def test_unknown_dataset_script(self):
        """Test running a script that accesses an unknown datasets."""
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(fh)
        cmd = python_cell(source=PRINT_UNKNOWN_DATASET_PY, validate=True)
        result = PyCellTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={'people': ds.identifier}))
        self.assertFalse(result.is_success)
        self.assertIsNone(result.provenance.read)
        self.assertIsNone(result.provenance.write)
        self.assertEqual(len(result.outputs.stdout), 0)
        self.assertEqual(len(result.outputs.stderr), 1)
        # Running a similar script that catches the error should succeed,
        # and the access to the dataset should be recorded in the resulting
        # read provenance.
        cmd = python_cell(source=PRINT_UNKNOWN_DATASET_PY_WITH_TRY_CATCH,
                          validate=True)
        result = PyCellTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={'people': ds.identifier}))
        self.assertTrue(result.is_success)
        self.assertIsNotNone(result.provenance.read)
        self.assertIsNotNone(result.provenance.write)
        self.assertEqual(len(result.provenance.read), 1)
        self.assertEqual(len(result.provenance.write), 0)
        self.assertTrue('employees' in result.provenance.read)
        self.assertIsNone(result.provenance.read['employees'])
        self.assertEqual(len(result.outputs.stdout), 1)
        self.assertEqual(len(result.outputs.stderr), 0)