def test_set_specification(self):
    """Test if the set-specs subcommand of the CLI works properly.

    The command sets the delimiter of the ``iris`` dataset in the
    ``pysemantic`` project to ``|``. The specifications captured before
    the call are written back for every dataset afterwards, whether or
    not the check succeeded.
    """
    # Snapshot taken before the CLI call so it can be restored in `finally`.
    original = pr.get_schema_specs("pysemantic")
    argv = ['semantic', 'set-specs', 'pysemantic',
            '--dataset', 'iris',
            '--dlm', '|']
    try:
        subprocess.check_call(argv, env=self.testenv)
        updated = pr.get_schema_specs("pysemantic", "iris")
        self.assertEqual(updated['delimiter'], '|')
    finally:
        for name, dataset_specs in original.iteritems():
            pr.set_schema_specs("pysemantic", name, **dataset_specs)
def test_random_row_selection(self):
    """Check if a fixed number of rows can be selected at random.

    Loads ``iris`` with ``nrows`` set to 50 random rows and verifies that
    50 rows come back and that their index is not simply 0..49 in order.
    """
    specs = pr.get_schema_specs("pysemantic", "iris")
    specs['nrows'] = {'random': True, 'count': 50}
    loaded = pr.Project(schema={'iris': specs}).load_dataset('iris')
    self.assertEqual(loaded.shape[0], 50)
    # The index must differ from the ordered sequence 0..49.
    sequential = np.arange(50)
    in_order = np.all(loaded.index.values == sequential)
    self.assertFalse(in_order)
def test_random_row_selection_within_range(self):
    """Check if randomly selecting rows within a range works.

    Requests 10 random rows from the range [25, 75] of ``iris`` and
    verifies the row count and that no index value exceeds 50.
    """
    specs = pr.get_schema_specs("pysemantic", "iris")
    specs['nrows'] = dict(range=[25, 75], count=10, random=True)
    loaded = pr.Project(schema={'iris': specs}).load_dataset('iris')
    self.assertEqual(loaded.shape[0], 10)
    largest = loaded.index.values.max()
    self.assertTrue(largest <= 50)
def test_row_selection_random_range(self):
    """Check if a range of rows can be selected in random order.

    Loads the range [25, 75] of ``iris`` with ``random`` enabled and
    verifies that 50 rows come back and that their index is not simply
    0..49 in order.
    """
    specs = pr.get_schema_specs("pysemantic", "iris")
    specs['nrows'] = dict(range=[25, 75], random=True)
    loaded = pr.Project(schema={'iris': specs}).load_dataset('iris')
    self.assertEqual(loaded.shape[0], 50)
    # The index must differ from the ordered sequence 0..49.
    sequential = np.arange(50)
    matches_everywhere = np.all(loaded.index.values == sequential)
    self.assertFalse(matches_everywhere)
def test_nrows_callable(self):
    """Check if specifying the nrows argument as a callable works.

    The callable is true for even values, so the loaded ``iris`` frame
    is expected to have 75 rows indexed 0, 2, ..., 148.
    """
    specs = pr.get_schema_specs("pysemantic", "iris")
    specs['nrows'] = lambda x: np.remainder(x, 2) == 0
    loaded = pr.Project(schema={'iris': specs}).load_dataset('iris')
    self.assertEqual(loaded.shape[0], 75)
    even_ix = np.arange(0, 150, 2)
    np.testing.assert_allclose(even_ix, loaded.index.values)
def test_random_row_selection_within_range(self):
    """Check if randomly selecting rows within a range works.

    The ``iris`` specs are adjusted to use the first row as header, to
    drop ``dtypes`` and to take column names from ``colnames``; 10 random
    rows from the range [25, 75] are then requested. The row count is
    verified, as is that no index value exceeds 50.
    """
    specs = pr.get_schema_specs("pysemantic", "iris")
    specs['nrows'] = dict(range=[25, 75], count=10, random=True)
    specs['header'] = 0
    specs.pop('dtypes')
    specs['column_names'] = colnames(specs['path'])
    loaded = pr.Project(schema={'iris': specs}).load_dataset('iris')
    self.assertEqual(loaded.shape[0], 10)
    largest = loaded.index.values.max()
    self.assertTrue(largest <= 50)
def test_row_selection_range(self):
    """Check if a range of rows can be selected from the dataset.

    The ``iris`` specs are adjusted to use the first row as header, to
    drop ``dtypes`` and to take column names from ``colnames``. Loading
    the range [25, 75] is expected to give 50 rows indexed 0..49.
    """
    specs = pr.get_schema_specs("pysemantic", "iris")
    specs['nrows'] = dict(range=[25, 75])
    specs['header'] = 0
    specs.pop('dtypes')
    specs['column_names'] = colnames(specs['path'])
    loaded = pr.Project(schema={'iris': specs}).load_dataset('iris')
    self.assertEqual(loaded.shape[0], 50)
    sequential = np.arange(50)
    index_matches = np.allclose(loaded.index.values, sequential)
    self.assertTrue(index_matches)
def test_row_selection_random_range(self):
    """Check if a range of rows can be selected in random order.

    The ``iris`` specs are adjusted to use the first row as header, to
    drop ``dtypes`` and to take column names from ``colnames``. Loading
    the range [25, 75] with ``random`` enabled is expected to give 50
    rows whose index is not simply 0..49 in order.
    """
    specs = pr.get_schema_specs("pysemantic", "iris")
    specs['nrows'] = dict(range=[25, 75], random=True)
    specs['header'] = 0
    specs.pop('dtypes')
    specs['column_names'] = colnames(specs['path'])
    loaded = pr.Project(schema={'iris': specs}).load_dataset('iris')
    self.assertEqual(loaded.shape[0], 50)
    # The index must differ from the ordered sequence 0..49.
    sequential = np.arange(50)
    matches_everywhere = np.all(loaded.index.values == sequential)
    self.assertFalse(matches_everywhere)
def test_init_project_yaml_dump(self):
    """Test initialization of Project class with the raw yaml dump.

    A project is built from the full ``pysemantic`` schema and all of its
    datasets are loaded. The set of dataset names is checked, and the
    ``iris``, ``person_activity`` and ``multi_iris`` frames are compared
    against frames read directly with pandas from ``expected_specs``.
    """
    schema = pr.get_schema_specs('pysemantic')
    loaded = pr.Project(schema=schema).load_datasets()
    expected_names = ('iris', 'person_activity', 'multi_iris',
                      'bad_iris', 'random_row_iris')
    self.assertItemsEqual(loaded.keys(), expected_names)

    # Single-file datasets: compare directly with a plain read_csv.
    for name in ('iris', 'person_activity'):
        expected = pd.read_csv(**self.expected_specs[name])
        self.assertDataFrameEqual(loaded[name], expected)

    # Multi-file dataset: de-duplicate each part, stack them and renumber
    # the index from zero before comparing.
    parts = []
    for read_kwargs in self.expected_specs['multi_iris']:
        parts.append(pd.read_csv(**read_kwargs).drop_duplicates())
    combined = pd.concat(parts)
    combined.set_index(np.arange(combined.shape[0]), inplace=True)
    self.assertDataFrameEqual(loaded['multi_iris'], combined)
def test_add_dataset(self):
    """Test if the add-dataset subcommand adds datasets to projects.

    A small random CSV is written to a temporary directory and registered
    as ``testdata`` in the ``pysemantic`` project through the CLI. The
    test then checks that the project lists the dataset and that its
    stored specifications match the path and delimiter that were given.

    The dataset is removed from the project and the temporary directory
    is deleted afterwards, whether or not the checks succeeded.
    """
    tempdir = tempfile.mkdtemp()
    try:
        outfile = op.join(tempdir, "testdata.csv")
        dframe = pd.DataFrame(np.random.random((10, 2)),
                              columns=['a', 'b'])
        dframe.to_csv(outfile, index=False)
        # Build the argument vector directly instead of splitting a
        # formatted string on spaces: a temporary path that contains a
        # space would otherwise be broken into several arguments.
        cmd = ['semantic', 'add-dataset', 'testdata',
               '--project', 'pysemantic',
               '--path', outfile,
               '--dlm', ',']
        try:
            subprocess.check_call(cmd, env=self.testenv)
            _pr = pr.Project("pysemantic")
            self.assertIn("testdata", _pr.datasets)
            specs = dict(path=outfile, delimiter=',')
            actual = pr.get_schema_specs("pysemantic", "testdata")
            self.assertKwargsEqual(specs, actual)
        finally:
            pr.remove_dataset("pysemantic", "testdata")
    finally:
        # Outer `finally` so the directory is removed even when writing
        # the CSV or un-registering the dataset raises.
        shutil.rmtree(tempdir)
def test_add_dataset(self):
    """Test if adding datasets programmatically works fine.

    A random 10x10 CSV is written to a temporary directory and registered
    as ``sample_dataset`` in the ``pysemantic`` project; the specs read
    back from the project must match the ones supplied.

    Cleanup deletes the temporary directory and then rewrites the test
    data dictionary file without the ``sample_dataset`` entry.
    """
    scratch = tempfile.mkdtemp()
    csv_path = op.join(scratch, "foo.csv")
    pd.DataFrame(np.random.random((10, 10))).to_csv(csv_path, index=False)
    specs = {'path': csv_path, 'delimiter': ',', 'nrows': 10}
    try:
        pr.add_dataset("pysemantic", "sample_dataset", specs)
        stored = pr.get_schema_specs("pysemantic", "sample_dataset")
        self.assertKwargsEqual(specs, stored)
    finally:
        shutil.rmtree(scratch)
        # Strip the entry straight from the data dictionary file.
        with open(TEST_DATA_DICT, "r") as fin:
            dictionary = yaml.load(fin, Loader=Loader)
        dictionary.pop('sample_dataset')
        with open(TEST_DATA_DICT, "w") as fout:
            yaml.dump(dictionary, fout, Dumper=Dumper,
                      default_flow_style=False)
def test_init_project_yaml_dump(self):
    """Test initialization of Project class with the raw yaml dump.

    Builds a project from the complete ``pysemantic`` schema, loads every
    dataset and checks the dataset names. The ``iris``,
    ``person_activity`` and ``multi_iris`` frames are compared against
    frames read directly with pandas from ``expected_specs``.
    """
    schema = pr.get_schema_specs('pysemantic')
    project = pr.Project(schema=schema)
    loaded = project.load_datasets()
    self.assertItemsEqual(
        loaded.keys(),
        ('iris', 'person_activity', 'multi_iris', 'bad_iris',
         'random_row_iris'))

    expected_iris = pd.read_csv(**self.expected_specs['iris'])
    self.assertDataFrameEqual(loaded['iris'], expected_iris)

    expected_activity = pd.read_csv(
        **self.expected_specs['person_activity'])
    self.assertDataFrameEqual(loaded['person_activity'],
                              expected_activity)

    # Each part of the multi-file dataset is de-duplicated before the
    # parts are stacked and the index is renumbered from zero.
    deduplicated = [
        pd.read_csv(**read_kwargs).drop_duplicates()
        for read_kwargs in self.expected_specs['multi_iris']
    ]
    expected_multi = pd.concat(deduplicated)
    n_rows = expected_multi.shape[0]
    expected_multi.set_index(np.arange(n_rows), inplace=True)
    self.assertDataFrameEqual(loaded['multi_iris'], expected_multi)
def test_get_schema_spec(self):
    """Check the module-level getter for schema specifications.

    The specs returned for the ``pysemantic`` project must match the
    fixture held in ``self.data_specs``.
    """
    self.assertKwargsEqual(pr.get_schema_specs("pysemantic"),
                           self.data_specs)