def cli(arguments):
    """cli - The main CLI argument parser.

    :param arguments: command line arguments, as parsed by docopt
    :type arguments: dict
    :return: None
    """
    if arguments.get("list", False):
        if arguments["--project"] is None:
            pr.view_projects()
        else:
            proj_name = arguments.get("--project")
            dataset_names = pr.get_datasets(proj_name)
            for name in dataset_names:
                print name
    elif arguments.get("add", False):
        proj_name = arguments.get("PROJECT_NAME")
        proj_spec = arguments.get("PROJECT_SPECFILE")
        proj_spec = op.abspath(proj_spec)
        pr.add_project(proj_name, proj_spec)
    elif arguments.get("remove", False):
        proj_name = arguments.get("PROJECT_NAME")
        if arguments["--dataset"] is None:
            if not pr.remove_project(proj_name):
                print "Removing the project {0} failed.".format(proj_name)
        else:
            pr.remove_dataset(proj_name, arguments["--dataset"])
    elif arguments.get("set-schema", False):
        try:
            proj_name = arguments.get("PROJECT_NAME")
            proj_spec = arguments.get("SCHEMA_FPATH")
            proj_spec = op.abspath(proj_spec)
            pr.set_schema_fpath(proj_name, proj_spec)
        except MissingProject:
            msg = """Project {} not found in the configuration. Please use
            $ semantic add
            to register the project.""".format(arguments.get("PROJECT_NAME"))
            print msg
    elif arguments.get("set-specs", False):
        proj_name = arguments.get("PROJECT_NAME")
        dataset_name = arguments.get("--dataset")
        newspecs = {}
        if arguments.get("--path", False):
            newspecs["path"] = arguments.get("--path")
        if arguments.get("--dlm", False):
            newspecs["delimiter"] = arguments.get("--dlm")
        pr.set_schema_specs(proj_name, dataset_name, **newspecs)
    elif arguments.get("add-dataset", False):
        proj_name = arguments.get("--project")
        dataset_name = arguments.get("DATASET_NAME")
        specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"])
        pr.add_dataset(proj_name, dataset_name, specs)
    elif arguments.get("export", False):
        project = pr.Project(arguments.get("PROJECT_NAME"))
        project.export_dataset(arguments.get("--dataset"),
                               outpath=arguments.get("OUTPATH"))

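# A minimal, hypothetical illustration of how `cli` is driven: docopt parses
# sys.argv into a plain dict keyed by the commands, options and positionals of
# the usage string defined elsewhere in this module. The keys below are
# assumptions for the "list" subcommand only, not taken from that usage string:
#
#     example_arguments = {"list": True, "--project": "pysemantic"}
#     cli(example_arguments)  # would print the datasets registered under "pysemantic"
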
def test_exclude_cols(self):
    """Test if importing data with excluded columns works."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "iris.csv")
    specs = {'path': filepath, 'exclude_columns': ['Species']}
    pr.add_dataset('pysemantic', 'excl_iris', specs)
    try:
        project = pr.Project("pysemantic")
        loaded = project.load_dataset("excl_iris")
        self.assertNotIn('Species', loaded.columns)
    finally:
        pr.remove_dataset("pysemantic", "excl_iris")

def test_index_col(self):
    """Test if specifying the index_col works."""
    iris_fpath = self.expected_specs['iris']['filepath_or_buffer']
    specs = {'path': iris_fpath, 'index_col': 'Species',
             'dataframe_rules': {'drop_duplicates': False}}
    pr.add_dataset("pysemantic", "iris_indexed", specs)
    try:
        df = pr.Project('pysemantic').load_dataset('iris_indexed')
        for specie in ['setosa', 'versicolor', 'virginica']:
            self.assertEqual(df.ix[specie].shape[0], 50)
    finally:
        pr.remove_dataset('pysemantic', 'iris_indexed')

def test_column_postprocessors(self):
    """Test if postprocessors work on column data properly."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "iris.csv")
    col_rules = {'Species': {'postprocessors': [_dummy_postproc]}}
    specs = {'path': filepath, 'column_rules': col_rules}
    pr.add_dataset("pysemantic", "postproc_iris", specs)
    try:
        project = pr.Project("pysemantic")
        loaded = project.load_dataset("postproc_iris")
        processed = loaded['Species']
        self.assertNotIn("setosa", processed.unique())
    finally:
        pr.remove_dataset("pysemantic", "postproc_iris")

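# Note: `_dummy_postproc` is a helper defined elsewhere in this test module.
# As a rough, hypothetical sketch (not the actual helper), a column
# postprocessor receives the parsed column as a pandas Series and returns the
# transformed Series, which would make the assertion above hold:
#
#     def _example_postproc(series):
#         """Drop the 'setosa' rows so they are absent after loading."""
#         return series[series != "setosa"]
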
def test_add_dataset(self):
    """Test if adding datasets programmatically works fine."""
    tempdir = tempfile.mkdtemp()
    outpath = op.join(tempdir, "foo.csv")
    dframe = pd.DataFrame(np.random.random((10, 10)))
    dframe.to_csv(outpath, index=False)
    specs = dict(path=outpath, delimiter=',', nrows=10)
    try:
        pr.add_dataset("pysemantic", "sample_dataset", specs)
        parsed_specs = pr.get_schema_specs("pysemantic", "sample_dataset")
        self.assertKwargsEqual(specs, parsed_specs)
    finally:
        shutil.rmtree(tempdir)
        with open(TEST_DATA_DICT, "r") as fileobj:
            test_specs = yaml.load(fileobj, Loader=Loader)
        del test_specs['sample_dataset']
        with open(TEST_DATA_DICT, "w") as fileobj:
            yaml.dump(test_specs, fileobj, Dumper=Dumper,
                      default_flow_style=False)

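# For context: add_dataset writes the specs into the project's data dictionary
# (the YAML file pointed to by TEST_DATA_DICT), which is why the cleanup above
# deletes the 'sample_dataset' key again. A rough sketch of the entry it might
# produce (an assumption about the exact layout, not a verbatim dump):
#
#     sample_dataset:
#       path: /tmp/.../foo.csv
#       delimiter: ','
#       nrows: 10
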
def test_multiindex(self):
    """Test if providing a list of indices in the schema returns a proper
    multiindexed dataframe."""
    pa_fpath = self.expected_specs['person_activity']['filepath_or_buffer']
    index_cols = ['sequence_name', 'tag']
    specs = {'path': pa_fpath, 'index_col': index_cols, 'delimiter': '\t'}
    pr.add_dataset("pysemantic", "pa_multiindex", specs)
    try:
        df = pr.Project('pysemantic').load_dataset('pa_multiindex')
        self.assertTrue(isinstance(df.index, pd.MultiIndex))
        self.assertEqual(len(df.index.levels), 2)
        seq_name, tags = df.index.levels
        org_df = pd.read_table(specs['path'])
        for col in index_cols:
            x = org_df[col].unique().tolist()
            y = df.index.get_level_values(col).unique().tolist()
            self.assertItemsEqual(x, y)
    finally:
        pr.remove_dataset('pysemantic', 'pa_multiindex')

def test_remove_dataset(self):
    """Test if removing datasets works from the command line."""
    # Add a temporary dataset and try to remove it.
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "testdata.csv")
    dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b'])
    dframe.to_csv(outfile, index=False)
    specs = dict(path=outfile, delimiter=',')
    pr.add_dataset("pysemantic", "testdata", specs)
    try:
        command = "semantic remove pysemantic --dataset testdata"
        cmd = command.split(' ')
        subprocess.check_call(cmd, env=self.testenv)
        datasets = pr.get_datasets("pysemantic")
        self.assertNotIn("testdata", datasets)
    finally:
        datasets = pr.get_datasets("pysemantic")
        if "testdata" in datasets:
            pr.remove_dataset("pysemantic", "testdata")
        shutil.rmtree(tempdir)

def test_regex_separator(self):
    """Test if the project properly loads a dataset when it encounters
    regex separators.
    """
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "sample.txt")
    data = ["col1"] + map(str, range(10))
    with open(outfile, "w") as fileobj:
        fileobj.write("\n".join(data))
    specs = dict(path=outfile, delimiter=r'\n', dtypes={'col1': int})
    pr.add_dataset("pysemantic", "sample_dataset", specs)
    try:
        _pr = pr.Project("pysemantic")
        with warnings.catch_warnings(record=True) as catcher:
            dframe = _pr.load_dataset("sample_dataset")
            assert len(catcher) == 2
            assert issubclass(catcher[1].category, ParserWarning)
        data.remove("col1")
        self.assertItemsEqual(map(int, data), dframe['col1'].tolist())
    finally:
        pr.remove_dataset("pysemantic", "sample_dataset")
        shutil.rmtree(tempdir)