Example #1
File: cli.py Project: prolificd/pysemantic
def cli(arguments):
    """cli - The main CLI argument parser.

    :param arguments: command line arguments, as parsed by docopt
    :type arguments: dict
    :return: None
    """
    if arguments.get("list", False):
        if arguments["--project"] is None:
            pr.view_projects()
        else:
            proj_name = arguments.get("--project")
            dataset_names = pr.get_datasets(proj_name)
            for name in dataset_names:
                print(name)
    elif arguments.get("add", False):
        proj_name = arguments.get("PROJECT_NAME")
        proj_spec = arguments.get("PROJECT_SPECFILE")
        proj_spec = op.abspath(proj_spec)
        pr.add_project(proj_name, proj_spec)
    elif arguments.get("remove", False):
        proj_name = arguments.get("PROJECT_NAME")
        if arguments["--dataset"] is None:
            if not pr.remove_project(proj_name):
                print "Removing the project {0} failed.".format(proj_name)
        else:
            pr.remove_dataset(proj_name, arguments["--dataset"])
    elif arguments.get("set-schema", False):
        try:
            proj_name = arguments.get("PROJECT_NAME")
            proj_spec = arguments.get("SCHEMA_FPATH")
            proj_spec = op.abspath(proj_spec)
            pr.set_schema_fpath(proj_name, proj_spec)
        except MissingProject:
            msg = """Project {} not found in the configuration. Please use
            $ semantic add
            to register the project.""".format(
                arguments.get("PROJECT_NAME")
            )
            print(msg)
    elif arguments.get("set-specs", False):
        proj_name = arguments.get("PROJECT_NAME")
        dataset_name = arguments.get("--dataset")
        newspecs = {}
        if arguments.get("--path", False):
            newspecs["path"] = arguments.get("--path")
        if arguments.get("--dlm", False):
            newspecs["delimiter"] = arguments.get("--dlm")
        pr.set_schema_specs(proj_name, dataset_name, **newspecs)
    elif arguments.get("add-dataset", False):
        proj_name = arguments.get("--project")
        dataset_name = arguments.get("DATASET_NAME")
        specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"])
        pr.add_dataset(proj_name, dataset_name, specs)
    elif arguments.get("export", False):
        project = pr.Project(arguments.get("PROJECT_NAME"))
        project.export_dataset(arguments.get("--dataset"), outpath=arguments.get("OUTPATH"))
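For context, here is a minimal sketch of the entry point that would feed `cli` its `arguments` dict (in the original file, `op` is `os.path` and `pr` is pysemantic's project module). The usage text below is illustrative only, not the project's actual docstring, and the sketch assumes it lives in the same module as `cli`:

# Hypothetical entry point; the usage text is an illustration, not
# the actual pysemantic usage string.
"""semantic

Usage:
    semantic list [--project=<name>]
    semantic add PROJECT_NAME PROJECT_SPECFILE
    semantic remove PROJECT_NAME [--dataset=<name>]
"""
from docopt import docopt


def main():
    # docopt parses sys.argv against the usage text into a dict,
    # which is exactly the shape cli() expects.
    arguments = docopt(__doc__)
    cli(arguments)


if __name__ == "__main__":
    main()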
Example #2
def test_exclude_cols(self):
    """Test if importing data with excluded columns works."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "iris.csv")
    specs = {'path': filepath, 'exclude_columns': ['Species']}
    pr.add_dataset('pysemantic', 'excl_iris', specs)
    try:
        project = pr.Project("pysemantic")
        loaded = project.load_dataset("excl_iris")
        self.assertNotIn('Species', loaded.columns)
    finally:
        pr.remove_dataset("pysemantic", "excl_iris")
Example #3
def test_index_col(self):
    """Test if specifying the index_col works."""
    iris_fpath = self.expected_specs['iris']['filepath_or_buffer']
    specs = {'path': iris_fpath, 'index_col': 'Species',
             'dataframe_rules': {'drop_duplicates': False}}
    pr.add_dataset("pysemantic", "iris_indexed", specs)
    try:
        df = pr.Project('pysemantic').load_dataset('iris_indexed')
        for species in ['setosa', 'versicolor', 'virginica']:
            # .loc replaces the removed .ix for label-based indexing
            self.assertEqual(df.loc[species].shape[0], 50)
    finally:
        pr.remove_dataset('pysemantic', 'iris_indexed')
Example #4
def test_column_postprocessors(self):
    """Test if postprocessors work on column data properly."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "iris.csv")
    col_rules = {'Species': {'postprocessors': [_dummy_postproc]}}
    specs = {'path': filepath, 'column_rules': col_rules}
    pr.add_dataset("pysemantic", "postproc_iris", specs)
    try:
        project = pr.Project("pysemantic")
        loaded = project.load_dataset("postproc_iris")
        processed = loaded['Species']
        self.assertNotIn("setosa", processed.unique())
    finally:
        pr.remove_dataset("pysemantic", "postproc_iris")
Example #5
def test_add_dataset(self):
    """Test if adding datasets programmatically works fine."""
    tempdir = tempfile.mkdtemp()
    outpath = op.join(tempdir, "foo.csv")
    dframe = pd.DataFrame(np.random.random((10, 10)))
    dframe.to_csv(outpath, index=False)
    specs = dict(path=outpath, delimiter=',', nrows=10)
    try:
        pr.add_dataset("pysemantic", "sample_dataset", specs)
        parsed_specs = pr.get_schema_specs("pysemantic", "sample_dataset")
        self.assertKwargsEqual(specs, parsed_specs)
    finally:
        shutil.rmtree(tempdir)
        with open(TEST_DATA_DICT, "r") as fileobj:
            test_specs = yaml.load(fileobj, Loader=Loader)
        del test_specs['sample_dataset']
        with open(TEST_DATA_DICT, "w") as fileobj:
            yaml.dump(test_specs, fileobj, Dumper=Dumper,
                      default_flow_style=False)
Example #6
def test_multiindex(self):
    """Test if providing a list of indices in the schema returns a proper
    multiindexed dataframe."""
    pa_fpath = self.expected_specs['person_activity']['filepath_or_buffer']
    index_cols = ['sequence_name', 'tag']
    specs = {'path': pa_fpath, 'index_col': index_cols, 'delimiter': '\t'}
    pr.add_dataset("pysemantic", "pa_multiindex", specs)
    try:
        df = pr.Project('pysemantic').load_dataset('pa_multiindex')
        self.assertTrue(isinstance(df.index, pd.MultiIndex))
        self.assertEqual(len(df.index.levels), 2)
        org_df = pd.read_table(specs['path'])
        for col in index_cols:
            x = org_df[col].unique().tolist()
            y = df.index.get_level_values(col).unique().tolist()
            # assertCountEqual is the Python 3 name for assertItemsEqual
            self.assertCountEqual(x, y)
    finally:
        pr.remove_dataset('pysemantic', 'pa_multiindex')
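The behavior under test mirrors pandas directly: passing a list of column names as index_col yields a MultiIndex. A minimal standalone illustration (the file name is hypothetical):

import pandas as pd

# A list passed to index_col produces a hierarchical (multi-level) index.
df = pd.read_csv("person_activity.tsv", sep="\t",
                 index_col=["sequence_name", "tag"])
assert isinstance(df.index, pd.MultiIndex)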
Example #7
def test_remove_dataset(self):
    """Test if removing datasets works from the command line."""
    # Add a temporary dataset and try to remove it.
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "testdata.csv")
    dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b'])
    dframe.to_csv(outfile, index=False)
    specs = dict(path=outfile, delimiter=',')
    pr.add_dataset("pysemantic", "testdata", specs)
    try:
        command = "semantic remove pysemantic --dataset testdata"
        cmd = command.split(' ')
        subprocess.check_call(cmd, env=self.testenv)
        datasets = pr.get_datasets("pysemantic")
        self.assertNotIn("testdata", datasets)
    finally:
        datasets = pr.get_datasets("pysemantic")
        if "testdata" in datasets:
            pr.remove_dataset("pysemantic", "testdata")
        shutil.rmtree(tempdir)
Example #8
def test_regex_separator(self):
    """Test if the project properly loads a dataset when it encounters
    regex separators.
    """
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "sample.txt")
    # list() is needed on Python 3, where map returns an iterator
    data = ["col1"] + list(map(str, range(10)))
    with open(outfile, "w") as fileobj:
        fileobj.write("\n".join(data))
    specs = dict(path=outfile, delimiter=r'\n', dtypes={'col1': int})
    pr.add_dataset("pysemantic", "sample_dataset", specs)
    try:
        with warnings.catch_warnings(record=True) as catcher:
            _pr = pr.Project("pysemantic")
            dframe = _pr.load_dataset("sample_dataset")
            assert len(catcher) == 2
            assert issubclass(catcher[1].category, ParserWarning)
        data.remove("col1")
        self.assertCountEqual(list(map(int, data)), dframe['col1'].tolist())
    finally:
        pr.remove_dataset("pysemantic", "sample_dataset")
        shutil.rmtree(tempdir)
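A note on the captured warnings: pandas' C parsing engine does not support separators it treats as regular expressions, such as r'\n', so it falls back to the Python engine and emits a ParserWarning about the fallback; that is the warning this test asserts on.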