Example #1
 def test_update_dataset(self):
     """Test if the update_dataset method works."""
     tempdir = tempfile.mkdtemp()
     _pr = pr.Project("pysemantic")
     iris = _pr.load_dataset("iris")
     x = np.random.random((150, ))
     y = np.random.random((150, ))
     iris['x'] = x
     iris['y'] = y
     org_cols = iris.columns.tolist()
     outpath = op.join(tempdir, "iris.csv")
     with open(TEST_DATA_DICT, "r") as fid:
         org_specs = yaml.load(fid, Loader=Loader)
     try:
         _pr.update_dataset("iris", iris, path=outpath, sep='\t')
         _pr = pr.Project("pysemantic")
         iris = _pr.load_dataset("iris")
         self.assertItemsEqual(org_cols, iris.columns.tolist())
         iris_validator = _pr.validators['iris']
         updated_args = iris_validator.parser_args
         self.assertEqual(updated_args['dtype']['x'], float)
         self.assertEqual(updated_args['dtype']['y'], float)
         self.assertEqual(updated_args['sep'], '\t')
         self.assertEqual(updated_args['filepath_or_buffer'], outpath)
     finally:
         shutil.rmtree(tempdir)
         with open(TEST_DATA_DICT, "w") as fid:
             yaml.dump(org_specs,
                       fid,
                       Dumper=Dumper,
                       default_flow_style=False)
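These tests assume a project named "pysemantic" whose data dictionary lives at TEST_DATA_DICT. A minimal sketch of how such a dictionary is produced, using the same plain-dict-to-YAML pattern the tests themselves use (the dataset entry here is illustrative, not the actual test fixture):

import yaml

specs = {'iris': {'path': '/path/to/iris.csv', 'delimiter': ','}}
with open("data_dict.yaml", "w") as fileobj:
    yaml.dump(specs, fileobj, default_flow_style=False)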
Example #2
 def test_load_dataset_missing_nrows(self):
     """Test if the project loads datasets properly if the nrows parameter
     is not provided in the schema.
     """
     # Modify the schema to remove the nrows
     with open(TEST_DATA_DICT, "r") as fileobj:
         org_specs = yaml.load(fileobj, Loader=Loader)
     new_specs = deepcopy(org_specs)
     for dataset_specs in new_specs.itervalues():
         if "nrows" in dataset_specs:
             del dataset_specs['nrows']
     with open(TEST_DATA_DICT, "w") as fileobj:
         yaml.dump(new_specs,
                   fileobj,
                   Dumper=Dumper,
                   default_flow_style=False)
     try:
         _pr = pr.Project("pysemantic")
         dframe = pd.read_csv(**self.expected_specs['iris'])
         loaded = _pr.load_dataset("iris")
         self.assertDataFrameEqual(dframe, loaded)
         dframe = pd.read_table(**self.expected_specs['person_activity'])
         loaded = _pr.load_dataset("person_activity")
         self.assertDataFrameEqual(loaded, dframe)
     finally:
         with open(TEST_DATA_DICT, "w") as fileobj:
             yaml.dump(org_specs,
                       fileobj,
                       Dumper=Dumper,
                       default_flow_style=False)
Example #3
 def setUp(self):
     iris_specs = _get_iris_args()
     copied_iris_specs = deepcopy(iris_specs)
     copied_iris_specs.update({
         'filepath_or_buffer':
         iris_specs['filepath_or_buffer'].replace("iris", "iris2")
     })
     multi_iris_specs = [iris_specs, copied_iris_specs]
     person_activity_specs = _get_person_activity_args()
     random_row_iris_specs = {
         'nrows': {'random': True, 'count': 50},
         'error_bad_lines': False,
         'filepath_or_buffer': op.join(op.abspath(op.dirname(__file__)),
                                       "testdata", "iris.csv")
     }
     expected = {
         'iris': iris_specs,
         'person_activity': person_activity_specs,
         'multi_iris': multi_iris_specs,
         'random_row_iris': random_row_iris_specs
     }
     self.expected_specs = expected
     self.project = pr.Project(project_name="pysemantic")
Example #4
 def test_load_excel_multisheet(self):
     """Test combining multiple sheets into a single dataframe."""
     tempdir = tempfile.mkdtemp()
     spreadsheet = op.join(tempdir, "multifile_iris.xlsx")
     iris = self.project.load_dataset("iris")
     with pd.ExcelWriter(spreadsheet) as writer:
         iris.to_excel(writer, "iris1", index=False)
         iris.to_excel(writer, "iris2", index=False)
     schema = {
         'iris': {
             'path': spreadsheet,
             'sheetname': ['iris1', 'iris2'],
             'dataframe_rules': {
                 'drop_duplicates': False
             }
         }
     }
     schema_fpath = op.join(tempdir, "multi_iris.yaml")
     with open(schema_fpath, "w") as fout:
         yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
     pr.add_project("multi_iris", schema_fpath)
     try:
         ideal = pd.concat((iris, iris), axis=0)
         actual = pr.Project('multi_iris').load_dataset("iris")
         self.assertDataFrameEqual(ideal, actual)
     finally:
         _remove_project("multi_iris", tempdir)
Example #5
 def test_integer_col_na_values(self):
     """Test if the loader can load columns with integers and NAs.

     This is necessary because NaNs cannot be represented by integers."""
     x = map(str, range(20))
     x[13] = ""
     df = pd.DataFrame.from_dict(dict(a=x, b=x))
     tempdir = tempfile.mkdtemp()
     outfile = op.join(tempdir, "testdata.csv")
     df.to_csv(outfile, index=False)
     specfile = op.join(tempdir, "dict.yaml")
     specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
     with open(specfile, "w") as fileobj:
         yaml.dump({'testdata': specs},
                   fileobj,
                   Dumper=yaml.CDumper,
                   default_flow_style=False)
     pr.add_project("wrong_dtype", specfile)
     try:
         _pr = pr.Project("wrong_dtype")
         df = _pr.load_dataset("testdata")
         self.assertEqual(df['a'].dtype, float)
         self.assertEqual(df['b'].dtype, float)
     finally:
         pr.remove_project("wrong_dtype")
         shutil.rmtree(tempdir)
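The float assertions at the end rely on a general pandas rule worth seeing in isolation: an integer column containing a missing value is upcast to float, because NaN has no integer representation. A standalone sketch:

import pandas as pd

s = pd.Series([1, None, 3])
print(s.dtype)  # float64: the missing value forces the upcast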
Example #6
 def test_load_dataset_wrong_dtypes_in_spec(self):
     """Test if the loader can safely load columns that have a wrongly
     specified data type in the schema.
     """
     # Make a file with two columns, both specified as integers in the
     # dtypes, but one has random string types.
     x = np.random.randint(0, 10, size=(100, 2))
     dframe = pd.DataFrame(x, columns=['a', 'b'])
     tempdir = tempfile.mkdtemp()
     outfile = op.join(tempdir, "testdata.csv")
     _ix = np.random.randint(0, 100, size=(5, ))
     dframe['b'][_ix] = "aa"
     dframe.to_csv(outfile, index=False)
     specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
     specfile = op.join(tempdir, "dict.yaml")
     with open(specfile, "w") as fileobj:
         yaml.dump({'testdata': specs},
                   fileobj,
                   Dumper=yaml.CDumper,
                   default_flow_style=False)
     pr.add_project("wrong_dtype", specfile)
     try:
         _pr = pr.Project("wrong_dtype")
         with warnings.catch_warnings(record=True) as catcher:
             dframe = _pr.load_dataset("testdata")
             assert len(catcher) == 1
             assert issubclass(catcher[-1].category, UserWarning)
     finally:
         pr.remove_project("wrong_dtype")
         shutil.rmtree(tempdir)
Example #7
 def test_random_row_selection(self):
     """Check if a random sample of rows can be selected from the dataset."""
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     iris_specs['nrows'] = dict(random=True, count=50)
     project = pr.Project(schema={'iris': iris_specs})
     loaded = project.load_dataset('iris')
     self.assertEqual(loaded.shape[0], 50)
     ideal_ix = np.arange(50)
     self.assertFalse(np.all(loaded.index.values == ideal_ix))
Example #8
def cli(arguments):
    """cli - The main CLI argument parser.

    :param arguments: command line arguments, as parsed by docopt
    :type arguments: dict
    :return: None
    """
    if arguments.get("list", False):
        if arguments['--project'] is None:
            pr.view_projects()
        else:
            proj_name = arguments.get('--project')
            dataset_names = pr.get_datasets(proj_name)
            for name in dataset_names:
                print name
    elif arguments.get("add", False):
        proj_name = arguments.get("PROJECT_NAME")
        proj_spec = arguments.get("PROJECT_SPECFILE")
        proj_spec = op.abspath(proj_spec)
        pr.add_project(proj_name, proj_spec)
    elif arguments.get("remove", False):
        proj_name = arguments.get("PROJECT_NAME")
        if arguments['--dataset'] is None:
            if not pr.remove_project(proj_name):
                print "The project {0} doesn't exist.".format(proj_name)
        else:
            pr.remove_dataset(proj_name, arguments['--dataset'])
    elif arguments.get("set-schema", False):
        try:
            proj_name = arguments.get("PROJECT_NAME")
            proj_spec = arguments.get("SCHEMA_FPATH")
            proj_spec = op.abspath(proj_spec)
            pr.set_schema_fpath(proj_name, proj_spec)
        except MissingProject:
            msg = """Project {} not found in the configuration. Please use
            $ semantic add
            to register the project.""".format(arguments.get("PROJECT_NAME"))
            print msg
    elif arguments.get("set-specs", False):
        proj_name = arguments.get("PROJECT_NAME")
        dataset_name = arguments.get("--dataset")
        newspecs = {}
        if arguments.get("--path", False):
            newspecs['path'] = arguments.get("--path")
        if arguments.get("--dlm", False):
            newspecs['delimiter'] = arguments.get("--dlm")
        pr.set_schema_specs(proj_name, dataset_name, **newspecs)
    elif arguments.get("add-dataset", False):
        proj_name = arguments.get('--project')
        dataset_name = arguments.get("DATASET_NAME")
        specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"])
        pr.add_dataset(proj_name, dataset_name, specs)
    elif arguments.get("export", False):
        project = pr.Project(arguments.get("PROJECT_NAME"))
        project.export_dataset(arguments.get("--dataset"),
                               outpath=arguments.get("OUTPATH"))
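cli consumes the dictionary that docopt builds from the command line, so it can also be driven directly with a hand-written dict. A minimal sketch of the "list" branch, assuming the docopt keys shown above:

arguments = {'list': True, '--project': None}
cli(arguments)  # equivalent to running: semantic list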
Example #9
 def test_nrows_callable(self):
     """Check if specifying the nrows argument as a callable works."""
     nrows = lambda x: np.remainder(x, 2) == 0
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     iris_specs['nrows'] = nrows
     project = pr.Project(schema={'iris': iris_specs})
     loaded = project.load_dataset('iris')
     self.assertEqual(loaded.shape[0], 75)
     ideal_ix = np.arange(150, step=2)
     np.testing.assert_allclose(ideal_ix, loaded.index.values)
Example #10
 def test_row_selection_random_range(self):
     """Check if a range of rows can be selected from the dataset."""
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     iris_specs['nrows'] = {'range': [25, 75], 'random': True}
     iris_specs['header'] = 0
     del iris_specs['dtypes']
     iris_specs['column_names'] = colnames(iris_specs['path'])
     project = pr.Project(schema={'iris': iris_specs})
     loaded = project.load_dataset('iris')
     self.assertEqual(loaded.shape[0], 50)
     ideal_ix = np.arange(50)
     self.assertFalse(np.all(loaded.index.values == ideal_ix))
Example #11
 def test_random_row_selection_within_range(self):
     """Check if randomly selecting rows within a range works."""
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     iris_specs['nrows'] = {'range': [25, 75], 'count': 10, 'random': True}
     iris_specs['header'] = 0
     del iris_specs['dtypes']
     iris_specs['column_names'] = colnames(iris_specs['path'])
     project = pr.Project(schema={'iris': iris_specs})
     loaded = project.load_dataset('iris')
     self.assertEqual(loaded.shape[0], 10)
     ix = loaded.index.values
     self.assertTrue(ix.max() <= 50)
Example #12
 def setUp(self):
     iris_specs = {
         'sep': ',',
         'dtype': {
             'Petal Length': float,
             'Sepal Width': float,
             'Petal Width': float,
             'Sepal Length': float,
             'Species': str
         },
         'usecols': ['Petal Length', 'Sepal Length', 'Sepal Width',
                     'Petal Width', 'Species'],
         'nrows': 150,
         'filepath_or_buffer': op.join(op.abspath(op.dirname(__file__)),
                                       "testdata", "iris.csv")
     }
     copied_iris_specs = deepcopy(iris_specs)
     copied_iris_specs.update({
         'filepath_or_buffer':
         iris_specs['filepath_or_buffer'].replace("iris", "iris2")
     })
     multi_iris_specs = [iris_specs, copied_iris_specs]
     person_activity_specs = {
         'sep': '\t',
         'dtype': {
             'activity': str,
             'sequence_name': str,
             'tag': str,
             'x': float,
             'y': float,
             'z': float
         },
         'usecols': ['activity', 'sequence_name', 'tag', 'x', 'y', 'z',
                     'date'],
         'parse_dates': ['date'],
         'nrows': 100,
         'filepath_or_buffer': op.join(op.abspath(op.dirname(__file__)),
                                       "testdata", "person_activity.tsv")
     }
     expected = {
         'iris': iris_specs,
         'person_activity': person_activity_specs,
         'multi_iris': multi_iris_specs
     }
     self.expected_specs = expected
     self.project = pr.Project(project_name="pysemantic")
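The dictionaries above are the pandas parser arguments that the tests expect pysemantic to derive from the project's YAML schema; the schema itself uses the higher-level keys seen in the other examples ('path', 'delimiter', 'nrows', 'dtypes'). A hedged sketch of what a corresponding iris entry might look like (path and columns abbreviated), written with the same yaml.dump pattern the tests use for dtype dicts:

import yaml

schema = {'iris': {'path': 'testdata/iris.csv',
                   'delimiter': ',',
                   'nrows': 150,
                   'dtypes': {'Species': str, 'Petal Length': float}}}
with open("data_dict.yaml", "w") as fileobj:
    yaml.dump(schema, fileobj, Dumper=yaml.CDumper, default_flow_style=False)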
Example #13
 def test_export_dataset_csv(self):
     """Test if the default csv exporter works."""
     tempdir = tempfile.mkdtemp()
     project = pr.Project("pysemantic")
     try:
         dataset = "iris"
         outpath = op.join(tempdir, dataset + ".csv")
         project.export_dataset(dataset, outpath=outpath)
         self.assertTrue(op.exists(outpath))
         loaded = pd.read_csv(outpath)
         self.assertDataFrameEqual(loaded, project.load_dataset(dataset))
     finally:
         shutil.rmtree(tempdir)
Example #14
 def test_update_dataset_deleted_columns(self):
     """Test if the update dataset method removes column specifications."""
     tempdir = tempfile.mkdtemp()
     _pr = pr.Project("pysemantic")
     iris = _pr.load_dataset("iris")
     outpath = op.join(tempdir, "iris.csv")
     with open(TEST_DATA_DICT, "r") as fid:
         org_specs = yaml.load(fid, Loader=Loader)
     try:
         del iris['Species']
         _pr.update_dataset("iris", iris, path=outpath)
         pr_reloaded = pr.Project("pysemantic")
         iris_reloaded = pr_reloaded.load_dataset("iris")
         self.assertNotIn("Species", iris_reloaded.columns)
         self.assertNotIn("Species", pr_reloaded.column_rules["iris"])
     finally:
         shutil.rmtree(tempdir)
         with open(TEST_DATA_DICT, "w") as fid:
             yaml.dump(org_specs,
                       fid,
                       Dumper=Dumper,
                       default_flow_style=False)
Example #15
 def test_export_dataset_hdf(self):
     """Test if exporting the dataset to hdf works."""
     tempdir = tempfile.mkdtemp()
     project = pr.Project("pysemantic")
     try:
         for dataset in project.datasets:
             if dataset not in ("bad_iris", "random_row_iris"):
                 outpath = op.join(tempdir, dataset + ".h5")
                 project.export_dataset(dataset, outpath=outpath)
                 self.assertTrue(op.exists(outpath))
                 group = r'/{0}/{1}'.format(project.project_name, dataset)
                 loaded = pd.read_hdf(outpath, group)
                 self.assertDataFrameEqual(loaded,
                                           project.load_dataset(dataset))
     finally:
         shutil.rmtree(tempdir)
Example #16
 def test_add_dataset(self):
     """Test if the add-dataset subcommand adds datasets to projects."""
     tempdir = tempfile.mkdtemp()
     outfile = op.join(tempdir, "testdata.csv")
     dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b'])
     dframe.to_csv(outfile, index=False)
     cmd = ("semantic add-dataset testdata --project pysemantic --path {}"
            " --dlm ,")
     cmd = cmd.format(outfile).split(" ")
     try:
         subprocess.check_call(cmd, env=self.testenv)
         _pr = pr.Project("pysemantic")
         self.assertIn("testdata", _pr.datasets)
         specs = dict(path=outfile, delimiter=',')
         actual = pr.get_schema_specs("pysemantic", "testdata")
         self.assertKwargsEqual(specs, actual)
     finally:
         pr.remove_dataset("pysemantic", "testdata")
         shutil.rmtree(tempdir)
Example #17
 def test_init_project_yaml_dump(self):
     """Test initialization of Project class with the raw yaml dump."""
     project_specs = pr.get_schema_specs('pysemantic')
     project = pr.Project(schema=project_specs)
     loaded = project.load_datasets()
     self.assertItemsEqual(loaded.keys(),
                           ('iris', 'person_activity', 'multi_iris',
                            'bad_iris', 'random_row_iris'))
     dframe = pd.read_csv(**self.expected_specs['iris'])
     self.assertDataFrameEqual(loaded['iris'], dframe)
     dframe = pd.read_csv(**self.expected_specs['person_activity'])
     self.assertDataFrameEqual(loaded['person_activity'], dframe)
     dframes = [
         pd.read_csv(**args) for args in self.expected_specs['multi_iris']
     ]
     dframes = [x.drop_duplicates() for x in dframes]
     dframe = pd.concat(dframes)
     dframe.set_index(np.arange(dframe.shape[0]), inplace=True)
     self.assertDataFrameEqual(loaded['multi_iris'], dframe)
Example #18
 def test_regex_separator(self):
     """Test if the project properly loads a dataset when it encounters
     regex separators.
     """
     tempdir = tempfile.mkdtemp()
     outfile = op.join(tempdir, "sample.txt")
     data = ["col1"] + map(str, range(10))
     with open(outfile, "w") as fileobj:
         fileobj.write("\n".join(data))
     specs = dict(path=outfile, delimiter=r'\n', dtypes={'col1': int})
     pr.add_dataset("pysemantic", "sample_dataset", specs)
     try:
         _pr = pr.Project("pysemantic")
         with warnings.catch_warnings(record=True) as catcher:
             dframe = _pr.load_dataset("sample_dataset")
             assert len(catcher) == 2
             assert issubclass(catcher[1].category, ParserWarning)
         data.remove("col1")
         self.assertItemsEqual(map(int, data), dframe['col1'].tolist())
     finally:
         pr.remove_dataset("pysemantic", "sample_dataset")
         shutil.rmtree(tempdir)
Example #19
 def test_reload_data_dict(self):
     """Test if the reload_data_dict method works."""
     project = pr.Project("pysemantic")
     tempdir = tempfile.mkdtemp()
     datapath = op.join(tempdir, "data.csv")
     ideal = pd.DataFrame(np.random.randint(0, 9, size=(10, 5)),
                          columns=map(str, range(5)))
     ideal.to_csv(datapath, index=False)
     with open(TEST_DATA_DICT, "r") as fid:
         specs = yaml.load(fid, Loader=Loader)
     specs['fakedata'] = dict(path=datapath)
     with open(TEST_DATA_DICT, "w") as fid:
         yaml.dump(specs, fid, Dumper=Dumper)
     try:
         project.reload_data_dict()
         actual = project.load_dataset("fakedata")
         self.assertDataFrameEqual(ideal, actual)
     finally:
         shutil.rmtree(tempdir)
         del specs['fakedata']
         with open(TEST_DATA_DICT, "w") as fid:
             yaml.dump(specs, fid, Dumper=Dumper)
Example #20
 def test_error_bad_lines_correction(self):
     """test if the correction for bad lines works."""
     iris_path = op.join(op.abspath(op.dirname(__file__)), "testdata",
                         "iris.csv")
     with open(iris_path, "r") as fid:
         iris_lines = fid.readlines()
     tempdir = tempfile.mkdtemp()
     outpath = op.join(tempdir, "bad_iris.csv")
     iris_lines[50] = iris_lines[50].rstrip() + ",0,23,\n"
     with open(outpath, 'w') as fid:
         fid.writelines(iris_lines)
     data_dict = op.join(tempdir, "dummy_project.yaml")
     specs = {'bad_iris': {'path': outpath}}
     with open(data_dict, "w") as fid:
         yaml.dump(specs, fid, Dumper=Dumper, default_flow_style=False)
     pr.add_project('dummy_project', data_dict)
     try:
         project = pr.Project('dummy_project')
         df = project.load_dataset('bad_iris')
         self.assertItemsEqual(df.shape, (147, 5))
     finally:
         _remove_project("dummy_project", tempdir)
Example #21
 def __enter__(self):
     pr.add_project("dummy_project", self.schema_fpath)
     return pr.Project("dummy_project")
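Only the __enter__ half of the context manager appears here; the counterpart would unregister the temporary project on exit. A minimal sketch of a matching __exit__, assuming cleanup simply mirrors the add_project call above:

 def __exit__(self, exc_type, exc_value, traceback):
     pr.remove_project("dummy_project")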
Example #22
 def test_load_excel_sheetname(self):
     """Test if specifying the sheetname loads the correct dataframe."""
     xl_project = pr.Project("test_excel")
     ideal_iris = self.project.load_dataset("iris")
     actual_iris = xl_project.load_dataset("iris_renamed")
     self.assertDataFrameEqual(ideal_iris, actual_iris)
Example #23
 def test_load_excel(self):
     """Test if excel spreadsheets are read properly from the schema."""
     xl_project = pr.Project("test_excel")
     ideal_iris = self.project.load_dataset("iris")
     actual_iris = xl_project.load_dataset("iris")
     self.assertDataFrameEqual(ideal_iris, actual_iris)
Example #24
 def test_na_reps(self):
     """Test if the NA representations are parsed properly."""
     project = pr.Project("pysemantic")
     loaded = project.load_dataset("bad_iris")
     self.assertItemsEqual(loaded.shape, (300, 5))