Пример #1
0
 def __init__(self, parser_args, project, maxiter=None):
     """Initialize from parser arguments and a project.

     :param parser_args: Dictionary of arguments for the dataset parser.
     :param project: Project instance this object belongs to.
     :param maxiter: Maximum number of iterations; defaults to the number
         of columns found in the dataset.
     """
     self.parser_args = parser_args
     self.project = project
     # The file path may live under either key, depending on the parser.
     fpath = self.parser_args.get('filepath_or_buffer',
                                  self.parser_args.get('io'))
     sep = self.parser_args.get('sep', False)
     if not sep:
         # No separator known: let colnames infer it, and record the
         # (falsy) value back into the parser arguments.
         self.colnames = colnames(fpath)
         self.parser_args['sep'] = sep
     else:
         self.colnames = colnames(fpath, sep=sep)
     self.maxiter = len(self.colnames) if maxiter is None else maxiter
Пример #2
0
 def __init__(self, parser_args, project, maxiter=None):
     """Set up the object from parser arguments.

     :param parser_args: Dictionary of arguments for the dataset parser.
     :param project: Project instance this object belongs to.
     :param maxiter: Maximum number of iterations; when None, the number
         of dataset columns is used.
     """
     self.parser_args = parser_args
     self.project = project
     # Fall back to the 'io' key when 'filepath_or_buffer' is absent.
     fallback_path = self.parser_args.get('io')
     fpath = self.parser_args.get('filepath_or_buffer', fallback_path)
     sep = self.parser_args.get('sep', False)
     if sep:
         self.colnames = colnames(fpath, sep=sep)
     else:
         self.colnames = colnames(fpath)
         # Record the falsy separator back into the parser arguments.
         self.parser_args['sep'] = sep
     if maxiter is None:
         maxiter = len(self.colnames)
     self.maxiter = maxiter
Пример #3
0
    def _detect_row_with_na(self):
        """Return the list of columns in the dataframe, for which the data type
        has been marked as integer, but which contain NAs.

        Re-reads only the integer-typed columns (with the schema's NA
        representations and converters restricted to them) and reports the
        columns in which any null value shows up.

        :return: Names of integer-typed columns containing NA values.
        :rtype: list
        """
        # BUGFIX: default to {} so a schema without a "dtype" entry does not
        # crash on dtypes.get below.
        dtypes = self.parser_args.get("dtype", {})
        usecols = self.parser_args.get("usecols")
        if usecols is None:
            usecols = colnames(self.parser_args['filepath_or_buffer'])
        # Only integer columns matter: NAs silently upcast them to float.
        int_cols = [col for col in usecols if dtypes.get(col) is int]
        fpath = self.parser_args['filepath_or_buffer']
        sep = self.parser_args.get('sep', ',')
        nrows = self.parser_args.get('nrows')
        # Restrict NA representations to the integer columns being re-read.
        na_reps = {}
        if self.parser_args.get('na_values', False):
            for colname, na_vals in self.parser_args.get('na_values').items():
                if colname in int_cols:
                    na_reps[colname] = na_vals
        # Likewise restrict the converters.
        converters = {}
        if self.parser_args.get('converters', False):
            for cname, cnv in self.parser_args.get('converters').items():
                if cname in int_cols:
                    converters[cname] = cnv
        df = self.parser(fpath, sep=sep, usecols=int_cols, nrows=nrows,
                         na_values=na_reps, converters=converters)
        bad_cols = []
        for col in df:
            if np.any(pd.isnull(df[col])):
                bad_cols.append(col)
        return bad_cols
Пример #4
0
 def test_colnames_infer_parser_from_sep(self):
     """Test if the colnames are read if the separator is specified."""
     testdata_dir = op.join(op.abspath(op.dirname(__file__)), "testdata")
     filepath = op.join(testdata_dir, "person_activity.tsv")
     expected = "sequence_name tag date x y z activity".split()
     detected = colnames(filepath, sep='\\t')
     self.assertItemsEqual(detected, expected)
Пример #5
0
 def test_colnames_parser_arg(self):
     """Check that colnames works with an explicit parser callable."""
     from pandas import read_table
     testdata_dir = op.join(op.abspath(op.dirname(__file__)), "testdata")
     filepath = op.join(testdata_dir, "person_activity.tsv")
     expected = "sequence_name tag date x y z activity".split()
     self.assertItemsEqual(colnames(filepath, parser=read_table), expected)
Пример #6
0
 def test_dataset_colnames(self):
     """Check if the column names read by the Loader are correct."""
     separators = {'iris': ',', 'person_activity': '\t'}
     for name, sep in separators.iteritems():
         loaded_cols = self.project.load_dataset(name).columns.tolist()
         expected_cols = colnames(self.data_specs[name]['path'], sep=sep)
         self.assertItemsEqual(expected_cols, loaded_cols)
Пример #7
0
 def test_dataset_colnames(self):
     """Check if the column names read by the Loader are correct."""
     for name, sep in {'iris': ',', 'person_activity': '\t'}.iteritems():
         dataset = self.project.load_dataset(name)
         actual = dataset.columns.tolist()
         expected = colnames(self.data_specs[name]['path'], sep=sep)
         self.assertItemsEqual(expected, actual)
Пример #8
0
 def test_colnames_infer_parser_from_extension(self):
     """Test if the colnames function can infer the correct parser from the
     file extension."""
     here = op.abspath(op.dirname(__file__))
     filepath = op.join(here, "testdata", "person_activity.tsv")
     expected = "sequence_name tag date x y z activity".split()
     # No sep / parser given: the .tsv extension must drive the choice.
     self.assertItemsEqual(colnames(filepath), expected)
Пример #9
0
 def test_colnames(self):
     """Test if the column names are read correctly from a file."""
     expected = ['Sepal Length', 'Sepal Width', 'Petal Length',
                 'Petal Width', 'Species']
     self.assertItemsEqual(colnames(self.filepath), expected)
Пример #10
0
 def test_colnames_parser_arg(self):
     """Test if the colnames are read if the parser is specified."""
     from pandas import read_table
     here = op.abspath(op.dirname(__file__))
     filepath = op.join(here, "testdata", "person_activity.tsv")
     expected = "sequence_name tag date x y z activity".split()
     self.assertItemsEqual(colnames(filepath, parser=read_table), expected)
Пример #11
0
 def _get_colnames(self):
     """Return the column names to be used when loading the dataset.

     Honours the ``use_columns`` specification, strips any excluded
     columns, and makes sure the index column (if any) is included.

     :return: List of column names, or None when nothing can be inferred.
     """
     usecols = self.specification.get('use_columns')
     if len(self.exclude_columns) > 0:
         if usecols:
             for colname in self.exclude_columns:
                 usecols.remove(colname)
         else:
             usecols = colnames(self.filepath, sep=self.delimiter)
             for colname in self.exclude_columns:
                 usecols.remove(colname)
     else:
         if usecols is None:
             if self.filepath and not self.is_multifile:
                 return colnames(self.filepath, sep=self.delimiter)
     if self.index_col is not None:
         # BUGFIX: usecols can still be None here (e.g. multifile dataset
         # with no use_columns spec); `self.index_col not in None` raises
         # TypeError. Guard on None, as the other variant of this method does.
         if usecols is not None:
             if self.index_col not in usecols:
                 usecols.append(self.index_col)
     return usecols
Пример #12
0
 def test_get_multifile_dataset_specs(self):
     """Test if the multifile dataset specifications are valid."""
     specs = self.project.get_dataset_specs("multi_iris")
     for argset in specs:
         argset['usecols'] = colnames(argset['filepath_or_buffer'])
     expected = self.expected_specs['multi_iris']
     self.assertTrue(isinstance(specs, list))
     self.assertEqual(len(specs), len(expected))
     for ix, argset in enumerate(specs):
         self.assertKwargsEqual(argset, expected[ix])
Пример #13
0
 def test_get_multifile_dataset_specs(self):
     """Test if the multifile dataset specifications are valid."""
     actual_specs = self.project.get_dataset_specs("multi_iris")
     for spec in actual_specs:
         spec['usecols'] = colnames(spec['filepath_or_buffer'])
     ideal_specs = self.expected_specs['multi_iris']
     self.assertTrue(isinstance(actual_specs, list))
     self.assertEqual(len(actual_specs), len(ideal_specs))
     for i, spec in enumerate(actual_specs):
         self.assertKwargsEqual(spec, ideal_specs[i])
Пример #14
0
 def test_random_row_selection_within_range(self):
     """Check if randomly selecting rows within a range works."""
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     del iris_specs['dtypes']
     iris_specs['header'] = 0
     iris_specs['nrows'] = {'range': [25, 75], 'count': 10, 'random': True}
     iris_specs['column_names'] = colnames(iris_specs['path'])
     project = pr.Project(schema={'iris': iris_specs})
     loaded = project.load_dataset('iris')
     # 10 rows requested, all drawn from within the 50-row range.
     self.assertEqual(loaded.shape[0], 10)
     self.assertTrue(loaded.index.values.max() <= 50)
Пример #15
0
 def test_row_selection_random_range(self):
     """Check if a range of rows can be selected from the dataset."""
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     del iris_specs['dtypes']
     iris_specs['header'] = 0
     iris_specs['nrows'] = {'range': [25, 75], 'random': True}
     iris_specs['column_names'] = colnames(iris_specs['path'])
     project = pr.Project(schema={'iris': iris_specs})
     loaded = project.load_dataset('iris')
     self.assertEqual(loaded.shape[0], 50)
     # Random selection should not reproduce the sequential index.
     sequential = np.arange(50)
     self.assertFalse(np.all(loaded.index.values == sequential))
Пример #16
0
 def test_random_row_selection_within_range(self):
     """Check if randomly selecting rows within a range works."""
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     iris_specs['nrows'] = {'range': [25, 75], 'count': 10, 'random': True}
     iris_specs['header'] = 0
     iris_specs['column_names'] = colnames(iris_specs['path'])
     del iris_specs['dtypes']
     project = pr.Project(schema={'iris': iris_specs})
     frame = project.load_dataset('iris')
     self.assertEqual(frame.shape[0], 10)
     row_ix = frame.index.values
     self.assertTrue(row_ix.max() <= 50)
Пример #17
0
 def _get_colnames(self):
     """Return the column names to use when loading the dataset, or None.

     Honours the ``use_columns`` specification, strips excluded columns,
     and makes sure the index column (if any) is included.
     """
     usecols = self.specification.get('use_columns')
     # Database-backed sources with no explicit column spec load everything.
     if usecols is None and (self.is_mysql or self.is_postgresql):
         return None
     if len(self.exclude_columns) > 0:
         if not usecols:
             usecols = colnames(self.filepath, sep=self.delimiter)
         for colname in self.exclude_columns:
             usecols.remove(colname)
     elif usecols is None and self.filepath and not self.is_multifile:
         return colnames(self.filepath, sep=self.delimiter)
     # The index column must always be part of the selection.
     if (self.index_col is not None) and (usecols is not None):
         if self.index_col not in usecols:
             usecols.append(self.index_col)
     return usecols
Пример #18
0
 def test_row_selection_random_range(self):
     """Check if a range of rows can be selected from the dataset."""
     iris_specs = pr.get_schema_specs("pysemantic", "iris")
     iris_specs['nrows'] = {'range': [25, 75], 'random': True}
     iris_specs['column_names'] = colnames(iris_specs['path'])
     iris_specs['header'] = 0
     del iris_specs['dtypes']
     project = pr.Project(schema={'iris': iris_specs})
     frame = project.load_dataset('iris')
     self.assertEqual(frame.shape[0], 50)
     # The randomly chosen rows should not be the plain sequential index.
     self.assertFalse(np.all(frame.index.values == np.arange(50)))
Пример #19
0
def _get_iris_args():
    """Get the ideal parser arguments for the iris dataset."""
    filepath = op.join(op.dirname(__file__), "testdata", "iris.csv")
    iris_dtypes = {'Petal Length': float,
                   'Petal Width': float,
                   'Sepal Length': float,
                   'Sepal Width': float,
                   'Species': str}
    return dict(filepath_or_buffer=op.abspath(filepath),
                sep=",", nrows=150, error_bad_lines=False,
                dtype=iris_dtypes, usecols=colnames(filepath),
                na_values=None, parse_dates=False, converters=None,
                header='infer', index_col=None)
Пример #20
0
def _get_person_activity_args():
    """Get the ideal parser arguments for the activity dataset."""
    filepath = op.join(op.dirname(__file__), "testdata", "person_activity.tsv")
    activity_dtypes = {'sequence_name': str,
                       'tag': str,
                       'x': float,
                       'y': float,
                       'z': float,
                       'activity': str}
    return dict(filepath_or_buffer=op.abspath(filepath),
                error_bad_lines=False,
                usecols=colnames(filepath, sep='\t'),
                na_values=None, converters=None, header='infer',
                index_col=None, sep="\t", nrows=100,
                dtype=activity_dtypes, parse_dates=['date'])
Пример #21
0
def _get_iris_args():
    """Get the ideal parser arguments for the iris dataset."""
    filepath = op.join(op.dirname(__file__), "testdata", "iris.csv")
    names = colnames(filepath)
    dtypes = {'Petal Length': float,
              'Petal Width': float,
              'Sepal Length': float,
              'Sepal Width': float,
              'Species': str}
    args = dict(filepath_or_buffer=op.abspath(filepath),
                sep=",",
                nrows=150,
                error_bad_lines=False,
                dtype=dtypes,
                usecols=names,
                na_values=None,
                parse_dates=False,
                converters=None,
                header='infer',
                index_col=None)
    return args
Пример #22
0
def _get_person_activity_args():
    """Get the ideal parser arguments for the activity dataset."""
    filepath = op.join(op.dirname(__file__), "testdata", "person_activity.tsv")
    names = colnames(filepath, sep='\t')
    dtypes = {'sequence_name': str,
              'tag': str,
              'x': float,
              'y': float,
              'z': float,
              'activity': str}
    args = dict(filepath_or_buffer=op.abspath(filepath),
                error_bad_lines=False,
                usecols=names,
                na_values=None,
                converters=None,
                header='infer',
                index_col=None,
                sep="\t",
                nrows=100,
                dtype=dtypes,
                parse_dates=['date'])
    return args
Пример #23
0
    def _detect_row_with_na(self):
        """Return the list of columns in the dataframe, for which the data type
        has been marked as integer, but which contain NAs.

        Re-reads only the integer-typed columns (with the schema's NA
        representations and converters restricted to them) and reports the
        columns in which any null value shows up.

        :return: Names of integer-typed columns containing NA values.
        :rtype: list
        """
        # BUGFIX: default to {} so a schema without a "dtype" entry does not
        # crash on dtypes.get below.
        dtypes = self.parser_args.get("dtype", {})
        usecols = self.parser_args.get("usecols")
        if usecols is None:
            usecols = colnames(self.parser_args['filepath_or_buffer'])
        # Only integer columns matter: NAs silently upcast them to float.
        int_cols = [col for col in usecols if dtypes.get(col) is int]
        fpath = self.parser_args['filepath_or_buffer']
        sep = self.parser_args.get('sep', ',')
        nrows = self.parser_args.get('nrows')
        # Restrict NA representations to the integer columns being re-read.
        na_reps = {}
        if self.parser_args.get('na_values', False):
            for colname, na_vals in self.parser_args.get(
                    'na_values').items():
                if colname in int_cols:
                    na_reps[colname] = na_vals
        # Likewise restrict the converters.
        converters = {}
        if self.parser_args.get('converters', False):
            for cname, cnv in self.parser_args.get('converters').items():
                if cname in int_cols:
                    converters[cname] = cnv
        df = self.parser(fpath,
                         sep=sep,
                         usecols=int_cols,
                         nrows=nrows,
                         na_values=na_reps,
                         converters=converters)
        bad_cols = []
        for col in df:
            if np.any(pd.isnull(df[col])):
                bad_cols.append(col)
        return bad_cols
Пример #24
0
    def _get_parser_args(self):
        """Assemble the keyword arguments to be passed to the parser.

        Checks the file's MD5 checksum against the schema (warning, not
        failing, on mismatch), then builds the argument dict from the
        specification: separator, used/excluded columns, NA values,
        datetime columns, dtypes, converters, header and column names.

        :return: Dict of parser arguments, or a list of such dicts when
            the dataset spans multiple files.
        """
        # Integrity check: a mismatching checksum only warns, never fails.
        if self.md5:
            if self.md5 != get_md5_checksum(self.filepath):
                msg = \
                    """The MD5 checksum of the file {} does not match the one
                     specified in the schema. This may not be the file you are
                     looking for."""
                logger.warn(msg.format(self.filepath))
                warnings.warn(msg.format(self.filepath), UserWarning)
        args = {}
        if self._delimiter:
            args['sep'] = self._delimiter

        # Columns to use
        if len(self.colnames) > 0:
            args['usecols'] = self.colnames

        # Columns to exclude
        # NOTE: this overwrites any 'usecols' set just above when both
        # column lists and exclusions are specified.
        if len(self.exclude_columns) > 0:
            usecols = colnames(self._filepath, sep=args.get('sep', ','))
            for colname in self.exclude_columns:
                usecols.remove(colname)
            args['usecols'] = usecols

        # NA values
        if len(self.na_values) > 0:
            args['na_values'] = self.na_values

        # Date/Time arguments
        # FIXME: Allow for a mix of datetime column groupings and individual
        # columns
        if len(self.datetime_cols) > 0:
            if isinstance(self.datetime_cols, dict):
                args['parse_dates'] = self.datetime_cols
            elif isinstance(self.datetime_cols, list):
                args['parse_dates'] = [self.datetime_cols]
        else:
            # Columns typed as datetime.date are routed to parse_dates and
            # removed from the dtype mapping (self._dtypes is mutated here).
            parse_dates = []
            for k, v in self._dtypes.iteritems():
                if v is datetime.date:
                    parse_dates.append(k)
            for k in parse_dates:
                del self._dtypes[k]
            args['dtype'] = self.dtypes
            if len(parse_dates) > 0:
                args['parse_dates'] = parse_dates

        if len(self.converters) > 0:
            args['converters'] = self.converters

        # NOTE(review): header is only forwarded when nonzero — presumably
        # 0 is already the baseline in pickled_args; confirm.
        if self.header != 0:
            args['header'] = self.header
        if self.column_names is not None:
            if isinstance(self.column_names, list):
                args['names'] = self.column_names
                # Force include the header argument
                args['header'] = self.header
            elif isinstance(self.column_names, dict) or callable(self.column_names):
                # Mappings/callables are stored as a dataframe rule and
                # applied downstream (not visible here).
                self.df_rules['column_names'] = self.column_names

        if self.is_multifile:
            # One argument set per file, each with its own path and nrows.
            arglist = []
            for i in range(len(self._filepath)):
                argset = copy.deepcopy(args)
                argset.update({'filepath_or_buffer': self._filepath[i]})
                argset.update({'nrows': self._nrows[i]})
                arglist.append(argset)
            return arglist
        else:
            if self._filepath:
                args.update({'filepath_or_buffer': self._filepath})
            if "nrows" in self.specification:
                args.update({'nrows': self._nrows})
            # Merge into (and return) the pickled baseline arguments.
            self.pickled_args.update(args)
            return self.pickled_args
Пример #25
0
 def test_colnames(self):
     """Check that the column names are read correctly from the file."""
     expected = ['Sepal Length', 'Sepal Width', 'Petal Length',
                 'Petal Width', 'Species']
     actual = colnames(self.filepath)
     self.assertItemsEqual(actual, expected)
Пример #26
0
 def test_colnames_infer_parser_from_sep(self):
     """Check colnames when an explicit separator is supplied."""
     here = op.abspath(op.dirname(__file__))
     filepath = op.join(here, "testdata", "person_activity.tsv")
     expected = "sequence_name tag date x y z activity".split()
     self.assertItemsEqual(colnames(filepath, sep='\\t'), expected)