def __init__(self, parser_args, project, maxiter=None):
    """Store the parser arguments and infer the column names of the target file."""
    self.parser_args = parser_args
    self.project = project
    fpath = self.parser_args.get('filepath_or_buffer', self.parser_args.get('io'))
    sep = self.parser_args.get('sep', False)
    if sep:
        self.colnames = colnames(fpath, sep=sep)
    else:
        self.colnames = colnames(fpath)
    self.parser_args['sep'] = sep
    if maxiter is not None:
        self.maxiter = maxiter
    else:
        self.maxiter = len(self.colnames)
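# For reference, the colnames() helper used throughout these snippets is not
# defined in this section. A minimal sketch of such a helper, assuming it only
# needs to read the header row with pandas (an illustration, not the actual
# pysemantic implementation):
import pandas as pd


def colnames(filepath, sep=None, parser=None, **kwargs):
    """Read only the header row of a delimited file and return its column names."""
    if parser is None:
        # Hypothetical default: infer the parser from the file extension.
        parser = pd.read_table if filepath.endswith('.tsv') else pd.read_csv
    if sep is not None:
        kwargs['sep'] = sep
    # nrows=0 parses just the header, so no data is loaded.
    return parser(filepath, nrows=0, **kwargs).columns.tolist()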
def _detect_row_with_na(self):
    """Return the list of columns in the dataframe for which the data type
    has been marked as integer, but which contain NAs."""
    dtypes = self.parser_args.get("dtype")
    usecols = self.parser_args.get("usecols")
    if usecols is None:
        usecols = colnames(self.parser_args['filepath_or_buffer'])
    int_cols = [col for col in usecols if dtypes.get(col) is int]
    fpath = self.parser_args['filepath_or_buffer']
    sep = self.parser_args.get('sep', ',')
    nrows = self.parser_args.get('nrows')
    na_reps = {}
    if self.parser_args.get('na_values', False):
        for colname, na_vals in self.parser_args.get('na_values').iteritems():
            if colname in int_cols:
                na_reps[colname] = na_vals
    converters = {}
    if self.parser_args.get('converters', False):
        for cname, cnv in self.parser_args.get('converters').iteritems():
            if cname in int_cols:
                converters[cname] = cnv
    df = self.parser(fpath, sep=sep, usecols=int_cols, nrows=nrows,
                     na_values=na_reps, converters=converters)
    # Columns declared as int that nevertheless contain missing values.
    bad_rows = []
    for col in df:
        if np.any(pd.isnull(df[col])):
            bad_rows.append(col)
    return bad_rows
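# Background for the check above: classic pandas/NumPy integer dtypes cannot
# represent missing values, so a column declared as int that contains NAs is
# silently upcast to float when parsed. For example:
import numpy as np
import pandas as pd

s = pd.Series([1, 2, np.nan])
print(s.dtype)  # float64, not int64 -- the NaN forces the upcast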
def test_colnames_infer_parser_from_sep(self):
    """Test if the colnames are read if the separator is specified."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "person_activity.tsv")
    ideal = "sequence_name tag date x y z activity".split()
    actual = colnames(filepath, sep='\\t')
    self.assertItemsEqual(actual, ideal)
def test_colnames_parser_arg(self):
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "person_activity.tsv")
    ideal = "sequence_name tag date x y z activity".split()
    from pandas import read_table
    actual = colnames(filepath, parser=read_table)
    self.assertItemsEqual(actual, ideal)
def test_dataset_colnames(self):
    """Check if the column names read by the Loader are correct."""
    for name, sep in {'iris': ',', 'person_activity': '\t'}.iteritems():
        loaded = self.project.load_dataset(name)
        columns = loaded.columns.tolist()
        spec_colnames = colnames(self.data_specs[name]['path'], sep=sep)
        self.assertItemsEqual(spec_colnames, columns)
def test_colnames_infer_parser_from_extension(self):
    """Test if the colnames function can infer the correct parser from the
    file extension."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "person_activity.tsv")
    ideal = "sequence_name tag date x y z activity".split()
    actual = colnames(filepath)
    self.assertItemsEqual(actual, ideal)
def test_colnames(self):
    """Test if the column names are read correctly from a file."""
    ideal = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width',
             'Species']
    actual = colnames(self.filepath)
    self.assertItemsEqual(actual, ideal)
def test_colnames_parser_arg(self):
    """Test if the colnames are read if the parser is specified."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "person_activity.tsv")
    ideal = "sequence_name tag date x y z activity".split()
    from pandas import read_table
    actual = colnames(filepath, parser=read_table)
    self.assertItemsEqual(actual, ideal)
def _get_colnames(self):
    """Determine the columns to load, honouring use_columns, exclude_columns
    and the index column."""
    usecols = self.specification.get('use_columns')
    if len(self.exclude_columns) > 0:
        if usecols:
            for colname in self.exclude_columns:
                usecols.remove(colname)
        else:
            usecols = colnames(self.filepath, sep=self.delimiter)
            for colname in self.exclude_columns:
                usecols.remove(colname)
    else:
        if usecols is None:
            if self.filepath and not self.is_multifile:
                return colnames(self.filepath, sep=self.delimiter)
    if self.index_col is not None:
        if self.index_col not in usecols:
            usecols.append(self.index_col)
    return usecols
def test_get_multifile_dataset_specs(self):
    """Test if the multifile dataset specifications are valid."""
    outargs = self.project.get_dataset_specs("multi_iris")
    for argset in outargs:
        argset['usecols'] = colnames(argset['filepath_or_buffer'])
    self.assertTrue(isinstance(outargs, list))
    self.assertEqual(len(outargs), len(self.expected_specs['multi_iris']))
    for i in range(len(outargs)):
        self.assertKwargsEqual(outargs[i], self.expected_specs['multi_iris'][i])
def test_random_row_selection_within_range(self):
    """Check if randomly selecting rows within a range works."""
    iris_specs = pr.get_schema_specs("pysemantic", "iris")
    iris_specs['nrows'] = {'range': [25, 75], 'count': 10, 'random': True}
    iris_specs['header'] = 0
    del iris_specs['dtypes']
    iris_specs['column_names'] = colnames(iris_specs['path'])
    project = pr.Project(schema={'iris': iris_specs})
    loaded = project.load_dataset('iris')
    self.assertEqual(loaded.shape[0], 10)
    ix = loaded.index.values
    self.assertTrue(ix.max() <= 50)
def test_row_selection_random_range(self):
    """Check if a range of rows can be selected from the dataset."""
    iris_specs = pr.get_schema_specs("pysemantic", "iris")
    iris_specs['nrows'] = {'range': [25, 75], 'random': True}
    iris_specs['header'] = 0
    del iris_specs['dtypes']
    iris_specs['column_names'] = colnames(iris_specs['path'])
    project = pr.Project(schema={'iris': iris_specs})
    loaded = project.load_dataset('iris')
    self.assertEqual(loaded.shape[0], 50)
    ideal_ix = np.arange(50)
    self.assertFalse(np.all(loaded.index.values == ideal_ix))
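# The two row-selection tests above exercise an nrows specification of the
# form {'range': [lo, hi], 'count': n, 'random': True}. A hypothetical sketch
# of how such a spec could be translated into a pandas call (an illustration
# only, not the pysemantic loader itself):
import pandas as pd


def load_row_range(path, nrows_spec, header=0, **kwargs):
    start, stop = nrows_spec['range']
    # Skip the data rows that precede the range; the header row itself is kept.
    skip = range(header + 1, header + 1 + start)
    df = pd.read_csv(path, header=header, skiprows=skip, nrows=stop - start,
                     **kwargs)
    if nrows_spec.get('random', False):
        # Sample `count` rows if given, otherwise just shuffle the range.
        df = df.sample(n=nrows_spec.get('count', df.shape[0]))
    return df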
def _get_colnames(self):
    """Determine the columns to load, honouring use_columns, exclude_columns
    and the index column; SQL sources with no use_columns return None."""
    usecols = self.specification.get('use_columns')
    if (usecols is None) and (self.is_mysql or self.is_postgresql):
        return None
    if len(self.exclude_columns) > 0:
        if usecols:
            for colname in self.exclude_columns:
                usecols.remove(colname)
        else:
            usecols = colnames(self.filepath, sep=self.delimiter)
            for colname in self.exclude_columns:
                usecols.remove(colname)
    else:
        if usecols is None:
            if self.filepath and not self.is_multifile:
                return colnames(self.filepath, sep=self.delimiter)
    if self.index_col is not None:
        if usecols is not None:
            if self.index_col not in usecols:
                usecols.append(self.index_col)
    return usecols
def _get_iris_args():
    """Get the ideal parser arguments for the iris dataset."""
    filepath = op.join(op.dirname(__file__), "testdata", "iris.csv")
    names = colnames(filepath)
    return dict(filepath_or_buffer=op.abspath(filepath),
                sep=",", nrows=150, error_bad_lines=False,
                dtype={'Petal Length': float,
                       'Petal Width': float,
                       'Sepal Length': float,
                       'Sepal Width': float,
                       'Species': str},
                usecols=names, na_values=None, parse_dates=False,
                converters=None, header='infer', index_col=None)
def _get_person_activity_args():
    """Get the ideal parser arguments for the activity dataset."""
    filepath = op.join(op.dirname(__file__), "testdata",
                       "person_activity.tsv")
    names = colnames(filepath, sep='\t')
    return dict(filepath_or_buffer=op.abspath(filepath),
                error_bad_lines=False, usecols=names, na_values=None,
                converters=None, header='infer', index_col=None,
                sep="\t", nrows=100,
                dtype={'sequence_name': str,
                       'tag': str,
                       'x': float,
                       'y': float,
                       'z': float,
                       'activity': str},
                parse_dates=['date'])
def _get_parser_args(self):
    """Translate the schema specification into keyword arguments for the
    pandas parser."""
    if self.md5:
        if self.md5 != get_md5_checksum(self.filepath):
            msg = \
                """The MD5 checksum of the file {} does not match the one
                specified in the schema. This may not be the file you are
                looking for."""
            logger.warn(msg.format(self.filepath))
            warnings.warn(msg.format(self.filepath), UserWarning)
    args = {}
    if self._delimiter:
        args['sep'] = self._delimiter

    # Columns to use
    if len(self.colnames) > 0:
        args['usecols'] = self.colnames

    # Columns to exclude
    if len(self.exclude_columns) > 0:
        usecols = colnames(self._filepath, sep=args.get('sep', ','))
        for colname in self.exclude_columns:
            usecols.remove(colname)
        args['usecols'] = usecols

    # NA values
    if len(self.na_values) > 0:
        args['na_values'] = self.na_values

    # Date/Time arguments
    # FIXME: Allow for a mix of datetime column groupings and individual
    # columns
    if len(self.datetime_cols) > 0:
        if isinstance(self.datetime_cols, dict):
            args['parse_dates'] = self.datetime_cols
        elif isinstance(self.datetime_cols, list):
            args['parse_dates'] = [self.datetime_cols]
    else:
        parse_dates = []
        for k, v in self._dtypes.iteritems():
            if v is datetime.date:
                parse_dates.append(k)
        for k in parse_dates:
            del self._dtypes[k]
        args['dtype'] = self.dtypes
        if len(parse_dates) > 0:
            args['parse_dates'] = parse_dates

    if len(self.converters) > 0:
        args['converters'] = self.converters

    if self.header != 0:
        args['header'] = self.header
    if self.column_names is not None:
        if isinstance(self.column_names, list):
            args['names'] = self.column_names
            # Force include the header argument
            args['header'] = self.header
        elif isinstance(self.column_names, dict) or callable(self.column_names):
            self.df_rules['column_names'] = self.column_names

    if self.is_multifile:
        arglist = []
        for i in range(len(self._filepath)):
            argset = copy.deepcopy(args)
            argset.update({'filepath_or_buffer': self._filepath[i]})
            argset.update({'nrows': self._nrows[i]})
            arglist.append(argset)
        return arglist
    else:
        if self._filepath:
            args.update({'filepath_or_buffer': self._filepath})
        if "nrows" in self.specification:
            args.update({'nrows': self._nrows})
        self.pickled_args.update(args)
        return self.pickled_args
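# The MD5 guard above relies on a get_md5_checksum() helper that is not shown
# in this section. A minimal sketch of such a helper, assuming it simply
# hashes the file contents in chunks with hashlib (hypothetical, not
# necessarily the actual implementation):
import hashlib


def get_md5_checksum(filepath, blocksize=2 ** 20):
    """Return the hex MD5 digest of a file, read in blocksize-byte chunks."""
    md5 = hashlib.md5()
    with open(filepath, 'rb') as fid:
        for chunk in iter(lambda: fid.read(blocksize), b''):
            md5.update(chunk)
    return md5.hexdigest()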
def test_colnames(self):
    ideal = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width',
             'Species']
    actual = colnames(self.filepath)
    self.assertItemsEqual(actual, ideal)
def test_colnames_infer_parser_from_sep(self):
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "person_activity.tsv")
    ideal = "sequence_name tag date x y z activity".split()
    actual = colnames(filepath, sep='\\t')
    self.assertItemsEqual(actual, ideal)