def _check_md5(self):
    import sys
    if sys.platform == 'win32':
        msg = "Verifying md5 checksums is not yet supported for your OS."
        logger.warn(msg)
        warnings.warn(msg, UserWarning)
        return
    if self.md5:
        if self.md5 != get_md5_checksum(self.filepath):
            msg = \
                """The MD5 checksum of the file {} does not match the one
                specified in the schema. This may not be the file you are
                looking for."""
            logger.warn(msg.format(self.filepath))
            warnings.warn(msg.format(self.filepath), UserWarning)
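# NOTE: hedged sketch, not part of the original source. The methods and tests
# in this section call a module-level helper named get_md5_checksum; its body
# is not shown here, so this is only an assumed implementation: a chunked
# hashlib.md5 digest of the file, returned as a hex string.
def get_md5_checksum(filepath, blocksize=2 ** 20):
    """Compute the MD5 hex digest of a file, reading it in chunks."""
    import hashlib
    md5 = hashlib.md5()
    with open(filepath, "rb") as fileobj:
        for chunk in iter(lambda: fileobj.read(blocksize), b""):
            md5.update(chunk)
    return md5.hexdigest()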
def test_md5(self): """Check if the md5 checksum validation works properly.""" schema = deepcopy(self.basespecs["iris"]) schema['md5'] = get_md5_checksum(schema['path']) SchemaValidator(specification=schema) tempdir = tempfile.mkdtemp() outpath = op.join(tempdir, "bad_iris.csv") iris = pd.read_csv(schema['path']) del iris['Species'] iris.to_csv(outpath, index=False) schema['path'] = outpath try: with warnings.catch_warnings(record=True) as catcher: SchemaValidator(specification=schema).get_parser_args() assert len(catcher) == 1 assert issubclass(catcher[-1].category, UserWarning) finally: shutil.rmtree(tempdir)
def _get_parser_args(self):
    if self.md5:
        if self.md5 != get_md5_checksum(self.filepath):
            msg = \
                """The MD5 checksum of the file {} does not match the one
                specified in the schema. This may not be the file you are
                looking for."""
            logger.warn(msg.format(self.filepath))
            warnings.warn(msg.format(self.filepath), UserWarning)
    args = {}
    if self._delimiter:
        args['sep'] = self._delimiter

    # Columns to use
    if len(self.colnames) > 0:
        args['usecols'] = self.colnames

    # Columns to exclude
    if len(self.exclude_columns) > 0:
        usecols = colnames(self._filepath, sep=args.get('sep', ','))
        for colname in self.exclude_columns:
            usecols.remove(colname)
        args['usecols'] = usecols

    # NA values
    if len(self.na_values) > 0:
        args['na_values'] = self.na_values

    # Date/Time arguments
    # FIXME: Allow for a mix of datetime column groupings and individual
    # columns
    if len(self.datetime_cols) > 0:
        if isinstance(self.datetime_cols, dict):
            args['parse_dates'] = self.datetime_cols
        elif isinstance(self.datetime_cols, list):
            args['parse_dates'] = [self.datetime_cols]
    else:
        parse_dates = []
        for k, v in self._dtypes.iteritems():
            if v is datetime.date:
                parse_dates.append(k)
        for k in parse_dates:
            del self._dtypes[k]
        args['dtype'] = self.dtypes
        if len(parse_dates) > 0:
            args['parse_dates'] = parse_dates

    if len(self.converters) > 0:
        args['converters'] = self.converters

    if self.header != 0:
        args['header'] = self.header

    if self.column_names is not None:
        if isinstance(self.column_names, list):
            args['names'] = self.column_names
            # Force include the header argument
            args['header'] = self.header
        elif isinstance(self.column_names, dict) or callable(self.column_names):
            self.df_rules['column_names'] = self.column_names

    if self.is_multifile:
        arglist = []
        for i in range(len(self._filepath)):
            argset = copy.deepcopy(args)
            argset.update({'filepath_or_buffer': self._filepath[i]})
            argset.update({'nrows': self._nrows[i]})
            arglist.append(argset)
        return arglist
    else:
        if self._filepath:
            args.update({'filepath_or_buffer': self._filepath})
        if "nrows" in self.specification:
            args.update({'nrows': self._nrows})
        self.pickled_args.update(args)
        return self.pickled_args
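# NOTE: hedged usage sketch, not part of the original source. It assumes only
# what the snippets above show: the public get_parser_args returns either a
# single dict of pandas.read_csv keyword arguments or, for multi-file
# specifications, a list of such dicts each carrying its own
# 'filepath_or_buffer'. The load_dataset helper name is hypothetical.
import pandas as pd


def load_dataset(validator):
    """Read the dataset(s) described by a validator into DataFrame(s)."""
    parser_args = validator.get_parser_args()
    if isinstance(parser_args, list):
        # Multi-file dataset: one read_csv call per argument set.
        return [pd.read_csv(**argset) for argset in parser_args]
    return pd.read_csv(**parser_args)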
def test_md5(self):
    ideal = "9b3ecf3031979169c0ecc5e03cfe20a6"
    actual = get_md5_checksum(self.filepath)
    self.assertEqual(ideal, actual)
def test_md5(self): """Test the md5 checksum calculator.""" ideal = "9b3ecf3031979169c0ecc5e03cfe20a6" actual = get_md5_checksum(self.filepath) self.assertEqual(ideal, actual)