def validate_data(df):
    """Validate a data frame of car listings against the expected schema.

    The per-column validation lists (``null_validation``,
    ``string_validation``, ...) are defined elsewhere in the module. Every
    validation error that is found is printed.

    Args:
        df: The data frame to validate.

    Returns:
        bool: True when validation produced no errors, False when errors
        were reported or when validation itself raised.
    """
    schema = Schema([
        Column('brand', null_validation + string_validation),
        Column('gear', null_validation + string_validation),
        Column('model', null_validation + string_validation),
        # The minimum-price check belongs in the validation list. It used to
        # be passed as a separate positional argument, where Column reads it
        # as ``allow_empty`` and the check is never applied.
        Column('price',
               null_validation + int_validation + price_max_validation
               + price_min_validation),
        Column('fuel', null_validation + string_validation),
        Column('mileage',
               null_validation + float_validation + mileage_max_validation),
        Column('hp',
               null_validation + float_validation + hp_min_validation
               + hp_max_validation),
        Column('type', null_validation + string_validation),
        Column('geo', null_validation + string_validation),
        Column('model_year', null_validation + float_validation),
    ])

    try:
        errors = schema.validate(df)
        for e in errors:
            print(e)
    except Exception:
        # A failure inside the validation machinery counts as invalid data,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return False
    return not errors
def validate_data(self):
    """Validate the table chunk by chunk and report whether it is valid.

    For each chunk from ``self.df_iterator()`` the columns listed in
    ``self.cols_to_read`` are selected and renamed to
    ``self.cols_to_validate``. The SNP validators are applied when the SNP
    field is in the header, and the position validators when both the
    chromosome and base-pair fields are. Validation stops once the number
    of bad rows reaches ``self.error_limit``.

    Returns:
        bool: True when no bad rows were recorded, False otherwise.
    """
    self.setup_field_validation()
    if not self.open_file_and_check_for_squareness():
        logger.error(
            "Please fix the table. Some rows have different numbers of columns to the header"
        )
        logger.info(
            "Rows with different numbers of columns to the header are not validated"
        )

    for chunk in self.df_iterator():
        to_validate = chunk[self.cols_to_read]
        # sets the headers to standard format if needed
        to_validate.columns = self.cols_to_validate

        # validate the snp column if present
        if SNP_DSET in self.header:
            self.schema = Schema(
                [SNP_VALIDATORS[h] for h in self.cols_to_validate])
            errors = self.schema.validate(to_validate)
            self.store_errors(errors, self.snp_errors)

        # Both positional fields must be present. The previous condition
        # (`CHR_DSET and BP_DSET in self.header`) only tested BP_DSET.
        if CHR_DSET in self.header and BP_DSET in self.header:
            self.schema = Schema(
                [POS_VALIDATORS[h] for h in self.cols_to_validate])
            errors = self.schema.validate(to_validate)
            self.store_errors(errors, self.pos_errors)

        self.process_errors()
        if len(self.bad_rows) >= self.error_limit:
            break

    if not self.bad_rows:
        logger.info("File is valid")
        return True

    logger.info(
        "File is invalid - {} bad rows, limit set to {}".format(
            len(self.bad_rows), self.error_limit))
    return False
def validate_results(conn, args, filepath):
    """Validate an input file that should contain GWAS result data.

    Every column of the file is added to the schema. Columns whose names
    start with a known identifier pattern must be convertible to ``int``,
    and p-value columns must be convertible to ``float``. The first
    validation error is logged and raised.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file

    Raises:
        Exception: wrapping the first validation error encountered.
    """
    df = pd.read_csv(filepath)

    # (column-name pattern, type the column values must convert to);
    # patterns are matched case-insensitively at the start of the name.
    conversion_rules = (
        ("(SNP)|(chr)|(pos)|(nSNPs)", int),
        ("((null)?pval(ue)?)", float),
    )

    schema_columns = [
        Column(name, [
            CanConvertValidation(target)
            for pattern, target in conversion_rules
            if re.match(pattern, name, re.IGNORECASE)
        ])
        for name in df.columns
    ]

    err = Schema(schema_columns).validate(df)
    if not err:
        return

    # Only the first error is reported: raising ends the loop.
    for problem in err:
        logging.error(f"Error encountered while validating: {filepath}")
        raise Exception(problem)
def test_schema(self):
    """
    Run the dtype validation through a full schema and check the messages.

    The point is to confirm that a ValidationWarning carrying no row number
    does not break the schema and still yields a helpful message.
    """
    frame = pd.DataFrame(data={
        'wrong_dtype1': ['not_an_int'],
        'wrong_dtype2': [123],
        'wrong_dtype3': [12.5],
    })

    columns = [
        Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
        Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
        Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
    ]
    errors = Schema(columns).validate(frame)

    expected_messages = [
        'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
        'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
        'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64',
    ]
    actual_messages = [str(error) for error in errors]

    # Order of the reported errors is not part of the contract.
    self.assertEqual(sorted(actual_messages), sorted(expected_messages))
def validate_line(conn, args, filepath):
    """Validate an input file that should contain line data.

    The file is read without a header and must have exactly one column,
    whose entries must all be distinct. The first validation error is
    logged and raised.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file

    Raises:
        Exception: when the file does not have exactly one column, or
            wrapping the first validation error encountered.
    """
    schema = Schema([Column('line_name', [IsDistinctValidation()])])

    df = pd.read_csv(filepath, header=None)
    column_count = len(df.columns)
    if column_count != 1:
        raise Exception(f"Invalid file format. Excepted 1 column found {column_count} columns. This file should be a single column of each line. Each entry should be distinct.")

    # The file has no header, so name the single column for the schema.
    df.columns = ['line_name']

    err = schema.validate(df)
    if not err:
        return

    # Only the first error is reported: raising ends the loop.
    for problem in err:
        logging.error(f"Error encountered while validating: {filepath}")
        raise Exception(problem)
def __init__(
    self,
    args: Namespace,
    sources: Dict[str, Any],
    schema: List[Tuple[str, np.generic]],
    destinations: Dict[str, Any],
    stage: str,
    task: str,
):
    """Set up parameters and client libraries for an ETL task.

    When ``args.rm`` is set, cached files of every source are removed first,
    for both the ``raw`` stage and the given ``stage``.

    :param args: args passed from command line, see `get_arg_parser()`
    :param sources: data source to be extracted, specified in task config,
        see `configs/*.py`
    :param schema: the target schema to load to, as (column name, dtype)
        pairs.
    :param destinations: destinations to load data to, specified in task
        config, see `configs/*.py`
    :param stage: the stage of the loaded data, could be staging/production.
    :param task: the name of the task.
    """
    # Clear cached files
    if args.rm:
        for source in sources:
            cached_files = []
            for cached_stage in ("raw", stage):
                pattern = get_path_format(True).format(
                    prefix=destinations["fs"]["prefix"],
                    stage=cached_stage,
                    task=args.task,
                    source=source,
                )
                cached_files.extend(glob.glob(pattern))
            for cached_file in cached_files:
                log.info("Removing cached file: %s" % cached_file)
                os.remove(cached_file)

    self.task = task
    self.stage = stage
    self.args = args
    self.period = args.period
    self.current_date = args.date
    self.last_month = lookback_dates(args.date, args.period)
    self.sources = sources

    # One dtype-checked column per (name, dtype) entry of the target schema.
    self.schema = Schema([
        Column(entry[0], [IsDtypeValidation(entry[1])]) for entry in schema
    ])
    self.raw_schema = schema
    self.destinations = destinations

    self.raw = {}
    self.extracted_base = {}
    self.extracted = {}
    self.transformed = {}
    self.gcs = storage.Client()
def validate_variant(conn, args, filepath):
    """Validate an input file that should contain variant data.

    The file is read as header-less tab-separated text and must have exactly
    two columns: ``chr`` (convertible to int) and ``pos`` (convertible to
    int and distinct). The first validation error is logged and raised.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file

    Raises:
        Exception: when the file does not have exactly two columns, or
            wrapping the first validation error encountered.
    """
    chromosome = Column('chr', [CanConvertValidation(int)])
    position = Column('pos', [CanConvertValidation(int), IsDistinctValidation()])
    schema = Schema([chromosome, position])

    df = pd.read_csv(filepath, sep='\t', header=None)
    column_count = len(df.columns)
    if column_count != 2:
        raise Exception(f"Invalid file format. Excepted 2 columns, found {column_count} columns. Columns should consist of chromsome number and SNP position. Filepath: {filepath}")

    # The file has no header, so name the columns for the schema.
    df.columns = ['chr', 'pos']

    err = schema.validate(df)
    if not err:
        return

    # Only the first error is reported: raising ends the loop.
    for problem in err:
        logging.error(f"Error encountered while validating: {filepath}")
        raise Exception(problem)
def function_validation(filename, cand_stage):
    """Validate the 'Mobilizer' sheet of an Excel workbook and split off bad rows.

    Reads the sheet, replaces missing values with empty strings, adds a
    combined ``date_age`` column, validates against a schema, writes the
    errors to ``errors.csv`` and the rows without errors to
    ``clean_data.csv``.

    Args:
        filename: Path of the Excel workbook to read.
        cand_stage: Candidate stage; the schema below is built when it
            equals ``'1'``.

    Returns:
        The number of validation errors (one entry per error, so a row with
        several errors is counted several times).
    """
    df = pd.read_excel(filename, sheet_name='Mobilizer')
    df = df.fillna('')
    # Helper column combining age and date of birth, checked by dob_validation.
    df['date_age'] = df['Age*'] + df['Date of Birth*'].astype(str)
    # NOTE(review): the nesting below is reconstructed from a source whose
    # indentation was lost. As laid out here, `schema` is only bound when
    # cand_stage == '1', so other stages fail at `schema.validate` - confirm
    # against the original file.
    if cand_stage == str(1):
        schema = Schema([
            # Non-mandatory columns: only the null validation is applied
            Column('Candidate Photo', null_validation),
            Column('Middle Name', null_validation),
            Column('Last Name', null_validation),
            Column('Secondary Contact No', null_validation),
            Column('Email id', null_validation),
            Column('Present Panchayat', null_validation),
            Column('Present Taluk/Block', null_validation),
            Column('Present Address line1', null_validation),
            Column('Present Address line2', null_validation),
            Column('Present Village', null_validation),
            Column('Permanent Address line1', null_validation),
            Column('Permanent Address line2', null_validation),
            Column('Permanent Village', null_validation),
            Column('Permanent Panchayat', null_validation),
            Column('Permanent Taluk/Block', null_validation),
            # String + null validation
            Column('Fresher/Experienced?*', str_validation + null_validation),
            Column('Salutation*', str_validation + null_validation),
            Column('First Name*', str_validation + null_validation),
            Column('Gender*', str_validation + null_validation),
            Column('Marital Status*', str_validation + null_validation),
            Column('Caste*', str_validation + null_validation),
            Column('Disability Status*', str_validation + null_validation),
            Column('Religion*', str_validation + null_validation),
            Column('Source of Information*', str_validation + null_validation),
            Column('Present District*', str_validation + null_validation),
            Column('Present State*', str_validation + null_validation),
            Column('Present Country*', str_validation + null_validation),
            Column('Permanent District*', str_validation + null_validation),
            Column('Permanent State*', str_validation + null_validation),
            Column('Permanent Country*', str_validation + null_validation),
            # Pincode validation
            Column('Present Pincode*', pincode_validation + null_validation),
            Column('Permanent Pincode*', pincode_validation + null_validation),
            # Mobile number validation
            Column('Primary contact No*', mob_validation + null_validation),
            # Date of birth and age: null validation on each, plus the
            # combined check on the helper column
            Column('Date of Birth*', null_validation),
            Column('Age*', null_validation),
            Column('date_age', dob_validation)
        ])
    errors = schema.validate(df)
    # Row index of every reported error; used to drop the offending rows.
    errors_index_rows = [e.row for e in errors]
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    df_clean = df.drop(index=errors_index_rows)
    df_clean.to_csv('clean_data.csv', index=None)
    return (len(errors_index_rows))
def validate_genotype(conn, args, filepath):
    """Validate an input file that should contain genotype data.

    The file is header-less tab-separated text: a distinct integer row
    number followed by one column per position. The number of positions is
    taken from the line count of the ``<filepath>.pos`` counterpart file.
    Every genotype value must be an integer coded as -1, 0, 1 or 2. The
    first validation error is logged and raised.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file

    Raises:
        FileNotFoundError: when the ``.pos`` counterpart file is missing.
        Exception: wrapping the first validation error encountered.
    """
    # Allow for users to skip this validation step because it is time consuming
    if args.skip_genotype_validation is True:
        return

    # Get the number of lines from the .pos counterpart file
    pos_filepath = '.'.join([filepath, 'pos'])
    if not os.path.exists(pos_filepath):
        raise FileNotFoundError(f"Count not locate the position counterpart file for {filepath}")
    n_positions = len(pd.read_csv(pos_filepath, header=None).index)

    def is_valid_code(series):
        # CustomSeriesValidation needs a boolean Series. Coercing to numeric
        # keeps this check from raising on unconvertible values, which the
        # CanConvertValidation it is combined with reports instead.
        return pd.to_numeric(series, errors='coerce').isin([-1, 0, 1, 2])

    column_names = ['row_number'] + [f'pos_{n}' for n in range(n_positions)]
    schema_columns = [
        Column('row_number', [CanConvertValidation(int) & IsDistinctValidation()])
    ]
    schema_columns.extend(
        Column(name, [
            CanConvertValidation(int)
            & CustomSeriesValidation(is_valid_code, 'Incorrectly coded value.')
        ])
        for name in column_names[1:]
    )
    schema = Schema(schema_columns)

    df = pd.read_csv(filepath, sep='\t', header=None)
    # The file has no header, so give the columns the names the schema looks
    # up (as validate_line/validate_variant do). When the widths differ the
    # frame is left untouched and the schema reports the column-count mismatch.
    if len(df.columns) == len(column_names):
        df.columns = column_names

    err = schema.validate(df)
    if err:
        # Only the first error is reported: raising ends the loop.
        for e in err:
            logging.error(f"Error encountered while validating: {filepath}")
            raise Exception(e)
def defSchema():
    """Build the expected schema for the toxic-comment data set.

    ``id`` and ``comment_text`` must have object dtype; each of the six label
    columns may only contain 0 or 1. No column may be empty.

    Returns:
        Schema: the assembled schema.
    """
    print('Define expected Schema')

    text_fields = ('id', 'comment_text')
    label_fields = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                    'identity_hate')

    columns = [
        Column(name=field,
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False)
        for field in text_fields
    ]
    columns.extend(
        Column(name=field,
               validations=[InListValidation([0, 1])],
               allow_empty=False)
        for field in label_fields
    )
    return Schema(columns)
def __init__(self):
    """Build the validation schema for the adverse-event report columns."""

    def date_check():
        # A fresh validation per column, using this object's date parser.
        return CanCallValidation(self.parse_date)

    product_roles = ["Suspect", "Concomitant"]
    age_units = ["Year(s)", "Decade(s)", "Month(s)", "Week(s)", "Day(s)"]
    genders = ["Female", "Male"]

    columns = [
        Column("RA_Report #", [CanConvertValidation(int)]),
        Column("RA_CAERS Created Date", [date_check()]),
        # The event start date is the only column allowed to be empty.
        Column("AEC_Event Start Date", [date_check()], allow_empty=True),
        Column("PRI_Product Role", [InListValidation(product_roles)]),
        Column("PRI_Reported Brand/Product Name"),
        Column("PRI_FDA Industry Code"),
        Column("PRI_FDA Industry Name"),
        Column("CI_Age at Adverse Event"),
        Column("CI_Age Unit", [InListValidation(age_units)]),
        Column("CI_Gender", [InListValidation(genders)]),
        Column("AEC_One Row Outcomes"),
        Column("SYM_One Row Coded Symptoms"),
    ]
    self.schemas = Schema(columns)
def compile_source_data_validation_schema(self, dataset):
    """Build the validation schema for the source fields of a dataset.

    Mapper rows whose ``allow_missing`` is ``y`` (case-insensitive), or that
    lack a source field name or type, are skipped. A column is added for
    every remaining field for which ``compile_field_validator`` returns a
    truthy validator.

    Args:
        dataset: Dataset used to filter the mapper, or None to use the whole
            mapper.

    Returns:
        tuple: ``(schema, field_names)``, or ``(None, [])`` when no field
        produced a validator.
    """
    if dataset is None:
        candidates = self.mapper_df
    else:
        candidates = Mapper.filter_mapper(self.mapper_df, dataset)

    is_required = candidates['allow_missing'].str.lower() != 'y'
    candidates = candidates[is_required].dropna(
        subset=['source_field_name', 'source_field_type'])

    field_names = []
    field_schemas = []
    for _, field in candidates.iterrows():
        field_validator = self.compile_field_validator(field)
        if not field_validator:
            continue
        source_name = field['source_field_name']
        field_names.append(source_name)
        field_schemas.append(Column(source_name, field_validator))

    if not field_schemas:
        return None, []
    return Schema(field_schemas), field_names
class ClassifierParser(BaseParser):
    """
    Parser for the classifier output tables.

    These tables hold the predictions produced by the classifiers, so there
    is nothing to scrape or parse: incoming data frames only need to be
    validated against ``schema``.
    """

    default_type: DataType = None
    # No online sources for classifiers output.
    scraper: None
    schema: Schema = Schema(
        [Column("digest")]
        + [
            Column(classifier_name, [IsDtypeValidation(np.float64)])
            for classifier_name in (PCPCLSFR_NAME, CTCLSFR_NAME)
        ]
    )

    def fetch(self, chunksize: int) -> Generator[DataFrame, None, None]:
        """Always raise: classifier outputs have no online source to fetch."""
        raise NotImplementedError("Classifier Output Parser has no Scrapers")

    @staticmethod
    def parse(data, dtype=DataType.CSV_STR) -> DataFrame:
        """
        Always raise: classifier outputs need no parsing.

        Raises:
            NotImplementedError -- classifiers return data frames that only
            need to be validated.
        """
        raise NotImplementedError("Classifier Output Parser has no Scrapers")
def validate_phenotype(conn, args, filepath):
    """Validate an input file that should contain phenotype data.

    The first column must be named like genotype/pedigree/line and hold
    distinct values; every other column must be convertible to float. The
    first validation error is logged and raised.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file

    Raises:
        Exception: when the first column is not the genotype/pedigree/line
            column, or wrapping the first validation error encountered.
    """
    df = pd.read_csv(filepath)

    first_column = df.columns[0]
    if re.match('(genotype)|(pedigree)|(line)', first_column, re.IGNORECASE) is None:
        raise Exception("Genotype/pedigree/line should be the first column in the phenotype file")

    # Rename the first column of data to be the genotypes/lines
    df.rename(columns={f'{first_column}': 'genotype'}, inplace=True)

    schema_columns = [Column('genotype', [IsDistinctValidation()])]
    # NOTE(tparker): This may not always be true. If there any phenotypes that
    #                are listed as categories or strings, then this would fail
    # Find out all the possible phenotype values. It may be difficult to
    # validate input data without a user-provided dtype list
    schema_columns.extend(
        Column(trait, [CanConvertValidation(float)])
        for trait in df.columns[1:]
    )

    err = Schema(schema_columns).validate(df)
    if not err:
        return

    # Only the first error is reported: raising ends the loop.
    for problem in err:
        logging.error(f"Error encountered while validating: {filepath}")
        raise Exception(problem)
def check_join_cols(df1, df2, on):
    """Report problems in the join columns of two data frames.

    Each column in ``on`` is checked, in both frames, for leading
    whitespace, trailing whitespace and duplicate values. The header and the
    issues are printed only when at least one issue was found.

    Args:
        df1: First data frame of the join.
        df2: Second data frame of the join.
        on: List of column names the frames are joined on.
    """
    schema = Schema([
        Column(
            col,
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                IsDistinctValidation(),
            ],
        ) for col in on
    ])

    # Flatten the per-frame results. Testing the length of the outer list was
    # always true (one entry per frame), so the header printed even when
    # nothing was wrong.
    issues = list(itertools.chain.from_iterable(
        schema.validate(frame[on]) for frame in (df1, df2)))

    if issues:
        print("The following issues exist in the index:")
        for issue in issues:
            print(issue)
def _validate(self, diagnosis_df):
    """Validate the diagnosis data frame, log every error and exit on failure.

    Checks the format of ``visit_dt``, the allowed values of ``sex`` and the
    pattern of ``icd10``. When any error is found, each one is logged through
    ``self.Logger`` and the interpreter is asked to exit.
    """
    visit_dt_format = MatchesPatternValidation(
        r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
    allowed_sex = InListValidation(['M', 'K'])
    icd10_format = MatchesPatternValidation(
        r'^[CDIJKMNRZ]{1}\d{1,2}.?\d{0,2}$')

    schema = Schema([
        Column('visit_dt', [visit_dt_format]),
        Column('sex', [allowed_sex]),
        Column('icd10', [icd10_format]),
    ])

    errors = schema.validate(diagnosis_df)
    for problem in errors:
        self.Logger.error(problem)

    if errors:
        exit()
class UnorderedSchema(unittest.TestCase):
    """Tests for a schema created with ordered=False."""

    schema = Schema(
        [Column('a'), Column('b', [LeadingWhitespaceValidation()])],
        ordered=False)

    def test_fields(self):
        """The schema keeps its columns and its ordered flag."""
        self.assertEqual(len(self.schema.columns), 2,
                         'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        """A frame without leading whitespace yields no errors."""
        frame = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        found = self.schema.validate(frame)
        self.assertEqual(len(found), 0,
                         'A correct data frame should have no errors')

    def test_validate_invalid(self):
        """Leading whitespace is reported once: only column b is validated."""
        frame = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        found = self.schema.validate(frame)
        self.assertEqual(len(found), 1,
                         'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        With ordered=False, schema columns are matched to data frame columns
        by name rather than by position.

        The schema lists its columns as [a, b] and validates b; the data
        frame lists them as [b, a] and has leading whitespace in b:

        Schema         a                b (validation)
        Data Frame     b (error)        a

        An error is therefore only reported when schema column b is paired
        with data frame column b, which is the correct behaviour.
        """
        csv_text = '\nb,a\n 1,1\n2,3\n3,3\n'
        frame = pd.read_csv(StringIO(csv_text), sep=',', header=0, dtype=str)

        found = self.schema.validate(frame)

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )
class StockPrice:
    """
    Model representing stock prices over time.

    ``fieldSchema`` requires the four price columns to be float64 and the
    volume column to be int64.
    """

    fieldSchema = Schema(
        [
            Column(price_field, [IsDtypeValidation(np.float64)])
            for price_field in ('open', 'close', 'high', 'low')
        ]
        + [Column('volume', [IsDtypeValidation(np.int64)])]
    )
def validate_population_structure(conn, args, filepath):
    """Validate an input file that should contain population structure data.

    The ``Pedigree`` column must hold distinct values; every column after
    the first must be convertible to float. The first validation error is
    logged and raised.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file

    Raises:
        Exception: wrapping the first validation error encountered.
    """
    df = pd.read_csv(filepath)

    data_rows, ncols = df.shape
    nrows = data_rows + 1  # include the header rows in the count
    logging.debug(f'Population structure columns: {df.columns}')
    logging.debug(f"Population structure dimensions: <{nrows}, {ncols}>")

    schema_columns = [Column('Pedigree', [IsDistinctValidation()])]
    schema_columns.extend(
        Column(name, [CanConvertValidation(float)])
        for name in df.columns[1:]
    )

    err = Schema(schema_columns).validate(df)
    if not err:
        return

    # Only the first error is reported: raising ends the loop.
    for problem in err:
        logging.error(f"Error encountered while validating: {filepath}")
        raise Exception(problem)
def validate_kinship(conn, args, filepath):
    """Validate an input file that should contain kinship data.

    The unnamed first column is renamed to ``line_name`` and must hold
    distinct values; every other column must be convertible to float. The
    first validation error is logged and raised.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file

    Raises:
        Exception: wrapping the first validation error encountered.
    """
    df = pd.read_csv(filepath)
    data_rows, ncols = df.shape

    # since column name is blank by default, rename it for later reference
    df.rename(columns={"Unnamed: 0": "line_name"}, inplace=True)

    nrows = data_rows + 1  # include the header row in the count
    logging.debug(f"Dimensions of kinship matrix: <{nrows}, {ncols}>")

    schema_columns = [Column('line_name', [IsDistinctValidation()])]
    schema_columns.extend(
        Column(name, [CanConvertValidation(float)])
        for name in df.columns[1:]
    )

    err = Schema(schema_columns).validate(df)
    if not err:
        return

    # Only the first error is reported: raising ends the loop.
    for problem in err:
        logging.error(f"Error encountered while validating: {filepath}")
        raise Exception(problem)
def create_schema(self) -> Schema:
    """Create the pandas_schema Schema from the spreadsheet definition.

    Every column gets leading/trailing whitespace checks. On top of that,
    columns receive an institution or country list check, or a regex or
    allowed-values check plus a maximum-length check, depending on the
    spreadsheet definition. A column may be empty unless the definition
    marks it as mandatory.

    Returns:
        Schema: one Column per key of the spreadsheet definition.
    """
    col_list = []
    for column in self.__spreadsheet_def.keys():
        # Whitespace checks apply to every column.
        validators = [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()
        ]
        mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)
        # Special cases for checking institutions/countries...
        # NOTE(review): nesting reconstructed from a source whose indentation
        # was lost. As laid out here the `else` pairs with the `country`
        # test only, so 'submitting_institution' also goes through the
        # regex/allowed-values/length branch - confirm that is intended.
        if column == 'submitting_institution':
            validators.append(
                InListValidation([i.name for i in self.__institutions]))
        if column == 'country':
            validators.append(
                InListValidation([i.country for i in self.__institutions]))
        else:
            # Regex validation
            if self.__spreadsheet_def.get_regex(column):
                validators.append(
                    MatchesPatternValidation(
                        self.__spreadsheet_def.get_regex(column),
                        message=self.__spreadsheet_def.get_regex_validation_message(column)))
            # Validate allowed values (only when no regex is defined)
            elif self.__spreadsheet_def.get_allowed_values(column):
                validators.append(
                    InListValidation(
                        self.__spreadsheet_def.get_allowed_values(column),
                        case_sensitive=False))
            # Field length validation
            max_len = self.__spreadsheet_def.get_max_length(column)
            if max_len and max_len > 0:
                validators.append(
                    _StringLengthValidation(
                        'field length is greater than {} characters'.format(str(max_len)),
                        max_len))
        # Mandatory field validation: only non-mandatory columns may be empty
        col_list.append(
            Column(self.__spreadsheet_def.get_column_name(column),
                   validators,
                   allow_empty=not mandatory_field_flag))
    return Schema(col_list)
def __init__(self):
    """Build the validation schema for the customer columns."""

    def trimmed(name):
        # Column whose values may not start or end with whitespace.
        return Column(
            name,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        )

    columns = [
        trimmed("Given Name"),
        trimmed("Family Name"),
        Column("Age", [InRangeValidation(0, 120)]),
        Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
        Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
    ]
    self.schemas = Schema(columns)
def __init__(self):
    """Build the validation schema for the payment document columns."""

    def trimmed(name):
        # Column whose values may not start or end with whitespace.
        return Column(
            name,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        )

    statuses = ["pending", "paid", "due", "error"]

    columns = [
        Column("id"),
        trimmed("payer_name"),
        Column("document_amount"),
        Column("payed_amount"),
        Column("payer_id_number"),
        trimmed("payer_address"),
        Column("barcode"),
        Column("typable_line"),
        Column("number"),
        trimmed("document_number"),
        Column("due_date", [DateFormatValidation("%m/%d/%y")]),
        trimmed("city"),
        trimmed("state"),
        Column("zip_code"),
        Column("bank_answer_date"),
        Column("pdf_upload_date"),
        Column("status", [InListValidation(statuses)]),
        Column("callback"),
        Column("object_id"),
        Column("extra"),
    ]
    self.schemas = Schema(columns)
def run(self,
        repo_uri: str = None,
        query: str = None,
        schema: Schema = None,
        layer_query: bool = None,
        **kwargs: Any):
    """
    Run a SQL query against a repository and optionally validate the result.

    Args:
        - repo_uri: URI of the repository to query; required.
        - query: SQL query passed to `sql_to_df`.
        - schema: optional schema the resulting data frame is validated
            against.
        - layer_query: passed to `sql_to_df` as `use_lq`.
        - **kwargs: accepted but not used by this method.

    Returns:
        - the data frame produced by the query.

    Raises:
        - AssertionError: if `repo_uri` is not given.
        - SchemaValidationError: if `schema` is given and validation reports
            errors.
    """
    assert repo_uri, 'Must specify repo_uri.'

    parsed = parse_repo(repo_uri)
    repository = Repository(namespace=parsed.namespace,
                            repository=parsed.repository)
    data = sql_to_df(query, repository=repository, use_lq=layer_query)

    if schema is None:
        return data

    errors = schema.validate(data)
    if errors:
        raise SchemaValidationError(errors)
    return data
class OrderedSchema(unittest.TestCase):
    """Tests for a schema created with ordered=True."""

    schema = Schema(
        [Column('a', [LeadingWhitespaceValidation()]), Column('b')],
        ordered=True)

    def test_mixed_columns(self):
        """
        With ordered=True, schema columns are matched to data frame columns
        by position rather than by name.

        The schema lists its columns as [a, b] and validates a; the data
        frame lists them as [b, a] and has leading whitespace in b:

        Schema         a (validation)   b
        Data Frame     b (error)        a

        An error is therefore only reported when the first schema column is
        paired with the first data frame column, which is the correct
        behaviour when ordered=True.
        """
        csv_text = '\nb,a\n 1,1\n2,3\n3,3\n'
        frame = pd.read_csv(StringIO(csv_text), sep=',', header=0, dtype=str)

        found = self.schema.validate(frame)

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by position'
        )
from pandas_schema import Column, Schema
from pandas_schema.validation import MatchesPatternValidation, CanConvertValidation, CustomSeriesValidation
import pandas as pd

# Example of combining validations with boolean operators: a value passes
# when it converts to int, OR when it is longer than one character AND
# matches the pattern 'a'.
schema = Schema([
    Column(
        'col1',
        [
            CanConvertValidation(int)
            | (
                CustomSeriesValidation(
                    lambda x: x.str.len() > 1,
                    "Doesn't have more than 1 character",
                )
                & MatchesPatternValidation('a')
            )
        ],
    )
])

test_data = pd.DataFrame({'col1': ['an', '13', 'a', '8', 'the']})

errors = schema.validate(test_data)

# Report the offending value of every validation error.
for error in errors:
    print(f'"{error.value}" failed!')
class UnorderedSchema(unittest.TestCase):
    """Tests for a schema created with ordered=False, including column subsets."""

    schema = Schema(
        [Column('a'), Column('b', [LeadingWhitespaceValidation()])],
        ordered=False)

    @staticmethod
    def _mixed_order_frame():
        """Frame whose columns are ordered [b, a], with leading whitespace in b."""
        csv_text = '\nb,a\n 1,1\n2,3\n3,3\n'
        return pd.read_csv(StringIO(csv_text), sep=',', header=0, dtype=str)

    def test_fields(self):
        """The schema keeps its columns and its ordered flag."""
        self.assertEqual(len(self.schema.columns), 2,
                         'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        """A frame without leading whitespace yields no errors."""
        frame = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        found = self.schema.validate(frame)
        self.assertEqual(len(found), 0,
                         'A correct data frame should have no errors')

    def test_validate_invalid(self):
        """Leading whitespace is reported once: only column b is validated."""
        frame = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        found = self.schema.validate(frame)
        self.assertEqual(len(found), 1,
                         'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        With ordered=False, schema columns are matched to data frame columns
        by name rather than by position.

        The schema lists its columns as [a, b] and validates b; the data
        frame lists them as [b, a] and has leading whitespace in b:

        Schema         a                b (validation)
        Data Frame     b (error)        a

        An error is therefore only reported when schema column b is paired
        with data frame column b, which is the correct behaviour.
        """
        found = self.schema.validate(self._mixed_order_frame())

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )

    def test_column_subset_detect(self):
        """
        With ordered=False, a subset of the schema's columns can be
        validated. Here only b is passed; a* is left out:

        Schema         a*               b (validation)
        Data Frame     b (error)        a

        The error in b must still be found, which requires schema column b
        to be paired with data frame column b.
        """
        found = self.schema.validate(self._mixed_order_frame(), columns=['b'])

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )

    def test_column_subset_detect_empty(self):
        """
        With ordered=False, a subset of the schema's columns can be
        validated. Here only a is passed; b* is left out:

        Schema         a                b* (validation)
        Data Frame     b (error)        a

        Column a has no validation, so no errors may be reported.
        """
        # should detect no errors
        results_empty = self.schema.validate(self._mixed_order_frame(),
                                             columns=['a'])

        self.assertEqual(len(results_empty), 0, 'There should be no errors')

    def test_column_subset_error(self):
        """
        With ordered=False, a subset of the schema's columns can be
        validated, but only columns the schema knows about:

        Schema         a                b (validation)
        Data Frame     b (error)        a

        Passing any column other than 'a' or 'b' must raise an error.
        """
        frame = self._mixed_order_frame()

        # should raise a PanSchArgumentError
        with self.assertRaises(PanSchArgumentError):
            self.schema.validate(frame, columns=['c'])
def validate_and_annotate(self, file_path=''):
    """Validate the hospital data and write an annotated review workbook.

    The input is read as CSV, falling back to Excel. Missing values are
    replaced by the string "NULL" and two Y/N columns are validated. The
    data is then written to ``for_review/intermediate_hospital_data.xlsx``
    under ``self.dirname``: each offending cell gets the error message as a
    comment and a highlight format, rows without errors are hidden, and
    finally ``self.review_changes()`` is called.

    Args:
        file_path: File to validate. When empty, the intermediate review
            workbook under ``self.dirname`` is used.

    Returns:
        None. When the file can be read neither as CSV nor as Excel, a
        message is printed and nothing is written.
    """
    ################################
    #      SCHEMAS VALIDATION      #
    ################################
    # FIX ME: this is a really janky way to do this. Passing empty string
    # for file_path is a temp fix for review_changes re-validate data button
    # causing an infinite loop when calling this with an argument :(
    if not file_path:
        file_path = self.dirname + '/for_review/intermediate_hospital_data.xlsx'

    try:
        df = pd.read_csv(file_path)
    except Exception:
        try:
            df = pd.read_excel(file_path)
        except Exception:
            # Stop here: continuing would fail on the unbound data frame.
            print("UNACCEPTED FILE FORMAT")
            return

    df.fillna("NULL", inplace=True)

    schema = Schema([
        Column('N95PlanFitTested', [InListValidation(['Y', 'N', 'NULL'])]),
        Column('PARPsPlanTrained', [InListValidation(['Y', 'N', 'NULL'])]),
    ])
    errors = schema.validate(df, columns=schema.get_column_names())

    #######################################
    # Build excel worksheet w formatting
    #######################################
    save_path2 = self.dirname + '/for_review/'
    os.makedirs(save_path2, exist_ok=True)

    # The context manager saves and closes the workbook even when formatting
    # fails part-way, and avoids ExcelWriter.save(), which newer pandas
    # releases no longer provide.
    with pd.ExcelWriter(save_path2 + 'intermediate_hospital_data.xlsx',
                        engine='xlsxwriter') as writer:
        # Skip row 1 headers so we can add manually with formatting
        df.to_excel(writer,
                    sheet_name='Sheet1',
                    startrow=1,
                    header=False,
                    index=False)
        workbook = writer.book
        worksheet = writer.sheets['Sheet1']

        ### WORKBOOK FORMATS ###
        yellow_highlight = workbook.add_format({'bg_color': '#FFEB9C'})
        header = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BC',
            'border': 1
        })

        # Set column widths
        worksheet.set_column('A:II', 30)
        worksheet.set_default_row(hide_unused_rows=True)

        # Write the column headers with the defined format.
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header)

        # Data frame rows that have at least one error; a set keeps the
        # membership test in the hiding loop cheap.
        error_rows = set()
        for error in errors:
            error_rows.add(error.row)
            # +1 because worksheet row 0 holds the header.
            row = error.row + 1
            column = df.columns.get_loc(error.column)
            # Comments
            worksheet.write_comment(row, column, error.message)
            # Highlights
            worksheet.conditional_format(row, column, row, column, {
                'type': 'no_errors',
                'format': yellow_highlight
            })

        # Hide Rows that don't contain errors
        for i in range(len(df) + 1):
            if i not in error_rows:
                worksheet.set_row(i + 1, None, None, {'hidden': True})

    # Pop up
    self.review_changes()
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import (
    LeadingWhitespaceValidation,
    TrailingWhitespaceValidation,
    InRangeValidation,
    DateFormatValidation,
    InListValidation,
)

# Expected content of the fixed-width employee file.
schema = Schema([
    Column(
        'name',
        [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
    ),
    Column(
        'title',
        [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
    ),
    Column('salary', [InRangeValidation(0, 33000)]),
    Column('sex', [InListValidation(['F', 'M'])]),
    Column('date', [DateFormatValidation('%Y-%m-%d')]),
])

# Character width of each field, in file order:
# name, title, salary, sex, date
widths = [9, 19, 6, 4, 11]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset')
print(test_data)

# data verification
def main():
    """Check a dataset folder against its participants.tsv.

    Reports subjects present in only one of the folder and the TSV, NIfTI
    files without a JSON sidecar, and validation errors in the contents of
    participants.tsv.
    """
    # Parse input arguments
    args = get_parser().parse_args()
    data_path = args.path_in

    tsv_file = pd.read_csv(os.path.join(data_path, 'participants.tsv'),
                           sep='\t')

    # Subject folders found on disk vs. subjects listed in the TSV.
    subjects_on_disk = {
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
        and name.startswith('sub')
    }
    subjects_in_tsv = set(tsv_file['participant_id'].tolist())

    missing_subjects_tsv = sorted(subjects_on_disk - subjects_in_tsv)
    missing_subjects_folder = sorted(subjects_in_tsv - subjects_on_disk)

    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        pprint(missing_subjects_folder)

    # Every NIfTI image should have a JSON sidecar next to it, named after
    # the part of the file name before the first dot.
    for dir_name, _subdirs, file_names in os.walk(data_path):
        for file_name in file_names:
            if not file_name.endswith('.nii.gz'):
                continue
            sidecar_path = os.path.join(dir_name,
                                        file_name.split(".")[0] + '.json')
            if not os.path.exists(sidecar_path):
                print("Missing jsonSidecar: " + sidecar_path)

    # Checking participants.tsv contents
    def trimmed(name):
        # Column whose values may not start or end with whitespace.
        return Column(
            name,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()])

    schema = Schema([
        trimmed('participant_id'),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        trimmed('institution_id'),
        trimmed('institution'),
        trimmed('manufacturer'),
        trimmed('manufacturers_model_name'),
        trimmed('receive_coil_name'),
        trimmed('software_versions'),
        trimmed('researcher'),
    ])

    errors = schema.validate(tsv_file)

    print('\nChecking the contents of participants.tsv')
    if errors:
        for error in errors:
            print(error)
    else:
        print("--> all good 👍")