def __init__(self):
    """Build the validation schema for customer records."""
    def stripped():
        # Fresh validators rejecting leading/trailing whitespace.
        return [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]

    self.schemas = Schema([
        Column("Given Name", stripped()),
        Column("Family Name", stripped()),
        Column("Age", [InRangeValidation(0, 120)]),
        Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
        Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
    ])
def __init__(self):
    """Build the validation schema for payment-document rows."""
    def stripped():
        # Fresh validators rejecting leading/trailing whitespace.
        return [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]

    self.schemas = Schema([
        Column("id"),
        Column("payer_name", stripped()),
        Column("document_amount"),
        Column("payed_amount"),
        Column("payer_id_number"),
        Column("payer_address", stripped()),
        Column("barcode"),
        Column("typable_line"),
        Column("number"),
        Column("document_number", stripped()),
        Column("due_date", [DateFormatValidation("%m/%d/%y")]),
        Column("city", stripped()),
        Column("state", stripped()),
        Column("zip_code"),
        Column("bank_answer_date"),
        Column("pdf_upload_date"),
        Column("status", [InListValidation(["pending", "paid", "due", "error"])]),
        Column("callback"),
        Column("object_id"),
        Column("extra"),
    ])
def create_schema(self) -> Schema:
    """
    Create Pandas schema with all the necessary validation rules read in
    from config.

    Every column rejects leading/trailing whitespace. Institution/country
    columns are validated against the known institution list; all other
    columns get their configured regex or allowed-value check. A maximum
    field length is enforced when configured, and columns flagged mandatory
    may not be empty.

    Returns:
        Schema: one Column per configured spreadsheet field.
    """
    col_list = []
    for column in self.__spreadsheet_def.keys():
        validators = [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()
        ]
        mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)
        # Special cases for checking institutions/countries. These skip the
        # generic regex/allowed-value branch below: previously two separate
        # `if` statements meant 'submitting_institution' also fell through
        # into the generic `else` branch.
        if column == 'submitting_institution':
            validators.append(
                InListValidation([i.name for i in self.__institutions]))
        elif column == 'country':
            validators.append(
                InListValidation([i.country for i in self.__institutions]))
        else:
            # Regex validation
            if self.__spreadsheet_def.get_regex(column):
                validators.append(
                    MatchesPatternValidation(
                        self.__spreadsheet_def.get_regex(column),
                        message=self.__spreadsheet_def.
                        get_regex_validation_message(column)))
            # Validate allowed values
            elif self.__spreadsheet_def.get_allowed_values(column):
                validators.append(
                    InListValidation(
                        self.__spreadsheet_def.get_allowed_values(column),
                        case_sensitive=False))
        # Field length validation
        max_len = self.__spreadsheet_def.get_max_length(column)
        if max_len and max_len > 0:
            validators.append(
                _StringLengthValidation(
                    'field length is greater than {} characters'.format(
                        str(max_len)), max_len))
        # Mandatory field validation: optional columns may be empty.
        col_list.append(
            Column(self.__spreadsheet_def.get_column_name(column),
                   validators,
                   allow_empty=not mandatory_field_flag))
    return Schema(col_list)
def check_join_cols(df1, df2, on):
    """Validate the prospective join columns of two dataframes.

    Each column named in ``on`` is checked (in both ``df1`` and ``df2``)
    for leading/trailing whitespace and duplicate values; any problems
    found are printed.

    Fix: the original tested ``len(results) > 0`` where ``results`` was a
    list of two (possibly empty) error lists, so the warning header was
    printed even when there were no errors at all.
    """
    schema = Schema([
        Column(
            col,
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                IsDistinctValidation()
            ],
        ) for col in on
    ])
    # Flatten the per-frame error lists before testing for emptiness.
    errors = list(itertools.chain.from_iterable(
        schema.validate(df) for df in (df1[on], df2[on])))
    if errors:
        print("The following issues exist in the index:")
        for error in errors:
            print(error)
class DoubleValidationColumn(unittest.TestCase):
    """
    Test a column with two different validations
    """
    NAME = 'col1'

    # Both validators fail on every padded cell below.
    col = Column(
        NAME,
        [TrailingWhitespaceValidation(), LeadingWhitespaceValidation()],
        allow_empty=False)
    ser = pd.Series([' a ', ' b ', ' c '])

    def test_outputs(self):
        results = self.col.validate(self.ser)
        # There should be 6 errors, 2 for each row
        self.assertEqual(len(results), 2 * len(self.ser),
                         'A Column produces the wrong number of errors')
        # Check every row; the original iterated range(2) and silently
        # skipped the last row of the series.
        for i in range(len(self.ser)):
            in_row = [r for r in results if r.row == i]
            self.assertEqual(
                len(in_row), 2,
                'A Column does not report both errors for every row')
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, InRangeValidation, \
    DateFormatValidation, InListValidation


def _text_column(name):
    # Free-text column that must not carry leading/trailing whitespace.
    return Column(name, [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()])


schema = Schema([
    _text_column('name'),
    _text_column('title'),
    Column('salary', [InRangeValidation(0, 33000)]),
    Column('sex', [InListValidation(['F', 'M'])]),
    Column('date', [DateFormatValidation('%Y-%m-%d')])
])

# Column widths of the fixed-width source: name, title, salary, sex, date.
widths = [9, 19, 6, 4, 11]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset')
print(test_data)

# data verification
def main():
    """Sanity-check a BIDS-like dataset.

    Reports three classes of problems:
      * subjects present on disk but missing from participants.tsv, and
        participants.tsv entries with no matching folder
      * ``.nii.gz`` images without a JSON sidecar of the same base name
      * participants.tsv rows that violate the validation schema
    """
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()
    data_path = args.path_in
    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')
    # Subject folders are identified by the 'sub' name prefix.
    list_subj = [
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
        and name.startswith('sub')
    ]
    df = pd.DataFrame(tsv_file)
    list_tsv_participants = df['participant_id'].tolist()
    missing_subjects_tsv = list(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = list(set(list_tsv_participants) - set(list_subj))
    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        missing_subjects_tsv.sort()
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        missing_subjects_folder.sort()
        pprint(missing_subjects_folder)
    # Every NIfTI image must have a JSON sidecar next to it.
    for dirName, subdirList, fileList in os.walk(data_path):
        for file in fileList:
            if file.endswith('.nii.gz'):
                jsonSidecarPath = os.path.join(dirName,
                                               file.split(".")[0] + '.json')
                if not os.path.exists(jsonSidecarPath):
                    print("Missing jsonSidecar: " + jsonSidecarPath)

    # Checking participants.tsv contents
    schema = Schema([
        Column('participant_id',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        # '-' is accepted as a placeholder for a missing scan date.
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        Column('institution_id',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
        Column('institution',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
        Column('manufacturer',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
        Column('manufacturers_model_name',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
        Column('receive_coil_name',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
        Column('software_versions',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
        Column('researcher',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    ])
    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
from pandas_schema import Column, Schema
from pandas_schema.validation import (
    LeadingWhitespaceValidation,
    TrailingWhitespaceValidation,
    CanConvertValidation,
    InListValidation,
    CustomElementValidation,
)

# Reusable check: the cell must contain a non-empty string.
EmptyStringValidation = CustomElementValidation(lambda d: d != "", "This field cannot be empty")

# Schema for NIPT result files.
# NOTE(review): the Schema([...]) literal is truncated at the end of this
# chunk; its closing bracket lies outside the visible range.
nipt_results_schema = Schema([
    Column("SampleID", [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("SampleType", []),
    Column("Description", []),
    Column("SampleProject", [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("Index1", []),
    Column("Index2", []),
    Column("Library_nM", []),
    Column("QCFlag", []),
    # Z-scores and chromosome ratios must parse as floats.
    Column("Zscore_13", [CanConvertValidation(float)]),
    Column("Zscore_18", [CanConvertValidation(float)]),
    Column("Zscore_21", [CanConvertValidation(float)]),
    Column("Zscore_X", [CanConvertValidation(float)]),
    Column("Ratio_13", [CanConvertValidation(float)]),
    Column("Ratio_18", [CanConvertValidation(float)]),
    Column("Ratio_21", [CanConvertValidation(float)]),
    Column("Ratio_X", [CanConvertValidation(float)]),
# Iterate over each pair of schema columns and data frame series and run validations column_pairs, errors = self._get_column_pairs(panda_sdrf) for series, column in column_pairs: errors += column.validate(series) return sorted(errors, key=lambda e: e.row) def check_recommendations(self, panda_sdrf): column_pairs, errors = self._get_column_pairs(panda_sdrf) warnings = [] for series, column in column_pairs: warnings += column.validate_optional(series) return sorted(warnings, key=lambda e: e.row) default_schema = SDRFSchema([ SDRFColumn('source name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=True, optional_type=False), SDRFColumn('characteristics[organism part]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=True, optional_type=False), SDRFColumn('characteristics[disease]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=True, optional_type=False), SDRFColumn('characteristics[organism]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), OntologyTerm("ncbitaxon", not_applicable=True)], allow_empty=False, optional_type=False), SDRFColumn('characteristics[cell type]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=False,
def validate_csv(cls, data, registration_type):
    """Validate an uploaded registration CSV.

    Args:
        data: file path or buffer readable by ``pd.read_csv``.
        registration_type: 'slr' (includes license/TFTP columns) or 'sl'.

    Returns:
        (True, None, df) when the CSV is valid, otherwise
        (False, errors_list, df).

    Fix: the IP-address patterns previously used unescaped '.', which
    matches any character (e.g. '1x2y3z4' would pass); the dots are now
    escaped. The duplicated column definitions are factored into helpers.
    """
    df = pd.read_csv(data)
    logger.info("Printing dataframe before CSV validation...")
    logger.info(df)

    def _required(name):
        # Non-empty text column without leading/trailing whitespace.
        return Column(name, [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            CustomSeriesValidation(lambda x: x.str.len() > 0,
                                   'Column is empty!')
        ])

    def _ip(name):
        # Required dotted-quad IP address column.
        return Column(name, [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            MatchesPatternValidation(
                r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$'),
            CustomSeriesValidation(lambda x: x.str.len() > 0,
                                   'Column is empty!')
        ])

    # Columns shared by both registration types, in CSV order.
    common = [
        _ip('ipaddr'),
        _required('username'),
        _required('password'),
        _required('sa_name'),
        _required('va_name'),
        _required('domain'),
    ]
    if registration_type == 'slr':
        csv_schema = Schema(common + [
            Column('license', [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]),
            Column('license_count', [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]),
            _ip('tftp_server_ip'),
            _required('tftp_server_path'),
        ])
    elif registration_type == 'sl':
        csv_schema = Schema(common)
    # NOTE(review): any other registration_type leaves csv_schema unbound
    # and raises UnboundLocalError below, matching the original behavior.
    errors = csv_schema.validate(df)
    if errors:
        errors_list = []
        for error in errors:
            print(error)
            errors_list.append(error)
        return False, errors_list, df
    else:
        return True, None, df
'38': 'GRCh38'} VALID_FILE_EXTENSIONS = [".txt", ".tsv", ".csv", ".tsv.gz", ".csv.gz", "gz", "gzip", ".tsv.gzip", ".csv.gzip"] GENERIC_VALIDATORS = { SNP_DSET: Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=True), CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True), BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True), EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])], allow_empty=True), OR_DSET: Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=True), HR_DSET: Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=True), BETA_DSET: Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET])], allow_empty=True), EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=False), REF_DSET: Column(REF_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=True), FREQ_DSET: Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET])], allow_empty=True), LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=True) } SNP_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} SNP_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=False) SNP_EMPTY_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} SNP_EMPTY_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs[0-9]+|nan)$')], allow_empty=False) SNP_EMPTY_VALIDATORS[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) SNP_EMPTY_VALIDATORS[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), 
InInclusiveRangeValidation(1, 999999999)], allow_empty=False) POS_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} POS_VALIDATORS[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) POS_VALIDATORS[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) EFFECT_WEIGHT_VALIDATOR = {k:v for k,v in GENERIC_VALIDATORS.items()}
def validate(df):
    """Validate the uploaded dataframe; return {message: "<br>"} of issues."""
    # Allowed values come from the choice tables (first element = stored value).
    list_bu, list_rd, list_dept, list_hplevel, list_province, list_title = (
        [choice[0] for choice in choices]
        for choices in (BU_CHOICES, RD_CHOICES, DEPT_CHOICES,
                        HPLEVEL_CHOICES, PROVINCE_CHOICES, TITLE_CHOICES)
    )

    NullValidation = CustomElementValidation(lambda d: d is not np.nan, "该字段不能为空")

    def stripped(*extra):
        # Whitespace + non-null checks shared by the free-text columns.
        return [LeadingWhitespaceValidation(), TrailingWhitespaceValidation(),
                NullValidation, *extra]

    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", stripped()),
        Column("地区经理", stripped()),
        Column("负责代表", stripped()),
        Column("医院编码", stripped(MatchesPatternValidation(r"^[H]{1}(\d){9}$"))),
        Column("医院全称", stripped()),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        Column("客户姓名", [LeadingWhitespaceValidation(),
                        TrailingWhitespaceValidation(),
                        IsDistinctValidation()]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)", [CanConvertValidation(int), InRangeValidation(0, 63)]),
        Column("每半天\n门诊量", [CanConvertValidation(int), InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int), InRangeValidation(0, 101)]),
        Column("备注"),
    ])

    d_error = {}
    for error in schema.validate(df.loc[:, COL]):
        message = str(error)
        # Translate the library's wording, then rewrite ": <row index>"
        # into a human-friendly row reference.
        for term, translated in D_TRANSLATE.items():
            message = message.replace(term, translated)
        message = re.sub(r": [0-9]\d*", row_refined, message)
        d_error[message] = "<br>"

    # Cross-column consistency rules: (column A, column B, direction).
    consistency_rules = (
        ("医院编码", "医院全称", "both"),
        ("区域", "大区", "right"),
        ("大区", "地区经理", "right"),
        ("地区经理", "负责代表", "right"),
        ("医院编码", "省/自治区/直辖市", "left"),
        ("医院编码", "是否双call", "left"),
        ("医院编码", "医院级别", "left"),
        ("医院编码", "开户进展", "left"),
        ("医院全称", "省/自治区/直辖市", "left"),
        ("医院全称", "是否双call", "left"),
        ("医院全称", "医院级别", "left"),
        ("医院全称", "开户进展", "left"),
    )
    for col_a, col_b, how in consistency_rules:
        d_error.update(check_inconsist(df, col_a, col_b, how))
    # Flag contradictions between hospital level and department.
    d_error.update(check_hplevel_with_dept(df))

    return d_error