def _validate(self, diagnosis_df):
    """Validate the diagnosis dataframe against the expected schema.

    Every violation is logged via ``self.Logger``; if any are found the
    process is terminated with a non-zero exit status.

    Arguments:
        diagnosis_df -- pandas DataFrame with columns
                        'visit_dt', 'sex' and 'icd10'.
    """
    schema = Schema([
        # Visit timestamps must be whole minutes: 'YYYY-MM-DD HH:MM:00'.
        Column('visit_dt', [
            MatchesPatternValidation(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
        ]),
        # 'M'/'K' -- presumably male/female (Polish 'kobieta'); confirm.
        Column('sex', [InListValidation(['M', 'K'])]),
        # ICD-10 code, e.g. 'C50' or 'C50.9'. The dot is now escaped:
        # the original unescaped '.' accepted ANY character between the
        # category and subcategory digits.
        Column('icd10', [
            MatchesPatternValidation(r'^[CDIJKMNRZ]\d{1,2}\.?\d{0,2}$')
        ]),
    ])

    errors = schema.validate(diagnosis_df)
    for error in errors:
        self.Logger.error(error)
    if errors:
        # The original bare exit() terminated with status 0 (success)
        # even though validation failed; signal failure explicitly.
        raise SystemExit(1)
def create_schema(self) -> Schema:
    """
    Create Pandas schema with all the necessary validation rules read in
    from config.

    Every column gets leading/trailing-whitespace checks; institution and
    country columns are validated against the configured institution list;
    other columns get regex or allowed-values checks plus an optional
    maximum-length check. Columns not flagged mandatory allow empty cells.

    Returns:
        Schema -- a pandas_schema.Schema covering all configured columns.
    """
    col_list = []
    for column in self.__spreadsheet_def.keys():
        # Baseline checks applied to every column.
        validators = [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()
        ]
        mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)
        # Special cases for checking institutions/countries...
        if column == 'submitting_institution':
            validators.append(
                InListValidation([i.name for i in self.__institutions]))
        # NOTE(review): this is a separate `if`, not an `elif`, so a
        # 'submitting_institution' column ALSO falls through to the
        # `else` branch below and picks up any configured regex /
        # allowed-values validation -- confirm this is intentional.
        if column == 'country':
            validators.append(
                InListValidation([i.country for i in self.__institutions]))
        else:
            # Regex validation
            if self.__spreadsheet_def.get_regex(column):
                validators.append(
                    MatchesPatternValidation(
                        self.__spreadsheet_def.get_regex(column),
                        message=self.__spreadsheet_def.
                        get_regex_validation_message(column)))
            # Validate allowed values
            elif self.__spreadsheet_def.get_allowed_values(column):
                validators.append(
                    InListValidation(
                        self.__spreadsheet_def.get_allowed_values(column),
                        case_sensitive=False))
        # Field length validation
        max_len = self.__spreadsheet_def.get_max_length(column)
        if max_len and max_len > 0:
            validators.append(
                _StringLengthValidation(
                    'field length is greater than {} characters'.
                    format(str(max_len)), max_len))
        # Mandatory field validation: optional columns may be empty.
        col_list.append(
            Column(self.__spreadsheet_def.get_column_name(column),
                   validators,
                   allow_empty=not mandatory_field_flag))
    return Schema(col_list)
def __init__(self):
    """Set up the validation schema for customer records."""

    def stripped():
        # Fresh whitespace validators for a free-text column.
        return [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]

    self.schemas = Schema([
        Column("Given Name", stripped()),
        Column("Family Name", stripped()),
        Column("Age", [InRangeValidation(0, 120)]),
        Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
        Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
    ])
def validate_csv(cls, data, registration_type):
    """Validate an uploaded registration CSV against the expected schema.

    Arguments:
        data -- path or file-like object readable by pandas.read_csv
        registration_type -- 'slr' or 'sl'; selects the column set

    Returns:
        (True, None, df) when the CSV passes validation, otherwise
        (False, errors_list, df) with the list of schema violations.

    Raises:
        ValueError -- if registration_type is not a known type
                      (previously this crashed later with NameError).
    """

    def required_text(name):
        # Non-empty free-text column with no surrounding whitespace.
        return Column(name, [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            CustomSeriesValidation(lambda x: x.str.len() > 0,
                                   'Column is empty!')
        ])

    def ip_column(name):
        # Dotted-quad IPv4 address. The dots are escaped now: the
        # original pattern's bare '.' matched ANY character between
        # octets. (Octet range 0-255 is still not enforced.)
        return Column(name, [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            MatchesPatternValidation(
                r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$'),
            CustomSeriesValidation(lambda x: x.str.len() > 0,
                                   'Column is empty!')
        ])

    def optional_text(name):
        # May be empty, but must not carry stray whitespace.
        return Column(name, [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()
        ])

    df = pd.read_csv(data)
    logger.info("Printing dataframe before CSV validation...")
    logger.info(df)

    # Columns shared by both registration types.
    common_columns = [
        ip_column('ipaddr'),
        required_text('username'),
        required_text('password'),
        required_text('sa_name'),
        required_text('va_name'),
        required_text('domain'),
    ]

    if registration_type == 'slr':
        csv_schema = Schema(common_columns + [
            optional_text('license'),
            optional_text('license_count'),
            ip_column('tftp_server_ip'),
            required_text('tftp_server_path'),
        ])
    elif registration_type == 'sl':
        csv_schema = Schema(common_columns)
    else:
        # Fail fast with a clear message instead of an unbound-variable
        # NameError at the validate() call below.
        raise ValueError(
            'Unknown registration_type: {}'.format(registration_type))

    errors = csv_schema.validate(df)
    if errors:
        errors_list = []
        for error in errors:
            print(error)
            errors_list.append(error)
        return False, errors_list, df
    return True, None, df
from pandas_schema.validation import InRangeValidation, DateFormatValidation, MatchesPatternValidation ### raw data example # key ,sensor_id ,location_id ,lat ,lon ,timestamp ,pressure ,temperature ,humidity # 1 ,2266 ,1140 ,42.738 ,23.272 ,2017-07-01T00:00:07 ,95270.27 ,23.46 ,62.48 start_time = time.time() pattern_id = r'^-?\d{1,16}$' pattern_dec = r'^-?\d*\.\d{1,2}$' pattern_geo = r'^-?\d*\.\d{1,16}$' schema = Schema([ Column( 'key', [MatchesPatternValidation(pattern_id)]), # Number / integer - up to 16 Column( 'sensor_id', [MatchesPatternValidation(pattern_id)]), # Number / integer - up to 16 Column( 'location', [MatchesPatternValidation(pattern_id)]), # Number / integer - up to 16 Column('lat', [MatchesPatternValidation(pattern_geo) ]), # Number / decimal with up to 16 decimal place Column('lon', [MatchesPatternValidation(pattern_geo) ]), # Number / decimal with up to 16 decimal place Column('timestamp', [DateFormatValidation('%Y-%m-%dT%H:%M:%S')]), # Timestamp yyyy-MM-dd'T'HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01T00:00:07 Column('pressure', [MatchesPatternValidation(pattern_dec)]), # Numbers / / decimal with 1 or 2 decimals (.00) Column('temperature', [
class CtdParser(BaseParser):
    """
    Implementation of CTD Database Parser.

    Comparative Toxicogenomics Gene-Disease Associations Database Parser.
    http://ctdbase.org/
    """

    default_type: DataType = DataType.CSV_STR
    scraper: gs.CtdScraper = gs.CtdScraper()
    # Expected (lower-cased) columns of the parsed dataframe.
    schema: Schema = Schema([
        Column("digest"),
        Column("genesymbol"),
        Column("geneid", [IsDtypeValidation(np.int64)]),
        Column("diseasename"),
        Column("diseaseid", [MatchesPatternValidation("^D[0-9]+$")
                             ]),  # i.e. D000014
        Column("pmids"),
    ])

    @staticmethod
    def hash_record(record: pd.Series) -> str:
        """
        Hash the ctd record to generate digest column.

        Arguments:
            record {pd.Series} -- The ctd record in form of pandas Series

        Returns:
            str -- the hex string of the computed digest
        """
        # The digest identifies the (geneid, diseaseid) association pair.
        message = str.encode(str(record.geneid) + record.diseaseid)
        hexdigest = hashlib.sha256(message).hexdigest()
        return str(hexdigest)

    @staticmethod
    def parse(data, dtype=DataType.CSV_STR) -> DataFrame:
        """
        Parse data and convert according to parser schema.

        Arguments:
            data {Implementation dependent} -- Data to be parsed

        Keyword Arguments:
            dtype {DataType} -- Type of data to be parsed
                                (default: {DataType.CSV})

        Returns:
            DataFrame -- The parsed dataframe.

        Raises:
            ParserError -- If unable to parse data or validation fails
        """
        try:
            parsed_df = pd.read_csv(StringIO(data))
            # Remove unused columns
            parsed_df = parsed_df.drop(columns=[
                "DirectEvidence",
                "InferenceChemicalName",
                "InferenceScore",
                "OmimIDs",
            ])
            # Remove prefix 'MESH:' from DiseaseIDs
            parsed_df["DiseaseID"] = parsed_df.apply(
                lambda x: x.DiseaseID.replace("MESH:", ""), axis=1)
            # Rename columns based on schema
            parsed_df.rename(
                columns={
                    "GeneSymbol": "genesymbol",
                    "GeneID": "geneid",
                    "DiseaseName": "diseasename",
                    "DiseaseID": "diseaseid",
                    "PubMedIDs": "pmids",
                },
                inplace=True,
            )
            # Compute and add the digest
            parsed_df["digest"] = parsed_df.apply(CtdParser.hash_record,
                                                  axis=1)
            errors = CtdParser.validate(parsed_df)
            if errors:
                raise ParserError(errors)
            return parsed_df
        except ParserError:
            # Don't re-wrap the validation ParserError raised just above
            # (the original `except Exception` swallowed and re-wrapped it).
            raise
        except Exception as parse_exp:
            # Chain the cause so the original traceback is preserved.
            raise ParserError(parse_exp) from parse_exp
# Numeric build identifiers mapped to canonical genome-assembly names.
BUILD_MAP = {
    '28': 'NCBI28',
    '29': 'NCBI29',
    '30': 'NCBI30',
    '31': 'NCBI31',
    '33': 'NCBI33',
    '34': 'NCBI34',
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38',
}

# File extensions accepted for upload (plain and gzip-compressed).
VALID_FILE_EXTENSIONS = [
    ".txt", ".tsv", ".csv",
    ".tsv.gz", ".csv.gz",
    "gz", "gzip",
    ".tsv.gzip", ".csv.gzip",
]

# Baseline per-dataset column validators; profile-specific dicts below
# start from these and override individual entries.
GENERIC_VALIDATORS = {
    SNP_DSET: Column(SNP_DSET,
                     [CanConvertValidation(DSET_TYPES[SNP_DSET]),
                      MatchesPatternValidation(r'^rs[0-9]+$')],
                     allow_empty=True),
    CHR_DSET: Column(CHR_DSET,
                     [InListValidation(VALID_CHROMOSOMES)],
                     allow_empty=True),
    BP_DSET: Column(BP_DSET,
                    [CanConvertValidation(DSET_TYPES[BP_DSET]),
                     InInclusiveRangeValidation(1, 999999999)],
                    allow_empty=True),
    EFFECT_WEIGHT_DSET: Column(
        EFFECT_WEIGHT_DSET,
        [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])],
        allow_empty=True),
    OR_DSET: Column(OR_DSET,
                    [CanConvertValidation(DSET_TYPES[OR_DSET])],
                    allow_empty=True),
    HR_DSET: Column(HR_DSET,
                    [CanConvertValidation(DSET_TYPES[HR_DSET])],
                    allow_empty=True),
    BETA_DSET: Column(BETA_DSET,
                      [CanConvertValidation(DSET_TYPES[BETA_DSET])],
                      allow_empty=True),
    # The effect allele is the only column that is always required.
    EFFECT_DSET: Column(EFFECT_DSET,
                        [MatchesPatternValidation(r'^[ACTGN]+$')],
                        allow_empty=False),
    REF_DSET: Column(REF_DSET,
                     [MatchesPatternValidation(r'^[ACTGN]+$')],
                     allow_empty=True),
    FREQ_DSET: Column(FREQ_DSET,
                      [CanConvertValidation(DSET_TYPES[FREQ_DSET])],
                      allow_empty=True),
    LOCUS_DSET: Column(LOCUS_DSET,
                       [CanConvertValidation(DSET_TYPES[LOCUS_DSET]),
                        LeadingWhitespaceValidation(),
                        TrailingWhitespaceValidation()],
                       allow_empty=True),
}

# SNP profile: identical to the generic rules except that the rsID
# column is mandatory (allow_empty=False).
SNP_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_VALIDATORS[SNP_DSET] = Column(
    SNP_DSET,
    [CanConvertValidation(DSET_TYPES[SNP_DSET]),
     MatchesPatternValidation(r'^rs[0-9]+$')],
    allow_empty=False)
def validate(df):
    """Validate the uploaded hospital/customer dataframe.

    Runs pandas_schema per-column checks over the columns listed in COL,
    translates the resulting warnings via D_TRANSLATE / row_refined, then
    adds cross-column consistency checks. Returns a dict whose keys are
    the human-readable warning strings (values are all '<br>', so the
    dict acts as an ordered, de-duplicated message set for HTML output).
    """
    d_error = {}
    # Allowed values extracted from the (value, label) choice tuples.
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]
    # Rejects NaN cells; message text: "this field must not be empty".
    NullValidation = CustomElementValidation(lambda d: d is not np.nan,
                                             "该字段不能为空")
    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("地区经理", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("负责代表", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        # Hospital code: 'H' followed by exactly 9 digits.
        Column(
            "医院编码",
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                NullValidation,
                MatchesPatternValidation(r"^[H]{1}(\d){9}$"),
            ],
        ),
        Column("医院全称", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        # Customer names must be unique across the sheet.
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation()
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int),
                InRangeValidation(0, 63)]),
        Column("每半天\n门诊量",
               [CanConvertValidation(int),
                InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int),
                InRangeValidation(0, 101)]),
        Column("备注"),
    ])
    # Only validate the configured columns, in COL order.
    errors = schema.validate(df.loc[:, COL])
    for error in errors:
        str_warning = str(error)
        # Translate pandas_schema's English phrasing via the lookup table.
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
        # Rewrite the 0-based row number into a user-facing reference.
        findword = r": [0-9]\d*"
        str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # Cross-column consistency checks (merged into the same message dict).
    d_error = {**d_error,
               **check_inconsist(df, "医院编码", "医院全称", "both")}
    d_error = {**d_error, **check_inconsist(df, "区域", "大区", "right")}
    d_error = {**d_error, **check_inconsist(df, "大区", "地区经理", "right")}
    d_error = {**d_error, **check_inconsist(df, "地区经理", "负责代表", "right")}
    d_error = {**d_error,
               **check_inconsist(df, "医院编码", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "开户进展", "left")}
    d_error = {**d_error,
               **check_inconsist(df, "医院全称", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "开户进展", "left")}
    # Check hospital level against department for contradictions.
    d_error = {**d_error, **check_hplevel_with_dept(df)}

    return d_error
'15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', 'X', 'Y' ] VALIDATORS = { PVAL_DSET: Column(PVAL_DSET, [ CanConvertValidation(DSET_TYPES[PVAL_DSET]), InInclusiveRangeValidation(0, 1) ], allow_empty=False), BETA_DSET: Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True), SNP_DSET: Column(SNP_DSET, [ CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation( r'^chr[0-9XY]+_[0-9]+_[ACTGNactgn]+_[ACTGNactgn]+|LONG_STRING$') ], allow_empty=True), CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False), BP_DSET: Column(BP_DSET, [ CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999) ], allow_empty=False), EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGNactgn]+|LONG_STRING$')], allow_empty=True), OTHER_DSET:
'28': 'NCBI28', '29': 'NCBI29', '30': 'NCBI30', '31': 'NCBI31', '33': 'NCBI33', '34': 'NCBI34', '35': 'NCBI35', '36': 'NCBI36', '37': 'GRCh37', '38': 'GRCh38' } VALIDATORS = { SNP_DSET: Column(SNP_DSET, [MatchesPatternValidation(r'rs[0-9]+') ]), # how do we handle the values that are like chr:bp:allele:snp? PVAL_DSET: Column( PVAL_DSET, [CanConvertValidation(float), InInclusiveRangeValidation(0, 1)] #CustomElementValidation(lambda s: float(s) >= 0 and float(s) <= 1, 'outside the range of 0 to 1')] ), CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True), BP_DSET: Column( BP_DSET, [CanConvertValidation(int) & InInclusiveRangeValidation(1, 999999999)], allow_empty=True),
'35': 'NCBI35', '36': 'NCBI36', '37': 'GRCh37', '38': 'GRCh38' } VALID_FILE_EXTENSIONS = [ ".txt", ".tsv", ".csv", ".tsv.gz", ".csv.gz", "gz", "gzip", ".tsv.gzip", ".csv.gzip" ] GENERIC_VALIDATORS = { SNP_DSET: Column(SNP_DSET, [ CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$') ], allow_empty=True), CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True), BP_DSET: Column(BP_DSET, [ CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999) ], allow_empty=True), EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])], allow_empty=True), OR_DSET:
import pandas as pd
from io import StringIO
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesPatternValidation, InRangeValidation, InListValidation

# Per-column validation rules for a customer record.
schema = Schema([
    Column('Given Name',
           [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    Column('Family Name',
           [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    Column('Age', [InRangeValidation(0, 120)]),
    Column('Sex', [InListValidation(['Male', 'Female', 'Other'])]),
    Column('Customer ID', [MatchesPatternValidation(r'\d{4}[A-Z]{4}')]),
])

# Inline sample data; the rows appear to contain deliberate violations
# (stray whitespace, out-of-range age, wrong-case sex, malformed id).
_raw_csv = '''Given Name,Family Name,Age,Sex,Customer ID
Gerald ,Hampton,82,Male,2582GABK
Yuuwa,Miyake,270,male,7951WVLW
Edyta,Majewska ,50,Female,775ANSID
'''
test_data = pd.read_csv(StringIO(_raw_csv))

errors = schema.validate(test_data)

for error in errors:
    print(error)
import time import pandas as pd from pandas_schema import Column, Schema from pandas_schema.validation import DateFormatValidation, MatchesPatternValidation, InListValidation pattern_id = r'^-?\d{1,16}$' # Number / integer - up to 16 pattern_dec = r'^-?\d*\.\d{1,2}$' pattern_geo = r'^-?\d*\.\d{1,20}$' # geo location / decimal with up to 18 decimal place pattern_date = r'%Y-%m-%d %H:%M:%S' # Timestamp yyyy-MM-dd HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01 00:00:07 taxiRide_schema = Schema([ Column('rideId', [MatchesPatternValidation(pattern_id)]), Column('isStart', [InListValidation(['START', 'END'])]), Column('endTime', [DateFormatValidation(pattern_date)]), Column('startTime', [DateFormatValidation(pattern_date)]), Column('startLon', [MatchesPatternValidation(pattern_geo)]), Column('startLat', [MatchesPatternValidation(pattern_geo)]), Column('endLon', [MatchesPatternValidation(pattern_geo)]), Column('endLat', [MatchesPatternValidation(pattern_geo)]), Column('passengerCnt', [MatchesPatternValidation(pattern_id)]) ], ordered=True) taxiFare_schema = Schema([ Column('rideId', [MatchesPatternValidation(pattern_id)]), Column('taxiId', [MatchesPatternValidation(pattern_id)]), Column('driverId', [MatchesPatternValidation(pattern_id)]), Column('startTime', [DateFormatValidation(pattern_date)]), Column('paymentType', [InListValidation(['CSH', 'CRD', 'NOC', 'DIS', 'UNK'])]), Column('tip', [MatchesPatternValidation(pattern_dec)]),
def main():
    """Run consistency checks on a BIDS-style dataset.

    Checks that (1) every sub-* folder is listed in participants.tsv and
    vice versa, (2) every .nii.gz image has a JSON sidecar, and
    (3) participants.tsv columns match the expected formats.
    """
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()
    data_path = args.path_in
    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')

    # Subject folders on disk (sub-*).
    list_subj = [
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
        and name.startswith('sub')
    ]
    df = pd.DataFrame(tsv_file)
    list_tsv_participants = df['participant_id'].tolist()
    # Folders without a TSV row, and TSV rows without a folder.
    missing_subjects_tsv = list(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = list(set(list_tsv_participants) - set(list_subj))

    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        missing_subjects_tsv.sort()
        pprint(missing_subjects_tsv)

    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        missing_subjects_folder.sort()
        pprint(missing_subjects_folder)

    # Every NIfTI image must have a JSON sidecar next to it.
    # NOTE: the sidecar name is derived from everything before the FIRST
    # dot, so filenames containing extra dots would be mis-handled
    # (behavior kept from the original).
    for dir_name, _subdir_list, file_list in os.walk(data_path):
        for file in file_list:
            if file.endswith('.nii.gz'):
                json_sidecar_path = os.path.join(
                    dir_name, file.split(".")[0] + '.json')
                # was `== False` -- use the idiomatic negation
                if not os.path.exists(json_sidecar_path):
                    print("Missing jsonSidecar: " + json_sidecar_path)

    # Checking participants.tsv contents
    schema = Schema([
        Column('participant_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        # '-' is accepted as a placeholder for missing values.
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        Column('institution_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('institution',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturer',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturers_model_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('receive_coil_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('software_versions',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('researcher',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
# `time` was used below without ever being imported, so the script died
# with NameError at start_time; import added.
import time

import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import InRangeValidation, DateFormatValidation, MatchesPatternValidation

# raw data example:
# key ,sensor_id ,location ,lat ,lon ,timestamp ,pressure ,temperature ,humidity
# 1 ,2266 ,1140 ,42.738 ,23.272 ,2017-07-01T00:00:07 ,95270.27 ,23.46 ,62.48

start_time = time.time()

pattern_id = r'^-?\d{1,16}$'        # integer, up to 16 digits
pattern_dec = r'^-?\d*\.\d{1,2}$'   # decimal with 1 or 2 decimal places
pattern_geo = r'^-?\d*\.\d{1,16}$'  # decimal with up to 16 decimal places

schema = Schema([
    Column('key', [MatchesPatternValidation(pattern_id)]),
    Column('sensor_id', [MatchesPatternValidation(pattern_id)]),
    Column('location', [MatchesPatternValidation(pattern_id)]),
    Column('lat', [MatchesPatternValidation(pattern_geo)]),
    Column('lon', [MatchesPatternValidation(pattern_geo)]),
    # Timestamp yyyy-MM-dd'T'HH:mm:ss (Zulu/UTC), e.g. 2017-07-01T00:00:07
    Column('timestamp', [DateFormatValidation('%Y-%m-%dT%H:%M:%S')]),
    Column('pressure', [MatchesPatternValidation(pattern_dec)]),
    # Plausible-range check plus 1-2 decimal places.
    Column('temperature', [InRangeValidation(-146, 60),
                           MatchesPatternValidation(r'^-?\d*\.\d{1,2}$')]),
    Column('humidity', [MatchesPatternValidation(pattern_dec)]),
])

### get data from File
print('load orig dataset from file')
test_data = pd.read_csv("data/testCSV_short.csv")
print('orig dataset')
from pandas_schema import Column, Schema
from pandas_schema.validation import MatchesPatternValidation, CanConvertValidation, CustomSeriesValidation
import pandas as pd

# A cell passes if it is int-convertible, OR if it is longer than one
# character AND contains the letter 'a'.
multi_char_with_a = (
    CustomSeriesValidation(lambda x: x.str.len() > 1,
                           'Doesn\'t have more than 1 character')
    & MatchesPatternValidation('a')
)

schema = Schema([
    Column('col1', [CanConvertValidation(int) | multi_char_with_a])
])

test_data = pd.DataFrame({'col1': ['an', '13', 'a', '8', 'the']})

errors = schema.validate(test_data)

for error in errors:
    print('"{}" failed!'.format(error.value))
from collections import defaultdict
import re
from pandas_schema import Column, Schema
from pandas_schema.validation import (LeadingWhitespaceValidation,
                                      TrailingWhitespaceValidation,
                                      CanConvertValidation,
                                      MatchesPatternValidation,
                                      CustomSeriesValidation,
                                      InRangeValidation,
                                      InListValidation,
                                      DateFormatValidation)


def _required(message):
    # Cell must be non-null and not the empty string.
    return [CustomSeriesValidation(lambda x: ~x.isnull(), message)
            & ~InListValidation([''])]


def _blank():
    # Matches an empty cell (used to make another validation optional).
    return InListValidation([''])


# Validation schema for the study-level metadata sheet.
study_schema = Schema([
    Column('study_id',
           _required('A value is required for the study_id column.')),
    Column('pi_name',
           _required('A value is required for the pi_name column.')),
    Column('sample_type', [InListValidation(['wmgx', 'wmtx', '16S', 'other'])]),
    Column('bioproject_accession',
           [_blank() | MatchesPatternValidation(r'PRJ\w+\d+')]),
    Column('geo_loc_name',
           [_blank() | MatchesPatternValidation(r'\w+:\w+:\w+')]),
    Column('analysis_desc', [_blank() | CanConvertValidation(str)]),
    Column('sequencing_facility', [LeadingWhitespaceValidation()]),
    # ENVO / BTO ontology terms, or blank.
    Column('env_biom', [MatchesPatternValidation(r'ENVO:\d+') | _blank()]),
    Column('env_feature', [MatchesPatternValidation(r'ENVO:\d+') | _blank()]),
    Column('env_material', [MatchesPatternValidation(r'ENVO:\d+') | _blank()]),
    Column('host_tissue_sampled',
           [_blank() | MatchesPatternValidation(r'BTO:\d+')]),
    Column('animal_vendor', [LeadingWhitespaceValidation()]),
    Column('paired', [InListValidation(['true', 'false'])]),
    Column('paired_id',
           [_blank() | MatchesPatternValidation(r'[a-zA-Z0-9_.]+')]),
    Column('pi_email',
           _required('A value is required for the pi_email column.')),
])