예제 #1
0
    def _validate(self, diagnosis_df):
        """Validate the diagnosis dataframe against the expected schema.

        Logs every schema violation via ``self.Logger`` and terminates the
        process with a non-zero status when any violation is found.
        """
        schema = Schema([
            # Visit timestamp, e.g. '2020-01-31 14:30:00' (seconds pinned to 00).
            Column('visit_dt', [
                MatchesPatternValidation(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
            ]),
            # 'M'/'K' sex codes — presumably male/female in the source locale.
            Column('sex', [InListValidation(['M', 'K'])]),
            # ICD-10 code such as 'C34.5'. BUG FIX: the dot is now escaped;
            # the previous unescaped '.' matched any character (including a
            # digit), so strings like 'C3456' were wrongly accepted.
            Column('icd10', [
                MatchesPatternValidation(r'^[CDIJKMNRZ]{1}\d{1,2}\.?\d{0,2}$')
            ])
        ])

        errors = schema.validate(diagnosis_df)

        for error in errors:
            self.Logger.error(error)

        if len(errors) > 0:
            # Signal failure to the caller/CI: bare exit() would have
            # terminated with status 0 despite the validation errors.
            raise SystemExit(1)
예제 #2
0
    def create_schema(self) -> Schema:
        """Create a pandas_schema Schema with all validation rules read in from config.

        Every column gets leading/trailing-whitespace validations. The two
        special columns ('submitting_institution', 'country') are validated
        against the known institution list; all remaining columns get the
        regex / allowed-value / max-length validations configured in the
        spreadsheet definition. Mandatory columns disallow empty cells.
        """
        col_list = []
        for column in self.__spreadsheet_def.keys():
            validators = [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]

            mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)

            # Special cases for checking institutions/countries...
            if column == 'submitting_institution':
                validators.append(
                    InListValidation([i.name for i in self.__institutions]))
            # BUG FIX: this branch must be 'elif'. With the original plain
            # 'if', the 'else' below also ran for 'submitting_institution',
            # wrongly adding the generic regex/allowed-value/length checks.
            elif column == 'country':
                validators.append(
                    InListValidation([i.country for i in self.__institutions]))
            else:
                # Regex validation
                if self.__spreadsheet_def.get_regex(column):
                    validators.append(
                        MatchesPatternValidation(
                            self.__spreadsheet_def.get_regex(column),
                            message=self.__spreadsheet_def.
                            get_regex_validation_message(column)))

                # Validate allowed values (only when no regex is configured)
                elif self.__spreadsheet_def.get_allowed_values(column):
                    validators.append(
                        InListValidation(
                            self.__spreadsheet_def.get_allowed_values(column),
                            case_sensitive=False))

                # Field length validation
                max_len = self.__spreadsheet_def.get_max_length(column)
                if max_len and max_len > 0:
                    validators.append(
                        _StringLengthValidation(
                            'field length is greater than {} characters'.
                            format(str(max_len)), max_len))

            # Mandatory field validation
            col_list.append(
                Column(self.__spreadsheet_def.get_column_name(column),
                       validators,
                       allow_empty=not mandatory_field_flag))

        return Schema(col_list)
예제 #3
0
 def __init__(self):
     """Build the customer-record validation schema used by this instance."""

     def fresh_whitespace_checks():
         # New validator instances per column, matching the original's
         # one-instance-per-column construction.
         return [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]

     columns = [
         Column("Given Name", fresh_whitespace_checks()),
         Column("Family Name", fresh_whitespace_checks()),
         Column("Age", [InRangeValidation(0, 120)]),
         Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
         Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
     ]
     self.schemas = Schema(columns)
예제 #4
0
    def validate_csv(cls, data, registration_type):
        """Validate an uploaded registration CSV against the expected schema.

        Arguments:
            data -- file-like object / path accepted by pandas.read_csv
            registration_type -- 'slr' or 'sl'; selects the column set

        Returns:
            (True, None, df) when the CSV is valid, otherwise
            (False, [errors...], df).

        Raises:
            ValueError -- for an unknown registration_type (previously this
                crashed with NameError on the unbound ``csv_schema``).
        """
        df = pd.read_csv(data)
        logger.info("Printing dataframe before CSV validation...")
        logger.info(df)

        # BUG FIX: dots are now escaped; the original r'^\d{1,3}.\d{1,3}...'
        # let '.' match any character, accepting non-IP strings.
        ip_pattern = r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$'

        def _column(name, ip_format=False, required=True):
            # One place to build the validator list every column shared.
            validators = [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]
            if ip_format:
                validators.append(MatchesPatternValidation(ip_pattern))
            if required:
                validators.append(
                    CustomSeriesValidation(lambda x: x.str.len() > 0,
                                           'Column is empty!'))
            return Column(name, validators)

        # Columns common to both registration types, in schema order.
        common = ['username', 'password', 'sa_name', 'va_name', 'domain']

        if registration_type == 'slr':
            csv_schema = Schema(
                [_column('ipaddr', ip_format=True)] +
                [_column(name) for name in common] +
                [
                    _column('license', required=False),
                    _column('license_count', required=False),
                    _column('tftp_server_ip', ip_format=True),
                    _column('tftp_server_path'),
                ])
        elif registration_type == 'sl':
            csv_schema = Schema(
                [_column('ipaddr', ip_format=True)] +
                [_column(name) for name in common])
        else:
            raise ValueError(
                'Unknown registration_type: {}'.format(registration_type))

        errors = csv_schema.validate(df)

        if errors:
            errors_list = []
            for error in errors:
                print(error)
                errors_list.append(error)
            return False, errors_list, df
        return True, None, df
from pandas_schema.validation import InRangeValidation, DateFormatValidation, MatchesPatternValidation

### raw data example
# key       ,sensor_id ,location_id ,lat     ,lon       ,timestamp           ,pressure ,temperature  ,humidity
# 1         ,2266      ,1140      ,42.738 ,23.272    ,2017-07-01T00:00:07 ,95270.27 ,23.46        ,62.48

start_time = time.time()  # wall-clock start for timing the run; assumes `time` is imported earlier in the file — TODO confirm

pattern_id = r'^-?\d{1,16}$'  # optionally signed integer, up to 16 digits
pattern_dec = r'^-?\d*\.\d{1,2}$'  # decimal number with 1 or 2 decimal places
pattern_geo = r'^-?\d*\.\d{1,16}$'  # geo coordinate: decimal with up to 16 decimal places

schema = Schema([
    Column(
        'key',
        [MatchesPatternValidation(pattern_id)]),  # Number / integer - up to 16
    Column(
        'sensor_id',
        [MatchesPatternValidation(pattern_id)]),  # Number / integer - up to 16
    Column(
        'location',
        [MatchesPatternValidation(pattern_id)]),  # Number / integer - up to 16
    Column('lat', [MatchesPatternValidation(pattern_geo)
                   ]),  # Number / decimal with up to 16 decimal place
    Column('lon', [MatchesPatternValidation(pattern_geo)
                   ]),  # Number / decimal with up to 16 decimal place
    Column('timestamp', [DateFormatValidation('%Y-%m-%dT%H:%M:%S')]),
    # Timestamp yyyy-MM-dd'T'HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01T00:00:07
    Column('pressure', [MatchesPatternValidation(pattern_dec)]),
    # Numbers / / decimal with 1 or 2 decimals (.00)
    Column('temperature', [
예제 #6
0
class CtdParser(BaseParser):
    """
    Implementation of CTD Database Parser.

    Comparative Toxicogenomics Gene-Disease Associations Database Parser.
    http://ctdbase.org/
    """

    default_type: DataType = DataType.CSV_STR
    scraper: gs.CtdScraper = gs.CtdScraper()
    # Expected shape of the parsed dataframe. 'digest' is derived from
    # (geneid, diseaseid) — see hash_record; 'diseaseid' is a MeSH id with
    # the 'MESH:' prefix already stripped.
    schema: Schema = Schema([
        Column("digest"),
        Column("genesymbol"),
        Column("geneid", [IsDtypeValidation(np.int64)]),
        Column("diseasename"),
        Column("diseaseid",
               [MatchesPatternValidation("^D[0-9]+$")]),  # i.e. D000014
        Column("pmids"),
    ])

    @staticmethod
    def hash_record(record: pd.Series) -> str:
        """
        Hash the ctd record to generate digest column.

        Arguments:
            record {pd.Series} -- The ctd record in form of pandas Series

        Returns:
            str -- the hex string of the computed digest
        """
        # The digest uniquely identifies the (gene, disease) association.
        message = str.encode(str(record.geneid) + record.diseaseid)
        hexdigest = hashlib.sha256(message).hexdigest()
        return str(hexdigest)

    @staticmethod
    def parse(data, dtype=DataType.CSV_STR) -> DataFrame:
        """
        Parse data and convert according to parser schema.

        Arguments:
            data {Implementation dependent} -- Data to be parsed

        Keyword Arguments:
            dtype {DataType} -- Type of data to be parsed (default: {DataType.CSV})

        Returns:
            DataFrame -- The parsed dataframe.

        Raises:
            ParserError -- If unable to parse data
        """
        try:
            parsed_df = pd.read_csv(StringIO(data))
            # Remove unused columns
            parsed_df = parsed_df.drop(columns=[
                "DirectEvidence",
                "InferenceChemicalName",
                "InferenceScore",
                "OmimIDs",
            ])
            # Remove prefix 'MESH:' from DiseaseIDs. Vectorized string
            # replace; the previous row-wise apply(axis=1) did the same work
            # one row at a time.
            parsed_df["DiseaseID"] = parsed_df["DiseaseID"].str.replace(
                "MESH:", "", regex=False)
            # Rename columns based on schema
            parsed_df.rename(
                columns={
                    "GeneSymbol": "genesymbol",
                    "GeneID": "geneid",
                    "DiseaseName": "diseasename",
                    "DiseaseID": "diseaseid",
                    "PubMedIDs": "pmids",
                },
                inplace=True,
            )
            # Compute and add the digest
            parsed_df["digest"] = parsed_df.apply(CtdParser.hash_record,
                                                  axis=1)
            errors = CtdParser.validate(parsed_df)
            if errors:
                raise ParserError(errors)
            return parsed_df
        except ParserError:
            # BUG FIX: the validation ParserError raised above was previously
            # caught by the generic handler and wrapped in a second
            # ParserError; re-raise it unchanged instead.
            raise
        except Exception as parse_exp:
            # Chain the original exception so the root cause stays visible.
            raise ParserError(parse_exp) from parse_exp
예제 #7
0
# Genome assembly build numbers mapped to their canonical assembly names.
BUILD_MAP = {'28': 'NCBI28',
             '29': 'NCBI29',
             '30': 'NCBI30',
             '31': 'NCBI31',
             '33': 'NCBI33',
             '34': 'NCBI34',
             '35': 'NCBI35',
             '36': 'NCBI36',
             '37': 'GRCh37',
             '38': 'GRCh38'}

# File extensions accepted for input files (plain and gzip-compressed).
VALID_FILE_EXTENSIONS = [".txt", ".tsv", ".csv", ".tsv.gz", ".csv.gz", "gz", "gzip", ".tsv.gzip", ".csv.gzip"]

# Default per-column validators keyed by dataset constant. Most columns may
# be empty; the effect allele (EFFECT_DSET) is the only mandatory one here.
GENERIC_VALIDATORS = {
    SNP_DSET: Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=True),  # rsID, e.g. rs12345
    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True),  # base-pair position
    EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])], allow_empty=True),
    OR_DSET: Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=True),
    HR_DSET: Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=True),
    BETA_DSET: Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET])], allow_empty=True),
    EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=False),  # effect allele is required
    REF_DSET: Column(REF_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=True),
    FREQ_DSET: Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET])], allow_empty=True),
    LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=True)
}

# SNP-keyed files reuse the generic validators but require a strictly
# formatted, non-empty rsID column. A plain dict() copy replaces the
# redundant identity dict-comprehension.
SNP_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=False)
예제 #8
0
def validate(df):
    """Validate the uploaded hospital/customer dataframe.

    Runs the pandas_schema checks on the columns listed in COL, translates
    each error message via D_TRANSLATE, rewrites the row number with
    row_refined, then appends cross-column consistency checks.

    Returns:
        dict -- maps each translated error message to the '<br>' separator
            used when rendering the messages.
    """
    d_error = {}
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]

    # Rejects NaN cells; the message text means "this field must not be empty".
    NullValidation = CustomElementValidation(lambda d: d is not np.nan,
                                             "该字段不能为空")
    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("地区经理", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("负责代表", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column(
            "医院编码",
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                NullValidation,
                # Hospital code: letter 'H' followed by exactly 9 digits.
                MatchesPatternValidation(r"^[H]{1}(\d){9}$"),
            ],
        ),
        Column("医院全称", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation()
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int),
                InRangeValidation(0, 63)]),
        Column("每半天\n门诊量", [CanConvertValidation(int),
                            InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int),
                InRangeValidation(0, 101)]),
        Column("备注"),
    ])
    errors = schema.validate(df.loc[:, COL])
    # Pattern matching the ': <row-number>' suffix in pandas_schema messages;
    # hoisted here because it is invariant across both loops below.
    findword = r": [0-9]\d*"
    for error in errors:
        str_warning = str(error)
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
            str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # Cross-field consistency checks between identifying columns.
    d_error = {**d_error, **check_inconsist(df, "医院编码", "医院全称", "both")}
    d_error = {**d_error, **check_inconsist(df, "区域", "大区", "right")}
    d_error = {**d_error, **check_inconsist(df, "大区", "地区经理", "right")}
    d_error = {**d_error, **check_inconsist(df, "地区经理", "负责代表", "right")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "开户进展", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "开户进展", "left")}

    # Check hospital level against department for contradictions.
    d_error = {**d_error, **check_hplevel_with_dept(df)}
    return d_error
    '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', 'X', 'Y'
]

VALIDATORS = {
    PVAL_DSET:
    Column(PVAL_DSET, [
        CanConvertValidation(DSET_TYPES[PVAL_DSET]),
        InInclusiveRangeValidation(0, 1)
    ],
           allow_empty=False),
    BETA_DSET:
    Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True),
    SNP_DSET:
    Column(SNP_DSET, [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(
            r'^chr[0-9XY]+_[0-9]+_[ACTGNactgn]+_[ACTGNactgn]+|LONG_STRING$')
    ],
           allow_empty=True),
    CHR_DSET:
    Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False),
    BP_DSET:
    Column(BP_DSET, [
        CanConvertValidation(DSET_TYPES[BP_DSET]),
        InInclusiveRangeValidation(1, 999999999)
    ],
           allow_empty=False),
    EFFECT_DSET:
    Column(EFFECT_DSET,
           [MatchesPatternValidation(r'^[ACTGNactgn]+|LONG_STRING$')],
           allow_empty=True),
    OTHER_DSET:
예제 #10
0
    '28': 'NCBI28',
    '29': 'NCBI29',
    '30': 'NCBI30',
    '31': 'NCBI31',
    '33': 'NCBI33',
    '34': 'NCBI34',
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38'
}

VALIDATORS = {
    SNP_DSET:
    Column(SNP_DSET,
           [MatchesPatternValidation(r'rs[0-9]+')
            ]),  # how do we handle the values that are like chr:bp:allele:snp?
    PVAL_DSET:
    Column(
        PVAL_DSET,
        [CanConvertValidation(float),
         InInclusiveRangeValidation(0, 1)]
        #CustomElementValidation(lambda s: float(s) >= 0 and float(s) <= 1, 'outside the range of 0 to 1')]
    ),
    CHR_DSET:
    Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET:
    Column(
        BP_DSET,
        [CanConvertValidation(int) & InInclusiveRangeValidation(1, 999999999)],
        allow_empty=True),
예제 #11
0
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38'
}

# File extensions accepted for input files (plain and gzip-compressed).
VALID_FILE_EXTENSIONS = [
    ".txt", ".tsv", ".csv", ".tsv.gz", ".csv.gz", "gz", "gzip", ".tsv.gzip",
    ".csv.gzip"
]

GENERIC_VALIDATORS = {
    SNP_DSET:
    Column(SNP_DSET, [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(r'^rs[0-9]+$')
    ],
           allow_empty=True),
    CHR_DSET:
    Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET:
    Column(BP_DSET, [
        CanConvertValidation(DSET_TYPES[BP_DSET]),
        InInclusiveRangeValidation(1, 999999999)
    ],
           allow_empty=True),
    EFFECT_WEIGHT_DSET:
    Column(EFFECT_WEIGHT_DSET,
           [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])],
           allow_empty=True),
    OR_DSET:
예제 #12
0
import pandas as pd
from io import StringIO
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesPatternValidation, InRangeValidation, InListValidation

# Demo: validate a small customer table. The sample rows deliberately
# contain violations (trailing whitespace, out-of-range age, lowercase sex,
# short customer id) so that errors are printed below.
def _no_surrounding_whitespace(name):
    """Column whose values must have no leading/trailing whitespace."""
    return Column(name, [LeadingWhitespaceValidation(),
                         TrailingWhitespaceValidation()])


schema = Schema([
    _no_surrounding_whitespace('Given Name'),
    _no_surrounding_whitespace('Family Name'),
    Column('Age', [InRangeValidation(0, 120)]),
    Column('Sex', [InListValidation(['Male', 'Female', 'Other'])]),
    Column('Customer ID', [MatchesPatternValidation(r'\d{4}[A-Z]{4}')]),
])

test_data = pd.read_csv(
    StringIO('''Given Name,Family Name,Age,Sex,Customer ID
Gerald ,Hampton,82,Male,2582GABK
Yuuwa,Miyake,270,male,7951WVLW
Edyta,Majewska ,50,Female,775ANSID
'''))

errors = schema.validate(test_data)

for error in errors:
    print(error)
예제 #13
0
import time

import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import DateFormatValidation, MatchesPatternValidation, InListValidation

pattern_id = r'^-?\d{1,16}$'  # Number / integer - up to 16
pattern_dec = r'^-?\d*\.\d{1,2}$'  # decimal with 1 or 2 decimal places
pattern_geo = r'^-?\d*\.\d{1,20}$'  # geo location / decimal with up to 20 decimal places (regex allows 20)
pattern_date = r'%Y-%m-%d %H:%M:%S'  # Timestamp yyyy-MM-dd HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01 00:00:07

# Schema for taxi ride events; ordered=True additionally enforces that the
# columns appear in exactly this order.
taxiRide_schema = Schema([
    Column('rideId', [MatchesPatternValidation(pattern_id)]),
    Column('isStart', [InListValidation(['START', 'END'])]),  # event boundary marker
    Column('endTime', [DateFormatValidation(pattern_date)]),
    Column('startTime', [DateFormatValidation(pattern_date)]),
    Column('startLon', [MatchesPatternValidation(pattern_geo)]),
    Column('startLat', [MatchesPatternValidation(pattern_geo)]),
    Column('endLon', [MatchesPatternValidation(pattern_geo)]),
    Column('endLat', [MatchesPatternValidation(pattern_geo)]),
    Column('passengerCnt', [MatchesPatternValidation(pattern_id)])
], ordered=True)


taxiFare_schema = Schema([
    Column('rideId', [MatchesPatternValidation(pattern_id)]),
    Column('taxiId', [MatchesPatternValidation(pattern_id)]),
    Column('driverId', [MatchesPatternValidation(pattern_id)]),
    Column('startTime', [DateFormatValidation(pattern_date)]),
    Column('paymentType', [InListValidation(['CSH', 'CRD', 'NOC', 'DIS', 'UNK'])]),
    Column('tip', [MatchesPatternValidation(pattern_dec)]),
예제 #14
0
def main():
    """Check a BIDS-like dataset for consistency.

    Compares participants.tsv against the sub-* folders on disk, reports
    NIfTI images that lack a JSON sidecar, and validates the contents of
    participants.tsv against the expected schema.
    """
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()

    data_path = args.path_in

    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')
    list_subj = [
        name for name in os.listdir(data_path) if
        os.path.isdir(os.path.join(data_path, name)) and name.startswith('sub')
    ]
    # read_csv already returns a DataFrame; the former pd.DataFrame(tsv_file)
    # re-wrap was a redundant copy.
    list_tsv_participants = tsv_file['participant_id'].tolist()
    missing_subjects_tsv = list(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = list(set(list_tsv_participants) - set(list_subj))

    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        missing_subjects_tsv.sort()
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        missing_subjects_folder.sort()
        pprint(missing_subjects_folder)

    # Every NIfTI image must have a JSON sidecar next to it.
    for dir_name, _subdirs, file_list in os.walk(data_path):
        for file in file_list:
            if file.endswith('.nii.gz'):
                # 'name.nii.gz'.split('.')[0] -> 'name', hence 'name.json'.
                json_sidecar_path = os.path.join(dir_name,
                                                 file.split(".")[0] + '.json')
                if not os.path.exists(json_sidecar_path):
                    print("Missing jsonSidecar: " + json_sidecar_path)

    # Checking participants.tsv contents
    schema = Schema([
        Column('participant_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        # '-' is the placeholder for missing height/weight values.
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        Column('institution_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('institution',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturer',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturers_model_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('receive_coil_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('software_versions',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('researcher',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
예제 #15
0
# BUG FIX: `time` is used below but was never imported in this script.
import time

import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import InRangeValidation, DateFormatValidation, MatchesPatternValidation

# key       ,sensor_id ,location ,lat     ,lon       ,timestamp           ,pressure ,temperature  ,humidity
# 1         ,2266      ,1140      ,42.738 ,23.272    ,2017-07-01T00:00:07 ,95270.27 ,23.46        ,62.48

start_time = time.time()  # wall-clock start for timing the validation run

pattern_id = r'^-?\d{1,16}$'  # optionally signed integer, up to 16 digits
pattern_dec = r'^-?\d*\.\d{1,2}$'  # decimal with 1 or 2 decimal places
pattern_geo = r'^-?\d*\.\d{1,16}$'  # geo coordinate, up to 16 decimal places


schema = Schema([
    Column('key', [MatchesPatternValidation(pattern_id)]),            # Number / integer - up to 16
    Column('sensor_id', [MatchesPatternValidation(pattern_id)]),      # Number / integer - up to 16
    Column('location', [MatchesPatternValidation(pattern_id)]),       # Number / integer - up to 16
    Column('lat', [MatchesPatternValidation(pattern_geo)]),       # Number / decimal with up to 16 decimal place
    Column('lon', [MatchesPatternValidation(pattern_geo)]),       # Number / decimal with up to 16 decimal place
    Column('timestamp', [DateFormatValidation('%Y-%m-%dT%H:%M:%S')]),      # Timestamp yyyy-MM-dd'T'HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01T00:00:07
    Column('pressure', [MatchesPatternValidation(pattern_dec)]),   # Numbers / / decimal with 1 or 2 decimals (.00)
    Column('temperature', [InRangeValidation(-146, 60), MatchesPatternValidation(r'^-?\d*\.\d{1,2}$')]),  # Number / decimal with upto 2 decimal place
    Column('humidity', [MatchesPatternValidation(pattern_dec)])    # Numbers with 1 or 2 decimals (.00)
])

### get data from File
print('load orig dataset from file')

test_data = pd.read_csv("data/testCSV_short.csv")
print('orig dataset')
예제 #16
0
from pandas_schema import Column, Schema
from pandas_schema.validation import MatchesPatternValidation, CanConvertValidation, CustomSeriesValidation
import pandas as pd

# A cell passes when it converts to int, OR when it is longer than one
# character AND contains the letter 'a'. In the sample below, '8' and 'the'
# are reported: '8' converts to int (passes) — wait, it does pass; the
# failing values are those matching neither side of the '|'.
is_int = CanConvertValidation(int)
long_enough = CustomSeriesValidation(
    lambda x: x.str.len() > 1, 'Doesn\'t have more than 1 character')
contains_a = MatchesPatternValidation('a')

schema = Schema([
    Column('col1', [is_int | (long_enough & contains_a)])
])

test_data = pd.DataFrame({'col1': ['an', '13', 'a', '8', 'the']})

errors = schema.validate(test_data)

for error in errors:
    print('"{}" failed!'.format(error.value))
예제 #17
0
from collections import defaultdict
import re

from pandas_schema import Column, Schema
from pandas_schema.validation import (LeadingWhitespaceValidation, TrailingWhitespaceValidation, 
                                      CanConvertValidation, MatchesPatternValidation, CustomSeriesValidation,
                                      InRangeValidation, InListValidation, DateFormatValidation)


def _required_value(column_name):
    """Composite validator: the column must be non-null and non-empty."""
    return CustomSeriesValidation(
        lambda x: ~x.isnull(),
        'A value is required for the {} column.'.format(column_name)
    ) & ~InListValidation([''])


# Schema for the study metadata sheet. Optional columns accept the empty
# string as an alternative to their pattern.
study_schema = Schema([
    Column('study_id', [_required_value('study_id')]),
    Column('pi_name', [_required_value('pi_name')]),
    Column('sample_type', [InListValidation(['wmgx', 'wmtx', '16S', 'other'])]),
    Column('bioproject_accession', [InListValidation(['']) | MatchesPatternValidation(r'PRJ\w+\d+')]),
    Column('geo_loc_name', [InListValidation(['']) | MatchesPatternValidation(r'\w+:\w+:\w+')]),
    Column('analysis_desc', [InListValidation(['']) | CanConvertValidation(str)]),
    Column('sequencing_facility', [LeadingWhitespaceValidation()]),
    Column('env_biom', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('env_feature', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('env_material', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('host_tissue_sampled', [InListValidation(['']) | MatchesPatternValidation(r'BTO:\d+')]),
    Column('animal_vendor', [LeadingWhitespaceValidation()]),
    Column('paired', [InListValidation(['true', 'false'])]),
    Column('paired_id', [InListValidation(['']) | MatchesPatternValidation(r'[a-zA-Z0-9_.]+')]),
    Column('pi_email', [_required_value('pi_email')]),
])