예제 #1
0
 def __init__(self):
     """Build the column schema and store it on ``self.schemas``.

     "Given Name" and "Family Name" receive leading/trailing whitespace
     checks, "Age" an ``InRangeValidation(0, 120)``, "Sex" an
     ``InListValidation`` over three values and "Customer ID" a pattern
     check.
     """

     def whitespace_checked(title):
         # Every column gets its own freshly created pair of validators.
         return Column(
             title,
             [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
         )

     columns = [
         whitespace_checked("Given Name"),
         whitespace_checked("Family Name"),
         Column("Age", [InRangeValidation(0, 120)]),
         Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
         Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
     ]
     self.schemas = Schema(columns)
예제 #2
0
 def __init__(self):
     """Build the column schema and store it on ``self.schemas``.

     Free-text columns are checked for leading/trailing whitespace,
     "due_date" must satisfy ``DateFormatValidation("%m/%d/%y")`` and
     "status" must be one of the four listed states. All remaining columns
     are declared without validators.
     """

     def trimmed(name):
         # Every column gets its own freshly created pair of validators.
         return Column(
             name,
             [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
         )

     allowed_statuses = ["pending", "paid", "due", "error"]

     # The order below is the column order of the schema.
     columns = [
         Column("id"),
         trimmed("payer_name"),
         Column("document_amount"),
         Column("payed_amount"),
         Column("payer_id_number"),
         trimmed("payer_address"),
         Column("barcode"),
         Column("typable_line"),
         Column("number"),
         trimmed("document_number"),
         Column("due_date", [DateFormatValidation("%m/%d/%y")]),
         trimmed("city"),
         trimmed("state"),
         Column("zip_code"),
         Column("bank_answer_date"),
         Column("pdf_upload_date"),
         Column("status", [InListValidation(allowed_statuses)]),
         Column("callback"),
         Column("object_id"),
         Column("extra"),
     ]
     self.schemas = Schema(columns)
예제 #3
0
    def create_schema(self) -> Schema:
        """Create the pandas_schema ``Schema`` from the spreadsheet definition.

        Every column is checked for leading and trailing whitespace. The
        ``submitting_institution`` and ``country`` columns are restricted to
        the names / countries of the configured institutions. Every other
        column gets, when the definition provides them, a regex check or
        (if there is no regex) an allowed-values check, plus a maximum
        length check.

        Returns:
            Schema: one ``Column`` per key of the spreadsheet definition;
            ``allow_empty`` is set for columns that are not mandatory.
        """
        definition = self.__spreadsheet_def
        col_list = []
        for column in definition.keys():
            validators = [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]

            mandatory_field_flag = definition.is_mandatory(column)

            # Special cases for checking institutions/countries. These form
            # one if/elif/else chain so that the institution column does not
            # also fall through to the generic rules below.
            if column == 'submitting_institution':
                validators.append(
                    InListValidation([i.name for i in self.__institutions]))
            elif column == 'country':
                validators.append(
                    InListValidation([i.country for i in self.__institutions]))
            else:
                # Regex validation takes precedence over allowed values
                regex = definition.get_regex(column)
                if regex:
                    validators.append(
                        MatchesPatternValidation(
                            regex,
                            message=definition.get_regex_validation_message(
                                column)))
                else:
                    # Validate allowed values
                    allowed_values = definition.get_allowed_values(column)
                    if allowed_values:
                        validators.append(
                            InListValidation(allowed_values,
                                             case_sensitive=False))

                # Field length validation
                max_len = definition.get_max_length(column)
                if max_len and max_len > 0:
                    validators.append(
                        _StringLengthValidation(
                            'field length is greater than {} characters'.
                            format(str(max_len)), max_len))

            # Mandatory field validation: only non-mandatory columns may be empty
            col_list.append(
                Column(definition.get_column_name(column),
                       validators,
                       allow_empty=not mandatory_field_flag))

        return Schema(col_list)
예제 #4
0
    def check_join_cols(df1, df2, on):
        """Validate the join columns of two frames and print any problems.

        Each column named in ``on`` is validated in ``df1`` and in ``df2``
        with a leading-whitespace, a trailing-whitespace and a distinctness
        check. Nothing is printed when both frames pass.

        Args:
            df1: first frame; indexed with ``df1[on]``.
            df2: second frame; indexed with ``df2[on]``.
            on: iterable of column names to join on.
        """
        schema = Schema([
            Column(
                col,
                [
                    LeadingWhitespaceValidation(),
                    TrailingWhitespaceValidation(),
                    IsDistinctValidation()
                ],
            ) for col in on
        ])

        # Flatten the per-frame results first: the list of per-frame results
        # always has two entries, so its length says nothing about whether
        # any validation actually failed.
        errors = [
            error
            for frame in (df1[on], df2[on])
            for error in schema.validate(frame)
        ]

        if errors:
            print("The following issues exist in the index:")
            for error in errors:
                print(error)
예제 #5
0
class DoubleValidationColumn(unittest.TestCase):
    """
    Test a column with two different validations
    """
    NAME = 'col1'

    col = Column(
        NAME, [TrailingWhitespaceValidation(),
               LeadingWhitespaceValidation()],
        allow_empty=False)
    # Every element has both leading and trailing whitespace, so each row is
    # expected to fail both validations.
    ser = pd.Series([' a ', ' b ', ' c '])

    def test_outputs(self):
        """Both validations must report an error for every row of the series."""
        results = self.col.validate(self.ser)

        # There should be 6 errors, 2 for each row
        self.assertEqual(len(results), 2 * len(self.ser),
                         'A Column produces the wrong number of errors')
        # Walk the whole index so the last row is checked as well.
        for i in self.ser.index:
            in_row = [r for r in results if r.row == i]
            self.assertEqual(
                len(in_row), 2,
                'A Column does not report both errors for every row')
예제 #6
0
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, InRangeValidation, \
    DateFormatValidation, InListValidation

# Validation schema for the fixed-width file: the two text fields are only
# checked for surrounding whitespace, the rest have value checks.
schema = Schema([
    *(
        Column(
            text_field,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        )
        for text_field in ('name', 'title')
    ),
    Column('salary', [InRangeValidation(0, 33000)]),
    Column('sex', [InListValidation(['F', 'M'])]),
    Column('date', [DateFormatValidation('%Y-%m-%d')]),
])

# Field widths passed to read_fwf, in order: name, title, salary, sex, date
widths = [9, 19, 6, 4, 11]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
for shown in ('orig dataset', test_data):
    print(shown)

# data verification
예제 #7
0
def main():
    """Check a dataset folder for consistency and print the findings.

    Three checks are run against the folder given by the ``path_in``
    argument:

    1. Sub-directories whose name starts with ``sub`` are compared with the
       ``participant_id`` column of ``participants.tsv``; entries missing on
       either side are printed.
    2. Every ``.nii.gz`` file found while walking the folder must have a
       ``.json`` file with the same base name beside it.
    3. The contents of ``participants.tsv`` are validated against a
       pandas_schema ``Schema`` and every validation error is printed.
    """
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()

    data_path = args.path_in

    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')
    list_subj = [
        name for name in os.listdir(data_path) if
        os.path.isdir(os.path.join(data_path, name)) and name.startswith('sub')
    ]
    list_tsv_participants = tsv_file['participant_id'].tolist()
    missing_subjects_tsv = sorted(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = sorted(
        set(list_tsv_participants) - set(list_subj))

    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        pprint(missing_subjects_folder)

    nifti_suffix = '.nii.gz'
    for dir_name, _, file_list in os.walk(data_path):
        for file_name in file_list:
            if not file_name.endswith(nifti_suffix):
                continue
            # Strip exactly the image suffix; splitting on the first dot
            # would truncate names that contain additional dots.
            base_name = file_name[:-len(nifti_suffix)]
            json_sidecar_path = os.path.join(dir_name, base_name + '.json')
            if not os.path.exists(json_sidecar_path):
                print("Missing jsonSidecar: " + json_sidecar_path)

    def whitespace_checked(name):
        # Column that only has to be free of leading/trailing whitespace.
        return Column(
            name,
            [LeadingWhitespaceValidation(),
             TrailingWhitespaceValidation()])

    # Checking participants.tsv contents
    schema = Schema([
        whitespace_checked('participant_id'),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        # Either a real date or the "-" placeholder is accepted.
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        whitespace_checked('institution_id'),
        whitespace_checked('institution'),
        whitespace_checked('manufacturer'),
        whitespace_checked('manufacturers_model_name'),
        whitespace_checked('receive_coil_name'),
        whitespace_checked('software_versions'),
        whitespace_checked('researcher'),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
예제 #8
0
from pandas_schema import Column, Schema
from pandas_schema.validation import (
    LeadingWhitespaceValidation,
    TrailingWhitespaceValidation,
    CanConvertValidation,
    InListValidation,
    CustomElementValidation,
)

# Shared element-level validation: an element passes unless it compares equal
# to the empty string.
EmptyStringValidation = CustomElementValidation(
    lambda value: value != "",
    "This field cannot be empty",
)

nipt_results_schema = Schema([
    Column("SampleID",
           [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("SampleType", []),
    Column("Description", []),
    Column("SampleProject",
           [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("Index1", []),
    Column("Index2", []),
    Column("Library_nM", []),
    Column("QCFlag", []),
    Column("Zscore_13", [CanConvertValidation(float)]),
    Column("Zscore_18", [CanConvertValidation(float)]),
    Column("Zscore_21", [CanConvertValidation(float)]),
    Column("Zscore_X", [CanConvertValidation(float)]),
    Column("Ratio_13", [CanConvertValidation(float)]),
    Column("Ratio_18", [CanConvertValidation(float)]),
    Column("Ratio_21", [CanConvertValidation(float)]),
    Column("Ratio_X", [CanConvertValidation(float)]),
예제 #9
0
        # Iterate over each pair of schema columns and data frame series and run validations
        column_pairs, errors = self._get_column_pairs(panda_sdrf)
        for series, column in column_pairs:
            errors += column.validate(series)
        return sorted(errors, key=lambda e: e.row)

    def check_recommendations(self, panda_sdrf):
        """Collect the optional-validation warnings for ``panda_sdrf``.

        The (series, column) pairs come from ``self._get_column_pairs``; the
        errors that call returns alongside them are not used here. Each
        column's ``validate_optional`` is run on its series.

        Returns:
            list: all warnings, ordered by their ``row`` attribute.
        """
        column_pairs, _pairing_errors = self._get_column_pairs(panda_sdrf)
        warnings = [
            warning
            for series, column in column_pairs
            for warning in column.validate_optional(series)
        ]
        warnings.sort(key=lambda warning: warning.row)
        return warnings


default_schema = SDRFSchema([
    SDRFColumn('source name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=True,
               optional_type=False),
    SDRFColumn('characteristics[organism part]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=True,
               optional_type=False),
    SDRFColumn('characteristics[disease]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=True,
               optional_type=False),
    SDRFColumn('characteristics[organism]',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation(),
                OntologyTerm("ncbitaxon", not_applicable=True)],
               allow_empty=False,
               optional_type=False),
    SDRFColumn('characteristics[cell type]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=False,
예제 #10
0
    def validate_csv(cls, data, registration_type):
        """Read a CSV and validate it against the schema for its registration type.

        Both registration types share the columns ``ipaddr``, ``username``,
        ``password``, ``sa_name``, ``va_name`` and ``domain``. Type ``'slr'``
        additionally expects ``license``, ``license_count``,
        ``tftp_server_ip`` and ``tftp_server_path``.

        Args:
            data: anything accepted by ``pandas.read_csv``.
            registration_type: ``'slr'`` or ``'sl'``.

        Returns:
            tuple: ``(True, None, df)`` when validation finds no errors,
            otherwise ``(False, errors_list, df)``; ``df`` is the parsed
            frame in both cases.

        Raises:
            ValueError: if ``registration_type`` is neither ``'slr'`` nor
                ``'sl'`` (no schema exists for it).
        """
        df = pd.read_csv(data)
        logger.info("Printing dataframe before CSV validation...")
        logger.info(df)

        if registration_type not in ('slr', 'sl'):
            raise ValueError(
                "Unsupported registration_type: {!r}".format(registration_type))

        # The dots are escaped so that only a literal '.' separates the
        # four numeric groups.
        ip_pattern = r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$'

        def whitespace_checks():
            # Fresh validator instances for every column.
            return [LeadingWhitespaceValidation(),
                    TrailingWhitespaceValidation()]

        def not_empty():
            return CustomSeriesValidation(lambda x: x.str.len() > 0,
                                          'Column is empty!')

        def trimmed_column(name):
            # Whitespace checks only.
            return Column(name, whitespace_checks())

        def non_empty_column(name):
            # Whitespace checks plus the non-empty check.
            return Column(name, whitespace_checks() + [not_empty()])

        def ip_column(name):
            # Whitespace checks, IP pattern, then the non-empty check.
            return Column(
                name,
                whitespace_checks()
                + [MatchesPatternValidation(ip_pattern), not_empty()])

        # Columns common to both registration types.
        columns = [ip_column('ipaddr')]
        columns += [
            non_empty_column(name)
            for name in ('username', 'password', 'sa_name', 'va_name',
                         'domain')
        ]
        if registration_type == 'slr':
            columns += [
                trimmed_column('license'),
                trimmed_column('license_count'),
                ip_column('tftp_server_ip'),
                non_empty_column('tftp_server_path'),
            ]
        csv_schema = Schema(columns)

        errors = csv_schema.validate(df)

        if errors:
            for error in errors:
                print(error)
            return False, list(errors), df
        return True, None, df
예제 #11
0
             '38': 'GRCh38'}

# File-name endings listed as valid.
# NOTE(review): "gz" and "gzip" have no leading dot, unlike the other
# entries; confirm that is intended by whatever consumes this list.
VALID_FILE_EXTENSIONS = [
    ".txt",
    ".tsv",
    ".csv",
    ".tsv.gz",
    ".csv.gz",
    "gz",
    "gzip",
    ".tsv.gzip",
    ".csv.gzip",
]

# Baseline column validators, keyed by dataset column name. Every column may
# be empty here except the effect column.
GENERIC_VALIDATORS = {
    SNP_DSET: Column(
        SNP_DSET,
        [
            CanConvertValidation(DSET_TYPES[SNP_DSET]),
            MatchesPatternValidation(r'^rs[0-9]+$'),
        ],
        allow_empty=True,
    ),
    CHR_DSET: Column(
        CHR_DSET,
        [InListValidation(VALID_CHROMOSOMES)],
        allow_empty=True,
    ),
    BP_DSET: Column(
        BP_DSET,
        [
            CanConvertValidation(DSET_TYPES[BP_DSET]),
            InInclusiveRangeValidation(1, 999999999),
        ],
        allow_empty=True,
    ),
    EFFECT_WEIGHT_DSET: Column(
        EFFECT_WEIGHT_DSET,
        [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])],
        allow_empty=True,
    ),
    OR_DSET: Column(
        OR_DSET,
        [CanConvertValidation(DSET_TYPES[OR_DSET])],
        allow_empty=True,
    ),
    HR_DSET: Column(
        HR_DSET,
        [CanConvertValidation(DSET_TYPES[HR_DSET])],
        allow_empty=True,
    ),
    BETA_DSET: Column(
        BETA_DSET,
        [CanConvertValidation(DSET_TYPES[BETA_DSET])],
        allow_empty=True,
    ),
    EFFECT_DSET: Column(
        EFFECT_DSET,
        [MatchesPatternValidation(r'^[ACTGN]+$')],
        allow_empty=False,
    ),
    REF_DSET: Column(
        REF_DSET,
        [MatchesPatternValidation(r'^[ACTGN]+$')],
        allow_empty=True,
    ),
    FREQ_DSET: Column(
        FREQ_DSET,
        [CanConvertValidation(DSET_TYPES[FREQ_DSET])],
        allow_empty=True,
    ),
    LOCUS_DSET: Column(
        LOCUS_DSET,
        [
            CanConvertValidation(DSET_TYPES[LOCUS_DSET]),
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
        ],
        allow_empty=True,
    ),
}

# Each variant below starts as a shallow copy of GENERIC_VALIDATORS and then
# replaces selected columns with stricter (allow_empty=False) versions.

# Variant with a mandatory rsID column.
SNP_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_VALIDATORS.update({
    SNP_DSET: Column(
        SNP_DSET,
        [
            CanConvertValidation(DSET_TYPES[SNP_DSET]),
            MatchesPatternValidation(r'^rs[0-9]+$'),
        ],
        allow_empty=False,
    ),
})

# Variant whose rsID pattern also accepts the literal "nan", with mandatory
# chromosome and base-pair columns.
SNP_EMPTY_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_EMPTY_VALIDATORS.update({
    SNP_DSET: Column(
        SNP_DSET,
        [
            CanConvertValidation(DSET_TYPES[SNP_DSET]),
            MatchesPatternValidation(r'^(rs[0-9]+|nan)$'),
        ],
        allow_empty=False,
    ),
})
SNP_EMPTY_VALIDATORS.update({
    CHR_DSET: Column(
        CHR_DSET,
        [InListValidation(VALID_CHROMOSOMES)],
        allow_empty=False,
    ),
})
SNP_EMPTY_VALIDATORS.update({
    BP_DSET: Column(
        BP_DSET,
        [
            CanConvertValidation(DSET_TYPES[BP_DSET]),
            InInclusiveRangeValidation(1, 999999999),
        ],
        allow_empty=False,
    ),
})

# Variant with mandatory chromosome and base-pair columns.
POS_VALIDATORS = dict(GENERIC_VALIDATORS)
POS_VALIDATORS.update({
    CHR_DSET: Column(
        CHR_DSET,
        [InListValidation(VALID_CHROMOSOMES)],
        allow_empty=False,
    ),
})
POS_VALIDATORS.update({
    BP_DSET: Column(
        BP_DSET,
        [
            CanConvertValidation(DSET_TYPES[BP_DSET]),
            InInclusiveRangeValidation(1, 999999999),
        ],
        allow_empty=False,
    ),
})

EFFECT_WEIGHT_VALIDATOR = dict(GENERIC_VALIDATORS)
예제 #12
0
def validate(df):
    """Validate ``df`` and return the collected problems as a dict.

    The columns selected by ``COL`` are validated against a pandas_schema
    ``Schema``. Each schema error is converted to text, rewritten using
    ``D_TRANSLATE`` and ``row_refined``, and stored as a key whose value is
    ``"<br>"``. The dicts returned by the ``check_inconsist`` calls and by
    ``check_hplevel_with_dept`` are then merged in, in call order.

    Args:
        df: the frame to validate.

    Returns:
        dict: problem descriptions merged from all checks.
    """

    def first_items(choices):
        # Keep only the first element of every choice entry.
        return [choice[0] for choice in choices]

    list_bu = first_items(BU_CHOICES)
    list_rd = first_items(RD_CHOICES)
    list_dept = first_items(DEPT_CHOICES)
    list_hplevel = first_items(HPLEVEL_CHOICES)
    list_province = first_items(PROVINCE_CHOICES)
    list_title = first_items(TITLE_CHOICES)

    NullValidation = CustomElementValidation(lambda d: d is not np.nan,
                                             "该字段不能为空")

    def non_null_text(name, *extra_validators):
        # Whitespace checks, the shared null check, then any extra validators.
        return Column(name, [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            NullValidation,
            *extra_validators,
        ])

    def one_of(name, options):
        return Column(name, [InListValidation(options)])

    def integer_in_range(name, *bounds):
        return Column(
            name, [CanConvertValidation(int),
                   InRangeValidation(*bounds)])

    schema = Schema([
        one_of("南北中国", list_bu),
        one_of("区域", list_rd),
        non_null_text("大区"),
        non_null_text("地区经理"),
        non_null_text("负责代表"),
        non_null_text("医院编码", MatchesPatternValidation(r"^[H]{1}(\d){9}$")),
        non_null_text("医院全称"),
        one_of("省/自治区/直辖市", list_province),
        one_of("是否双call", ["是", "否"]),
        one_of("医院级别", list_hplevel),
        one_of("开户进展", ["已开户", "未开户"]),
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation(),
        ]),
        one_of("所在科室", list_dept),
        one_of("职称", list_title),
        integer_in_range("月出诊次数(半天计)", 0, 63),
        integer_in_range("每半天\n门诊量", 0),
        integer_in_range("相关病人\n比例(%)\n建议比例:40%-80%", 0, 101),
        Column("备注"),
    ])

    d_error = {}
    findword = r": [0-9]\d*"
    for error in schema.validate(df.loc[:, COL]):
        str_warning = str(error)
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
            # NOTE(review): this substitution runs once per translation term,
            # not once per message; confirm that repeating it is intended.
            str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # Argument triples for check_inconsist, applied in this order.
    consistency_checks = (
        ("医院编码", "医院全称", "both"),
        ("区域", "大区", "right"),
        ("大区", "地区经理", "right"),
        ("地区经理", "负责代表", "right"),
        ("医院编码", "省/自治区/直辖市", "left"),
        ("医院编码", "是否双call", "left"),
        ("医院编码", "医院级别", "left"),
        ("医院编码", "开户进展", "left"),
        ("医院全称", "省/自治区/直辖市", "left"),
        ("医院全称", "是否双call", "left"),
        ("医院全称", "医院级别", "left"),
        ("医院全称", "开户进展", "left"),
    )
    for check_args in consistency_checks:
        d_error.update(check_inconsist(df, *check_args))

    # Check whether hospital level and department contradict each other
    d_error.update(check_hplevel_with_dept(df))
    return d_error