Пример #1
0
def validate_variant(conn, args, filepath):
  """Validates input file for variant data

  Reads a headerless, tab-separated file and checks that it has exactly two
  columns (chromosome number and SNP position), that both columns can be
  converted to integers, and that the positions are distinct.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection (not used here)
    args (ArgumentParser namespace): user-defined arguments (not used here)
    filepath (str): location of input file

  Raises:
    Exception: if the file does not have exactly two columns, or for the first
      schema validation error that is reported.

  """
  df = pd.read_csv(filepath, sep='\t', header=None)

  # Check the shape first: it is cheap and the schema is meaningless otherwise.
  n_columns = len(df.columns)
  if n_columns != 2:
    raise Exception(f"Invalid file format. Expected 2 columns, found {n_columns} columns. Columns should consist of chromosome number and SNP position. Filepath: {filepath}")

  # The file has no header, so name the columns to match the schema below.
  df.columns = ['chr', 'pos']

  # NOTE(review): 'pos' is required to be distinct across the whole file, not
  # per chromosome - confirm that input files never mix chromosomes.
  schema = Schema([
    Column('chr', [
      CanConvertValidation(int)
    ]),
    Column('pos', [
      CanConvertValidation(int),
      IsDistinctValidation()
    ])
  ])

  err = schema.validate(df)
  if err:
    # Only the first validation error is surfaced to the caller.
    logging.error(f"Error encountered while validating: {filepath}")
    raise Exception(err[0])
Пример #2
0
def validate_results(conn, args, filepath):
  """Validates input file for GWAS result data

  Every column of the CSV file becomes part of the schema. Columns whose name
  starts (case-insensitively) with SNP/chr/pos/nSNPs must be convertible to
  int, and columns whose name starts with a p-value style name must be
  convertible to float. All other columns are accepted as they are.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection (not used here)
    args (ArgumentParser namespace): user-defined arguments (not used here)
    filepath (str): location of input file

  Raises:
    Exception: for the first schema validation error that is reported.

  """
  integer_name_pattern = "(SNP)|(chr)|(pos)|(nSNPs)"
  pvalue_name_pattern = "((null)?pval(ue)?)"

  df = pd.read_csv(filepath)

  def checks_for(column_name):
    # Fuzzy, prefix-anchored name matching decides which conversions apply.
    checks = []
    if re.match(integer_name_pattern, column_name, re.IGNORECASE):
      checks.append(CanConvertValidation(int))
    if re.match(pvalue_name_pattern, column_name, re.IGNORECASE):
      checks.append(CanConvertValidation(float))
    return checks

  schema = Schema([Column(name, checks_for(name)) for name in df.columns])

  problems = schema.validate(df)
  if problems:
    # Only the first validation error is surfaced to the caller.
    logging.error(f"Error encountered while validating: {filepath}")
    raise Exception(problems[0])
Пример #3
0
def validate_genotype(conn, args, filepath):
  """Validates input file for genotype data

  The genotype file is a headerless, tab-separated table: a distinct integer
  row number followed by one column per position. The number of position
  columns is taken from the row count of the counterpart file
  ``<filepath>.pos``. Every position value must be an integer coded as one of
  -1, 0, 1 or 2.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection (not used here)
    args (ArgumentParser namespace): user-defined arguments; validation is
      skipped entirely when ``args.skip_genotype_validation`` is True
    filepath (str): location of input file

  Raises:
    FileNotFoundError: if the ``.pos`` counterpart file does not exist.
    Exception: for the first schema validation error that is reported.

  """
  # Allow for users to skip this validation step because it is time consuming
  if args.skip_genotype_validation is True:
    return

  # Locate the .pos counterpart before doing any schema work so that a missing
  # file is reported immediately.
  pos_filepath = '.'.join([filepath, 'pos'])
  if not os.path.exists(pos_filepath):
    raise FileNotFoundError(f"Could not locate the position counterpart file for {filepath}")
  nPositions = len(pd.read_csv(pos_filepath, header=None).index)

  schema_columns = [
    Column('row_number', [
      CanConvertValidation(int) &
      IsDistinctValidation()
    ])
  ]

  for n in range(nPositions):
    schema_columns.append(
      Column(f'pos_{n}', [
        CanConvertValidation(int) &
        # A series validation must return a boolean Series. Coerce instead of
        # casting so non-numeric cells fail the check rather than raise.
        CustomSeriesValidation(
          lambda x: pd.to_numeric(x, errors='coerce').isin([-1, 0, 1, 2]),
          'Incorrectly coded value.')
      ])
    )

  schema = Schema(schema_columns)

  df = pd.read_csv(filepath, sep='\t', header=None)

  # The file has no header, so its columns are numbered 0..N. Name them after
  # the schema columns so they can be matched by name. If the counts differ,
  # leave the frame untouched and let the schema report the mismatch.
  if len(df.columns) == len(schema_columns):
    df.columns = [column.name for column in schema_columns]

  err = schema.validate(df)

  if err:
    # Only the first validation error is surfaced to the caller.
    logging.error(f"Error encountered while validating: {filepath}")
    raise Exception(err[0])
Пример #4
0
 def __init__(self):
     """Build the report schema and store it on ``self.schemas``.

     The report number must convert to int, the two date columns must be
     accepted by ``self.parse_date`` (the event start date may be empty),
     and the product role, age unit and gender columns are restricted to
     fixed value lists. The remaining columns carry no validations.
     """
     product_roles = ["Suspect", "Concomitant"]
     age_units = ["Year(s)", "Decade(s)", "Month(s)", "Week(s)", "Day(s)"]
     genders = ["Female", "Male"]

     columns = [
         Column("RA_Report #", [CanConvertValidation(int)]),
         Column("RA_CAERS Created Date", [CanCallValidation(self.parse_date)]),
         Column(
             "AEC_Event Start Date",
             [CanCallValidation(self.parse_date)],
             allow_empty=True,
         ),
         Column("PRI_Product Role", [InListValidation(product_roles)]),
     ]
     # Free-form product/consumer columns: present in the schema, unchecked.
     columns.extend(
         Column(name)
         for name in (
             "PRI_Reported Brand/Product Name",
             "PRI_FDA Industry Code",
             "PRI_FDA Industry Name",
             "CI_Age at Adverse Event",
         )
     )
     columns.append(Column("CI_Age Unit", [InListValidation(age_units)]))
     columns.append(Column("CI_Gender", [InListValidation(genders)]))
     columns.append(Column("AEC_One Row Outcomes"))
     columns.append(Column("SYM_One Row Coded Symptoms"))

     self.schemas = Schema(columns)
Пример #5
0
    def compile_field_validator(self, field):
        """Return the list of validations described by a field definition.

        ``field['source_field_type']`` selects an optional int/float
        conversion check (case-insensitive). ``field['min']`` and
        ``field['max']`` add a range check when at least one of them is not
        null; a missing bound is replaced by the matching infinity.
        """
        validations = []

        declared_type = field['source_field_type'].lower()
        if declared_type == 'int':
            validations.append(CanConvertValidation(int))
        elif declared_type == 'float':
            validations.append(CanConvertValidation(float))

        has_lower = pd.notnull(field['min'])
        has_upper = pd.notnull(field['max'])
        if has_lower or has_upper:
            lower = field['min'] if has_lower else -math.inf
            upper = field['max'] if has_upper else math.inf
            validations.append(InRangeValidation(lower, upper))

        return validations
Пример #6
0
class AllowEmptyColumn(unittest.TestCase):
    """
    A column with a single int-conversion validation and ``allow_empty=True``
    must not report an error for an empty-string value.
    """
    NAME = 'col1'

    col = Column(NAME, [CanConvertValidation(int)], allow_empty=True)
    ser = pd.Series([''])

    def test_outputs(self):
        """Validating the one-element empty series yields no errors."""
        error_count = len(self.col.validate(self.ser))
        self.assertEqual(
            error_count,
            0,
            'allow_empty is not allowing empty columns',
        )
Пример #7
0
def validate_phenotype(conn, args, filepath):
  """Validates input file for phenotype data

  The first column of the CSV file must be named like genotype/pedigree/line
  (case-insensitive prefix match) and hold distinct values. Every other column
  must be convertible to float.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection (not used here)
    args (ArgumentParser namespace): user-defined arguments (not used here)
    filepath (str): location of input file

  Raises:
    Exception: if the first column is not a genotype/pedigree/line column, or
      for the first schema validation error that is reported.

  """
  df = pd.read_csv(filepath)
  row_count, column_count = df.shape
  row_count += 1 # include the header in the row count

  first_column = df.columns[0]
  if re.match('(genotype)|(pedigree)|(line)', first_column, re.IGNORECASE) is None:
    raise Exception("Genotype/pedigree/line should be the first column in the phenotype file")

  # Give the identifier column a fixed name so the schema can refer to it
  df.rename(columns={str(first_column): 'genotype'}, inplace=True)

  identifier_column = Column('genotype', [IsDistinctValidation()])

  # NOTE(tparker): This may not always be true. If there any phenotypes that
  # are listed as categories or strings, then this would fail
  # Find out all the possible phenotype values. It may be difficult to
  # validate input data without a user-provided dtype list
  trait_columns = [
    Column(df.columns[index], [CanConvertValidation(float)])
    for index in range(1, column_count)
  ]

  schema = Schema([identifier_column] + trait_columns)
  problems = schema.validate(df)

  if problems:
    # Only the first validation error is surfaced to the caller.
    logging.error(f"Error encountered while validating: {filepath}")
    raise Exception(problems[0])
Пример #8
0
class SingleValidationColumn(unittest.TestCase):
    """
    Test a column with one single validation

    The series holds three values that cannot be converted to int, so the
    column is expected to report one error for each of them.
    """
    NAME = 'col1'

    col = Column(NAME, [CanConvertValidation(int)], allow_empty=False)
    ser = pd.Series(['a', 'b', 'c'])

    def test_name(self):
        """The column exposes the name it was constructed with."""
        self.assertEqual(self.col.name, self.NAME,
                         'A Column does not store its name correctly')

    def test_outputs(self):
        """One error is produced per row, and every row index is reported."""
        results = self.col.validate(self.ser)

        self.assertEqual(len(results), len(self.ser),
                         'A Column produces the wrong number of errors')
        # Check every row of the series, not just the first two.
        for i in range(len(self.ser)):
            self.assertTrue(any(r.row == i for r in results),
                            'A Column does not report errors for every row')
Пример #9
0
def validate_population_structure(conn, args, filepath):
  """Validates input file for population structure data

  The CSV file must contain a 'Pedigree' column with distinct values; every
  column after the first must be convertible to float.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection (not used here)
    args (ArgumentParser namespace): user-defined arguments (not used here)
    filepath (str): location of input file

  Raises:
    Exception: for the first schema validation error that is reported.

  """
  df = pd.read_csv(filepath)
  row_count, column_count = df.shape
  row_count += 1 # include the header rows in the count
  logging.debug(f'Population structure columns: {df.columns}')
  logging.debug(f"Population structure dimensions: <{row_count}, {column_count}>")

  pedigree_column = Column('Pedigree', [IsDistinctValidation()])
  numeric_columns = [
    Column(df.columns[index], [CanConvertValidation(float)])
    for index in range(1, column_count)
  ]

  schema = Schema([pedigree_column] + numeric_columns)
  problems = schema.validate(df)

  if problems:
    # Only the first validation error is surfaced to the caller.
    logging.error(f"Error encountered while validating: {filepath}")
    raise Exception(problems[0])
Пример #10
0
def validate_kinship(conn, args, filepath):
  """Validates input file for kinship data

  The first column of the CSV file has a blank header, which pandas reads as
  "Unnamed: 0"; it is renamed to 'line_name' and must hold distinct values.
  Every other column must be convertible to float.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection (not used here)
    args (ArgumentParser namespace): user-defined arguments (not used here)
    filepath (str): location of input file

  Raises:
    Exception: for the first schema validation error that is reported.

  """
  df = pd.read_csv(filepath)
  row_count, column_count = df.shape
  # The column name is blank by default, so rename it for later reference
  df.rename(columns={"Unnamed: 0": "line_name"}, inplace=True)
  row_count += 1 # include the header row in the count
  logging.debug(f"Dimensions of kinship matrix: <{row_count}, {column_count}>")

  name_column = Column('line_name', [IsDistinctValidation()])
  value_columns = [
    Column(df.columns[index], [CanConvertValidation(float)])
    for index in range(1, column_count)
  ]

  schema = Schema([name_column] + value_columns)
  problems = schema.validate(df)

  if problems:
    # Only the first validation error is surfaced to the caller.
    logging.error(f"Error encountered while validating: {filepath}")
    raise Exception(problems[0])
Пример #11
0
# Build number (as a string) -> build name. Builds 28-36 (32 is absent) use
# the "NCBI" prefix, builds 37 and 38 the "GRCh" prefix.
BUILD_MAP = {
    str(build): 'NCBI{}'.format(build)
    for build in (28, 29, 30, 31, 33, 34, 35, 36)
}
BUILD_MAP.update({'37': 'GRCh37', '38': 'GRCh38'})

# Accepted file extensions, listed exactly as they are compared elsewhere.
VALID_FILE_EXTENSIONS = [
    ".txt",
    ".tsv",
    ".csv",
    ".tsv.gz",
    ".csv.gz",
    "gz",
    "gzip",
    ".tsv.gzip",
    ".csv.gzip",
]

# Column validators keyed by dataset column name. Every column may be empty
# except EFFECT_DSET.
GENERIC_VALIDATORS = {
    SNP_DSET: Column(
        SNP_DSET,
        [
            CanConvertValidation(DSET_TYPES[SNP_DSET]),
            MatchesPatternValidation(r'^rs[0-9]+$'),
        ],
        allow_empty=True,
    ),
    CHR_DSET: Column(
        CHR_DSET,
        [InListValidation(VALID_CHROMOSOMES)],
        allow_empty=True,
    ),
    BP_DSET: Column(
        BP_DSET,
        [
            CanConvertValidation(DSET_TYPES[BP_DSET]),
            InInclusiveRangeValidation(1, 999999999),
        ],
        allow_empty=True,
    ),
    EFFECT_WEIGHT_DSET: Column(
        EFFECT_WEIGHT_DSET,
        [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])],
        allow_empty=True,
    ),
    OR_DSET: Column(
        OR_DSET,
        [CanConvertValidation(DSET_TYPES[OR_DSET])],
        allow_empty=True,
    ),
    HR_DSET: Column(
        HR_DSET,
        [CanConvertValidation(DSET_TYPES[HR_DSET])],
        allow_empty=True,
    ),
    BETA_DSET: Column(
        BETA_DSET,
        [CanConvertValidation(DSET_TYPES[BETA_DSET])],
        allow_empty=True,
    ),
    EFFECT_DSET: Column(
        EFFECT_DSET,
        [MatchesPatternValidation(r'^[ACTGN]+$')],
        allow_empty=False,
    ),
    REF_DSET: Column(
        REF_DSET,
        [MatchesPatternValidation(r'^[ACTGN]+$')],
        allow_empty=True,
    ),
    FREQ_DSET: Column(
        FREQ_DSET,
        [CanConvertValidation(DSET_TYPES[FREQ_DSET])],
        allow_empty=True,
    ),
    LOCUS_DSET: Column(
        LOCUS_DSET,
        [
            CanConvertValidation(DSET_TYPES[LOCUS_DSET]),
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
        ],
        allow_empty=True,
    ),
}

# Shallow copy of the generic validators in which the SNP column is replaced
# by a variant that does not allow empty values.
SNP_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_VALIDATORS[SNP_DSET] = Column(
    SNP_DSET,
    [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(r'^rs[0-9]+$'),
    ],
    allow_empty=False,
)
Пример #12
0
def validate(df):
    """Validate an uploaded sheet and collect every problem found.

    The columns selected by ``COL`` are checked against a schema. Each
    resulting warning is converted to text, its terms are replaced using
    ``D_TRANSLATE``, row references are rewritten through ``row_refined``,
    and the final text becomes a key of the result mapped to ``"<br>"``.
    The results of the pairwise ``check_inconsist`` checks and of
    ``check_hplevel_with_dept`` are then merged into the same dict.

    Args:
        df: data frame holding the uploaded rows.

    Returns:
        dict: all collected messages, keyed by message text.
    """

    def first_items(choices):
        # Each choice is a sequence; only its first item is a valid value.
        return [choice[0] for choice in choices]

    def trimmed(*extra):
        # No leading/trailing whitespace, plus any additional validations.
        return [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            *extra,
        ]

    list_bu = first_items(BU_CHOICES)
    list_rd = first_items(RD_CHOICES)
    list_dept = first_items(DEPT_CHOICES)
    list_hplevel = first_items(HPLEVEL_CHOICES)
    list_province = first_items(PROVINCE_CHOICES)
    list_title = first_items(TITLE_CHOICES)

    # NOTE(review): identity comparison with np.nan only rejects that exact
    # object; other missing-value markers would pass - confirm intent.
    not_null = CustomElementValidation(lambda d: d is not np.nan,
                                       "该字段不能为空")

    columns = [
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", trimmed(not_null)),
        Column("地区经理", trimmed(not_null)),
        Column("负责代表", trimmed(not_null)),
        Column(
            "医院编码",
            trimmed(not_null, MatchesPatternValidation(r"^[H]{1}(\d){9}$")),
        ),
        Column("医院全称", trimmed(not_null)),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        Column("客户姓名", trimmed(IsDistinctValidation())),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int), InRangeValidation(0, 63)]),
        Column("每半天\n门诊量",
               [CanConvertValidation(int), InRangeValidation(0)]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int), InRangeValidation(0, 101)]),
        Column("备注"),
    ]
    schema = Schema(columns)

    row_pattern = r": [0-9]\d*"
    d_error = {}
    for warning in schema.validate(df.loc[:, COL]):
        message = str(warning)
        # NOTE(review): the row rewrite runs once per translation term, exactly
        # as in the original statement order - confirm that is intended.
        for term, replacement in D_TRANSLATE.items():
            message = message.replace(term, replacement)
            message = re.sub(row_pattern, row_refined, message)
        d_error[message] = "<br>"

    # (first column, second column, mode) triples, checked in this order.
    consistency_checks = (
        ("医院编码", "医院全称", "both"),
        ("区域", "大区", "right"),
        ("大区", "地区经理", "right"),
        ("地区经理", "负责代表", "right"),
        ("医院编码", "省/自治区/直辖市", "left"),
        ("医院编码", "是否双call", "left"),
        ("医院编码", "医院级别", "left"),
        ("医院编码", "开户进展", "left"),
        ("医院全称", "省/自治区/直辖市", "left"),
        ("医院全称", "是否双call", "left"),
        ("医院全称", "医院级别", "left"),
        ("医院全称", "开户进展", "left"),
    )
    for first, second, mode in consistency_checks:
        d_error.update(check_inconsist(df, first, second, mode))

    # Check whether hospital level and department contradict each other
    d_error.update(check_hplevel_with_dept(df))
    return d_error
Пример #13
0
# Element-level check that fails for values equal to the empty string.
EmptyStringValidation = CustomElementValidation(
    lambda value: value != "",
    "This field cannot be empty",
)

nipt_results_schema = Schema([
    Column("SampleID",
           [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("SampleType", []),
    Column("Description", []),
    Column("SampleProject",
           [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("Index1", []),
    Column("Index2", []),
    Column("Library_nM", []),
    Column("QCFlag", []),
    Column("Zscore_13", [CanConvertValidation(float)]),
    Column("Zscore_18", [CanConvertValidation(float)]),
    Column("Zscore_21", [CanConvertValidation(float)]),
    Column("Zscore_X", [CanConvertValidation(float)]),
    Column("Ratio_13", [CanConvertValidation(float)]),
    Column("Ratio_18", [CanConvertValidation(float)]),
    Column("Ratio_21", [CanConvertValidation(float)]),
    Column("Ratio_X", [CanConvertValidation(float)]),
    Column("Ratio_Y", [CanConvertValidation(float)]),
    Column("MappedReads", [CanConvertValidation(int)]),
    Column("GC_Dropout", [CanConvertValidation(float)]),
    Column("AT_Dropout", [CanConvertValidation(float)]),
    Column("Chr1_Ratio", [CanConvertValidation(float)]),
    Column("Chr2_Ratio", [CanConvertValidation(float)]),
    Column("Chr3_Ratio", [CanConvertValidation(float)]),
    Column("Chr4_Ratio", [CanConvertValidation(float)]),
Пример #14
0
    '34': 'NCBI34',
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38'
}

# Accepted file extensions, one per line in their original order.
VALID_FILE_EXTENSIONS = [
    ".txt",
    ".tsv",
    ".csv",
    ".tsv.gz",
    ".csv.gz",
    "gz",
    "gzip",
    ".tsv.gzip",
    ".csv.gzip",
]

GENERIC_VALIDATORS = {
    SNP_DSET:
    Column(SNP_DSET, [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(r'^rs[0-9]+$')
    ],
           allow_empty=True),
    CHR_DSET:
    Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET:
    Column(BP_DSET, [
        CanConvertValidation(DSET_TYPES[BP_DSET]),
        InInclusiveRangeValidation(1, 999999999)
    ],
           allow_empty=True),
    EFFECT_WEIGHT_DSET:
    Column(EFFECT_WEIGHT_DSET,
           [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])],
           allow_empty=True),
        lambda x: x.is_monotonic_increasing and x.is_unique,
        'date is not monotonic')
])

# Whitespace checks meant to be unpacked into a column's validation list.
default_value_validators = [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]

schemas_by_key = {
    'cases':
    Schema([
        date_validator,
        Column('cases', [
            *default_value_validators,
            CanConvertValidation(int) & CustomSeriesValidation(
                lambda x: x.is_monotonic_increasing, 'cases is not monotonic')
        ])
    ]),
    'deaths':
    Schema([
        date_validator,
        Column('deaths', [
            *default_value_validators,
            CanConvertValidation(int) & CustomSeriesValidation(
                lambda x: x.is_monotonic_increasing, 'deaths is not monotonic')
        ])
    ]),
    'recovered':
    Schema([date_validator,
            Column('recovered', [*default_value_validators])]),
Пример #16
0
#VALIDATORS = {
#    PVAL_DSET: Column(PVAL_DSET, [CanConvertValidation(DSET_TYPES[PVAL_DSET]), InInclusiveRangeValidation(0, 1)], allow_empty=False),
#    OR_DSET: Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=True),
#    RANGE_U_DSET: Column(RANGE_U_DSET, [CanConvertValidation(float)], allow_empty=True),
#    RANGE_L_DSET: Column(RANGE_L_DSET, [CanConvertValidation(float)], allow_empty=True),
#    BETA_DSET: Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True),
#    SE_DSET: Column(SE_DSET, [CanConvertValidation(float)], allow_empty=True),
#    EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGNactgn]+$')], allow_empty=True),
#    OTHER_DSET: Column(OTHER_DSET, [MatchesPatternValidation(r'^[ACTGNactgn]+$')], allow_empty=True),
#    FREQ_DSET: Column(FREQ_DSET, [CanConvertValidation(float)], allow_empty=True)
#}

SNP_VALIDATORS = {
    SNP_DSET:
    Column(SNP_DSET, [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(r'^rs[0-9]+$')
    ],
           allow_empty=False),
    CHR_DSET:
    Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET:
    Column(BP_DSET, [
        CanConvertValidation(DSET_TYPES[BP_DSET]),
        InInclusiveRangeValidation(1, 999999999)
    ],
           allow_empty=True),
    PVAL_DSET:
    Column(PVAL_DSET, [
        CanConvertValidation(DSET_TYPES[PVAL_DSET]),
        InInclusiveRangeValidation(0, 1)
Пример #17
0
    '34': 'NCBI34',
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38'
}

VALIDATORS = {
    SNP_DSET:
    Column(SNP_DSET,
           [MatchesPatternValidation(r'rs[0-9]+')
            ]),  # how do we handle the values that are like chr:bp:allele:snp?
    PVAL_DSET:
    Column(
        PVAL_DSET,
        [CanConvertValidation(float),
         InInclusiveRangeValidation(0, 1)]
        #CustomElementValidation(lambda s: float(s) >= 0 and float(s) <= 1, 'outside the range of 0 to 1')]
    ),
    CHR_DSET:
    Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET:
    Column(
        BP_DSET,
        [CanConvertValidation(int) & InInclusiveRangeValidation(1, 999999999)],
        allow_empty=True),
    OR_DSET:
    Column(OR_DSET, [CanConvertValidation(float)], allow_empty=True),
    RANGE_U_DSET:
    Column(RANGE_U_DSET, [CanConvertValidation(float)], allow_empty=True),
    RANGE_L_DSET:
from pandas_schema.validation import MatchesPatternValidation, InRangeValidation, InListValidation, CustomSeriesValidation, CustomElementValidation, CanConvertValidation, IsDtypeValidation, CanCallValidation
from validate.helpers import InInclusiveRangeValidation

from validate.common_constants import *

# Alias of TO_LOAD_DSET_HEADERS_DEFAULT (presumably provided by the star
# import from validate.common_constants - confirm).
VALID_COLS = TO_LOAD_DSET_HEADERS_DEFAULT

# Accepted chromosome labels: the strings '1' through '25', then 'X' and 'Y'.
VALID_CHROMOSOMES = [str(number) for number in range(1, 26)] + ['X', 'Y']

VALIDATORS = {
    PVAL_DSET:
    Column(PVAL_DSET, [
        CanConvertValidation(DSET_TYPES[PVAL_DSET]),
        InInclusiveRangeValidation(0, 1)
    ],
           allow_empty=False),
    BETA_DSET:
    Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True),
    SNP_DSET:
    Column(SNP_DSET, [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(
            r'^chr[0-9XY]+_[0-9]+_[ACTGNactgn]+_[ACTGNactgn]+|LONG_STRING$')
    ],
           allow_empty=True),
    CHR_DSET:
    Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False),
    BP_DSET:
Пример #19
0
from pandas_schema import Column, Schema
from pandas_schema.validation import MatchesPatternValidation, CanConvertValidation, CustomSeriesValidation
import pandas as pd

# A value in col1 passes if it converts to int, OR if it is longer than one
# character AND matches the pattern 'a'.
schema = Schema([
    Column(
        'col1',
        [
            CanConvertValidation(int)
            | (
                CustomSeriesValidation(
                    lambda x: x.str.len() > 1,
                    "Doesn't have more than 1 character",
                )
                & MatchesPatternValidation('a')
            )
        ],
    )
])

test_data = pd.DataFrame({'col1': ['an', '13', 'a', '8', 'the']})

errors = schema.validate(test_data)

# Report every value that was rejected by the schema.
for error in errors:
    print(f'"{error.value}" failed!')
Пример #20
0
from pandas_schema import Column, Schema
from pandas_schema.validation import (LeadingWhitespaceValidation, TrailingWhitespaceValidation, 
                                      CanConvertValidation, MatchesPatternValidation, CustomSeriesValidation,
                                      InRangeValidation, InListValidation, DateFormatValidation)


# Schema for the study sheet. study_id, pi_name and pi_email are required
# (neither null nor the empty string); most other columns accept either the
# empty string or a value matching the listed pattern / value list.
study_schema = Schema([
    Column('study_id', [
        CustomSeriesValidation(lambda x: x.notnull(),
                               'A value is required for the study_id column.')
        & ~InListValidation([''])
    ]),
    Column('pi_name', [
        CustomSeriesValidation(lambda x: x.notnull(),
                               'A value is required for the pi_name column.')
        & ~InListValidation([''])
    ]),
    Column('sample_type', [
        InListValidation(['wmgx', 'wmtx', '16S', 'other'])
    ]),
    Column('bioproject_accession', [
        InListValidation(['']) | MatchesPatternValidation(r'PRJ\w+\d+')
    ]),
    Column('geo_loc_name', [
        InListValidation(['']) | MatchesPatternValidation(r'\w+:\w+:\w+')
    ]),
    Column('analysis_desc', [
        InListValidation(['']) | CanConvertValidation(str)
    ]),
    Column('sequencing_facility', [LeadingWhitespaceValidation()]),
    Column('env_biom', [
        MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])
    ]),
    Column('env_feature', [
        MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])
    ]),
    Column('env_material', [
        MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])
    ]),
    Column('host_tissue_sampled', [
        InListValidation(['']) | MatchesPatternValidation(r'BTO:\d+')
    ]),
    Column('animal_vendor', [LeadingWhitespaceValidation()]),
    Column('paired', [InListValidation(['true', 'false'])]),
    Column('paired_id', [
        InListValidation(['']) | MatchesPatternValidation(r'[a-zA-Z0-9_.]+')
    ]),
    Column('pi_email', [
        CustomSeriesValidation(lambda x: x.notnull(),
                               'A value is required for the pi_email column.')
        & ~InListValidation([''])
    ]),
])

sample_schema = Schema([
    Column('host_subject_id', [MatchesPatternValidation(r'\w+', message='Host Subject ID may only contain alphanumeric characters.')]),