Example #1
    def test_schema(self):
        """
        Test this validation inside a schema, to ensure we get helpful error messages.
        In particular, we want to make sure that a ValidationWarning without a row number won't break the schema
        """
        df = pd.DataFrame(
            data={
                'wrong_dtype1': ['not_an_int'],
                'wrong_dtype2': [123],
                'wrong_dtype3': [12.5]
            })

        schema = Schema([
            Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
            Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
            Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
        ])

        errors = schema.validate(df)

        self.assertEqual(
            sorted([str(x) for x in errors]),
            sorted([
                'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
                'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
                'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
            ]))
Example #2
def do_validation():
    # read the data
    data = pd.read_csv('noon.csv')
    # define validation elements
    int_validation = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]
    # define validation schema
    schema = pandas_schema.Schema([
        Column('Name', null_validation),
        Column('SKU', null_validation),
        Column('Price', int_validation + null_validation),
        Column('Special price', int_validation + null_validation),
        Column('Qty', int_validation + null_validation)
    ])

    # apply validation
    errors = schema.validate(data)

    for error in errors:
        print('"{}" failed!'.format(error.value))

    errors_index_rows = [e.row for e in errors]

    # save the validation errors
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
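check_int is not defined in this snippet, and several later examples lean on similar helpers. A minimal sketch, assuming these helpers only test whether the raw cell value can be cast (names and behaviour are an assumption, not shown in the source):

from decimal import Decimal, InvalidOperation

def check_int(value) -> bool:
    # Hypothetical helper: True when the raw cell can be parsed as an integer.
    try:
        int(value)
        return True
    except (ValueError, TypeError):
        return False

def check_decimal(value) -> bool:
    # Hypothetical helper: True when the raw cell can be parsed as a decimal.
    try:
        Decimal(str(value))
        return True
    except InvalidOperation:
        return False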
Example #3
class ClassifierParser(BaseParser):
    """
    Implementation of classifier dao Parser.

    The classifier output tables contain the output data from geniepy after the
    classifiers have calculated desired predictions.
    """

    default_type: DataType = None
    scraper = None
    """No online sources for classifiers output."""
    schema: Schema = Schema([
        Column("digest"),
        Column(PCPCLSFR_NAME, [IsDtypeValidation(np.float64)]),
        Column(CTCLSFR_NAME, [IsDtypeValidation(np.float64)]),
    ])

    def fetch(self, chunksize: int) -> Generator[DataFrame, None, None]:
        """No online sources to fetch from for classifiers outputs."""
        raise NotImplementedError("Classifier Output Parser has no Scrapers")

    @staticmethod
    def parse(data, dtype=DataType.CSV_STR) -> DataFrame:
        """
        Parser function from base class.

        Raises:
            NotImplementedError -- Function not implemented since classifiers return
                dataframes that only need to be validated.
        """
        raise NotImplementedError("Classifier Output Parser does not parse raw data")
Example #4
def validate_data(df):
    schema = Schema([
        Column('brand', null_validation + string_validation),
        Column('gear', null_validation + string_validation),
        Column('model', null_validation + string_validation),
        Column('price',
               null_validation + int_validation + price_max_validation +
               price_min_validation),
        Column('fuel', null_validation + string_validation),
        Column('mileage',
               null_validation + float_validation + mileage_max_validation),
        Column(
            'hp', null_validation + float_validation + hp_min_validation +
            hp_max_validation),
        Column('type', null_validation + string_validation),
        Column('geo', null_validation + string_validation),
        Column('model_year', null_validation + float_validation),
    ])
    try:
        errors = schema.validate(df)
        for e in errors:
            print(e)
    except Exception:
        return False
    else:
        return not errors
Example #5
def validate_variant(conn, args, filepath):
  """Validates input file for variant data

  This function validates that the contents of a file contain variant data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  schema = Schema([
    Column('chr', [
      CanConvertValidation(int)
    ]),
    Column('pos', [
      CanConvertValidation(int),
      IsDistinctValidation()
    ])
  ])

  df = pd.read_csv(filepath, sep='\t', header=None)

  if len(df.columns) != 2:
    raise Exception(f"Invalid file format. Excepted 2 columns, found {len(df.columns)} columns. Columns should consist of chromsome number and SNP position. Filepath: {filepath}")

  df.columns = ['chr', 'pos']
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating: {filepath}")
      raise Exception(e)
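For reference, a hedged sketch of how this schema reports problems on a tiny, made-up frame:

import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import CanConvertValidation, IsDistinctValidation

schema = Schema([
    Column('chr', [CanConvertValidation(int)]),
    Column('pos', [CanConvertValidation(int), IsDistinctValidation()])
])

# 'x' cannot be converted to int and position 100 appears twice, so the
# invalid chromosome and the duplicate position are each reported.
df = pd.DataFrame({'chr': ['1', 'x'], 'pos': ['100', '100']})
for warning in schema.validate(df):
    print(warning)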
Example #6
class UnorderedSchema(unittest.TestCase):
    schema = Schema(
        [Column('a'),
         Column('b', [LeadingWhitespaceValidation()])],
        ordered=False)

    def test_fields(self):
        self.assertEqual(len(self.schema.columns), 2,
                         'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 0,
                         'A correct data frame should have no errors')

    def test_validate_invalid(self):
        df = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 1,
                         'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        Tests that when ordered=False, the schema columns are associated with data frame columns by name, not position.
        In this case, the schema's column order is [a, b], while the data frame's order is [b, a]. There is an error in
        column b in the data frame (leading whitespace), and a validation on column b in the schema.

        Schema         a                b (validation)
        Data Frame     b (error)        a

        Thus there will only be an error if column b in the schema is linked to column b in the data frame,
        as is correct behaviour.
        """

        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)
        results = self.schema.validate(df)

        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )
Example #7
class StockPrice:
    """
    Model representing stock prices over time
    """

    fieldSchema = Schema([
        Column('open', [IsDtypeValidation(np.float64)]),
        Column('close', [IsDtypeValidation(np.float64)]),
        Column('high', [IsDtypeValidation(np.float64)]),
        Column('low', [IsDtypeValidation(np.float64)]),
        Column('volume', [IsDtypeValidation(np.int64)])
    ])
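A hypothetical usage note: IsDtypeValidation checks the column's dtype as a whole, so a volume column that pandas read as float64 (a single NaN is enough to cause that) fails the np.int64 rule even when every value looks integral. A sketch, assuming the numpy/pandas/pandas_schema imports used by the class above:

import numpy as np
import pandas as pd

prices = pd.DataFrame({
    'open': [1.0], 'close': [1.1], 'high': [1.2], 'low': [0.9],
    'volume': [100.0],  # float64, so the np.int64 validation fails
})
for warning in StockPrice.fieldSchema.validate(prices):
    print(warning)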
Example #8
def defSchema():
    print('Define expected Schema')
    schema = Schema([
        Column(name='id',
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='comment_text',
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='toxic',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='severe_toxic',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='obscene',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='threat',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='insult',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='identity_hate',
               validations=[InListValidation([0, 1])],
               allow_empty=False)
    ])
    return schema
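The column names suggest this schema targets Jigsaw-style toxic-comment data; a hedged sketch of applying it, with one deliberately invalid label:

import pandas as pd

df = pd.DataFrame({
    'id': ['a1'], 'comment_text': ['hello'],
    'toxic': [0], 'severe_toxic': [0], 'obscene': [2],  # 2 is not in [0, 1]
    'threat': [0], 'insult': [0], 'identity_hate': [0],
})
for warning in defSchema().validate(df):
    print(warning)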
Example #9
def validate_genotype(conn, args, filepath):
  """Validates input file for genotype data

  This function validates that the contents of a file contain genotype data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  # Allow for users to skip this validation step because it is time consuming
  if args.skip_genotype_validation is True:
    return

  
  schema_columns = [
    Column('row_number', [
      CanConvertValidation(int) &
      IsDistinctValidation()
    ])
  ]

  # Get the number of lines from the .pos counterpart file
  pos_filepath = '.'.join([filepath, 'pos'])
  if not os.path.exists(pos_filepath):
    raise FileNotFoundError(f"Could not locate the position counterpart file for {filepath}")
  nPositions = len(pd.read_csv(pos_filepath, header=None).index)

  for n in range(nPositions):
    schema_columns.append(
      Column(f'pos_{n}', [
        CanConvertValidation(int) &
        CustomSeriesValidation(lambda x: pd.to_numeric(x, errors='coerce').isin([-1, 0, 1, 2]), 'Incorrectly coded value.')
      ])
    )

  schema = Schema(schema_columns)

  df = pd.read_csv(filepath, sep='\t', header=None)

  err = schema.validate(df)
  
  if err:
    for e in err:
      logging.error(f"Error encountered while validating: {filepath}")
      raise Exception(e)
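The & used above composes two validations into a single pandas_schema CombinedValidation. Since the combination semantics have varied across pandas_schema versions, a quick, hedged way to see exactly which cells the composite flags is to call get_errors directly, as the test snippets elsewhere on this page do:

import pandas as pd
from pandas_schema import Column
from pandas_schema.validation import CanConvertValidation, CustomSeriesValidation

combined = CanConvertValidation(int) & CustomSeriesValidation(
    lambda s: pd.to_numeric(s, errors='coerce').isin([-1, 0, 1, 2]),
    'Incorrectly coded value.')

# 'x' is not an int, '7' is an int but not an allowed code; print what
# the composite actually reports for each cell.
for warning in combined.get_errors(pd.Series(['0', 'x', '7']), Column('pos_0')):
    print(warning)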
Example #10
def validate_results(conn, args, filepath):
  """Validates input file for GWAS result data

  This function validates that the contents of a file contain GWAS result data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  # For each column, add it to the schema, and then for known ones, add the 
  # schema validation. Use fuzzy comparisons when possible
  schema_columns = []
  for col in df.columns:
    validators = []
    if re.match("(SNP)|(chr)|(pos)|(nSNPs)", col, re.IGNORECASE):
      validators.append(CanConvertValidation(int))
    # Look for any of the p-values and make sure that they can be cast as a float
    if re.match("((null)?pval(ue)?)", col, re.IGNORECASE):
      validators.append(CanConvertValidation(float))
    
    schema_columns.append(Column(col, validators))
  schema = Schema(schema_columns)

  err = schema.validate(df)
  if err:
    for e in err:
      logging.error(f"Error encountered while validating: {filepath}")
      raise Exception(e)
Example #11
def validate_line(conn, args, filepath):
  """Validates input file for line data

  This function validates that the contents of a file contain line data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  schema = Schema([
    Column('line_name', [
      IsDistinctValidation()
    ])
  ])

  df = pd.read_csv(filepath, header=None)

  if len(df.columns) != 1:
    raise Exception(f"Invalid file format. Excepted 1 column found {len(df.columns)} columns. This file should be a single column of each line. Each entry should be distinct.")
  
  df.columns = [ 'line_name' ]
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating: {filepath}")
      raise Exception(e)
Example #12
    def compile_source_data_validation_schema(self, dataset):
        field_names = []
        field_schemas = []

        if dataset is not None:
            sub_map = Mapper.filter_mapper(self.mapper_df, dataset)
        else:
            sub_map = self.mapper_df

        sub_map = sub_map[sub_map['allow_missing'].str.lower() != 'y'].dropna(
            subset=['source_field_name', 'source_field_type'])

        for idx, field in sub_map.iterrows():

            field_validator = self.compile_field_validator(field)
            if field_validator:
                field_names.append(field['source_field_name'])

                field_schemas.append(
                    Column(field['source_field_name'], field_validator))

        if not field_schemas:
            return None, []

        schema = Schema(field_schemas)
        return schema, field_names
Example #13
 def __init__(self):
     self.schemas = Schema(
         [
             Column(
                 "Given Name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column(
                 "Family Name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("Age", [InRangeValidation(0, 120)]),
             Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
             Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
         ]
     )
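A hedged usage sketch; `validator` stands in for an instance of the enclosing class, whose name is not shown in the snippet. Each cell that fails a validation produces one warning with its row, column, and offending value:

import pandas as pd

# validator = TheEnclosingClass()  # hypothetical instance of the class above
df = pd.DataFrame({
    'Given Name': ['Gerald '],    # trailing whitespace
    'Family Name': [' Hampton'],  # leading whitespace
    'Age': [145],                 # outside 0-120
    'Sex': ['male'],              # not in the allowed list
    'Customer ID': ['12345'],     # does not match \d{4}[A-Z]{4}
})
for warning in validator.schemas.validate(df):
    print(warning)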
Example #14
    def __init__(
        self,
        args: Namespace,
        sources: Dict[str, Any],
        schema: List[Tuple[str, np.generic]],
        destinations: Dict[str, Any],
        stage: str,
        task: str,
    ):
        """Initiate parameters and client libraries for ETL task.

        :param args: args passed from command line,
        see `get_arg_parser()`
        :param sources: data source to be extracted,
        specified in task config, see `configs/*.py`
        :param schema: the target schema to load to.
        :param destinations: destinations to load data to,
        specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be staging/production.
        :param task: the name of the task.
        """
        # Clear cached files
        if args.rm:
            for source in sources:
                files = []
                files += glob.glob(
                    get_path_format(True).format(
                        prefix=destinations["fs"]["prefix"],
                        stage="raw",
                        task=args.task,
                        source=source,
                    ))
                files += glob.glob(
                    get_path_format(True).format(
                        prefix=destinations["fs"]["prefix"],
                        stage=stage,
                        task=args.task,
                        source=source,
                    ))
                for f in files:
                    log.info("Removing cached file: %s" % f)
                    os.remove(f)
        self.task = task
        self.stage = stage
        self.args = args
        self.period = args.period
        self.current_date = args.date
        self.last_month = lookback_dates(args.date, args.period)
        self.sources = sources
        coltypes = []
        for coltype in schema:
            coltypes += [Column(coltype[0], [IsDtypeValidation(coltype[1])])]
        self.schema = Schema(coltypes)
        self.raw_schema = schema
        self.destinations = destinations
        self.raw = dict()
        self.extracted_base = dict()
        self.extracted = dict()
        self.transformed = dict()
        self.gcs = storage.Client()
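The schema argument here is a list of (column name, numpy type) pairs; the loop above wraps each pair in a Column guarded by an IsDtypeValidation. A hypothetical example of what a task config might supply (names and types are illustrative only):

import numpy as np

schema = [
    ("client_id", np.int64),
    ("amount", np.float64),
    ("country", np.object_),
]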
Example #15
def validate_phenotype(conn, args, filepath):
  """Validates input file for phenotype data

  This function validates that the contents of a file contain phenotype data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  nrows, ncols = df.shape
  nrows += 1 # include the header in the row count

  if re.match('(genotype)|(pedigree)|(line)', df.columns[0], re.IGNORECASE) is None:
    raise Exception("Genotype/pedigree/line should be the first column in the phenotype file")


  # Rename the first column of data to be the genotypes/lines
  df.rename(columns={f'{df.columns[0]}': 'genotype'}, inplace=True)

  schema_columns = [
    Column('genotype', [
      IsDistinctValidation()
    ])
  ]

  for n in range(1, ncols):
    schema_columns.append(
      Column(df.columns[n], [
        # NOTE(tparker): This may not always be true. If there any phenotypes that
        # are listed as categories or strings, then this would fail
        # Find out all the possible phenotype values. It may be difficult to
        # validate input data without a user-provided dtype list
        CanConvertValidation(float)
      ])
    )

  schema = Schema(schema_columns)
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating: {filepath}")
      raise Exception(e)
Example #16
 def run(self):
     # define validation elements
     self.logger.info('1. Starting data Clean Action ..')
     system_packs_base_path = cfg.CONF.content.system_packs_base_path
     path_of_pack = system_packs_base_path + '/monitor_mqtt'
     success = False
     VALIDATORS = {
         'decimal':
         CustomElementValidation(lambda d: self.check_decimal(d),
                                 'is not decimal'),
         'int':
         CustomElementValidation(lambda i: self.check_int(i),
                                 'is not integer'),
         'null':
         CustomElementValidation(lambda d: d is not np.nan,
                                 'this field cannot be null'),
         'time_stamp':
         CustomElementValidation(lambda d: self.check_time_stamp(d),
                                 'time_stamp format is not valid')
     }
     self.logger.info('2. Loading Schema ..')
     with open(self._json_schema_path, 'r') as my_json:
         json_schema = json.load(my_json)
     column_list = [
         Column(k, [VALIDATORS[v] for v in vals])
         for k, vals in json_schema.items()
     ]
     schema = pandas_schema.Schema(column_list)
     self.logger.info('3. Loading CSV Data ..')
     data = pd.read_csv(self._data_file_path)
     self.logger.debug(data)
     try:
         self.logger.info('4. Validating input CSV data ..')
         errors = schema.validate(data)
         for e in errors:
             self.logger.debug(e)
         if errors:
             errors_index_rows = [e.row for e in errors]
             self.logger.info('5. Cleaning input CSV data ..')
             data_clean = data.drop(index=errors_index_rows)
             ct = datetime.datetime.now()
             filename = '{:%Y_%m_%d_%H_%M_%S_%f}.csv'.format(ct)
             pathoffile = path_of_pack + '/etc/clean_data_output/errors_' + filename
             message = 'Error Data file: ' + pathoffile
             self.logger.debug(message)
             pd.DataFrame({'col': errors}).to_csv(pathoffile)
         else:
             self.logger.info("5. Couldn't find issues with input CSV ..")
             data_clean = data
         cleanpath = path_of_pack + '/etc/clean_data_output/clean_data.csv'
         cleanmessage = 'Clean Data path: ' + cleanpath
         self.logger.debug(cleanmessage)
         data_clean.to_csv(cleanpath)
         success = True
         self.logger.info('Action Completed Successfully')
     except Exception as msg:
         self.logger.info(f"FAILED STEP: {msg}\n FAILED: Clean Data Action")
     return success
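The JSON schema file itself is not shown. From the comprehension that builds column_list, it presumably maps each CSV column name to a list of VALIDATORS keys; a hypothetical example of its contents:

import json

# Hypothetical contents of the file at self._json_schema_path.
example_schema = {
    "sensor_id": ["int", "null"],
    "reading": ["decimal", "null"],
    "recorded_at": ["time_stamp", "null"],
}
print(json.dumps(example_schema, indent=2))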
Example #17
    def _validate(self, diagnosis_df):
        schema = Schema([
            Column('visit_dt', [
                MatchesPatternValidation(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
            ]),
            Column('sex', [InListValidation(['M', 'K'])]),
            Column('icd10', [
                MatchesPatternValidation(r'^[CDIJKMNRZ]\d{1,2}\.?\d{0,2}$')
            ])
        ])

        errors = schema.validate(diagnosis_df)

        for error in errors:
            self.Logger.error(error)

        if len(errors) > 0:
            exit()
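A quick sanity check of the (corrected) icd10 pattern on made-up codes, using get_errors directly as the test snippets elsewhere on this page do:

import pandas as pd
from pandas_schema import Column
from pandas_schema.validation import MatchesPatternValidation

# 'C34.5' matches; 'X99' is rejected because X is not an allowed letter.
icd10 = MatchesPatternValidation(r'^[CDIJKMNRZ]\d{1,2}\.?\d{0,2}$')
for warning in icd10.get_errors(pd.Series(['C34.5', 'X99']), Column('icd10')):
    print(warning)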
Example #18
def do_validation(data):
    # define validation elements
    decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')]
    null_validation = [CustomElementValidation(lambda d: d is not np.nan, 'this field cannot be null')]

    # define validation schema
    schema = pandas_schema.Schema([
            Column('ch1', decimal_validation + null_validation),
            Column('ch2', decimal_validation+ null_validation)])
    
    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    if len(errors)==0:
        return True
    else:
        for e in errors:
            print("Error on line", e.row, "for", e.column, ":", e.value, "-", e.message)
        return False
Example #19
 def test_custom_message(self):
     validator = InRangeValidation(min=4, message=self.message)
     for error in validator.get_errors(pd.Series(
             [
                 1,
                 2,
                 3
             ]
     ), Column('')):
         self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!')
Example #20
 def test_default_message(self):
     validator = InRangeValidation(min=4)
     for error in validator.get_errors(pd.Series(
             [
                 1,
                 2,
                 3
             ]
     ), Column('')):
         self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!')
Example #21
def validate(data: pd.DataFrame):
    decimal_validation = [
        CustomElementValidation(lambda x: check_decimal(x), 'is not decimal')
    ]
    datetime_validation = [
        CustomElementValidation(lambda x: check_datetime(x), 'is not datetime')
    ]
    string_validation = [
        CustomElementValidation(lambda x: check_is_string_or_nan(x),
                                'is not string')
    ]
    nan_validation = [
        CustomElementValidation(lambda x: x is not np.nan,
                                'this field cannot be NaN')
    ]

    schema = pandas_schema.Schema([
        Column('value', decimal_validation + nan_validation),
        Column('time', datetime_validation + nan_validation),
        Column('target', string_validation),
        Column('message', string_validation),
        Column('event', string_validation),
        Column('account_number', string_validation),
    ])

    errors = schema.validate(data)
    if len(errors) > 0:
        for error in errors:
            print(error)
        raise InvalidDataFrame("Invalid dataframe!")
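InvalidDataFrame, check_datetime, and check_is_string_or_nan are assumed by this snippet but not defined in it (check_decimal is sketched under Example #2). Minimal hypothetical definitions:

import numpy as np
from datetime import datetime

class InvalidDataFrame(Exception):
    """Raised when a frame fails schema validation."""

def check_datetime(value) -> bool:
    # Hypothetical helper: True when the raw cell parses as an ISO timestamp.
    try:
        datetime.fromisoformat(str(value))
        return True
    except ValueError:
        return False

def check_is_string_or_nan(value) -> bool:
    return isinstance(value, str) or value is np.nan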
Example #22
def data_validation(filename):
    """
        :param filename: name of the csv file with data
        :return: dataframe if the data is correct, list of errors
        :does: validates the data in the csv file
    """
    # read the data
    try:
        data = pd.read_csv(filename)
    except Exception:
        return [False, ['Error reading a file.']]
    # check column names
    if (data.columns.to_list() == [
            'City', 'Cappuccino', 'Cinema', 'Wine', 'Gasoline', 'Avg Rent',
            'Avg Disposable Income'
    ]):

        # define validation elements
        decimal_validation = [
            CustomElementValidation(lambda d: check_float(d),
                                    'Must be decimal')
        ]
        null_validation = [
            CustomElementValidation(lambda d: d is not np.nan,
                                    'Must not be nan')
        ]

        # define validation schema
        schema = pandas_schema.Schema([
            Column('City', null_validation),
            Column('Cappuccino', decimal_validation + null_validation),
            Column('Cinema', decimal_validation + null_validation),
            Column('Wine', decimal_validation + null_validation),
            Column('Gasoline', decimal_validation + null_validation),
            Column('Avg Rent', decimal_validation + null_validation),
            Column('Avg Disposable Income',
                   decimal_validation + null_validation)
        ])

        # apply validation
        errors = schema.validate(data)
        errors_index_rows = [e.row for e in errors]
        data_clean = data.drop(index=errors_index_rows)

        if errors is not None and len(errors) == len(data['City']):
            return [False, errors]
        else:
            for e in errors:
                if e.column != 'City':
                    data_clean[e.column] = pd.to_numeric(data_clean[e.column])
            return [True, data_clean, errors]
    else:
        return [False, ['The criteria names are incorrect.']]
Example #23
def do_validation():
    # read the data
    data = pd.read_csv('data.csv')

    # define validation elements
    decimal_validation = [
        CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')
    ]
    int_validation = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('dec1', decimal_validation + null_validation),
        Column('dec2', decimal_validation),
        Column('dec3', decimal_validation),
        Column('dec4', decimal_validation),
        Column('dec5', decimal_validation),
        Column('dec6', decimal_validation),
        Column('dec7', decimal_validation),
        Column('company_id', int_validation + null_validation),
        Column('currency_id', int_validation + null_validation),
        Column('country_id', int_validation + null_validation)
    ])

    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # save data
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    data_clean.to_csv('clean_data.csv')
Example #24
def validate(data):
    decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')]
    null_validation = [CustomElementValidation(lambda d: d, 'this field cannot be null')]
    test = [CustomElementValidation(lambda d: check_decimal(d), 'invalidated') |
            CustomElementValidation(lambda d: 'nan' in str(d), '')]
    range_text = [CustomElementValidation(lambda d: (d >= 0) & (d < 100), 'not in range')]
    range_number = [CustomElementValidation(lambda d: (d >= 0) & (d < 10000000), 'not in range') |
                    CustomElementValidation(lambda d: 'nan' in str(d), '')]
   
    schema = pandas_schema.Schema([
                Column('RevExp'),
                Column('budget', test),
                Column('budgetA', test),
                Column('total', test),
                Column('YTDA', test),
                Column('Q4F', test),
                Column('Q4FTB',test),
                Column('Q4FTBP', test),
                Column('comments')
                ])

    # schema = pandas_schema.Schema([
    #             Column('LHIN Program:  Revenue & Expenses'),
    #             Column('Budget', test),
    #             Column('Budget Adjustments', test),
    #             Column('Total', test),
    #             Column('YTD Actual', test),
    #             Column('Q4 Forecast', test),
    #             Column('Q4 $ Forecast Variance to Budget',test),
    #             Column('Q4 % Forecast Variance to Budget', test),
    #             Column('Comments\nExplanations are required where \nthe Q4 Forecasted % exceeds +/-10%')
    #             ])
    errors = schema.validate(data)
    error_messages = [str(e) for e in errors]  # human-readable messages, kept for debugging
    errors_index = {"row": [e.row for e in errors],
                    "column": [e.column for e in errors]}
    print(errors_index)
    return errors_index
Example #25
def validate_kinship(conn, args, filepath):
  """Validates input file for kinship data

  This function validates that the contents of a file contain kinship data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  nrows, ncols = df.shape
  df.rename(columns = {"Unnamed: 0": "line_name"}, inplace=True) # since column name is blank by default, rename it for later reference
  nrows += 1 # include the header row in the count
  logging.debug(f"Dimensions of kinship matrix: <{nrows}, {ncols}>")

  schema_columns = [
    Column('line_name', [
      IsDistinctValidation()
    ])
  ]

  for n in range(1, ncols):
    schema_columns.append(Column(df.columns[n], [
      CanConvertValidation(float)
    ]))

  schema = Schema(schema_columns)

  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating: {filepath}")
      raise Exception(e)
Example #26
def validate_population_structure(conn, args, filepath):
  """Validates input file for population structure data

  This function validates that the contents of a file contain population
  structure data. If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  nrows, ncols = df.shape
  nrows += 1 # include the header rows in the count
  logging.debug(f'Population structure columns: {df.columns}')
  logging.debug(f"Population structure dimensions: <{nrows}, {ncols}>")


  schema_columns = [
    Column('Pedigree', [
      IsDistinctValidation()
    ])
  ]

  for n in range(1, ncols):
    schema_columns.append(Column(df.columns[n], [
      CanConvertValidation(float)
    ]))

  schema = Schema(schema_columns)
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating: {filepath}")
      raise Exception(e)
Example #27
    def create_schema(self) -> Schema:
        """ Create Pandas schema with all the necessary validation rules read in from config """
        col_list = []
        for column in self.__spreadsheet_def.keys():
            validators = [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]

            mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)

            # Special cases for checking institutions/countries...
            if column == 'submitting_institution':
                validators.append(
                    InListValidation([i.name for i in self.__institutions]))
            elif column == 'country':
                validators.append(
                    InListValidation([i.country for i in self.__institutions]))
            else:
                # Regex validation
                if self.__spreadsheet_def.get_regex(column):
                    validators.append(
                        MatchesPatternValidation(
                            self.__spreadsheet_def.get_regex(column),
                            message=self.__spreadsheet_def.
                            get_regex_validation_message(column)))

                # Validate allowed values
                elif self.__spreadsheet_def.get_allowed_values(column):
                    validators.append(
                        InListValidation(
                            self.__spreadsheet_def.get_allowed_values(column),
                            case_sensitive=False))

                # Field length validation
                max_len = self.__spreadsheet_def.get_max_length(column)
                if max_len and max_len > 0:
                    validators.append(
                        _StringLengthValidation(
                            'field length is greater than {} characters'.
                            format(str(max_len)), max_len))

            # Mandatory field validation
            col_list.append(
                Column(self.__spreadsheet_def.get_column_name(column),
                       validators,
                       allow_empty=not mandatory_field_flag))

        return Schema(col_list)
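_StringLengthValidation is a private helper not shown here. One plausible implementation, assuming pandas_schema's public CustomSeriesValidation (predicate first, message second, as used elsewhere on this page):

from pandas_schema.validation import CustomSeriesValidation

class _StringLengthValidation(CustomSeriesValidation):
    # Hypothetical sketch: flags cells whose string form exceeds max_length.
    def __init__(self, message: str, max_length: int):
        self.max_length = max_length
        super().__init__(lambda s: s.astype(str).str.len() <= max_length,
                         message)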
Example #28
class OrderedSchema(unittest.TestCase):
    schema = Schema(
        [Column('a', [LeadingWhitespaceValidation()]),
         Column('b')],
        ordered=True)

    def test_mixed_columns(self):
        """
        Tests that when ordered=True, the schema columns are associated with data frame columns by position, not name.

        In this case, the schema's column order is [a, b], while the data frame's order is [b, a]. There is an error in
        column b in the data frame (leading whitespace), and a validation on column a in the schema.

        Schema         a (validation)   b
        Data Frame     b (error)        a

        Thus there will only be an error if column b in the schema is linked to column a in the data frame,
        as is correct behaviour when ordered=True.
        """
        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)
        results = self.schema.validate(df)

        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas by position'
        )
Example #29
class AllowEmptyColumn(unittest.TestCase):
    """
    Test a column with one single validation that allows empty columns
    """
    NAME = 'col1'

    col = Column(NAME, [CanConvertValidation(int)], allow_empty=True)
    ser = pd.Series([
        '',
    ])

    def test_outputs(self):
        results = self.col.validate(self.ser)
        self.assertEqual(len(results), 0,
                         'allow_empty is not allowing empty columns')
Example #30
    def check_join_cols(df1, df2, on):

        schema = Schema([
            Column(
                col,
                [
                    LeadingWhitespaceValidation(),
                    TrailingWhitespaceValidation(),
                    IsDistinctValidation()
                ],
            ) for col in on
        ])
        results = [schema.validate(df) for df in [df1[on], df2[on]]]

        issues = list(itertools.chain(*results))
        if issues:
            print("The following issues exist in the index:")
            for error in issues:
                print(error)
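A hypothetical call, assuming check_join_cols is reachable (e.g. as a @staticmethod on its enclosing class). 'id' is duplicated in the left frame, so IsDistinctValidation reports it before any merge is attempted:

import pandas as pd

left = pd.DataFrame({'id': ['a', 'a'], 'x': [1, 2]})
right = pd.DataFrame({'id': ['a', 'b'], 'y': [3, 4]})
check_join_cols(left, right, on=['id'])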