Пример #1
0
 def test_get_column_schema__valid_schema_type_but_invalid_schema_column__raises_non_oed_schema_column_error(self, schema_type):
     """
     Looking up a column drawn from another schema type's columns must
     raise ``NonOedSchemaColumnError``.
     """
     # Gather the columns of every schema type except the one under test.
     foreign_columns = []
     for stype, col_dict in get_grouped_master_schema().items():
         for col in col_dict:
             if stype != schema_type:
                 foreign_columns.append(col)

     # Pick one of those columns at random and expect the lookup to fail.
     column = np.random.choice(foreign_columns)
     self.assertRaises(
         NonOedSchemaColumnError, get_column_schema, schema_type, column
     )
Пример #2
0
 def test_get_column_schema__valid_reinsscope_column(self, column):
     """
     ``get_column_schema`` for a ``reinsscope`` column must equal the entry
     stored in ``self.reinsscope_schema`` under the lower-cased header.
     """
     header = column.lower()
     expected = self.reinsscope_schema[('reinsscope', header)]
     actual = get_column_schema('reinsscope', header)
     self.assertEqual(expected, actual)
Пример #3
0
 def test_get_column_schema__valid_loc_column(self, column):
     """
     ``get_column_schema`` for a ``loc`` column must equal the entry stored
     in ``self.loc_schema`` under the lower-cased header.
     """
     header = column.lower()
     expected = self.loc_schema[('loc', header)]
     actual = get_column_schema('loc', header)
     self.assertEqual(expected, actual)
Пример #4
0
 def test_get_column_schema__valid_schema_type_but_invalid_column__raises_non_oed_column_error(self, schema_type, column, random_str):
     """
     A header built by appending ``random_str`` to ``column`` must make
     ``get_column_schema`` raise ``NonOedColumnError``.
     """
     bogus_header = column + random_str
     self.assertRaises(
         NonOedColumnError, get_column_schema, schema_type, bogus_header
     )
Пример #5
0
 def test_get_column_schema__master_schema_type__raises_oed_error(self, column):
     """Requesting a column with schema type ``'master'`` must raise ``OedError``."""
     self.assertRaises(OedError, get_column_schema, 'master', column)
Пример #6
0
def sample_column(schema_type, header, str_width=None, size=10):
    """
    Draw random sample values for one column of an OED schema.

    The column schema is obtained from ``get_column_schema``. When the
    schema defines both a column range and a parseable ``column_sampling``
    entry, the sampling function named there generates the values.
    Otherwise numeric columns are sampled from the column range (falling
    back to the data type range), and string columns are sampled from the
    column range or, if there is none, built from random alphanumerics.

    :param schema_type: OED schema type indicator (``acc``, ``loc``,
                        ``reinsinfo``, or ``reinsscope``)
    :type schema_type: str

    :param header: The column header
    :type header: str

    :param str_width: Width of each generated string; only used for string
                      columns that have no column range. A width of 20 is
                      used when this is omitted (or otherwise falsy)
    :type str_width: int

    :param size: Number of values to sample; a non-positive value falls
                 back to 10
    :type size: int

    :return: Sampled values, or ``None`` if the column has no Python data
             type or its data type is not ``int``, ``float`` or ``str``
    :rtype: list
    """
    if size <= 0:
        size = 10

    col_schema = get_column_schema(schema_type, header)

    dtype_name = col_schema['py_dtype']
    if dtype_name is None:
        return

    # Resolve the dtype name (e.g. 'int') to the builtin object of that name.
    py_dtype = getattr(builtins, dtype_name)

    dtype_range = col_schema['dtype_range']
    column_range = col_schema['column_range']

    # Prefer the column-specific range; fall back to the dtype range.
    use_range = column_range or dtype_range

    # ``column_sampling`` is expected to be a JSON document naming a sampling
    # function; if it cannot be parsed, no custom sampler is used.
    try:
        sampling_info = json.loads(col_schema['column_sampling'])
    except (JSONDecodeError, TypeError, ValueError):
        sampling_info = None
        sampling_func = None
    else:
        sampling_func = get_method(sampling_info['func'])

    has_custom_sampler = (
        column_range is not None and sampling_func is not None
    )

    # Case 1: a custom sampler is available for a supported dtype.
    if py_dtype in (int, float, str) and has_custom_sampler:
        samples = []
        for _ in range(size):
            # The first configured argument is skipped; the column range is
            # passed in its place.
            samples.append(
                sampling_func(column_range, *sampling_info['args'][1:])
            )
        return samples

    # Case 2: integers, sampled from a ``range`` or from explicit choices.
    if py_dtype is int:
        if isinstance(use_range, range):
            draws = np.random.randint(
                use_range.start, use_range.stop, size=size
            )
        else:
            draws = np.random.choice(use_range, size=size)
        return draws.tolist()

    # Case 3: floats, uniform between the range's extremes. The bounds are
    # clamped to +/-1.79e+307 (presumably so that high - low stays finite).
    if py_dtype is float:
        low = max(min(use_range), -1.79e+307)
        high = min(max(use_range), +1.79e+307)
        return np.random.uniform(low, high, size=size).tolist()

    # Case 4: strings. Reaching this point with a column range implies there
    # is no sampling function (otherwise case 1 would have returned).
    if py_dtype is str:
        if column_range is not None:
            picks = []
            for _ in range(size):
                picks.append(np.random.choice(column_range))
            return picks

        alphabet = list(string.ascii_letters + string.digits)
        width = str_width or 20
        return [
            ''.join(np.random.choice(alphabet, size=width))
            for _ in range(size)
        ]

    # Any other dtype is not sampled.
    return None