Пример #1
0
def keyable_p(column):
    """Return whether `column` could serve as a key.

    A column holding None or a float NaN is never keyable.  If every
    value converts with `float`, each converted value must be
    integer-valued, otherwise the column is rejected.  A column that
    survives those checks is keyable exactly when `unique` returns as
    many values as the column contains.
    """
    # `unique' can't cope with NaNs, so turn them away before it runs.
    for value in column:
        if value is None:
            return False
        if isinstance(value, float) and math.isnan(value):
            return False

    try:
        as_floats = [float(value) for value in column]
    except ValueError:
        # At least one value is not numeric text: there is nothing to
        # test for integrality, only uniqueness matters.
        as_floats = []

    for number in as_floats:
        if not number.is_integer():
            return False

    return len(column) == len(unique(column))
Пример #2
0
def numerical_p(column, count_cutoff, ratio_cutoff):
    """Return whether `column` is varied enough to count as numerical.

    NaN entries are dropped before the distinct values are counted
    with `unique`, but they still count toward the column length used
    in the ratio.  The answer is True only when the distinct count is
    above `count_cutoff` and the distinct-to-total ratio is above
    `ratio_cutoff`.
    """
    non_nan = [value for value in column if not math.isnan(value)]
    distinct = len(unique(non_nan))

    # The count test comes first; the ratio is only computed for
    # columns that pass it.
    if distinct <= count_cutoff:
        return False

    ratio = float(distinct) / float(len(column))
    return not (ratio <= ratio_cutoff)
Пример #3
0
def keyable_p(column):
    """Return whether `column` could serve as a key.

    Three things disqualify a column: a None or float NaN value, being
    made up entirely of integer-valued floats (an empty column falls
    in this case too), or `unique` returning fewer values than the
    column contains.
    """
    # `unique' can't cope with NaNs, so turn them away before it runs.
    for value in column:
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return False

    # One pass instead of two: every value must be a float AND whole.
    only_whole_floats = all(
        isinstance(value, float) and float(value).is_integer()
        for value in column)
    if only_whole_floats:
        return False

    return len(column) == len(unique(column))
Пример #4
0
def bayesdb_guess_stattypes(column_names, rows, null_values=None,
        numcat_count=None, numcat_ratio=None, distinct_ratio=None,
        nullify_ratio=None, overrides=None):
    """Guess a statistical type for each column of `rows`.

    Returns a list holding one statistical type per entry of
    `column_names`, in the same order.  Columns named in `overrides`
    get the overriding type; every other column is passed through
    `nullify` and then `guess_column_stattype`.

    :param set null_values: values to nullify (default: ``""``,
        ``"N/A"``, ``"none"`` and ``"None"``).
    :param int numcat_count: number of distinct values below which
        columns whose values can all be parsed as numbers will be
        considered categorical anyway (default 20).
    :param real numcat_ratio: ratio of distinct values to total values
        below which columns whose values can all be parsed as numbers
        will be considered categorical anyway (default 0.02).
    :param real distinct_ratio: ratio of distinct values to total values
        above which a column will be ignored as a pseudo-key
        (only if count > numcat_count) (default 0.9).
    :param real nullify_ratio: ratio of count of the most numerous value
        to total number of values above which the most numerous value
        should be nullified (set to 1 to turn off) (default 0.9).
    :param list overrides: list of ``(name, stattype)``, overriding
        any guessed statistical type for columns by those names.
        Besides statistical types, an override may specify ``key`` or
        ``ignore``.
    :raises ValueError: if column names collide once passed through
        `casefold`, if an override names an unknown column or the same
        column twice, if a row is shorter or longer than
        `column_names`, if a column overridden as ``key`` fails
        `keyable_p`, or if more than one column is overridden as
        ``key``.
    """

    # Substitute defaults for whatever the caller left unspecified.
    null_values = set(["", "N/A", "none", "None"]) \
        if null_values is None else null_values
    numcat_count = 20 if numcat_count is None else numcat_count
    numcat_ratio = 0.02 if numcat_ratio is None else numcat_ratio
    distinct_ratio = 0.9 if distinct_ratio is None else distinct_ratio
    nullify_ratio = 0.9 if nullify_ratio is None else nullify_ratio
    overrides = [] if overrides is None else overrides

    # Collect the case-folded column names, noting any collisions.
    folded_names = set()
    repeated_names = set()
    for name in column_names:
        folded = casefold(name)
        if folded in folded_names:
            repeated_names.add(name)
        else:
            folded_names.add(folded)
    if repeated_names:
        raise ValueError('Duplicate column names: %s' %
            (repr(list(repeated_names)),))

    # Index the overrides by case-folded column name.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown_names = set()
    repeated_overrides = set()
    for name, stattype in overrides:
        folded = casefold(name)
        if folded not in folded_names:
            unknown_names.add(name)
        elif folded in override_map:
            repeated_overrides.add(name)
        else:
            override_map[folded] = casefold(stattype)
    if unknown_names:
        raise ValueError('Unknown columns overridden: %s' %
            (repr(list(unknown_names)),))
    if repeated_overrides:
        raise ValueError('Duplicate columns overridden: %s' %
            (repr(list(repeated_overrides)),))

    # Every row must be exactly as wide as the list of column names.
    ncols = len(column_names)
    assert ncols == len(unique(map(casefold, column_names)))
    for ri, row in enumerate(rows):
        width = len(row)
        if width < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d' %
                (ri, width, ncols))
        if width > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d' %
                (ri, width, ncols))

    # An explicitly overridden key takes precedence over any guess, so
    # settle it before looking at the remaining columns.
    key = None
    extra_keys = set()
    for ci, column_name in enumerate(column_names):
        if override_map.get(casefold(column_name)) != 'key':
            continue
        if key is not None:
            extra_keys.add(column_name)
            continue
        raw_values = [row[ci] for row in rows]
        as_ints = integerify(raw_values)
        # Prefer the integer form when `integerify` produced one.
        candidate = as_ints if as_ints else raw_values
        if not keyable_p(candidate):
            raise ValueError('Column non-unique but specified as key'
                ': %s' % (repr(column_name),))
        key = column_name
    if extra_keys:
        raise ValueError('Multiple columns overridden as keys: %s' %
            (repr(list(extra_keys)),))

    # Finally assign each column its override or a guessed stattype.
    stattypes = []
    for ci, column_name in enumerate(column_names):
        folded = casefold(column_name)
        if folded in override_map:
            stattypes.append(override_map[folded])
            continue
        column = nullify(null_values, rows, ci)
        guessed = guess_column_stattype(column,
                                        distinct_ratio=distinct_ratio,
                                        nullify_ratio=nullify_ratio,
                                        numcat_count=numcat_count,
                                        numcat_ratio=numcat_ratio,
                                        have_key=(key is not None))
        if guessed == 'key':
            # Remember it so later guesses are told a key exists.
            key = column_name
        stattypes.append(guessed)
    return stattypes
Пример #5
0
def keyable_p(column):
    """Return whether `column` could serve as a key.

    A column holding None or a float NaN is never keyable; otherwise
    it is keyable exactly when `unique` returns as many values as the
    column contains.
    """
    # `unique' can't cope with NaNs, so turn them away before it runs.
    for value in column:
        if value is None:
            return False
        if isinstance(value, float) and math.isnan(value):
            return False

    total = len(column)
    return total == len(unique(column))
Пример #6
0
def bayesdb_guess_stattypes(column_names, rows,
        count_cutoff=None, ratio_cutoff=None, overrides=None):
    """Guess a statistical type for each column of `rows`.

    Returns a list holding one statistical type per entry of
    `column_names`, in the same order.  Columns named in `overrides`
    get the overriding type.  Any other column becomes ``key`` if no
    key has been chosen yet and `keyable_p` accepts it, ``numerical``
    if it converts via `integerify` or `floatify` and `numerical_p`
    accepts it, and ``categorical`` otherwise.

    :param int count_cutoff: number of distinct values below which
        columns whose values can all be parsed as numbers will be
        considered categorical anyway (default 20).
    :param real ratio_cutoff: ratio of distinct values to total values
        below which columns whose values can all be parsed as numbers
        will be considered categorical anyway (default 0.02).
    :param list overrides: list of ``(name, stattype)``, overriding
        any guessed statistical type for columns by those names.
        Besides statistical types, an override may specify ``key`` or
        ``ignore``.
    :raises ValueError: if column names collide once passed through
        `casefold`, if an override names an unknown column or the same
        column twice, if a row is shorter or longer than
        `column_names`, if a column overridden as ``key`` fails
        `keyable_p`, or if more than one column is overridden as
        ``key``.
    """

    # Substitute defaults for whatever the caller left unspecified.
    count_cutoff = 20 if count_cutoff is None else count_cutoff
    ratio_cutoff = 0.02 if ratio_cutoff is None else ratio_cutoff
    overrides = [] if overrides is None else overrides

    # Collect the case-folded column names, noting any collisions.
    folded_names = set()
    repeated_names = set()
    for name in column_names:
        folded = casefold(name)
        if folded in folded_names:
            repeated_names.add(name)
        else:
            folded_names.add(folded)
    if repeated_names:
        raise ValueError('Duplicate column names: %s' %
            (repr(list(repeated_names)),))

    # Index the overrides by case-folded column name.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown_names = set()
    repeated_overrides = set()
    for name, stattype in overrides:
        folded = casefold(name)
        if folded not in folded_names:
            unknown_names.add(name)
        elif folded in override_map:
            repeated_overrides.add(name)
        else:
            override_map[folded] = casefold(stattype)
    if unknown_names:
        raise ValueError('Unknown columns overridden: %s' %
            (repr(list(unknown_names)),))
    if repeated_overrides:
        raise ValueError('Duplicate columns overridden: %s' %
            (repr(list(repeated_overrides)),))

    # Every row must be exactly as wide as the list of column names.
    ncols = len(column_names)
    assert ncols == len(unique(map(casefold, column_names)))
    for ri, row in enumerate(rows):
        width = len(row)
        if width < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d' %
                (ri, width, ncols))
        if width > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d' %
                (ri, width, ncols))

    # An explicitly overridden key takes precedence over any guess, so
    # settle it before looking at the remaining columns.
    key = None
    extra_keys = set()
    for ci, column_name in enumerate(column_names):
        if override_map.get(casefold(column_name)) != 'key':
            continue
        if key is not None:
            extra_keys.add(column_name)
            continue
        # Use the integer form when available, else the raw values.
        candidate = integerify(rows, ci) or [row[ci] for row in rows]
        if not keyable_p(candidate):
            raise ValueError('Column non-unique but specified as key'
                ': %s' % (repr(column_name),))
        key = column_name
    if extra_keys:
        raise ValueError('Multiple columns overridden as keys: %s' %
            (repr(list(extra_keys)),))

    # Finally assign each column its override or a guessed stattype.
    stattypes = []
    for ci, column_name in enumerate(column_names):
        folded = casefold(column_name)
        if folded in override_map:
            stattypes.append(override_map[folded])
            continue

        # Try integers, then floats; fall back to the raw values and
        # remember that the column is not numeric.
        column = integerify(rows, ci) or floatify(rows, ci)
        numericable = bool(column)
        if not numericable:
            column = [row[ci] for row in rows]

        if key is None and keyable_p(column):
            key = column_name
            stattypes.append('key')
        elif numericable and \
             numerical_p(column, count_cutoff, ratio_cutoff):
            stattypes.append('numerical')
        else:
            stattypes.append('categorical')
    return stattypes