def _paths_to_cats(paths, file_scheme):
    """
    Collect partition field names and their values from relative file paths.

    TODO: copied from https://github.com/dask/fastparquet/pull/471; switch to
    fastparquet.api.paths_to_cats once fastparquet>0.3.2 can be required.

    Parameters
    ----------
    paths (Iterable[str]): file paths relative to root
    file_scheme (str): "simple", "flat" and "other" yield no categories;
        "hive" reads the key/value pairs matched by ``ex_from_sep("/")``;
        any other value names each directory level ``dir0``, ``dir1``, ...

    Returns
    -------
    cats (OrderedDict[str, List[Any]]): a dict of field names and their
        values (a plain empty dict for the schemes without categories)

    Raises
    ------
    ValueError: if distinct partition names of one field map to the same
        value after transformation by val_to_num
    """
    # Schemes without partition directories carry no categories at all.
    if file_scheme in ["simple", "flat", "other"]:
        return {}

    converted = OrderedDict()
    originals = OrderedDict()
    pattern = ex_from_sep("/")
    distinct_paths = toolz.unique(paths)

    # Reduce both layouts to one stream of (field name, raw value) pairs.
    if file_scheme == "hive":
        pairs = toolz.unique(
            (name, text)
            for rel_path in distinct_paths
            for name, text in pattern.findall(rel_path)
        )
    else:
        level_values = toolz.unique(
            (depth, text)
            for rel_path in distinct_paths
            # The last component is the file name, not a partition directory.
            for depth, text in enumerate(rel_path.split("/")[:-1])
        )
        pairs = (("dir%i" % depth, text) for depth, text in level_values)

    for name, text in pairs:
        converted.setdefault(name, set()).add(val_to_num(text))
        originals.setdefault(name, set()).add(text)

    for name, values in converted.items():
        raw_names = originals[name]

        # Check that no partition names map to the same value after
        # transformation by val_to_num: fewer converted values than raw
        # names means at least two names collapsed into one.
        if len(values) != len(raw_names):
            names_by_value = OrderedDict()
            for text in raw_names:
                names_by_value.setdefault(val_to_num(text), set()).add(text)
            conflicts = []
            for group in names_by_value.values():
                if len(group) > 1:
                    conflicts.extend(group)
            raise ValueError("Partition names map to the same value: %s" % conflicts)

        # Check that all partition names map to the same type after
        # transformation by val_to_num; mixed types only trigger a warning.
        by_type = groupby_types(values)
        if len(by_type) > 1:
            examples = [members[0] for members in by_type.values()]
            warnings.warn(
                "Partition names coerce to values of different types, e.g. %s"
                % examples
            )

    return OrderedDict((name, list(values)) for name, values in converted.items())
def _paths_to_cats(paths, scheme):
    """Extract out fields and labels from directory names.

    Parameters
    ----------
    paths : iterable of str
        File paths using "/" as separator; the last component of each path
        is treated as the file name and ignored in the non-hive layout.
    scheme : str
        "hive" reads the key/value pairs matched by ``ex_from_sep("/")``;
        any other value names each directory level ``dir0``, ``dir1``, ...

    Returns
    -------
    dict
        Field name -> list of the values seen for that field after
        conversion by ``val_to_num``.

    Raises
    ------
    ValueError
        If distinct partition names of one field map to the same value
        after transformation by ``val_to_num``.
    """
    # can be factored out in fastparquet
    import warnings

    from fastparquet.util import ex_from_sep, val_to_num, groupby_types

    cats = OrderedDict()
    raw_cats = OrderedDict()
    # The pattern does not depend on the path, so build it once instead of
    # once per path.
    s = ex_from_sep("/")

    for path in paths:
        if scheme == "hive":
            partitions = s.findall(path)
        else:
            partitions = (
                ("dir%i" % i, val) for i, val in enumerate(path.split("/")[:-1])
            )
        for key, val in partitions:
            cats.setdefault(key, set()).add(val_to_num(val))
            raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after
        # transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw:
                conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
            conflicts = [
                c for k in conflicts_by_value.values() if len(k) > 1 for c in k
            ]
            raise ValueError("Partition names map to the same value: %s" % conflicts)

        vals_by_type = groupby_types(v)
        # Check that all partition names map to the same type after
        # transformation by val_to_num
        if len(vals_by_type) > 1:
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn(
                "Partition names coerce to values of different"
                " types, e.g. %s" % examples
            )

    return {k: list(v) for k, v in cats.items()}
def _paths_to_cats(paths, scheme):
    """Extract out fields and labels from directory names.

    Parameters
    ----------
    paths : iterable of str
        File paths using '/' as separator; the last component of each path
        is treated as the file name and ignored in the non-hive layout.
    scheme : str
        'hive' reads the key/value pairs matched by ``ex_from_sep('/')``;
        any other value names each directory level ``dir0``, ``dir1``, ...

    Returns
    -------
    dict
        Field name -> list of the values seen for that field after
        conversion by ``val_to_num``.

    Raises
    ------
    ValueError
        If distinct partition names of one field map to the same value
        after transformation by ``val_to_num``.
    """
    # can be factored out in fastparquet
    import warnings

    from fastparquet.util import ex_from_sep, val_to_num, groupby_types

    cats = OrderedDict()
    raw_cats = OrderedDict()
    # The pattern does not depend on the path, so build it once instead of
    # once per path.
    s = ex_from_sep('/')

    for path in paths:
        if scheme == 'hive':
            partitions = s.findall(path)
        else:
            partitions = (('dir%i' % i, val)
                          for i, val in enumerate(path.split('/')[:-1]))
        for key, val in partitions:
            cats.setdefault(key, set()).add(val_to_num(val))
            raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after
        # transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw:
                conflicts_by_value.setdefault(val_to_num(raw_val),
                                              set()).add(raw_val)
            conflicts = [c for k in conflicts_by_value.values()
                         if len(k) > 1 for c in k]
            raise ValueError("Partition names map to the same value: %s"
                             % conflicts)

        vals_by_type = groupby_types(v)
        # Check that all partition names map to the same type after
        # transformation by val_to_num
        if len(vals_by_type) > 1:
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn("Partition names coerce to values of different"
                          " types, e.g. %s" % examples)

    return {k: list(v) for k, v in cats.items()}
def test_val_to_num():
    """Numeric-looking strings, zero-padded ones included, compare equal to numbers."""
    expectations = [
        ('7', 7),
        ('.7', .7),
        ('0.7', .7),
        ('07', 7),
        ('0', 0),
        ('00', 0),
    ]
    for text, number in expectations:
        assert val_to_num(text) == number
def test_val_to_num():
    """Pin what val_to_num returns for numbers, keywords, dates and free text."""
    # Numeric strings (including zero-padded and negative) and real numbers.
    numeric_cases = [
        ('7', 7),
        ('.7', .7),
        ('0.7', .7),
        ('07', 7),
        ('0', 0),
        ('00', 0),
        ('-20', -20),
        (7, 7),
        (0.7, 0.7),
        (0, 0),
    ]
    for raw, expected in numeric_cases:
        assert val_to_num(raw) == expected

    # Date keywords, the empty string and NaN spellings come back unchanged.
    for word in ('NOW', 'now', 'TODAY', '', 'nan', 'NaN'):
        assert val_to_num(word) == word

    # Date-like strings compare equal to the pandas timestamp.
    for stamp in ('2018-10-10', '2018-10-09', '2017-12'):
        assert val_to_num(stamp) == pd.to_datetime(stamp)

    # Scientific notation is read as a float.
    for raw, expected in [('5e+6', 5e6), ('5e-6', 5e-6)]:
        assert val_to_num(raw) == expected

    # Hex literals and ordinary text come back unchanged.
    for word in ('0xabc', 'hello world'):
        assert val_to_num(word) == word

    # The following tests document an idiosyncrasy of val_to_num which is
    # difficult to avoid while timedeltas are supported.
    for span in ('50+20', '50-20'):
        assert val_to_num(span) == pd.to_timedelta(span)
def test_val_to_num():
    """Pin what val_to_num returns for numbers, keywords, dates and free text."""
    # Numeric strings (including zero-padded and negative) and real numbers.
    numeric_cases = [
        ('7', 7),
        ('.7', .7),
        ('0.7', .7),
        ('07', 7),
        ('0', 0),
        ('00', 0),
        ('-20', -20),
        (7, 7),
        (0.7, 0.7),
        (0, 0),
    ]
    for raw, expected in numeric_cases:
        assert val_to_num(raw) == expected

    # Date keywords and the empty string come back unchanged.
    for word in ('NOW', 'now', 'TODAY', ''):
        assert val_to_num(word) == word

    # Date-like strings compare equal to the pandas timestamp.
    for stamp in ('2018-10-10', '2018-10-09', '2017-12'):
        assert val_to_num(stamp) == pd.to_datetime(stamp)

    # Scientific notation is read as a float.
    for raw, expected in [('5e+6', 5e6), ('5e-6', 5e-6)]:
        assert val_to_num(raw) == expected

    # Hex literals and ordinary text come back unchanged.
    for word in ('0xabc', 'hello world'):
        assert val_to_num(word) == word

    # The following tests document an idiosyncrasy of val_to_num which is
    # difficult to avoid while timedeltas are supported.
    for span in ('50+20', '50-20'):
        assert val_to_num(span) == pd.to_timedelta(span)