def _paths_to_cats(paths, file_scheme): """ Extract categorical fields and labels from hive- or drill-style paths. FixMe: This has been pasted from https://github.com/dask/fastparquet/pull/471 Use fastparquet.api.paths_to_cats from fastparquet>0.3.2 instead. Parameters ---------- paths (Iterable[str]): file paths relative to root file_scheme (str): Returns ------- cats (OrderedDict[str, List[Any]]): a dict of field names and their values """ if file_scheme in ["simple", "flat", "other"]: cats = {} return cats cats = OrderedDict() raw_cats = OrderedDict() s = ex_from_sep("/") paths = toolz.unique(paths) if file_scheme == "hive": partitions = toolz.unique((k, v) for path in paths for k, v in s.findall(path)) for key, val in partitions: cats.setdefault(key, set()).add(val_to_num(val)) raw_cats.setdefault(key, set()).add(val) else: i_val = toolz.unique( (i, val) for path in paths for i, val in enumerate(path.split("/")[:-1]) ) for i, val in i_val: key = "dir%i" % i cats.setdefault(key, set()).add(val_to_num(val)) raw_cats.setdefault(key, set()).add(val) for key, v in cats.items(): # Check that no partition names map to the same value after transformation by val_to_num raw = raw_cats[key] if len(v) != len(raw): conflicts_by_value = OrderedDict() for raw_val in raw_cats[key]: conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val) conflicts = [ c for k in conflicts_by_value.values() if len(k) > 1 for c in k ] raise ValueError("Partition names map to the same value: %s" % conflicts) vals_by_type = groupby_types(v) # Check that all partition names map to the same type after transformation by val_to_num if len(vals_by_type) > 1: examples = [x[0] for x in vals_by_type.values()] warnings.warn( "Partition names coerce to values of different types, e.g. %s" % examples ) cats = OrderedDict([(key, list(v)) for key, v in cats.items()]) return cats
def _paths_to_cats(paths, scheme):
    """Extract out fields and labels from directory names.

    Parameters
    ----------
    paths : iterable of str
        File paths using "/" as separator.
    scheme : str
        With "hive", (field, label) pairs are taken from the matches of the
        ``ex_from_sep("/")`` pattern on each path. With any other value,
        every directory component (all but the last "/"-separated element)
        is labelled ``dir0``, ``dir1``, ... by its depth.

    Returns
    -------
    dict
        Maps each field name to the list of its distinct labels after
        conversion by ``val_to_num``.

    Raises
    ------
    ValueError
        If two distinct raw labels of one field convert to the same value.

    Warns
    -----
    UserWarning
        If the converted labels of one field fall into more than one group
        according to ``groupby_types``.
    """
    # can be factored out in fastparquet
    from fastparquet.util import ex_from_sep, val_to_num, groupby_types

    cats = OrderedDict()
    raw_cats = OrderedDict()
    # The separator pattern does not depend on the path: build it once
    # rather than on every loop iteration.
    s = ex_from_sep("/")

    for path in paths:
        if scheme == "hive":
            partitions = s.findall(path)
            for key, val in partitions:
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)
        else:
            for i, val in enumerate(path.split("/")[:-1]):
                key = "dir%i" % i
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after
        # transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw:
                conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
            conflicts = [
                c for k in conflicts_by_value.values() if len(k) > 1 for c in k
            ]
            raise ValueError("Partition names map to the same value: %s" % conflicts)
        vals_by_type = groupby_types(v)

        # Check that all partition names map to the same type after
        # transformation by val_to_num
        if len(vals_by_type) > 1:
            import warnings

            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn(
                "Partition names coerce to values of different"
                " types, e.g. %s" % examples
            )
    return {k: list(v) for k, v in cats.items()}
def _paths_to_cats(paths, scheme):
    """Extract out fields and labels from directory names.

    Parameters
    ----------
    paths : iterable of str
        File paths using '/' as separator.
    scheme : str
        With 'hive', (field, label) pairs are taken from the matches of the
        ``ex_from_sep('/')`` pattern on each path. With any other value,
        every directory component (all but the last '/'-separated element)
        is labelled ``dir0``, ``dir1``, ... by its depth.

    Returns
    -------
    dict
        Maps each field name to the list of its distinct labels after
        conversion by ``val_to_num``.

    Raises
    ------
    ValueError
        If two distinct raw labels of one field convert to the same value.

    Warns
    -----
    UserWarning
        If the converted labels of one field fall into more than one group
        according to ``groupby_types``.
    """
    # can be factored out in fastparquet
    from fastparquet.util import ex_from_sep, val_to_num, groupby_types

    cats = OrderedDict()
    raw_cats = OrderedDict()
    # Loop-invariant: the separator pattern is the same for every path, so
    # create it a single time up front.
    s = ex_from_sep('/')

    for path in paths:
        if scheme == 'hive':
            partitions = s.findall(path)
            for key, val in partitions:
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)
        else:
            for i, val in enumerate(path.split('/')[:-1]):
                key = 'dir%i' % i
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after
        # transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw:
                conflicts_by_value.setdefault(val_to_num(raw_val),
                                              set()).add(raw_val)
            conflicts = [c for k in conflicts_by_value.values()
                         if len(k) > 1 for c in k]
            raise ValueError("Partition names map to the same value: %s"
                             % conflicts)
        vals_by_type = groupby_types(v)

        # Check that all partition names map to the same type after
        # transformation by val_to_num
        if len(vals_by_type) > 1:
            import warnings
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn("Partition names coerce to values of different"
                          " types, e.g. %s" % examples)
    return {k: list(v) for k, v in cats.items()}
def test_groupby_types():
    """Check how many groups ``groupby_types`` returns for several inputs."""

    def expect_group_count(values, expected):
        # One assertion per case: the size of the grouping must match.
        assert len(groupby_types(values)) == expected

    # Homogeneous inputs collapse into a single group.
    expect_group_count([1, 2, 3], 1)
    expect_group_count(["1", "2", "3.0"], 1)

    # Inputs mixing kinds of values yield two groups.
    expect_group_count([1, 2, 3.0], 2)
    expect_group_count([1, "2", "3.0"], 2)
    expect_group_count([pd.to_datetime("2000"), "2000"], 2)