def _separate_list_columns(self, gdf):
    lists, scalars = [], []
    for col in gdf.columns:
        if is_list_dtype(gdf[col]):
            lists.append(col)
        else:
            scalars.append(col)
    return _get_embedding_order(scalars), _get_embedding_order(lists)
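# Hedged illustration of the list/scalar split above: a minimal, runnable
# sketch using pandas and a simplified stand-in for is_list_dtype. In the
# real code gdf is a GPU dataframe and is_list_dtype comes from the
# library; the column names here are made up.
import pandas as pd

def _is_list_dtype_sketch(ser):
    # simplified stand-in: treat a column as "list" if its values are lists
    return len(ser) > 0 and isinstance(ser.iloc[0], list)

gdf = pd.DataFrame({
    "genres": [[1, 2], [3], [1]],  # multi-hot (list) column
    "user_id": [10, 11, 12],       # scalar column
})
lists = [c for c in gdf.columns if _is_list_dtype_sketch(gdf[c])]
scalars = [c for c in gdf.columns if not _is_list_dtype_sketch(gdf[c])]
# lists == ["genres"], scalars == ["user_id"]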
def __init__(
    self,
    paths_or_dataset,
    batch_size,
    label_names,
    feature_columns=None,
    cat_names=None,
    cont_names=None,
    engine=None,
    shuffle=True,
    seed_fn=None,
    buffer_size=0.1,
    device=None,
    parts_per_chunk=1,
    reader_kwargs=None,
    global_size=None,
    global_rank=None,
    drop_last=False,
    sparse_names=None,
    sparse_max=None,
    sparse_as_dense=False,
):
    dataset = _validate_dataset(
        paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
    )
    cat_names, cont_names = _validate_schema(feature_columns, cat_names, cont_names)

    # sort the columns to avoid getting incorrect output
    # (https://github.com/NVIDIA/NVTabular/issues/412)
    cat_names = _get_embedding_order(cat_names)
    cont_names = _get_embedding_order(cont_names)

    device = device or 0
    DataLoader.__init__(
        self,
        dataset,
        cat_names,
        cont_names,
        label_names,
        batch_size,
        shuffle,
        seed_fn=seed_fn,
        parts_per_chunk=parts_per_chunk,
        device=device,
        global_size=global_size,
        global_rank=global_rank,
        drop_last=drop_last,
        sparse_names=sparse_names,
        sparse_max=sparse_max,
        sparse_as_dense=sparse_as_dense,
    )
    self._map_fns = []
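# Hedged usage sketch for the constructor above. `TorchLoader` is a
# placeholder name for whatever class this __init__ belongs to, and the
# path and column names are illustrative, not taken from the original code.
loader = TorchLoader(
    "/data/train/*.parquet",
    batch_size=65536,
    label_names=["click"],
    cat_names=["user_id", "item_id"],
    cont_names=["price", "age"],
    shuffle=True,
    parts_per_chunk=2,
    device=0,
)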
def _get_final_cols(preproc):
    if "cols" not in preproc.columns_ctx["final"]:
        preproc.create_final_cols()
    cat_names = _get_embedding_order(preproc.columns_ctx["final"]["cols"]["categorical"])
    cont_names = sorted(preproc.columns_ctx["final"]["cols"]["continuous"])
    label_name = sorted(preproc.columns_ctx["final"]["cols"]["label"])
    return cat_names, cont_names, label_name
def combine_tensors(cats, conts, label):
    cats_list = [cats[x] for x in _get_embedding_order(cats.keys())] if cats else None
    conts_list = [conts[x] for x in sorted(conts.keys())] if conts else None
    label_list = [label[x] for x in sorted(label.keys())] if label else None

    # stack cats and conts along dim=1 so dim=0 indexes dataframe rows
    # and dim=1 indexes columns; guard against empty column groups, whose
    # lists are None rather than empty
    cats = torch.stack(cats_list, dim=1) if cats_list else None
    conts = torch.stack(conts_list, dim=1) if conts_list else None
    label = torch.cat(label_list, dim=0) if label_list else None
    return cats, conts, label
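# Hedged usage sketch for combine_tensors, assuming _get_embedding_order
# returns the column names in a deterministic (sorted) order. The dicts
# below map column name -> per-column tensor, with made-up values.
import torch

cats = {"item_id": torch.tensor([1, 2, 3]), "user_id": torch.tensor([7, 8, 9])}
conts = {"price": torch.tensor([0.1, 0.2, 0.3])}
label = {"click": torch.tensor([0.0, 1.0, 0.0])}

cats_t, conts_t, label_t = combine_tensors(cats, conts, label)
# cats_t has shape (3, 2): dim=0 indexes rows, dim=1 indexes the two
# categorical columns; conts_t has shape (3, 1); label_t has shape (3,)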
def __init__(
    self,
    paths_or_dataset,
    batch_size,
    label_names,
    feature_columns=None,
    cat_names=None,
    cont_names=None,
    engine=None,
    shuffle=True,
    buffer_size=0.1,
    workflows=None,
    devices=None,
    parts_per_chunk=1,
    reader_kwargs=None,
):
    dataset = _validate_dataset(
        paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
    )
    cat_names, cont_names = _validate_schema(feature_columns, cat_names, cont_names)

    # sort the columns to avoid getting incorrect output
    # (https://github.com/NVIDIA/NVTabular/issues/412)
    cat_names = _get_embedding_order(cat_names)
    cont_names = _get_embedding_order(cont_names)

    # TODO: figure out multi-gpu support
    assert devices is None or len(devices) == 1
    devices = devices or [0]
    DataLoader.__init__(
        self,
        dataset,
        cat_names,
        cont_names,
        label_names,
        batch_size,
        shuffle,
        parts_per_chunk=parts_per_chunk,
        workflows=workflows,
        devices=devices,
    )
def create_tensors(self, gdf, cat_names=None, cont_names=None, label_names=None):
    gdf_cats, gdf_conts, gdf_label = (
        gdf[_get_embedding_order(cat_names)],
        gdf[cont_names],
        gdf[label_names],
    )
    del gdf
    cats = self._to_tensor(gdf_cats, torch.long)
    conts = self._to_tensor(gdf_conts, torch.float32)
    label = self._to_tensor(gdf_label, torch.float32)
    del gdf_cats, gdf_conts, gdf_label
    return [cats, conts, label]
def create_tensors(gdf, cat_names=None, cont_names=None, label_names=None):
    gdf_cats, gdf_conts, gdf_label = (
        gdf[_get_embedding_order(cat_names)],
        gdf[cont_names],
        gdf[label_names],
    )
    del gdf
    # initialize to None so empty column groups don't leave these unbound
    cats = conts = label = None
    if len(gdf_cats) > 0:
        cats = _to_tensor(gdf_cats, torch.long, to_cpu=False)
    if len(gdf_conts) > 0:
        conts = _to_tensor(gdf_conts, torch.float32, to_cpu=False)
    if len(gdf_label) > 0:
        label = _to_tensor(gdf_label, torch.float32, to_cpu=False)
    del gdf_cats, gdf_conts, gdf_label
    return [
        cats[0] if cats is not None else None,
        conts[0] if conts is not None else None,
        label[0] if label is not None else None,
    ]
def _create_tensors(self, gdf):
    """
    Breaks a dataframe down into the relevant
    categorical, continuous, and label tensors.
    Can be overridden
    """
    # TODO: how will this work once we have multi-hots?
    # also seems brittle to labels with mixed types
    gdf_cats, gdf_conts, gdf_label = (
        gdf[_get_embedding_order(self.cat_names)],
        gdf[self.cont_names],
        gdf[self.label_names],
    )
    del gdf
    cats = self._to_tensor(gdf_cats)
    conts = self._to_tensor(gdf_conts)
    label = self._to_tensor(gdf_label)
    del gdf_cats, gdf_conts, gdf_label
    return cats, conts, label
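# Hedged sketch of the override the docstring invites: a subclass (the
# name `LongLabelLoader` is hypothetical) that reuses the base splitting
# logic but casts labels to int64, assuming `label` comes back as a
# single torch tensor.
class LongLabelLoader(DataLoader):
    def _create_tensors(self, gdf):
        cats, conts, label = super()._create_tensors(gdf)
        if label is not None:
            label = label.long()  # e.g. for torch.nn.CrossEntropyLoss
        return cats, conts, label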