示例#1
0
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        seed_fn=None,
        buffer_size=0.1,
        device=None,
        parts_per_chunk=1,
        reader_kwargs=None,
        global_size=None,
        global_rank=None,
        drop_last=False,
        sparse_names=None,
        sparse_max=None,
        sparse_as_dense=False,
    ):
        """Validate the inputs and initialise the underlying ``DataLoader``.

        The constructor performs four steps:

        1. ``paths_or_dataset``, ``batch_size``, ``buffer_size``, ``engine``
           and ``reader_kwargs`` are handed to ``_validate_dataset`` to obtain
           the dataset object.
        2. ``feature_columns``, ``cat_names`` and ``cont_names`` are handed to
           ``_validate_schema``, which returns the categorical and continuous
           column names to use; both are then passed through
           ``_get_embedding_order``.
        3. ``device`` falls back to ``0`` when it is falsy (e.g. ``None``).
        4. Everything is forwarded to ``DataLoader.__init__`` and
           ``self._map_fns`` is set to an empty list.

        ``label_names``, ``shuffle``, ``seed_fn``, ``parts_per_chunk``,
        ``global_size``, ``global_rank``, ``drop_last``, ``sparse_names``,
        ``sparse_max`` and ``sparse_as_dense`` are forwarded to
        ``DataLoader.__init__`` unchanged.
        """
        dataset = _validate_dataset(
            paths_or_dataset,
            batch_size,
            buffer_size,
            engine,
            reader_kwargs,
        )
        resolved_names = _validate_schema(feature_columns, cat_names, cont_names)
        cat_names, cont_names = resolved_names

        # Reorder the columns to avoid getting incorrect output
        # (https://github.com/NVIDIA/NVTabular/issues/412).
        cat_names = _get_embedding_order(cat_names)
        cont_names = _get_embedding_order(cont_names)

        # Any falsy device value (including None) selects device 0.
        if not device:
            device = 0

        loader_options = {
            "seed_fn": seed_fn,
            "parts_per_chunk": parts_per_chunk,
            "device": device,
            "global_size": global_size,
            "global_rank": global_rank,
            "drop_last": drop_last,
            "sparse_names": sparse_names,
            "sparse_max": sparse_max,
            "sparse_as_dense": sparse_as_dense,
        }
        DataLoader.__init__(
            self,
            dataset,
            cat_names,
            cont_names,
            label_names,
            batch_size,
            shuffle,
            **loader_options,
        )

        # Starts out empty; nothing in this constructor populates it.
        self._map_fns = []
示例#2
0
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names=None,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        seed_fn=None,
        buffer_size=0.1,
        device=None,
        parts_per_chunk=1,
        reader_kwargs=None,
        global_size=None,
        global_rank=None,
        drop_last=False,
        sparse_names=None,
        sparse_max=None,
        sparse_as_dense=False,
        schema=None,
    ):
        """Validate the inputs and initialise the underlying ``DataLoader``.

        Steps performed, in order:

        1. ``paths_or_dataset``, ``batch_size``, ``buffer_size``, ``engine``
           and ``reader_kwargs`` are handed to ``_validate_dataset`` to obtain
           the dataset object.
        2. When ``schema`` is falsy (e.g. ``None``), it is replaced by the
           result of ``_get_schema(dataset)``.
        3. ``feature_columns``, ``cat_names``, ``cont_names`` and the schema
           are handed to ``_validate_schema``, which returns the categorical
           and continuous column names to use.
        4. ``device`` falls back to ``0`` when falsy, and is overridden with
           ``"cpu"`` whenever the module-level ``HAS_GPU`` flag is falsy.
        5. Everything is forwarded to ``DataLoader.__init__`` and
           ``self._map_fns`` is set to an empty list.

        ``label_names``, ``shuffle``, ``seed_fn``, ``parts_per_chunk``,
        ``global_size``, ``global_rank``, ``drop_last``, ``sparse_names``,
        ``sparse_max`` and ``sparse_as_dense`` are forwarded to
        ``DataLoader.__init__`` unchanged.
        """
        dataset = _validate_dataset(
            paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
        )

        # Only derive a schema from the dataset when the caller gave none.
        if not schema:
            schema = _get_schema(dataset)

        resolved_names = _validate_schema(
            feature_columns, cat_names, cont_names, schema=schema
        )
        cat_names, cont_names = resolved_names

        # Falsy device -> 0; without a GPU the device is always "cpu",
        # regardless of what the caller requested.
        if not device:
            device = 0
        if not HAS_GPU:
            device = "cpu"

        loader_options = {
            "cat_names": cat_names,
            "cont_names": cont_names,
            "label_names": label_names,
            "seed_fn": seed_fn,
            "parts_per_chunk": parts_per_chunk,
            "device": device,
            "global_size": global_size,
            "global_rank": global_rank,
            "drop_last": drop_last,
            "sparse_names": sparse_names,
            "sparse_max": sparse_max,
            "sparse_as_dense": sparse_as_dense,
        }
        DataLoader.__init__(self, dataset, batch_size, shuffle, **loader_options)

        # Starts out empty; nothing in this constructor populates it.
        self._map_fns = []
示例#3
0
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        buffer_size=0.1,
        workflows=None,
        devices=None,
        parts_per_chunk=1,
        reader_kwargs=None,
    ):
        """Validate the inputs and initialise the underlying ``DataLoader``.

        Steps performed, in order:

        1. ``paths_or_dataset``, ``batch_size``, ``buffer_size``, ``engine``
           and ``reader_kwargs`` are handed to ``_validate_dataset`` to obtain
           the dataset object.
        2. ``feature_columns``, ``cat_names`` and ``cont_names`` are handed to
           ``_validate_schema``, which returns the categorical and continuous
           column names to use; both are then passed through
           ``_get_embedding_order``.
        3. ``devices`` must be ``None`` or contain exactly one entry (checked
           with ``assert``); a falsy value is replaced by ``[0]``.
        4. Everything is forwarded to ``DataLoader.__init__``.

        ``label_names``, ``shuffle``, ``parts_per_chunk`` and ``workflows`` are
        forwarded to ``DataLoader.__init__`` unchanged.

        Raises:
            AssertionError: if ``devices`` is given and its length is not 1
                (not raised when Python runs with ``-O``, which strips
                ``assert`` statements).
        """
        dataset = _validate_dataset(
            paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
        )
        resolved_names = _validate_schema(feature_columns, cat_names, cont_names)
        cat_names, cont_names = resolved_names

        # Reorder the columns to avoid getting incorrect output
        # (https://github.com/NVIDIA/NVTabular/issues/412).
        cat_names = _get_embedding_order(cat_names)
        cont_names = _get_embedding_order(cont_names)

        # TODO: figure out multi-gpu support
        if devices is not None:
            assert len(devices) == 1
        if not devices:
            devices = [0]

        loader_options = {
            "parts_per_chunk": parts_per_chunk,
            "workflows": workflows,
            "devices": devices,
        }
        DataLoader.__init__(
            self,
            dataset,
            cat_names,
            cont_names,
            label_names,
            batch_size,
            shuffle,
            **loader_options,
        )