Example #1
def load_feature_spec(flags):
    if flags.dataset_type == 'synthetic_gpu' and not flags.synthetic_dataset_use_feature_spec:
        num_numerical = flags.synthetic_dataset_numerical_features
        categorical_sizes = [int(s) for s in flags.synthetic_dataset_table_sizes]
        return FeatureSpec.get_default_feature_spec(number_of_numerical_features=num_numerical,
                                                    categorical_feature_cardinalities=categorical_sizes)
    fspec_path = os.path.join(flags.dataset, flags.feature_spec)
    return FeatureSpec.from_yaml(fspec_path)
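The flag definitions themselves are not part of this example. Below is a minimal sketch of the flags that load_feature_spec reads, using plain argparse and illustrative defaults; the real project may define them differently (for example with absl flags):

import argparse

# Hypothetical flag definitions inferred from the attributes accessed above.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset_type', default='parametric', choices=['parametric', 'synthetic_gpu'])
parser.add_argument('--dataset', default='/data/preprocessed', help='base directory of the dataset')
parser.add_argument('--feature_spec', default='feature_spec.yaml', help='file name relative to --dataset')
parser.add_argument('--synthetic_dataset_use_feature_spec', action='store_true')
parser.add_argument('--synthetic_dataset_numerical_features', type=int, default=13)
parser.add_argument('--synthetic_dataset_table_sizes', nargs='+', default=['100000'] * 26)
flags = parser.parse_args()

feature_spec = load_feature_spec(flags)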
Example #2
def split_dataset(dataset_dir: str, output_dir: str, batch_size: int, numerical_features: int):
    categorical_sizes_file = os.path.join(dataset_dir, "model_size.json")
    with open(categorical_sizes_file) as f:
        # model_size.json stores the maximum value of each categorical feature rather than its cardinality,
        # so we add 1 here to obtain the cardinalities the feature spec expects.
        categorical_cardinalities = [int(v)+1 for v in json.load(f).values()]

    train_file = os.path.join(dataset_dir, "train_data.bin")
    test_file = os.path.join(dataset_dir, "test_data.bin")
    val_file = os.path.join(dataset_dir, "validation_data.bin")

    target_train = os.path.join(output_dir, "train")
    target_test = os.path.join(output_dir, "test")
    target_val = os.path.join(output_dir, "validation")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_train, exist_ok=True)
    os.makedirs(target_test, exist_ok=True)
    os.makedirs(target_val, exist_ok=True)

    # VALIDATION chunk is ignored in feature spec on purpose
    feature_spec = FeatureSpec.get_default_feature_spec(number_of_numerical_features=numerical_features,
                                                        categorical_feature_cardinalities=categorical_cardinalities)
    feature_spec.to_yaml(os.path.join(output_dir, 'feature_spec.yaml'))
    split_binary_file(test_file, target_test, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(train_file, target_train, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(val_file, target_val, categorical_cardinalities, numerical_features, batch_size)
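As a quick illustration of the max-value-to-cardinality conversion performed above (the feature names and values in this toy model_size.json are hypothetical):

import json

# Toy model_size.json content: values are the maximum index observed per categorical feature.
example = '{"cat_0": 9, "cat_1": 999, "cat_2": 4}'
categorical_cardinalities = [int(v) + 1 for v in json.loads(example).values()]
print(categorical_cardinalities)  # [10, 1000, 5] -- cardinality is max index + 1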
Example #3
def create_dataset_factory(
        flags,
        feature_spec: FeatureSpec,
        device_mapping: Optional[dict] = None) -> DatasetFactory:
    """
    By default each dataset can be used in single GPU or distributed setting - please keep that in mind when adding
    new datasets. Distributed case requires selection of categorical features provided in `device_mapping`
    (see `DatasetFactory#create_collate_fn`).

    :param flags:
    :param device_mapping: dict, information about model bottom mlp and embeddings devices assignment
    :return:
    """
    dataset_type = flags.dataset_type
    num_numerical_features = feature_spec.get_number_of_numerical_features()
    if is_distributed() or device_mapping:
        assert device_mapping is not None, "Distributed dataset requires information about model device mapping."
        rank = get_rank()
        local_categorical_positions = device_mapping["embedding"][rank]
        numerical_features_enabled = device_mapping["bottom_mlp"] == rank
    else:
        local_categorical_positions = list(
            range(len(feature_spec.get_categorical_feature_names())))
        numerical_features_enabled = True

    if dataset_type == "parametric":
        local_categorical_names = feature_spec.cat_positions_to_names(
            local_categorical_positions)
        return ParametricDatasetFactory(
            flags=flags,
            feature_spec=feature_spec,
            numerical_features_enabled=numerical_features_enabled,
            categorical_features_to_read=local_categorical_names)
    if dataset_type == "synthetic_gpu":
        local_numerical_features = num_numerical_features if numerical_features_enabled else 0
        world_categorical_sizes = feature_spec.get_categorical_sizes()
        local_categorical_sizes = [
            world_categorical_sizes[i] for i in local_categorical_positions
        ]
        return SyntheticGpuDatasetFactory(
            flags,
            local_numerical_features_num=local_numerical_features,
            local_categorical_feature_sizes=local_categorical_sizes)

    raise NotImplementedError(f"unknown dataset type: {dataset_type}")
Example #4
def write_dataset_to_disk(dataset_train: Dataset, dataset_test: Dataset, feature_spec: FeatureSpec,
                          saving_batch_size=512) -> None:
    feature_spec.check_feature_spec()  # We rely on the feature spec being properly formatted

    categorical_features_list = feature_spec.get_categorical_feature_names()
    categorical_features_types = [feature_spec.feature_spec[feature_name][DTYPE_SELECTOR]
                                  for feature_name in categorical_features_list]
    number_of_numerical_features = feature_spec.get_number_of_numerical_features()
    number_of_categorical_features = len(categorical_features_list)

    for mapping_name, dataset in zip((TRAIN_MAPPING, TEST_MAPPING),
                                     (dataset_train, dataset_test)):
        file_streams = []
        label_path, numerical_path, categorical_paths = feature_spec.get_mapping_paths(mapping_name)
        try:
            os.makedirs(os.path.dirname(numerical_path), exist_ok=True)
            numerical_f = open(numerical_path, "wb+")
            file_streams.append(numerical_f)

            os.makedirs(os.path.dirname(label_path), exist_ok=True)
            label_f = open(label_path, 'wb+')
            file_streams.append(label_f)

            categorical_fs = []
            for feature_name in categorical_features_list:
                local_path = categorical_paths[feature_name]
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                fs = open(local_path, 'wb+')
                categorical_fs.append(fs)
                file_streams.append(fs)

            for numerical, categorical, label in tqdm.tqdm(
                    DataLoader(dataset, saving_batch_size),
                    desc=mapping_name + " dataset saving",
                    unit_scale=saving_batch_size
            ):
                assert (numerical.shape[-1] == number_of_numerical_features)
                assert (categorical.shape[-1] == number_of_categorical_features)

                numerical_f.write(numerical.to(torch.float16).cpu().numpy().tobytes())
                label_f.write(label.to(torch.bool).cpu().numpy().tobytes())
                for cat_idx, cat_feature_type in enumerate(categorical_features_types):
                    categorical_fs[cat_idx].write(
                        categorical[:, :, cat_idx].cpu().numpy().astype(cat_feature_type).tobytes())
        finally:
            for stream in file_streams:
                stream.close()
    feature_spec.to_yaml()
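The write loop above fixes the on-disk layout: numerical features are stored as float16, labels as one byte each, and each categorical feature goes to its own file in its declared dtype. A small sketch of the resulting file sizes under hypothetical dataset dimensions:

import numpy as np

num_samples = 1_000_000                           # hypothetical number of rows written
number_of_numerical_features = 13                 # hypothetical
categorical_dtypes = ['int32', 'int32', 'int16']  # hypothetical per-feature dtypes

numerical_bytes = num_samples * number_of_numerical_features * np.dtype(np.float16).itemsize
label_bytes = num_samples * np.dtype(bool).itemsize
categorical_bytes = [num_samples * np.dtype(dt).itemsize for dt in categorical_dtypes]
print(numerical_bytes, label_bytes, categorical_bytes)  # 26000000 1000000 [4000000, 4000000, 2000000]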
Example #5
def main():
    args = parse_args()
    dataset_size = args.size
    fspec_in = FeatureSpec.from_yaml(args.feature_spec_in)
    fspec_in.base_directory = args.output
    cat_cardinalities = fspec_in.get_categorical_sizes()
    cat_names = fspec_in.get_categorical_feature_names()
    cardinalities = {
        name: cardinality
        for name, cardinality in zip(cat_names, cat_cardinalities)
    }
    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    numerical_names_set = set(fspec_in.channel_spec[NUMERICAL_CHANNEL])
    for mapping_name, mapping in fspec_in.source_spec.items():
        for chunk in mapping:
            assert chunk[
                'type'] == 'csv', "Only csv files supported in this generator"
            assert len(
                chunk['files']
            ) == 1, "Only one file per chunk supported in this transcoder"
            path_to_save = os.path.join(fspec_in.base_directory,
                                        chunk['files'][0])
            data = []
            for name in chunk['features']:
                if name == input_label_feature_name:
                    # Random binary labels; randint's upper bound is exclusive, so 2 yields both 0 and 1.
                    data.append(np.random.randint(0, 2, size=dataset_size))
                elif name in numerical_names_set:
                    data.append(np.random.rand(dataset_size))
                else:
                    local_cardinality = cardinalities[name]
                    data.append(
                        np.random.randint(0,
                                          local_cardinality,
                                          size=dataset_size))
            values = np.stack(data).T
            to_save = pd.DataFrame(values, columns=chunk['features'])
            os.makedirs(os.path.dirname(path_to_save), exist_ok=True)
            to_save.to_csv(path_to_save, index=False, header=False)
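For reference, a toy input spec the generator above could consume. The schema shown here (channel_spec / feature_spec / source_spec keys and the 'cardinality', 'type', 'files' and 'features' fields) is only inferred from how fspec_in is accessed in these examples and may not match the real FeatureSpec YAML format exactly:

import yaml  # PyYAML

toy_spec = {
    'channel_spec': {
        'label': ['label'],
        'numerical': ['num_0', 'num_1'],
        'categorical': ['cat_0', 'cat_1'],
    },
    'feature_spec': {
        'label': {},
        'num_0': {},
        'num_1': {},
        'cat_0': {'cardinality': 1000},
        'cat_1': {'cardinality': 50},
    },
    'source_spec': {
        'train': [{'type': 'csv',
                   'features': ['label', 'num_0', 'num_1', 'cat_0', 'cat_1'],
                   'files': ['train/data.csv']}],
        'test': [{'type': 'csv',
                  'features': ['label', 'num_0', 'num_1', 'cat_0', 'cat_1'],
                  'files': ['test/data.csv']}],
    },
}
print(yaml.safe_dump(toy_spec, sort_keys=False))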
Example #6
def main():
    args = parse_args()
    args_output = args.output
    args_input = args.input
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out
    batch_size = args.chunk_size

    fspec_in_path = os.path.join(args_input, args_feature_spec_in)
    fspec_in = FeatureSpec.from_yaml(fspec_in_path)

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    input_numerical_features_list = fspec_in.channel_spec[NUMERICAL_CHANNEL]
    input_categorical_features_list = fspec_in.channel_spec[
        CATEGORICAL_CHANNEL]

    # Do a pass to establish the cardinalities: they influence the type we save the dataset as
    found_cardinalities = defaultdict(lambda: 0)
    for mapping_name, mapping in fspec_in.source_spec.items():
        df_iterators = []
        for chunk in mapping:
            assert chunk[
                'type'] == 'csv', "Only csv files supported in this transcoder"
            assert len(
                chunk['files']
            ) == 1, "Only one file per chunk supported in this transcoder"
            path_to_load = os.path.join(fspec_in.base_directory,
                                        chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load,
                                         header=None,
                                         chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)
            for feature in input_categorical_features_list:
                mapping_cardinality = mapping_df[feature].max() + 1
                previous_cardinality = found_cardinalities[feature]
                found_cardinalities[feature] = max(previous_cardinality,
                                                   mapping_cardinality)

    for feature in input_categorical_features_list:
        declared_cardinality = fspec_in.feature_spec[feature][CARDINALITY_SELECTOR]
        if declared_cardinality != 'auto':
            # A declared cardinality overrides the measured one, but must be at least as large
            assert int(declared_cardinality) >= found_cardinalities[feature]
            found_cardinalities[feature] = int(declared_cardinality)

    categorical_cardinalities = [
        found_cardinalities[f] for f in input_categorical_features_list
    ]
    number_of_numerical_features = fspec_in.get_number_of_numerical_features()

    fspec_out = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=number_of_numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    fspec_out.base_directory = args.output

    for mapping_name, mapping in fspec_in.source_spec.items():

        # open files for outputting
        label_path, numerical_path, categorical_paths = fspec_out.get_mapping_paths(
            mapping_name)
        for path in [label_path, numerical_path, *categorical_paths.values()]:
            os.makedirs(os.path.dirname(path), exist_ok=True)
        output_categorical_features_list = fspec_out.get_categorical_feature_names(
        )
        numerical_f = open(numerical_path, "ab+")
        label_f = open(label_path, "ab+")
        categorical_fs = [
            open(categorical_paths[name], "ab+")
            for name in output_categorical_features_list
        ]
        categorical_feature_types = [
            get_categorical_feature_type(card)
            for card in categorical_cardinalities
        ]

        df_iterators = []
        for chunk in mapping:
            # We checked earlier it's a single file chunk
            path_to_load = os.path.join(fspec_in.base_directory,
                                        chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load,
                                         header=None,
                                         chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(
                chunks, axis=1
            )  # This takes care of making sure feature names are unique

            # Choose the right columns
            numerical_df = mapping_df[input_numerical_features_list]
            categorical_df = mapping_df[input_categorical_features_list]
            label_df = mapping_df[[input_label_feature_name]]

            numerical = torch.tensor(numerical_df.values)
            label = torch.tensor(label_df.values)
            categorical = torch.tensor(categorical_df.values)

            # Append them to the binary files
            numerical_f.write(
                numerical.to(torch.float16).cpu().numpy().tobytes())
            label_f.write(label.to(torch.bool).cpu().numpy().tobytes())
            for cat_idx, cat_feature_type in enumerate(categorical_feature_types):
                categorical_fs[cat_idx].write(
                    categorical[:, cat_idx].cpu().numpy().astype(cat_feature_type).tobytes())

        # Flush and close this mapping's output streams
        for stream in [numerical_f, label_f, *categorical_fs]:
            stream.close()

    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    fspec_out.to_yaml(output_path=feature_spec_save_path)
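get_categorical_feature_type is not shown in these examples. A plausible minimal sketch, assuming it simply picks the narrowest integer dtype able to hold the given cardinality (the real helper may behave differently):

import numpy as np

def get_categorical_feature_type(cardinality: int):
    """Return the narrowest integer dtype that can represent indices in [0, cardinality)."""
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        if cardinality - 1 <= np.iinfo(candidate).max:
            return candidate
    raise ValueError(f"Cardinality too large for an int64 index: {cardinality}")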
Example #7
    def __init__(
            self,
            feature_spec: FeatureSpec,
            mapping: str,
            batch_size: int = 1,
            numerical_features_enabled: bool = False,
            categorical_features_to_read: List[str] = None,  # This parameter dictates order of returned features
            prefetch_depth: int = 10,
            drop_last_batch: bool = False,
            **kwargs
    ):
        self._feature_spec = feature_spec
        self._batch_size = batch_size
        self._mapping = mapping
        feature_spec.check_feature_spec()
        categorical_features = feature_spec.channel_spec[CATEGORICAL_CHANNEL]
        numerical_features = feature_spec.channel_spec[NUMERICAL_CHANNEL]
        label_features = feature_spec.channel_spec[LABEL_CHANNEL]

        set_of_categorical_features = set(categorical_features)
        set_of_numerical_features = set(numerical_features)
        set_of_label_features = set(label_features)

        set_of_categoricals_to_read = set(categorical_features_to_read)
        bytes_per_feature = {feature_name: np.dtype(feature_spec.feature_spec[feature_name][DTYPE_SELECTOR]).itemsize
                             for feature_name in feature_spec.feature_spec.keys()}

        self._numerical_features_file = None
        self._label_file = None
        self._numerical_bytes_per_batch = bytes_per_feature[numerical_features[0]] * \
                                          len(numerical_features) * batch_size
        self._label_bytes_per_batch = np.dtype(bool).itemsize * batch_size
        self._number_of_numerical_features = len(numerical_features)

        chosen_mapping = feature_spec.source_spec[mapping]
        categorical_feature_files = {}
        root_path = feature_spec.base_directory
        number_of_batches = None
        for chunk in chosen_mapping:
            contained_features = chunk[FEATURES_SELECTOR]
            containing_file = chunk[FILES_SELECTOR][0]
            first_feature = contained_features[0]

            if first_feature in set_of_categorical_features:
                # Load categorical
                if first_feature not in set_of_categoricals_to_read:
                    continue  # skip chunk

                path_to_open = os.path.join(root_path, containing_file)
                cat_file = os.open(path_to_open, os.O_RDONLY)
                bytes_per_batch = bytes_per_feature[first_feature] * self._batch_size
                batch_num_float = os.fstat(cat_file).st_size / bytes_per_batch
                categorical_feature_files[first_feature] = cat_file

            elif first_feature in set_of_numerical_features:
                # Load numerical
                if not numerical_features_enabled:
                    continue  # skip chunk

                path_to_open = os.path.join(root_path, containing_file)
                self._numerical_features_file = os.open(path_to_open, os.O_RDONLY)
                batch_num_float = os.fstat(self._numerical_features_file).st_size / self._numerical_bytes_per_batch

            elif first_feature in set_of_label_features:
                # Load label
                path_to_open = os.path.join(root_path, containing_file)
                self._label_file = os.open(path_to_open, os.O_RDONLY)
                batch_num_float = os.fstat(self._label_file).st_size / self._label_bytes_per_batch

            else:
                raise ValueError("Unknown chunk type")

            local_number_of_batches = math.ceil(batch_num_float) if not drop_last_batch else math.floor(batch_num_float)
            if number_of_batches is not None:
                if local_number_of_batches != number_of_batches:
                    raise ValueError("Size mismatch in data files")
            else:
                number_of_batches = local_number_of_batches

        self._categorical_features_files = None
        if len(categorical_features_to_read) > 0:
            self._categorical_features_files = [categorical_feature_files[feature] for feature in
                                                categorical_features_to_read]
            self._categorical_bytes_per_batch = [bytes_per_feature[feature] * self._batch_size for feature in
                                                 categorical_features_to_read]
            self._categorical_types = [feature_spec.feature_spec[feature][DTYPE_SELECTOR] for feature in
                                       categorical_features_to_read]
        self._num_entries = number_of_batches
        self._prefetch_depth = min(prefetch_depth, self._num_entries)
        self._prefetch_queue = queue.Queue()
        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
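The constructor above only opens raw file descriptors and records per-batch byte counts. As an illustration (not the class's actual read path), a single fixed-size batch could be read back from one of those descriptors with a positional read like this:

import os
import numpy as np
import torch

def read_fixed_size_batch(fd: int, batch_idx: int, bytes_per_batch: int, dtype, features_per_sample: int):
    # os.pread does not move the file offset, so concurrent prefetch threads cannot interfere.
    raw = os.pread(fd, bytes_per_batch, batch_idx * bytes_per_batch)
    array = np.frombuffer(raw, dtype=dtype).reshape(-1, features_per_sample)
    return torch.from_numpy(array.copy())  # copy: frombuffer yields a read-only view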
Example #8
    data_loader_train = dataset_factory.create_data_loader(
        dataset_train, collate_fn=collate_fn, sampler=train_sampler)
    data_loader_test = dataset_factory.create_data_loader(
        dataset_test, collate_fn=collate_fn)
    return data_loader_train, data_loader_test


if __name__ == '__main__':
    print('Dataloader benchmark')

    parser = argparse.ArgumentParser()
    parser.add_argument('--fspec_path', type=str)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--steps', type=int, default=1000)
    parser.add_argument('--mapping', type=str, default='train',
                        help='name of the source_spec mapping to benchmark')
    args = parser.parse_args()
    fspec = FeatureSpec.from_yaml(args.fspec_path)
    dataset = ParametricDataset(
        fspec,
        args.mapping,
        batch_size=args.batch_size,
        numerical_features_enabled=True,
        categorical_features_to_read=fspec.get_categorical_feature_names())
    begin = time.time()
    for i in range(args.steps):
        _ = dataset[i]
    end = time.time()

    step_time = (end - begin) / args.steps
    throughput = args.batch_size / step_time

    print(f'Mean step time: {step_time:.6f} [s]')
    print(f'Mean throughput: {throughput:.2f} [samples/s]')
Example #9
def get_embedding_sizes(fspec: FeatureSpec, max_table_size: Optional[int]) -> List[int]:
    if max_table_size is not None:
        return [min(s, max_table_size) for s in fspec.get_categorical_sizes()]
    else:
        return fspec.get_categorical_sizes()
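A short usage example of the capping behaviour above, with a stand-in object in place of a real FeatureSpec so the snippet is self-contained:

class _FakeSpec:
    """Stand-in exposing only the method get_embedding_sizes needs."""
    def get_categorical_sizes(self):
        return [1000, 50_000_000, 7]

print(get_embedding_sizes(_FakeSpec(), max_table_size=None))        # [1000, 50000000, 7]
print(get_embedding_sizes(_FakeSpec(), max_table_size=10_000_000))  # [1000, 10000000, 7]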