def load_feature_spec(flags):
    if flags.dataset_type == 'synthetic_gpu' and not flags.synthetic_dataset_use_feature_spec:
        num_numerical = flags.synthetic_dataset_numerical_features
        categorical_sizes = [int(s) for s in flags.synthetic_dataset_table_sizes]
        return FeatureSpec.get_default_feature_spec(number_of_numerical_features=num_numerical,
                                                    categorical_feature_cardinalities=categorical_sizes)
    fspec_path = os.path.join(flags.dataset, flags.feature_spec)
    return FeatureSpec.from_yaml(fspec_path)
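# For reference, the rough shape of the YAML that `FeatureSpec.from_yaml` reads - an
# illustrative sketch reconstructed from the accessors used in this module (channel_spec,
# feature_spec with dtype/cardinality entries, source_spec with typed chunks); the exact
# feature names, dtypes and chunk types below are assumptions:
#
#   channel_spec:
#     label: [label]
#     numerical: [num_0, num_1]
#     categorical: [cat_0.bin, cat_1.bin]
#   feature_spec:
#     label: {dtype: bool}
#     num_0: {dtype: float16}
#     num_1: {dtype: float16}
#     cat_0.bin: {dtype: int32, cardinality: 7912889}
#     cat_1.bin: {dtype: int16, cardinality: 33823}
#   source_spec:
#     train:
#       - {type: split_binary, features: [label], files: [train/label.bin]}
#       - {type: split_binary, features: [num_0, num_1], files: [train/numerical.bin]}
#       - {type: split_binary, features: [cat_0.bin], files: [train/cat_0.bin]}
#       - {type: split_binary, features: [cat_1.bin], files: [train/cat_1.bin]}
#     test:
#       - ...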
def split_dataset(dataset_dir: str, output_dir: str, batch_size: int, numerical_features: int):
    categorical_sizes_file = os.path.join(dataset_dir, "model_size.json")
    with open(categorical_sizes_file) as f:
        # model_size.json contains the max value of each feature instead of the cardinality.
        # For feature spec this is changed for consistency and clarity.
        categorical_cardinalities = [int(v) + 1 for v in json.load(f).values()]

    train_file = os.path.join(dataset_dir, "train_data.bin")
    test_file = os.path.join(dataset_dir, "test_data.bin")
    val_file = os.path.join(dataset_dir, "validation_data.bin")

    target_train = os.path.join(output_dir, "train")
    target_test = os.path.join(output_dir, "test")
    target_val = os.path.join(output_dir, "validation")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_train, exist_ok=True)
    os.makedirs(target_test, exist_ok=True)
    os.makedirs(target_val, exist_ok=True)

    # VALIDATION chunk is ignored in feature spec on purpose
    feature_spec = FeatureSpec.get_default_feature_spec(number_of_numerical_features=numerical_features,
                                                        categorical_feature_cardinalities=categorical_cardinalities)
    feature_spec.to_yaml(os.path.join(output_dir, 'feature_spec.yaml'))

    split_binary_file(test_file, target_test, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(train_file, target_train, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(val_file, target_val, categorical_cardinalities, numerical_features, batch_size)
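# Illustrative input for the "+1" adjustment above (feature names are hypothetical): if
# model_size.json contains {"cat_0": 7912888, "cat_1": 33822} - the maximum observed value
# per feature - the resulting categorical_cardinalities list is [7912889, 33823].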
def create_dataset_factory(flags, feature_spec: FeatureSpec,
                           device_mapping: Optional[dict] = None) -> DatasetFactory:
    """
    By default, each dataset can be used in a single-GPU or distributed setting - please keep that in mind when
    adding new datasets. The distributed case requires a selection of categorical features, provided in
    `device_mapping` (see `DatasetFactory#create_collate_fn`).

    :param flags: parsed command-line flags describing the dataset type and its location
    :param feature_spec: feature specification of the dataset to load
    :param device_mapping: dict, information about model bottom mlp and embeddings devices assignment
    :return: a DatasetFactory for the requested dataset type
    """
    dataset_type = flags.dataset_type
    num_numerical_features = feature_spec.get_number_of_numerical_features()

    if is_distributed() or device_mapping:
        assert device_mapping is not None, "Distributed dataset requires information about model device mapping."
        rank = get_rank()
        local_categorical_positions = device_mapping["embedding"][rank]
        numerical_features_enabled = device_mapping["bottom_mlp"] == rank
    else:
        local_categorical_positions = list(range(len(feature_spec.get_categorical_feature_names())))
        numerical_features_enabled = True

    if dataset_type == "parametric":
        local_categorical_names = feature_spec.cat_positions_to_names(local_categorical_positions)
        return ParametricDatasetFactory(flags=flags,
                                        feature_spec=feature_spec,
                                        numerical_features_enabled=numerical_features_enabled,
                                        categorical_features_to_read=local_categorical_names)
    if dataset_type == "synthetic_gpu":
        local_numerical_features = num_numerical_features if numerical_features_enabled else 0
        world_categorical_sizes = feature_spec.get_categorical_sizes()
        local_categorical_sizes = [world_categorical_sizes[i] for i in local_categorical_positions]
        return SyntheticGpuDatasetFactory(flags,
                                          local_numerical_features_num=local_numerical_features,
                                          local_categorical_feature_sizes=local_categorical_sizes)

    raise NotImplementedError(f"Unknown dataset type: {dataset_type}")
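# Shape of `device_mapping` assumed by the code above - an illustrative sketch for 2 ranks
# and 4 categorical features; the keys "embedding" and "bottom_mlp" come from the accesses above:
#
#   device_mapping = {
#       "embedding": [[0, 1], [2, 3]],  # device_mapping["embedding"][rank] -> categorical feature positions held by that rank
#       "bottom_mlp": 0,                # rank that also runs the bottom MLP, i.e. reads the numerical features
#   }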
def write_dataset_to_disk(dataset_train: Dataset, dataset_test: Dataset, feature_spec: FeatureSpec,
                          saving_batch_size=512) -> None:
    feature_spec.check_feature_spec()  # We rely on the feature spec being properly formatted

    categorical_features_list = feature_spec.get_categorical_feature_names()
    categorical_features_types = [feature_spec.feature_spec[feature_name][DTYPE_SELECTOR]
                                  for feature_name in categorical_features_list]
    number_of_numerical_features = feature_spec.get_number_of_numerical_features()
    number_of_categorical_features = len(categorical_features_list)

    for mapping_name, dataset in zip((TRAIN_MAPPING, TEST_MAPPING), (dataset_train, dataset_test)):
        file_streams = []
        label_path, numerical_path, categorical_paths = feature_spec.get_mapping_paths(mapping_name)
        try:
            os.makedirs(os.path.dirname(numerical_path), exist_ok=True)
            numerical_f = open(numerical_path, "wb+")
            file_streams.append(numerical_f)

            os.makedirs(os.path.dirname(label_path), exist_ok=True)
            label_f = open(label_path, 'wb+')
            file_streams.append(label_f)

            categorical_fs = []
            for feature_name in categorical_features_list:
                local_path = categorical_paths[feature_name]
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                fs = open(local_path, 'wb+')
                categorical_fs.append(fs)
                file_streams.append(fs)

            for numerical, categorical, label in tqdm.tqdm(
                    DataLoader(dataset, saving_batch_size),
                    desc=mapping_name + " dataset saving",
                    unit_scale=saving_batch_size
            ):
                assert numerical.shape[-1] == number_of_numerical_features
                assert categorical.shape[-1] == number_of_categorical_features

                numerical_f.write(numerical.to(torch.float16).cpu().numpy().tobytes())
                label_f.write(label.to(torch.bool).cpu().numpy().tobytes())
                for cat_idx, cat_feature_type in enumerate(categorical_features_types):
                    categorical_fs[cat_idx].write(
                        categorical[:, :, cat_idx].cpu().numpy().astype(cat_feature_type).tobytes())
        finally:
            for stream in file_streams:
                stream.close()
    feature_spec.to_yaml()
def main():
    args = parse_args()
    dataset_size = args.size
    fspec_in = FeatureSpec.from_yaml(args.feature_spec_in)
    fspec_in.base_directory = args.output

    cat_cardinalities = fspec_in.get_categorical_sizes()
    cat_names = fspec_in.get_categorical_feature_names()
    cardinalities = {name: cardinality for name, cardinality in zip(cat_names, cat_cardinalities)}

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    numerical_names_set = set(fspec_in.channel_spec[NUMERICAL_CHANNEL])

    for mapping_name, mapping in fspec_in.source_spec.items():
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this generator"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this generator"
            path_to_save = os.path.join(fspec_in.base_directory, chunk['files'][0])

            data = []
            for name in chunk['features']:
                if name == input_label_feature_name:
                    # Binary labels: randint's upper bound is exclusive, so use 2 to generate both 0 and 1
                    data.append(np.random.randint(0, 2, size=dataset_size))
                elif name in numerical_names_set:
                    data.append(np.random.rand(dataset_size))
                else:
                    local_cardinality = cardinalities[name]
                    data.append(np.random.randint(0, local_cardinality, size=dataset_size))

            values = np.stack(data).T
            to_save = pd.DataFrame(values, columns=chunk['features'])
            os.makedirs(os.path.dirname(path_to_save), exist_ok=True)
            to_save.to_csv(path_to_save, index=False, header=False)
def main():
    args = parse_args()
    args_output = args.output
    args_input = args.input
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out
    batch_size = args.chunk_size

    fspec_in_path = os.path.join(args_input, args_feature_spec_in)
    fspec_in = FeatureSpec.from_yaml(fspec_in_path)

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    input_numerical_features_list = fspec_in.channel_spec[NUMERICAL_CHANNEL]
    input_categorical_features_list = fspec_in.channel_spec[CATEGORICAL_CHANNEL]

    # Do a pass to establish the cardinalities: they influence the type we save the dataset as
    found_cardinalities = defaultdict(lambda: 0)
    for mapping_name, mapping in fspec_in.source_spec.items():
        df_iterators = []
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this transcoder"
            path_to_load = os.path.join(fspec_in.base_directory, chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load, header=None, chunksize=batch_size, names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)
            for feature in input_categorical_features_list:
                mapping_cardinality = mapping_df[feature].max() + 1
                previous_cardinality = found_cardinalities[feature]
                found_cardinalities[feature] = max(previous_cardinality, mapping_cardinality)

    for feature in input_categorical_features_list:
        declared_cardinality = fspec_in.feature_spec[feature][CARDINALITY_SELECTOR]
        if declared_cardinality == 'auto':
            pass
        else:
            assert int(declared_cardinality) >= found_cardinalities[feature]
            found_cardinalities[feature] = int(declared_cardinality)

    categorical_cardinalities = [found_cardinalities[f] for f in input_categorical_features_list]
    number_of_numerical_features = fspec_in.get_number_of_numerical_features()

    fspec_out = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=number_of_numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    fspec_out.base_directory = args.output

    for mapping_name, mapping in fspec_in.source_spec.items():
        # Open files for outputting
        label_path, numerical_path, categorical_paths = fspec_out.get_mapping_paths(mapping_name)
        for path in [label_path, numerical_path, *categorical_paths.values()]:
            os.makedirs(os.path.dirname(path), exist_ok=True)
        output_categorical_features_list = fspec_out.get_categorical_feature_names()

        numerical_f = open(numerical_path, "ab+")
        label_f = open(label_path, "ab+")
        categorical_fs = [open(categorical_paths[name], "ab+") for name in output_categorical_features_list]
        categorical_feature_types = [get_categorical_feature_type(card) for card in categorical_cardinalities]

        df_iterators = []
        for chunk in mapping:
            # We checked earlier that it's a single-file chunk
            path_to_load = os.path.join(fspec_in.base_directory, chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load, header=None, chunksize=batch_size, names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)  # This takes care of making sure feature names are unique

            # Choose the right columns
            numerical_df = mapping_df[input_numerical_features_list]
            categorical_df = mapping_df[input_categorical_features_list]
            label_df = mapping_df[[input_label_feature_name]]

            numerical = torch.tensor(numerical_df.values)
            label = torch.tensor(label_df.values)
            categorical = torch.tensor(categorical_df.values)

            # Append them to the binary files
            numerical_f.write(numerical.to(torch.float16).cpu().numpy().tobytes())
            label_f.write(label.to(torch.bool).cpu().numpy().tobytes())
            for cat_idx, cat_feature_type in enumerate(categorical_feature_types):
                categorical_fs[cat_idx].write(
                    categorical[:, cat_idx].cpu().numpy().astype(cat_feature_type).tobytes())

    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    fspec_out.to_yaml(output_path=feature_spec_save_path)
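# `get_categorical_feature_type` is called above but not defined in this file. A minimal
# sketch of what it plausibly does (assumption: pick the smallest integer dtype that can
# hold the feature's cardinality, matching the per-feature dtypes stored in the feature spec):
def get_categorical_feature_type(size: int):
    types = (np.int8, np.int16, np.int32)
    for numpy_type in types:
        if size < np.iinfo(numpy_type).max:
            return numpy_type
    raise RuntimeError(f"Categorical feature of size {size} is too large for the defined types")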
def __init__(
        self,
        feature_spec: FeatureSpec,
        mapping: str,
        batch_size: int = 1,
        numerical_features_enabled: bool = False,
        categorical_features_to_read: List[str] = None,  # This parameter dictates order of returned features
        prefetch_depth: int = 10,
        drop_last_batch: bool = False,
        **kwargs
):
    self._feature_spec = feature_spec
    self._batch_size = batch_size
    self._mapping = mapping
    feature_spec.check_feature_spec()

    categorical_features = feature_spec.channel_spec[CATEGORICAL_CHANNEL]
    numerical_features = feature_spec.channel_spec[NUMERICAL_CHANNEL]
    label_features = feature_spec.channel_spec[LABEL_CHANNEL]
    set_of_categorical_features = set(categorical_features)
    set_of_numerical_features = set(numerical_features)
    set_of_label_features = set(label_features)
    set_of_categoricals_to_read = set(categorical_features_to_read)
    bytes_per_feature = {feature_name: np.dtype(feature_spec.feature_spec[feature_name][DTYPE_SELECTOR]).itemsize
                         for feature_name in feature_spec.feature_spec.keys()}

    self._numerical_features_file = None
    self._label_file = None
    self._numerical_bytes_per_batch = bytes_per_feature[numerical_features[0]] * \
                                      len(numerical_features) * batch_size
    self._label_bytes_per_batch = np.dtype(np.bool_).itemsize * batch_size
    self._number_of_numerical_features = len(numerical_features)

    chosen_mapping = feature_spec.source_spec[mapping]
    categorical_feature_files = {}
    root_path = feature_spec.base_directory
    number_of_batches = None
    for chunk in chosen_mapping:
        contained_features = chunk[FEATURES_SELECTOR]
        containing_file = chunk[FILES_SELECTOR][0]
        first_feature = contained_features[0]

        if first_feature in set_of_categorical_features:
            # Load categorical
            if first_feature not in set_of_categoricals_to_read:
                continue  # skip chunk

            path_to_open = os.path.join(root_path, containing_file)
            cat_file = os.open(path_to_open, os.O_RDONLY)
            bytes_per_batch = bytes_per_feature[first_feature] * self._batch_size
            batch_num_float = os.fstat(cat_file).st_size / bytes_per_batch
            categorical_feature_files[first_feature] = cat_file

        elif first_feature in set_of_numerical_features:
            # Load numerical
            if not numerical_features_enabled:
                continue  # skip chunk

            path_to_open = os.path.join(root_path, containing_file)
            self._numerical_features_file = os.open(path_to_open, os.O_RDONLY)
            batch_num_float = os.fstat(self._numerical_features_file).st_size / self._numerical_bytes_per_batch

        elif first_feature in set_of_label_features:
            # Load label
            path_to_open = os.path.join(root_path, containing_file)
            self._label_file = os.open(path_to_open, os.O_RDONLY)
            batch_num_float = os.fstat(self._label_file).st_size / self._label_bytes_per_batch

        else:
            raise ValueError("Unknown chunk type")

        local_number_of_batches = math.ceil(batch_num_float) if not drop_last_batch else math.floor(batch_num_float)
        if number_of_batches is not None:
            if local_number_of_batches != number_of_batches:
                raise ValueError("Size mismatch in data files")
        else:
            number_of_batches = local_number_of_batches

    self._categorical_features_files = None
    if len(categorical_features_to_read) > 0:
        self._categorical_features_files = [categorical_feature_files[feature]
                                            for feature in categorical_features_to_read]
        self._categorical_bytes_per_batch = [bytes_per_feature[feature] * self._batch_size
                                             for feature in categorical_features_to_read]
        self._categorical_types = [feature_spec.feature_spec[feature][DTYPE_SELECTOR]
                                   for feature in categorical_features_to_read]

    self._num_entries = number_of_batches
    self._prefetch_depth = min(prefetch_depth, self._num_entries)
    self._prefetch_queue = queue.Queue()
    self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
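# The constructor above only opens raw file descriptors and computes fixed per-batch byte
# counts. A minimal sketch of how one raw batch could then be read back using those
# attributes (an assumption about the reader design, not the class's actual __getitem__;
# the helper name is hypothetical):
def _read_raw_batch_sketch(self, idx: int):
    # Labels: one bool byte per sample; batch `idx` starts at idx * bytes_per_batch
    raw_label = os.pread(self._label_file, self._label_bytes_per_batch,
                         idx * self._label_bytes_per_batch)
    label = torch.from_numpy(np.frombuffer(raw_label, dtype=np.bool_).copy())

    # Numerical features: float16 (matching the writers above), all features row-major in one file
    raw_numerical = os.pread(self._numerical_features_file, self._numerical_bytes_per_batch,
                             idx * self._numerical_bytes_per_batch)
    numerical = torch.from_numpy(np.frombuffer(raw_numerical, dtype=np.float16).copy())
    numerical = numerical.view(-1, self._number_of_numerical_features)

    # Categorical features: one file per feature, dtype taken from the feature spec
    categorical = []
    for fd, nbytes, dtype in zip(self._categorical_features_files,
                                 self._categorical_bytes_per_batch,
                                 self._categorical_types):
        raw = os.pread(fd, nbytes, idx * nbytes)
        categorical.append(torch.from_numpy(np.frombuffer(raw, dtype=np.dtype(dtype)).copy()))

    return numerical, categorical, label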
    data_loader_train = dataset_factory.create_data_loader(dataset_train,
                                                           collate_fn=collate_fn,
                                                           sampler=train_sampler)
    data_loader_test = dataset_factory.create_data_loader(dataset_test, collate_fn=collate_fn)
    return data_loader_train, data_loader_test


if __name__ == '__main__':
    print('Dataloader benchmark')

    parser = argparse.ArgumentParser()
    parser.add_argument('--fspec_path', type=str)
    parser.add_argument('--mapping', type=str)  # which source_spec mapping to benchmark; used below
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--steps', type=int, default=1000)
    args = parser.parse_args()

    fspec = FeatureSpec.from_yaml(args.fspec_path)
    dataset = ParametricDataset(fspec,
                                args.mapping,
                                batch_size=args.batch_size,
                                numerical_features_enabled=True,
                                categorical_features_to_read=fspec.get_categorical_feature_names())

    begin = time.time()
    for i in range(args.steps):
        _ = dataset[i]
    end = time.time()

    step_time = (end - begin) / args.steps
    throughput = args.batch_size / step_time
    print(f'Mean step time: {step_time:.6f} [s]')
    print(f'Mean throughput: {throughput:.2f} [samples/s]')
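# Example invocation of this benchmark (script name and paths are hypothetical):
#   python dataloader_benchmark.py --fspec_path /data/dlrm_binary/feature_spec.yaml \
#       --mapping train --batch_size 32768 --steps 100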
def get_embedding_sizes(fspec: FeatureSpec, max_table_size: Optional[int]) -> List[int]:
    if max_table_size is not None:
        return [min(s, max_table_size) for s in fspec.get_categorical_sizes()]
    else:
        return fspec.get_categorical_sizes()
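# Usage sketch (numbers are illustrative): with categorical sizes [1000, 40_000_000, 7] in the
# feature spec and max_table_size=10_000_000, this returns [1000, 10_000_000, 7]; passing
# max_table_size=None returns the feature-spec sizes unchanged.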