import json
import os
from collections import defaultdict

import pandas as pd
import torch

# FeatureSpec, split_binary_file, get_categorical_feature_type, parse_args and the
# channel/cardinality constants used below are provided elsewhere in this repository.


def split_dataset(dataset_dir: str, output_dir: str, batch_size: int, numerical_features: int):
    categorical_sizes_file = os.path.join(dataset_dir, "model_size.json")
    with open(categorical_sizes_file) as f:
        # model_size.json contains the max value of each feature instead of the cardinality.
        # For feature spec this is changed for consistency and clarity.
        categorical_cardinalities = [int(v) + 1 for v in json.load(f).values()]

    train_file = os.path.join(dataset_dir, "train_data.bin")
    test_file = os.path.join(dataset_dir, "test_data.bin")
    val_file = os.path.join(dataset_dir, "validation_data.bin")

    target_train = os.path.join(output_dir, "train")
    target_test = os.path.join(output_dir, "test")
    target_val = os.path.join(output_dir, "validation")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_train, exist_ok=True)
    os.makedirs(target_test, exist_ok=True)
    os.makedirs(target_val, exist_ok=True)

    # VALIDATION chunk is ignored in feature spec on purpose
    feature_spec = FeatureSpec.get_default_feature_spec(number_of_numerical_features=numerical_features,
                                                        categorical_feature_cardinalities=categorical_cardinalities)
    feature_spec.to_yaml(os.path.join(output_dir, 'feature_spec.yaml'))
    split_binary_file(test_file, target_test, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(train_file, target_train, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(val_file, target_val, categorical_cardinalities, numerical_features, batch_size)
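
# Illustrative usage sketch only: invoking split_dataset on a preprocessed binary
# dataset with 13 numerical features. The paths and batch size below are hypothetical
# placeholders, not values taken from this repository's configuration.
def _example_split_dataset():
    split_dataset(dataset_dir="/data/dlrm/binary_dataset",
                  output_dir="/data/dlrm/split_dataset",
                  batch_size=65536,
                  numerical_features=13)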
def load_feature_spec(flags):
    # For the synthetic GPU dataset without an explicit feature spec, build a default
    # spec from the flags; otherwise read the spec YAML from the dataset directory.
    if flags.dataset_type == 'synthetic_gpu' and not flags.synthetic_dataset_use_feature_spec:
        num_numerical = flags.synthetic_dataset_numerical_features
        categorical_sizes = [int(s) for s in flags.synthetic_dataset_table_sizes]
        return FeatureSpec.get_default_feature_spec(number_of_numerical_features=num_numerical,
                                                    categorical_feature_cardinalities=categorical_sizes)
    fspec_path = os.path.join(flags.dataset, flags.feature_spec)
    return FeatureSpec.from_yaml(fspec_path)
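
# Illustrative usage sketch only: load_feature_spec either builds a default spec for
# the synthetic GPU dataset or reads the feature spec YAML from the dataset directory.
# The Namespace below stands in for the parsed command-line flags; all attribute
# values are hypothetical.
def _example_load_feature_spec():
    from argparse import Namespace
    synthetic_flags = Namespace(dataset_type='synthetic_gpu',
                                synthetic_dataset_use_feature_spec=False,
                                synthetic_dataset_numerical_features=13,
                                synthetic_dataset_table_sizes=['1000', '1000', '1000'],
                                dataset=None,
                                feature_spec='feature_spec.yaml')
    return load_feature_spec(synthetic_flags)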
def main():
    args = parse_args()
    args_output = args.output
    args_input = args.input
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out
    batch_size = args.chunk_size

    fspec_in_path = os.path.join(args_input, args_feature_spec_in)
    fspec_in = FeatureSpec.from_yaml(fspec_in_path)

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    input_numerical_features_list = fspec_in.channel_spec[NUMERICAL_CHANNEL]
    input_categorical_features_list = fspec_in.channel_spec[CATEGORICAL_CHANNEL]

    # Do a pass to establish the cardinalities: they influence the type we save the dataset as
    found_cardinalities = defaultdict(lambda: 0)
    for mapping_name, mapping in fspec_in.source_spec.items():
        df_iterators = []
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this transcoder"
            path_to_load = os.path.join(fspec_in.base_directory, chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load, header=None, chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)
            for feature in input_categorical_features_list:
                mapping_cardinality = mapping_df[feature].max() + 1
                previous_cardinality = found_cardinalities[feature]
                found_cardinalities[feature] = max(previous_cardinality, mapping_cardinality)

    for feature in input_categorical_features_list:
        declared_cardinality = fspec_in.feature_spec[feature][CARDINALITY_SELECTOR]
        if declared_cardinality == 'auto':
            pass
        else:
            assert int(declared_cardinality) >= found_cardinalities[feature]
            found_cardinalities[feature] = int(declared_cardinality)

    categorical_cardinalities = [found_cardinalities[f] for f in input_categorical_features_list]
    number_of_numerical_features = fspec_in.get_number_of_numerical_features()

    fspec_out = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=number_of_numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    fspec_out.base_directory = args.output

    for mapping_name, mapping in fspec_in.source_spec.items():
        # open files for outputting
        label_path, numerical_path, categorical_paths = fspec_out.get_mapping_paths(mapping_name)
        for path in [label_path, numerical_path, *categorical_paths.values()]:
            os.makedirs(os.path.dirname(path), exist_ok=True)
        output_categorical_features_list = fspec_out.get_categorical_feature_names()
        numerical_f = open(numerical_path, "ab+")
        label_f = open(label_path, "ab+")
        categorical_fs = [open(categorical_paths[name], "ab+")
                          for name in output_categorical_features_list]

        categorical_feature_types = [get_categorical_feature_type(card)
                                     for card in categorical_cardinalities]

        df_iterators = []
        for chunk in mapping:
            # We checked earlier it's a single file chunk
            path_to_load = os.path.join(fspec_in.base_directory, chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load, header=None, chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)  # This takes care of making sure feature names are unique

            # Choose the right columns
            numerical_df = mapping_df[input_numerical_features_list]
            categorical_df = mapping_df[input_categorical_features_list]
            label_df = mapping_df[[input_label_feature_name]]

            numerical = torch.tensor(numerical_df.values)
            label = torch.tensor(label_df.values)
            categorical = torch.tensor(categorical_df.values)

            # Append them to the binary files
            numerical_f.write(numerical.to(torch.float16).cpu().numpy().tobytes())
            label_f.write(label.to(torch.bool).cpu().numpy().tobytes())
            for cat_idx, cat_feature_type in enumerate(categorical_feature_types):
                categorical_fs[cat_idx].write(
                    categorical[:, cat_idx].cpu().numpy().astype(cat_feature_type).tobytes())

    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    fspec_out.to_yaml(output_path=feature_spec_save_path)
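
# Illustrative sketch only: a dtype-selection helper in the spirit of the
# get_categorical_feature_type call used above. The actual helper is defined elsewhere
# in this repository; this version simply picks the smallest integer dtype that can
# represent the given cardinality.
def _example_categorical_feature_type(cardinality: int):
    import numpy as np
    for numpy_type in (np.int8, np.int16, np.int32):
        if cardinality < np.iinfo(numpy_type).max:
            return numpy_type
    raise RuntimeError(f"Cardinality {cardinality} is too large to store in int32")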