def load_data(args):
    """Build the data loaders for training/validation or for prediction.

    Args:
        args: parsed command-line namespace; this function reads
            ``data_train``, ``train_val_split``, ``weight_names``,
            ``model_prefix``, ``predict``, ``data_test`` and ``batch_size``.

    Returns:
        A single ``DataLoader`` in predict mode (``args.predict`` truthy),
        otherwise a ``(train, val)`` tuple of ``DataLoader`` objects.
    """
    # Expand the training glob pattern and split it into train/val parts.
    train_val_filelist = glob.glob(args.data_train)
    n_train = int(args.train_val_split * len(train_val_filelist))
    # Empty string means "no event weighting" downstream.
    wgtvar = args.weight_names
    if wgtvar == '': wgtvar = None
    # train_groups, train_vars, label_var, obs_vars, extra_label_vars are
    # module-level configuration (defined outside this view).
    d = DataFormat(train_groups, train_vars, label_var, wgtvar, obs_vars, extra_label_vars=extra_label_vars, filename=train_val_filelist[0])
    logging.info('Using the following variables:\n' + '\n'.join([v_group + '\n\t' + str(train_vars[v_group]) for v_group in train_groups ]))
    logging.info('Using weight\n' + str(wgtvar))
    # Input metadata lives next to the training files; the preprocessing
    # summary is written next to the model checkpoints.
    orig_metadata = os.path.join(os.path.dirname(train_val_filelist[0]), 'metadata.json')
    output_metadata = os.path.join(os.path.dirname(args.model_prefix), 'preprocessing.json')
    if args.predict:
        # Prediction: one unshuffled loader over the test glob.
        test_filelist = glob.glob(args.data_test)
        test = DataLoader(test_filelist, d, batch_size=args.batch_size, predict_mode=True, shuffle=False, args=args)
        return test
    else:
        train = DataLoader(train_val_filelist[:n_train], d, batch_size=args.batch_size, args=args)
        val = DataLoader(train_val_filelist[n_train:], d, batch_size=args.batch_size, args=args)
        # Dump preprocessing metadata only once (skip if it already exists).
        if not os.path.exists(output_metadata):
            train_shapes = {}
            # NOTE(review): assumes provide_data yields (name, shape) pairs
            # (MXNet-style DataIter) — first axis replaced with batch size 1.
            # TODO confirm against DataLoader.
            for k, v in train.provide_data:
                train_shapes[k] = (1,) + v[1:]
            dump_input_metadata(orig_metadata, groups=train_groups, shapes=train_shapes, var_names=train_vars, output=output_metadata)
        return (train, val)
def nb_wgt_samples(files, weight_names):
    """Count weighted events for each glob pattern in *files*.

    Falls back to the plain (unweighted) event count when *weight_names*
    is empty/None.

    Args:
        files: iterable of glob patterns, one per dataset entry.
        weight_names: weight branch name(s) forwarded to
            ``DataFormat.nwgtsum``; falsy disables weighting.

    Returns:
        Tuple of integer counts, one per entry in *files*.
    """
    if not weight_names:
        return nb_samples(files)
    counts = []
    for pattern in files:
        matched_files = glob.glob(pattern)
        # Sum the weighted-event totals over all files matching this pattern.
        weighted_total = sum(DataFormat.nwgtsum(path, weight_names) for path in matched_files)
        counts.append(int(weighted_total))
    return tuple(counts)
def nb_samples(files):
    """Count events for each glob pattern in *files*.

    Each entry is expanded with ``glob.glob`` and the per-file event counts
    (from ``DataFormat.nevts`` using the module-level ``label_var``) are
    summed.

    Args:
        files: iterable of glob patterns, one per dataset entry.

    Returns:
        Tuple of event counts, one per entry in *files*.
    """
    return tuple(
        sum(DataFormat.nevts(path, label_var) for path in glob.glob(pattern))
        for pattern in files
    )
def nb_classes(filename):
    """Return the number of target classes stored in *filename*.

    Thin wrapper over ``DataFormat.num_classes`` using the module-level
    ``label_var`` as the label branch.
    """
    num = DataFormat.num_classes(filename, label_var)
    return num