def EditTrainers(args):
  """Points every trainer pbtxt at the requested data, representation and model dirs."""
  t_op_files = glob.glob("trainers/*.pbtxt")
  for t_op_file in t_op_files:
    t_op = util.ReadOperation(t_op_file)
    # Layer-1 RBMs and the joint trainer read from data_dir; everything else
    # reads from rep_dir.
    if 'rbm1' in t_op_file or 'joint' in t_op_file:
      t_op.data_proto_prefix = args.data_dir
    else:
      t_op.data_proto_prefix = args.rep_dir
    t_op.checkpoint_directory = args.model_dir
    t_op.batchsize = args.batchsize
    with open(t_op_file, 'w') as f:
      text_format.PrintMessage(t_op, f)
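# Usage sketch for EditTrainers above: it only needs an object exposing
# data_dir, rep_dir, model_dir and batchsize attributes, typically an argparse
# namespace. The flag names and defaults below are illustrative, not taken
# from the original script.
import argparse

parser = argparse.ArgumentParser(description='Rewrite trainer pbtxt paths.')
parser.add_argument('--data_dir', default='data')
parser.add_argument('--rep_dir', default='representations')
parser.add_argument('--model_dir', default='dbn_models')
parser.add_argument('--batchsize', type=int, default=128)
EditTrainers(parser.parse_args())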
def GetPredictions(model_file, train_op_file, output_dir, dataset='test'):
  """Runs the model on the given dataset split and writes output_layer activations."""
  board = tr.LockGPU()
  model = util.ReadModel(model_file)
  # Point the input layer at the requested split, e.g. 'test_data'.
  model.layer[0].data_field.test = '%s_data' % dataset
  train_op = util.ReadOperation(train_op_file)
  train_op.verbose = False
  train_op.get_last_piece = True
  train_op.randomize = False
  layernames = ['output_layer']
  ex.ExtractRepresentations(model_file, train_op, layernames, output_dir)
  tr.FreeGPU(board)
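# Usage sketch: the paths and split name below are placeholders (assumptions);
# GetPredictions, tr.LockGPU/FreeGPU and ex.ExtractRepresentations come from
# the snippet above.
GetPredictions(model_file='models/classifier_BEST.pbtxt',
               train_op_file='trainers/classifiers/split_1.pbtxt',
               output_dir='predictions/split_1',
               dataset='validation')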
def EditTrainers(data_dir, model_dir, rep_dir, numsplits):
  tnames = [
    'train_CD_image_layer1.pbtxt',
    'train_CD_image_layer2.pbtxt',
    'train_CD_text_layer1.pbtxt',
    'train_CD_text_layer2.pbtxt',
    'train_CD_joint_layer.pbtxt'
  ]
  for tname in tnames:
    t_op_file = os.path.join('trainers', 'dbn', tname)
    t_op = util.ReadOperation(t_op_file)
    if 'layer1' in tname:
      t_op.data_proto_prefix = data_dir
    else:
      t_op.data_proto_prefix = rep_dir
    t_op.checkpoint_directory = model_dir
    with open(t_op_file, 'w') as f:
      text_format.PrintMessage(t_op, f)

  t_op_file = os.path.join('trainers', 'classifiers', 'baseclassifier.pbtxt')
  t_op = util.ReadOperation(t_op_file)
  for i in range(1, numsplits + 1):
    t_op_file = os.path.join('trainers', 'classifiers', 'split_%d.pbtxt' % i)
    t_op.data_proto_prefix = rep_dir
    t_op.data_proto = os.path.join('split_%d' % i, 'data.pbtxt')
    t_op.checkpoint_prefix = model_dir
    t_op.checkpoint_directory = os.path.join('classifiers', 'split_%d' % i)
    with open(t_op_file, 'w') as f:
      text_format.PrintMessage(t_op, f)

  # Change prefix in multimodal dbn model.
  mnames = ['multimodal_dbn.pbtxt']
  for mname in mnames:
    model_file = os.path.join('models', mname)
    model = util.ReadModel(model_file)
    model.prefix = model_dir
    with open(model_file, 'w') as f:
      text_format.PrintMessage(model, f)
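# Usage sketch: the directory names and the number of classifier splits below
# are placeholders, chosen only to show the calling convention of EditTrainers.
EditTrainers(data_dir='multimodal/data',
             model_dir='multimodal/dbn_models',
             rep_dir='multimodal/dbn_reps',
             numsplits=5)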
def SetUpTrainer(data_dir, model_dir, representation_dir):
  trainer_config_names = [
    'train_CD_visual_layer1.pbtxt',
    'train_CD_visual_layer2.pbtxt',
    'train_CD_audio_layer1.pbtxt',
    'train_CD_audio_layer2.pbtxt',
    'train_CD_joint_layer.pbtxt'
  ]
  for trainer_config_name in trainer_config_names:
    filename = os.path.join('Trainers', trainer_config_name)
    trainer_operation = util.ReadOperation(filename)
    if 'layer1' in trainer_config_name:
      trainer_operation.data_proto_prefix = data_dir
    else:
      trainer_operation.data_proto_prefix = representation_dir
    trainer_operation.checkpoint_directory = model_dir
    with open(filename, 'w') as f:
      text_format.PrintMessage(trainer_operation, f)
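# Minimal sketch of the pbtxt round-trip these setup helpers rely on, written
# against google.protobuf.text_format directly. It assumes util.ReadOperation
# is a thin wrapper like this and that the Operation message lives in
# deepnet_pb2 (the module referenced elsewhere in these snippets); the actual
# helper may differ.
from google.protobuf import text_format
import deepnet_pb2

def ReadOperationSketch(pbtxt_path):
  """Parses a text-format Operation protobuf from disk."""
  op = deepnet_pb2.Operation()
  with open(pbtxt_path, 'r') as f:
    text_format.Merge(f.read(), op)
  return op

def WriteOperationSketch(op, pbtxt_path):
  """Writes an Operation protobuf back out in text format."""
  with open(pbtxt_path, 'w') as f:
    text_format.PrintMessage(op, f)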
def __init__(self, op, data_name_list, hyperparameter_list, frac=1.0):
  """Initializes a DataHandler.

  Args:
    op: Operation protocol buffer.
    data_name_list: List of data names that should be put together.
      (Usually refers to a list of different modalities, e.g.,
      ['data', 'label'] or ['image', 'audio'].)
    hyperparameter_list: List of hyperparameters, one for each modality.
    frac: What fraction of the total memory should this data handler use.
  """
  filenames = []
  numdim_list = []
  datasetsize = None
  left_window = []
  right_window = []
  stats_files = []
  shift = []
  add_noise = []
  shift_amt_x = []
  shift_amt_y = []
  keys = []
  typesize = 4
  if isinstance(op, str):
    op = util.ReadOperation(op)
  self.verbose = op.verbose
  verbose = self.verbose
  data_proto_file = os.path.join(op.data_proto_prefix, op.data_proto)
  dataset_proto = util.ReadData(data_proto_file)
  seq = False
  is_train = False
  for name, hyp in zip(data_name_list, hyperparameter_list):
    data_proto = next(d for d in dataset_proto.data if d.name == name)
    file_pattern = os.path.join(dataset_proto.prefix, data_proto.file_pattern)
    filenames.append(sorted(glob.glob(file_pattern)))
    stats_files.append(
      os.path.join(dataset_proto.prefix, data_proto.stats_file))
    numdims = np.prod(np.array(data_proto.dimensions))
    if not data_proto.sparse:
      numdims *= data_proto.num_labels
    numdim_list.append(numdims)
    seq = seq or data_proto.seq
    left_window.append(hyp.left_window)
    right_window.append(hyp.right_window)
    add_noise.append(hyp.add_noise)
    shift.append(hyp.shift)
    shift_amt_x.append(hyp.shift_amt_x)
    shift_amt_y.append(hyp.shift_amt_y)
    keys.append(data_proto.key)
    is_train = 'train' in name  # HACK - Fix this!
    if datasetsize is None:
      datasetsize = data_proto.size
    else:
      assert datasetsize == data_proto.size, 'Size of %s is not %d' % (
        name, datasetsize)

  # Add space for padding.
  if seq:
    max_rw = max(right_window)
    max_lw = max(left_window)
    actual_datasetsize = datasetsize
    datasetsize += len(filenames[0]) * (max_rw + max_lw)

  numdims = sum(numdim_list)
  batchsize = op.batchsize
  randomize = op.randomize
  self.get_last_piece = op.get_last_piece

  # Compute size of each cache.
  total_disk_space = datasetsize * numdims * typesize
  max_gpu_capacity = int(frac * GetBytes(dataset_proto.gpu_memory))
  max_cpu_capacity = int(frac * GetBytes(dataset_proto.main_memory))

  # Each capacity should correspond to integral number of batches.
  vectorsize_bytes = typesize * numdims
  batchsize_bytes = vectorsize_bytes * batchsize
  max_gpu_capacity = (max_gpu_capacity / batchsize_bytes) * batchsize_bytes
  #max_cpu_capacity = (max_cpu_capacity / batchsize_bytes) * batchsize_bytes

  # Don't need more than total dataset size.
  gpu_capacity = min(total_disk_space, max_gpu_capacity)
  cpu_capacity = min(total_disk_space, max_cpu_capacity)
  num_gpu_batches = gpu_capacity / batchsize_bytes
  num_cpu_batches = cpu_capacity / batchsize_bytes
  gpu_left_overs = gpu_capacity / vectorsize_bytes - num_gpu_batches * batchsize
  cpu_left_overs = cpu_capacity / vectorsize_bytes - num_cpu_batches * batchsize

  if self.verbose:
    if seq:
      num_valid_gpu_vectors = (
        gpu_capacity / vectorsize_bytes) - len(filenames[0]) * (max_rw + max_lw)
      print num_valid_gpu_vectors
    else:
      print 'Batches in GPU memory: %d + leftovers %d' % (
        num_gpu_batches, gpu_left_overs)
      print 'Batches in main memory: %d + leftovers %d' % (
        num_cpu_batches, cpu_left_overs)
      print 'Batches in disk: %d + leftovers %d' % (
        datasetsize / batchsize, datasetsize % batchsize)

  # Build the three-level pipeline: disk -> main-memory cache -> GPU cache.
  if seq:
    import sequence_datahandler as seq_dh
    self.disk = seq_dh.SequenceDisk(
      filenames, numdim_list, datasetsize, keys=keys,
      left_window=left_window, right_window=right_window, verbose=verbose)
    self.cpu_cache = seq_dh.SequenceCache(
      self.disk, cpu_capacity, numdim_list, typesize=typesize,
      randomize=randomize, left_window=left_window,
      right_window=right_window, verbose=verbose)
    self.gpu_cache = seq_dh.SequenceGPUCache(
      self.cpu_cache, gpu_capacity, numdim_list, typesize=typesize,
      randomize=randomize, left_window=left_window,
      right_window=right_window, verbose=verbose, batchsize=batchsize)
  else:
    self.disk = Disk(filenames, numdim_list, datasetsize, keys=keys,
                     verbose=self.verbose)
    self.cpu_cache = Cache(self.disk, cpu_capacity, numdim_list,
                           typesize=typesize, randomize=randomize,
                           verbose=self.verbose)
    self.gpu_cache = GPUCache(self.cpu_cache, gpu_capacity, numdim_list,
                              typesize=typesize, randomize=randomize,
                              verbose=self.verbose, shift=shift,
                              add_noise=add_noise, center_only=not is_train,
                              shift_amt_x=shift_amt_x, shift_amt_y=shift_amt_y)

  # Apply normalization stats, except for replicated softmax data.
  for i, stats_file in enumerate(stats_files):
    if hyperparameter_list[i].normalize and hyperparameter_list[
        i].activation != deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
      self.gpu_cache.SetDataStats(i, stats_file)

  self.batchsize = batchsize
  if seq:
    datasetsize = actual_datasetsize
  self.num_batches = datasetsize / batchsize
  if self.get_last_piece and datasetsize % batchsize > 0:
    self.num_batches += 1
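# GetBytes() is called above but not defined in this snippet. A minimal,
# hypothetical sketch of such a helper, assuming the gpu_memory / main_memory
# fields hold strings like '2G', '512M' or '100K'; the real helper may differ.
def GetBytesSketch(mem_string):
  """Converts a human-readable memory size into a number of bytes."""
  multipliers = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3}
  unit = mem_string[-1].upper()
  if unit in multipliers:
    return int(float(mem_string[:-1]) * multipliers[unit])
  return int(mem_string)  # Assume the value is already a plain byte count.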
                    help="Multicol mode", default='rand')
args = parser.parse_args()

if not args.outf:
  raise ValueError('Output file not defined')
if not args.train_file or not args.model_file:
  raise ValueError('Models and data missing')

board = tr.LockGPU()
model_file = args.model_file
train_file = args.train_file
model = dbm.DBM(model_file, train_file)
trainer_pb = util.ReadOperation(train_file)
dataset = os.path.basename(trainer_pb.data_proto_prefix)

# Fix paths.
dirname = os.path.split(model.t_op.data_proto_prefix)[1]
model.t_op.data_proto_prefix = os.path.join('datasets/', dirname)
model.t_op.skip_last_piece = False
model.t_op.get_last_piece = True
model.t_op.randomize = False

model.LoadModelOnGPU()
model.SetUpData()

if args.valid_only:
  data_types = ['valid']
    writer.writerow(record)


if __name__ == '__main__':
  from argparse import ArgumentParser
  parser = ArgumentParser(description='Parses results by walking directories')
  parser.add_argument("--outf", type=str, help="Output file")
  parser.add_argument("--mode", type=str, help="html/csv")
  args = parser.parse_args()

  model_paths = walk_dir('.')
  exp_paths = defaultdict(list)
  get_expid = lambda f: f.split("/")[1]
  get_model = lambda f: util.ReadModel(f)
  get_op = lambda f: util.ReadOperation(f)
  no_match = lambda path: all(s not in path for s in drop_string_list)
  model_paths['BEST'] = filter(no_match, model_paths['BEST'])
  for path in model_paths['BEST']:
    exp_paths[get_expid(path)].append(path)
  exp_paths = dict(exp_paths)

  rows = []
  for exp in exp_paths:
    models = defaultdict(list)
    for f in exp_paths[exp]:
      model_name = os.path.basename(f).split('_')[0]
      models[model_name].append(f)
    models = dict(models)
def MakeTrainers(trainer_file, data_pbtxt_file, output_path):
  """Points a trainer pbtxt at the given data pbtxt and checkpoint directory."""
  trainer = util.ReadOperation(trainer_file)
  trainer.data_proto = data_pbtxt_file
  trainer.checkpoint_directory = output_path
  util.WritePbtxt(trainer_file, trainer)
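# Usage sketch: the three paths below are placeholders. MakeTrainers rewrites
# the trainer pbtxt in place with the new data proto and checkpoint directory.
MakeTrainers(trainer_file='trainers/train_classifier.pbtxt',
             data_pbtxt_file='data/data.pbtxt',
             output_path='checkpoints/classifier')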