Example #1
def EditTrainers(args):
    # Rewrite every trainer proto in place: trainers whose filename contains
    # 'rbm1' or 'joint' read raw data from args.data_dir, the rest read
    # extracted representations from args.rep_dir.
    t_op_files = glob.glob("trainers/*.pbtxt")
    for t_op_file in t_op_files:
        t_op = util.ReadOperation(t_op_file)
        if 'rbm1' in t_op_file or 'joint' in t_op_file:
            t_op.data_proto_prefix = args.data_dir
        else:
            t_op.data_proto_prefix = args.rep_dir
        t_op.checkpoint_directory = args.model_dir
        t_op.batchsize = args.batchsize
        with open(t_op_file, 'w') as f:
            text_format.PrintMessage(t_op, f)
def GetPredictions(model_file, train_op_file, output_dir, dataset='test'):
    # Extract output-layer representations (predictions) for the chosen
    # dataset split and write them under output_dir.
    board = tr.LockGPU()
    model = util.ReadModel(model_file)
    model.layer[0].data_field.test = '%s_data' % dataset

    train_op = util.ReadOperation(train_op_file)
    train_op.verbose = False
    train_op.get_last_piece = True
    train_op.randomize = False

    layernames = ['output_layer']
    ex.ExtractRepresentations(model_file, train_op, layernames, output_dir)
    tr.FreeGPU(board)
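
A minimal driver sketch for this example; the argparse flags, default paths, and checkpoint/trainer file names below are illustrative assumptions, not taken from the snippet:

import argparse
import os

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='datasets/flickr')   # assumed raw-data location
    parser.add_argument('--rep_dir', default='representations')    # assumed representation location
    parser.add_argument('--model_dir', default='checkpoints')      # assumed checkpoint directory
    parser.add_argument('--batchsize', type=int, default=128)
    args = parser.parse_args()

    EditTrainers(args)
    # After training, dump test-set predictions; the checkpoint and trainer
    # file names here are hypothetical.
    GetPredictions(os.path.join(args.model_dir, 'joint_rbm_LAST'),
                   'trainers/train_joint.pbtxt',
                   'predictions', dataset='test')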
Example #3
def EditTrainers(data_dir, model_dir, rep_dir, numsplits):
    # Point the DBN layer-wise trainers at the right inputs: layer-1 trainers
    # read raw data from data_dir, higher layers read representations from rep_dir.
    tnames = [
        'train_CD_image_layer1.pbtxt', 'train_CD_image_layer2.pbtxt',
        'train_CD_text_layer1.pbtxt', 'train_CD_text_layer2.pbtxt',
        'train_CD_joint_layer.pbtxt'
    ]
    for tname in tnames:
        t_op_file = os.path.join('trainers', 'dbn', tname)
        t_op = util.ReadOperation(t_op_file)
        if 'layer1' in tname:
            t_op.data_proto_prefix = data_dir
        else:
            t_op.data_proto_prefix = rep_dir
        t_op.checkpoint_directory = model_dir
        with open(t_op_file, 'w') as f:
            text_format.PrintMessage(t_op, f)

    # Derive one classifier trainer per split from the base classifier config.
    t_op_file = os.path.join('trainers', 'classifiers', 'baseclassifier.pbtxt')
    t_op = util.ReadOperation(t_op_file)
    for i in range(1, numsplits + 1):
        t_op_file = os.path.join('trainers', 'classifiers',
                                 'split_%d.pbtxt' % i)
        t_op.data_proto_prefix = rep_dir
        t_op.data_proto = os.path.join('split_%d' % i, 'data.pbtxt')
        t_op.checkpoint_prefix = model_dir
        t_op.checkpoint_directory = os.path.join('classifiers', 'split_%d' % i)
        with open(t_op_file, 'w') as f:
            text_format.PrintMessage(t_op, f)

    # Change prefix in multimodal dbn model
    mnames = ['multimodal_dbn.pbtxt']
    for mname in mnames:
        model_file = os.path.join('models', mname)
        model = util.ReadModel(model_file)
        model.prefix = model_dir
        with open(model_file, 'w') as f:
            text_format.PrintMessage(model, f)
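
For context, a hedged invocation of the helper above; the directories and split count are placeholders:

if __name__ == '__main__':
    EditTrainers(data_dir='datasets/flickr',   # raw data protos (placeholder path)
                 model_dir='dbn_models',       # checkpoint directory (placeholder)
                 rep_dir='dbn_reps',           # extracted representations (placeholder)
                 numsplits=5)                  # number of classifier splits (placeholder)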
Example #4
def SetUpTrainer(data_dir, model_dir, representation_dir):
    trainer_config_names = [
        'train_CD_visual_layer1.pbtxt', 'train_CD_visual_layer2.pbtxt',
        'train_CD_audio_layer1.pbtxt', 'train_CD_audio_layer2.pbtxt',
        'train_CD_joint_layer.pbtxt'
    ]
    for trainer_config_name in trainer_config_names:
        filename = os.path.join('Trainers', trainer_config_name)
        trainer_operation = util.ReadOperation(filename)
        if 'layer1' in trainer_config_name:
            trainer_operation.data_proto_prefix = data_dir
        else:
            trainer_operation.data_proto_prefix = representation_dir
        trainer_operation.checkpoint_directory = model_dir
        with open(filename, 'w') as f:
            text_format.PrintMessage(trainer_operation, f)
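
As with the previous example, a hedged one-line invocation with placeholder directories:

SetUpTrainer('datasets/avletters', 'checkpoints', 'representations')  # all paths are illustrative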
Example #5
    def __init__(self, op, data_name_list, hyperparameter_list, frac=1.0):
        """Initializes a DataHandler.
    Args:
      op: Operation protocol buffer.
      data_name_list: List of data names that should be put together. (Usually
        refers to a list of different modalities, e.g., ['data', 'label'] or
        ['image', 'audio'].)
      hyperparameter_list: List of hyperparameters, one for each modality.
      frac: What fraction of the total memory should this data handler use.
    """
        filenames = []
        numdim_list = []
        datasetsize = None
        left_window = []
        right_window = []
        stats_files = []
        shift = []
        add_noise = []
        shift_amt_x = []
        shift_amt_y = []
        keys = []
        typesize = 4  # bytes per stored value (4-byte floats assumed)
        if isinstance(op, str):
            op = util.ReadOperation(op)
        self.verbose = op.verbose
        verbose = self.verbose
        data_proto_file = os.path.join(op.data_proto_prefix, op.data_proto)
        dataset_proto = util.ReadData(data_proto_file)
        seq = False
        is_train = False
        for name, hyp in zip(data_name_list, hyperparameter_list):
            data_proto = next(d for d in dataset_proto.data if d.name == name)
            file_pattern = os.path.join(dataset_proto.prefix,
                                        data_proto.file_pattern)
            filenames.append(sorted(glob.glob(file_pattern)))
            stats_files.append(
                os.path.join(dataset_proto.prefix, data_proto.stats_file))
            numdims = np.prod(np.array(data_proto.dimensions))
            if not data_proto.sparse:
                numdims *= data_proto.num_labels
            numdim_list.append(numdims)
            seq = seq or data_proto.seq
            left_window.append(hyp.left_window)
            right_window.append(hyp.right_window)
            add_noise.append(hyp.add_noise)
            shift.append(hyp.shift)
            shift_amt_x.append(hyp.shift_amt_x)
            shift_amt_y.append(hyp.shift_amt_y)
            keys.append(data_proto.key)
            is_train = 'train' in name  # HACK - Fix this!
            if datasetsize is None:
                datasetsize = data_proto.size
            else:
                assert datasetsize == data_proto.size, 'Size of %s is not %d' % (
                    name, datasetsize)

        # Add space for padding.
        if seq:
            max_rw = max(right_window)
            max_lw = max(left_window)
            actual_datasetsize = datasetsize
            datasetsize += len(filenames[0]) * (max_rw + max_lw)

        numdims = sum(numdim_list)
        batchsize = op.batchsize
        randomize = op.randomize
        self.get_last_piece = op.get_last_piece
        # Compute size of each cache.
        total_disk_space = datasetsize * numdims * typesize
        max_gpu_capacity = int(frac * GetBytes(dataset_proto.gpu_memory))
        max_cpu_capacity = int(frac * GetBytes(dataset_proto.main_memory))

        # Each capacity should correspond to integral number of batches.
        vectorsize_bytes = typesize * numdims
        batchsize_bytes = vectorsize_bytes * batchsize
        max_gpu_capacity = (max_gpu_capacity //
                            batchsize_bytes) * batchsize_bytes
        #max_cpu_capacity = (max_cpu_capacity // batchsize_bytes) * batchsize_bytes

        # Don't need more than total dataset size.
        gpu_capacity = min(total_disk_space, max_gpu_capacity)
        cpu_capacity = min(total_disk_space, max_cpu_capacity)
        num_gpu_batches = gpu_capacity // batchsize_bytes
        num_cpu_batches = cpu_capacity // batchsize_bytes

        gpu_left_overs = gpu_capacity // vectorsize_bytes - num_gpu_batches * batchsize
        cpu_left_overs = cpu_capacity // vectorsize_bytes - num_cpu_batches * batchsize

        if self.verbose:
            if seq:
                num_valid_gpu_vectors = (
                    gpu_capacity // vectorsize_bytes) - len(filenames[0]) * (max_rw + max_lw)
                print(num_valid_gpu_vectors)
            else:
                print('Batches in GPU memory: %d + leftovers %d' % (
                    num_gpu_batches, gpu_left_overs))
                print('Batches in main memory: %d + leftovers %d' % (
                    num_cpu_batches, cpu_left_overs))
                print('Batches on disk: %d + leftovers %d' % (
                    datasetsize // batchsize, datasetsize % batchsize))

        if seq:
            import sequence_datahandler as seq_dh
            self.disk = seq_dh.SequenceDisk(filenames,
                                            numdim_list,
                                            datasetsize,
                                            keys=keys,
                                            left_window=left_window,
                                            right_window=right_window,
                                            verbose=verbose)
            self.cpu_cache = seq_dh.SequenceCache(self.disk,
                                                  cpu_capacity,
                                                  numdim_list,
                                                  typesize=typesize,
                                                  randomize=randomize,
                                                  left_window=left_window,
                                                  right_window=right_window,
                                                  verbose=verbose)
            self.gpu_cache = seq_dh.SequenceGPUCache(self.cpu_cache,
                                                     gpu_capacity,
                                                     numdim_list,
                                                     typesize=typesize,
                                                     randomize=randomize,
                                                     left_window=left_window,
                                                     right_window=right_window,
                                                     verbose=verbose,
                                                     batchsize=batchsize)
        else:
            self.disk = Disk(filenames,
                             numdim_list,
                             datasetsize,
                             keys=keys,
                             verbose=self.verbose)
            self.cpu_cache = Cache(self.disk,
                                   cpu_capacity,
                                   numdim_list,
                                   typesize=typesize,
                                   randomize=randomize,
                                   verbose=self.verbose)
            self.gpu_cache = GPUCache(self.cpu_cache,
                                      gpu_capacity,
                                      numdim_list,
                                      typesize=typesize,
                                      randomize=randomize,
                                      verbose=self.verbose,
                                      shift=shift,
                                      add_noise=add_noise,
                                      center_only=not is_train,
                                      shift_amt_x=shift_amt_x,
                                      shift_amt_y=shift_amt_y)
        for i, stats_file in enumerate(stats_files):
            hyp = hyperparameter_list[i]
            if hyp.normalize and hyp.activation != deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
                self.gpu_cache.SetDataStats(i, stats_file)
        self.batchsize = batchsize
        if seq:
            datasetsize = actual_datasetsize
        self.num_batches = datasetsize // batchsize
        if self.get_last_piece and datasetsize % batchsize > 0:
            self.num_batches += 1
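
A hedged construction sketch for the handler above. The trainer file, data name, and the assumption that the model's first layer carries its Hyperparams proto are all illustrative, not taken from the snippet:

op = util.ReadOperation('trainers/train_CD_image_layer1.pbtxt')  # assumed trainer proto
model = util.ReadModel('models/image_rbm1.pbtxt')                # assumed model proto
hyp = model.layer[0].hyperparams                                 # assumes each layer stores its Hyperparams
handler = DataHandler(op, ['image_train'], [hyp], frac=0.5)
print('%d batches of size %d' % (handler.num_batches, handler.batchsize))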
Example #6
                        help="Multicol mode",
                        default='rand')
    args = parser.parse_args()

    if not args.outf:
        raise ValueError('Output file not defined')

    if not args.train_file or not args.model_file:
        raise ValueError('Models and data missing')

    board = tr.LockGPU()
    model_file = args.model_file
    train_file = args.train_file
    model = dbm.DBM(model_file, train_file)

    trainer_pb = util.ReadOperation(train_file)
    dataset = os.path.basename(trainer_pb.data_proto_prefix)

    # Fix paths
    dirname = os.path.split(model.t_op.data_proto_prefix)[1]
    model.t_op.data_proto_prefix = os.path.join('datasets', dirname)
    model.t_op.skip_last_piece = False
    model.t_op.get_last_piece = True
    model.t_op.randomize = False

    model.LoadModelOnGPU()
    model.SetUpData()

    if args.valid_only:
        data_types = ['valid']
Example #7
            writer.writerow(record)


if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Parses results by walking directories')
    parser.add_argument("--outf", type=str, help="Output file")
    parser.add_argument("--mode", type=str, help="html/csv")
    args = parser.parse_args()
    model_paths = walk_dir('.')
    exp_paths = defaultdict(list)

    get_expid = lambda f: f.split("/")[1]
    get_model = lambda f: util.ReadModel(f)
    get_op = lambda f: util.ReadOperation(f)

    no_match = lambda path: all(s not in path for s in drop_string_list)
    model_paths['BEST'] = [p for p in model_paths['BEST'] if no_match(p)]

    for path in model_paths['BEST']:
        exp_paths[get_expid(path)].append(path)
    exp_paths = dict(exp_paths)

    rows = []
    for exp in exp_paths:
        models = defaultdict(list)
        for f in exp_paths[exp]:
            model_name = os.path.basename(f).split('_')[0]
            models[model_name].append(f)
        models = dict(models)
Example #8
def MakeTrainers(trainer_file, data_pbtxt_file, output_path):
    # Point the trainer at the given data proto and checkpoint directory,
    # then write the edited trainer back to trainer_file.
    trainer = util.ReadOperation(trainer_file)
    trainer.data_proto = data_pbtxt_file
    trainer.checkpoint_directory = output_path
    util.WritePbtxt(trainer_file, trainer)
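
Finally, a hedged call of this last helper; the file names are placeholders:

MakeTrainers('trainers/classifier.pbtxt',  # trainer template (placeholder), edited in place
             'data.pbtxt',                 # data proto to point it at (placeholder)
             '/path/to/checkpoints')       # checkpoint directory (placeholder)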