Example #1
def main():
    data_dir = sys.argv[1]
    model_dir = sys.argv[2]
    rep_dir = sys.argv[3]
    gpu_mem = sys.argv[4]
    main_mem = sys.argv[5]
    numsplits = int(sys.argv[6])

    data_pbtxt_file = os.path.join(data_dir, 'flickr.pbtxt')
    data_pb = util.ReadData(data_pbtxt_file)
    EditPaths(data_pb, data_dir, gpu_mem, main_mem)
    with open(data_pbtxt_file, 'w') as f:
        text_format.PrintMessage(data_pb, f)
    EditTrainers(data_dir, model_dir, rep_dir, numsplits)

    data_pbtxt_file_z = os.path.join(data_dir, 'flickr_z.pbtxt')
    data_pbtxt_file_nnz = os.path.join(data_dir, 'flickr_nnz.pbtxt')
    if not os.path.exists(data_pbtxt_file_z):
        CreateMissingTextData(data_pb, data_pbtxt_file_z, data_pbtxt_file_nnz)
    data_pb = util.ReadData(data_pbtxt_file_z)
    EditPaths(data_pb, data_dir, gpu_mem, main_mem)
    with open(data_pbtxt_file_z, 'w') as f:
        text_format.PrintMessage(data_pb, f)
    data_pb = util.ReadData(data_pbtxt_file_nnz)
    EditPaths(data_pb, data_dir, gpu_mem, main_mem)
    with open(data_pbtxt_file_nnz, 'w') as f:
        text_format.PrintMessage(data_pb, f)
Example #2
def main():
    #Input parameters
    data_dir = sys.argv[1]
    model_dir = sys.argv[2]
    representation_dir = sys.argv[3]
    avdata_pbtxt_file = os.path.join(data_dir, 'audiovisualdata.pbtxt')
    vdata_pbtxt_file = os.path.join(data_dir, 'visualonlydata.pbtxt')
    gpu_mem = sys.argv[4]
    main_mem = sys.argv[5]

    #Edit the data configuration file
    avdata_pb = util.ReadData(avdata_pbtxt_file)
    avdata_pb.gpu_memory = gpu_mem
    avdata_pb.main_memory = main_mem
    avdata_pb.prefix = data_dir
    with open(avdata_pbtxt_file, 'w') as f:
        text_format.PrintMessage(avdata_pb, f)

    vdata_pb = util.ReadData(vdata_pbtxt_file)
    vdata_pb.gpu_memory = gpu_mem
    vdata_pb.main_memory = main_mem
    vdata_pb.prefix = data_dir
    with open(vdata_pbtxt_file, 'w') as f:
        text_format.PrintMessage(vdata_pb, f)

    #Set up the trainer configuration file
    SetUpTrainer(data_dir, model_dir, representation_dir)
Example #3
def MakeDict(data_pbtxt):
  data_pb = util.ReadData(data_pbtxt)
  rep_dict = {}
  stats_files = {}
  for data in data_pb.data:
    rep_dict[data.name] = Load(data.file_pattern)
    stats_files[data.name] = data.stats_file
  return rep_dict, stats_files
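
MakeDict loads every entry listed in the data pbtxt via Load on its file_pattern and returns the arrays keyed by data name, along with each entry's stats file. A minimal usage sketch with a placeholder path, assuming Load returns NumPy arrays:

rep_dict, stats_files = MakeDict('/path/to/data.pbtxt')   # placeholder path
for name in rep_dict:
    print name, rep_dict[name].shape, stats_files[name]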
Example #4
def GetDataHandles(op, names, hyp_list, verbose=False):
    """Returns a list of data handles.

  This method is the top-level routine for creating data handlers. It takes a
  description of which datasets to load and returns data handlers to access
  them.
  Args:
    op: Operation protocol buffer.
    names: list of list of data names. The top level list corresponds to train,
      validation and test sets. The lower-level lists correspond to data
      modalities.
    hyp_list: List of hyperparameters for each modality.
    verbose: If True, will print out details of what is happening.
  Returns:
    A list of DataHandler objects.
  """
    typesize = 4
    data_proto_file = os.path.join(op.data_proto_prefix, op.data_proto)
    dataset_proto = util.ReadData(data_proto_file)
    handlers = []
    if dataset_proto.data_handler == 'deepnet':
        size_list = []
        for name_list in names:
            size = 0
            for name in name_list:
                try:
                    data_proto = next(d for d in dataset_proto.data
                                      if d.name == name)
                except StopIteration as e:
                    print '%s not found in data pbtxt' % name
                    raise e
                datasetsize = data_proto.size
                numdims = np.prod(np.array(data_proto.dimensions))
                size += datasetsize * numdims * typesize
            size_list.append(size)
        total_size = sum(size_list)
        proportions = [float(size) / total_size for size in size_list]
        for i, name_list in enumerate(names):
            if name_list == []:
                handlers.append(None)
            else:
                handlers.append(
                    DataHandler(op, name_list, hyp_list, frac=proportions[i]))
    elif dataset_proto.data_handler == 'navdeep':
        import navdeep_datahandler
        for i, name_list in enumerate(names):
            if name_list == []:
                handlers.append(None)
            else:
                handlers.append(
                    navdeep_datahandler.NavdeepDataHandler(
                        op, dataset_proto, name_list, hyp_list))

    return handlers
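
Per the docstring, names is a list of lists: one inner list per train/validation/test split, each naming the modalities to load; an empty inner list yields a None handler. A hedged call sketch, where the operation file, the data names, and hyp_list (one Hyperparams proto per modality) are all placeholders:

op = util.ReadOperation('train_op.pbtxt')        # assumed operation pbtxt
names = [['image_train', 'text_train'],          # training split, two modalities
         ['image_validation', 'text_validation'],
         []]                                     # no test set -> handler is None
train_h, valid_h, test_h = GetDataHandles(op, names, hyp_list, verbose=True)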
Example #5
def main():
    data_dir = sys.argv[1]
    model_dir = sys.argv[2]
    rep_dir = sys.argv[3]
    gpu_mem = sys.argv[4]
    main_mem = sys.argv[5]
    numsplits = int(sys.argv[6])

    data_pbtxt_file = os.path.join(data_dir, 'RNAseq.pbtxt')
    data_pb = util.ReadData(data_pbtxt_file)
    EditPaths(data_pb, data_dir, gpu_mem, main_mem)
    with open(data_pbtxt_file, 'w') as f:
        text_format.PrintMessage(data_pb, f)
    EditTrainers(data_dir, model_dir, rep_dir, numsplits)
Example #6
def change_data(proto, dimensions, datas=None):
  # dimensions: new size to record in the first dimension of each listed entry.
  proto_cont = util.ReadData(proto)
  if datas is None:
    datas = []
    for m in ['image', 'text']:
      for i in [1,2,3]:
        for t in ['train', 'validation', 'test']:
          datas += [m+'_'+'hidden'+str(i)+'_'+t]
          datas += ['bae_'+m+'_'+'hidden'+str(i)+'_'+t]
          datas += ['bae_'+m+'_'+'hidden'+str(i)+'_'+t+'_all']
          datas += ['corr_'+m+'_hidden'+str(i)+'_'+t]
  for data in datas:
    try:
      data_proto = next(lay for lay in proto_cont.data if lay.name == data)
      data_proto.dimensions[0] = dimensions
    except StopIteration:
      pass
  with open(proto, 'w') as f:
    text_format.PrintMessage(proto_cont, f)
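
A hedged usage sketch for change_data with placeholder arguments; when datas is omitted it rewrites the image/text hidden-layer entries enumerated above:

change_data('/reps/joint/data.pbtxt', dimensions=1024)   # placeholder path and size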
Example #7
def withPbtxt(dbPbtxt, modality1, modality2, outputpath):
    datapb = util.ReadData(dbPbtxt)
    datasets = ["train", "validation", "test"]
    datapbNew = deepnet_pb2.Dataset()
    namePrefix = modality1 + "_" + modality2 + "_"
    datapbNew.prefix = outputpath
    datapbNew.name = namePrefix + "combined_input"
    for dataset in datasets:
        fileNames1 = []
        fileNames2 = []
        for dataEntry in datapb.data:
            if modality1 in dataEntry.name and dataset in dataEntry.name:
                fileNames1 = sorted(
                    glob.glob(
                        os.path.join(datapb.prefix, dataEntry.file_pattern)))
            if modality2 in dataEntry.name and dataset in dataEntry.name:
                fileNames2 = sorted(
                    glob.glob(
                        os.path.join(datapb.prefix, dataEntry.file_pattern)))
        for i, (file1, file2) in enumerate(zip(fileNames1, fileNames2)):
            data1 = np.load(file1)
            data2 = np.load(file2)
            dataCombined = np.concatenate((data1, data2), axis=1)
            if i == 0:
                data = dataCombined
            else:
                data = np.concatenate((data, dataCombined), axis=0)
        if not os.path.exists(os.path.join(outputpath, dataset)):
            os.makedirs(os.path.join(outputpath, dataset))
        np.save(os.path.join(outputpath, dataset, "data"), data)
        dataItem = deepnet_pb2.Dataset.Data()
        dataItem.name = namePrefix + "combined_" + dataset
        dataItem.dimensions.extend([data.shape[1]])
        dataItem.size = data.shape[0]
        dataItem.file_pattern = os.path.join(dataset, "data.npy")
        datapbNew.data.extend([dataItem])
    with open(os.path.join(outputpath, "input_data.pbtxt"), 'w') as f:
        text_format.PrintMessage(datapbNew, f)
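
withPbtxt pairs up the per-split .npy shards of the two named modalities, concatenates them along the feature axis, and writes one combined array per split plus an input_data.pbtxt describing the new dataset. A usage sketch with placeholder arguments (the modality strings must occur in the entry names of the source pbtxt):

withPbtxt('/data/flickr/flickr.pbtxt', 'image_hidden2', 'text_hidden2',
          '/data/flickr/combined')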
Example #8
from deepnet import util
import numpy as np
import os

if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("--pfamid")
    args = parser.parse_args()

    pfamid = args.pfamid

    data_pbtxt_file = os.path.join(pfamid, 'data.pbtxt')
    data_pbtxt = util.ReadData(data_pbtxt_file)
    data_pbtxt.name = pfamid
    data_pbtxt.prefix = os.path.join(
        os.path.split(data_pbtxt.prefix)[0], pfamid)
    for data in data_pbtxt.data:
        fname = os.path.basename(data.file_pattern)
        for t in ('train', 'valid', 'test'):
            if t in data.name:
                X = np.load(os.path.join(pfamid, pfamid + "_" + t + ".npy"))
                data.size = X.shape[0]
                data.dimensions[0] = X.shape[1]
                data.file_pattern = os.path.abspath(
                    os.path.join(pfamid, pfamid + "_" + t + ".npy"))

    util.WritePbtxt(data_pbtxt_file, data_pbtxt)
Example #9
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("--model", type=str, help="Name of model")
    parser.add_argument("--data_dir", type=str, help="Data directory location")
    parser.add_argument("--model_dir",
                        type=str,
                        help="Directory to write models to")
    parser.add_argument("--rep_dir",
                        type=str,
                        help="Directory to write representations to")
    parser.add_argument("--gpu_mem", type=str, default="3G", help="GPU memory")
    parser.add_argument("--main_mem",
                        type=str,
                        default="30G",
                        help="CPU memory")
    parser.add_argument("--base_epsilon",
                        type=float,
                        default=0.1,
                        help="base epsilon rate")
    parser.add_argument("--sparsity",
                        action="store_true",
                        help="apply sparsity")
    parser.add_argument("--dropout", action="store_true", help="apply dropout")
    parser.add_argument("--l2_decay",
                        type=float,
                        default=0.01,
                        help="l2 decay cost")
    parser.add_argument("--initial_momentum",
                        type=float,
                        default=0.5,
                        help="initial momentum")
    parser.add_argument("--final_momentum",
                        type=float,
                        default=0.9,
                        help="final momentum")
    parser.add_argument("--input_width", type=int, \
            help="number of nodes in input_layer")
    parser.add_argument("--input_numlabels", type=int, default=21,\
            help="number of states for nodes in input_layer")
    parser.add_argument("--hidden1_width", type=int, default=100, \
            help="number of nodes in hidden layer")
    parser.add_argument("--hidden2_width", type=int, default=100, \
            help="number of nodes in hidden layer")
    parser.add_argument("--bernoulli2_hidden1_width", type=int, default=10, \
            help="number of nodes in hidden layer")
    parser.add_argument("--batchsize",
                        type=int,
                        default=1000,
                        help="batchsize")
    parser.add_argument("--steps",
                        type=int,
                        default=100000,
                        help="training steps")

    args = parser.parse_args()

    if not args.data_dir or not args.model_dir or not args.rep_dir or \
            not args.input_width:
        raise ValueError("Required input not provided")

    if 'GERBILPATH' not in os.environ:
        raise EnvironmentError('Please set GERBILPATH')
    deepnet_path = os.path.join(os.environ['GERBILPATH'], 'deepnet')

    args.data_dir = os.path.join(deepnet_path, args.data_dir)
    args.model_dir = os.path.join(deepnet_path, args.model_dir)
    args.rep_dir = os.path.join(deepnet_path, args.rep_dir)

    data_pbtxt_file = os.path.join(args.data_dir, "data.pbtxt")
    data_pb = util.ReadData(data_pbtxt_file)
    EditPaths(data_pb, args)
    with open(data_pbtxt_file, 'w') as f:
        text_format.PrintMessage(data_pb, f)
    EditTrainers(args)

    if args.model in ['lcdbm', 'dbm']:
        EditModelsDBM(args)
    else:
        EditModels(args)
Example #10
    if skipTraining:
        for dataset in datasets:
            dataMap[dataset]["label"] = dataMap[dataset]["label"].tolist()
            dataMap[dataset]["data"] = dataMap[dataset]["data"].tolist()

        prob = svm_problem(dataMap["train"]["label"], dataMap["train"]["data"])
        param = svm_parameter('-t 2 -c 4 -b 1')
        m = svm_train(prob, param)
        svm_save_model(os.path.join(outputPath, modality + '_svm.model'), m)
        p_label, p_acc, p_val = svm_predict(dataMap["validation"]["label"],
                                            dataMap["validation"]["data"], m,
                                            '-b 1')
        ACC, MSE, SCC = evaluations(dataMap["validation"]["label"], p_label)
        print "ACC on validation set: " + repr(ACC)
        p_label, p_acc, p_val = svm_predict(dataMap["test"]["label"],
                                            dataMap["test"]["data"], m, '-b 1')
        ACC, MSE, SCC = evaluations(dataMap["test"]["label"], p_label)
        print "ACC on test set: " + repr(ACC)


if __name__ == '__main__':
    pbPath = sys.argv[1]
    modality = sys.argv[2]
    outputPath = sys.argv[3]
    saveFile = False
    skipTraining = False
    if len(sys.argv) > 4:
        saveFile = sys.argv[4].upper() == "TRUE"
    if len(sys.argv) > 5:
        skipTraining = sys.argv[5].upper() == "TRUE"
    datapb = util.ReadData(pbPath)
    fromPb(datapb, modality, outputPath, saveFile, skipTraining)
Example #11
def main():
    model_file = sys.argv[1]
    base_output_dir = sys.argv[2]
    rep_dir = sys.argv[3]
    prefix = sys.argv[4]
    gpu_mem = sys.argv[5]
    main_mem = sys.argv[6]
    model = util.ReadModel(model_file)
    data_pb = deepnet_pb2.Dataset()
    data_pb.name = model.name
    data_pb.gpu_memory = gpu_mem
    data_pb.main_memory = main_mem
    output_dir = os.path.join(base_output_dir, 'validation')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_proto_file = os.path.join(base_output_dir, 'data.pbtxt')

    # IMAGE PATHWAY
    img_input_pbtxt = os.path.join(prefix, 'flickr.pbtxt')
    img_hidden1_pbtxt = os.path.join(rep_dir, 'image_rbm1_LAST', 'data.pbtxt')
    img_hidden2_pbtxt = os.path.join(rep_dir, 'image_rbm2_LAST', 'data.pbtxt')

    # TEXT PATHWAY
    text_input_pbtxt = os.path.join(prefix, 'flickr_nnz.pbtxt')
    text_hidden1_pbtxt = os.path.join(rep_dir, 'text_rbm1_LAST', 'data.pbtxt')
    text_hidden2_pbtxt = os.path.join(rep_dir, 'text_rbm2_LAST', 'data.pbtxt')
    text_pbtxt_z = os.path.join(rep_dir, 'generated_text', 'data.pbtxt')

    joint_pbtxt = os.path.join(rep_dir, 'joint_rbm_LAST', 'data.pbtxt')

    img_input_pb = util.ReadData(img_input_pbtxt)
    data = next(d for d in img_input_pb.data if d.name == 'image_labelled')
    data.file_pattern = os.path.join(img_input_pb.prefix, data.file_pattern)
    data.stats_file = os.path.join(img_input_pb.prefix, data.stats_file)
    data.name = 'image_input'
    data_pb.data.extend([data])

    img_hidden1_pb = util.ReadData(img_hidden1_pbtxt)
    data = next(d for d in img_hidden1_pb.data
                if d.name == 'image_hidden1_validation')
    data.file_pattern = os.path.join(img_hidden1_pb.prefix, data.file_pattern)
    data.name = 'image_hidden1'
    data_pb.data.extend([data])

    img_hidden2_pb = util.ReadData(img_hidden2_pbtxt)
    data = next(d for d in img_hidden2_pb.data
                if d.name == 'image_hidden2_validation')
    data.file_pattern = os.path.join(img_hidden2_pb.prefix, data.file_pattern)
    data.name = 'image_hidden2'
    data_pb.data.extend([data])

    indices_file = os.path.join(prefix, 'text', 'indices_labelled.npz')
    indices = np.load(indices_file)
    nnz_indices = indices['nnz_indices']
    z_indices = indices['z_indices']

    text_pb_z = util.ReadData(text_pbtxt_z)
    text_input_pb = util.ReadData(text_input_pbtxt)
    data_nnz = next(d for d in text_input_pb.data if d.name == 'text_labelled')
    data_z = next(d for d in text_pb_z.data
                  if d.name == 'text_input_layer_validation')
    output_file = os.path.join(output_dir, 'text_input-00001-of-00001.npy')
    data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
                 text_input_pb.prefix, 'text_input', output_file)
    data_pb.data.extend([data])

    text_hidden1_pb = util.ReadData(text_hidden1_pbtxt)
    data_nnz = next(d for d in text_hidden1_pb.data
                    if d.name == 'text_hidden1_validation')
    data_z = next(d for d in text_pb_z.data
                  if d.name == 'text_hidden1_validation')
    output_file = os.path.join(output_dir, 'text_hidden1-00001-of-00001.npy')
    data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
                 text_hidden1_pb.prefix, 'text_hidden1', output_file)
    data_pb.data.extend([data])

    text_hidden2_pb = util.ReadData(text_hidden2_pbtxt)
    data_nnz = next(d for d in text_hidden2_pb.data
                    if d.name == 'text_hidden2_validation')
    data_z = next(d for d in text_pb_z.data
                  if d.name == 'text_hidden2_validation')
    output_file = os.path.join(output_dir, 'text_hidden2-00001-of-00001.npy')
    data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
                 text_hidden2_pb.prefix, 'text_hidden2', output_file)
    data_pb.data.extend([data])

    joint_pb = util.ReadData(joint_pbtxt)
    data_nnz = next(d for d in joint_pb.data
                    if d.name == 'joint_hidden_validation')
    data_z = next(d for d in text_pb_z.data
                  if d.name == 'joint_hidden_validation')
    output_file = os.path.join(output_dir, 'joint_hidden-00001-of-00001.npy')
    data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
                 joint_pb.prefix, 'joint_hidden', output_file)
    data_pb.data.extend([data])

    with open(output_proto_file, 'w') as f:
        text_format.PrintMessage(data_pb, f)
Example #12
def SetupDataPbtxt(data_pbtxt_file, data_path):
    data_pbtxt = util.ReadData(data_pbtxt_file)
    for data in data_pbtxt.data:
        fname = os.path.basename(data.file_pattern)
        data.file_pattern = os.path.join(data_path, fname)
    util.WritePbtxt(data_pbtxt_file, data_pbtxt)
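
SetupDataPbtxt keeps each entry's file name but repoints it at data_path, then writes the pbtxt back in place. A sketch with placeholder paths:

SetupDataPbtxt('/experiments/mnist/data.pbtxt', '/scratch/mnist_npy')
# afterwards every file_pattern reads /scratch/mnist_npy/<original basename>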
Example #13
from deepnet import util
from deepnet import deepnet_pb2
import sys, os
from google.protobuf import text_format

proto1 = sys.argv[1]
proto2 = sys.argv[2]
output_pbtxt = sys.argv[3]

out_dir = '/'.join(output_pbtxt.split('/')[:-1])
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)
dataset1 = util.ReadData(proto1)
name1 = dataset1.name
dataset2 = util.ReadData(proto2)
name2 = dataset2.name

dataset1_prefix = dataset1.prefix
dataset2_prefix = dataset2.prefix
prefix = os.path.commonprefix([dataset1_prefix, dataset2_prefix])

if dataset1_prefix != dataset2_prefix:
    for dataset in [dataset1, dataset2]:
        _prefix = dataset.prefix[len(prefix):]
        for d in dataset.data:
            if d.file_pattern:
                d.file_pattern = os.path.join(_prefix, d.file_pattern)
            if d.stats_file:
                d.stats_file = os.path.join(_prefix, d.stats_file)

dataset1.MergeFrom(dataset2)
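# The merged dataset is presumably meant to be written to output_pbtxt, whose
# directory is created above; a minimal completion under that assumption:
with open(output_pbtxt, 'w') as f:
    text_format.PrintMessage(dataset1, f)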
Example #14
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("--data_dir", type=str, help="Data directory location")
    parser.add_argument("--model_dir",
                        type=str,
                        help="Directory to write models to")
    parser.add_argument("--rep_dir",
                        type=str,
                        help="Directory to write representations to")
    parser.add_argument("--gpu_mem", type=str, default="3G", help="GPU memory")
    parser.add_argument("--main_mem",
                        type=str,
                        default="30G",
                        help="CPU memory")
    parser.add_argument("--base_epsilon",
                        type=float,
                        default=0.1,
                        help="base epsilon rate")
    parser.add_argument("--epsilon_decay",
                        action="store_true",
                        help="epsilon decay mechanism")
    parser.add_argument("--sparsity",
                        action="store_true",
                        help="apply sparsity")
    parser.add_argument("--dropout", action="store_true", help="apply dropout")
    parser.add_argument("--l2_decay",
                        type=float,
                        default=0.01,
                        help="l2 decay cost")
    parser.add_argument("--initial_momentum",
                        type=float,
                        default=0.5,
                        help="initial momentum")
    parser.add_argument("--final_momentum",
                        type=float,
                        default=0.9,
                        help="final momentum")
    parser.add_argument("--input_width", type=int, \
            help="number of nodes in input_layer")
    parser.add_argument("--input_numlabels", type=int, default=21,\
            help="number of states for nodes in input_layer")
    parser.add_argument("--hidden1_width", type=int, default=500, \
            help="number of nodes in hidden1")
    parser.add_argument("--hidden2_width", type=int, default=500, \
            help="number of nodes in hidden2")
    parser.add_argument("--batchsize",
                        type=int,
                        default=1000,
                        help="batchsize")
    parser.add_argument("--local",
                        action="store_true",
                        help="Run locally on langmead.pc")

    args = parser.parse_args()

    if not args.data_dir or not args.model_dir or not args.rep_dir or \
            not args.input_width:
        raise ValueError("Required input not provided")

    deepnet_path = awsutil.get_deepnet_path()
    args.data_dir = os.path.join(deepnet_path, args.data_dir)
    args.model_dir = os.path.join(deepnet_path, args.model_dir)
    args.rep_dir = os.path.join(deepnet_path, args.rep_dir)

    data_pbtxt_file = os.path.join(args.data_dir, "data.pbtxt")
    data_pb = util.ReadData(data_pbtxt_file)
    EditPaths(data_pb, args)
    with open(data_pbtxt_file, 'w') as f:
        text_format.PrintMessage(data_pb, f)
    EditTrainers(args)
    EditModels(args)
Example #15
def MakeDataPbtxt(data_pbtxt_file, data_path):
    data_pbtxt = util.ReadData('mnist.pbtxt')
    for data in data_pbtxt.data:
        fname = os.path.basename(data.file_pattern)
        data.file_pattern = os.path.join(data_path, fname)
    util.WritePbtxt(data_pbtxt_file, data_pbtxt)
Example #16
    def __init__(self, op, data_name_list, hyperparameter_list, frac=1.0):
        """Initializes a DataHandler.
    Args:
      op: Operation protocol buffer.
      data_name_list: List of data names that should be put together. (Usually
        refers to a list of different modalities, e.g., ['data', 'label'] or
        ['image', 'audio'].)
      hyperparameter_list: List of hyperparameters, one for each modality.
      frac: What fraction of the total memory should this data handler use.
    """
        filenames = []
        numdim_list = []
        datasetsize = None
        left_window = []
        right_window = []
        stats_files = []
        shift = []
        add_noise = []
        shift_amt_x = []
        shift_amt_y = []
        keys = []
        typesize = 4
        if isinstance(op, str):
            op = util.ReadOperation(op)
        self.verbose = op.verbose
        verbose = self.verbose
        data_proto_file = os.path.join(op.data_proto_prefix, op.data_proto)
        dataset_proto = util.ReadData(data_proto_file)
        seq = False
        is_train = False
        for name, hyp in zip(data_name_list, hyperparameter_list):
            data_proto = next(d for d in dataset_proto.data if d.name == name)
            file_pattern = os.path.join(dataset_proto.prefix,
                                        data_proto.file_pattern)
            filenames.append(sorted(glob.glob(file_pattern)))
            stats_files.append(
                os.path.join(dataset_proto.prefix, data_proto.stats_file))
            numdims = np.prod(np.array(data_proto.dimensions))
            if not data_proto.sparse:
                numdims *= data_proto.num_labels
            numdim_list.append(numdims)
            seq = seq or data_proto.seq
            left_window.append(hyp.left_window)
            right_window.append(hyp.right_window)
            add_noise.append(hyp.add_noise)
            shift.append(hyp.shift)
            shift_amt_x.append(hyp.shift_amt_x)
            shift_amt_y.append(hyp.shift_amt_y)
            keys.append(data_proto.key)
            is_train = 'train' in name  # HACK - Fix this!
            if datasetsize is None:
                datasetsize = data_proto.size
            else:
                assert datasetsize == data_proto.size, 'Size of %s is not %d' % (
                    name, datasetsize)

        # Add space for padding.
        if seq:
            max_rw = max(right_window)
            max_lw = max(left_window)
            actual_datasetsize = datasetsize
            datasetsize += len(filenames[0]) * (max_rw + max_lw)

        numdims = sum(numdim_list)
        batchsize = op.batchsize
        randomize = op.randomize
        self.get_last_piece = op.get_last_piece
        # Compute size of each cache.
        total_disk_space = datasetsize * numdims * typesize
        max_gpu_capacity = int(frac * GetBytes(dataset_proto.gpu_memory))
        max_cpu_capacity = int(frac * GetBytes(dataset_proto.main_memory))

        # Each capacity should correspond to integral number of batches.
        vectorsize_bytes = typesize * numdims
        batchsize_bytes = vectorsize_bytes * batchsize
        max_gpu_capacity = (max_gpu_capacity /
                            batchsize_bytes) * batchsize_bytes
        #max_cpu_capacity = (max_cpu_capacity / batchsize_bytes) * batchsize_bytes

        # Don't need more than total dataset size.
        gpu_capacity = min(total_disk_space, max_gpu_capacity)
        cpu_capacity = min(total_disk_space, max_cpu_capacity)
        num_gpu_batches = gpu_capacity / batchsize_bytes
        num_cpu_batches = cpu_capacity / batchsize_bytes

        gpu_left_overs = gpu_capacity / vectorsize_bytes - num_gpu_batches * batchsize
        cpu_left_overs = cpu_capacity / vectorsize_bytes - num_cpu_batches * batchsize

        if self.verbose:
            if seq:
                num_valid_gpu_vectors = (
                    gpu_capacity /
                    vectorsize_bytes) - len(filenames[0]) * (max_rw + max_lw)
                print num_valid_gpu_vectors

            else:
                print 'Batches in GPU memory: %d + leftovers %d' % (
                    num_gpu_batches, gpu_left_overs)
                print 'Batches in main memory: %d + leftovers %d' % (
                    num_cpu_batches, cpu_left_overs)
                print 'Batches in disk: %d + leftovers %d' % (
                    (datasetsize / batchsize), datasetsize % batchsize)

        if seq:
            import sequence_datahandler as seq_dh
            self.disk = seq_dh.SequenceDisk(filenames,
                                            numdim_list,
                                            datasetsize,
                                            keys=keys,
                                            left_window=left_window,
                                            right_window=right_window,
                                            verbose=verbose)
            self.cpu_cache = seq_dh.SequenceCache(self.disk,
                                                  cpu_capacity,
                                                  numdim_list,
                                                  typesize=typesize,
                                                  randomize=randomize,
                                                  left_window=left_window,
                                                  right_window=right_window,
                                                  verbose=verbose)
            self.gpu_cache = seq_dh.SequenceGPUCache(self.cpu_cache,
                                                     gpu_capacity,
                                                     numdim_list,
                                                     typesize=typesize,
                                                     randomize=randomize,
                                                     left_window=left_window,
                                                     right_window=right_window,
                                                     verbose=verbose,
                                                     batchsize=batchsize)
        else:
            self.disk = Disk(filenames,
                             numdim_list,
                             datasetsize,
                             keys=keys,
                             verbose=self.verbose)
            self.cpu_cache = Cache(self.disk,
                                   cpu_capacity,
                                   numdim_list,
                                   typesize=typesize,
                                   randomize=randomize,
                                   verbose=self.verbose)
            self.gpu_cache = GPUCache(self.cpu_cache,
                                      gpu_capacity,
                                      numdim_list,
                                      typesize=typesize,
                                      randomize=randomize,
                                      verbose=self.verbose,
                                      shift=shift,
                                      add_noise=add_noise,
                                      center_only=not is_train,
                                      shift_amt_x=shift_amt_x,
                                      shift_amt_y=shift_amt_y)
        for i, stats_file in enumerate(stats_files):
            if hyperparameter_list[i].normalize and hyperparameter_list[
                    i].activation != deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
                self.gpu_cache.SetDataStats(i, stats_file)
        self.batchsize = batchsize
        if seq:
            datasetsize = actual_datasetsize
        self.num_batches = datasetsize / batchsize
        if self.get_last_piece and datasetsize % batchsize > 0:
            self.num_batches += 1
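
The cache sizing above is plain byte arithmetic: bytes on disk are datasetsize * numdims * typesize, and each cache is clipped to a whole number of batches. A worked sketch with made-up numbers, using // for the integer division the original relies on under Python 2:

typesize = 4                                      # float32
datasetsize, numdims, batchsize = 50000, 784, 100 # made-up values
total_disk_space = datasetsize * numdims * typesize          # 156,800,000 bytes
vectorsize_bytes = typesize * numdims                        # 3,136 bytes per example
batchsize_bytes = vectorsize_bytes * batchsize               # 313,600 bytes per batch
max_gpu_capacity = (2 * 1024**3 // batchsize_bytes) * batchsize_bytes  # 2G, batch-aligned
gpu_capacity = min(total_disk_space, max_gpu_capacity)       # whole dataset fits on the GPU
num_gpu_batches = gpu_capacity // batchsize_bytes            # 500 batches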
Example #17
def main():
  model_file = sys.argv[1]
  base_output_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  prefix = sys.argv[4]
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]
  model = util.ReadModel(model_file)
  data_pb = deepnet_pb2.Dataset()
  data_pb.name = model.name
  data_pb.gpu_memory = gpu_mem
  data_pb.main_memory = main_mem
  output_dir = os.path.join(base_output_dir, 'validation')
  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
  output_proto_file = os.path.join(base_output_dir, 'data.pbtxt')

  # IMAGE PATHWAY
  img_input_pbtxt = os.path.join(prefix, 'RNAseq.pbtxt')
  img_hidden1_pbtxt = os.path.join(rep_dir, 'RNA1seq_rbm1_LAST', 'data.pbtxt')
  #img_hidden2_pbtxt = os.path.join(rep_dir, 'RNA1seq_rbm2_LAST', 'data.pbtxt')
 
  # TEXT PATHWAY
  text_input_pbtxt = os.path.join(prefix, 'RNAseq.pbtxt')
  text_hidden1_pbtxt = os.path.join(rep_dir, 'RNA2seq_rbm1_LAST', 'data.pbtxt')
  #text_hidden2_pbtxt = os.path.join(rep_dir, 'RNA2seq_rbm2_LAST', 'data.pbtxt')
  #text_pbtxt_z = os.path.join(rep_dir, 'generated_text', 'data.pbtxt')
  
  joint_pbtxt = os.path.join(rep_dir, 'joint_rbm_LAST', 'data.pbtxt')
  joint2_pbtxt = os.path.join(rep_dir, 'joint_rbm2_LAST', 'data.pbtxt')

  
  img_input_pb = util.ReadData(img_input_pbtxt)
  data = next(d for d in img_input_pb.data if d.name == 'RNA1seq_train')
  data.file_pattern = os.path.join(img_input_pb.prefix, data.file_pattern)
  #data.stats_file = os.path.join(img_input_pb.prefix, data.stats_file)
  data.name = 'RNA1seq_input'
  data_pb.data.extend([data])

  img_hidden1_pb = util.ReadData(img_hidden1_pbtxt)
  data = next(d for d in img_hidden1_pb.data if d.name == 'RNA1seq_hidden1_train')
  data.file_pattern = os.path.join(img_hidden1_pb.prefix, data.file_pattern)
  data.name = 'RNA1seq_hidden1'
  data_pb.data.extend([data])

  #img_hidden2_pb = util.ReadData(img_hidden2_pbtxt)
  #data = next(d for d in img_hidden2_pb.data if d.name == 'RNA1seq_hidden2_train')
  #data.file_pattern = os.path.join(img_hidden2_pb.prefix, data.file_pattern)
  #data.name = 'RNA1seq_hidden2'
  #data_pb.data.extend([data])
  
  #indices_file = os.path.join(prefix, 'text', 'indices_labelled.npz')
  #indices = np.load(indices_file)
  #nnz_indices = indices['nnz_indices']
  #z_indices = indices['z_indices']

  #text_pb_z = util.ReadData(text_pbtxt_z)
  text_input_pb = util.ReadData(text_input_pbtxt)
  data = next(d for d in text_input_pb.data if d.name == 'RNA2seq_train')
  data.file_pattern = os.path.join(text_input_pb.prefix, data.file_pattern)
  data.name = 'RNA2seq_input'
  data_pb.data.extend([data])

  text_hidden1_pb = util.ReadData(text_hidden1_pbtxt)
  data = next(d for d in text_hidden1_pb.data if d.name == 'RNA2seq_hidden1_train')
  data.file_pattern = os.path.join(text_hidden1_pb.prefix, data.file_pattern)
  data.name = 'RNA2seq_hidden1'
  data_pb.data.extend([data])

  #text_hidden2_pb = util.ReadData(text_hidden2_pbtxt)
  #data = next(d for d in text_hidden2_pb.data if d.name == 'RNA2seq_hidden2_train')
  #data.file_pattern = os.path.join(text_hidden2_pb.prefix, data.file_pattern)
  #data.name = 'RNA2seq_hidden2'
  #data_pb.data.extend([data])

  joint_pb = util.ReadData(joint_pbtxt)
  data = next(d for d in joint_pb.data if d.name == 'joint_hidden_train')
  data.file_pattern = os.path.join(joint_pb.prefix, data.file_pattern)
  data.name = 'joint_hidden'
  data_pb.data.extend([data])

  joint2_pb = util.ReadData(joint2_pbtxt)
  data = next(d for d in joint2_pb.data if d.name == 'joint_hidden2_train')
  data.file_pattern = os.path.join(joint2_pb.prefix, data.file_pattern)
  data.name = 'joint_hidden2'
  data_pb.data.extend([data])

  with open(output_proto_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)