def main():
  data_dir = sys.argv[1]
  model_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  gpu_mem = sys.argv[4]
  main_mem = sys.argv[5]
  numsplits = int(sys.argv[6])

  # Rewrite paths and memory limits in the main data config.
  data_pbtxt_file = os.path.join(data_dir, 'flickr.pbtxt')
  data_pb = util.ReadData(data_pbtxt_file)
  EditPaths(data_pb, data_dir, gpu_mem, main_mem)
  with open(data_pbtxt_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)
  EditTrainers(data_dir, model_dir, rep_dir, numsplits)

  # Split the text data into zero (z) and non-zero (nnz) subsets, if needed.
  data_pbtxt_file_z = os.path.join(data_dir, 'flickr_z.pbtxt')
  data_pbtxt_file_nnz = os.path.join(data_dir, 'flickr_nnz.pbtxt')
  if not os.path.exists(data_pbtxt_file_z):
    CreateMissingTextData(data_pb, data_pbtxt_file_z, data_pbtxt_file_nnz)

  data_pb = util.ReadData(data_pbtxt_file_z)
  EditPaths(data_pb, data_dir, gpu_mem, main_mem)
  with open(data_pbtxt_file_z, 'w') as f:
    text_format.PrintMessage(data_pb, f)

  data_pb = util.ReadData(data_pbtxt_file_nnz)
  EditPaths(data_pb, data_dir, gpu_mem, main_mem)
  with open(data_pbtxt_file_nnz, 'w') as f:
    text_format.PrintMessage(data_pb, f)
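# Example invocation (illustrative; the script name 'setup_flickr.py' and all
# paths are assumptions, not taken from this file):
#   python setup_flickr.py flickr/ models/ reps/ 2G 20G 5
# argv: data_dir model_dir rep_dir gpu_mem main_mem numsplits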
def main():
  # Input parameters.
  data_dir = sys.argv[1]
  model_dir = sys.argv[2]
  representation_dir = sys.argv[3]
  avdata_pbtxt_file = os.path.join(data_dir, 'audiovisualdata.pbtxt')
  vdata_pbtxt_file = os.path.join(data_dir, 'visualonlydata.pbtxt')
  gpu_mem = sys.argv[4]
  main_mem = sys.argv[5]

  # Edit the audiovisual data configuration file.
  avdata_pb = util.ReadData(avdata_pbtxt_file)
  avdata_pb.gpu_memory = gpu_mem
  avdata_pb.main_memory = main_mem
  avdata_pb.prefix = data_dir
  with open(avdata_pbtxt_file, 'w') as f:
    text_format.PrintMessage(avdata_pb, f)

  # Edit the visual-only data configuration file.
  vdata_pb = util.ReadData(vdata_pbtxt_file)
  vdata_pb.gpu_memory = gpu_mem
  vdata_pb.main_memory = main_mem
  vdata_pb.prefix = data_dir
  with open(vdata_pbtxt_file, 'w') as f:
    text_format.PrintMessage(vdata_pb, f)

  # Set up the trainer configuration file.
  SetUpTrainer(data_dir, model_dir, representation_dir)
def MakeDict(data_pbtxt):
  # Map each dataset name to its loaded representation and stats file.
  data_pb = util.ReadData(data_pbtxt)
  rep_dict = {}
  stats_files = {}
  for data in data_pb.data:
    rep_dict[data.name] = Load(data.file_pattern)
    stats_files[data.name] = data.stats_file
  return rep_dict, stats_files
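# Minimal usage sketch (illustrative; the pbtxt path and dataset name are
# assumptions, and Load() is this module's own loader for a file pattern):
#   rep_dict, stats_files = MakeDict('reps/joint_rbm_LAST/data.pbtxt')
#   train_reps = rep_dict['joint_hidden_train']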
def GetDataHandles(op, names, hyp_list, verbose=False):
  """Returns a list of data handlers.

  This method is the top-level routine for creating data handlers. It takes a
  description of which datasets to load and returns data handlers to access
  them.

  Args:
    op: Operation protocol buffer.
    names: List of lists of data names. The top-level list corresponds to
      train, validation and test sets. The lower-level lists correspond to
      data modalities.
    hyp_list: List of hyperparameters, one for each modality.
    verbose: If True, will print out details of what is happening.

  Returns:
    A list of DataHandler objects.
  """
  typesize = 4
  data_proto_file = os.path.join(op.data_proto_prefix, op.data_proto)
  dataset_proto = util.ReadData(data_proto_file)
  handlers = []
  if dataset_proto.data_handler == 'deepnet':
    # Compute the on-disk size of each set so that GPU/CPU memory can be
    # divided among the handlers in proportion to the data they serve.
    size_list = []
    for name_list in names:
      size = 0
      for name in name_list:
        try:
          data_proto = next(d for d in dataset_proto.data if d.name == name)
        except StopIteration as e:
          print '%s not found in data pbtxt' % name
          raise e
        datasetsize = data_proto.size
        numdims = np.prod(np.array(data_proto.dimensions))
        size += datasetsize * numdims * typesize
      size_list.append(size)
    total_size = sum(size_list)
    proportions = [float(size) / total_size for size in size_list]
    for i, name_list in enumerate(names):
      if name_list == []:
        handlers.append(None)
      else:
        handlers.append(
          DataHandler(op, name_list, hyp_list, frac=proportions[i]))
  elif dataset_proto.data_handler == 'navdeep':
    import navdeep_datahandler
    for i, name_list in enumerate(names):
      if name_list == []:
        handlers.append(None)
      else:
        handlers.append(
          navdeep_datahandler.NavdeepDataHandler(
            op, dataset_proto, name_list, hyp_list))
  return handlers
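# Usage sketch (illustrative; the operation pbtxt path, data names and
# hyperparameter objects are assumptions). 'names' is nested: the outer list
# is train/validation/test, the inner lists are modalities; an empty inner
# list yields a None handler:
#   op = util.ReadOperation('trainers/train_rbm1.pbtxt')
#   train_h, valid_h, test_h = GetDataHandles(
#       op,
#       [['image_train', 'label_train'],
#        ['image_validation', 'label_validation'],
#        []],  # no test set -> handler is None
#       [image_hyp, label_hyp])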
def main():
  data_dir = sys.argv[1]
  model_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  gpu_mem = sys.argv[4]
  main_mem = sys.argv[5]
  numsplits = int(sys.argv[6])

  data_pbtxt_file = os.path.join(data_dir, 'RNAseq.pbtxt')
  data_pb = util.ReadData(data_pbtxt_file)
  EditPaths(data_pb, data_dir, gpu_mem, main_mem)
  with open(data_pbtxt_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)
  EditTrainers(data_dir, model_dir, rep_dir, numsplits)
def change_data(proto, dimensions, datas=None):
  # NOTE: 'dimensions' was read from an undefined name in the original; it is
  # now an explicit argument giving the new first dimension for each dataset.
  proto_cont = util.ReadData(proto)
  if datas is None:
    # Default to the standard representation-layer names.
    datas = []
    for m in ['image', 'text']:
      for i in [1, 2, 3]:
        for t in ['train', 'validation', 'test']:
          datas += [m + '_hidden' + str(i) + '_' + t]
          datas += ['bae_' + m + '_hidden' + str(i) + '_' + t]
          datas += ['bae_' + m + '_hidden' + str(i) + '_' + t + '_all']
          datas += ['corr_' + m + '_hidden' + str(i) + '_' + t]
  for data in datas:
    try:
      data_proto = next(lay for lay in proto_cont.data if lay.name == data)
      data_proto.dimensions[0] = dimensions
    except StopIteration:
      pass
  with open(proto, 'w') as f:
    text_format.PrintMessage(proto_cont, f)
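# Usage sketch (illustrative; the pbtxt path and width are assumptions): set
# the first dimension of every default-named representation layer to 1024.
#   change_data('reps/data.pbtxt', 1024)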
def withPbtxt(dbPbtxt, modality1, modality2, outputpath):
  datapb = util.ReadData(dbPbtxt)
  datasets = ["train", "validation", "test"]
  datapbNew = deepnet_pb2.Dataset()
  namePrefix = modality1 + "_" + modality2 + "_"
  datapbNew.prefix = outputpath
  datapbNew.name = namePrefix + "combined_input"
  for dataset in datasets:
    fileNames1 = []
    fileNames2 = []
    for dataEntry in datapb.data:
      if modality1 in dataEntry.name and dataset in dataEntry.name:
        fileNames1 = sorted(
          glob.glob(os.path.join(datapb.prefix, dataEntry.file_pattern)))
      if modality2 in dataEntry.name and dataset in dataEntry.name:
        fileNames2 = sorted(
          glob.glob(os.path.join(datapb.prefix, dataEntry.file_pattern)))
    for i, (file1, file2) in enumerate(zip(fileNames1, fileNames2)):
      data1 = np.load(file1)
      data2 = np.load(file2)
      # Concatenate the two modalities feature-wise, then stack shards
      # example-wise.
      dataCombined = np.concatenate((data1, data2), axis=1)
      if i == 0:
        data = dataCombined
      else:
        data = np.concatenate((data, dataCombined), axis=0)
    if not os.path.exists(os.path.join(outputpath, dataset)):
      os.makedirs(os.path.join(outputpath, dataset))
    np.save(os.path.join(outputpath, dataset, "data"), data)
    dataItem = deepnet_pb2.Dataset.Data()
    dataItem.name = namePrefix + "combined_" + dataset
    dataItem.dimensions.extend([data.shape[1]])
    dataItem.size = data.shape[0]
    dataItem.file_pattern = os.path.join(dataset, "data.npy")
    datapbNew.data.extend([dataItem])
  with open(os.path.join(outputpath, "input_data.pbtxt"), 'w') as f:
    text_format.PrintMessage(datapbNew, f)
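# Usage sketch (illustrative; paths and modality names are assumptions):
# concatenates matching image/text .npy shards feature-wise and writes one
# combined array per split plus an input_data.pbtxt describing them.
#   withPbtxt('flickr/data.pbtxt', 'image', 'text', '/tmp/combined')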
from deepnet import util
import numpy as np
import os

if __name__ == '__main__':
  from argparse import ArgumentParser
  parser = ArgumentParser()
  parser.add_argument("--pfamid")
  args = parser.parse_args()
  pfamid = args.pfamid

  data_pbtxt_file = os.path.join(pfamid, 'data.pbtxt')
  data_pbtxt = util.ReadData(data_pbtxt_file)
  data_pbtxt.name = pfamid
  data_pbtxt.prefix = os.path.join(
    os.path.split(data_pbtxt.prefix)[0], pfamid)
  for data in data_pbtxt.data:
    for t in ('train', 'valid', 'test'):
      if t in data.name:
        # Refresh size, dimensionality and path from the actual .npy file.
        X = np.load(os.path.join(pfamid, pfamid + "_" + t + ".npy"))
        data.size = X.shape[0]
        data.dimensions[0] = X.shape[1]
        data.file_pattern = os.path.abspath(
          os.path.join(pfamid, pfamid + "_" + t + ".npy"))
  util.WritePbtxt(data_pbtxt_file, data_pbtxt)
def main():
  from argparse import ArgumentParser
  parser = ArgumentParser()
  parser.add_argument("--model", type=str, help="Name of model")
  parser.add_argument("--data_dir", type=str, help="Data directory location")
  parser.add_argument("--model_dir", type=str,
                      help="Directory to write models to")
  parser.add_argument("--rep_dir", type=str,
                      help="Directory to write representations to")
  parser.add_argument("--gpu_mem", type=str, default="3G", help="GPU memory")
  parser.add_argument("--main_mem", type=str, default="30G", help="CPU memory")
  parser.add_argument("--base_epsilon", type=float, default=0.1,
                      help="base epsilon rate")
  parser.add_argument("--sparsity", action="store_true", help="apply sparsity")
  parser.add_argument("--dropout", action="store_true", help="apply dropout")
  parser.add_argument("--l2_decay", type=float, default=0.01,
                      help="l2 decay cost")
  parser.add_argument("--initial_momentum", type=float, default=0.5,
                      help="initial momentum")
  parser.add_argument("--final_momentum", type=float, default=0.9,
                      help="final momentum")
  parser.add_argument("--input_width", type=int,
                      help="number of nodes in input_layer")
  parser.add_argument("--input_numlabels", type=int, default=21,
                      help="number of states for nodes in input_layer")
  parser.add_argument("--hidden1_width", type=int, default=100,
                      help="number of nodes in hidden layer")
  parser.add_argument("--hidden2_width", type=int, default=100,
                      help="number of nodes in hidden layer")
  parser.add_argument("--bernoulli2_hidden1_width", type=int, default=10,
                      help="number of nodes in hidden layer")
  parser.add_argument("--batchsize", type=int, default=1000, help="batchsize")
  parser.add_argument("--steps", type=int, default=100000,
                      help="training steps")
  args = parser.parse_args()

  if (not args.data_dir or not args.model_dir or not args.rep_dir or
      not args.input_width):
    raise ValueError("Required input not provided")
  if 'GERBILPATH' not in os.environ:
    raise EnvironmentError('Please set GERBILPATH')

  deepnet_path = os.path.join(os.environ['GERBILPATH'], 'deepnet')
  args.data_dir = os.path.join(deepnet_path, args.data_dir)
  args.model_dir = os.path.join(deepnet_path, args.model_dir)
  args.rep_dir = os.path.join(deepnet_path, args.rep_dir)

  data_pbtxt_file = os.path.join(args.data_dir, "data.pbtxt")
  data_pb = util.ReadData(data_pbtxt_file)
  EditPaths(data_pb, args)
  with open(data_pbtxt_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)
  EditTrainers(args)
  if args.model in ['lcdbm', 'dbm']:
    EditModelsDBM(args)
  else:
    EditModels(args)
if skipTraining:
  for dataset in datasets:
    # libsvm's Python bindings expect plain lists, not numpy arrays.
    dataMap[dataset]["label"] = dataMap[dataset]["label"].tolist()
    dataMap[dataset]["data"] = dataMap[dataset]["data"].tolist()
  prob = svm_problem(dataMap["train"]["label"], dataMap["train"]["data"])
  param = svm_parameter('-t 2 -c 4 -b 1')  # RBF kernel, C=4, probabilities on
  m = svm_train(prob, param)
  svm_save_model(os.path.join(outputPath, modality + '_svm.model'), m)
  p_label, p_acc, p_val = svm_predict(
    dataMap["validation"]["label"], dataMap["validation"]["data"], m, '-b 1')
  ACC, MSE, SCC = evaluations(dataMap["validation"]["label"], p_label)
  print "ACC on validation set: " + repr(ACC)
  p_label, p_acc, p_val = svm_predict(
    dataMap["test"]["label"], dataMap["test"]["data"], m, '-b 1')
  ACC, MSE, SCC = evaluations(dataMap["test"]["label"], p_label)
  print "ACC on test set: " + repr(ACC)


if __name__ == '__main__':
  pbPath = sys.argv[1]
  modality = sys.argv[2]
  outputPath = sys.argv[3]
  # Default the optional flags so they are defined even when not passed on
  # the command line (previously a NameError when argv was short).
  saveFile = False
  skipTraining = False
  if len(sys.argv) > 4:
    saveFile = sys.argv[4].upper() == "TRUE"
  if len(sys.argv) > 5:
    skipTraining = sys.argv[5].upper() == "TRUE"
  datapb = util.ReadData(pbPath)
  fromPb(datapb, modality, outputPath, saveFile, skipTraining)
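# Example invocation (illustrative; the script name 'run_svm.py' is an
# assumption, as are the paths):
#   python run_svm.py reps/data.pbtxt image svm_out/ TRUE FALSE
# argv: pbPath modality outputPath [saveFile] [skipTraining]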
def main():
  model_file = sys.argv[1]
  base_output_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  prefix = sys.argv[4]
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]

  model = util.ReadModel(model_file)
  data_pb = deepnet_pb2.Dataset()
  data_pb.name = model.name
  data_pb.gpu_memory = gpu_mem
  data_pb.main_memory = main_mem
  output_dir = os.path.join(base_output_dir, 'validation')
  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
  output_proto_file = os.path.join(base_output_dir, 'data.pbtxt')

  # IMAGE PATHWAY
  img_input_pbtxt = os.path.join(prefix, 'flickr.pbtxt')
  img_hidden1_pbtxt = os.path.join(rep_dir, 'image_rbm1_LAST', 'data.pbtxt')
  img_hidden2_pbtxt = os.path.join(rep_dir, 'image_rbm2_LAST', 'data.pbtxt')

  # TEXT PATHWAY
  text_input_pbtxt = os.path.join(prefix, 'flickr_nnz.pbtxt')
  text_hidden1_pbtxt = os.path.join(rep_dir, 'text_rbm1_LAST', 'data.pbtxt')
  text_hidden2_pbtxt = os.path.join(rep_dir, 'text_rbm2_LAST', 'data.pbtxt')
  text_pbtxt_z = os.path.join(rep_dir, 'generated_text', 'data.pbtxt')

  joint_pbtxt = os.path.join(rep_dir, 'joint_rbm_LAST', 'data.pbtxt')

  img_input_pb = util.ReadData(img_input_pbtxt)
  data = next(d for d in img_input_pb.data if d.name == 'image_labelled')
  data.file_pattern = os.path.join(img_input_pb.prefix, data.file_pattern)
  data.stats_file = os.path.join(img_input_pb.prefix, data.stats_file)
  data.name = 'image_input'
  data_pb.data.extend([data])

  img_hidden1_pb = util.ReadData(img_hidden1_pbtxt)
  data = next(d for d in img_hidden1_pb.data
              if d.name == 'image_hidden1_validation')
  data.file_pattern = os.path.join(img_hidden1_pb.prefix, data.file_pattern)
  data.name = 'image_hidden1'
  data_pb.data.extend([data])

  img_hidden2_pb = util.ReadData(img_hidden2_pbtxt)
  data = next(d for d in img_hidden2_pb.data
              if d.name == 'image_hidden2_validation')
  data.file_pattern = os.path.join(img_hidden2_pb.prefix, data.file_pattern)
  data.name = 'image_hidden2'
  data_pb.data.extend([data])

  # Indices of examples whose text is present (nnz) vs. missing (z).
  indices_file = os.path.join(prefix, 'text', 'indices_labelled.npz')
  indices = np.load(indices_file)
  nnz_indices = indices['nnz_indices']
  z_indices = indices['z_indices']
  text_pb_z = util.ReadData(text_pbtxt_z)

  text_input_pb = util.ReadData(text_input_pbtxt)
  data_nnz = next(d for d in text_input_pb.data if d.name == 'text_labelled')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'text_input_layer_validation')
  output_file = os.path.join(output_dir, 'text_input-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               text_input_pb.prefix, 'text_input', output_file)
  data_pb.data.extend([data])

  text_hidden1_pb = util.ReadData(text_hidden1_pbtxt)
  data_nnz = next(d for d in text_hidden1_pb.data
                  if d.name == 'text_hidden1_validation')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'text_hidden1_validation')
  output_file = os.path.join(output_dir, 'text_hidden1-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               text_hidden1_pb.prefix, 'text_hidden1', output_file)
  data_pb.data.extend([data])

  text_hidden2_pb = util.ReadData(text_hidden2_pbtxt)
  data_nnz = next(d for d in text_hidden2_pb.data
                  if d.name == 'text_hidden2_validation')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'text_hidden2_validation')
  output_file = os.path.join(output_dir, 'text_hidden2-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               text_hidden2_pb.prefix, 'text_hidden2', output_file)
  data_pb.data.extend([data])

  joint_pb = util.ReadData(joint_pbtxt)
  data_nnz = next(d for d in joint_pb.data
                  if d.name == 'joint_hidden_validation')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'joint_hidden_validation')
  output_file = os.path.join(output_dir, 'joint_hidden-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               joint_pb.prefix, 'joint_hidden', output_file)
  data_pb.data.extend([data])

  with open(output_proto_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)
def SetupDataPbtxt(data_pbtxt_file, data_path):
  # Point each dataset's file_pattern at data_path, keeping the base name.
  data_pbtxt = util.ReadData(data_pbtxt_file)
  for data in data_pbtxt.data:
    fname = os.path.basename(data.file_pattern)
    data.file_pattern = os.path.join(data_path, fname)
  util.WritePbtxt(data_pbtxt_file, data_pbtxt)
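# Usage sketch (illustrative; both paths are assumptions): repoint every
# file_pattern in data.pbtxt at /data/mnist, keeping base file names.
#   SetupDataPbtxt('mnist/data.pbtxt', '/data/mnist')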
from deepnet import util
from deepnet import deepnet_pb2
import sys, os
from google.protobuf import text_format

proto1 = sys.argv[1]
proto2 = sys.argv[2]
output_pbtxt = sys.argv[3]

out_dir = '/'.join(output_pbtxt.split('/')[:-1])
if out_dir and not os.path.isdir(out_dir):
  os.makedirs(out_dir)

dataset1 = util.ReadData(proto1)
name1 = dataset1.name
dataset2 = util.ReadData(proto2)
name2 = dataset2.name
dataset1_prefix = dataset1.prefix
dataset2_prefix = dataset2.prefix
prefix = os.path.commonprefix([dataset1_prefix, dataset2_prefix])
if dataset1_prefix != dataset2_prefix:
  # Rebase each dataset's relative paths onto the common prefix.
  for dataset in [dataset1, dataset2]:
    _prefix = dataset.prefix[len(prefix):]
    for d in dataset.data:
      if d.file_pattern:
        d.file_pattern = os.path.join(_prefix, d.file_pattern)
      if d.stats_file:
        # The original assigned this to d.file_pattern, clobbering the
        # pattern just set above; the stats path belongs in stats_file.
        d.stats_file = os.path.join(_prefix, d.stats_file)
dataset1.MergeFrom(dataset2)
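# The snippet ends after MergeFrom without writing anything, although
# output_pbtxt is parsed and its directory created above. A minimal sketch of
# the presumed final step, assuming the common prefix becomes the merged
# dataset's prefix:
dataset1.prefix = prefix
with open(output_pbtxt, 'w') as f:
  text_format.PrintMessage(dataset1, f)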
def main():
  from argparse import ArgumentParser
  parser = ArgumentParser()
  parser.add_argument("--data_dir", type=str, help="Data directory location")
  parser.add_argument("--model_dir", type=str,
                      help="Directory to write models to")
  parser.add_argument("--rep_dir", type=str,
                      help="Directory to write representations to")
  parser.add_argument("--gpu_mem", type=str, default="3G", help="GPU memory")
  parser.add_argument("--main_mem", type=str, default="30G", help="CPU memory")
  parser.add_argument("--base_epsilon", type=float, default=0.1,
                      help="base epsilon rate")
  parser.add_argument("--epsilon_decay", action="store_true",
                      help="epsilon decay mechanism")
  parser.add_argument("--sparsity", action="store_true", help="apply sparsity")
  parser.add_argument("--dropout", action="store_true", help="apply dropout")
  parser.add_argument("--l2_decay", type=float, default=0.01,
                      help="l2 decay cost")
  parser.add_argument("--initial_momentum", type=float, default=0.5,
                      help="initial momentum")
  parser.add_argument("--final_momentum", type=float, default=0.9,
                      help="final momentum")
  parser.add_argument("--input_width", type=int,
                      help="number of nodes in input_layer")
  parser.add_argument("--input_numlabels", type=int, default=21,
                      help="number of states for nodes in input_layer")
  parser.add_argument("--hidden1_width", type=int, default=500,
                      help="number of nodes in hidden1")
  parser.add_argument("--hidden2_width", type=int, default=500,
                      help="number of nodes in hidden2")
  parser.add_argument("--batchsize", type=int, default=1000, help="batchsize")
  parser.add_argument("--local", action="store_true",
                      help="Run locally on langmead.pc")
  args = parser.parse_args()

  if (not args.data_dir or not args.model_dir or not args.rep_dir or
      not args.input_width):
    raise ValueError("Required input not provided")

  deepnet_path = awsutil.get_deepnet_path()
  args.data_dir = os.path.join(deepnet_path, args.data_dir)
  args.model_dir = os.path.join(deepnet_path, args.model_dir)
  args.rep_dir = os.path.join(deepnet_path, args.rep_dir)

  data_pbtxt_file = os.path.join(args.data_dir, "data.pbtxt")
  data_pb = util.ReadData(data_pbtxt_file)
  EditPaths(data_pb, args)
  with open(data_pbtxt_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)
  EditTrainers(args)
  EditModels(args)
def MakeDataPbtxt(data_pbtxt_file, data_path):
  # Start from the mnist.pbtxt template, rewrite its paths, and save the
  # result to data_pbtxt_file.
  data_pbtxt = util.ReadData('mnist.pbtxt')
  for data in data_pbtxt.data:
    fname = os.path.basename(data.file_pattern)
    data.file_pattern = os.path.join(data_path, fname)
  util.WritePbtxt(data_pbtxt_file, data_pbtxt)
  def __init__(self, op, data_name_list, hyperparameter_list, frac=1.0):
    """Initializes a DataHandler.

    Args:
      op: Operation protocol buffer.
      data_name_list: List of data names that should be put together. (Usually
        refers to a list of different modalities, e.g., ['data', 'label'] or
        ['image', 'audio'].)
      hyperparameter_list: List of hyperparameters, one for each modality.
      frac: Fraction of the total memory that this data handler should use.
    """
    filenames = []
    numdim_list = []
    datasetsize = None
    left_window = []
    right_window = []
    stats_files = []
    shift = []
    add_noise = []
    shift_amt_x = []
    shift_amt_y = []
    keys = []
    typesize = 4
    if isinstance(op, str):
      op = util.ReadOperation(op)
    self.verbose = op.verbose
    verbose = self.verbose
    data_proto_file = os.path.join(op.data_proto_prefix, op.data_proto)
    dataset_proto = util.ReadData(data_proto_file)
    seq = False
    is_train = False
    for name, hyp in zip(data_name_list, hyperparameter_list):
      data_proto = next(d for d in dataset_proto.data if d.name == name)
      file_pattern = os.path.join(dataset_proto.prefix,
                                  data_proto.file_pattern)
      filenames.append(sorted(glob.glob(file_pattern)))
      stats_files.append(
        os.path.join(dataset_proto.prefix, data_proto.stats_file))
      numdims = np.prod(np.array(data_proto.dimensions))
      if not data_proto.sparse:
        numdims *= data_proto.num_labels
      numdim_list.append(numdims)
      seq = seq or data_proto.seq
      left_window.append(hyp.left_window)
      right_window.append(hyp.right_window)
      add_noise.append(hyp.add_noise)
      shift.append(hyp.shift)
      shift_amt_x.append(hyp.shift_amt_x)
      shift_amt_y.append(hyp.shift_amt_y)
      keys.append(data_proto.key)
      is_train = 'train' in name  # HACK - Fix this!
      if datasetsize is None:
        datasetsize = data_proto.size
      else:
        assert datasetsize == data_proto.size, 'Size of %s is not %d' % (
          name, datasetsize)

    # Add space for padding.
    if seq:
      max_rw = max(right_window)
      max_lw = max(left_window)
      actual_datasetsize = datasetsize
      datasetsize += len(filenames[0]) * (max_rw + max_lw)

    numdims = sum(numdim_list)
    batchsize = op.batchsize
    randomize = op.randomize
    self.get_last_piece = op.get_last_piece

    # Compute size of each cache.
    total_disk_space = datasetsize * numdims * typesize
    max_gpu_capacity = int(frac * GetBytes(dataset_proto.gpu_memory))
    max_cpu_capacity = int(frac * GetBytes(dataset_proto.main_memory))

    # Each capacity should correspond to an integral number of batches.
    vectorsize_bytes = typesize * numdims
    batchsize_bytes = vectorsize_bytes * batchsize
    max_gpu_capacity = (max_gpu_capacity / batchsize_bytes) * batchsize_bytes
    #max_cpu_capacity = (max_cpu_capacity / batchsize_bytes) * batchsize_bytes

    # Don't need more than total dataset size.
    gpu_capacity = min(total_disk_space, max_gpu_capacity)
    cpu_capacity = min(total_disk_space, max_cpu_capacity)
    num_gpu_batches = gpu_capacity / batchsize_bytes
    num_cpu_batches = cpu_capacity / batchsize_bytes

    gpu_left_overs = gpu_capacity / vectorsize_bytes - num_gpu_batches * batchsize
    cpu_left_overs = cpu_capacity / vectorsize_bytes - num_cpu_batches * batchsize

    if self.verbose:
      if seq:
        num_valid_gpu_vectors = (
          gpu_capacity / vectorsize_bytes) - len(filenames[0]) * (max_rw + max_lw)
        print num_valid_gpu_vectors
      else:
        print 'Batches in GPU memory: %d + leftovers %d' % (
          num_gpu_batches, gpu_left_overs)
        print 'Batches in main memory: %d + leftovers %d' % (
          num_cpu_batches, cpu_left_overs)
        print 'Batches in disk: %d + leftovers %d' % (
          datasetsize / batchsize, datasetsize % batchsize)

    if seq:
      import sequence_datahandler as seq_dh
      self.disk = seq_dh.SequenceDisk(
        filenames, numdim_list, datasetsize, keys=keys,
        left_window=left_window, right_window=right_window, verbose=verbose)
      self.cpu_cache = seq_dh.SequenceCache(
        self.disk, cpu_capacity, numdim_list, typesize=typesize,
        randomize=randomize, left_window=left_window,
        right_window=right_window, verbose=verbose)
      self.gpu_cache = seq_dh.SequenceGPUCache(
        self.cpu_cache, gpu_capacity, numdim_list, typesize=typesize,
        randomize=randomize, left_window=left_window,
        right_window=right_window, verbose=verbose, batchsize=batchsize)
    else:
      self.disk = Disk(filenames, numdim_list, datasetsize, keys=keys,
                       verbose=self.verbose)
      self.cpu_cache = Cache(
        self.disk, cpu_capacity, numdim_list, typesize=typesize,
        randomize=randomize, verbose=self.verbose)
      self.gpu_cache = GPUCache(
        self.cpu_cache, gpu_capacity, numdim_list, typesize=typesize,
        randomize=randomize, verbose=self.verbose, shift=shift,
        add_noise=add_noise, center_only=not is_train,
        shift_amt_x=shift_amt_x, shift_amt_y=shift_amt_y)

    for i, stats_file in enumerate(stats_files):
      if hyperparameter_list[i].normalize and hyperparameter_list[i].activation != \
         deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
        self.gpu_cache.SetDataStats(i, stats_file)

    self.batchsize = batchsize
    if seq:
      datasetsize = actual_datasetsize
    self.num_batches = datasetsize / batchsize
    if self.get_last_piece and datasetsize % batchsize > 0:
      self.num_batches += 1
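# Worked example of the cache-sizing arithmetic above (illustrative numbers;
# assumes GetBytes('3G') returns 3 * 2**30 bytes):
#   numdims = 4096, typesize = 4, batchsize = 100
#   vectorsize_bytes = 4 * 4096    = 16384
#   batchsize_bytes  = 16384 * 100 = 1638400   (~1.6 MB per batch)
#   max_gpu_capacity = (3221225472 / 1638400) * 1638400
# i.e. the 3G budget is rounded down to 1966 whole batches in GPU memory,
# and anything beyond the dataset's total disk footprint is never allocated.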
def main():
  model_file = sys.argv[1]
  base_output_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  prefix = sys.argv[4]
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]

  model = util.ReadModel(model_file)
  data_pb = deepnet_pb2.Dataset()
  data_pb.name = model.name
  data_pb.gpu_memory = gpu_mem
  data_pb.main_memory = main_mem
  output_dir = os.path.join(base_output_dir, 'validation')
  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
  output_proto_file = os.path.join(base_output_dir, 'data.pbtxt')

  # FIRST PATHWAY (RNA1seq); the img_/text_ variable names are kept from the
  # image-text version of this script.
  img_input_pbtxt = os.path.join(prefix, 'RNAseq.pbtxt')
  img_hidden1_pbtxt = os.path.join(rep_dir, 'RNA1seq_rbm1_LAST', 'data.pbtxt')
  #img_hidden2_pbtxt = os.path.join(rep_dir, 'RNA1seq_rbm2_LAST', 'data.pbtxt')

  # SECOND PATHWAY (RNA2seq)
  text_input_pbtxt = os.path.join(prefix, 'RNAseq.pbtxt')
  text_hidden1_pbtxt = os.path.join(rep_dir, 'RNA2seq_rbm1_LAST', 'data.pbtxt')
  #text_hidden2_pbtxt = os.path.join(rep_dir, 'RNA2seq_rbm2_LAST', 'data.pbtxt')
  #text_pbtxt_z = os.path.join(rep_dir, 'generated_text', 'data.pbtxt')

  joint_pbtxt = os.path.join(rep_dir, 'joint_rbm_LAST', 'data.pbtxt')
  joint2_pbtxt = os.path.join(rep_dir, 'joint_rbm2_LAST', 'data.pbtxt')

  img_input_pb = util.ReadData(img_input_pbtxt)
  data = next(d for d in img_input_pb.data if d.name == 'RNA1seq_train')
  data.file_pattern = os.path.join(img_input_pb.prefix, data.file_pattern)
  #data.stats_file = os.path.join(img_input_pb.prefix, data.stats_file)
  data.name = 'RNA1seq_input'
  data_pb.data.extend([data])

  img_hidden1_pb = util.ReadData(img_hidden1_pbtxt)
  data = next(d for d in img_hidden1_pb.data
              if d.name == 'RNA1seq_hidden1_train')
  data.file_pattern = os.path.join(img_hidden1_pb.prefix, data.file_pattern)
  data.name = 'RNA1seq_hidden1'
  data_pb.data.extend([data])

  #img_hidden2_pb = util.ReadData(img_hidden2_pbtxt)
  #data = next(d for d in img_hidden2_pb.data
  #            if d.name == 'RNA1seq_hidden2_train')
  #data.file_pattern = os.path.join(img_hidden2_pb.prefix, data.file_pattern)
  #data.name = 'RNA1seq_hidden2'
  #data_pb.data.extend([data])

  #indices_file = os.path.join(prefix, 'text', 'indices_labelled.npz')
  #indices = np.load(indices_file)
  #nnz_indices = indices['nnz_indices']
  #z_indices = indices['z_indices']
  #text_pb_z = util.ReadData(text_pbtxt_z)

  text_input_pb = util.ReadData(text_input_pbtxt)
  data = next(d for d in text_input_pb.data if d.name == 'RNA2seq_train')
  data.file_pattern = os.path.join(text_input_pb.prefix, data.file_pattern)
  data.name = 'RNA2seq_input'
  data_pb.data.extend([data])

  text_hidden1_pb = util.ReadData(text_hidden1_pbtxt)
  data = next(d for d in text_hidden1_pb.data
              if d.name == 'RNA2seq_hidden1_train')
  data.file_pattern = os.path.join(text_hidden1_pb.prefix, data.file_pattern)
  data.name = 'RNA2seq_hidden1'
  data_pb.data.extend([data])

  #text_hidden2_pb = util.ReadData(text_hidden2_pbtxt)
  #data = next(d for d in text_hidden2_pb.data
  #            if d.name == 'RNA2seq_hidden2_train')
  #data.file_pattern = os.path.join(text_hidden2_pb.prefix, data.file_pattern)
  #data.name = 'RNA2seq_hidden2'
  #data_pb.data.extend([data])

  joint_pb = util.ReadData(joint_pbtxt)
  data = next(d for d in joint_pb.data if d.name == 'joint_hidden_train')
  data.file_pattern = os.path.join(joint_pb.prefix, data.file_pattern)
  data.name = 'joint_hidden'
  data_pb.data.extend([data])

  joint2_pb = util.ReadData(joint2_pbtxt)
  data = next(d for d in joint2_pb.data if d.name == 'joint_hidden2_train')
  data.file_pattern = os.path.join(joint2_pb.prefix, data.file_pattern)
  data.name = 'joint_hidden2'
  data_pb.data.extend([data])

  with open(output_proto_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)