Exemplo n.º 1
0
 def ParseOther(self, baseDir, withMS = False):
     """Split the raw ``.npy`` matrices in ``baseDir`` into per-feature files.

     Every ``*.npy`` file directly inside ``baseDir`` that holds a 2-D array
     with exactly 17593 columns is split column-wise into one array per
     entry of ``self.featureGroups`` plus a label vector (the last column).
     The pieces are saved under ``baseDir/withMS`` or ``baseDir/withoutMS``
     and a ``deepnet_pb2.Dataset`` text proto describing them is written to
     the same directory.  Files with any other shape are skipped.

     Args:
         baseDir: directory containing the input ``.npy`` files; also
             stored on ``self.baseDir``.
         withMS: when true, feature ``i`` is the single column range
             ``featureGroupsIndex[i]:featureGroupsIndex[i + 1]``; otherwise
             it is the concatenation of the column ranges listed in
             ``featureGroupsDict[feature]``.
     """
     # Expected width of an input matrix; the last column is the label.
     numColumns = 17593
     labelColumn = numColumns - 1

     self.baseDir = baseDir
     files = glob.glob(os.path.join(baseDir, "*.npy"))
     MS = "withMS" if withMS else "withoutMS"
     dirPath = os.path.join(baseDir, MS)
     outputProtoFile = os.path.join(dirPath, "data_" + MS + ".pbtxt")

     # Resolve the (start, end) column ranges of every feature group once so
     # the proto dimensions and the saved arrays are derived from the same
     # description.
     featureRanges = []
     for i, feature in enumerate(self.featureGroups):
         if withMS:
             ranges = [(self.featureGroupsIndex[i],
                        self.featureGroupsIndex[i + 1])]
         else:
             ranges = self.featureGroupsDict[feature]
         featureRanges.append((feature, ranges))

     dataPb = deepnet_pb2.Dataset()
     dataPb.name = os.path.basename(baseDir) + "_" + MS
     dataPb.prefix = dirPath
     for feature, ranges in featureRanges:
         data = deepnet_pb2.Dataset.Data()
         data.name = feature + "_" + os.path.basename(baseDir)
         data.file_pattern = "*" + feature + "*.npy"
         data.dimensions.extend([sum(r[1] - r[0] for r in ranges)])
         dataPb.data.extend([data])

     data = deepnet_pb2.Dataset.Data()
     data.name = "label_" + os.path.basename(baseDir)
     data.dimensions.extend([1])
     data.file_pattern = "*label.npy"
     dataPb.data.extend([data])

     # np.save and open() do not create missing parent directories.
     if not os.path.isdir(dirPath):
         os.makedirs(dirPath)

     instanceCount = 0
     for fileEntry in files:
         tempData = np.load(fileEntry)
         if len(tempData.shape) == 1 or tempData.shape[1] != numColumns:
             continue
         instanceCount += tempData.shape[0]
         baseName = os.path.splitext(os.path.basename(fileEntry))[0]
         fileName = os.path.join(dirPath, baseName) + "_" + MS
         np.save(fileName + '_label.npy', tempData[:, labelColumn])
         for feature, ranges in featureRanges:
             parts = [tempData[:, r[0]:r[1]] for r in ranges]
             if len(parts) == 1:
                 featureData = parts[0]
             else:
                 featureData = np.concatenate(parts, axis=1)
             np.save(fileName + '_' + feature + "_" + MS + ".npy",
                     featureData)

     # Every entry reports the total number of rows across accepted files.
     for entry in dataPb.data:
         entry.size = instanceCount
     with open(outputProtoFile, 'w') as f:
         text_format.PrintMessage(dataPb, f)
Exemplo n.º 2
0
def main():
  """Split the representations and labels into train/valid/test sets.

  Command-line arguments (positional):
    1. data_pbtxt: dataset description passed to ``MakeDict``.
    2. output_dir: directory receiving the split data and ``data.pbtxt``
       (created if missing).
    3. prefix: directory holding ``labels.npy`` and the ``splits``
       sub-directory.
    4. r: integer split id used in the index file names and dataset name.
    5. gpu_mem: stored as ``gpu_memory`` of the output dataset.
    6. main_mem: stored as ``main_memory`` of the output dataset.

  If ``<prefix>/splits/train_indices_<r>.npy`` exists, the three index files
  are loaded; otherwise a random 10000/5000/10000 split of 25000 indices is
  generated and saved there.
  """
  data_pbtxt = sys.argv[1]
  output_dir = sys.argv[2]
  prefix = sys.argv[3]
  r = int(sys.argv[4])
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]
  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

  rep_dict, stats_files = MakeDict(data_pbtxt)

  splits_dir = os.path.join(prefix, 'splits')
  train_file = os.path.join(splits_dir, 'train_indices_%d.npy' % r)
  valid_file = os.path.join(splits_dir, 'valid_indices_%d.npy' % r)
  test_file = os.path.join(splits_dir, 'test_indices_%d.npy' % r)
  if os.path.exists(train_file):
    train = np.load(train_file)
    valid = np.load(valid_file)
    test = np.load(test_file)
  else:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Creating new split.')
    indices = np.arange(25000)
    np.random.shuffle(indices)
    train = indices[:10000]
    valid = indices[10000:15000]
    test = indices[15000:]
    # np.save does not create missing directories.
    if not os.path.isdir(splits_dir):
      os.makedirs(splits_dir)
    np.save(train_file, train)
    np.save(valid_file, valid)
    np.save(test_file, test)

  print('Splitting data')
  dataset_pb = deepnet_pb2.Dataset()
  dataset_pb.name = 'flickr_split_%d' % r
  dataset_pb.gpu_memory = gpu_mem
  dataset_pb.main_memory = main_mem
  for rep in rep_dict:
    data = rep_dict[rep]
    stats_file = stats_files[rep]
    DumpDataSplit(data[train], output_dir, 'train_%s' % rep, dataset_pb, stats_file)
    DumpDataSplit(data[valid], output_dir, 'valid_%s' % rep, dataset_pb, stats_file)
    DumpDataSplit(data[test], output_dir, 'test_%s' % rep, dataset_pb, stats_file)

  print('Splitting labels')
  labels = np.load(os.path.join(prefix, 'labels.npy')).astype('float32')
  DumpLabelSplit(labels[train], output_dir, 'train_labels', dataset_pb)
  DumpLabelSplit(labels[valid], output_dir, 'valid_labels', dataset_pb)
  DumpLabelSplit(labels[test], output_dir, 'test_labels', dataset_pb)

  with open(os.path.join(output_dir, 'data.pbtxt'), 'w') as f:
    text_format.PrintMessage(dataset_pb, f)

  print('Output written in directory %s' % output_dir)
Exemplo n.º 3
0
def withPbtxt(dbPbtxt, modality1, modality2, outputpath):
    """Concatenate two modalities feature-wise and write a combined dataset.

    For each of the splits ``train``, ``validation`` and ``test``, the files
    of the entries in ``dbPbtxt`` whose name contains both the split name
    and the modality name are located.  The sorted file lists of the two
    modalities are paired positionally, each pair is joined along axis 1,
    and all pairs are stacked along axis 0.  The result is saved as
    ``<outputpath>/<split>/data.npy`` and described in
    ``<outputpath>/input_data.pbtxt``.

    Args:
        dbPbtxt: path of the source dataset description, read with
            ``util.ReadData``.
        modality1: substring identifying entries of the first modality.
        modality2: substring identifying entries of the second modality.
        outputpath: output directory; also used as the new dataset prefix.

    Raises:
        ValueError: if a split has no paired input files.  Previously this
            surfaced as a NameError for the first split, or silently
            re-saved the previous split's data for later ones.
    """
    datapb = util.ReadData(dbPbtxt)
    datasets = ["train", "validation", "test"]
    datapbNew = deepnet_pb2.Dataset()
    namePrefix = modality1 + "_" + modality2 + "_"
    datapbNew.prefix = outputpath
    datapbNew.name = namePrefix + "combined_input"
    for dataset in datasets:
        fileNames1 = []
        fileNames2 = []
        for dataEntry in datapb.data:
            if dataset not in dataEntry.name:
                continue
            pattern = os.path.join(datapb.prefix, dataEntry.file_pattern)
            # If several entries match, the last one wins (as before).
            if modality1 in dataEntry.name:
                fileNames1 = sorted(glob.glob(pattern))
            if modality2 in dataEntry.name:
                fileNames2 = sorted(glob.glob(pattern))

        # zip() stops at the shorter list, so unpaired trailing files of the
        # longer modality are ignored.
        chunks = [
            np.concatenate((np.load(file1), np.load(file2)), axis=1)
            for file1, file2 in zip(fileNames1, fileNames2)
        ]
        if not chunks:
            raise ValueError(
                "No paired input files found for dataset %r "
                "(modalities %r and %r)" % (dataset, modality1, modality2))
        # One concatenation instead of re-copying the accumulated array for
        # every additional file.
        data = np.concatenate(chunks, axis=0)

        splitDir = os.path.join(outputpath, dataset)
        if not os.path.exists(splitDir):
            os.makedirs(splitDir)
        # np.save appends the ".npy" extension to "data".
        np.save(os.path.join(splitDir, "data"), data)
        dataItem = deepnet_pb2.Dataset.Data()
        dataItem.name = namePrefix + "combined_" + dataset
        dataItem.dimensions.extend([data.shape[1]])
        dataItem.size = data.shape[0]
        dataItem.file_pattern = os.path.join(dataset, "data.npy")
        datapbNew.data.extend([dataItem])
    with open(os.path.join(outputpath, "input_data.pbtxt"), 'w') as f:
        text_format.PrintMessage(datapbNew, f)
Exemplo n.º 4
0
    def ParsePerson(self, baseDir, ne=True, withMS=False):
        """Split each person's raw ``.npy`` matrices into per-feature files.

        For every directory name in ``self.subPersonDir`` the input matrices
        (2-D, 17593 columns, label in the last column) are split column-wise
        into one array per entry of ``self.featureGroups`` plus a label
        vector, saved next to the person's data, and a
        ``deepnet_pb2.Dataset`` text proto is written.

        Args:
            baseDir: root directory containing one sub-directory per person;
                also stored on ``self.baseDir``.
            ne: when true, inputs are ``<baseDir>/<person>/*.npy``; otherwise
                they are ``<baseDir>/<person>/<fType>/*.npy`` for every
                ``fType`` in ``self.subTypeDir``.  Outputs go to
                ``<baseDir>/<person>`` in both cases.
            withMS: when true, feature ``i`` is the single column range
                ``featureGroupsIndex[i]:featureGroupsIndex[i + 1]``;
                otherwise it is the concatenation of the column ranges
                listed in ``featureGroupsDict[feature]``.

        Raises:
            AssertionError: if an input matrix does not have 17593 columns.
        """
        self.baseDir = baseDir
        if withMS:
            suffix = "_withMS.npy"
        else:
            suffix = "_withoutMS.npy"

        for person in self.subPersonDir:
            personDir = os.path.join(self.baseDir, person)

            # Column ranges of each feature group, shared by the proto
            # dimensions and by the slicing below.
            featureRanges = []
            for i, feature in enumerate(self.featureGroups):
                if withMS:
                    ranges = [(self.featureGroupsIndex[i],
                               self.featureGroupsIndex[i + 1])]
                else:
                    ranges = self.featureGroupsDict[feature]
                featureRanges.append((feature, ranges))

            dataPb = deepnet_pb2.Dataset()
            for feature, ranges in featureRanges:
                data = deepnet_pb2.Dataset.Data()
                data.name = person + "_" + feature
                # NOTE(review): saved files end in "<feature>_withMS.npy" /
                # "<feature>_withoutMS.npy", which a glob of this pattern
                # would not match - confirm how file_pattern is consumed.
                data.file_pattern = "*" + feature + ".npy"
                # Exactly one dimension entry per feature in both modes.
                data.dimensions.extend([sum(r[1] - r[0] for r in ranges)])
                dataPb.data.extend([data])

            data = deepnet_pb2.Dataset.Data()
            data.name = person + "_label"
            data.dimensions.extend([1])
            data.file_pattern = "*label.npy"
            dataPb.data.extend([data])
            dataPb.prefix = personDir
            # NOTE(review): the proto path does not depend on `person`, so
            # each person overwrites the file written for the previous one.
            if withMS:
                dataPb.name = os.path.basename(baseDir) + "withMS"
                outputProtoFile = os.path.join(baseDir, 'data_withMS.pbtxt')
            else:
                dataPb.name = os.path.basename(baseDir) + "withoutMS"
                outputProtoFile = os.path.join(baseDir, 'data_withoutMS.pbtxt')

            if ne:
                patterns = [os.path.join(personDir, "*.npy")]
            else:
                patterns = [
                    os.path.join(personDir, fType, "*.npy")
                    for fType in self.subTypeDir
                ]

            instanceCount = 0
            for pattern in patterns:
                for fileEntry in glob.glob(pattern):
                    tempData = np.load(fileEntry)
                    assert (tempData.shape[1] == 17593)
                    instanceCount += tempData.shape[0]

                    if ne:
                        fileName = os.path.splitext(fileEntry)[0]
                    else:
                        baseName = os.path.splitext(
                            os.path.basename(fileEntry))[0]
                        fileName = os.path.join(personDir, baseName)

                    for feature, ranges in featureRanges:
                        parts = [tempData[:, r[0]:r[1]] for r in ranges]
                        if len(parts) == 1:
                            featureData = parts[0]
                        else:
                            featureData = np.concatenate(parts, axis=1)
                        np.save(fileName + '_' + feature + suffix,
                                featureData)
                    np.save(fileName + '_label.npy', tempData[:, 17592])

            # Every entry reports the total number of rows for this person.
            for entry in dataPb.data:
                entry.size = instanceCount
            with open(outputProtoFile, 'w') as f:
                text_format.PrintMessage(dataPb, f)
Exemplo n.º 5
0
def main():
    """Build the merged validation dataset description for a model.

    Command-line arguments (positional): model file, base output directory,
    representation directory, data prefix, GPU memory and main memory.

    The image input and the two image hidden-layer entries are copied with
    their file patterns made absolute.  The text input, the two text hidden
    layers and the joint hidden layer are each produced by ``Merge`` from an
    entry of their own dataset and the matching entry of the
    ``generated_text`` dataset, using the index arrays stored in
    ``<prefix>/text/indices_labelled.npz``.  The resulting dataset is
    written to ``<base_output_dir>/data.pbtxt``.
    """
    model_file = sys.argv[1]
    base_output_dir = sys.argv[2]
    rep_dir = sys.argv[3]
    prefix = sys.argv[4]
    gpu_mem = sys.argv[5]
    main_mem = sys.argv[6]

    def find_entry(dataset, wanted):
        # First data entry carrying the requested name.
        return next(item for item in dataset.data if item.name == wanted)

    def rep_pbtxt(subdir):
        return os.path.join(rep_dir, subdir, 'data.pbtxt')

    model = util.ReadModel(model_file)
    merged_pb = deepnet_pb2.Dataset()
    merged_pb.name = model.name
    merged_pb.gpu_memory = gpu_mem
    merged_pb.main_memory = main_mem

    validation_dir = os.path.join(base_output_dir, 'validation')
    if not os.path.isdir(validation_dir):
        os.makedirs(validation_dir)
    proto_path = os.path.join(base_output_dir, 'data.pbtxt')

    # Image pathway: the raw input also carries a stats file.
    image_pb = util.ReadData(os.path.join(prefix, 'flickr.pbtxt'))
    entry = find_entry(image_pb, 'image_labelled')
    entry.file_pattern = os.path.join(image_pb.prefix, entry.file_pattern)
    entry.stats_file = os.path.join(image_pb.prefix, entry.stats_file)
    entry.name = 'image_input'
    merged_pb.data.extend([entry])

    image_layers = (
        ('image_rbm1_LAST', 'image_hidden1'),
        ('image_rbm2_LAST', 'image_hidden2'),
    )
    for subdir, layer in image_layers:
        layer_pb = util.ReadData(rep_pbtxt(subdir))
        entry = find_entry(layer_pb, layer + '_validation')
        entry.file_pattern = os.path.join(layer_pb.prefix, entry.file_pattern)
        entry.name = layer
        merged_pb.data.extend([entry])

    # Text pathway and joint layer: merged with the generated-text dataset.
    index_arrays = np.load(
        os.path.join(prefix, 'text', 'indices_labelled.npz'))
    nnz_indices = index_arrays['nnz_indices']
    z_indices = index_arrays['z_indices']

    generated_pb = util.ReadData(rep_pbtxt('generated_text'))
    merge_specs = (
        # (dataset pbtxt, entry name there, entry name in generated, output)
        (os.path.join(prefix, 'flickr_nnz.pbtxt'), 'text_labelled',
         'text_input_layer_validation', 'text_input'),
        (rep_pbtxt('text_rbm1_LAST'), 'text_hidden1_validation',
         'text_hidden1_validation', 'text_hidden1'),
        (rep_pbtxt('text_rbm2_LAST'), 'text_hidden2_validation',
         'text_hidden2_validation', 'text_hidden2'),
        (rep_pbtxt('joint_rbm_LAST'), 'joint_hidden_validation',
         'joint_hidden_validation', 'joint_hidden'),
    )
    for pbtxt, nnz_name, z_name, out_name in merge_specs:
        source_pb = util.ReadData(pbtxt)
        data_nnz = find_entry(source_pb, nnz_name)
        data_z = find_entry(generated_pb, z_name)
        output_file = os.path.join(validation_dir,
                                   '%s-00001-of-00001.npy' % out_name)
        entry = Merge(data_nnz, data_z, nnz_indices, z_indices,
                      generated_pb.prefix, source_pb.prefix, out_name,
                      output_file)
        merged_pb.data.extend([entry])

    with open(proto_path, 'w') as f:
        text_format.PrintMessage(merged_pb, f)
Exemplo n.º 6
0
def main():
  """Collect RNAseq input and hidden-layer entries into one dataset proto.

  Command-line arguments (positional): model file, base output directory,
  representation directory, data prefix, GPU memory and main memory.

  For each source listed below, the named entry is looked up, its file
  pattern is prefixed with the source dataset's prefix, it is renamed, and
  it is appended to the output dataset, which is finally written to
  ``<base_output_dir>/data.pbtxt``.  The ``validation`` sub-directory of
  the base output directory is created if missing.
  """
  model_file = sys.argv[1]
  base_output_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  prefix = sys.argv[4]
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]

  model = util.ReadModel(model_file)
  combined_pb = deepnet_pb2.Dataset()
  combined_pb.name = model.name
  combined_pb.gpu_memory = gpu_mem
  combined_pb.main_memory = main_mem

  validation_dir = os.path.join(base_output_dir, 'validation')
  if not os.path.isdir(validation_dir):
    os.makedirs(validation_dir)
  proto_path = os.path.join(base_output_dir, 'data.pbtxt')

  input_pbtxt = os.path.join(prefix, 'RNAseq.pbtxt')

  def rep_pbtxt(subdir):
    return os.path.join(rep_dir, subdir, 'data.pbtxt')

  # (dataset description, entry name in it, name in the combined dataset),
  # processed in this order.
  sources = (
      (input_pbtxt, 'RNA1seq_train', 'RNA1seq_input'),
      (rep_pbtxt('RNA1seq_rbm1_LAST'), 'RNA1seq_hidden1_train',
       'RNA1seq_hidden1'),
      (input_pbtxt, 'RNA2seq_train', 'RNA2seq_input'),
      (rep_pbtxt('RNA2seq_rbm1_LAST'), 'RNA2seq_hidden1_train',
       'RNA2seq_hidden1'),
      (rep_pbtxt('joint_rbm_LAST'), 'joint_hidden_train', 'joint_hidden'),
      (rep_pbtxt('joint_rbm2_LAST'), 'joint_hidden2_train', 'joint_hidden2'),
  )
  for pbtxt, source_name, target_name in sources:
    source_pb = util.ReadData(pbtxt)
    entry = next(item for item in source_pb.data if item.name == source_name)
    entry.file_pattern = os.path.join(source_pb.prefix, entry.file_pattern)
    entry.name = target_name
    combined_pb.data.extend([entry])

  with open(proto_path, 'w') as f:
    text_format.PrintMessage(combined_pb, f)