예제 #1
0
  def _to_singletask(dataset, task_dirs):
    """Split a multitask dataset into one singletask dataset per task.

    Every shard of `dataset` is visited once. For each task, the rows whose
    weight in that task's column is nonzero are written to that task's
    directory under the basename "dataset-<shard number>". A shard that
    holds no such rows for a task produces no file for that task.

    Args:
      dataset: Source dataset. Must provide `get_task_names()`,
        `itershards()` yielding `(X, y, w, ids)` tuples, and a `verbosity`
        attribute. `y` and `w` are indexed as `[:, task_num]`, i.e. one
        column per task.
      task_dirs: Output directories, indexed in the same order as the task
        names. Must have exactly one entry per task.

    Returns:
      A list of `DiskDataset` objects, one per task, in task order.

    Raises:
      AssertionError: If the number of tasks and directories differ.
    """
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    verbosity = dataset.verbosity
    log("Splitting multitask dataset into singletask datasets", verbosity)

    # Metadata rows returned by each write, grouped by task name.
    rows_by_task = {name: [] for name in tasks}
    for shard_idx, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_idx, verbosity)
      shard_basename = "dataset-%d" % shard_idx
      for column, name in enumerate(tasks):
        log("\tTask %s" % name, verbosity)
        task_weights = w[:, column]
        task_labels = y[:, column]

        # A datapoint is present for this task only if its weight is nonzero;
        # the same mask selects features, labels, weights and ids.
        present = task_weights != 0
        X_present = X[present]
        n_present = X_present.shape[0]
        y_present = np.reshape(task_labels[present], (n_present, 1))
        w_present = np.reshape(task_weights[present], (n_present, 1))
        ids_present = ids[present]

        # Nothing to write for this task in this shard.
        if X_present.size == 0:
          continue

        written_row = DiskDataset.write_data_to_disk(
            task_dirs[column], shard_basename, [name],
            X_present, y_present, w_present, ids_present)
        rows_by_task[name].append(written_row)

    singletask_datasets = []
    for column, name in enumerate(tasks):
      singletask_datasets.append(
          DiskDataset(data_dir=task_dirs[column],
                      metadata_rows=rows_by_task[name],
                      verbosity=verbosity))
    return singletask_datasets
예제 #2
0
  def _to_singletask(dataset, task_dirs):
    """Write each task of a multitask dataset out as its own dataset.

    For every shard yielded by `dataset.itershards()` and every task, the
    datapoints with a nonzero weight in that task's column are saved to the
    task's directory as "dataset-<shard number>". Labels and weights are
    stored as single-column arrays. Shards with no datapoints for a task
    are not written for that task.

    Args:
      dataset: Multitask dataset exposing `get_task_names()`,
        `itershards()` (yielding `(X, y, w, ids)`) and `verbosity`.
      task_dirs: One output directory per task, in task order.

    Returns:
      List of `DiskDataset` instances, one per task, ordered like the task
      names.

    Raises:
      AssertionError: If `task_dirs` does not have one entry per task.
    """
    task_names = dataset.get_task_names()
    assert len(task_names) == len(task_dirs)
    log_level = dataset.verbosity
    log("Splitting multitask dataset into singletask datasets", log_level)

    # task name -> metadata rows collected from every shard written for it
    metadata_by_task = {task_name: [] for task_name in task_names}

    for shard_no, shard in enumerate(dataset.itershards()):
      X, y, w, ids = shard
      log("Processing shard %d" % shard_no, log_level)
      basename = "dataset-%d" % shard_no

      for task_idx, task_name in enumerate(task_names):
        log("\tTask %s" % task_name, log_level)
        w_col = w[:, task_idx]
        y_col = y[:, task_idx]

        # Zero weight marks a datapoint as absent for this task.
        keep = w_col != 0
        X_keep = X[keep]
        column_shape = (X_keep.shape[0], 1)
        y_keep = np.reshape(y_col[keep], column_shape)
        w_keep = np.reshape(w_col[keep], column_shape)
        ids_keep = ids[keep]

        # Only persist shards that actually contain data for the task.
        if X_keep.size > 0:
          row = DiskDataset.write_data_to_disk(
              task_dirs[task_idx], basename, [task_name],
              X_keep, y_keep, w_keep, ids_keep)
          metadata_by_task[task_name].append(row)

    return [
        DiskDataset(data_dir=task_dirs[task_idx],
                    metadata_rows=metadata_by_task[task_name],
                    verbosity=log_level)
        for task_idx, task_name in enumerate(task_names)]
예제 #3
0
 def target_4_dataset_save(self,dataset,file):
     compound=dataset.ids.tolist()
     target=dataset.get_task_names()
     print(target)
     w=dataset.w
     print('w.shape')
     print(w.shape)
     compuond_4_target=[]
     target_4=['P21728','P14416','P08908','P28223']
     
     target_4=sorted(target_4,key=lambda x:target.index(x))
     target_4_index=[target.index(i) for i in target_4]
     print('target_4')
     print(target_4_index)
     for i in range(len(compound)):
         z=0
         for j in target_4_index:
         
             if w[i,j]>0:
                 z=z+1
         if z>0:
             compuond_4_target.append(i)
             
         
     compound_shard=[]
     
         
     dataset1=dataset.select(compuond_4_target)
     print(compuond_4_target)
     cpd=compuond_4_target
     metadata_rows=[]
     shard_generator=self.shard_generator(cpd,target_4_index,dataset1)
     for shard_num, (X, y, w, ids) in enumerate(shard_generator):
       basename = "shard-%d" % shard_num
       metadata_rows.append(
           DiskDataset.write_data_to_disk(file, basename,target_4 , X, y, w,
                                          ids))
       metadata_df = DiskDataset._construct_metadata(metadata_rows)
       self.save_metadata(target_4, metadata_df, file)
       time2 = time.time()