def _to_singletask(dataset, task_dirs):
    """Split a multitask dataset into one single-task dataset per task.

    Every shard of ``dataset`` is visited; for each task the rows whose
    weight for that task is non-zero are extracted and written to that
    task's directory as a shard named ``dataset-<shard_num>``.

    Args:
        dataset: Multitask dataset exposing ``get_task_names()``,
            ``itershards()`` (yielding ``(X, y, w, ids)`` tuples) and a
            ``verbosity`` attribute.
        task_dirs: One output directory per task, in the same order as
            ``dataset.get_task_names()``.

    Returns:
        A list of ``DiskDataset`` objects, one per task, in task order.
    """
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    verbosity = dataset.verbosity
    log("Splitting multitask dataset into singletask datasets", verbosity)

    # Metadata rows returned by the shard writer, grouped by task name.
    task_metadata_rows = {}
    for task in tasks:
        task_metadata_rows[task] = []

    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
        log("Processing shard %d" % shard_num, dataset.verbosity)
        basename = "dataset-%d" % shard_num
        for task_num, task in enumerate(tasks):
            log("\tTask %s" % task, dataset.verbosity)
            task_weights = w[:, task_num]
            task_labels = y[:, task_num]

            # Keep only the datapoints that are present (non-zero weight)
            # for this task.
            present = task_weights != 0
            X_present = X[present]
            n_present = X_present.shape[0]
            column_shape = (n_present, 1)
            y_present = np.reshape(task_labels[present], column_shape)
            w_present = np.reshape(task_weights[present], column_shape)
            ids_present = ids[present]

            # Nothing to write for this task in this shard.
            if X_present.size == 0:
                continue

            row = DiskDataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_present, y_present, w_present, ids_present)
            task_metadata_rows[task].append(row)

    task_datasets = []
    for task_num, task in enumerate(tasks):
        task_datasets.append(
            DiskDataset(data_dir=task_dirs[task_num],
                        metadata_rows=task_metadata_rows[task],
                        verbosity=dataset.verbosity))
    return task_datasets
def target_4_dataset_save(self, dataset, file):
    """Save the part of ``dataset`` that covers four fixed targets.

    Selects every sample that has a positive weight for at least one of the
    hard-coded targets (P21728, P14416, P08908, P28223), then writes the
    shards produced by ``self.shard_generator`` to ``file`` and stores the
    resulting metadata through ``self.save_metadata``.

    Args:
        dataset: Source dataset. This method reads ``dataset.w`` (indexed as
            a 2-D samples-by-tasks array), calls ``dataset.get_task_names()``
            (which must return a list) and ``dataset.select(indices)``.
        file: Destination handed unchanged to
            ``DiskDataset.write_data_to_disk`` and ``self.save_metadata``
            (presumably a directory path -- confirm against those helpers).

    Raises:
        ValueError: If one of the four targets is not among the dataset's
            task names (raised by ``list.index``).
    """
    wanted_targets = ('P21728', 'P14416', 'P08908', 'P28223')

    task_names = dataset.get_task_names()
    print(task_names)
    weights = dataset.w
    print('w.shape')
    print(weights.shape)

    # Look each target up once, then order both the column indices and the
    # target names by their position in the dataset's task list.
    target_4_index = sorted(task_names.index(name) for name in wanted_targets)
    target_4 = [task_names[idx] for idx in target_4_index]
    print('target_4')
    print(target_4_index)

    # Rows with a positive weight for at least one of the selected targets.
    # Vectorised replacement for a per-row / per-target Python loop;
    # ``tolist()`` yields plain Python ints in ascending order.
    has_target = np.any(weights[:, target_4_index] > 0, axis=1)
    selected_rows = np.flatnonzero(has_target).tolist()

    subset = dataset.select(selected_rows)
    print(selected_rows)

    shards = self.shard_generator(selected_rows, target_4_index, subset)
    metadata_rows = [
        DiskDataset.write_data_to_disk(
            file, "shard-%d" % shard_num, target_4, X, y, shard_w, ids)
        for shard_num, (X, y, shard_w, ids) in enumerate(shards)
    ]
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    self.save_metadata(target_4, metadata_df, file)