Example #1
 def _save_npz_data(self, data_list, data_path, max_num_per_file=10000):
     if not exists(data_path):
         os.makedirs(data_path)
     n = len(data_list)
     for i in range(int((n - 1) / max_num_per_file) + 1):
         file = 'part-%05d.npz' % i
     sub_data_list = data_list[i * max_num_per_file: (i + 1) * max_num_per_file]
         save_data_list_to_npz(join(data_path, file), sub_data_list)
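The loop bound is a ceil-division: int((n - 1) / max_num_per_file) + 1 equals ceil(n / max_num_per_file), so every part file holds max_num_per_file items except a smaller final remainder. A quick sanity check (the numbers are illustrative):

# Illustrative only: 25000 items with max_num_per_file = 10000
# produce 3 files: two with 10000 items and one with 5000.
n, max_num_per_file = 25000, 10000
assert int((n - 1) / max_num_per_file) + 1 == 3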
Example #2
 def test_data_list_to_npz(self):
     data_list = [{"a": np.array([1, 23, 4])}, {"a": np.array([2, 34, 5])}]
     npz_file = 'tmp.npz'
     save_data_list_to_npz(data_list, npz_file)
     reload_data_list = load_npz_to_data_list(npz_file)
     self.assertEqual(len(data_list), len(reload_data_list))
     for d1, d2 in zip(data_list, reload_data_list):
         self.assertEqual(len(d1), len(d2))
         for key in d1:
             self.assertTrue((d1[key] == d2[key]).all())
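For reference, here is a minimal sketch of what a save/load pair like the one exercised above could look like, assuming each dict maps string keys to numpy arrays and following this example's (data_list, npz_file) argument order (note that Examples #1 and #3 pass the path first). The flattening scheme, with a '.seq_len' companion array per key, and the '_sketch' names are assumptions for illustration, not the library's confirmed implementation.

import numpy as np

def save_data_list_to_npz_sketch(data_list, npz_file):
    # Concatenate the arrays of each key and record per-item lengths
    # so the list structure can be recovered on load.
    merged = {}
    for key in data_list[0]:
        merged[key] = np.concatenate([d[key] for d in data_list], axis=0)
        merged[key + '.seq_len'] = np.array([len(d[key]) for d in data_list])
    np.savez_compressed(npz_file, **merged)

def load_npz_to_data_list_sketch(npz_file):
    # Split each concatenated array back into per-item dicts.
    merged = np.load(npz_file)
    keys = [k for k in merged.files if not k.endswith('.seq_len')]
    n = len(merged[keys[0] + '.seq_len'])
    data_list = [{} for _ in range(n)]
    for key in keys:
        offsets = np.cumsum(merged[key + '.seq_len'])[:-1]
        for i, chunk in enumerate(np.split(merged[key], offsets)):
            data_list[i][key] = chunk
    return data_list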
Example #3
    def _save_npz_data(self, data_list, data_path, max_num_per_file=10000):
        if not exists(data_path):
            os.makedirs(data_path)

        sub_data_list = []
        count = 0
        # Streaming variant: ignores 'data_list' and reads from
        # self.data_generator, flushing every max_num_per_file items.
        for data in self.data_generator:
            sub_data_list.append(data)
            if len(sub_data_list) == max_num_per_file:
                file = 'part-%05d.npz' % count
                save_data_list_to_npz(join(data_path, file), sub_data_list)
                sub_data_list = []
                count += 1
        if len(sub_data_list) > 0:
            file = 'part-%05d.npz' % count
            save_data_list_to_npz(join(data_path, file), sub_data_list)
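The trailing 'if len(sub_data_list) > 0' block is what saves the final partial chunk; without it, up to max_num_per_file - 1 items at the end of the stream would be dropped. A small simulation of the chunking logic (the numbers are made up):

def chunk_sizes(n_items, max_num_per_file):
    # Mirrors the streaming logic above, returning the size of each
    # part file instead of writing it.
    sizes, current = [], 0
    for _ in range(n_items):
        current += 1
        if current == max_num_per_file:
            sizes.append(current)
            current = 0
    if current > 0:
        sizes.append(current)  # the trailing flush
    return sizes

assert chunk_sizes(7, 3) == [3, 3, 1]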
Example #4
def main(args):
    """Entry for data preprocessing."""

    data_dir = args.data_dir
    processed_dir = os.path.join(data_dir, 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    # Drug features are written to, then read back from, the same folder.
    Drug_feature_file = '%s/GDSC/drug_graph_feat' % data_dir
    raw_drug_feature(Drug_smiles_file, Drug_feature_file)

    metadata = metadata_generate(Drug_info_file,
                                 Cell_line_info_file,
                                 Genomic_mutation_file,
                                 Drug_feature_file,
                                 Gene_expression_file,
                                 Methylation_file)
    drug_feature, mutation_feature, gexpr_feature, methylation_feature, data_idx = metadata['metadata']
    drug_feature = gen_drug_feature(drug_feature, israndom=args.israndom)
    train_idx, test_idx = data_split(data_idx, args.split_ratio)
    print('==============================')
    print("train_set : test_set == %.2f" % (len(train_idx) / len(test_idx)))

    for split in ['train', 'test']:
        index = train_idx if split == 'train' else test_idx
        mutation_data, gexpr_data, methylation_data, target, cancer_type_list = gen_omics_feature(index,
                                                                                                  mutation_feature,
                                                                                                  gexpr_feature,
                                                                                                  methylation_feature)
        drug_list = gen_drug_graph(drug_feature, index)
        data_lst = [{'drug_list': drug_list, 'mutation_data': mutation_data, 'gexpr_data': gexpr_data,
                     'methylation_data': methylation_data, 'target': target, 'cancer_type_list': cancer_type_list}]

        npz = os.path.join(processed_dir, '{}_{}.npz'.format(split, args.split_ratio))
        save_data_list_to_npz(data_lst, npz)

    print('==============================')
    print('{} training samples and {} testing samples have been generated and saved in {}'.format(
        len(train_idx), len(test_idx), processed_dir))
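A hypothetical follow-up, not part of the original script: reloading a processed split with the counterpart loader seen in Example #2. The 0.8 split ratio is illustrative.

# Hypothetical: reload a processed split; the 0.8 ratio is illustrative.
train_npz = os.path.join(processed_dir, 'train_0.8.npz')
train_data = load_npz_to_data_list(train_npz)
print('reloaded %d record(s) from %s' % (len(train_data), train_npz))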
Example #5
def preprocess_dataset(name):
    """
    Preprocess raw datasets.

    Args:
        name (str): name of the dataset.
    """
    data_dir = os.path.join('data', name, 'raw')
    if not os.path.exists(data_dir):
        print('Ignore %s dataset. Cannot find the corresponding folder: %s.' % (name, data_dir))
        return

    can, txt = Datasets[name]
    smiles_path = os.path.join(data_dir, can)
    labels_path = os.path.join(data_dir, txt)
    smiles_list = pd.read_csv(smiles_path, sep=' ', header=None)[0]
    labels = pd.read_csv(labels_path, header=None)[0].replace(-1, 0).values

    data_list, data_smiles_list = [], []
    for i in range(len(smiles_list)):
        s = smiles_list[i]
        mol = AllChem.MolFromSmiles(s)
        if mol is not None:
            data = mol_to_graph_data(mol)
            data['label'] = labels[i].reshape([-1])
            data_list.append(data)
            data_smiles_list.append(smiles_list[i])

    processed_dir = os.path.join('data', name, 'processed')
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    # Keep smiles.txt aligned with the graphs saved in data.npz.
    with open(os.path.join(processed_dir, 'smiles.txt'), 'w') as f:
        for smiles in data_smiles_list:
            f.write('%s\n' % smiles)

    save_data_list_to_npz(
        data_list, os.path.join(processed_dir, 'data.npz'))
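Note the .replace(-1, 0) call on the labels: it remaps a {-1, 1} labeling convention to {0, 1} before each label is attached to its graph. A quick illustration with made-up values:

import pandas as pd

labels = pd.Series([1, -1, -1, 1]).replace(-1, 0).values
print(labels)  # [1 0 0 1]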
Example #6
def main():
    """Entry for data preprocessing."""
    tokenizer = ProteinTokenizer()
    for dataset in ['davis', 'kiba']:
        data_dir = os.path.join(args.dataset_root, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        train_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'train_fold_setting1.txt')))
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        test_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'test_fold_setting1.txt')))
        ligands = json.load(open(os.path.join(data_dir, 'ligands_can.txt')),
                            object_pairs_hook=OrderedDict)
        proteins = json.load(open(os.path.join(data_dir, 'proteins.txt')),
                             object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load py2 pkl from py3
        # pylint: disable=E1123
        affinity = pickle.load(open(os.path.join(data_dir, 'Y'), 'rb'),
                               encoding='latin1')

        smiles_lst, protein_lst = [], []
        for k in ligands.keys():
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[k]),
                                      isomericSmiles=True)
            smiles_lst.append(smiles)

        for k in proteins.keys():
            protein_lst.append(proteins[k])

        if dataset == 'davis':
            # Kd data: convert Kd in nM to pKd, i.e. -log10(Kd / 1e9)
            affinity = [-np.log10(y / 1e9) for y in affinity]

        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)
        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))

            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            rows, cols = np.where(~np.isnan(affinity))
            rows, cols = rows[fold], cols[fold]

            data_lst = []
            for idx in range(len(rows)):
                mol = AllChem.MolFromSmiles(smiles_lst[rows[idx]])
                mol_graph = mol_to_graph_data(mol)
                data = {k: v for k, v in mol_graph.items()}

                seqs = []
                for seq in protein_lst[cols[idx]].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                af = affinity[rows[idx], cols[idx]]
                if dataset == 'davis':
                    data['Log10_Kd'] = np.array([af])
                elif dataset == 'kiba':
                    data['KIBA'] = np.array([af])

                data_lst.append(data)

            random.shuffle(data_lst)
            npz = os.path.join(split_dir, '{}_{}.npz'.format(dataset, split))
            save_data_list_to_npz(data_lst, npz)

        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))
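A note on the davis branch above: the raw affinities there are Kd values in nM, so -np.log10(y / 1e9) rescales them to molar units and takes the negative log, yielding pKd. For example:

import numpy as np

kd_nM = 100.0                 # illustrative Kd of 100 nM
pkd = -np.log10(kd_nM / 1e9)  # nM -> M, then -log10 gives pKd
print(pkd)                    # 7.0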