def tensorize_cond(mol_batch, vocab):
    """Tensorize a batch of (source, target, condition) triples.

    Args:
        mol_batch: iterable of 3-tuples ``(x, y, cond)`` where ``cond`` is a
            string of comma-separated integers (e.g. ``"1,0,1"``).
        vocab: vocabulary object forwarded to ``MolGraph.tensorize``.

    Returns:
        The result of ``to_numpy(x)`` without its last element, concatenated
        with ``to_numpy(y)`` and a 1-tuple holding the condition array.
    """
    x, y, cond = zip(*mol_batch)
    # Build real lists of ints. On Python 3 ``map`` returns a lazy iterator,
    # and ``numpy.array`` over a list of iterators produces an object array
    # of ``map`` objects rather than an integer array.
    cond = numpy.array([[int(v) for v in c.split(',')] for c in cond])
    x = MolGraph.tensorize(x, vocab, common_atom_vocab)
    y = MolGraph.tensorize(y, vocab, common_atom_vocab)
    # The trailing entry of x is dropped: no need of order for x.
    return to_numpy(x)[:-1] + to_numpy(y) + (cond,)
def tensorize(mol_batch, vocab):
    """Tensorize ``mol_batch`` with ``MolGraph`` and convert it via ``to_numpy``.

    Args:
        mol_batch: batch of molecules forwarded to ``MolGraph.tensorize``.
        vocab: vocabulary object forwarded to ``MolGraph.tensorize``.

    Returns:
        Whatever ``to_numpy`` returns for the tensorized batch.
    """
    return to_numpy(MolGraph.tensorize(mol_batch, vocab, common_atom_vocab))
if __name__ == "__main__":
    # Only let RDKit messages at CRITICAL level through.
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    # Command-line interface.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', required=True)
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--batch_size', type=int, default=20)
    parser.add_argument('--ncpu', type=int, default=4)
    args = parser.parse_args()

    # Each vocab line is split on whitespace; rows are later unpacked as
    # three columns, the last of which is evaluated as a filter flag.
    with open(args.vocab) as f:
        vocab = [row.strip("\r\n ").split() for row in f]

    # NOTE(review): ``eval`` runs text read from the vocab file; this is only
    # safe if that file is trusted. Consider a stricter parser if it is not.
    fragments = [x[0] for x in vocab if eval(x[-1])]
    MolGraph.load_fragments(fragments)

    # Replace the vocab path on ``args`` with the constructed vocabulary.
    vocab_pairs = [(first, second) for first, second, _ in vocab]
    args.vocab = PairVocab(vocab_pairs, cuda=False)

    pool = Pool(args.ncpu)
    # Fixed seed so the shuffle below is reproducible.
    random.seed(1)

    # Keep only the first whitespace-separated token of every training line.
    with open(args.train) as f:
        data = [row.strip("\r\n ").split()[0] for row in f]

    random.shuffle(data)

    # Slice the shuffled data into consecutive chunks of ``batch_size``.
    batches = [
        data[start:start + args.batch_size]
        for start in range(0, len(data), args.batch_size)
    ]
    func = partial(tensorize, vocab=args.vocab)
def tensorize(mol_batch, vocab, include_smiles=False):
    """Tensorize ``mol_batch`` with ``MolGraph`` and convert it via ``to_numpy``.

    Args:
        mol_batch: batch of molecules forwarded to ``MolGraph.tensorize``.
        vocab: vocabulary object forwarded to ``MolGraph.tensorize``.
        include_smiles: flag passed through unchanged to both
            ``MolGraph.tensorize`` and ``to_numpy``.

    Returns:
        Whatever ``to_numpy`` returns for the tensorized batch.
    """
    return to_numpy(
        MolGraph.tensorize(mol_batch, vocab, common_atom_vocab, include_smiles),
        include_smiles,
    )
if __name__ == "__main__":
    # Only let RDKit messages at CRITICAL level through.
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    # Command-line interface.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', required=True)
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--batch_size', type=int, default=20)
    parser.add_argument('--ncpu', type=int, default=4)
    args = parser.parse_args()

    # Each vocab line is split on whitespace into a list of tokens.
    with open(args.vocab) as f:
        vocab = [row.strip("\r\n ").split() for row in f]

    # Replace the vocab path on ``args`` with the constructed vocabulary,
    # then register the first token of every row as a fragment.
    args.vocab = PairVocab(vocab, cuda=False)
    fragments = [row[0] for row in vocab]
    MolGraph.load_fragments(fragments)

    pool = Pool(args.ncpu)
    # Fixed seed so the shuffle below is reproducible.
    random.seed(1)

    # Keep only the first whitespace-separated token of every training line.
    with open(args.train) as f:
        data = [row.strip("\r\n ").split()[0] for row in f]

    random.shuffle(data)

    # Slice the shuffled data into consecutive chunks of ``batch_size``.
    batches = [
        data[start:start + args.batch_size]
        for start in range(0, len(data), args.batch_size)
    ]

    # Tensorize every batch in the worker pool.
    func = partial(tensorize, vocab=args.vocab)
    all_data = pool.map(func, batches)
def tensorize_pair(mol_batch, vocab):
    """Tensorize a batch of (source, target) molecule pairs.

    Args:
        mol_batch: iterable of 2-tuples ``(x, y)``.
        vocab: vocabulary object forwarded to ``MolGraph.tensorize``.

    Returns:
        The result of ``to_numpy`` for the sources without its last element,
        concatenated with the full ``to_numpy`` result for the targets.
    """
    sources, targets = zip(*mol_batch)
    source_tensors = MolGraph.tensorize(sources, vocab, common_atom_vocab)
    target_tensors = MolGraph.tensorize(targets, vocab, common_atom_vocab)
    # The trailing entry of the source side is dropped: no need of order for x.
    source_arrays = to_numpy(source_tensors)[:-1]
    return source_arrays + to_numpy(target_tensors)