def main(): """Main function""" args = parse_args() smiles_list = uc.read_smi_file(args.input_smiles_path) LOG.info("Building vocabulary") tokenizer = mv.SMILESTokenizer() vocabulary = mv.create_vocabulary(smiles_list, tokenizer=tokenizer) tokens = vocabulary.tokens() LOG.info("Vocabulary contains %d tokens: %s", len(tokens), tokens) network_params = { 'num_layers': args.num_layers, 'layer_size': args.layer_size, 'cell_type': args.cell_type, 'embedding_layer_size': args.embedding_layer_size, 'dropout': args.dropout } model = mm.Model(no_cuda=True, vocabulary=vocabulary, tokenizer=tokenizer, network_params=network_params, max_sequence_length=args.max_sequence_length) LOG.info("Saving model at %s", args.output_model_path) model.save(args.output_model_path)
def load_sets(set_path):
    file_paths = [set_path]
    if os.path.isdir(set_path):
        file_paths = sorted(glob.glob("{}/*.smi".format(set_path)))

    # cycle over the paths (not preloaded sets) so only one set is held in
    # memory at a time; each file is re-read from disk on every pass
    for path in it.cycle(file_paths):
        yield list(uc.read_smi_file(path))
def load_sets(set_path):
    file_paths = [set_path]
    if os.path.isdir(set_path):
        file_paths = sorted(glob.glob("{}/*.smi".format(set_path)))

    for path in file_paths:
        yield list(uc.read_smi_file(path))
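# Usage sketch (assumed call site; "training_sets/" is a hypothetical
# directory of *.smi files). The it.cycle variant above yields one set per
# next() call indefinitely, which suits multi-epoch training; the single-pass
# variant stops after the last file, which suits one-off evaluation.
set_iterator = load_sets("training_sets/")
for _ in range(3):  # e.g. three epochs
    current_set = next(set_iterator)  # each file is re-read lazily from disk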
def main(): """Main function.""" args = parse_args() model = mm.DecoratorModel.load_from_file(args.model_path, mode="eval") input_scaffolds = list(uc.read_smi_file(args.input_scaffold_path)) if args.output_smiles_path: if args.use_gzip: args.output_smiles_path += ".gz" output_file = uc.open_file(args.output_smiles_path, "w+") write_func = functools.partial(output_file.write) else: output_file = tqdm.tqdm write_func = functools.partial(output_file.write, end="") sample_model = ma.SampleModel(model, args.batch_size) for scaff, dec, nll in ul.progress_bar(sample_model.run(input_scaffolds), total=len(input_scaffolds)): output_row = [scaff, dec] if args.with_nll: output_row.append("{:.8f}".format(nll)) write_func("\t".join(output_row) + "\n") if args.output_smiles_path: output_file.close()
def main(): """Main function.""" args = parse_args() ut.set_default_device("cuda") model = mm.Model.load_from_file(args.model_path, mode="sampling") input_csv = uc.open_file(args.input_csv_path, mode="rt") if args.use_gzip: args.output_csv_path += ".gz" output_csv = uc.open_file(args.output_csv_path, mode="wt+") calc_nlls_action = ma.CalculateNLLsFromModel(model, batch_size=args.batch_size, logger=LOG) smiles_list = list(uc.read_smi_file(args.input_csv_path)) for nll in ul.progress_bar(calc_nlls_action.run(smiles_list), total=len(smiles_list)): input_line = input_csv.readline().strip() output_csv.write("{}\t{:.8f}\n".format(input_line, nll)) input_csv.close() output_csv.close()
def _initialize_dataloader(self, path):
    training_set = uc.read_smi_file(path)
    dataset = md.Dataset(smiles_list=training_set,
                         vocabulary=self._model.vocabulary,
                         tokenizer=mv.SMILESTokenizer())
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=self._batch_size,
                                             shuffle=self._shuffle_each_epoch,
                                             collate_fn=md.Dataset.collate_fn)
    return dataloader
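# Minimal consumption sketch (assumed, not part of the source): collate_fn is
# expected to pad each batch of encoded SMILES into a single tensor, so the
# returned dataloader can be iterated directly; likelihood() below is a
# hypothetical stand-in for whatever loss the trainer actually computes.
#
# for batch in self._initialize_dataloader(path):
#     loss = self._model.likelihood(batch).mean()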
def main(): """Main function.""" args = parse_args() model = mm.Model.load_from_file(args.model_path, mode="sampling") training_set = list(uc.read_smi_file(args.training_set_path)) validation_set = list(uc.read_smi_file(args.validation_set_path)) writer = tbx.SummaryWriter(log_dir=args.log_path) ma.CollectStatsFromModel(model, args.epoch, training_set, validation_set, writer, sample_size=args.sample_size, with_weights=args.with_weights, to_mol_func=uc.get_mol_func(args.smiles_type), logger=LOG).run() writer.close()
def run(self): """ Calculates likelihoods of a set of molecules. """ ut.set_default_device("cuda") model = mm.Model.load_from_file(self._model_path, sampling_mode=True) nll_iterator, size = md.calculate_nlls_from_model( model, uc.read_smi_file(self._input_csv_path), batch_size=self._batch_size) with open(self._input_csv_path, "r") as input_csv: with open(self._output_csv_path, "w+") as output_csv: for nlls in ul.progress_bar(nll_iterator, size): for nll in nlls: line = input_csv.readline().strip() output_csv.write("{},{:.12f}\n".format(line, nll))
def main(): """Main function.""" args = parse_args() model = mm.DecoratorModel.load_from_file(args.model_path, mode="eval") input_scaffolds = list(uc.read_smi_file(args.input_scaffold_path)) sample_scaffolds = SampleScaffolds( model, num_randomized_smiles=args.num_randomized_smiles, num_decorations_per_scaffold=args.num_decorations_per_scaffold, decorator_type=args.decorator_type, batch_size=args.batch_size, num_partitions=args.num_partitions, logger=LOG) results_df = sample_scaffolds.run(input_scaffolds) results_df.write.parquet(args.output_parquet_folder)
def __init__(self, input_smiles_path, output_model_path='storage',
             num_layers=1, layer_size=512, embedding_layer_size=128, dropout=0.,
             max_sequence_length=256, memory_cells=32, cell_size=20, read_heads=8,
             model_type='dnc', controller_type='lstm', num_controller_layers=3):
    """
    Creates a CreateModelRunner.
    :param input_smiles_path: Path to the input SMILES file.
    :param output_model_path: Path where the newly created model is saved.
    :param num_layers: Number of RNN layers.
    :param layer_size: Size of each RNN layer.
    :param embedding_layer_size: Size of the embedding layer.
    :param dropout: Dropout probability.
    :param max_sequence_length: Maximum length of sampled sequences.
    :param memory_cells: Number of memory cells (DNC models).
    :param cell_size: Size of each memory cell (DNC models).
    :param read_heads: Number of read heads (DNC models).
    :param model_type: Type of model to create (default 'dnc').
    :param controller_type: Type of DNC controller network (e.g. 'lstm').
    :param num_controller_layers: Number of controller layers.
    """
    self._smiles_list = uc.read_smi_file(input_smiles_path)
    self._output_model_path = output_model_path
    self._num_layers = num_layers
    self._layer_size = layer_size
    self._embedding_layer_size = embedding_layer_size
    self._dropout = dropout
    self._max_sequence_length = max_sequence_length
    self._memory_cells = memory_cells
    self._cell_size = cell_size
    self._read_heads = read_heads
    self._model_type = model_type
    self._controller_type = controller_type
    self._num_controller_layers = num_controller_layers
    self._already_run = False
def calc_nlls(path):
    return np.concatenate(list(
        md.calculate_nlls_from_model(
            self._model, uc.read_smi_file(path, num=self._sample_size))[0]))
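# Note: md.calculate_nlls_from_model returns an (iterator, size) pair, as in
# run() above, so the [0] selects the iterator of per-batch NLL arrays, which
# np.concatenate then flattens into a single array.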