Exemplo n.º 1
0
def rdkit_mmff94_xyz(smiles, **kwargs):
    """
    Returns the string of the XYZ file obtained performing the MMFF94 molecular mechanics optimization of the given
    SMILES using RDKit.
    Writing temporary files in $MM_WORKING_DIR if defined or otherwise in /tmp. The temporary files are removed
    before returning.
    :param smiles: input SMILES
    :param max_iterations: max number of iterations (default 500)
    :return : XYZ string of optimized geometry, success (whether the MM optimization was successful and the smiles has
    stayed identical after optimization). If any step raises, (None, False) is returned.
    """

    working_dir = os.environ.get("MM_WORKING_DIR", "/tmp")

    # The documented default is applied here; previously a missing key raised a KeyError that was silently
    # converted into a failure.
    max_iterations = kwargs.get("max_iterations", 500)

    # Converting the molecule to RDKit object (parsed once and reused for the canonical SMILES)
    mol = MolFromSmiles(smiles)
    smi_canon = MolToSmiles(mol)

    # Setting paths (the PID prefix keeps concurrent processes from sharing file names)
    filename_smiles = str(os.getpid()) + "_" + smi_to_filename(smi_canon)
    xyz_path = join(working_dir, filename_smiles + '.xyz')
    post_MM_smi_path = join(working_dir, filename_smiles + '.smi')

    # Computing geometry
    try:

        # Adding explicit hydrogens before the 3D embedding
        mol = AddHs(mol)

        # MM optimization
        EmbedMolecule(mol)
        value = MMFFOptimizeMolecule(mol, maxIters=max_iterations)

        # Success if returned value is null
        success_RDKIT_output = value == 0

        # Computing XYZ from optimized molecule
        xyz_str = MolToXYZBlock(mol)

        # Writing optimized XYZ to file
        with open(xyz_path, "w") as f:
            f.write(xyz_str)

        # Success if the optimization has converged and the post MM smiles is identical the pre MM smiles
        success = success_RDKIT_output and check_identical_geometries(
            xyz_path, smi_canon, post_MM_smi_path)

    except Exception:
        # Best-effort contract: any failure is reported through the success flag
        success = False
        xyz_str = None
    finally:
        # Removing files
        remove_files([post_MM_smi_path, xyz_path])

    return xyz_str, success
def get_mol_from_smiles(smiles: str) -> Mol:
    """
    Build a molecule with coordinates from the given SMILES.

    The molecule gets explicit hydrogens, is embedded (up to 5000 attempts) and UFF-optimized, then the
    hydrogens are removed. If any of these steps raises a ValueError, the SMILES is parsed again and 2D
    coordinates are computed instead.
    :param smiles: input SMILES
    :return: the molecule carrying the computed coordinates
    """
    parsed = MolFromSmiles(smiles)
    try:
        with_hydrogens = Chem.AddHs(parsed)
        AllChem.EmbedMolecule(with_hydrogens, maxAttempts=5000)
        AllChem.UFFOptimizeMolecule(with_hydrogens)
        return Chem.RemoveHs(with_hydrogens)
    except ValueError:
        # Fallback: start over from the SMILES and settle for 2D coordinates
        flat = MolFromSmiles(smiles)
        AllChem.Compute2DCoords(flat)
        return flat
Exemplo n.º 3
0
    def load_pop_from_smiles_list(self, smiles_list, atom_mutability=True):
        """
        Fill the population from the given SMILES list and score it.

        Each SMILES becomes a MolGraph stored at the matching index of self.pop; its aromatic SMILES is recorded
        in self.pop_tabu_list and self.actions_history. The population is then evaluated with the
        "evaluate_init_pop" parameters while the calls count is disabled, after which the
        "evaluate_new_solution" parameters are set.
        :param smiles_list: list of SMILES (shuffled in place when self.shuffle_init_pop is set)
        :param atom_mutability: value forwarded as the mutability argument of MolGraph
        :return:
        """

        if self.shuffle_init_pop:
            np.random.shuffle(smiles_list)

        for idx, smiles in enumerate(smiles_list):
            individual = MolGraph(MolFromSmiles(smiles), sanitize_mol=True, mutability=atom_mutability)
            self.pop[idx] = individual

            # Same aromatic SMILES goes to the tabu list and to the action history initialization
            self.pop_tabu_list[idx] = individual.to_aromatic_smiles()
            self.actions_history[idx] = individual.to_aromatic_smiles()

        # Scoring the initial population without recording the calls to the objective function
        print("Computing scores at initialization...")
        strategy = self.evaluation_strategy
        strategy_params = self.evaluation_strategy_parameters
        strategy.set_params(**strategy_params["evaluate_init_pop"])
        strategy.disable_calls_count()
        strategy.compute_record_scores_init_pop(self.pop)
        strategy.enable_calls_count()
        strategy.set_params(**strategy_params["evaluate_new_solution"])
Exemplo n.º 4
0
def compute_mol_legend(action_history_k,
                       smi,
                       action_history_scores,
                       legend_scores_keys_strat=None):
    """
    Build the legend string of a molecule and collect the corresponding scores.

    Each entry of legend_scores_keys_strat is either a string key, looked up in
    action_history_scores[action_history_k], or an EvaluationStrategy instance, which is evaluated on the
    molecule built from smi. Scores are formatted with two decimals: the first one stands alone and the
    following ones are listed between brackets, e.g. "1.00 [2.50, 3.00]".
    :param action_history_k: key of the molecule in action_history_scores
    :param smi: SMILES of the molecule
    :param action_history_scores: dictionary of recorded scores, indexed by action history key then score key
    :param legend_scores_keys_strat: iterable of score keys and/or evaluation strategies (None: empty legend)
    :return: (legend string, list of raw scores)
    """
    scores_float = []
    if legend_scores_keys_strat is None:
        return "", scores_float

    formatted_scores = []
    for key_strat in legend_scores_keys_strat:
        if isinstance(key_strat, str):
            score = action_history_scores[action_history_k][key_strat]
        elif isinstance(key_strat, EvaluationStrategy):
            score = key_strat.evaluate_individual(
                MolGraph(MolFromSmiles(smi), sanitize_mol=True))
        else:
            score = None

        scores_float.append(score)
        formatted_scores.append("{:.2f}".format(score))

    if not formatted_scores:
        return "", scores_float

    main_score, other_scores = formatted_scores[0], formatted_scores[1:]
    legend = main_score
    if other_scores:
        legend += " [" + ", ".join(other_scores) + "]"

    return legend, scores_float
Exemplo n.º 5
0
 def evaluate_individual(self, individual):
     """
     Score the given individual with qed().

     :param individual: object exposing to_aromatic_smiles(), or None
     :return: None when individual is None, otherwise (score, [score])
     """
     if individual is None:
         return None

     qed_score = qed(MolFromSmiles(individual.to_aromatic_smiles()))
     return qed_score, [qed_score]
Exemplo n.º 6
0
    def extract_shingles(self, individual):
        """
        Collect the set of shingles (fragment SMILES of atom environments) of the given individual.

        For every atom index, environments of radius 1 to self.radius are extracted until an empty one is
        found. Fragments are rooted at the centre atom when self.rooted is set.
        :param individual: object exposing to_aromatic_smiles() and mol_graph
        :return: set of fragment SMILES
        """
        max_radius = self.radius + 1

        # Reloading molecule to make it aromatic
        mol = MolFromSmiles(individual.to_aromatic_smiles())

        shingles = set()
        for atom_idx in range(individual.mol_graph.GetNumAtoms()):
            for radius in range(1, max_radius):
                env_bonds = AllChem.FindAtomEnvironmentOfRadiusN(mol, radius, atom_idx)

                # No environment at this radius: larger radii are not tried for this atom
                if not env_bonds:
                    break

                # Atoms of the environment are the end points of its bonds
                env_atoms = set()
                for bond in map(mol.GetBondWithIdx, env_bonds):
                    env_atoms.update((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

                # -1 requests an unrooted fragment SMILES
                root_atom = atom_idx if self.rooted else -1
                shingles.add(
                    Chem.rdmolfiles.MolFragmentToSmiles(
                        mol, list(env_atoms), env_bonds, 0, 0, False, False,
                        root_atom, True, False, False))

        return shingles
Exemplo n.º 7
0
def compute_mol_attributes(graph,
                           labels_dict,
                           actions_history_smi_pop,
                           actions_history_smi_removed,
                           actions_history_scores_pop,
                           actions_history_scores_removed,
                           legend_scores_keys_strat=None):
    """
    Attach to the nodes of the graph a cropped image of their molecule and a score legend.

    For each key of labels_dict, the SMILES and scores are taken from the population dictionaries when the key
    is present in actions_history_smi_pop, and from the removed dictionaries otherwise. Images are stored in
    the "image" node attribute and legends in the "score_label" node attribute.
    :param graph: networkx graph whose nodes are the action history keys
    :param labels_dict: dictionary whose keys are the action history keys to process
    :param actions_history_smi_pop: SMILES of the individuals in the population
    :param actions_history_smi_removed: SMILES of the removed individuals
    :param actions_history_scores_pop: scores of the individuals in the population
    :param actions_history_scores_removed: scores of the removed individuals
    :param legend_scores_keys_strat: forwarded to compute_mol_legend
    :return:
    """
    draw_opt = DrawingOptions()
    draw_opt.coordScale = 0.9
    draw_opt.dotsPerAngstrom = 30

    images_attributes = {}
    scores_attributes = {}

    for action_history_k in labels_dict.keys():

        # Selecting the pair of dictionaries describing this individual
        if action_history_k in actions_history_smi_pop:
            smi_source = actions_history_smi_pop
            scores_source = actions_history_scores_pop
        else:
            smi_source = actions_history_smi_removed
            scores_source = actions_history_scores_removed

        smi = smi_source[action_history_k]
        img = MolToImage(MolFromSmiles(smi), size=(800, 800), options=draw_opt)
        images_attributes[action_history_k] = crop_image_with_transparency(img)

        legend, _ = compute_mol_legend(action_history_k, smi, scores_source,
                                       legend_scores_keys_strat)
        scores_attributes[action_history_k] = legend

    nx.set_node_attributes(graph, images_attributes, "image")
    nx.set_node_attributes(graph, scores_attributes, "score_label")
Exemplo n.º 8
0
def test_react(reactant, expected_products):
    """
    Check that the Reactor turns the given reactant into the expected products.

    :param reactant: SMILES of the reactant
    :param expected_products: expected value of mols2smiles() applied to the products
    """
    reactor = Reactor()

    # The reactant is embedded with the ETKDG parameters before reacting
    mol = MolFromSmiles(reactant)
    AllChem.EmbedMolecule(mol, AllChem.ETKDG())

    product_smiles = mols2smiles(reactor.react(mol))
    assert product_smiles == expected_products
Exemplo n.º 9
0
 def _transform(self, x):
     """
     Return the Morgan fingerprint bit identifiers set for the compound described by x.

     The molecule is built from x['standard_inchi']; if that lookup or the parsing fails, or yields no
     molecule, x['Compound_SMILES'] is used instead.
     :param x: mapping with a 'standard_inchi' and/or a 'Compound_SMILES' entry
     :return: list of the keys of the bitInfo dictionary filled by the fingerprint computation
     """
     try:
         mol = MolFromInchi(x['standard_inchi'])
     except Exception:
         # Missing or unusable InChI entry: fall back to the SMILES below. Catching Exception (not a bare
         # except) lets KeyboardInterrupt / SystemExit propagate.
         mol = None

     # A parsing failure may be reported as None rather than as an exception
     if mol is None:
         mol = MolFromSmiles(x['Compound_SMILES'])

     info = {}
     AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, self.dim, bitInfo=info)
     return list(info.keys())
Exemplo n.º 10
0
def get_score_components(smiles):
    """
    Get the non-normalized score components of a molecule given as SMILES.

    :param smiles: a VALID smiles string
    :return: the value returned by get_score_components_from_mol for the parsed molecule
    """
    return get_score_components_from_mol(MolFromSmiles(smiles))
Exemplo n.º 11
0
    def _set_target_fps(self, pickaxe: Pickaxe):
        """
        Append to self.target_fps the fingerprint of every target SMILES of the given pickaxe.

        Morgan bit-vector fingerprints (built with self.fingerprint_args) are used when
        self.fingerprint_method is "Morgan"; RDKFingerprint is used otherwise.
        :param pickaxe: object exposing the target_smiles iterable
        """
        for target_smiles in pickaxe.target_smiles:
            target_mol = MolFromSmiles(target_smiles)
            use_morgan = self.fingerprint_method == "Morgan"
            target_fp = (
                AllChem.GetMorganFingerprintAsBitVect(target_mol, **self.fingerprint_args)
                if use_morgan
                else RDKFingerprint(target_mol)
            )
            self.target_fps.append(target_fp)
Exemplo n.º 12
0
    def evaluate_individual(self, individual):
        """
        Compute logP minus the SA score minus a penalty for rings larger than 6 atoms.

        :param individual: object exposing to_aromatic_smiles()
        :return: (score, [score])
        """
        mol = MolFromSmiles(individual.to_aromatic_smiles())

        log_p = Descriptors.MolLogP(mol)
        sa_score = sascorer.calculateScore(mol)

        # Only the ring atoms beyond a size of 6 are penalized
        ring_penalty = max(self.get_largest_ring_size(mol) - 6, 0)

        penalized = log_p - sa_score - ring_penalty
        return penalized, [penalized]
Exemplo n.º 13
0
def draw_mol_labels(labels_dict, actions_history_smi_pop, actions_history_smi_removed,
                    actions_history_scores_pop, actions_history_scores_removed, legend_scores_keys_strat=None,
                    problem_type="max", mols_per_row=4, draw_n_mols=None):
    """
    Draw a grid image of the labelled molecules, sorted by their first score.

    Only the entries of labels_dict with a non-empty label are drawn. SMILES and scores are taken from the
    population dictionaries when the key is in actions_history_smi_pop and from the removed dictionaries
    otherwise. Molecules are sorted on their first score (descending if problem_type is "max", ascending
    otherwise) and THEN truncated to draw_n_mols, so that the best ones are kept. When a molecule has no
    score (e.g. legend_scores_keys_strat is None), the insertion order is kept.
    :param labels_dict: dictionary mapping action history keys to labels ("" means not drawn)
    :param actions_history_smi_pop: SMILES of the individuals in the population
    :param actions_history_smi_removed: SMILES of the removed individuals
    :param actions_history_scores_pop: scores of the individuals in the population
    :param actions_history_scores_removed: scores of the removed individuals
    :param legend_scores_keys_strat: forwarded to compute_mol_legend
    :param problem_type: "max" to draw the highest scores first
    :param mols_per_row: number of molecules per row of the grid
    :param draw_n_mols: maximum number of molecules drawn (None: all)
    :return: the image returned by MolsToGridImage
    """
    smi_to_draw = {}
    legends_to_draw = {}
    scores_float = {}

    for action_history_k, label in labels_dict.items():

        if label == "":
            continue

        # Selecting the dictionaries describing this individual
        if action_history_k in actions_history_smi_pop:
            smi = actions_history_smi_pop[action_history_k]
            scores_source = actions_history_scores_pop
        else:
            smi = actions_history_smi_removed[action_history_k]
            scores_source = actions_history_scores_removed

        legend, scores = compute_mol_legend(action_history_k, smi, scores_source,
                                            legend_scores_keys_strat)
        smi_to_draw[label] = smi
        legends_to_draw[label] = legend
        scores_float[label] = scores

    labels = list(smi_to_draw)
    mols = [MolFromSmiles(smi_to_draw[label]) for label in labels]
    legends = [legends_to_draw[label] for label in labels]

    # Sorting molecules (only possible if every molecule has at least one score)
    if labels and all(len(scores_float[label]) > 0 for label in labels):
        sorted_order = np.argsort([scores_float[label][0] for label in labels])
        if problem_type == "max":
            sorted_order = sorted_order[::-1]
        mols = [mols[i] for i in sorted_order]
        legends = [legends[i] for i in sorted_order]

    # Filtering molecules if necessary. This is done after sorting: truncating first made the full-length
    # sort order index out of bounds and kept arbitrary molecules.
    if draw_n_mols is not None:
        mols = mols[:draw_n_mols]
        legends = legends[:draw_n_mols]

    img = MolsToGridImage(mols, legends=legends, molsPerRow=mols_per_row, subImgSize=(200, 200))
    return img
Exemplo n.º 14
0
def get_all_metrics(smiles):
    """
    Compute a table of metrics for the given SMILES.

    The columns are, in order: sum of the scores, sum of the normalized scores, score of index 1,
    normalized score of index 1, number of aromatic rings.
    :param smiles: iterable of SMILES
    :return: (smiles, metrics array with one row per molecule)
    """
    mols = [MolFromSmiles(s) for s in smiles]
    scorer = NormalizedScorer()
    scores, norm_scores = scorer.get_scores_from_mols(mols)
    arom_rings = np.array([Descriptors.NumAromaticRings(m) for m in mols])

    columns = (
        scores.sum(axis=1),
        norm_scores.sum(axis=1),
        scores[:, 1],
        norm_scores[:, 1],
        arom_rings,
    )
    # Each metric becomes one column of the result
    metrics = np.concatenate([column[:, None] for column in columns], axis=1)
    return (smiles, metrics)
Exemplo n.º 15
0
    def extract_descriptors(self, individual):
        """
        Returning the descriptor(s) extracted from the given individual, depending on self.descriptor_key

        Supported keys are "gen_scaffolds", "ifg", "atoms", "shg_1" and "checkmol"; any other key yields None.
        :param individual: individual to describe
        :return: list of descriptors, or None for an unsupported key
        """
        key = self.descriptor_key

        if key == "gen_scaffolds":
            generic_scaffold = MurckoScaffold.MakeScaffoldGeneric(
                MolFromSmiles(individual.to_smiles()))
            return [MolToSmiles(generic_scaffold)]

        if key == "ifg":
            curr_ifgs = ifg.identify_functional_groups(
                MolFromSmiles(individual.to_smiles()))
            # Element of index 2 of each identified group is used as descriptor
            return list(set(curr_ifg[2] for curr_ifg in curr_ifgs))

        if key == "atoms":
            return list(set(individual.get_atom_types()))

        if key == "shg_1":
            return list(extract_shingles(individual, 1))

        if key == "checkmol":
            return list(set(extract_checkmol(individual)))

        return None
Exemplo n.º 16
0
def obabel_mmff94_xyz(smiles, **kwargs):
    """
    Returns the string of the XYZ file obtained performing the MMFF94 molecular mechanics optimization of the given
    SMILES using obabel.
    Writing temporary files in $MM_WORKING_DIR if defined or otherwise in /tmp. The temporary files are removed
    before returning. The obabel binary is looked up under $OPT_LIBS.
    :param smiles : input SMILES
    :return : XYZ string of optimized geometry, success (whether the MM optimization was successful and the smiles has
    stayed identical after optimization). If any step raises, (None, False) is returned.
    """
    import subprocess

    working_dir = os.environ.get("MM_WORKING_DIR", "/tmp")

    # Computing RDKIT canonical SMILES
    smi_canon = MolToSmiles(MolFromSmiles(smiles))
    filename_smiles = str(os.getpid()) + "_" + smi_to_filename(smi_canon)

    # Computing files paths
    smi_path = join(working_dir, filename_smiles + ".smi")
    xyz_path = join(working_dir, filename_smiles + ".xyz")
    post_MM_smi_path = join(working_dir, filename_smiles + ".post_MM.smi")

    try:

        # Writing smiles to file
        with open(smi_path, "w") as f:
            f.write(smi_canon)

        # Converting SMILES to XYZ after computing MM (Obabel MMFF94).
        # The command is passed as an argument list (no shell), so that paths containing spaces or shell
        # metacharacters cannot break or alter the command.
        obabel_bin = join(os.getenv("OPT_LIBS"), "obabel/openbabel-2.4.1/bin/obabel")
        subprocess.run(
            [obabel_bin, "-ismi", smi_path, "-oxyz", "-O", xyz_path, "--gen3d"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )

        # Reading XYZ string (raises if obabel did not produce the file, which is reported as a failure)
        with open(xyz_path, "r") as f:
            xyz_str = f.read()

        # Success if the post MM smiles is identical the pre MM smiles
        success = check_identical_geometries(xyz_path, smi_canon,
                                             post_MM_smi_path)

    except Exception:
        # Best-effort contract: any failure is reported through the success flag
        success = False
        xyz_str = None
    finally:
        # Removing files
        remove_files([smi_path, xyz_path, post_MM_smi_path])

    return xyz_str, success
Exemplo n.º 17
0
def load_obabel_smi(smi_path):
    """
    Converting the SMILES stored on the first line of the given file into a RDKit SMILES, after removing the
    stereochemistry information and the hydrogens
    :param smi_path: path of the SMILES file (presumably written by OpenBabel - verify against caller)
    :return: SMILES obtained by writing, re-parsing and writing again the cleaned molecule
    """

    # Only the first line of the file is used
    with open(smi_path, "r") as smi_file:
        first_line = smi_file.readline()

    cleaned_mol = MolFromSmiles(first_line)
    RemoveStereochemistry(cleaned_mol)
    cleaned_mol = RemoveHs(cleaned_mol)

    # Round trip through SMILES before the final conversion
    return MolToSmiles(MolFromSmiles(MolToSmiles(cleaned_mol)))
Exemplo n.º 18
0
def extract_shingles(smiles, level, as_list=False):
    """
    Extracting the rooted shingles up to the given level from the given smiles
    see https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8
    :param smiles: input SMILES
    :param level: maximum radius of the extracted atom environments
    :param as_list: whether to return a list (duplicates kept) instead of a set
    :return: list or set of fragment SMILES
    """
    qry_shingles = list() if as_list else set()
    # Same collection call whatever the container type
    collect = qry_shingles.append if as_list else qry_shingles.add

    max_radius = level + 1

    # Reloading molecule to make it aromatic
    mol = MolFromSmiles(smiles)

    for atom_idx in range(mol.GetNumAtoms()):
        for radius in range(1, max_radius):
            env_bonds = AllChem.FindAtomEnvironmentOfRadiusN(mol, radius, atom_idx)

            # No environment at this radius: larger radii are not tried for this atom
            if not env_bonds:
                break

            # Atoms of the environment are the end points of its bonds
            env_atoms = set()
            for bond in map(mol.GetBondWithIdx, env_bonds):
                env_atoms.update((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

            # Fragment SMILES rooted at the centre atom
            collect(
                Chem.rdmolfiles.MolFragmentToSmiles(
                    mol, list(env_atoms), env_bonds, 0, 0, False, False,
                    atom_idx, True, False, False))

    return qry_shingles
Exemplo n.º 19
0
def load_obabel_smi(smi_path, sanitize_mol):
    """
    Converting the SMILES stored on the first line of the given file into the aromatic SMILES of the
    corresponding MolGraph, after removing the stereochemistry information and the hydrogens.
    Both the read SMILES and the converted SMILES are printed.
    :param smi_path: path of the SMILES file
    :param sanitize_mol: forwarded to MolGraph
    :return: aromatic SMILES
    """
    # Only the first line of the file is used
    with open(smi_path, "r") as smi_file:
        first_line = smi_file.readline()

    print("obabel new smi : " + first_line)

    cleaned_mol = MolFromSmiles(first_line)
    RemoveStereochemistry(cleaned_mol)
    cleaned_mol = RemoveHs(cleaned_mol)

    mol_graph = MolGraph(cleaned_mol, sanitize_mol=sanitize_mol)
    smi_rdkit = mol_graph.to_aromatic_smiles()
    print("rdkit new smi : " + smi_rdkit)

    return smi_rdkit
Exemplo n.º 20
0
    def load_pop_from_smiles_list(self, smiles_list, atom_mutability=True):
        """
        Fill the population from the given SMILES list and score it.

        Each SMILES becomes a MolGraph stored at the matching index of self.pop; its aromatic SMILES is recorded
        in self.pop_tabu_list and self.actions_history. The scores of the population are then computed and
        recorded by the evaluation strategy.
        :param smiles_list: list of SMILES
        :param atom_mutability: value forwarded as the mutability argument of MolGraph
        :return:
        """

        for idx, smiles in enumerate(smiles_list):
            individual = MolGraph(MolFromSmiles(smiles), sanitize_mol=True, mutability=atom_mutability)
            self.pop[idx] = individual

            # Same aromatic SMILES goes to the tabu list and to the action history initialization
            self.pop_tabu_list[idx] = individual.to_aromatic_smiles()
            self.actions_history[idx] = individual.to_aromatic_smiles()

        # Evaluation of the population
        print("Computing descriptors at initialization...")
        self.evaluation_strategy.compute_record_scores(self.pop)
Exemplo n.º 21
0
    def evaluate_individual(self, individual):
        """
        Sum of the normalized logP, normalized (negated) SA score and normalized (negated) large-cycle penalty.
        from https://github.com/bowenliu16/rl_graph_generation/blob/master/gym-molecule/gym_molecule/envs/molecule.py
        :param individual: object exposing to_aromatic_smiles()
        :return: (score, [score])
        """
        # normalization constants, statistics from 250k_rndm_zinc_drugs_clean.smi
        logP_mean, logP_std = 2.4570953396190123, 1.434324401111988
        SA_mean, SA_std = -3.0525811293166134, 0.8335207024513095
        cycle_mean, cycle_std = -0.0485696876403053, 0.2860212110245455

        mol = MolFromSmiles(individual.to_aromatic_smiles())

        log_p = Descriptors.MolLogP(mol)
        SA = -sascorer.calculateScore(mol)

        # cycle score: number of atoms beyond 6 in the largest cycle of the cycle basis, negated
        cycles = nx.cycle_basis(
            nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
        largest_cycle = max((len(cycle) for cycle in cycles), default=0)
        cycle_score = -max(largest_cycle - 6, 0)

        normalized_log_p = (log_p - logP_mean) / logP_std
        normalized_SA = (SA - SA_mean) / SA_std
        normalized_cycle = (cycle_score - cycle_mean) / cycle_std

        total = normalized_log_p + normalized_SA + normalized_cycle
        return total, [total]
Exemplo n.º 22
0
 def __init__(self, smiles):
     """
     Store in self._mol the molecule parsed from the given SMILES.

     :param smiles: input SMILES
     """
     parsed_mol = MolFromSmiles(smiles)
     self._mol = parsed_mol
Exemplo n.º 23
0
 def get_scores(self, smiles):
     """
     Parse the given SMILES and score the resulting molecules.

     :param smiles: iterable of SMILES
     :return: the value returned by self.get_scores_from_mols for the parsed molecules
     """
     return self.get_scores_from_mols([MolFromSmiles(smi) for smi in smiles])
Exemplo n.º 24
0
    def initialize(self):
        """
        Initialization of EvoMol with starting values.
        This method MUST BE CALLED BEFORE running the algorithm.

        Resets the population, the tabu list, the records of generated individuals, the step traces, the
        mutation counters and the action history, then records the start timestamp.
        :return:
        """

        # Initialization of population
        self.pop = list(np.full((self.pop_max_size,), None))

        # Initialization of the list containing the smiles of the current individuals
        self.pop_tabu_list = list(np.full((self.pop_max_size,), None))

        # Initialization of the list of all individuals ever inserted in the population, the list of their
        # corresponding number of calls to the objective function at insertion and the list of the corresponding steps.
        # Also recording the values of the objective function
        self.all_generated_individuals_smiles = []
        self.all_generated_individuals_n_obj_calls = []
        self.all_generated_individuals_step = []
        self.all_generated_individuals_obj_value = []
        self.all_generated_individuals_improver = []
        self.all_generated_individuals_success_obj_computation = []

        # Ensuring the SMILES of the external tabu list are canonical
        if self.external_tabu_list is not None:
            self.external_tabu_list = [MolGraph(MolFromSmiles(smi)).to_aromatic_smiles() for smi in self.external_tabu_list]

        # Initialization of the dictionary containing the traces of steps of the algorithm
        self.step_traces = {
            'scores': {},
            'n_replaced': [],
            'additional_values': {},
            'timestamps': []
        }

        # Initialization of keys in the self.step_traces dict declared by the evaluation strategy instance
        for k in self.evaluation_strategy.keys() + ["total"]:
            for stat in ["mean", "med", "min", "max", "std"]:
                self.step_traces["scores"][k + "_" + stat] = []

        # Initialization of keys in the self.step_traces dict for additional population scores
        for k in self.evaluation_strategy.get_additional_population_scores().keys():
            print(k)
            self.step_traces['additional_values'][k] = []

        # Initialization of the step counter.
        self.curr_step_id = 0

        # Initialization of errors list
        self.errors = []
        self.curr_total_scores = None
        self.curr_scores = None
        self.timestamp_start = None

        # Computing idx of kth score to be recorded vector
        for i, k in enumerate(self.evaluation_strategy.keys()):
            if k == self.kth_score_to_record_key:
                self.kth_score_to_record_idx = i

        self.kth_score_history = deque(maxlen=500)

        # The builtin int replaces np.int, a plain alias of int that was removed from NumPy 1.24
        self.n_success_mut = np.zeros(self.pop_max_size, dtype=int)
        self.n_fail_mut = np.zeros(self.pop_max_size, dtype=int)

        self.actions_history = list(np.full(self.pop_max_size, None))
        self.removed_actions_score_smi_tuple = {}

        # Computing start timestamp
        self.timestamp_start = time.time()
Exemplo n.º 25
0
def mol_from_smiles(smiles):
    """
    Parse a SMILES string or a collection of SMILES strings.

    :param smiles: a single SMILES string, or an iterable of SMILES strings
    :return: the parsed molecule for a string input, otherwise the list of parsed molecules
    """
    # The previous check compared the type object to the literal 'str', which is never equal: a single SMILES
    # was therefore iterated character by character.
    if isinstance(smiles, str):
        return MolFromSmiles(smiles)
    # assume we have a list-like
    return [MolFromSmiles(s) for s in smiles]
Exemplo n.º 26
0
def edit_smiles(request):
    """
    Render the chemical editor pre-filled with the molecule given by the 'SMILES' GET parameter.

    :param request: HTTP request; the SMILES is read from request.GET (empty string if absent)
    :return: rendered 'cspace/chemical-editor.html' with the mol block in the 'molblock' context entry
    """
    smiles = request.GET.get('SMILES', '')
    mol = MolFromSmiles(smiles)

    # The SMILES comes from the user: an unparsable value yields no molecule, in which case the editor is
    # rendered with an empty mol block instead of failing in MolToMolBlock.
    molblock = MolToMolBlock(mol) if mol is not None else ''

    return render(request, 'cspace/chemical-editor.html',
                  {'molblock': molblock})
                  if pre_parser(t) is not None]))
        print(len(these_smiles))
    these_actions = my_model.strings_to_actions(these_smiles)
    action_seq_length = my_model.action_seq_length(these_actions)
    onehot = my_model.actions_to_one_hot(these_actions)
    append_data = {
        'smiles': np.array(these_smiles, dtype=dt),
        'indices': np.array(these_indices),
        'actions': these_actions,
        'valid': np.ones((len(these_smiles))),
        'seq_len': action_seq_length,
        'data': onehot
    }
    if molecules:
        from rdkit.Chem.rdmolfiles import MolFromSmiles
        mols = [MolFromSmiles(s) for s in these_smiles]
        raw_scores = np.array([get_score_components_from_mol(m) for m in mols])
        append_data['raw_scores'] = raw_scores
        num_atoms = np.array([len(m.GetAtoms()) for m in mols])
        append_data['num_atoms'] = num_atoms

    ds.append(append_data)

if molecules:
    # also calculate mean and std of the scores, to use in the ultimate objective
    raw_scores = np.array(ds.h5f['raw_scores'])
    score_std = raw_scores.std(0)
    score_mean = raw_scores.mean(0)
    ds.append_to_dataset('score_std', score_std)
    ds.append_to_dataset('score_mean', score_mean)
Exemplo n.º 28
0
 def get_mol(self):
     """
     Parse self.smiles.

     :return: the molecule returned by MolFromSmiles for self.smiles
     """
     smiles = self.smiles
     return MolFromSmiles(smiles)
Exemplo n.º 29
0
def main():
    """
    Build the HDF5 dataset of encoded strings from the source data file named in the settings.

    The source strings are read line by line, processed in chunks of 100 (at most the first 10000 strings),
    converted to actions / one-hot encodings by the model and appended to the destination dataset. For
    molecules, the raw score components and atom counts are stored too, followed by the mean and standard
    deviation of the raw scores.
    """
    # change this to False to produce the equation dataset
    molecules = True
    # change this to False to get character-based encodings instead of grammar-based
    grammar = 'new'  # use True  for the grammar used by Kusner et al

    if molecules:
        # Imported once here rather than at every chunk of the loop below
        from rdkit.Chem.rdmolfiles import MolFromSmiles

    # can't define model class inside settings as it itself uses settings a lot
    _, my_model = get_vae(molecules, grammar)

    def pre_parser(x):
        # Returns the first parse of the token sequence, or None if it cannot be parsed
        try:
            return next(my_model._parser.parse(x))
        except Exception:
            return None

    settings = get_settings(molecules, grammar)
    MAX_LEN = settings['max_seq_length']
    dest_file = settings['data_path']
    source_file = settings['source_data']

    # Read in the strings (context manager: the file is closed even if reading fails)
    with open(source_file, 'r') as f:
        L = [line.strip() for line in f]

    # convert to one-hot and save, in small increments to save RAM
    ds = IncrementingHDF5Dataset(dest_file)

    step = 100
    dt = h5py.special_dtype(vlen=str)  # PY3 hdf5 datatype for variable-length Unicode strings
    size = min(10000, len(L))
    for i in tqdm(range(0, size, step)):
        chunk_end = min(i + step, len(L))
        these_indices = list(range(i, chunk_end))
        these_smiles = L[i:chunk_end]
        if grammar == 'new':  # have to weed out non-parseable strings
            tokens = [my_model._tokenize(s.replace('-c', 'c')) for s in these_smiles]
            parseable = [(s, ind) for s, t, ind in zip(these_smiles, tokens, these_indices)
                         if pre_parser(t) is not None]
            if not parseable:
                # Nothing to store for this chunk; unpacking zip(*[]) would raise a ValueError
                continue
            these_smiles, these_indices = zip(*parseable)
        these_actions = torch.tensor(my_model.strings_to_actions(these_smiles))
        action_seq_length = my_model.action_seq_length(these_actions)
        onehot = my_model.actions_to_one_hot(these_actions)
        append_data = {'smiles': np.array(these_smiles, dtype=dt),
                       'indices': np.array(these_indices),
                       'actions': these_actions,
                       'valid': np.ones((len(these_smiles))),
                       'seq_len': action_seq_length,
                       'data': onehot}
        if molecules:
            mols = [MolFromSmiles(s) for s in these_smiles]
            raw_scores = np.array([get_score_components_from_mol(m) for m in mols])
            append_data['raw_scores'] = raw_scores
            num_atoms = np.array([len(m.GetAtoms()) for m in mols])
            append_data['num_atoms'] = num_atoms

        ds.append(append_data)

    if molecules:
        # also calculate mean and std of the scores, to use in the ultimate objective
        raw_scores = np.array(ds.h5f['raw_scores'])
        score_std = raw_scores.std(0)
        score_mean = raw_scores.mean(0)
        ds.append_to_dataset('score_std', score_std)
        ds.append_to_dataset('score_mean', score_mean)

    print('success!')
Exemplo n.º 30
0
 def _is_radical(self, mol):
     """
     Tell whether the given molecule carries radical electrons.

     :param mol: object exposing to_aromatic_smiles()
     :return: True if NumRadicalElectrons is non-zero for the molecule rebuilt from its aromatic SMILES
     """
     radical_electrons = NumRadicalElectrons(MolFromSmiles(mol.to_aromatic_smiles()))
     return radical_electrons != 0