Exemplo n.º 1
0
 def test_standardize_chemical_3(self):
     """standardize_chemical must raise on this SMILES once parsed unsanitized."""
     # The molecule is built with sanitize=False and handed directly to
     # standardize_chemical; the test only requires that some exception
     # is raised by that call.
     unsanitized_mol = Chem.MolFromSmiles(
         '[H]OC(=O)C([H])([H])C([H])([H])C([H])(N=C(O[H])C([H])([H])C([H])([H])C([H])(N=C(O[H])C([H])(OP(=O)(O[H])OC([H])([H])C([H])(O[H])C([H])(O[H])C([H])(O[H])C([H])([H])n1c2nc(=O)nc(O[H])c-2c([H])c2c([H])c([H])c(OP(=O)(OC([H])([H])C(C([H])([H])[H])(C([H])([H])[H])C([H])(O[H])C(=NC([H])([H])C([H])([H])C(=NC([H])([H])C([H])([H])SC(=O)C([H])([H])C([H])([H])C([H])([H])C([H])(C(=C([H])[H])C([H])([H])[H])C([H])([H])C(=O)O[H])O[H])O[H])OP(=O)(O[H])OC([H])([H])C3([H])OC([H])(n4[c]([H])n([H])[c]5[c](N([H])[H])[n][c]([H])[n][c]54)C([H])(O[H])C3([H])OP(=O)(O[H])O[H])c([H])c21)C([H])([H])[H])C(=O)O[H])C(=O)O[H]',
         sanitize=False)
     with pytest.raises(Exception):
         standardize_chemical(unsanitized_mol)
Exemplo n.º 2
0
 def test_standardize_chemical_2(self):
     """Check standardize_chemical on violacein for each add_hs / rm_stereo pair."""
     violacein = Chem.MolFromSmiles(
         'OC1=NC(=C\\C1=C1/C(O)=NC2=CC=CC=C12)C1=CNC2=C1C=C(O)C=C2',
         sanitize=False)
     # Each entry: (add_hs, rm_stereo, SMILES expected from Chem.MolToSmiles).
     # Order: simplest case, adding Hs, removing stereo, both together.
     expectations = (
         (False, False,
          'OC1=NC(c2c[nH]c3ccc(O)cc23)=C/C1=C1\\C(O)=Nc2ccccc21'),
         (True, False,
          '[H]OC1=NC(c2c([H])n([H])c3c([H])c([H])c(O[H])c([H])c23)=C([H])/C1=C1\\C(O[H])=Nc2c([H])c([H])c([H])c([H])c21'),
         (False, True,
          'O=C1NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(=O)Nc2ccccc21'),
         (True, True,
          '[H]Oc1c([H])c([H])c2c(c1[H])c(C1=C([H])C(=C3C(=O)N([H])c4c([H])c([H])c([H])c([H])c43)C(=O)N1[H])c([H])n2[H]'),
     )
     for with_hs, without_stereo, expected_smiles in expectations:
         standardized = standardize_chemical(violacein,
                                             add_hs=with_hs,
                                             rm_stereo=without_stereo)
         assert Chem.MolToSmiles(standardized) == expected_smiles
Exemplo n.º 3
0
 def test_standardize_chemical_1(self):
     """Check standardize_chemical output with add_hs disabled and enabled."""
     explicit_smiles = '[H][O][C](=[O])[C]([H])([O][H])[C]([H])([H])[H]'
     source_mol = Chem.MolFromSmiles(explicit_smiles)
     # add_hs=False: the default SMILES output is the compact form.
     without_hs = standardize_chemical(source_mol, add_hs=False)
     assert Chem.MolToSmiles(without_hs) == 'CC(O)C(=O)O'
     # add_hs=True: written with allHsExplicit=True, the output must equal
     # the fully explicit input string.
     with_hs = standardize_chemical(source_mol, add_hs=True)
     explicit_output = Chem.MolToSmiles(with_hs, allHsExplicit=True)
     assert explicit_output == explicit_smiles
Exemplo n.º 4
0
 # For every row of the rule file, register the rule in
 # applicable_rules_10_dict_sim (keyed by Rule_ID) and attach the Morgan
 # fingerprints of its substrate and of each of its products.
 # all_ecfps caches fingerprints by compound ID so that a compound seen in
 # several rows is only parsed, standardized and fingerprinted once.
 # NOTE(review): csv_reader, all_ecfps and applicable_rules_10_dict_sim are
 # created outside this excerpt -- confirm their setup against the caller.
 next(csv_reader)  # skip first line
 for element in csv_reader:
     # Base entry: biological score and EC number are fixed placeholders here.
     applicable_rules_10_dict_sim[element["Rule_ID"]] = {
         "Rule_SMARTS": element["Rule_SMARTS"],
         "biological_score": 1,
         "EC_number": ["EC: None"],
         "Rule_SMILES": element["Rule_SMILES"]
     }
     # Obtaining substrate ECFP (reuse the cached one when available):
     if element["Substrate_ID"] in all_ecfps.keys():
         ECFP = all_ecfps[element["Substrate_ID"]]
     else:
         # Parsing is done without RDKit sanitization; the molecule is then
         # passed through standardize_chemical before fingerprinting.
         rd_mol = Chem.MolFromSmiles(
             element["Substrate_SMILES"],
             sanitize=False)  # Important: Sanitize = False
         rd_mol = standardize_chemical(rd_mol, add_hs=True, rm_stereo=True)
         # 1024-bit Morgan fingerprint of radius 2, ignoring chirality
         # (consistent with rm_stereo=True above).
         ECFP = AllChem.GetMorganFingerprintAsBitVect(rd_mol,
                                                      radius=2,
                                                      nBits=1024,
                                                      useFeatures=False,
                                                      useChirality=False)
         all_ecfps[element["Substrate_ID"]] = ECFP
     # The substrate fingerprint is stored wrapped in a one-element list.
     applicable_rules_10_dict_sim[element["Rule_ID"]]["substrate_ECFP"] = [
         ECFP
     ]
     # Product IDs and product SMILES are "."-separated and are indexed in
     # parallel below.
     # NOTE(review): assumes both fields split into the same number of items.
     product_names = element["Product_IDs"].split(".")
     product_smiles = element["Product_SMILES"].split(".")
     applicable_rules_10_dict_sim[element["Rule_ID"]]["products_ECFP"] = []
     for i in range(len(product_smiles)):
         # Reuse the cached product fingerprint when this ID was seen before.
         if product_names[i] in all_ecfps.keys():
             ECFP = all_ecfps[product_names[i]]
Exemplo n.º 5
0
 def test_rdmol_import(self):
     """A Compound built from an RDKit object must match one built from the same InChI."""
     ethylene_glycol_inchi = "InChI=1S/C2H6O2/c3-1-2-4/h3-4H,1-2H2"
     from_inchi = Compound(InChI=ethylene_glycol_inchi)
     # Second route: parse the InChI unsanitized, standardize it, then wrap
     # the resulting RDKit object.
     rd_mol = standardize_chemical(
         Chem.inchi.MolFromInchi(ethylene_glycol_inchi, sanitize=False))
     from_rdkit = Compound(rdkit_obj=rd_mol)
     assert from_inchi.eq_full_inchi_key(from_rdkit)
Exemplo n.º 6
0
    def compute(self):
        """Fire every reaction rule on every compound and store the outcomes.

        For each (rule SMARTS, InChI) pair taken from ``self._rsmarts_list``
        and ``self._inchi_list``:

        1. when ``self._try_match`` is true, run ``worker_match`` under
           ``self._match_timeout``;
        2. run ``worker_fire`` under ``self._fire_timeout``, then pass the
           answer through ``standardize_results`` and ``handle_results``;
        3. pass everything to ``self._jsonify`` and append the returned value
           to ``self._json``.

        Errors and timeouts during matching or firing are not propagated:
        they are stored as ``match_error`` / ``fire_error`` (with the
        corresponding ``*_timed_out`` flag) in the arguments given to
        ``self._jsonify``.

        Raises:
            RuleConversionError: if a rule SMARTS cannot be turned into an
                initialized RDKit reaction.
            ChemConversionError: if an InChI cannot be turned into a
                standardized RDKit molecule.
        """
        for rindex, rsmarts in enumerate(self._rsmarts_list):
            # Rule IDs are optional: use None when no ID list was provided.
            rid = self._rid_list[rindex] if self._rid_list else None
            # Get RDKit reaction object
            try:
                rd_rule = AllChem.ReactionFromSmarts(rsmarts)
                rd_rule.Initialize()
            except Exception as e:
                raise RuleConversionError(e) from e

            for cindex, inchi in enumerate(self._inchi_list):
                # Compound IDs are optional as well.
                cid = self._cid_list[cindex] if self._cid_list else None
                # Get standardized RDKit mol
                try:
                    rd_mol = Chem.MolFromInchi(
                        inchi, sanitize=False)  # Important: Sanitize = False
                    rd_mol = standardize_chemical(rd_mol,
                                                  add_hs=False,
                                                  rm_stereo=True,
                                                  heavy=True)
                except Exception as e:
                    raise ChemConversionError(e) from e
                # General args used for both matching and firing
                kwargs = {'rd_rule': rd_rule, 'rd_mol': rd_mol}

                # Matching (all fields stay None when matching is disabled)
                has_match = None
                match_exec_time = None
                match_timed_out = None
                match_error = None
                if self._try_match:
                    try:
                        has_match, match_exec_time = self._run_with_timeout(
                            worker=worker_match,
                            kwargs=kwargs,
                            timeout=self._match_timeout)
                        match_timed_out = False
                        match_error = None
                    except mp.TimeoutError as e:
                        has_match = None
                        match_exec_time = None
                        match_timed_out = True
                        match_error = str(e)
                    except Exception as e:
                        has_match = None
                        match_exec_time = None
                        match_timed_out = False
                        match_error = str(e)

                # Firing
                # Reset the timing for this pair: the ChemConversionError
                # branch below does not assign it, so without this reset it
                # could be unbound (first pair) or carry over the value of
                # the previous pair when the error occurs before the worker
                # returns.
                fire_exec_time = None
                try:
                    ans, fire_exec_time = self._run_with_timeout(
                        worker=worker_fire,
                        kwargs=kwargs,
                        timeout=self._fire_timeout)
                    # The second returned value is not used here.
                    rdmols, _failed = standardize_results(ans,
                                                          add_hs=False,
                                                          rm_stereo=True)
                    inchikeys, inchis, smiles = handle_results(rdmols)
                    fire_timed_out = False
                    fire_error = None
                except ChemConversionError as e:
                    # fire_exec_time is kept: if the worker itself succeeded,
                    # its measured time is still reported.
                    inchikeys = None
                    inchis = None
                    smiles = None
                    fire_timed_out = False
                    fire_error = str(e)
                    logging.warning(e)
                except mp.TimeoutError as e:
                    fire_exec_time = None
                    inchikeys = None
                    inchis = None
                    smiles = None
                    fire_timed_out = True
                    fire_error = str(e)
                    logging.error('TIMEOUT: cid={}, rid={}'.format(cid, rid))
                    logging.error('TIMEOUT: original error={}'.format(e))
                except Exception as e:
                    fire_exec_time = None
                    inchikeys = None
                    inchis = None
                    smiles = None
                    fire_timed_out = False
                    fire_error = str(e)
                    logging.warning(e)

                # JSONify and store
                json_str = self._jsonify(rsmarts=rsmarts,
                                         inchi=inchi,
                                         rid=rid,
                                         cid=cid,
                                         has_match=has_match,
                                         match_timed_out=match_timed_out,
                                         match_exec_time=match_exec_time,
                                         match_error=match_error,
                                         fire_timed_out=fire_timed_out,
                                         fire_exec_time=fire_exec_time,
                                         fire_error=fire_error,
                                         inchikeys_list=inchikeys,
                                         inchis_list=inchis,
                                         smiles_list=smiles)
                self._json.append(json_str)
def __run__(rule_address_with_H=None,
            rule_address_without_H=None,
            rm_stereo=True):

    all_ecfps = {}

    rule_10_subset_address = "{}/tests/data/rules_r10_subset.tsv".format(
        os.path.dirname(os.path.abspath(__file__)))
    applicable_rules_10_dict_sim = {}
    with open(rule_10_subset_address, "r") as csv_file:
        fieldnames = [
            "Rule_ID", "Reaction_ID", "Diameter", "Direction", "Rule_order",
            "Rule_SMARTS", "Substrate_ID", "Substrate_SMILES", "Product_IDs",
            "Product_SMILES", "Rule_SMILES", "Rule_SMARTS_lite"
        ]
        csv_reader = csv.DictReader(csv_file,
                                    delimiter='\t',
                                    fieldnames=fieldnames)
        next(csv_reader)  # skip first line
        for element in csv_reader:
            applicable_rules_10_dict_sim[element["Rule_ID"]] = {
                "Rule_SMARTS": element["Rule_SMARTS"],
                "biological_score": 1,
                "EC_number": ["EC: None"],
                "Rule_SMILES": element["Rule_SMILES"]
            }
            # Obtaining substrate ECFP:
            if element["Substrate_ID"] in all_ecfps.keys():
                ECFP = all_ecfps[element["Substrate_ID"]]
            else:
                rd_mol = Chem.MolFromSmiles(
                    element["Substrate_SMILES"],
                    sanitize=False)  # Important: Sanitize = False
                rd_mol = standardize_chemical(rd_mol,
                                              add_hs=True,
                                              rm_stereo=rm_stereo)
                ECFP = AllChem.GetMorganFingerprintAsBitVect(
                    rd_mol,
                    radius=2,
                    nBits=1024,
                    useFeatures=False,
                    useChirality=not rm_stereo)
                all_ecfps[element["Substrate_ID"]] = ECFP
            applicable_rules_10_dict_sim[
                element["Rule_ID"]]["substrate_ECFP"] = ECFP
            product_names = element["Product_IDs"].split(".")
            product_smiles = element["Product_SMILES"].split(".")
            applicable_rules_10_dict_sim[
                element["Rule_ID"]]["products_ECFP"] = []
            for i in range(len(product_smiles)):
                if product_names[i] in all_ecfps.keys():
                    ECFP = all_ecfps[product_names[i]]
                else:
                    rd_mol = Chem.MolFromSmiles(
                        product_smiles[i],
                        sanitize=False)  # Important: Sanitize = False
                    rd_mol = standardize_chemical(rd_mol,
                                                  add_hs=True,
                                                  rm_stereo=rm_stereo)
                    ECFP = AllChem.GetMorganFingerprintAsBitVect(
                        rd_mol,
                        radius=2,
                        nBits=1024,
                        useFeatures=False,
                        useChirality=not rm_stereo)
                    all_ecfps[product_names[i]] = ECFP
                applicable_rules_10_dict_sim[
                    element["Rule_ID"]]["products_ECFP"].append(ECFP)

    rule_mixed_subset_address = "{}/tests/data/rules_mixed_subset.tsv".format(
        os.path.dirname(os.path.abspath(__file__)))
    applicable_rules_mixed_dict_sim = {}
    with open(rule_mixed_subset_address, "r") as csv_file:
        fieldnames = [
            "Rule_ID", "Reaction_ID", "Diameter", "Direction", "Rule_order",
            "Rule_SMARTS", "Substrate_ID", "Substrate_SMILES", "Product_IDs",
            "Product_SMILES", "Rule_SMILES", "Rule_SMARTS_lite"
        ]
        csv_reader = csv.DictReader(csv_file,
                                    delimiter='\t',
                                    fieldnames=fieldnames)
        next(csv_reader)  # skip first line
        for element in csv_reader:
            applicable_rules_mixed_dict_sim[element["Rule_ID"]] = {
                "Rule_SMARTS": element["Rule_SMARTS"],
                "biological_score": 1,
                "EC_number": ["EC: None"],
                "Rule_SMILES": element["Rule_SMILES"]
            }
            if element["Substrate_ID"] in all_ecfps.keys():
                ECFP = all_ecfps[element["Substrate_ID"]]
            else:
                rd_mol = Chem.MolFromSmiles(
                    element["Substrate_SMILES"],
                    sanitize=False)  # Important: Sanitize = False
                rd_mol = standardize_chemical(rd_mol,
                                              add_hs=True,
                                              rm_stereo=rm_stereo)
                ECFP = AllChem.GetMorganFingerprintAsBitVect(
                    rd_mol,
                    radius=2,
                    nBits=1024,
                    useFeatures=False,
                    useChirality=not rm_stereo)
                all_ecfps[element["Substrate_ID"]] = ECFP
            applicable_rules_mixed_dict_sim[
                element["Rule_ID"]]["substrate_ECFP"] = ECFP
            product_names = element["Product_IDs"].split(".")
            product_smiles = element["Product_SMILES"].split(".")
            applicable_rules_mixed_dict_sim[
                element["Rule_ID"]]["products_ECFP"] = []
            for i in range(len(product_smiles)):
                if product_names[i] in all_ecfps.keys():
                    ECFP = all_ecfps[product_names[i]]
                else:
                    rd_mol = Chem.MolFromSmiles(
                        product_smiles[i],
                        sanitize=False)  # Important: Sanitize = False
                    rd_mol = standardize_chemical(rd_mol,
                                                  add_hs=True,
                                                  rm_stereo=rm_stereo)
                    ECFP = AllChem.GetMorganFingerprintAsBitVect(
                        rd_mol,
                        radius=2,
                        nBits=1024,
                        useFeatures=False,
                        useChirality=not rm_stereo)
                    all_ecfps[product_names[i]] = ECFP
                applicable_rules_mixed_dict_sim[
                    element["Rule_ID"]]["products_ECFP"].append(ECFP)

    def merge_rule_characteristics(current_characteristics,
                                   new_characteristics, rule_id):
        """
        Merge new_characteristics of a rule into current_characteristics.

        Rule IDs are based on hashed SMARTS, so rules coming from different
        original reactions can share the same ID; this function is called
        when such a duplicate ID is met while parsing a dataset.

        It handles:
        - Rule_SMARTS and Diameter, which must be equal in both entries
        - substrate and product IDs together with their ECFPs
        - the biological score (the highest one is kept)
        - EC numbers
        - Reaction_ID

        current_characteristics is modified in place and returned; rule_id is
        only used in log messages.
        """
        # Both versions must describe the very same rule.
        for invariant_key in ("Rule_SMARTS", "Diameter"):
            assert current_characteristics[
                invariant_key] == new_characteristics[invariant_key]

        # Substrate/products are added only when at least one of them is new.
        incoming_substrate = new_characteristics["Substrate_ID"][0]
        substrate_known = (incoming_substrate
                           in current_characteristics["Substrate_ID"])
        incoming_products = new_characteristics["Product_IDs"][0]
        products_known = (incoming_products
                          in current_characteristics["Product_IDs"])
        if not (substrate_known and products_known):
            current_characteristics["Substrate_ID"].append(incoming_substrate)
            current_characteristics["substrate_ECFP"].append(
                new_characteristics["substrate_ECFP"][0])
            current_characteristics["Product_IDs"].append(incoming_products)
            current_characteristics["products_ECFP"].append(
                new_characteristics["products_ECFP"][0])
        else:
            logging.info(
                "Hashed rule {} versions ({} and {}) are based on the same substrates and products"
                .format(rule_id, new_characteristics["Reaction_ID"][0],
                        current_characteristics["Reaction_ID"]))

        # Biological score: keep the larger of the two when they differ.
        current_score = current_characteristics["biological_score"]
        incoming_score = new_characteristics["biological_score"]
        if current_score != incoming_score:
            logging.info("Different biological scores for {}".format(rule_id))
            current_characteristics["biological_score"] = max(
                current_score, incoming_score)

        # EC numbers: add the ones not recorded yet, one at a time so that
        # duplicates inside the incoming list are also filtered out.
        known_ec_numbers = current_characteristics["EC_number"]
        for ec_number in new_characteristics["EC_number"]:
            if ec_number not in known_ec_numbers:
                known_ec_numbers.append(ec_number)

        # Original reaction identifiers.
        incoming_reaction = new_characteristics["Reaction_ID"][0]
        if incoming_reaction not in current_characteristics["Reaction_ID"]:
            current_characteristics["Reaction_ID"].append(incoming_reaction)
        return current_characteristics

    logging.info(
        "-------------------Currently processing rules with Hs--------------------------"
    )
    all_ecfps = {}
    full_rules_retro_H = {}
    full_rules_forward_H = {}

    if rule_address_with_H is None:
        logging.warning(
            "No rule address with H was given. Moving on to the next dataset")
    else:
        with open(rule_address_with_H, "r") as csv_file:
            csv_reader = csv.DictReader(csv_file, delimiter='\t')
            next(csv_reader)
            # Following booleans are for logging missing columns only once.
            # They are useful when using rule formats without as much information as RetroRules.
            first_log_biological_None = True
            first_log_EC_None = True
            first_log_rule_usage_None = True
            first_log_diameter_None = True
            first_log_rule_smiles_None = True
            first_log_chemistry_sub_None = True
            first_log_chemistry_sub_name_None = True
            first_log_chemistry_prod_None = True
            first_log_chemistry_prod_name_None = True
            for element in csv_reader:
                try:
                    rule_id = element["Rule_ID"]
                except KeyError:
                    try:
                        rule_id = element["# Rule_ID"]
                    except KeyError:
                        message = "Rule_ID is mandatory. If you do not have such a column, please create it"
                        raise IncorrectFormatting(message)
                try:
                    reac_id = element["Reaction_ID"]
                except KeyError:
                    message = "Reaction_ID is mandatory. If you do not have such a column, please create it"
                    raise IncorrectFormatting(message)
                try:
                    try:
                        biological_score = float(element["Score_normalized"])
                    except ValueError:
                        logging.warning(
                            "No biological score for {}. Is set to 1".format(
                                rule_id))
                except KeyError:
                    if first_log_biological_None:
                        first_log_biological_None = False
                        logging.warning(
                            "No column Score_normalized. Default biological score is set to 1"
                        )
                    biological_score = 1

                # Calculate ECFPs - substrate
                try:
                    sub_smiles = element["Substrate_SMILES"]
                    try:
                        sub_id = element["Substrate_ID"]
                        # ID and smiles present, calculate chemistry.
                        if sub_id in all_ecfps.keys():
                            sub_ECFP = all_ecfps[sub_id]
                        else:
                            rd_mol = Chem.MolFromSmiles(
                                sub_smiles,
                                sanitize=False)  # Important: Sanitize = False
                            rd_mol = standardize_chemical(rd_mol,
                                                          add_hs=True,
                                                          rm_stereo=rm_stereo)
                            sub_ECFP = AllChem.GetMorganFingerprintAsBitVect(
                                rd_mol,
                                radius=2,
                                nBits=1024,
                                useFeatures=False,
                                useChirality=not rm_stereo)
                            all_ecfps[sub_id] = sub_ECFP
                    except KeyError:
                        message = "Substrate_ID is mandatory is you specify Substrate_SMILES \n"
                        message_bis = "If you do not have such a column, please create it. You can use InChIKey if you do not have a proper ID"
                        raise IncorrectFormatting(message + message_bis)
                except KeyError:
                    if first_log_chemistry_sub_None:
                        first_log_chemistry_sub_None = False
                        logging.warning(
                            "No column Substrate_SMILES. Set to None \n Chemical scoring will not be available"
                        )
                    sub_smiles = None
                    sub_ECFP = None
                    try:
                        sub_id = element["Substrate_ID"]
                    except KeyError:
                        if first_log_chemistry_sub_name_None:
                            first_log_chemistry_sub_name_None = False
                            logging.warning(
                                "No column Substrate_ID. Set to Unspecified")
                        sub_id = "Unspecified"
                # # Calculate ECFPs - products
                try:
                    product_smiles = element["Product_SMILES"].split(".")
                    try:
                        product_ids = element["Product_IDs"].split(".")
                        assert len(product_smiles) == len(product_ids)
                        # All information is avalaible for ECFP calculation for products
                        products_ECFPs = []
                        for i in range(len(product_smiles)):
                            if product_ids[i] in all_ecfps.keys():
                                prod_ECFP = all_ecfps[product_ids[i]]
                            else:
                                rd_mol = Chem.MolFromSmiles(
                                    product_smiles[i], sanitize=False
                                )  # Important: Sanitize = False
                                rd_mol = standardize_chemical(
                                    rd_mol, add_hs=True, rm_stereo=rm_stereo)
                                prod_ECFP = AllChem.GetMorganFingerprintAsBitVect(
                                    rd_mol,
                                    radius=2,
                                    nBits=1024,
                                    useFeatures=False,
                                    useChirality=not rm_stereo)
                                all_ecfps[product_ids[i]] = prod_ECFP
                            products_ECFPs.append(prod_ECFP)
                        product_ids = [set(element['Product_IDs'].split('.'))]
                    except KeyError:
                        message = "Product_IDs is mandatory is you specify Product_SMILES \n"
                        message_bis = "If you do not have such a column, please create it. You can use InChIKeys if you do not have proper IDs"
                        raise IncorrectFormatting(message + message_bis)
                    except AssertionError:
                        message = "Product IDs and structures do not have the same len for {} ({})".format(
                            product_names, rule_id)
                        raise IncorrectFormatting(message)
                except KeyError:
                    if first_log_chemistry_prod_None:
                        first_log_chemistry_prod_None = False
                        logging.warning(
                            "No column product_smiles. Set to None \n Chemical scoring will not be available"
                        )
                    product_smiles = None
                    products_ECFPs = None
                    try:
                        product_ids = element["Product_IDs"]
                    except KeyError:
                        if first_log_chemistry_prod_name_None:
                            first_log_chemistry_prod_name_None = False
                            logging.warning(
                                "No column Product_IDs. Set to Unspecified")
                        product_ids = "Unspecified"
                # EC number:
                try:
                    EC_number = element["Reaction_EC_number"].split(",")
                except KeyError:
                    if first_log_EC_None:
                        first_log_EC_None = False
                        logging.warning(
                            "No column Reaction_EC_number. Default EC is set to Unspecified"
                        )
                    EC_number = "Unspecified"

                # Rule characteristics
                # Keep smart, keep original ids as lists
                try:
                    diameter = element["Diameter"]
                except KeyError:
                    if first_log_diameter_None:
                        first_log_diameter_None = False
                        logging.warning(
                            "No column Diameter. Default diameter is set to 0")
                    diameter = 0
                # Load rule smarts
                try:
                    Rule_SMARTS = element["Rule_SMARTS"]
                except KeyError:
                    message = "Rule_SMARTS is mandatory. If you do not have such a column, please create it"
                    raise IncorrectFormatting(message)
                try:
                    Rule_SMILES = element["Rule_SMILES"]
                except KeyError:
                    if first_log_rule_smiles_None:
                        first_log_rule_smiles_None = False
                        logging.warning(
                            "No column Rule_SMILES. Default Rule_SMILES is set to empty string"
                        )
                    Rule_SMILES = ""
                rule_characteristics_extended = {
                    "Rule_SMARTS": Rule_SMARTS,
                    "biological_score": biological_score,
                    "EC_number": EC_number,
                    "substrate_ECFP": [sub_ECFP],
                    "products_ECFP": [products_ECFPs],
                    "Reaction_ID": [reac_id],
                    "Substrate_ID": [sub_id],
                    "Product_IDs": product_ids,
                    "Diameter": diameter,
                    "Rule_SMILES": Rule_SMILES
                }
                try:
                    usage = element["Rule_usage"]
                except KeyError:
                    if first_log_rule_usage_None:
                        logging.warning(
                            "No column Rule_usage. Default usage is set to both directions"
                        )
                        first_log_rule_usage_None = False
                    usage = "both"

                if usage == "forward":
                    if rule_id in full_rules_forward_H.keys():
                        new_characteristics = merge_rule_characteristics(
                            current_characteristics=full_rules_forward_H[
                                rule_id],
                            new_characteristics=rule_characteristics_extended,
                            rule_id=rule_id)
                        full_rules_forward_H[rule_id] = new_characteristics
                    else:
                        full_rules_forward_H[
                            rule_id] = rule_characteristics_extended
                elif usage == "retro":
                    if rule_id in full_rules_retro_H.keys():
                        new_characteristics = merge_rule_characteristics(
                            current_characteristics=full_rules_retro_H[
                                rule_id],
                            new_characteristics=rule_characteristics_extended,
                            rule_id=rule_id)
                        full_rules_retro_H[rule_id] = new_characteristics
                    else:
                        full_rules_retro_H[
                            rule_id] = rule_characteristics_extended
                elif usage == "both":
                    # Adding to retro
                    if rule_id in full_rules_retro_H.keys():
                        new_characteristics = merge_rule_characteristics(
                            current_characteristics=full_rules_retro_H[
                                rule_id],
                            new_characteristics=rule_characteristics_extended,
                            rule_id=rule_id)
                        full_rules_retro_H[rule_id] = new_characteristics
                    else:
                        full_rules_retro_H[
                            rule_id] = rule_characteristics_extended
                    # Adding to forward
                    if rule_id in full_rules_forward_H.keys():
                        new_characteristics = merge_rule_characteristics(
                            current_characteristics=full_rules_forward_H[
                                rule_id],
                            new_characteristics=rule_characteristics_extended,
                            rule_id=rule_id)
                        full_rules_forward_H[rule_id] = new_characteristics
                    else:
                        full_rules_forward_H[
                            rule_id] = rule_characteristics_extended
                else:
                    raise NotImplementedError

        # Attributes kept in the pickled rule sets; every other key of a
        # rule's characteristics is dropped below.
        useful_characteristics = [
            "Rule_SMARTS",
            "biological_score",
            "EC_number",
            "substrate_ECFP",
            "products_ECFP",
            "Reaction_ID",
            "Diameter",
            "Rule_SMILES",
        ]

        # (pickle file stem, rule set) pairs, forward first and retro second.
        rule_sets_to_save = (
            ("full_rules_forward_H", full_rules_forward_H),
            ("full_rules_retro_H", full_rules_retro_H),
        )

        # Trim every rule of both sets in place first ...
        for pickle_name, rule_set in rule_sets_to_save:
            for rid in rule_set:
                untrimmed = rule_set[rid]
                rule_set[rid] = {
                    attribute: untrimmed[attribute]
                    for attribute in useful_characteristics
                }

        # ... and only then write each set to "<folder_to_save>/<stem>.pkl".
        for pickle_name, rule_set in rule_sets_to_save:
            with open("{}/{}.pkl".format(folder_to_save, pickle_name),
                      "wb") as pickle_handler:
                pickle.dump(rule_set, pickle_handler)

    # From here on, the rule file without explicit hydrogens is processed.
    logging.info(
        "-------------------Currently processing rules without Hs--------------------------"
    )

    # Fingerprint cache for this pass: chemical ID -> ECFP bit vector, so a
    # compound seen in several rows is only standardised once.
    all_ecfps_no_H = {}
    # Rule ID -> rule characteristics, one dict per usage direction.
    full_rules_retro_no_H = {}
    full_rules_forward_no_H = {}

    if rule_address_without_H is None:
        logging.warning("No rule address without H was given.")
    else:
        with open(rule_address_without_H, "r") as csv_file:
            # Rows are read as dicts keyed by the tab-separated header line.
            csv_reader = csv.DictReader(csv_file, delimiter='\t')
            # NOTE(review): DictReader takes its field names from the first
            # line on its own, so this next() discards the first *data* row,
            # not the header. Confirm this is intended (e.g. the input files
            # carry a second header-like line) - otherwise the first rule of
            # the file is silently dropped.
            next(csv_reader)
            # One-shot flags: each "optional column is missing" warning below
            # is logged for the first row only, then its flag is cleared.
            first_log_biological_None = True
            first_log_EC_None = True
            first_log_rule_usage_None = True
            first_log_diameter_None = True
            first_log_rule_smiles_None = True
            first_log_chemistry_sub_None = True
            first_log_chemistry_sub_name_None = True
            first_log_chemistry_prod_None = True
            first_log_chemistry_prod_name_None = True
            for element in csv_reader:
                # Mandatory identifier: the column may be called "Rule_ID" or
                # "# Rule_ID" (header written as a comment line).
                if "Rule_ID" in element:
                    rule_id = element["Rule_ID"]
                elif "# Rule_ID" in element:
                    rule_id = element["# Rule_ID"]
                else:
                    raise IncorrectFormatting(
                        "Rule_ID is mandatory. If you do not have such a column, please create it"
                    )

                # Optional biological score: defaults to 1 when the column is
                # absent (warned once) or the value is not a number (warned
                # for every such row).
                if "Score_normalized" in element:
                    try:
                        biological_score = float(element["Score_normalized"])
                    except ValueError:
                        logging.warning(
                            "No biological score for {}. Is set to 1".format(
                                rule_id))
                        biological_score = 1
                else:
                    if first_log_biological_None:
                        first_log_biological_None = False
                        logging.warning(
                            "No column Score_normalized. Default biological score is set to 1"
                        )
                    biological_score = 1

                # Calculate ECFPs - substrate
                # Sets sub_smiles, sub_id and sub_ECFP for the current row.
                # Only the column lookups are guarded by try/except, so that
                # a KeyError raised while standardising or fingerprinting a
                # molecule is not misreported as a missing column.
                try:
                    sub_smiles = element["Substrate_SMILES"]
                except KeyError:
                    # No structure column: chemistry is disabled for this row
                    # and the ID falls back to a placeholder if absent too.
                    if first_log_chemistry_sub_None:
                        first_log_chemistry_sub_None = False
                        logging.warning(
                            "No column Substrate_SMILES. Set to None. \n Chemical scoring will not be available"
                        )
                    sub_smiles = None
                    sub_ECFP = None
                    try:
                        sub_id = element["Substrate_ID"]
                    except KeyError:
                        if first_log_chemistry_sub_name_None:
                            first_log_chemistry_sub_name_None = False
                            logging.warning(
                                "No column Substrate_ID. Set to Unspecified")
                        sub_id = "Unspecified"
                else:
                    # A structure without an ID cannot be cached or tracked.
                    try:
                        sub_id = element["Substrate_ID"]
                    except KeyError:
                        message = "Substrate_ID is mandatory is you specify Substrate_SMILES \n"
                        message_bis = "If you do not have such a column, please create it. You can use InChIKey if you do not have a proper ID"
                        raise IncorrectFormatting(message + message_bis)
                    # ID and smiles present, calculate chemistry (or reuse the
                    # fingerprint already cached for this ID).
                    if sub_id in all_ecfps_no_H:
                        sub_ECFP = all_ecfps_no_H[sub_id]
                    else:
                        rd_mol = Chem.MolFromSmiles(
                            sub_smiles,
                            sanitize=False)  # Important: Sanitize = False
                        rd_mol = standardize_chemical(rd_mol,
                                                      add_hs=False,
                                                      rm_stereo=rm_stereo)
                        sub_ECFP = AllChem.GetMorganFingerprintAsBitVect(
                            rd_mol,
                            radius=2,
                            nBits=1024,
                            useFeatures=False,
                            useChirality=not rm_stereo)
                        all_ecfps_no_H[sub_id] = sub_ECFP
                ## Calculate ECFP:
                # Sets product_smiles, product_ids and products_ECFPs for the
                # current row. Product_SMILES and Product_IDs are "."-separated
                # lists that must have the same number of entries.
                # Only the column lookups are guarded by try/except, so that
                # a KeyError raised while standardising or fingerprinting a
                # molecule is not misreported as a missing column.
                try:
                    product_smiles = element["Product_SMILES"].split(".")
                except KeyError:
                    # No structure column: chemistry is disabled for this row
                    # and the IDs fall back to a placeholder if absent too.
                    if first_log_chemistry_prod_None:
                        first_log_chemistry_prod_None = False
                        logging.warning(
                            "No column product_smiles. Set to None \n Chemical scoring will not be available"
                        )
                    product_smiles = None
                    products_ECFPs = None
                    try:
                        product_ids = element["Product_IDs"]
                    except KeyError:
                        if first_log_chemistry_prod_name_None:
                            first_log_chemistry_prod_name_None = False
                            logging.warning(
                                "No column Product_IDs. Set to Unspecified")
                        product_ids = "Unspecified"
                else:
                    # Structures without IDs cannot be cached or tracked.
                    try:
                        product_ids = element["Product_IDs"].split(".")
                    except KeyError:
                        message = "Product_IDs is mandatory is you specify Product_SMILES \n"
                        message_bis = "If you do not have such a column, please create it. You can use InChIKeys if you do not have proper IDs"
                        raise IncorrectFormatting(message + message_bis)
                    # Explicit check instead of `assert`, which is removed
                    # when Python runs with -O.
                    if len(product_smiles) != len(product_ids):
                        message = "Product IDs and structures do not have the same len for {} ({})".format(
                            product_ids, rule_id)
                        raise IncorrectFormatting(message)
                    # All information is available for ECFP calculation for products
                    products_ECFPs = []
                    for single_id, single_smiles in zip(product_ids,
                                                        product_smiles):
                        if single_id in all_ecfps_no_H:
                            prod_ECFP = all_ecfps_no_H[single_id]
                        else:
                            rd_mol = Chem.MolFromSmiles(
                                single_smiles, sanitize=False
                            )  # Important: Sanitize = False
                            rd_mol = standardize_chemical(
                                rd_mol, add_hs=False, rm_stereo=rm_stereo)
                            prod_ECFP = AllChem.GetMorganFingerprintAsBitVect(
                                rd_mol,
                                radius=2,
                                nBits=1024,
                                useFeatures=False,
                                useChirality=not rm_stereo)
                            # Store in the same no-H cache that is looked up
                            # just above, so the fingerprint is reused.
                            all_ecfps_no_H[single_id] = prod_ECFP
                        products_ECFPs.append(prod_ECFP)
                    # Kept as a one-element list holding the set of IDs.
                    product_ids = [set(product_ids)]
                # EC number: comma-separated list, or the placeholder string
                # when the column is absent (warned once).
                if "Reaction_EC_number" in element:
                    EC_number = element["Reaction_EC_number"].split(",")
                else:
                    if first_log_EC_None:
                        first_log_EC_None = False
                        logging.warning(
                            "No column Ec_number. Default EC is set to Unspecified"
                        )
                    EC_number = "Unspecified"

                # Rule characteristics
                # Diameter is optional and defaults to 0 (warned once).
                if "Diameter" in element:
                    diameter = element["Diameter"]
                else:
                    if first_log_diameter_None:
                        first_log_diameter_None = False
                        logging.warning(
                            "No column Diameter. Default diameter is set to 0")
                    diameter = 0

                # Reaction_ID and Rule_SMARTS are mandatory columns.
                if "Reaction_ID" not in element:
                    raise IncorrectFormatting(
                        "Reaction_ID is mandatory. If you do not have such a column, please create it"
                    )
                reac_id = element["Reaction_ID"]

                if "Rule_SMARTS" not in element:
                    raise IncorrectFormatting(
                        "Rule_SMARTS is mandatory. If you do not have such a column, please create it"
                    )
                Rule_SMARTS = element["Rule_SMARTS"]

                # Rule_SMILES is optional and defaults to "" (warned once).
                if "Rule_SMILES" in element:
                    Rule_SMILES = element["Rule_SMILES"]
                else:
                    if first_log_rule_smiles_None:
                        first_log_rule_smiles_None = False
                        logging.warning(
                            "No column Rule_SMILES. Default Rule_SMILES is set to empty string"
                        )
                    Rule_SMILES = ""

                # Record describing this row; per-reaction values are wrapped
                # in one-element lists.
                rule_characteristics_extended = {
                    "Rule_SMARTS": Rule_SMARTS,
                    "biological_score": biological_score,
                    "EC_number": EC_number,
                    "substrate_ECFP": [sub_ECFP],
                    "products_ECFP": [products_ECFPs],
                    "Reaction_ID": [reac_id],
                    "Substrate_ID": [sub_id],
                    "Product_IDs": product_ids,
                    "Diameter": diameter,
                    "Rule_SMILES": Rule_SMILES,
                }

                # Direction(s) the rule may be used in; without the column
                # every rule is registered in both directions (warned once).
                if "Rule_usage" in element:
                    usage = element["Rule_usage"]
                else:
                    if first_log_rule_usage_None:
                        logging.warning(
                            "No column Rule_usage. Default usage is set to both directions"
                        )
                        first_log_rule_usage_None = False
                    usage = "both"

                # Select the rule set(s) to update; "both" updates retro
                # first, then forward.
                if usage == "forward":
                    target_rule_sets = (full_rules_forward_no_H, )
                elif usage == "retro":
                    target_rule_sets = (full_rules_retro_no_H, )
                elif usage == "both":
                    target_rule_sets = (full_rules_retro_no_H,
                                        full_rules_forward_no_H)
                else:
                    raise NotImplementedError

                # A rule ID seen before is merged with its stored record;
                # a new one is stored as is.
                for rule_set in target_rule_sets:
                    if rule_id in rule_set:
                        rule_set[rule_id] = merge_rule_characteristics(
                            current_characteristics=rule_set[rule_id],
                            new_characteristics=rule_characteristics_extended,
                            rule_id=rule_id)
                    else:
                        rule_set[rule_id] = rule_characteristics_extended

        # Attributes kept in the pickled rule sets; every other key of a
        # rule's characteristics is dropped below.
        useful_characteristics = [
            "Rule_SMARTS",
            "biological_score",
            "EC_number",
            "substrate_ECFP",
            "products_ECFP",
            "Reaction_ID",
            "Diameter",
            "Rule_SMILES",
        ]

        # (pickle file stem, rule set) pairs, forward first and retro second.
        rule_sets_to_save = (
            ("full_rules_forward_no_H", full_rules_forward_no_H),
            ("full_rules_retro_no_H", full_rules_retro_no_H),
        )

        # Trim every rule of both sets in place first ...
        for pickle_name, rule_set in rule_sets_to_save:
            for rid in rule_set:
                untrimmed = rule_set[rid]
                rule_set[rid] = {
                    attribute: untrimmed[attribute]
                    for attribute in useful_characteristics
                }

        # ... and only then write each set to "<folder_to_save>/<stem>.pkl".
        for pickle_name, rule_set in rule_sets_to_save:
            with open("{}/{}.pkl".format(folder_to_save, pickle_name),
                      "wb") as pickle_handler:
                pickle.dump(rule_set, pickle_handler)