Exemplo n.º 1
0
def smiles_to_inchi_inchikey(smiles,verbose=0):
    """Convert an iterable of SMILES strings to InChI strings and InChIKeys.

    Every SMILES is parsed with ``Chem.MolFromSmiles`` and converted with
    ``rdinchi.MolToInchi`` / ``rdinchi.InchiToInchiKey``.  As a consistency
    check the InChI is converted back to a molecule and to InChI a second
    time; every position where the first and the round-tripped InChI differ
    is counted as a warning.  A summary (errors / warnings) is always printed.

    smiles: iterable of SMILES strings
    verbose: any value other than 0 enables per-entry diagnostic printing

    Returns a tuple ``(inchis, inchiKeys)`` of two lists aligned with the
    input order.  Entries whose SMILES could not be parsed hold the
    placeholder ``"XXX"`` in both lists.
    """
    placeholder = "XXX"
    error_counter = 0
    warning_counter = 0
    inchis = []
    roundtrip_inchis = []
    inchi_keys = []

    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            # Unparsable SMILES: keep all three lists aligned with the input.
            inchis.append(placeholder)
            inchi_keys.append(placeholder)
            roundtrip_inchis.append(placeholder)
            if verbose != 0:
                print(elem+ ": can not converted added XXX instead! ")
            error_counter += 1
            continue

        inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
        inchi_key = rdinchi.InchiToInchiKey(inchi)
        inchis.append(inchi)
        inchi_keys.append(inchi_key)

        # Round trip InChI -> mol -> InChI.  Only ordinary exceptions are
        # treated as a failed round trip; KeyboardInterrupt / SystemExit
        # propagate instead of being swallowed.
        try:
            mol, retcode, message, logs = rdinchi.InchiToMol(inchi)
            if mol is not None:
                roundtrip = rdinchi.MolToInchi(mol)[0]
            else:
                roundtrip = placeholder
                if verbose != 0:
                    print(elem+ ": InChI cannot converted to mol object, added XXX instead!")
        except Exception:
            if verbose != 0:
                # retcode/message come from the most recent successful call.
                print(retcode)
                print(message)
                print("Smiles:"+elem)
            roundtrip = placeholder
        roundtrip_inchis.append(roundtrip)

    for first, second in zip(inchis, roundtrip_inchis):
        if first != second:
            if verbose != 0:
                print("Warning:"+first+" - "+second)
            warning_counter += 1

    print("\nGeneration of InChI and InChIKey from SMILES is completed.")
    print("Total errors:"+str(error_counter))
    print("Total warnings:"+str(warning_counter)+"\n")

    return inchis, inchi_keys
Exemplo n.º 2
0
def MolToInchiAndAuxInfo(mol,
                         options="",
                         logLevel=None,
                         treatWarningAsError=False):
    """Returns the standard InChI string and InChI auxInfo for a molecule

    Keyword arguments:
    options -- option string handed unchanged to rdinchi.MolToInchi
    logLevel -- the log level used for logging logs and messages from InChI
    API. set to None to disable the logging completely
    treatWarningAsError -- set to True to raise an exception in case of a
    molecule that generates warning in calling InChI API. The resultant InChI
    string and AuxInfo string as well as the error message are encoded in the
    exception.

    Returns:
    a tuple of the standard InChI string and the auxInfo string returned by
    InChI API, in that order, for the input molecule

    Raises:
    ValueError -- logLevel is not None and is not a key of
    logLevelToLogFunctionLookup
    InchiReadWriteError -- treatWarningAsError is True and the InChI API
    returned a non-zero return code
    """
    inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol, options)
    if logLevel is not None:
        if logLevel not in logLevelToLogFunctionLookup:
            # %r instead of %d: a non-integer level (e.g. a string) must still
            # produce the intended ValueError rather than a TypeError raised
            # by the message formatting itself.
            raise ValueError("Unsupported log level: %r" % (logLevel,))
        log = logLevelToLogFunctionLookup[logLevel]
        # The caller-selected level is only used for clean conversions;
        # non-zero return codes are always reported below.
        if retcode == 0:
            log(message)
    if retcode != 0:
        # Return code 1 is reported as a warning, anything else as an error.
        if retcode == 1:
            logger.warning(message)
        else:
            logger.error(message)

    if treatWarningAsError and retcode != 0:
        raise InchiReadWriteError(inchi, aux, message)
    return inchi, aux
Exemplo n.º 3
0
def sln_to_smiles(sln_list,verbose=0):
    """Convert an iterable of SLN strings to SMILES strings.

    Each SLN is parsed with ``rdSLNParse.MolFromQuerySLN`` and written out
    with ``Chem.MolToSmiles``.  The InChI generated from the SLN molecule is
    compared with the InChI generated from the re-parsed SMILES; every
    mismatch is counted as a warning.  A summary is always printed.

    sln_list: iterable of SLN strings
    verbose: any value other than 0 enables the warning / SMILES-failure
             messages (the SLN parse failure message is always printed)

    Returns the list of SMILES strings; an SLN that could not be parsed
    contributes the placeholder ``"XXX"``.
    """
    failures = 0
    mismatches = 0
    smiles_list = []
    sln_side = []
    smiles_side = []

    for elem in sln_list:
        sln_mol = rdSLNParse.MolFromQuerySLN(elem)

        if sln_mol is None:
            # Only the SMILES output gets a placeholder here; the two InChI
            # lists are left untouched so they stay paired with each other.
            smiles_list.append("XXX")
            print(elem+ ": SLN can not converted to mol object, added XXX instead!")
            failures += 1
            continue

        smiles = Chem.MolToSmiles(sln_mol)
        smiles_list.append(smiles)
        sln_side.append(rdinchi.MolToInchi(sln_mol)[0])

        reparsed = Chem.MolFromSmiles(smiles)
        if reparsed is None:
            smiles_side.append("XXX")
            if verbose != 0:
                print(elem+ ": SMILES cannot converted to mol object, added XXX instead!")
        else:
            smiles_side.append(rdinchi.MolToInchi(reparsed)[0])

    for from_sln, from_smiles in zip(sln_side, smiles_side):
        if from_sln == from_smiles:
            continue
        if verbose != 0:
            print("Warning:"+from_sln+" - "+from_smiles)
        mismatches += 1

    print("\nConversion from SLN to SMILES is completed.")
    print("Total errors:"+str(failures))
    print("Total warnings:"+str(mismatches)+"\n")

    return smiles_list
def parse_file(input_file, db_name):
    """ takes all text from nanpdb database file and returns an attribute
    name list plus a list of per-attribute columns.

    input_file: nanpdb database text; one record per line, tab separated,
                with the SMILES in the first column and the identifier in
                the second
    db_name: database name; it is appended to every parsed row, so it is
             what ends up as the identifier of a row that has only one
             column

    Returns (attribute_names, overall_list) where overall_list holds one
    list per attribute, in the same order as attribute_names.  The four
    classification columns are filled with 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',\
    'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',\
    'superclass_name', 'class_name', 'subclass_name']

    all_lines = input_file.split('\n')
    # Drop only the empty string produced by a trailing newline.  Slicing
    # off the last element unconditionally would silently discard the final
    # record of a file that does not end with a newline.
    if all_lines[-1] == '':
        all_lines = all_lines[:-1]

    all_info_list = []
    for line in all_lines:
        # Empty fields are replaced by the 'NA' marker.
        row = [value if value else "NA" for value in line.split('\t')]
        row.append(db_name)
        all_info_list.append(row)

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    mol_formula_list = []

    for row in all_info_list:
        # generate molecule from the SMILES column
        m = Chem.MolFromSmiles(row[0])

        # MonoisotopicMass: string form of the exact mass with its last
        # five characters cut off
        mol_mass_list.append(str(Descriptors.ExactMolWt(m))[:-5])

        # InChI is the first element of the tuple returned by MolToInchi
        inchi_list.append(rdinchi.MolToInchi(m)[0])

        SMILES_list.append(row[0])
        identifier_list.append(row[1])
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))

    # Placeholder column shared by the four classification attributes
    NA_list = ['NA'] * len(all_info_list)

    # InChIKey, split on '-' into its first and second block
    inchi_key1_list = []
    inchi_key2_list = []
    for inchi in inchi_list:
        key_parts = rdinchi.InchiToInchiKey(inchi).split('-')
        inchi_key1_list.append(key_parts[0])
        inchi_key2_list.append(key_parts[1])

    overall_list = [mol_mass_list, inchi_list, SMILES_list,
                    identifier_list, inchi_key2_list, inchi_key1_list,
                    mol_formula_list, NA_list, NA_list, NA_list, NA_list]

    return attribute_names, overall_list
Exemplo n.º 5
0
def parse_data(input_file):
    """ takes all text from norine database file and returns a list of lists
    with all CLASS data and an attribute list.

    input_file: norine database text; the first two lines are skipped, the
                remaining lines are tab separated with the identifier in the
                first column and the SMILES in the third

    Returns (attribute_names, overall_list) where overall_list holds one
    list per attribute, in the same order as attribute_names.  A record
    whose SMILES cannot be parsed is kept as a row with an empty SMILES and
    an empty identifier; an InChI that does not start with 'InCh' is stored
    as 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',\
    'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',\
    'superclass_name', 'class_name', 'subclass_name']

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    # Skip the two leading lines of the file.
    all_lines = input_file.split('\n')[2:]
    # A trailing newline leaves an empty last element; indexing its columns
    # would raise IndexError, so drop it.
    if all_lines and all_lines[-1] == '':
        all_lines.pop()

    pre_SMILES_list = []
    for line in all_lines:
        fields = line.split('\t')
        m = fields[2]
        id_name = fields[0]

        # Invalid structures are blanked out rather than removed so that
        # every column keeps one entry per input line.
        if Chem.MolFromSmiles(m) is not None:
            pre_SMILES_list.append(m)
            identifier_list.append(id_name)
        else:
            pre_SMILES_list.append('')
            identifier_list.append('')

    pre_inchi_list = []
    for smile in pre_SMILES_list:
        # NOTE(review): blanked entries are parsed again as '' here; this
        # presumably yields an empty molecule in RDKit -- confirm.
        m = Chem.MolFromSmiles(smile)
        SMILES_list.append(Chem.MolToSmiles(m))
        mol_mass_list.append(Descriptors.ExactMolWt(m))
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))
        # InChI is the first element of the tuple returned by MolToInchi
        pre_inchi_list.append(rdinchi.MolToInchi(m)[0])

    # Anything that does not look like an InChI is replaced by 'NA'
    for inchi in pre_inchi_list:
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

    # InChIKey1 and InChIKey2: first and second '-' separated block
    for inchi in inchi_list:
        if inchi == 'NA':
            inchi_key = "NA-NA"
        else:
            inchi_key = rdinchi.InchiToInchiKey(inchi)
        key_parts = inchi_key.split('-')
        inchi_key2_list.append(key_parts[1])
        inchi_key1_list.append(key_parts[0])

    # Placeholder column shared by the four classification attributes
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [mol_mass_list, inchi_list, SMILES_list,
                    identifier_list, inchi_key2_list, inchi_key1_list,
                    mol_formula_list, NA_list, NA_list, NA_list, NA_list]

    return attribute_names, overall_list
def create_CLASS_data(data_dict):
    """ Generates CLASS data for the structures listed in data_dict.

    data_dict: dictionary with the keys 'compound_id' (identifiers) and
               'canonical_smiles' (SMILES strings)

    Returns (attribute_names, overall_list) where overall_list holds one
    list per attribute, in the same order as attribute_names.  The
    identifier and SMILES columns are the very lists stored in data_dict;
    the four classification columns are filled with 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',\
    'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',\
    'superclass_name', 'class_name', 'subclass_name']

    # Identifier and SMILES columns are taken over as they are
    identifier_list = data_dict['compound_id']
    SMILES_list = data_dict['canonical_smiles']

    masses = []
    inchis = []
    formulas = []
    for smiles in SMILES_list:
        molecule = Chem.MolFromSmiles(smiles)

        # MonoisotopicMass: string form of the exact mass with its last
        # three characters cut off
        masses.append(str(Descriptors.ExactMolWt(molecule))[:-3])

        # InChI is the first element of the tuple returned by MolToInchi
        inchis.append(rdinchi.MolToInchi(molecule)[0])

        formulas.append(rdMolDescriptors.CalcMolFormula(molecule))

    # One 'NA' per structure, shared by the four classification columns
    na_column = ['NA'] * len(data_dict['canonical_smiles'])

    # InChIKey for every InChI, then its first and second '-' separated block
    inchi_keys = [rdinchi.InchiToInchiKey(inchi) for inchi in inchis]
    first_blocks = []
    second_blocks = []
    for key in inchi_keys:
        blocks = key.split('-')
        first_blocks.append(blocks[0])
        second_blocks.append(blocks[1])

    overall_list = [masses, inchis, SMILES_list, identifier_list,
                    second_blocks, first_blocks, formulas] + [na_column] * 4

    return attribute_names, overall_list
Exemplo n.º 7
0
def diff_mol_pdb(mol, pdbfile, logfile=devnull):
    """Compare a molecule with the molecule read from a PDB file.

    The PDB molecule is read from ``pdbfile``, bond orders are transferred
    to it from ``mol`` (used as template) where possible, and then three
    kinds of checks are printed: stoichiometric formula (without charge),
    the connectivity part of the Fixed H InChI, and - only when the
    template assignment succeeded - the full Standard and Fixed H InChI.

    mol: molecule object taken from the molecule file; it is passed to
         remove_isotopes() before the comparison
    pdbfile: passed to Chem.MolFromPDBFile
    logfile: target handed to stdout_redirected() for both sys_stderr and
             sys_stdout  # presumably all print output ends up there - confirm

    Returns a tuple (failnum, newpdbmol, nhpdbmol): the number of failed
    checks, the PDB molecule after template assignment and hydrogen
    correction (or the hydrogen-free PDB molecule if that step failed), and
    the hydrogen-free PDB molecule.

    Raises ParsingError if the PDB file cannot be read into a molecule even
    without sanitization.
    """
    with stdout_redirected(to=logfile, stdout=sys_stderr):
        with stdout_redirected(to=logfile, stdout=sys_stdout):
            # NOTE(review): remove_isotopes is defined elsewhere; its return
            # value is ignored here, so it presumably edits mol in place.
            remove_isotopes(mol, sanitize=True)
            # Hydrogen-free copy of the input molecule, used as the template
            nhmol = Chem.RemoveHs(mol,
                                  implicitOnly=False,
                                  updateExplicitCount=True,
                                  sanitize=True)
            # Kekulization is best effort: any failure is ignored
            try:
                Chem.Kekulize(nhmol)
            except:
                pass
            # Stays True only if the template assignment below succeeds
            checkconnect = True
            pdbmol = None
            # First attempt: read the PDB file with sanitization
            try:
                pdbmol = Chem.MolFromPDBFile(pdbfile,
                                             removeHs=False,
                                             sanitize=True)
            except:
                pass
            # Fallback: read without sanitization, then apply disconnect()
            # and sanitize with catchErrors=True
            if pdbmol is None:
                pdbmol = Chem.MolFromPDBFile(pdbfile,
                                             removeHs=False,
                                             sanitize=False)
                if pdbmol is None:
                    raise ParsingError("Cannot open PDB molecule.")
                pdbmol = disconnect(pdbmol)
                Chem.SanitizeMol(pdbmol, catchErrors=True)

            # Hydrogen-free copy of the PDB molecule
            nhpdbmol = Chem.RemoveHs(pdbmol,
                                     implicitOnly=False,
                                     updateExplicitCount=True,
                                     sanitize=False)

            Chem.SanitizeMol(nhpdbmol, catchErrors=True)

            # Transfer bond orders from the template, re-add hydrogens and
            # align them with the hydrogens of the original PDB molecule.
            # Any failure downgrades the comparison to heavy-atom connectivity.
            try:
                print(
                    'Applying bond orders and formal charges from molecule file to PDB molecule ... '
                )
                nhpdbmol = AssignBondOrdersFromTemplate(nhmol, nhpdbmol)
                newpdbmol = Chem.AddHs(nhpdbmol,
                                       addCoords=True,
                                       explicitOnly=True)
                newpdbmol.UpdatePropertyCache()
                newpdbmol = correct_hydrogen_num_from_pdbmol(pdbmol, newpdbmol)
                newpdbmol = set_hydrogen_coor_from_pdbmol(pdbmol,
                                                          newpdbmol,
                                                          refconfId=-1,
                                                          confId=-1)
            except Exception:
                print(
                    "WARNING: Cannot assign bond orders from molecule file template. Checking only non-hydrogen connectivity."
                )
                checkconnect = False
                newpdbmol = nhpdbmol
                pass

            #Stoichiometric formula check
            impnum = count_implicit_hydrogens(newpdbmol)
            failnum = 0
            result = 'OK'
            unformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(mol))
            pdbunformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(newpdbmol))
            #print(pdbunformula)
            # NOTE(review): fix_formula is defined elsewhere; presumably it
            # adjusts the hydrogen count of the formula by impnum - confirm.
            pdbunformula = fix_formula(pdbunformula, impnum)
            if unformula != pdbunformula:
                failnum += 1
                result = 'FAIL: Molecules have different Stoichiometric formulas ' + unformula + ' ' + pdbunformula + '.'
            print('Stoichiometric formula check (without charge): ' + result)

            # NOTE(review): in each of the four InChI generation steps below
            # the second test is a plain `if`, not `elif`, so msg is printed
            # for every value of code, including 0.
            print('Generating Fixed H InChI for molecule file ... ')
            inchi, code, msg, log, aux = rdinchi.MolToInchi(
                mol, options='-FixedH -DoNotAddH')
            if code == 0:
                #print(inchi)
                pass
            if code == 1:
                # print(inchi)
                print(msg)
            else:
                print(msg)

            print('Generating Standard InChI for molecule file ... ')
            sinchi, code, msg, log, aux = rdinchi.MolToInchi(
                mol, options=' -DoNotAddH')
            if code == 0:
                #print(sinchi)
                pass
            if code == 1:
                #print(sinchi)
                print(msg)
            else:
                print(msg)

            # Reduce the Fixed H InChI to the layers selected by 'connect'
            # (truncate_inchi is defined elsewhere)
            maininchi = truncate_inchi(inchi, ['connect'])

            print('Generating Fixed H InChI for PDB molecule ... ')
            pdbinchi, code, msg, log, aux = rdinchi.MolToInchi(
                newpdbmol, options='-FixedH -DoNotAddH')
            if code == 0:
                pass
            if code == 1:
                print(msg)
            else:
                print(msg)

            print('Generating Standard InChI for PDB molecule ... ')
            pdbsinchi, code, msg, log, aux = rdinchi.MolToInchi(
                newpdbmol, options=' -DoNotAddH')
            if code == 0:
                pass
            if code == 1:
                print(msg)
            else:
                print(msg)

            pdbmaininchi = truncate_inchi(pdbinchi, ['connect'])

            # Checks are nested: each finer check only runs when the coarser
            # one before it passed.
            result = 'OK'
            if maininchi != pdbmaininchi:
                result = 'FAIL: Molecules have diferent scaffolds\n' + maininchi + ' ' + pdbmaininchi + '.'
                failnum += 1
                print('Main chain InChI check: ' + result)
            else:
                print('Main chain InChI check: ' + result)
                result = 'OK'
                # Full InChI comparison is skipped when the template
                # assignment failed above
                if checkconnect:
                    if sinchi != pdbsinchi:
                        result = 'FAIL: Molecules are not the same compound or have different net charge.\n' + sinchi + '\n' + pdbsinchi + '.'
                        failnum += 1
                        print('Standard InChI check: ' + result)
                    else:
                        print('Standard InChI check: ' + result)
                        result = 'OK'
                        if inchi != pdbinchi:
                            result = 'FAIL: Molecules have different protonation/tautomery\n' + inchi + '\n' + pdbinchi + '.'
                            failnum += 1
                        print('Fixed H InChI check: ' + result)
                        # NOTE(review): this 'OK' is printed whether or not
                        # the Fixed H check just failed.
                        print('OK')

            return failnum, newpdbmol, nhpdbmol
def generate_data(input_file):
    """ builds the per-attribute columns needed for the sqlite database from
    the text of an input structure file.

    input_file: input structure text; one record per line, tab separated,
                with the SMILES in the first column and the identifier in
                the second.  An empty last line is ignored.

    Returns a list of eleven lists in the order: mass, InChI, SMILES,
    identifier, InChIKey second block, InChIKey first block, molecular
    formula, followed by four 'NA' placeholder columns.  A record whose
    SMILES cannot be parsed is kept with an empty SMILES and an empty
    identifier; an InChI that does not start with 'InCh' is stored as 'NA'.
    """
    records = input_file.split('\n')
    if records[-1] == '':
        del records[-1]

    # Pass 1: blank out records whose SMILES cannot be parsed
    kept_smiles = []
    identifier_list = []
    for record in records:
        columns = record.split('\t')
        raw_smiles, raw_id = columns[0], columns[1]
        parsed_ok = Chem.MolFromSmiles(raw_smiles) is not None
        kept_smiles.append(raw_smiles if parsed_ok else '')
        identifier_list.append(raw_id if parsed_ok else '')

    # Pass 2: derive canonical SMILES, mass, formula and InChI per record
    SMILES_list = []
    mol_mass_list = []
    mol_formula_list = []
    raw_inchis = []
    for smiles in kept_smiles:
        molecule = Chem.MolFromSmiles(smiles)
        SMILES_list.append(Chem.MolToSmiles(molecule))
        mol_mass_list.append(Descriptors.ExactMolWt(molecule))
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(molecule))
        # InChI is the first element of the tuple returned by MolToInchi
        raw_inchis.append(rdinchi.MolToInchi(molecule)[0])

    # Anything that does not look like an InChI becomes 'NA'
    inchi_list = [
        candidate if str(candidate).startswith('InCh') else 'NA'
        for candidate in raw_inchis
    ]

    # 'NA' entries get the dummy key "NA-NA" so the split below still works
    inchi_keys = [
        "NA-NA" if entry == 'NA' else rdinchi.InchiToInchiKey(entry)
        for entry in inchi_list
    ]

    # First and second '-' separated block of every key
    inchi_key1_list = []
    inchi_key2_list = []
    for key in inchi_keys:
        blocks = key.split('-')
        inchi_key2_list.append(blocks[1])
        inchi_key1_list.append(blocks[0])

    # One 'NA' per structure, shared by the four trailing columns
    na_column = ['NA'] * len(SMILES_list)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list,
                    mol_formula_list] + [na_column] * 4

    return overall_list