예제 #1
0
def InchiToInchiKey(inchi):
    """Convert an InChI string into its InChIKey.

    The work is delegated to ``rdinchi.InchiToInchiKey``. Any falsy result
    of that call is normalised to ``None`` so callers can test the outcome
    with a plain ``is None`` check.
    """
    return rdinchi.InchiToInchiKey(inchi) or None
예제 #2
0
def smiles_to_inchi_inchikey(smiles, verbose=0):
    """Convert SMILES strings to InChI strings and InChIKeys.

    Every SMILES is parsed with ``Chem.MolFromSmiles`` and converted to an
    InChI and an InChIKey through ``rdinchi``. As a round-trip check the
    InChI is converted back to a molecule and a second InChI is generated
    from it; each pair of first/second InChI that differs is counted as a
    warning. A summary with the error and warning totals is always printed.

    Args:
        smiles: iterable of SMILES strings.
        verbose: any value other than 0 additionally prints a diagnostic
            line for every failed or mismatching structure.

    Returns:
        Tuple ``(inchis, inchiKeys)`` of two lists with one entry per input
        SMILES, in input order. A SMILES for which ``MolFromSmiles``
        returned ``None`` gets the placeholder "XXX" in both lists.
    """
    placeholder = "XXX"
    error_counter = 0
    warning_counter = 0
    inchis = []
    inchis2 = []  # InChIs regenerated from the round-tripped molecule
    inchiKeys = []

    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            # Unparsable SMILES: keep the three lists aligned with the input.
            inchis.append(placeholder)
            inchiKeys.append(placeholder)
            inchis2.append(placeholder)
            if verbose != 0:
                print(elem+ ": can not converted added XXX instead! ")
            error_counter += 1
            continue

        inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
        inchiKey = rdinchi.InchiToInchiKey(inchi)
        inchis.append(inchi)
        inchiKeys.append(inchiKey)

        # Round trip InChI -> mol -> InChI. A failure here is not counted as
        # an error; the placeholder makes it show up as a warning below.
        try:
            mol, retcode, message, logs = rdinchi.InchiToMol(inchi)
            if mol is not None:
                inchi2, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
            else:
                inchi2 = placeholder
                if verbose != 0:
                    print(elem+ ": InChI cannot converted to mol object, added XXX instead!")
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # that KeyboardInterrupt / SystemExit are not swallowed.
            if verbose != 0:
                print(retcode)
                print(message)
                print("Smiles:"+elem)
            inchi2 = placeholder
        inchis2.append(inchi2)

    for i1, i2 in zip(inchis, inchis2):
        if i1 != i2:
            if verbose != 0:
                print("Warning:"+i1+" - "+i2)
            warning_counter += 1

    print("\nGeneration of InChI and InChIKey from SMILES is completed.")
    print("Total errors:"+str(error_counter))
    print("Total warnings:"+str(warning_counter)+"\n")

    return inchis, inchiKeys
def parse_file(input_file, db_name):
    """Turn the text of a nanpdb database file into CLASS attribute columns.

    The text is split into lines and every line into tab-separated fields.
    Empty fields become "NA" and ``db_name`` is appended to each row as an
    extra last field. Field 0 of a row is used as the SMILES string and
    field 1 as the identifier, so a line holding a single field ends up
    with ``db_name`` as its identifier.

    The last line is processed whether or not the text ends with a newline;
    only the empty string that ``split`` produces after a final newline is
    discarded.

    Args:
        input_file: the content of the database file as one string.
        db_name: database name, appended to every parsed row.

    Returns:
        Tuple ``(attribute_names, overall_list)``. ``overall_list`` holds
        one list per attribute name, in the same order; the four
        classification columns are filled with "NA".

    NOTE(review): a SMILES that RDKit cannot parse presumably makes
    ``MolFromSmiles`` return None, in which case the descriptor calls below
    would fail -- confirm that the input is pre-validated.
    """
    attribute_names = [
        'MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
        'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',
        'superclass_name', 'class_name', 'subclass_name',
    ]

    all_lines = input_file.split('\n')
    # ``split`` always returns at least one element. Drop the final one only
    # when it is the empty remainder after a trailing newline; slicing it off
    # unconditionally would lose the last record of a text that has no
    # terminating newline.
    if all_lines[-1] == '':
        all_lines = all_lines[:-1]

    all_info_list = []
    for line in all_lines:
        info_per_row_list = [value if value else "NA"
                             for value in line.split('\t')]
        info_per_row_list.append(db_name)
        all_info_list.append(info_per_row_list)

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    mol_formula_list = []

    for row in all_info_list:
        m = Chem.MolFromSmiles(row[0])

        # The mass is stored as text with the last five characters of its
        # string form cut off (presumably to trim precision -- TODO confirm).
        mol_mass_list.append(str(Descriptors.ExactMolWt(m))[:-5])

        # MolToInchi returns a tuple; the InChI string is its first item.
        inchi_list.append(rdinchi.MolToInchi(m)[0])

        SMILES_list.append(row[0])
        identifier_list.append(row[1])
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))

    # Split every InChIKey on '-' and keep its first two parts.
    inchi_key1_list = []
    inchi_key2_list = []
    for inchi in inchi_list:
        key_parts = rdinchi.InchiToInchiKey(inchi).split('-')
        inchi_key1_list.append(key_parts[0])
        inchi_key2_list.append(key_parts[1])

    # One independent "NA" column per classification attribute, so that a
    # caller filling one column in place does not silently change the others.
    nr_of_structures = len(all_info_list)
    na_columns = [['NA'] * nr_of_structures for _ in range(4)]

    overall_list = [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
    ] + na_columns

    return attribute_names, overall_list
예제 #4
0
def parse_data(input_file):
    """Turn the text of a norine database file into CLASS attribute columns.

    The first two lines of the text are skipped. Every remaining line is
    split on tabs; field 0 is used as the identifier and field 2 as the
    SMILES string. The empty string that ``split`` produces after a final
    newline is discarded, so a text ending in a newline no longer fails
    with an IndexError.

    A row whose SMILES makes ``MolFromSmiles`` return None is kept as a
    placeholder: its SMILES and identifier are replaced by '' and the empty
    string is then run through the same RDKit calls as every other row.
    Any InChI that does not start with 'InCh' is stored as 'NA', with
    'NA' for both InChIKey parts.

    Args:
        input_file: the content of the database file as one string.

    Returns:
        Tuple ``(attribute_names, overall_list)``. ``overall_list`` holds
        one list per attribute name, in the same order; the four
        classification columns are filled with "NA".
    """
    attribute_names = [
        'MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
        'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',
        'superclass_name', 'class_name', 'subclass_name',
    ]

    all_lines = input_file.split('\n')
    # Drop the empty remainder after a trailing newline before skipping the
    # two leading lines; otherwise it is parsed as a record and ``line[2]``
    # raises IndexError.
    if all_lines[-1] == '':
        all_lines = all_lines[:-1]
    all_lines = all_lines[2:]

    # Pass 1: validate each structure, blanking out the ones RDKit rejects.
    pre_SMILES_list = []
    identifier_list = []
    for line in all_lines:
        line = line.split('\t')
        m = line[2]
        id_name = line[0]
        if Chem.MolFromSmiles(m) is not None:
            pre_SMILES_list.append(m)
            identifier_list.append(id_name)
        else:
            pre_SMILES_list.append('')
            identifier_list.append('')

    # Pass 2: derive the per-structure attributes.
    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    mol_formula_list = []
    for smile in pre_SMILES_list:
        m = Chem.MolFromSmiles(smile)
        SMILES_list.append(Chem.MolToSmiles(m))
        mol_mass_list.append(Descriptors.ExactMolWt(m))
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))

        # MolToInchi returns a tuple; the InChI string is its first item.
        inchi = rdinchi.MolToInchi(m)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

    # InChIKey1 and InChIKey2: the first two '-'-separated parts of the key.
    inchi_key1_list = []
    inchi_key2_list = []
    for inchi in inchi_list:
        if inchi == 'NA':
            inchi_key = "NA-NA"
        else:
            # Assumption: InchiToInchiKey can yield an empty/None result for
            # an InChI it cannot convert; fall back to the NA convention
            # instead of failing in ``split`` below.
            inchi_key = rdinchi.InchiToInchiKey(inchi) or "NA-NA"
        key_parts = inchi_key.split('-')
        inchi_key2_list.append(key_parts[1])
        inchi_key1_list.append(key_parts[0])

    # One independent "NA" column per classification attribute, so that a
    # caller filling one column in place does not silently change the others.
    nr_of_structures = len(SMILES_list)
    na_columns = [['NA'] * nr_of_structures for _ in range(4)]

    overall_list = [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
    ] + na_columns

    return attribute_names, overall_list
def create_CLASS_data(data_dict):
    """Build the CLASS attribute columns for the streptodb data.

    data_dict: streptodb dictionary; its 'compound_id' and
        'canonical_smiles' entries are read. Both entries are placed in
        the result as they are (the same objects, not copies).

    Returns the attribute names together with one column per attribute,
    in the same order. The four classification columns contain "NA".
    """
    attribute_names = [
        'MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
        'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',
        'superclass_name', 'class_name', 'subclass_name',
    ]

    identifier_list = data_dict['compound_id']
    SMILES_list = data_dict['canonical_smiles']

    mol_mass_list = []
    inchi_list = []
    mol_formula_list = []
    for smiles in SMILES_list:
        mol = Chem.MolFromSmiles(smiles)
        # Mass is kept as text, minus the last three characters of its
        # string form.
        mol_mass_list.append(str(Descriptors.ExactMolWt(mol))[:-3])
        # First item of the MolToInchi result is the InChI string.
        inchi_list.append(rdinchi.MolToInchi(mol)[0])
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

    NA_list = ['NA'] * len(data_dict['canonical_smiles'])

    # All keys are generated first, then split into their leading two parts.
    inchi_key_list = [rdinchi.InchiToInchiKey(inchi) for inchi in inchi_list]

    inchi_key1_list = []
    inchi_key2_list = []
    for inchi_key in inchi_key_list:
        key_parts = inchi_key.split('-')
        first_part, second_part = key_parts[0], key_parts[1]
        inchi_key1_list.append(first_part)
        inchi_key2_list.append(second_part)

    overall_list = [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
        NA_list, NA_list, NA_list, NA_list,
    ]

    return attribute_names, overall_list
def generate_data(input_file):
    """Derive the sqlite database columns from an input structure text.

    input_file: content of the structure txt file as one string. Each line
        is split on tabs; field 0 is used as the SMILES string and field 1
        as the identifier. An empty last line is ignored.

    A line whose SMILES makes MolFromSmiles return None keeps its position
    but has its SMILES and identifier replaced by ''. An InChI that does
    not start with 'InCh' is stored as 'NA', with 'NA' for both key parts.

    Returns a list of eleven columns: mass, InChI, SMILES, identifier,
    InChIKey part 2, InChIKey part 1, molecular formula and four "NA"
    columns.
    """
    lines = input_file.split('\n')
    if lines[-1] == '':
        lines.pop()

    # Validate every structure; rejected ones are blanked, not dropped.
    pre_SMILES_list = []
    identifier_list = []
    for raw_line in lines:
        fields = raw_line.split('\t')
        candidate = fields[0]
        name = fields[1]
        is_valid = Chem.MolFromSmiles(candidate) is not None
        pre_SMILES_list.append(candidate if is_valid else '')
        identifier_list.append(name if is_valid else '')

    # Per-structure attributes.
    SMILES_list = []
    mol_mass_list = []
    mol_formula_list = []
    pre_inchi_list = []
    for smile in pre_SMILES_list:
        mol = Chem.MolFromSmiles(smile)
        SMILES_list.append(Chem.MolToSmiles(mol))
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))
        pre_inchi_list.append(rdinchi.MolToInchi(mol)[0])

    inchi_list = [
        inchi if str(inchi).startswith('InCh') else 'NA'
        for inchi in pre_inchi_list
    ]

    # Keys for all structures first, then their first two '-' parts.
    pre_inchi_key_list = [
        "NA-NA" if inchi == 'NA' else rdinchi.InchiToInchiKey(inchi)
        for inchi in inchi_list
    ]

    inchi_key1_list = []
    inchi_key2_list = []
    for inchi_key in pre_inchi_key_list:
        key_parts = inchi_key.split('-')
        inchi_key2_list.append(key_parts[1])
        inchi_key1_list.append(key_parts[0])

    NA_list = ['NA'] * len(SMILES_list)

    return [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
        NA_list, NA_list, NA_list, NA_list,
    ]