예제 #1
0
def check_reaction(reactants, products):
    """
    Tell whether both sides of a reaction have the same molecular formula.

    reactants: SMILES string, or a list/tuple of SMILES strings that is
        joined with "." into one multi-component SMILES
    products: same as ``reactants`` for the product side

    Returns True when ``CalcMolFormula`` gives the same string for both
    sides, False otherwise.

    Raises ValueError when either side cannot be parsed by RDKit.
    """
    formulas = []
    for label, side in (("reactants", reactants), ("products", products)):
        if isinstance(side, (list, tuple)):
            side = ".".join(side)
        mol = Chem.MolFromSmiles(side)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a readable message instead of inside the formula call.
        if mol is None:
            raise ValueError(
                "Could not parse %s SMILES: %r" % (label, side))
        formulas.append(rdMolDescriptors.CalcMolFormula(mol))
    return formulas[0] == formulas[1]
예제 #2
0
  def testMolFormula(self):
    """Check CalcMolFormula with default, isotope-separated and unabbreviated output."""
    # Each entry: SMILES, then (keyword arguments, expected formula) pairs.
    cases = (
      ("[2H]C([3H])O", (
        ({}, 'CH4O'),
        ({'separateIsotopes': True}, 'CH2DTO'),
        ({'separateIsotopes': True, 'abbreviateHIsotopes': False},
         'CH2[2H][3H]O'),
      )),
      ("[2H][13CH2]CO", (
        ({}, 'C2H6O'),
        ({'separateIsotopes': True}, 'C[13C]H5DO'),
      )),
    )
    for smiles, expectations in cases:
      mol = Chem.MolFromSmiles(smiles)
      for options, expected in expectations:
        self.assertEqual(rdMD.CalcMolFormula(mol, **options), expected)
예제 #3
0
 def set2DStructure(self):
     """Derive mass, formula and a 2D depiction from ``self.smiles``.

     On success sets ``molwt``, ``molformula``, ``structure_image`` and
     ``structure_qt``.  When ``self.smiles`` is falsy, or any step raises,
     every derived attribute (including ``structure_qt``) is reset to
     ``False``; the exception is printed, not propagated.
     """
     if self.smiles:
         try:
             mol = Chem.MolFromSmiles(self.smiles)
             self.molwt = rdMolDescriptors.CalcExactMolWt(mol)
             self.molformula = rdMolDescriptors.CalcMolFormula(mol)
             rdDepictor.Compute2DCoords(mol)
             self.structure_image = Draw.MolToImage(mol, size=(400,200), kekulize=True, wedgeBonds=False)
             # Turn opaque white pixels fully transparent.
             # NOTE(review): the 4-tuple comparison only matches if the image
             # has four channels -- presumably RGBA; confirm.
             pixdata = self.structure_image.load()
             width, height = self.structure_image.size
             for y in range(height):
                 for x in range(width):
                     if pixdata[x, y] == (255, 255, 255, 255):
                         pixdata[x, y] = (255, 255, 255, 0)
             self.structure_qt = ImageQt.ImageQt(self.structure_image)
             return
         except Exception as e:
             print(e)
     # No SMILES, or the conversion failed: clear everything so that no
     # value from an earlier successful call is left behind.
     self.molwt = False
     self.molformula = False
     self.structure_image = False
     self.structure_data = False
     self.structure_qt = False
예제 #4
0
def epilion2sdf(abbr_lst, save_sdf):
    """Translate lipid abbreviations to structures and write them to an SDF.

    abbr_lst: iterable of abbreviation strings, or a path to a text file
        with one abbreviation per line
    save_sdf: path of the SDF file to write

    Each abbreviation recognised by ``ParserFA`` or ``ParserPL`` is turned
    into a SMILES, given 2D coordinates and written with the properties
    ``EXACT_MASS``, ``NOMINAL_MASS`` and ``FORMULA``.  Abbreviations that
    cannot be parsed or converted are logged and skipped.

    If ``abbr_lst`` is a string that is not an existing file, the process
    exits.  If the file exists but cannot be read, the error is logged and
    nothing is written.
    """
    if isinstance(abbr_lst, str):
        abbr_path = abbr_lst
        if not os.path.isfile(abbr_path):
            logger.error(f'Can NOT load input: {abbr_path}')
            logger.info('!! END PROCESSING !!')
            exit()
        try:
            logger.info(f'Try to open file: {abbr_path}')
            with open(abbr_path, 'r') as infile_obj:
                # Strip line endings and skip blank lines so the newline
                # does not become part of the abbreviation / molecule name.
                abbr_lst = [ln.strip() for ln in infile_obj if ln.strip()]
        except Exception as e:
            logger.error(f'Can NOT load input: {abbr_path}')
            logger.error(e)
            # Without this the path string itself would be iterated
            # character by character below.
            return

    # The input is walked twice, so materialise one-shot iterables.
    abbr_lst = list(abbr_lst)

    fa_decoder = ParserFA()
    pl_decoder = ParserPL()

    info_dct = {}

    for abbr in abbr_lst:
        logger.info(abbr)
        if fa_decoder.is_fa(abbr):
            smi = fa_decoder.get_smi_fa(abbr)
        elif pl_decoder.is_pl(abbr):
            smi = pl_decoder.get_smi_pl(abbr)
        else:
            logger.info(f'Can NOT parse abbreviation: {abbr}')
            continue
        logger.info(abbr + ': ' + smi)
        info_dct[abbr] = smi

    with open(save_sdf, mode='w') as sdf_obj:
        sdf_writer = Chem.SDWriter(sdf_obj)
        try:
            for m in abbr_lst:
                if m not in info_dct:
                    logger.warning(f'!! Can NOT parse: {m}')
                    continue
                smi = info_dct[m]
                try:
                    mol = Chem.MolFromSmiles(smi)
                    AllChem.Compute2DCoords(mol)
                    mol.SetProp('_Name', m)
                    m_mass = Descriptors.MolWt(mol)
                    m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
                    m_formula = rdMolDescriptors.CalcMolFormula(mol)
                    mol.SetProp('EXACT_MASS', '%.6f' % m_exactmass)
                    mol.SetProp('NOMINAL_MASS', '%.3f' % m_mass)
                    mol.SetProp('FORMULA', m_formula)
                    sdf_writer.write(mol)
                except Exception as e:
                    logger.error(f'! FAILED: {m}')
                    logger.error(
                        f'! FAILED to generate structure from SMILES: {smi}')
                    logger.error(e)
        finally:
            # Close the writer before the file so buffered records are
            # flushed to disk.
            sdf_writer.close()
예제 #5
0
def parse_data(input_file):
    """ takes all text from NPAtlas database file and returns the attribute
    names plus one list per attribute (column-wise data).

    input_file: full text of the NPAtlas database txt file; the first line
        is treated as a header, the remaining lines are tab-separated with
        identifier, SMILES, InChI and InChIKey in columns 0-3

    Returns (attribute_names, overall_list) where overall_list holds the
    columns in the same order as attribute_names.  The four class columns
    are filled with 'NA' and all refer to the same list object.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    # Skip the header line.  Only drop the last line when it is the empty
    # remainder of a trailing newline, so a file without a final newline
    # keeps its last record.
    all_lines = input_file.split('\n')[1:]
    if all_lines and all_lines[-1] == '':
        all_lines = all_lines[:-1]

    for line in all_lines:
        fields = line.split('\t')

        # Canonical SMILES regenerated from the parsed molecule
        mol = Chem.MolFromSmiles(fields[1])
        SMILES_list.append(Chem.MolToSmiles(mol))
        # Monoisotopic mass
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        # Source identifier
        identifier_list.append(fields[0])
        # InChI is taken verbatim from the file
        inchi_list.append(fields[2])
        # InChIKey: first and second hyphen-separated parts
        inchi_key = fields[3].split('-')
        inchi_key2_list.append(inchi_key[1])
        inchi_key1_list.append(inchi_key[0])
        # Molecular formula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

    # Placeholder column for the classification attributes
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]

    return attribute_names, overall_list
예제 #6
0
def get_molecular_formula(smi):
    """
    Build the molecular formula string for the molecule given as SMILES.

    The SMILES is parsed and hydrogens are added with Chem.AddHs before the
    formula is computed.  If the RDKit names are not defined (NameError),
    an error is logged and the program exits via sys.exit().
    """
    try:
        parsed = Chem.MolFromSmiles(smi)
        mol = Chem.AddHs(parsed)
    except NameError:
        logging.error('RDKit is not installed or loaded correctly.')
        sys.exit()
    return rdMolDescriptors.CalcMolFormula(mol)
def main(in_file, output):
  """Compute descriptors for every input molecule and export them.

  in_file: path handed to ``rdkit_open`` (as a one-element list)
  output:  output path prefix; ``<output>.csv`` and ``<output>.xlsx`` are
           written

  Rows are keyed by the first whitespace-separated token of each
  molecule's ``_Name`` property, so molecules sharing that token overwrite
  one another.
  """
  Cmpds  = {}
  InMols = rdkit_open([in_file])
  print('\n # Number of input molecule: {0}'.format(len(InMols)))
  for mol in InMols:
    name = mol.GetProp('_Name').split()[0]

    # One call yields both logP (index 0) and molar refractivity (index 1)
    crippen = rd.CalcCrippenDescriptors(mol)

    Cmpds[name] = {
      'Name':    name,
      'Formula': rd.CalcMolFormula(mol),
      'MW':      rd._CalcMolWt(mol),                # Molecular Weight
      'logP':    crippen[0],                        # Partition coefficient
      'HDon':    rd.CalcNumLipinskiHBD(mol),        # Lipinski Hbond donor
      'HAcc':    rd.CalcNumLipinskiHBA(mol),        # Lipinski Hbond acceptor
      'TPSA':    rd.CalcTPSA(mol),                  # Topological polar surface area
      'Rotat':   rd.CalcNumRotatableBonds(mol, strict=True),  # Rotatable bond
      'MolRef':  crippen[1],                        # Molar refractivity
      'AliRing': rd.CalcNumAliphaticRings(mol),     # Aliphatic ring number
      'AroRing': rd.CalcNumAromaticRings(mol),      # Aromatic ring number
      'SMILES':  Chem.MolToSmiles(mol,
                   isomericSmiles=True, allHsExplicit=False),
    }

  ####################################

  df = pd.DataFrame.from_dict(Cmpds, orient='index')
  df.index.name = 'Name'

  # Columns of data to print out
  Columns = [ 'Formula',
              'MW',    'logP',   'HDon',    'HAcc',    'TPSA',
              'Rotat', 'MolRef', 'AliRing', 'AroRing',
              'SMILES', ]
  # reindex selects the same columns as df[Columns] but also works when no
  # molecule was read (empty frame), giving header-only output files.
  reorder = df.reindex(columns=Columns)

  # Output to CSV
  reorder.to_csv( output+'.csv', sep=',', na_rep='NA', encoding='utf-8',
                  float_format='%.5f', header=True )

  # Output to Excel
  reorder.to_excel( output+'.xlsx', header=True, na_rep='NA' )
예제 #8
0
def parse_epilion(abbr: str) -> dict:
    """Convert a lipid abbreviation to formula, exact mass and a PNG image.

    The abbreviation is first normalised by ``Converter`` and then decoded
    by ``ParserFA`` or ``ParserPL`` into a SMILES.

    Returns a dict with the keys ``id``, ``formula``, ``exactmass``
    (formatted with four decimals) and ``img`` (a base64 PNG data URL).
    The dict is empty when the abbreviation cannot be parsed or the
    structure cannot be generated; failures are logged, not raised.
    """
    fa_decoder = ParserFA()
    pl_decoder = ParserPL()

    info_dct = {}

    converter = Converter(abbr_cfg_path)
    epilion_id = converter.convert_abbr(abbr)

    if fa_decoder.is_fa(epilion_id):
        smi = fa_decoder.get_smi_fa(epilion_id)
    elif pl_decoder.is_pl(epilion_id):
        smi = pl_decoder.get_smi_pl(epilion_id)
    else:
        # No SMILES available: stop here instead of reaching the code below
        # with `smi` unbound.
        logger.info(f'Can NOT parse abbreviation: {epilion_id}')
        return info_dct
    logger.info(epilion_id + ': ' + smi)

    try:
        mol = Chem.MolFromSmiles(smi)
        AllChem.Compute2DCoords(mol)
        m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
        m_formula = rdMolDescriptors.CalcMolFormula(mol)

        # Render once into an in-memory PNG and embed it as a data URL.
        img = Draw.MolToImage(mol, size=(600, 400))
        img_io = BytesIO()
        img.save(img_io, format='png')
        img_data = base64.b64encode(img_io.getvalue())
        img_data_url = r'data:image/png;base64,' + img_data.decode("utf-8")

        info_dct['id'] = epilion_id
        info_dct['formula'] = m_formula
        info_dct['exactmass'] = '%.4f' % m_exactmass
        info_dct['img'] = img_data_url

    except Exception as e:
        logger.error(f'! FAILED: {epilion_id}')
        logger.error(f'! FAILED to generate structure from SMILES: {smi}')
        logger.error(e)

    return info_dct
예제 #9
0
    def calcMolprops(self):
        """Fill identifier, mass, molblock and formula attributes from self.rdmol.

        All masses are rounded to 4 decimal places.  The [M+H]+ and [M+Na]+
        values are the rounded exact mass plus the value returned by
        calculate_exact_mass for '[H+]' and '[Na+]' respectively.
        """
        mol = self.rdmol

        # Structure identifiers
        self.inchi = Chem.MolToInchi(mol)
        self.inchikey = Chem.MolToInchiKey(mol)

        # Neutral masses
        exact = round(Descriptors.ExactMolWt(mol), 4)
        self.accurate_mass = exact
        self.mass = round(Descriptors.MolWt(mol), 4)

        # Adduct masses derived from the rounded exact mass
        proton_mass = calculate_exact_mass('[H+]')
        self.m_plus_h = round(exact + proton_mass, 4)
        sodium_mass = calculate_exact_mass('[Na+]')
        self.m_plus_na = round(exact + sodium_mass, 4)

        # Name and 2D coordinates must be set before the molblock is written
        mol.SetProp('_Name', self.name)
        rdDepictor.Compute2DCoords(mol)
        self.molblock = Chem.MolToMolBlock(mol)

        self.formula = rdMolDescriptors.CalcMolFormula(mol)
예제 #10
0
 def calculate_properties(self, smiles=None, mol=None, props=()):
     """Calculate the requested basic properties and store them in self.data.

     smiles: SMILES string, used only when ``mol`` is not given
     mol:    RDKit molecule; takes precedence over ``smiles``
     props:  collection of property keys to compute ('py_formula', 'py_em',
             'py_n_Cl_Br', 'py_na', 'py_mw', 'py_fsp3', 'py_rb', 'py_tpsa',
             'py_clogp', 'py_nar', 'py_nhba', 'py_nhbd'); other keys are
             ignored

     returns : error (bool) -- True when nothing was calculated (no props
     requested, no input given, or the SMILES could not be parsed), False
     after the requested values were written to ``self.data``.
     """
     if not props:
         return True
     if mol is None:
         # Neither a molecule nor a SMILES: report the error instead of
         # handing None to the SMILES parser.
         if smiles is None:
             return True
         mol = Chem.MolFromSmiles(smiles)
     if mol is None:
         return True
     if 'py_formula' in props:
         self.data['py_formula'] = desc.CalcMolFormula(mol)
     if 'py_em' in props:
         self.data['py_em'] = round(desc.CalcExactMolWt(mol), 5)
     if 'py_n_Cl_Br' in props:
         # Combined count of chlorine and bromine atoms
         self.data['py_n_Cl_Br'] = sum(
             1 for atom in mol.GetAtoms()
             if atom.GetSymbol() in ('Cl', 'Br'))
     if 'py_na' in props:
         self.data['py_na'] = mol.GetNumAtoms()
     if 'py_mw' in props:
         self.data['py_mw'] = desc._CalcMolWt(mol)
     if 'py_fsp3' in props:
         self.data['py_fsp3'] = desc.CalcFractionCSP3(mol)
     if 'py_rb' in props:
         self.data['py_rb'] = desc.CalcNumRotatableBonds(mol)
     if 'py_tpsa' in props:
         self.data['py_tpsa'] = desc.CalcTPSA(mol)
     if 'py_clogp' in props:
         self.data['py_clogp'] = desc.CalcCrippenDescriptors(mol)[0]
     if 'py_nar' in props:
         self.data['py_nar'] = desc.CalcNumAromaticRings(mol)
     if 'py_nhba' in props:
         self.data['py_nhba'] = desc.CalcNumHBA(mol)
     if 'py_nhbd' in props:
         self.data['py_nhbd'] = desc.CalcNumHBD(mol)
     return False
예제 #11
0
def generateCompoundPropertiesTask(structure, debug=False):
    """Compute and save the compound properties for a structure's molecule.

    structure: object providing ``molecule`` and ``molfile``; the molfile
        text is parsed with RDKit
    debug: when true, attach to a pydevd server on localhost:6901 and print
        an IntegrityError raised while saving instead of re-raising it

    The existing ``molecule.compoundProperty`` is updated when present,
    otherwise a new ``CompoundProperties`` is created.  Descriptors are
    computed on the full molecule; ``mw_freebase`` is set from the
    salt-stripped molecule only when that still contains atoms.
    """
    if debug:
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    if not molecule.compoundProperty:
        prop = CompoundProperties(molecule=molecule)
    else:
        prop = molecule.compoundProperty

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)
    prop.hbd = Descriptors.CalcNumHBD(mol)
    prop.hba = Descriptors.CalcNumHBA(mol)
    prop.rtb = Descriptors.CalcNumRotatableBonds(mol)
    prop.alogp = Crippen.MolLogP(mol)
    prop.psa = Descriptors.CalcTPSA(mol)
    prop.full_mwt = NewDescriptors.MolWt(mol)

    if base.GetNumAtoms():
        prop.mw_freebase = NewDescriptors.MolWt(base)

    prop.full_molformula = Descriptors.CalcMolFormula(mol)

    try:
        prop.save()

    except IntegrityError as e:
        if debug:
            # Call form works on both Python 2 and 3.
            print(e)
        else:
            # Bare raise keeps the original traceback.
            raise
def generate_data(input_file):
    """ takes all text from the input structure data file and returns a list of
    lists (one per attribute) with the data generated for the sqlite database.

    input_file: full text of the input structure txt file; each line is
        tab-separated with the SMILES in column 0 and the identifier in
        column 1.  A single trailing empty line is ignored.

    The returned columns are, in order: monoisotopic mass, InChI, SMILES,
    identifier, InChIKey part 2, InChIKey part 1, molecular formula and
    four 'NA' columns (all four are the same list object).
    """
    rows = input_file.split('\n')
    if rows[-1] == '':
        rows = rows[:-1]

    # Pass 1: keep SMILES/identifier of parsable rows; rows RDKit cannot
    # parse are kept as empty-string placeholders so row count is unchanged.
    pre_SMILES_list = []
    identifier_list = []
    for row in rows:
        fields = row.split('\t')
        raw_smiles = fields[0]
        raw_id = fields[1]
        if Chem.MolFromSmiles(raw_smiles) is not None:
            pre_SMILES_list.append(raw_smiles)
            identifier_list.append(raw_id)
        else:
            pre_SMILES_list.append('')
            identifier_list.append('')

    # Pass 2: derive canonical SMILES, mass, formula and InChI per row.
    SMILES_list = []
    mol_mass_list = []
    mol_formula_list = []
    raw_inchi_list = []
    for smiles in pre_SMILES_list:
        mol = Chem.MolFromSmiles(smiles)
        SMILES_list.append(Chem.MolToSmiles(mol))
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))
        raw_inchi_list.append(rdinchi.MolToInchi(mol)[0])

    # Anything that does not look like an InChI becomes 'NA'.
    inchi_list = [
        inchi if str(inchi).startswith('InCh') else 'NA'
        for inchi in raw_inchi_list
    ]

    # 'NA' InChIs get the dummy key "NA-NA" so both key parts read 'NA'.
    inchi_key_strings = []
    for inchi in inchi_list:
        if inchi == 'NA':
            inchi_key_strings.append("NA-NA")
        else:
            inchi_key_strings.append(rdinchi.InchiToInchiKey(inchi))

    inchi_key1_list = []
    inchi_key2_list = []
    for key_string in inchi_key_strings:
        key_parts = key_string.split('-')
        inchi_key2_list.append(key_parts[1])
        inchi_key1_list.append(key_parts[0])

    # Placeholder column shared by the four classification attributes
    NA_list = ['NA'] * len(SMILES_list)

    return [mol_mass_list, inchi_list, SMILES_list, identifier_list,
            inchi_key2_list, inchi_key1_list, mol_formula_list,
            NA_list, NA_list, NA_list, NA_list]
def parse_file(input_file, db_name):
    """ takes all text from nanpdb database file and returns the attribute
    names together with one list per attribute.

    input_file: full text of the nanpdb database txt file; tab-separated
        lines with the SMILES in column 0 and the identifier in column 1
    db_name: database name (appended to every parsed row)

    Returns (attribute_names, overall_list); overall_list holds the columns
    in the order of attribute_names, the last four being the same list of
    'NA' values.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    # The element after the last newline is always discarded.
    raw_rows = input_file.split('\n')[:-1]

    # Empty cells become "NA"; the database name is added as a last cell.
    all_info_list = []
    for raw_row in raw_rows:
        cells = [cell if len(cell) != 0 else "NA"
                 for cell in raw_row.split('\t')]
        cells.append(db_name)
        all_info_list.append(cells)

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    mol_formula_list = []
    for record in all_info_list:
        mol = Chem.MolFromSmiles(record[0])

        # MonoisotopicMass: string form with the last five characters cut.
        # NOTE(review): cutting by character count depends on the length of
        # the float's string form -- confirm this is the intended precision.
        mol_mass_list.append(str(Descriptors.ExactMolWt(mol))[:-5])
        # InChI (first element of the MolToInchi result)
        inchi_list.append(rdinchi.MolToInchi(mol)[0])
        # SMILES is stored exactly as read
        SMILES_list.append(record[0])
        # Identifier
        identifier_list.append(record[1])
        # MolecularFormula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

    NA_list = ['NA'] * len(all_info_list)

    # InChIKey for every InChI, then its first two hyphen-separated parts
    inchi_key_list = [rdinchi.InchiToInchiKey(inchi) for inchi in inchi_list]
    inchi_key1_list = []
    inchi_key2_list = []
    for inchi_key in inchi_key_list:
        key_parts = inchi_key.split('-')
        first_part = key_parts[0]
        second_part = key_parts[1]
        inchi_key1_list.append(first_part)
        inchi_key2_list.append(second_part)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]

    return attribute_names, overall_list
예제 #14
0
def parse_data(input_file):
    """ takes all text from norine database file and returns the attribute
    names plus one list per attribute (column-wise data).

    input_file: full text of the norine database txt file; the first two
        lines are skipped, the remaining lines are tab-separated with the
        identifier in column 0 and the SMILES in column 2.  A single
        trailing empty line (file ending in a newline) is ignored.

    Returns (attribute_names, overall_list) where overall_list holds the
    columns in the same order as attribute_names; the four class columns
    are the same list of 'NA' values.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    all_lines = input_file.split('\n')[2:]
    # A text ending in '\n' yields an empty last element; without this the
    # column lookup below fails with IndexError on that empty line.
    if all_lines and all_lines[-1] == '':
        all_lines = all_lines[:-1]

    # Pass 1: keep SMILES/identifier of parsable rows; rows RDKit cannot
    # parse are kept as empty-string placeholders so row count is unchanged.
    pre_SMILES_list = []
    identifier_list = []
    for line in all_lines:
        fields = line.split('\t')
        raw_smiles = fields[2]
        raw_id = fields[0]
        if Chem.MolFromSmiles(raw_smiles) is not None:
            pre_SMILES_list.append(raw_smiles)
            identifier_list.append(raw_id)
        else:
            pre_SMILES_list.append('')
            identifier_list.append('')

    # Pass 2: derive canonical SMILES, mass, formula and InChI per row.
    SMILES_list = []
    mol_mass_list = []
    mol_formula_list = []
    pre_inchi_list = []
    for smile in pre_SMILES_list:
        mol = Chem.MolFromSmiles(smile)
        SMILES_list.append(Chem.MolToSmiles(mol))
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))
        pre_inchi_list.append(rdinchi.MolToInchi(mol)[0])

    # Anything that does not look like an InChI becomes 'NA'.
    inchi_list = [
        inchi if str(inchi).startswith('InCh') else 'NA'
        for inchi in pre_inchi_list
    ]

    # 'NA' InChIs get the dummy key "NA-NA" so both key parts read 'NA'.
    inchi_key1_list = []
    inchi_key2_list = []
    for inchi in inchi_list:
        if inchi == 'NA':
            inchi_key = "NA-NA"
        else:
            inchi_key = rdinchi.InchiToInchiKey(inchi)
        key_parts = inchi_key.split('-')
        inchi_key2_list.append(key_parts[1])
        inchi_key1_list.append(key_parts[0])

    # Placeholder column for the classification attributes
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]

    return attribute_names, overall_list
예제 #15
0
파일: cheminfo.py 프로젝트: alongd/KinBot
def get_molecular_formula(smi):
    """
    Build the molecular formula string for the molecule given as SMILES.

    The SMILES is parsed with RDKit and hydrogens are added (Chem.AddHs)
    before the formula is computed.
    """
    parsed = Chem.MolFromSmiles(smi)
    with_hydrogens = Chem.AddHs(parsed)
    return rdMolDescriptors.CalcMolFormula(with_hydrogens)
예제 #16
0
def theolpp(usr_params):
    """
    param_dct = {'lipid_class': lipid_class, 'ox_level': ox_level,
                 'oap_mode': oap_mode, 'ocp_mode': ocp_mode,
                 'lyso_oap_mode': lyso_oap_mode, 'lyso_ocp_mode': lyso_ocp_mode,
                 'ox_max': ox_max, 'keto_max': keto_max, 'ooh_max': ooh_max, 'epoxy_max': epoxy_max,
                 'lipid_lst_path': lipid_lst_path, 'lipid_tab': lipid_tab,
                 'prostane_mode': prostane_mode, 'ox_prostane_mode': ox_prostane_mode,
                 'sdf_path': sdf_path, 'msp_mode': msp_mode, 'msp_path': msp_path,
                 'mod_lst_path': mod_lst_path, 'fa_lst_path': fa_lst_path, 'prostane_mod_path': prostane_mod_path,
                 'prostane_abbr_path': prostane_abbr_path, 'frag_pattern_path': frag_pattern_path}
    :param usr_params:
    :return:
    """

    t_start = time.clock()

    pl_table = usr_params['lipid_lst_path']
    fa_table = usr_params['fa_lst_path']
    mod_table = usr_params['mod_lst_path']
    isop_cfg = usr_params['prostane_mod_path']
    isopabbr_cfg = usr_params['prostane_abbr_path']
    # pl_class_use_lst = ['PA', 'PC', 'PE', 'PG', 'PI', 'PIP', 'PS']
    pl_class = usr_params['lipid_class']
    pl_class_use_lst = [pl_class]
    ox_level = usr_params['ox_level']

    oap_mode = usr_params['oap_mode']
    ocp_mode = usr_params['ocp_mode']
    lyso_oap_mode = usr_params['lyso_oap_mode']
    lyso_ocp_mode = usr_params['lyso_ocp_mode']

    ox_max = usr_params['ox_max']
    keto_max = usr_params['keto_max']
    ooh_max = usr_params['ooh_max']
    epoxy_max = usr_params['epoxy_max']

    prostane_mode = usr_params['prostane_mode']
    prostane_ox_mode = usr_params['ox_prostane_mode']
    save_sdf = usr_params['sdf_path']
    save_spectra = usr_params['msp_mode']
    save_msp = usr_params['msp_path']
    score_xlsx = usr_params['frag_pattern_path']
    pl_fp_xlsx = usr_params['pl_hg_path']

    pl_df = pd.read_excel(pl_table, sheetname=usr_params['lipid_tab'])
    fa_df = pd.read_csv(fa_table, index_col=0)
    print(pl_df.head())

    # Select export species OAP, OCP, Lyso OAP, Lyso OCP
    ban_lst = ['LYSOLYSO']
    if oap_mode == 0:
        ban_lst.extend(['UNMODOAP', 'OAPUNMOD', 'OAPOAP'])
    if ocp_mode == 0:
        ban_lst.extend(['UNMODOCP', 'OCPUNMOD', 'OCPOCP'])
    if lyso_oap_mode == 0:
        ban_lst.extend(['LYSOOAP', 'OAPLYSO'])
    if lyso_ocp_mode == 0:
        ban_lst.extend(['LYSOOCP', 'OCPLYSO'])
    if ox_level == 1:
        ban_lst.extend(
            ['OAPOAP', 'OCPOCP', 'OAPOCP', 'OCPOAP', 'OAPUNMOD', 'OCPUNMOD'])

    ox_param_dct = {
        'MAX_MOD': ox_max,
        'MAX_KETO': keto_max,
        'MAX_OOH': ooh_max,
        'MAX_EPOXY': epoxy_max
    }

    # sdf_writer = Chem.SDWriter(open(save_sdf, mode='w'))
    if save_spectra == 1 and len(save_msp) > 0:
        msp_obj = open(save_msp, mode='w')
    else:
        msp_obj = None
    sdf_dct = {}

    parser = PLParser()
    abbr_gen = AbbrGenerator()

    frag_gen = TheoFrag(pl_class, score_xlsx)
    fingerprint_gen = FingerprintGen(pl_fp_xlsx)

    c_lst = []

    fa_lpp_df_dct = {}

    sum_theo_lpp_dct = {}
    for (_idx, _row) in pl_df.iterrows():

        _pl_abbr = str(_row['phospholipids'])

        _pl_elem_lst, pl_info_dct = parser.get_composition(_pl_abbr)
        print('PL composition ==>', _pl_elem_lst)
        _pl_hg_abbr = _pl_elem_lst[0]

        # get smiles from abbr

        if _pl_hg_abbr in pl_class_use_lst:
            c_lst.append(_pl_abbr)

            # prepare output
            _pl_lpp_df = pd.DataFrame()

            print('Start oxidation of ==>', _pl_abbr)
            _pl_sn1_abbr = _pl_elem_lst[1]
            _pl_sn2_abbr = _pl_elem_lst[2]
            if len(pl_info_dct.keys()) > 0:
                sn1_link = pl_info_dct['sn1_link']
                sn1_c_num = int(pl_info_dct['sn1_c_num'])
                sn1_db_num = int(pl_info_dct['sn1_db_num'])
                sn1_omega_type = int(pl_info_dct['sn1_omega_type'])
                if sn1_omega_type == 0:
                    sn1_query_code = 'Link == "%s" and C == % i and DB == %i' % (
                        sn1_link, sn1_c_num, sn1_db_num)
                    sn1_fa_df = fa_df.query(sn1_query_code)
                    sn1_fa_df = sn1_fa_df.query(sn1_query_code).head(1)
                else:
                    sn1_query_code = 'Link == "%s" C == % i and DB == %i' % (
                        sn1_link, sn1_c_num, sn1_db_num)
                    sn1_fa_df = fa_df.query(sn1_query_code)
                    sn1_fa_df = sn1_fa_df.query(
                        'Link == "%s" and omega == %i' %
                        (sn1_link, sn1_omega_type)).head(1)

                sn2_link = pl_info_dct['sn2_link']
                sn2_c_num = int(pl_info_dct['sn2_c_num'])
                sn2_db_num = int(pl_info_dct['sn2_db_num'])
                sn2_omega_type = int(pl_info_dct['sn2_omega_type'])
                if sn2_omega_type == 0:
                    sn2_query_code = 'Link == "%s" and C == % i and DB == %i' % (
                        sn2_link, sn2_c_num, sn2_db_num)
                    sn2_fa_df = fa_df.query(sn2_query_code)
                    sn2_fa_df = sn2_fa_df.query(sn2_query_code).head(1)
                else:
                    sn2_query_code = 'Link == "%s" and C == % i and DB == %i' % (
                        sn2_link, sn2_c_num, sn2_db_num)
                    sn2_fa_df = fa_df.query(sn2_query_code)
                    sn2_fa_df = sn2_fa_df.query(
                        'Link == "%s" and omega == %i' %
                        (sn2_link, sn2_omega_type)).head(1)

                _pl_sn1_smiles = sn1_fa_df.loc[_pl_sn1_abbr, 'SMILES']
                _pl_sn2_smiles = sn2_fa_df.loc[_pl_sn2_abbr, 'SMILES']
                print('sn1 =>', _pl_sn1_smiles, '|| sn2 =>', _pl_sn2_smiles)

            else:
                _pl_sn1_smiles = ''
                _pl_sn2_smiles = ''

            # check if FA already oxidized to speed up
            if _pl_sn1_abbr in fa_lpp_df_dct.keys():
                sn1_mod_sum_df = fa_lpp_df_dct[_pl_sn1_abbr]
            else:
                sn1_link_dct = fa_link_filter(_pl_sn1_smiles)
                sn1_mod_sum_df = oxidizer(sn1_link_dct, mod_table, isop_cfg,
                                          isopabbr_cfg, ox_level, ox_param_dct,
                                          prostane_mode, prostane_ox_mode)
                fa_lpp_df_dct[_pl_sn1_abbr] = sn1_mod_sum_df.copy()

            if _pl_sn2_abbr in fa_lpp_df_dct.keys():
                sn2_mod_sum_df = fa_lpp_df_dct[_pl_sn2_abbr]
            else:
                sn2_link_dct = fa_link_filter(_pl_sn2_smiles)
                sn2_mod_sum_df = oxidizer(sn2_link_dct, mod_table, isop_cfg,
                                          isopabbr_cfg, ox_level, ox_param_dct,
                                          prostane_mode, prostane_ox_mode)
                fa_lpp_df_dct[_pl_sn2_abbr] = sn2_mod_sum_df.copy()

            for (_sn1_idx, _sn1_row) in sn1_mod_sum_df.iterrows():
                _sn1_mod_smiles = _sn1_row['FULL_SMILES']
                _sn1_abbr_str = _sn1_row['FA_ABBR']
                _sn1_typ_str = _sn1_row['FA_TYPE']
                _sn1_formula_str = _sn1_row['FA_FORMULA']

                for (_sn2_idx, _sn2_row) in sn2_mod_sum_df.iterrows():
                    _sn2_mod_smiles = _sn2_row['FULL_SMILES']
                    _sn2_abbr_str = _sn2_row['FA_ABBR']
                    _sn2_typ_str = _sn2_row['FA_TYPE']
                    _sn2_formula_str = _sn2_row['FA_FORMULA']

                    _oap_ocp_lst = [_sn1_typ_str, _sn2_typ_str]
                    _lpp_typ = ''.join(_oap_ocp_lst)

                    if _lpp_typ not in ban_lst:
                        _lpp_smiles = LPPmerge.pl_lpp(_pl_hg_abbr,
                                                      sn1=_sn1_mod_smiles,
                                                      sn2=_sn2_mod_smiles)
                        _lpp_id_str = str(''.join([
                            _pl_hg_abbr, '(', _sn1_abbr_str, '/',
                            _sn2_abbr_str, ')'
                        ]))

                        _lpp_sub_class_json = '{"SN1": "%s", "SN2": "%s"}' % (
                            _sn1_typ_str, _sn2_typ_str)

                        _lpp_info_dct = {
                            'LPP_ORIGIN': _pl_abbr,
                            'LPP_SMILES': _lpp_smiles,
                            'LPP_CLASS': _pl_hg_abbr,
                            'SN1_SMILES': _sn1_mod_smiles,
                            'SN2_SMILES': _sn2_mod_smiles,
                            'SN1_ABBR': _sn1_abbr_str,
                            'SN2_ABBR': _sn2_abbr_str,
                            'SN1_JSON': _sn1_row['FA_JSON'],
                            'SN2_JSON': _sn2_row['FA_JSON'],
                            'SN1_FORMULA': _sn1_formula_str,
                            'SN2_FORMULA': _sn2_formula_str,
                            'LM_ID': _lpp_id_str,
                            'SN_JSON': _lpp_sub_class_json
                        }
                        if save_spectra == 1:
                            _lpp_info_dct['MSP_JSON'] = frag_gen.calc_frags(
                                _lpp_info_dct)

                        _lpp_info_se = pd.Series(data=_lpp_info_dct)
                        _pl_lpp_df[_lpp_id_str] = _lpp_info_se

                        # check if same lpp generated already
                        # Currently use bulk settings
                        if _lpp_id_str in sdf_dct.keys():
                            _lpp_origin = sdf_dct[_lpp_id_str]['LPP_ORIGIN']
                            _lpp_origin_lst = _lpp_origin.split(',')
                            if _pl_abbr in _lpp_origin_lst:
                                pass
                            else:
                                _lpp_origin_lst.append(_pl_abbr)
                                sdf_dct[_lpp_id_str]['LPP_ORIGIN'] = ','.join(
                                    _lpp_origin_lst)
                        else:
                            sdf_dct[_lpp_id_str] = _lpp_info_dct.copy()

                        # clean memory by deleting these dicts and series
                        del _lpp_info_dct, _lpp_info_se

            # generate summary table
            _pl_lpp_df = _pl_lpp_df.transpose()
            print('==> %i of LPP generated !!' % _pl_lpp_df.shape[0])
            print('==> ==> Move to next lipid==> ')
            # print(_pl_lpp_df.head())
            sum_theo_lpp_dct[_pl_abbr] = _pl_lpp_df

            # create sdf
            # for (_lpp_i, _lpp_r) in _pl_lpp_df.iterrows():

    sum_theo_lpp_pl = pd.Panel(data=sum_theo_lpp_dct)
    print(sum_theo_lpp_pl.shape)

    # write to sdf
    print('==>Start to generate SDF ==> MSP mode = %i' % save_spectra)
    print('!! %i structures in total !!' % len(sdf_dct.keys()))

    mzcalc = MZcalc()

    sdf_writer = Chem.SDWriter(open(save_sdf, mode='w'))

    if save_spectra == 1:

        for _k_lpp in sdf_dct.keys():
            _lpp_dct = sdf_dct[_k_lpp]
            if len(json.loads(_lpp_dct['MSP_JSON']).keys()) > 0:
                _lpp_smiles = str(_lpp_dct['LPP_SMILES'])
                # print(_lpp_smiles)
                _lpp_mol = Chem.MolFromSmiles(_lpp_smiles)
                AllChem.Compute2DCoords(_lpp_mol)
                _lpp_mol.SetProp('_Name', str(_lpp_dct['LM_ID']))
                _lpp_mass = Descriptors.MolWt(_lpp_mol)
                _lpp_exactmass = rdMolDescriptors.CalcExactMolWt(_lpp_mol)
                _lpp_formula = rdMolDescriptors.CalcMolFormula(_lpp_mol)
                _lpp_mol.SetProp('EXACT_MASS', '%.6f' % _lpp_exactmass)
                _lpp_mol.SetProp('NOMINAL_MASS', '%.3f' % _lpp_mass)
                _lpp_mol.SetProp('FORMULA', _lpp_formula)
                _lpp_sn2_smi = _lpp_dct['SN2_SMILES']

                if str(_lpp_dct['LPP_CLASS']
                       ) == 'PC' and _lpp_sn2_smi[-9:] != r'C(O)=O)=O':
                    _lpp_neg_precursor_elem = mzcalc.get_elements(_lpp_formula)
                    _lpp_neg_precursor_formula = mzcalc.get_formula(
                        _lpp_neg_precursor_elem, charge='[M+HCOO]-')
                    _lpp_neg_precursor_mz = mzcalc.get_mono_mz(
                        _lpp_formula, charge='[M+HCOO]-')
                    _lpp_neg_precursor_info = '{"[M+HCOO]-": ["%s", %f]}' % (
                        _lpp_neg_precursor_formula[0], _lpp_neg_precursor_mz)

                else:
                    _lpp_neg_precursor_elem = mzcalc.get_elements(_lpp_formula)
                    _lpp_neg_precursor_formula = mzcalc.get_formula(
                        _lpp_neg_precursor_elem, charge='[M-H]-')
                    _lpp_neg_precursor_mz = mzcalc.get_mono_mz(_lpp_formula,
                                                               charge='[M-H]-')
                    _lpp_neg_precursor_info = '{"[M-H]-": ["%s", %f]}' % (
                        _lpp_neg_precursor_formula[0], _lpp_neg_precursor_mz)

                _lpp_dct['PRECURSOR_JSON'] = _lpp_neg_precursor_info
                _lpp_mol.SetProp('PRECURSOR_JSON', _lpp_neg_precursor_info)
                _lpp_dct['EXACT_MASS'] = _lpp_exactmass
                fp_mz_lst = fingerprint_gen.get_fingerprint(_lpp_dct)
                _lpp_dct['FINGERPRINT'] = fp_mz_lst
                _lpp_mol.SetProp('FINGERPRINT', json.dumps(fp_mz_lst))

                for _k in _lpp_dct.keys():
                    _lpp_mol.SetProp(_k, str(_lpp_dct[_k]))

                sdf_writer.write(_lpp_mol)
                if save_spectra == 1 and len(save_msp) > 0:
                    MSPcreator.to_msp(msp_obj, _lpp_dct)

    elif save_spectra == 0:

        for _k_lpp in sdf_dct.keys():
            _lpp_dct = sdf_dct[_k_lpp]
            _lpp_smiles = str(_lpp_dct['LPP_SMILES'])
            _lpp_mol = Chem.MolFromSmiles(_lpp_smiles)
            AllChem.Compute2DCoords(_lpp_mol)
            _lpp_mol.SetProp('_Name', str(_lpp_dct['LM_ID']))
            _lpp_mass = Descriptors.MolWt(_lpp_mol)
            _lpp_exactmass = rdMolDescriptors.CalcExactMolWt(_lpp_mol)
            _lpp_formula = rdMolDescriptors.CalcMolFormula(_lpp_mol)
            _lpp_mol.SetProp('EXACT_MASS', '%.6f' % _lpp_exactmass)
            _lpp_mol.SetProp('NOMINAL_MASS', '%.3f' % _lpp_mass)
            _lpp_mol.SetProp('FORMULA', _lpp_formula)
            _lpp_sn2_smi = _lpp_dct['SN2_SMILES']

            if str(_lpp_dct['LPP_CLASS']
                   ) == 'PC' and _lpp_sn2_smi[-9:] != r'C(O)=O)=O':
                _lpp_neg_precursor_elem = mzcalc.get_elements(_lpp_formula)
                _lpp_neg_precursor_formula = mzcalc.get_formula(
                    _lpp_neg_precursor_elem, charge='[M+HCOO]-')
                _lpp_neg_precursor_mz = mzcalc.get_mono_mz(_lpp_formula,
                                                           charge='[M+HCOO]-')
                _lpp_neg_precursor_info = '{"[M+HCOO]-": ["%s", %f]}' % (
                    _lpp_neg_precursor_formula[0], _lpp_neg_precursor_mz)

            else:
                _lpp_neg_precursor_elem = mzcalc.get_elements(_lpp_formula)
                _lpp_neg_precursor_formula = mzcalc.get_formula(
                    _lpp_neg_precursor_elem, charge='[M-H]-')
                _lpp_neg_precursor_mz = mzcalc.get_mono_mz(_lpp_formula,
                                                           charge='[M-H]-')
                _lpp_neg_precursor_info = '{"[M-H]-": ["%s", %f]}' % (
                    _lpp_neg_precursor_formula[0], _lpp_neg_precursor_mz)

            _lpp_dct['PRECURSOR_JSON'] = _lpp_neg_precursor_info
            _lpp_mol.SetProp('PRECURSOR_JSON', _lpp_neg_precursor_info)
            _lpp_dct['EXACT_MASS'] = _lpp_exactmass
            fp_mz_lst = fingerprint_gen.get_fingerprint(_lpp_dct)
            _lpp_dct['FINGERPRINT'] = fp_mz_lst
            _lpp_mol.SetProp('FINGERPRINT', json.dumps(fp_mz_lst))

            for _k in _lpp_dct.keys():
                _lpp_mol.SetProp(_k, str(_lpp_dct[_k]))

            sdf_writer.write(_lpp_mol)

    sdf_writer.close()
    if save_spectra == 1 and len(save_msp) > 0:
        msp_obj.close()

    SDFsummary.sdf2xlsx(save_sdf, str(save_sdf)[:-4] + '.xlsx')
    # if save_spectra == 1:
    SDFsummary.sdf2sum_fa(save_sdf, str(save_sdf)[:-4] + '_FA_SUM.xlsx')

    t_spent = time.clock() - t_start
    info_updater_1 = '=>%i of LPP generated ==> ' % len(sdf_dct.keys())
    info_updater_2 = '=>==> %i of phospholipids processed in %.3fs ==> ==> Finished !!!!!!' % (
        len(c_lst), t_spent)

    return info_updater_1, info_updater_2
예제 #17
0
                return


def check_maxes(formd, maxes):
    """Return True when every value in ``formd`` is strictly below its limit.

    Each key of ``formd`` is looked up in ``maxes`` (a missing key raises
    ``KeyError``) and the value is compared with ``<``. All comparisons are
    evaluated before the result is reduced, and an empty ``formd`` gives True.
    """
    below_limit = []
    for key, value in formd.items():
        below_limit.append(value < maxes[key])
    return all(below_limit)


def rec_formula(mz, ppm=5):
    """Start the recursive formula search for a mass within a ppm window.

    The tolerance is ``mz * ppm * 1e-6`` and the search window is
    ``mz - tolerance`` .. ``mz + tolerance``. The starting formula maps every
    key of the module-level ``eles`` to 0, and the per-element limits come
    from ``get_elemaxs(mz)``.

    :param mz: target mass value.
    :param ppm: tolerance in parts per million (default 5).
    :return: the generator produced by ``_rec_form``.
    """
    maxes = dict(get_elemaxs(mz))
    error = mz * (ppm * 1E-6)
    mlow, mhigh = mz - error, mz + error
    formula = {e: 0 for e in eles.keys()}
    # _rec_form takes ``ele_idx`` as its first positional parameter; the
    # previous call omitted it, so every call raised TypeError before any
    # work was done.
    # NOTE(review): 0 is assumed to be the intended starting index - confirm.
    return _rec_form(0, formula, mlow, mhigh, maxes)


def _rec_form(ele_idx, formula, mlow, mhigh, maxes):
    """Generator that yields ``formula`` when it passes the checks below.

    ``ele_idx`` is accepted but not used anywhere in the current body.
    NOTE(review): this function looks unfinished - see the notes inline.
    """
    good_form = check_formula(formula, mlow, mhigh)
    # NOTE(review): the result of check_formula() is immediately overwritten,
    # so the mlow/mhigh window never influences the outcome - looks like a
    # debugging leftover; confirm before relying on this function.
    good_form = True
    under_maxes = check_maxes(formula, maxes)
    if good_form and under_maxes:
        yield formula
    else:
        # NOTE(review): ``ele`` is not defined in this function; unless a
        # module-level ``ele`` exists this line raises NameError. The
        # expression result is discarded either way.
        formula[ele]
        pass


# Fetch the sample molecules once, then derive one exact mass and one
# molecular formula per molecule, in the same order as ``sm``.
sm = get_soome_mols()
masses = list(map(rdMolDescriptors.CalcExactMolWt, sm))
formulas = list(map(rdMolDescriptors.CalcMolFormula, sm))
예제 #18
0
def calculate_formula_in_dataframe(x):
    """Return the molecular formula of ``x``, or '' when ``x`` is falsy.

    Falsy inputs (``None``, empty values) short-circuit to an empty string;
    anything else is handed to ``rdMolDescriptors.CalcMolFormula``.
    """
    if not x:
        return ''
    return rdMolDescriptors.CalcMolFormula(x)
예제 #19
0
def formatdb(smiles):
    """Build a tab-separated annotation table from a SMILES file.

    The input is read as a header-less, tab-separated table whose column 0
    holds SMILES strings and column 1 holds identifiers. For each structure
    the InChI, InChIKey (plus its first two dash-separated blocks), molecular
    formula and exact mass are computed; classification columns are merged in
    from ``query_inchikey``. Rows whose descriptors could not be computed are
    dropped, and the result is written to ``<smiles>_FORMATED.txt``.

    :param smiles: path of the input file. NOTE: the file is deleted right
        after it has been loaded.
    :return: the string ``'Done'``.
    """
    df = pd.read_csv(smiles, sep='\t', header=None)
    # The input file is consumed: it is removed as soon as it is in memory.
    os.remove(smiles)

    smi = list(df[0])
    m = [Chem.MolFromSmiles(x) for x in smi]
    inchi = []
    ikeys = []
    ikey1 = []
    ikey2 = []
    form = []
    exmass = []
    blank_row = ('',) * 6
    for mol in m:
        # Compute every descriptor first and append afterwards. Appending
        # inside the try block left the lists with different lengths when a
        # later step failed (e.g. InChI appended, InChIKey raised, then the
        # except branch appended a second InChI entry).
        try:
            mol_inchi = Chem.rdinchi.MolToInchi(mol)[0]
            ikey = Chem.rdinchi.InchiToInchiKey(mol_inchi)
            key_blocks = ikey.split('-')
            row = (mol_inchi, ikey, key_blocks[0], key_blocks[1],
                   rdMD.CalcMolFormula(mol), rdMD.CalcExactMolWt(mol))
        except Exception:
            # Unparsable structure: keep a blank placeholder so all columns
            # stay aligned with the input rows; it is dropped further down.
            row = blank_row
        inchi.append(row[0])
        ikeys.append(row[1])
        ikey1.append(row[2])
        ikey2.append(row[3])
        form.append(row[4])
        exmass.append(row[5])

    data = {
        'inchikey': ikeys,
        'MonoisotopicMass': exmass,
        'InChI': inchi,
        'SMILES': smi,
        'Identifier': list(df[1]),
        'InChIKey2': ikey2,
        'InChIKey1': ikey1,
        'MolecularFormula': form
    }

    cn = [
        "inchikey", "MonoisotopicMass", "InChI", "SMILES", "Identifier",
        "InChIKey2", "InChIKey1", "MolecularFormula"
    ]
    formdata = pd.DataFrame(data, columns=cn)

    # NOTE(review): query_inchikey is defined elsewhere; it is used here as
    # if it returns a DataFrame with the five columns selected below.
    classy = query_inchikey(ikeys)

    classy = classy[['inchikey', 'kingdom', 'superclass', 'class', 'subclass']]
    classy.columns = [
        'inchikey', 'kingdom_name', 'superclass_name', 'class_name',
        'subclass_name'
    ]

    formfinal = pd.merge(formdata, classy, how='left', on=['inchikey'])

    formfinal = formfinal.fillna('')
    formfinal.drop('inchikey', axis=1, inplace=True)

    # Positions of the structures for which no InChIKey could be generated.
    failed_positions = [pos for pos, key in enumerate(ikeys) if key == '']
    formfinal.drop(formfinal.index[failed_positions], inplace=True)

    formfinal.to_csv(smiles + '_FORMATED.txt', index=False, sep='\t')
    return 'Done'
def create_CLASS_data(data_dict):
    """Assemble the CLASS column names and per-column value lists.

    ``data_dict`` must provide ``'compound_id'`` and ``'canonical_smiles'``.
    For every SMILES a molecule is built and its exact mass (string form with
    the last three characters cut off), InChI and molecular formula are
    collected; InChIKeys are then derived from the InChIs and split on ``'-'``
    into their first and second blocks. The four classification columns are
    filled with ``'NA'``.

    Returns ``(attribute_names, overall_list)`` where ``overall_list`` holds
    one list per attribute, in the order of ``attribute_names``.
    """
    attribute_names = [
        'MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
        'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',
        'superclass_name', 'class_name', 'subclass_name',
    ]

    # Identifier and SMILES columns are passed through untouched.
    identifiers = data_dict['compound_id']
    smiles_entries = data_dict['canonical_smiles']

    # First pass: descriptors computed directly from each molecule.
    masses = []
    inchis = []
    formulas = []
    for smiles in smiles_entries:
        mol = Chem.MolFromSmiles(smiles)
        masses.append(str(Descriptors.ExactMolWt(mol))[:-3])
        inchis.append(rdinchi.MolToInchi(mol)[0])
        formulas.append(rdMolDescriptors.CalcMolFormula(mol))

    # One shared placeholder list serves all four classification columns.
    not_available = ['NA'] * len(smiles_entries)

    # Second pass: InChIKeys; third pass: their first two blocks.
    inchi_keys = [rdinchi.InchiToInchiKey(one_inchi) for one_inchi in inchis]
    first_blocks = []
    second_blocks = []
    for key in inchi_keys:
        blocks = key.split('-')
        block_one = blocks[0]
        block_two = blocks[1]
        first_blocks.append(block_one)
        second_blocks.append(block_two)

    overall_list = [
        masses, inchis, smiles_entries, identifiers,
        second_blocks, first_blocks, formulas,
        not_available, not_available, not_available, not_available,
    ]

    return attribute_names, overall_list
예제 #21
0
def extract_molecules(xml_3d_filename, outfile):
    """
    Extract molecules from a CML file and pickle them as two DataFrames.

    Every ``molecule`` element directly under the document root is turned
    into an RDKit molecule: atoms come from its ``atomArray`` (element type,
    formal charge, CML atom id, 3D coordinates) and bonds from its
    ``bondArray`` (orders ``S``/``D``/``T``). Each molecule is sanitised; on
    failure the error text is recorded and ``valid`` is False.

    The pickle written to ``outfile`` is a dict with:

    * ``molecules_df`` - one row per molecule, indexed by ``mol_id``, with
      name, C/H atom counts, formula, error message, the molecule itself and
      the ``valid`` flag;
    * ``mol_atomid_to_idx`` - maps (molecule id, CML atom id) to the atom
      index inside the RDKit molecule.

    :param xml_3d_filename: path of the CML/XML file to parse.
    :param outfile: path of the pickle file to write.
    :raises NotImplementedError: for a bond order other than S, D or T.
    """
    cml_ns = '{http://www.xml-cml.org/schema}'
    bond_types = {
        'S': Chem.rdchem.BondType.SINGLE,
        'D': Chem.rdchem.BondType.DOUBLE,
        'T': Chem.rdchem.BondType.TRIPLE,
    }

    tree3d = ET.parse(xml_3d_filename)
    root = tree3d.getroot()

    molecules = list(root.findall(cml_ns + 'molecule'))

    # Upper bound on processed molecules; lower it to debug on a subset.
    MAX_DEBUG_ITER = 100000000

    molecules_df = []
    for m, _ in tqdm(zip(molecules, range(MAX_DEBUG_ITER)),
                     total=len(molecules)):
        mol_id = m.attrib['id']

        mol = Chem.RWMol()
        mol.SetProp("id", mol_id)
        name = ""
        if 'title' in m.attrib:
            name = m.attrib['title']
        mol.SetProp("name", name)
        atomArray = m.find(cml_ns + "atomArray")
        bondArray = m.find(cml_ns + "bondArray")
        atom_pos_map = {}

        atoms_3dloc = []
        for ai, a in enumerate(atomArray):
            atom = Chem.Atom(a.attrib['elementType'])
            x3 = float(a.attrib['x3'])
            y3 = float(a.attrib['y3'])
            z3 = float(a.attrib['z3'])

            atom.SetFormalCharge(int(a.attrib['formalCharge']))
            atom.SetProp('id', a.attrib['id'])
            idx = mol.AddAtom(atom)
            atom_pos_map[a.attrib['id']] = idx
            assert idx == ai

            atoms_3dloc.append((x3, y3, z3))

        # A molecule without bonds may have no bondArray element at all;
        # find() then returns None, which must not be iterated.
        for b in (bondArray if bondArray is not None else ()):
            atom_refs = b.attrib['atomRefs2']
            bond_order = b.attrib['order']
            a1, a2 = atom_refs.split(" ")
            if bond_order not in bond_types:
                raise NotImplementedError()

            mol.AddBond(atom_pos_map[a1], atom_pos_map[a2],
                        order=bond_types[bond_order])

        # Count atoms once per molecule, after all atoms are present. These
        # used to sit inside the bond loop, so they were recomputed for every
        # bond and were stale (or undefined) for molecules without bonds.
        C_count = np.sum([a.GetSymbol() == 'C' for a in mol.GetAtoms()])
        H_count = np.sum([a.GetSymbol() == 'H' for a in mol.GetAtoms()])

        try:
            Chem.SanitizeMol(mol)
            formula = rdMD.CalcMolFormula(mol)

            error_msg = ""
            valid = True
        except ValueError as e:
            print("error sanitizing", name, e)
            # Reset the formula so a failed molecule neither inherits the
            # previous molecule's formula nor hits an undefined name.
            formula = ""
            error_msg = str(e)
            valid = False

        mol = mol.GetMol()
        # NOTE(review): datautil.array_to_conf presumably builds an RDKit
        # conformer from the (n_atoms, 3) coordinate array - confirm.
        c = datautil.array_to_conf(np.array(atoms_3dloc))
        mol.AddConformer(c)

        molecules_df.append({
            'mol_id': mol_id,
            'name': name,
            'C_count': C_count,
            'H_count': H_count,
            'formula': formula,
            'error_msg': error_msg,
            'mol': mol,
            'valid': valid
        })
    molecules_df = pd.DataFrame(molecules_df).set_index('mol_id')

    out = []
    for row_i, row in tqdm(molecules_df.iterrows(), total=len(molecules_df)):
        nmrshift_mol = row.mol

        id_to_pos = {
            nmrshift_mol.GetAtomWithIdx(i).GetProp('id'): i
            for i in range(nmrshift_mol.GetNumAtoms())
        }
        for id_str, pos in id_to_pos.items():
            out.append({'atom': id_str, 'atom_idx': pos, 'molecule': row_i})
    mol_atomid_to_idx = pd.DataFrame(out).set_index(['molecule', 'atom'])

    # Use a context manager so the pickle file is flushed and closed.
    with open(outfile, 'wb') as pickle_file:
        pickle.dump(
            {
                'molecules_df': molecules_df,
                'mol_atomid_to_idx': mol_atomid_to_idx
            }, pickle_file)
예제 #22
0
'''
Created on 5 Jul 2017

@author: dghosh

Reads one SMILES string per line from ``filepath`` and writes the molecular
formula of each structure, one per line, to a text file named after the
input path.
'''

import os

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

filepath = 'insert filepath here'
with open(filepath) as f:
    listMol = f.read().splitlines()

# Derive the output name from the input path without its extension. The
# previous ``filepath[-4:]`` kept only the last four characters (normally
# just the extension) instead of removing them.
outpath = os.path.splitext(filepath)[0] + 'molFolmula.txt'

# The context manager guarantees the output is flushed and closed.
with open(outpath, 'w') as outfile:
    for molSmile in listMol:
        mol = Chem.MolFromSmiles(molSmile)
        # One formula per line; without the newline all formulas ran together.
        outfile.write(rdMolDescriptors.CalcMolFormula(mol) + '\n')
예제 #23
0
    def calc_mz(self,
                elem_info,
                mod=None,
                score=0,
                charge='[M-H]-',
                lpp_info_dct=None):
        """Compute the m/z and ion formula string for a structure or formula.

        :param elem_info: a ``str`` (tried as SMILES first, then handed to
            ``parse_formula``) or a ``dict`` of element counts. Any other
            type is treated as an empty element dict.
        :param mod: optional modification label passed to ``get_mod_elem``;
            ``None`` or ``''`` means no modification.
        :param score: value stored under ``'i'`` in the result.
        :param charge: charge mode; replaced by ``'[M-H]-'`` unless it is
            present in both ``self.charge_mz_dct`` and
            ``self.charge_elem_dct``.
        :param lpp_info_dct: optional dict forwarded to ``get_mod_elem``.
        :return: ``{'mz': <rounded to 4 decimals>, 'i': score,
            'formula': <ion formula string>}``.
        """
        if not (charge in self.charge_mz_dct
                and charge in self.charge_elem_dct):
            charge = '[M-H]-'

        if isinstance(elem_info, str):
            # Try the string as SMILES first. A string that is not valid
            # SMILES makes the RDKit calls raise, and the string is then
            # parsed as a formula instead.
            try:
                _mol = Chem.MolFromSmiles(elem_info)
                AllChem.Compute2DCoords(_mol)
                _formula = rdMolDescriptors.CalcMolFormula(_mol)
                _elem_dct = self.parse_formula(_formula)
            except Exception:
                _elem_dct = self.parse_formula(elem_info)
        elif isinstance(elem_info, dict):
            _elem_dct = elem_info.copy()
        else:
            _elem_dct = {}

        # Only apply a modification when one was actually given. The previous
        # test used ``or`` and was therefore always true, which sent
        # ``mod=None`` / ``mod=''`` into get_mod_elem.
        if mod is not None and mod != '':
            if _elem_dct is None or _elem_dct == {}:
                _elem_dct = self.get_mod_elem(elem_dct=None,
                                              mod=mod,
                                              lpp_info_dct=lpp_info_dct)
            else:
                _elem_dct = self.get_mod_elem(elem_dct=_elem_dct,
                                              mod=mod,
                                              lpp_info_dct=lpp_info_dct)

        ion_mz, _ion_elem_dct = self.formula_to_mz(_elem_dct, charge=charge)

        # Build the formula string in a fixed element order; a count is only
        # written when it is greater than 1.
        elem_order_lst = ['C', 'H', 'N', 'O', 'P', 'S', 'Na', 'K']
        _ion_parts = []
        for _e in elem_order_lst:
            if _e in _ion_elem_dct:
                _ion_parts.append(_e)
                if _ion_elem_dct[_e] > 1:
                    _ion_parts.append(str(_ion_elem_dct[_e]))
        if charge in ['[M+H]+', '[M+Na]+', '[M+K]+', '[M+NH4]+']:
            _ion_parts.append('+')
        elif charge in ['[M-H]-', '[M+FA-H]-', '[M+HCOO]-']:
            _ion_parts.append('-')
        _ion_elem = ''.join(_ion_parts)

        ion_info_dct = {
            'mz': round(ion_mz, 4),
            'i': score,
            'formula': _ion_elem
        }

        return ion_info_dct
예제 #24
0
def diff_mol_pdb(mol, pdbfile, logfile=devnull):
    """Compare a molecule with the structure stored in a PDB file.

    Runs a sequence of checks and prints the outcome of each:

    1. stoichiometric formula with the charge removed;
    2. InChI truncated via ``truncate_inchi(..., ['connect'])``;
    3. standard InChI (only when the main check passed and bond orders
       could be assigned from ``mol``);
    4. fixed-H InChI (only when the standard InChI check passed).

    :param mol: reference molecule; ``remove_isotopes`` is called on it first.
    :param pdbfile: path of the PDB file to compare against.
    :param logfile: target handed to ``stdout_redirected`` for both streams.
    :return: ``(failnum, newpdbmol, nhpdbmol)`` - the number of failed
        checks, the PDB molecule after hydrogens were re-added (or the
        hydrogen-free one when template assignment failed), and the
        hydrogen-free PDB molecule.
    :raises ParsingError: when the PDB file cannot be read even with
        sanitisation disabled.
    """
    # NOTE(review): stdout_redirected is a project helper; presumably these
    # two contexts send stderr and stdout to ``logfile`` - confirm.
    with stdout_redirected(to=logfile, stdout=sys_stderr):
        with stdout_redirected(to=logfile, stdout=sys_stdout):
            # Return value is discarded; presumably modifies ``mol`` in place.
            remove_isotopes(mol, sanitize=True)
            nhmol = Chem.RemoveHs(mol,
                                  implicitOnly=False,
                                  updateExplicitCount=True,
                                  sanitize=True)
            # Kekulisation is best-effort; failures are ignored.
            try:
                Chem.Kekulize(nhmol)
            except:
                pass
            checkconnect = True
            pdbmol = None
            # First attempt: read the PDB file with sanitisation enabled.
            try:
                pdbmol = Chem.MolFromPDBFile(pdbfile,
                                             removeHs=False,
                                             sanitize=True)
            except:
                pass
            # Fallback: read without sanitisation, run the project helper
            # ``disconnect`` and sanitise while tolerating errors.
            if pdbmol is None:
                pdbmol = Chem.MolFromPDBFile(pdbfile,
                                             removeHs=False,
                                             sanitize=False)
                if pdbmol is None:
                    raise ParsingError("Cannot open PDB molecule.")
                pdbmol = disconnect(pdbmol)
                Chem.SanitizeMol(pdbmol, catchErrors=True)

            nhpdbmol = Chem.RemoveHs(pdbmol,
                                     implicitOnly=False,
                                     updateExplicitCount=True,
                                     sanitize=False)

            Chem.SanitizeMol(nhpdbmol, catchErrors=True)

            # Transfer bond orders/charges from the reference molecule and
            # rebuild hydrogens. On any failure fall back to the hydrogen-free
            # PDB molecule and skip the standard/fixed-H InChI comparisons.
            try:
                print(
                    'Applying bond orders and formal charges from molecule file to PDB molecule ... '
                )
                nhpdbmol = AssignBondOrdersFromTemplate(nhmol, nhpdbmol)
                newpdbmol = Chem.AddHs(nhpdbmol,
                                       addCoords=True,
                                       explicitOnly=True)
                newpdbmol.UpdatePropertyCache()
                newpdbmol = correct_hydrogen_num_from_pdbmol(pdbmol, newpdbmol)
                newpdbmol = set_hydrogen_coor_from_pdbmol(pdbmol,
                                                          newpdbmol,
                                                          refconfId=-1,
                                                          confId=-1)
            except Exception:
                print(
                    "WARNING: Cannot assign bond orders from molecule file template. Checking only non-hydrogen connectivity."
                )
                checkconnect = False
                newpdbmol = nhpdbmol
                pass

            #Stoichiometric formula check
            impnum = count_implicit_hydrogens(newpdbmol)
            failnum = 0
            result = 'OK'
            unformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(mol))
            pdbunformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(newpdbmol))
            #print(pdbunformula)
            # NOTE(review): fix_formula is a project helper; presumably it
            # adjusts the formula by the implicit hydrogen count - confirm.
            pdbunformula = fix_formula(pdbunformula, impnum)
            if unformula != pdbunformula:
                failnum += 1
                result = 'FAIL: Molecules have different Stoichiometric formulas ' + unformula + ' ' + pdbunformula + '.'
            print('Stoichiometric formula check (without charge): ' + result)

            print('Generating Fixed H InChI for molecule file ... ')
            inchi, code, msg, log, aux = rdinchi.MolToInchi(
                mol, options='-FixedH -DoNotAddH')
            # NOTE(review): the ``code == 1`` test below is a separate ``if``
            # (not ``elif``) and both of its branches print ``msg``, so the
            # message is printed for every value of ``code``, including 0.
            # The same pattern repeats for the three InChI calls that follow.
            if code == 0:
                #print(inchi)
                pass
            if code == 1:
                # print(inchi)
                print(msg)
            else:
                print(msg)

            print('Generating Standard InChI for molecule file ... ')
            sinchi, code, msg, log, aux = rdinchi.MolToInchi(
                mol, options=' -DoNotAddH')
            if code == 0:
                #print(sinchi)
                pass
            if code == 1:
                #print(sinchi)
                print(msg)
            else:
                print(msg)

            maininchi = truncate_inchi(inchi, ['connect'])

            print('Generating Fixed H InChI for PDB molecule ... ')
            pdbinchi, code, msg, log, aux = rdinchi.MolToInchi(
                newpdbmol, options='-FixedH -DoNotAddH')
            if code == 0:
                pass
            if code == 1:
                print(msg)
            else:
                print(msg)

            print('Generating Standard InChI for PDB molecule ... ')
            pdbsinchi, code, msg, log, aux = rdinchi.MolToInchi(
                newpdbmol, options=' -DoNotAddH')
            if code == 0:
                pass
            if code == 1:
                print(msg)
            else:
                print(msg)

            pdbmaininchi = truncate_inchi(pdbinchi, ['connect'])

            # The checks are nested: each finer comparison only runs when the
            # coarser one before it succeeded.
            result = 'OK'
            if maininchi != pdbmaininchi:
                result = 'FAIL: Molecules have diferent scaffolds\n' + maininchi + ' ' + pdbmaininchi + '.'
                failnum += 1
                print('Main chain InChI check: ' + result)
            else:
                print('Main chain InChI check: ' + result)
                result = 'OK'
                # Skipped when bond orders could not be taken from the
                # reference molecule (see the template step above).
                if checkconnect:
                    if sinchi != pdbsinchi:
                        result = 'FAIL: Molecules are not the same compound or have different net charge.\n' + sinchi + '\n' + pdbsinchi + '.'
                        failnum += 1
                        print('Standard InChI check: ' + result)
                    else:
                        print('Standard InChI check: ' + result)
                        result = 'OK'
                        if inchi != pdbinchi:
                            result = 'FAIL: Molecules have different protonation/tautomery\n' + inchi + '\n' + pdbinchi + '.'
                            failnum += 1
                        print('Fixed H InChI check: ' + result)
                        # NOTE(review): 'OK' is printed here even when the
                        # fixed-H check just failed - confirm this is intended.
                        print('OK')

            return failnum, newpdbmol, nhpdbmol
예제 #25
0
    def get_mod_elem(self, elem_dct=None, mod=None, lpp_info_dct=None):
        """Return the element counts of a species after applying ``mod``.

        :param elem_dct: element -> count dict of the unmodified species. It
            is copied, never mutated. When ``None`` the base formula is
            derived from ``lpp_info_dct`` instead.
        :param mod: modification label such as ``'[M-H2O-H]-'``. Every known
            gain/loss token found in it is applied (several may match).
            ``None`` or ``''`` applies nothing.
        :param lpp_info_dct: dict with ``LPP_CLASS``, ``LPP_SMILES``,
            ``SN1_SMILES`` and ``SN2_SMILES``; only read when ``elem_dct`` is
            ``None``. ``mod`` must be a string in that case because it
            selects which fragment SMILES is used.
        :return: new dict mapping each element to base count plus the summed
            modification delta. When a modification is given, C, H, N, O and
            P are always present in the result, possibly with value 0.
        """
        mod_dct = {
            '-H2O': {'H': -2, 'O': -1},
            '+H2O': {'H': 2, 'O': 1},
            '-CO2': {'C': -1, 'O': -2},
            '+HCOO': {'H': 1, 'C': 1, 'O': 2},
            '-C3H9N': {'C': -3, 'H': -9, 'N': -1},
            '-C3H5NO2': {'C': -3, 'O': -2, 'H': -5, 'N': -1},
            '-CH3COOH': {'C': -2, 'O': -2, 'H': -4},
            '-CH2': {'C': -1, 'H': -2},
            '-H': {'H': -1},
            '-CH3': {'C': -1, 'H': -3},
            '+CH3': {'C': +1, 'H': +3},
        }

        # (pattern tested against ``mod``, key into mod_dct). Each entry is
        # tested independently, so one label can trigger several of them.
        # NOTE(review): the '-CH3' pattern also matches labels containing
        # '-CH3COOH', so such labels receive both deltas - confirm intended.
        _mod_patterns = (
            (r'.*[-]H2O.*', '-H2O'),
            # for [M-sn+H2O-H]-, the H2O is still added here
            (r'.*[+]H2O.*', '+H2O'),
            (r'.*[-]CO2.*', '-CO2'),
            (r'.*[-]C3H9N.*', '-C3H9N'),
            (r'.*[-]CH3COOH.*', '-CH3COOH'),
            (r'.*[-]CH3.*', '-CH3'),
            (r'.*[-]C3H5NO2.*', '-C3H5NO2'),
            (r'.*[+]HCOO.*', '+HCOO'),
            (r'.*[+]CH3.*', '+CH3'),  # For PC only
        )

        # get the formula as dict
        if elem_dct is None:
            if lpp_info_dct is None:
                _elem_dct = {}
            else:
                _lpp_type = lpp_info_dct['LPP_CLASS']
                _lpp_full_smi = lpp_info_dct['LPP_SMILES']
                _sn1_smi = lpp_info_dct['SN1_SMILES']
                _sn2_smi = lpp_info_dct['SN2_SMILES']

                # 'O' stands in for the chain that was lost when a lyso-type
                # fragment is rebuilt with pl_lpp.
                _lyso_smi = 'O'
                if re.match(r'\[M-[sS][nN][1].*', mod):
                    _frag_smi = pl_lpp(_lpp_type, sn1=_lyso_smi, sn2=_sn2_smi)
                elif re.match(r'\[M-[sS][nN][2].*', mod):
                    _frag_smi = pl_lpp(_lpp_type, sn1=_sn1_smi, sn2=_lyso_smi)
                # PC M == M-CH3
                elif re.match(r'\[M-CH3-[sS][nN][1].*', mod):
                    _frag_smi = pl_lpp(_lpp_type, sn1=_lyso_smi, sn2=_sn2_smi)
                elif re.match(r'\[M-CH3-[sS][nN][2].*', mod):
                    _frag_smi = pl_lpp(_lpp_type, sn1=_sn1_smi, sn2=_lyso_smi)

                elif re.match(r'\[[sS][nN][1].*', mod):
                    _frag_smi = _sn1_smi
                elif re.match(r'\[[sS][nN][2].*', mod):
                    _frag_smi = _sn2_smi
                elif re.match(r'\[M[+-][HCFA].*', mod):
                    _frag_smi = _lpp_full_smi
                else:
                    _frag_smi = ''

                _mol = Chem.MolFromSmiles(_frag_smi)
                AllChem.Compute2DCoords(_mol)
                _formula = rdMolDescriptors.CalcMolFormula(_mol)
                _elem_dct = self.parse_formula(_formula)
        else:
            _elem_dct = elem_dct.copy()

        if mod is None or mod == '':
            _mod_sum_elem_dct = {}
        else:
            _mod_sum_elem_dct = {'C': 0, 'H': 0, 'N': 0, 'O': 0, 'P': 0}
            for _pattern, _mod_key in _mod_patterns:
                if re.match(_pattern, mod):
                    for _key, _delta in mod_dct[_mod_key].items():
                        _mod_sum_elem_dct[_key] += _delta
                    if _mod_key == '-CH3':
                        _mod_sum_elem_dct['H'] += 1  # For PC only

            # Labels of the form 'P<class>_<formula><+/->' carry an explicit
            # formula in the middle group; it is added with one extra H.
            chk11_match = re.match(r'(P[ACEGSI]4?P?_)(.*)([+-])', mod)
            if chk11_match:
                mod_elem_str = chk11_match.groups()[1]
                _mod_elem_dct = self.parse_formula(formula=mod_elem_str)
                _mod_elem_dct['H'] += 1
                for _key in _mod_elem_dct.keys():
                    # .get() tolerates elements outside C/H/N/O/P.
                    _mod_sum_elem_dct[_key] = (
                        _mod_sum_elem_dct.get(_key, 0) + _mod_elem_dct[_key])

        # Union of keys from both dicts. Set union works on Python 2 and 3;
        # the former ``sum([a.keys(), b.keys()], [])`` raises TypeError on
        # Python 3, where keys() returns a view that cannot be added to a list.
        _frag_elem_dct = {}
        for _key in set(_elem_dct) | set(_mod_sum_elem_dct):
            _frag_elem_dct[_key] = _elem_dct.get(
                _key, 0) + _mod_sum_elem_dct.get(_key, 0)

        return _frag_elem_dct