Example #1
    def get_structure(self):
        """Get the pdb structure of the molecule."""

        # we can have a str or a list of bytes as input
        if isinstance(self.pdb_data, str):
            self.complex = freesasa.Structure(self.pdb_data)
        else:
            self.complex = freesasa.Structure()
            atomdata = self.sql.get('name,resName,resSeq,chainID,x,y,z')
            for atomName, residueName, residueNumber, chainLabel, x, y, z in atomdata:
                atomName = '{:>2}'.format(atomName[0])
                self.complex.addAtom(atomName, residueName, residueNumber,
                                     chainLabel, x, y, z)
        self.result_complex = freesasa.calc(self.complex)

        self.chains = {}
        self.result_chains = {}
        for label in self.chains_label:
            self.chains[label] = freesasa.Structure()
            atomdata = self.sql.get('name,resName,resSeq,chainID,x,y,z',
                                    chainID=label)
            for atomName, residueName, residueNumber, chainLabel, x, y, z in atomdata:
                atomName = '{:>2}'.format(atomName[0])
                self.chains[label].addAtom(atomName, residueName,
                                           residueNumber, chainLabel, x, y, z)
            self.result_chains[label] = freesasa.calc(self.chains[label])
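The snippet above relies on the class's sql interface; the same freesasa calls can be exercised standalone. A minimal sketch, assuming a plain list of atom tuples in place of self.sql (names and coordinates below are placeholders):

import freesasa

# hypothetical atom records: (name, resName, resSeq, chainID, x, y, z)
atoms = [('N', 'ALA', 1, 'A', 11.0, 22.0, 33.0),
         ('CA', 'ALA', 1, 'A', 12.0, 22.5, 33.5)]

structure = freesasa.Structure()
for name, res_name, res_seq, chain_id, x, y, z in atoms:
    # same right-aligned one-letter atom name convention as in the example above
    structure.addAtom('{:>2}'.format(name[0]), res_name, res_seq, chain_id, x, y, z)

result = freesasa.calc(structure)
print(result.totalArea())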
Example #2
def getFreeSASAStructureFromModel(structure, classifier=None):
    outFile = "gsfm.temp.pdb"
    structure.save(outFile)
    
    if(classifier is not None):
        freesasa_structure = freesasa.Structure(outFile, classifier=classifier)
    else:
        freesasa_structure = freesasa.Structure(outFile)
    
    if(os.access(outFile, os.R_OK)):
        os.remove(outFile)
    
    return freesasa_structure
Example #3
def cb_sasas(design_pdb, fg_vdm_txt):
    sasa_dict = {}  # key is resnum, value is list [aa, cbsasa]

    # get the vdm AAs and resnums from txtfile
    vdms_aa = []
    vdms_resnum = []
    with open(fg_vdm_txt) as inF:
        for line in inF:
            line = line.strip()
            line = line.split(' ')
            vdms_resnum.append(line[0])
            vdms_aa.append(line[1])

    # parse design and do freesasa calc
    prody_parsed = pr.parsePDB(design_pdb, altloc='A', model=1)
    fs_struct = freesasa.Structure(design_pdb)  # more atoms
    fs_result = freesasa_cb(prody_parsed,
                            probe_radius=3)  # less atoms bc this is Cb cutoff

    # get sasa; shouldn't have to worry about negative resnums in designed proteins
    for resnum, aa in zip(vdms_resnum, vdms_aa):
        # get Cb atoms
        prody_pdb_bb_cb_atom_ind = prody_parsed.select(
            'protein and (backbone or name CB) and \
            not element H D').getIndices()
        # get Cb atoms for resnum
        sele = prody_parsed.select('protein and (backbone or name CB) and resnum ' + str(resnum) \
            + ' and not element H D')
        bb_cb_atom_ind = sele.getIndices()
        sasa_3A_probe = '{0:.2f}'.format(sum(fs_result.atomArea(i) for i in \
            np.where(np.in1d(prody_pdb_bb_cb_atom_ind,bb_cb_atom_ind))[0]))
        sasa_dict[int(resnum)] = [aa, float(sasa_3A_probe)]
    return sasa_dict
Example #4
def get_area(this_run,basename):
    
    path_dictionary=setup_paths()
    outpath = path_dictionary["pdb_path"] + basename + '.pdb'
    print('getting area')
    # convert to pdb
    obConversion = openbabel.OBConversion()
    obConversion.SetInFormat("xyz")
    obConversion.SetOutFormat("pdb")
    OBMol = openbabel.OBMol()
    obConversion.ReadFile(OBMol, this_run.init_geopath)
    obConversion.WriteFile(OBMol, outpath)

    # measure free SA
    dc = DerivedClassifierT()
    myopt = {'halt-at-unknown': False,
             'hetatm': True,
             'hydrogen': True,
             'join-models': False,
             'skip-unknown': False}
    structure = freesasa.Structure(outpath, classifier=dc, options=myopt)
    structure.setRadiiWithClassifier(dc)

    result = freesasa.calc(structure).totalArea()
    this_run.area = result
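DerivedClassifierT is defined elsewhere in that project. As a rough sketch of what such a classifier can look like, freesasa.Classifier can be subclassed in pure Python with classify and radius methods (derived classifiers set a purePython flag); the class body below, including its atom classes and radii, is purely illustrative and not the project's actual classifier:

import freesasa

class DerivedClassifierT(freesasa.Classifier):
    # flag telling freesasa this classifier is implemented in pure Python
    purePython = True

    def classify(self, residueName, atomName):
        # coarse two-class assignment; a real classifier is finer grained
        return 'Polar' if atomName.strip()[0] in ('N', 'O') else 'Apolar'

    def radius(self, residueName, atomName):
        # crude element-based radii (illustrative values only)
        radii = {'H': 1.1, 'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8}
        return radii.get(atomName.strip()[0], 2.0)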
Example #5
def featurize(structure: Structure) -> list[Any]:
    """
    Calculates 3D ML features from the `structure`.
    """
    structure1 = freesasa.Structure(pdbpath)  # pdbpath must point to the PDB file behind 'structure' (defined elsewhere in the original module)
    result = freesasa.calc(structure1)
    area_classes = freesasa.classifyResults(result, structure1)

    Total_area = []
    Total_area.append(result.totalArea())

    Polar_Apolar = []

    for key in area_classes:
        # print( key, ": %.2f A2" % area_classes[key])
        Polar_Apolar.append(area_classes[key])
    # get all the residues
    residues = [res for res in structure.get_residues()]
    seq_length = []
    seq_length.append(len(residues))
    # calculate some random 3D features (you should be smarter here!)
    protein_length = residues[1]["CA"] - residues[-2]["CA"]
    angle = calc_dihedral(
        residues[1]["CA"].get_vector(),
        residues[2]["CA"].get_vector(),
        residues[-3]["CA"].get_vector(),
        residues[-2]["CA"].get_vector(),
    )
    # create the feature vector
    features = [Total_area, Polar_Apolar, protein_length, seq_length, angle]

    return features
Example #6
def sa_calc(polymer_pdb, radius):
    # a PDB file is needed for the surface-area calculation
    mol_file = Chem.MolFromMolFile(polymer_pdb)
    # hydrogens are absent in the mol file, so add them back with coordinates
    pdb_file = Chem.AddHs(mol_file, addCoords=True)
    # write the molecule to a PDB file with RDKit
    Chem.MolToPDBFile(pdb_file, out_dir+NAME+'_new.pdb')

    # freesasa's default options strip hydrogens, so enable them explicitly
    option_with_Hs =  {    'hetatm' : True,
                           'hydrogen' : True,
                           'join-models' : False,
                           'skip-unknown' : False,
                           'halt-at-unknown' : False    }

    # calculate solvent accessible surface area (probe radius = 1.4 Å or 3.6 Å)
    para = freesasa.Parameters()
    freesasa.Parameters.setProbeRadius(para, radius)
    # calculate sa for different type of polymers
    free_struct = freesasa.Structure(out_dir+NAME+'_new.pdb', options = option_with_Hs)
    free_calc = freesasa.calc(free_struct, para)
    total = free_calc.totalArea()
    # round to 4 decimals
    decimal = round(total, 4)
    print (f'Total SASA is {decimal} Å^2 when probe radius is {radius} Å.')
    atom_number = mol_file.GetNumAtoms()
    normalized_sa = round(decimal / atom_number, 4)

    # save data to a txt file
    with open(out_dir + 'Average surface area.txt', 'a+') as Asa:
        Asa.write(f'The normalized surface area of {NAME} is {normalized_sa} Å^2 '
                  f'with the probe size of {radius} Å.\n')
    print(f'Normalized solvent accessible surface area is {normalized_sa} Å^2 '
          f'with the probe size of {radius} Å.\n')
def get_area_classes(file):
    struct = freesasa.Structure(file)
    result = freesasa.calc(struct)
    area_classes = freesasa.classifyResults(result, struct)
    list_areas = [(list(area_classes.values())[0]),
                  (list(area_classes.values())[1]),
                  result.totalArea()]
    return list_areas
def calculate_SAS(temp_dict, pdb_path, seq_len):
    struct = freesasa.Structure(str(pdb_path))
    result = freesasa.calc(struct)
    area_classes = freesasa.classifyResults(result, struct)
    polar = area_classes['Polar']
    apolar = area_classes['Apolar']
    sasa_fraction = (polar + apolar) / seq_len
    temp_dict.update({
        "Polar": polar,
        "Apolar": apolar,
        "SASA Fraction": sasa_fraction
    })
Example #9
def calcSASA(Latm, selection):
    """Calcule la surface accessible au solvent (SAS) des acides aminés de la selecion
	Retourne la SAS pour une sélection donnée
	"""
    freesasa.setVerbosity(1)
    structure = freesasa.Structure()
    for a in Latm:
        structure.addAtom(a.ty, a.resname, a.resN, a.chain, a.traj[0],
                          a.traj[1], a.traj[2])
    result = freesasa.calc(structure)
    selections = freesasa.selectArea((selection, 'all, resn ala'), structure,
                                     result)
    return selections[selection.split()[0][:-1]]  # key = selection name (the token before the comma)
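freesasa.selectArea takes Pymol-like selection strings of the form 'name, expression' and returns a dict keyed by the chosen names; a small sketch with a structure read from a file (the file name is a placeholder):

import freesasa

structure = freesasa.Structure('protein.pdb')  # placeholder path
result = freesasa.calc(structure)

selections = freesasa.selectArea(('alanines, resn ala',
                                  'chain_A, chain A'),
                                 structure, result)
print(selections['alanines'], selections['chain_A'])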
Example #10
def cal_sasa(prot, resilist):
    structure = freesasa.Structure(prot)
    result = freesasa.calc(structure)

    for i in range(len(resilist)):
        resi_ind = resilist[i]['resi_seq']
        chain = resilist[i]['chain']
        sasa_value = freesasa.selectArea(
            ('alanine, resn ala',
             'we, resi ' + str(resi_ind) + ' and chain ' + chain), structure,
            result)
        resilist[i]['SASA'] = sasa_value['we']
    return resilist
Example #11
def run_freesasa_biopython(pdb_path):
	global freesasa
	if freesasa is None:
		try:
			import freesasa
		except ImportError:
			raise RuntimeError("Cannot use this method. Please save the pdb file and rerun with docker")

	with silence_stdout(), silence_stderr():
		#Automatically removes hydrogens
		sasa_struct = freesasa.Structure(pdb_path)
		sasa = freesasa.calc(sasa_struct)

	return sasa, sasa_struct
Example #12
def _compute_asa(df):
    """Compute solvent-accessible surface area for provided strucutre."""
    bp = dt.df_to_bp(df)
    structure = freesasa.Structure(
        classifier=freesasa.Classifier.getStandardClassifier('naccess'),
        options={
            'hydrogen': True,
            'skip-unknown': True
        })
    for i, atom in df.iterrows():
        if atom['resname'] != 'UNK' and atom['element'] != 'H':
            structure.addAtom(atom['name'], atom['resname'], atom['residue'],
                              atom['chain'], atom['x'], atom['y'], atom['z'])
    result = freesasa.calc(structure)
    return result.totalArea()
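The standard classifiers can also be applied while parsing a PDB file directly instead of adding atoms one by one; a minimal sketch using the same 'naccess' classifier as above, with a placeholder file name:

import freesasa

classifier = freesasa.Classifier.getStandardClassifier('naccess')
structure = freesasa.Structure('protein.pdb',  # placeholder path
                               classifier,
                               options={'hydrogen': True, 'skip-unknown': True})
print(freesasa.calc(structure).totalArea())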
Example #13
    def get_attributes(self):
        # read pdb file
        with open(self.file_path, "r") as f:
            self.data = f.readlines()
        # calculate solvent access data
        try:
            self.solvent_access = fs.calc(fs.Structure(self.file_path))
        except Exception:
            raise
        self._clean_data()
        try:
            self._ca_attributes()
        except AssertionError:
            raise
        self._distance_to_others()
        self._find_in_range()
Example #14
    def _get_sasa(self):
        if freesasa is None:
            print "SASA not installed! SASA will be 0"
            return None, None
        if self.sasa is None:
            pdbfd, tmp_pdb_path = tempfile.mkstemp()
            with os.fdopen(pdbfd, 'w') as tmp:
                writePDBStream(tmp, self.structure)

            with silence_stdout(), silence_stderr():
                self.sasa_struct = freesasa.Structure(tmp_pdb_path)
                self.sasa = freesasa.calc(self.sasa_struct)

            os.remove(tmp_pdb_path)

        return self.sasa, self.sasa_struct
Example #15
def sasa_from_file(file: Union[str, pathlib.Path]) -> Sasa:
    """Get the freesasa.Result.residueAreas() dictionary
    obtained after parsing a PDB file to a freesasa.Structure
    and calling freesasa.calc() on it.
    """
    if isinstance(file, str):
        file = pathlib.Path(file)
    elif isinstance(file, pathlib.Path):
        pass
    else:
        raise TypeError(
            "Invalid argument type. File should be 'str' or pathlib.Path")

    if not file.exists():
        raise FileNotFoundError(
            f"File {file.absolute().as_posix()} does not exist.")

    _struct = freesasa.Structure(file.absolute().as_posix())
    _sasa = freesasa.calc(_struct)

    return ObjDict(_sasa.residueAreas())
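residueAreas() returns a nested mapping keyed first by chain label and then by residue number; each entry carries the per-residue area breakdown also used in Examples #16 and #26. A short sketch of reading it directly, with a placeholder path:

import freesasa

structure = freesasa.Structure('protein.pdb')  # placeholder path
result = freesasa.calc(structure)

for chain, residues in result.residueAreas().items():
    for resnum, area in residues.items():
        # each entry exposes total, polar, apolar, mainChain and sideChain areas
        print(chain, resnum, area.total, area.sideChain)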
Example #16
    def _get_scores(self, df, pdb_id, pdb_chain):
        sifts = get_sifts_alignment_for_chain(pdb_id, pdb_chain,
                                              self.sifts_directory,
                                              self.download_sifts)
        if sifts is None:
            scores = None
        else:
            df = pd.merge(df,
                          sifts,
                          left_on='residue',
                          right_on='uniprot position',
                          how='left')

            pdb_file_path = os.path.join(self.pdb_directory, pdb_id + '.pdb')
            if not os.path.isfile(pdb_file_path):
                # PDB file not already downloaded.
                if self.download_pdb_file:
                    download_pdb_file(pdb_id, self.pdb_directory)
                else:
                    raise LookupError(
                        "PDB file {} is not in the pdb_directory {}".format(
                            pdb_id, self.pdb_directory))

            structure = freesasa.Structure(pdb_file_path)
            result = freesasa.calc(structure, self.freesasa_parameters)
            chain_results = result.residueAreas()[pdb_chain]
            scores = np.full(len(df), np.nan)
            for i, residue in enumerate(df['pdb position']):
                if not np.isnan(residue):
                    try:
                        scores[i] = getattr(chain_results[str(int(residue))],
                                            self.metric)
                    except KeyError as e:
                        pass

        return scores
Example #17
def cb_sasas(design_pdb):
    sasa_dict = {} # key is resnum, value is list [aa, cbsasa]
    
    # parse design and do freesasa calc
    prody_parsed = pr.parsePDB(design_pdb, altloc='A', model=1)
    fs_struct = freesasa.Structure(design_pdb) # more atoms 
    fs_result = freesasa_cb(prody_parsed, probe_radius=3) # less atoms bc this is Cb cutoff 

    # get sasa for each resnum
    chain_start = min(prody_parsed.getResnums())
    chain_end = max(prody_parsed.getResnums())
    for resnum in range(chain_start,chain_end+1):
        # get Cb atoms 
        prody_pdb_bb_cb_atom_ind = prody_parsed.select('protein and (backbone or name CB) and \
            not element H D').getIndices()
        # get Cb atoms for resnum
        sele = prody_parsed.select('protein and (backbone or name CB) and resnum ' + str(resnum) \
            + ' and not element H D')
        resname = sele.getResnames()[0]
        bb_cb_atom_ind = sele.getIndices() 
        sasa_3A_probe = '{0:.2f}'.format(sum(fs_result.atomArea(i) for i in \
            np.where(np.in1d(prody_pdb_bb_cb_atom_ind,bb_cb_atom_ind))[0]))
        sasa_dict[int(resnum)] = [resname, float(sasa_3A_probe)]
    return sasa_dict
def surface_list(file1):

    maximum_area = {
        'ALA': 120.56,
        'CYS': 143.79,
        'ASP': 157.04,
        'GLU': 188.42,
        'PHE': 227.46,
        'GLY': 89.41,
        'HIS': 200.14,
        'ILE': 96.42,
        'LYS': 213.74,
        'LEU': 206.32,
        'MET': 216.63,
        'ASN': 149.85,
        'PRO': 155.07,
        'GLN': 186.83,
        'ARG': 229.51,
        'SER': 128.27,
        'THR': 138.58,
        'VAL': 169.82,
        'TRP': 269.35,
        'TYR': 241.54
    }

    global chain_A
    global chain_B

    surface_list_a1 = []
    surface_list_b1 = []

    structure = freesasa.Structure(file1)
    result = freesasa.calc(structure)

    for residue1 in chain_A.get_residues():
        try:
            res_id = residue1["CA"].get_full_id()[3][1]
            select_word = str(res_id) + ", " + "chain H and resi " + str(
                res_id)
            selections = freesasa.selectArea((select_word, ), structure,
                                             result)
            for key in selections:
                if float('%.3f' % (selections[key] / maximum_area[chain_A[
                        residue1.get_full_id()[3][1]].get_resname()])) > 0.05:
                    surface_list_a1.append(res_id)
        except Exception:
            pass
        continue

    for residue2 in chain_B.get_residues():
        try:
            res_id = residue2["CA"].get_full_id()[3][1]
            select_word = str(res_id) + ", " + "chain L and resi " + str(
                res_id)
            selections = freesasa.selectArea((select_word, ), structure,
                                             result)
            for key in selections:
                if float('%.3f' % (selections[key] / maximum_area[chain_B[
                        residue2.get_full_id()[3][1]].get_resname()])) > 0.05:
                    surface_list_b1.append(res_id)
        except Exception:
            pass
        continue

    return surface_list_a1, surface_list_b1
Example #19
def sa_conformers(file_1, func_1, file_2, func_2, units, radius):
    # turn off cache
    stk.OPTIONS['cache'] = False
    
    # number of conformers
    N = 10
    """
    functional groups:
       ['diol'] and ['dibromine']/['difluorene']
       or
       ['bromine'] and ['bromine']/['iodine']
    """
    name_1 = file_1.replace('.mol', '')
    unit_1 = stk.StructUnit2(file_1, func_1)

    name_2 = file_2.replace('.mol', '')
    unit_2 = stk.StructUnit2(file_2, func_2)

    # make polymer
    NAME = name_1+'_'+name_2+'_AB_poly'
    print(f'Creating polymer: {NAME}')
    polymer = stk.Polymer([unit_1, unit_2], stk.Linear('AB', [0, 0], n=units, ends='h'))
    # write unoptimized structure
    polymer.write(NAME+'.mol')
    mol_polymer = rdkit.MolFromMolFile(NAME + '.mol')
    #print(f'{NAME} has {polymer.mol.get_no_atoms()} atoms!')
    print(f'Optimizing polymer {NAME} and saving {N} conformers')
    # clean molecule with ETKDG
    embedder = stk.UFF(use_cache=False)
    embedder.optimize(polymer, conformer=-1)
    # write optimized polymer to json
    polymer.dump(NAME+'_opt.json')
    polymer.write(NAME+'_opt.mol')
    # make N conformers of the polymer molecule
    etkdg = rdkit.ETKDGv2()
    etkdg.randomSeed = 1000
    etkdg.verbose = True
    etkdg.maxIterations = 200000
    cids = rdkit.EmbedMultipleConfs(
        mol=polymer.mol, 
        numConfs=N,
        params=etkdg
    )
    print(f'Made {len(cids)} conformers...')
    print(f'Warning! I have not implemented an optimization of the ETKDG cleaned polymers!')

    # iterate over conformers and save structure
    file_dir = '/home/fanyuzhao/Monomers/OH+F/dimer/conformers/'
    new_dir = file_dir+NAME+'_'+str(units)+'_'+str(radius)+'/'
    for cid in cids:
        # build directories
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        # write optimized polymer to mol
        polymer.write(new_dir+NAME+'_'+str(cid)+'_opt.mol', conformer=cid)
        # write optimized polymer to pdb
        polymer.write(new_dir+NAME+'_'+str(cid)+'_opt.pdb', conformer=cid)
        print(f'Done! {N} ETKDG conformers of polymer written to {NAME}_{N}_opt.mol/pdb')

    # PDB files written by stk cannot be read by freesasa,
    # so regenerate the PDB files with RDKit from the mol files
    for item in os.listdir(new_dir):
        if item.endswith('.mol'):
            file_pdb = item.replace('.mol', '')
            a = rdkit.MolFromMolFile(os.path.join(new_dir, item))
            # hydrogens are absent after conversion, so add them back with coordinates
            b = rdkit.AddHs(a, addCoords = True)
            rdkit.MolToPDBFile(b, new_dir + file_pdb + '_new.pdb')

    # calculate solvent accessible surface area (probe radius = 1.4 Å and 3.6 Å)
    # freesasa's default options strip hydrogens and ignore HETATM records,
    # so enable them explicitly here
    options_with_Hs =  {    'hetatm' : True,
                            'hydrogen' : True,
                            'join-models' : False,
                            'skip-unknown' : False,
                            'halt-at-unknown' : False    }

    sa_list = []
    pdb_list = []
    # loop all new pdb files
    for pdb in os.listdir(new_dir):
        if pdb.endswith("_new.pdb"):
            # use freesasa to calculate SASA
            para = freesasa.Parameters()
            freesasa.Parameters.setProbeRadius(para, radius)
            free_struct = freesasa.Structure(os.path.join(new_dir, pdb), options = options_with_Hs)
            free_calc = freesasa.calc(free_struct, para)
            total = free_calc.totalArea()
            # keep 4 decimals
            decimal = round(total, 4)
            sa_list.append(decimal)
            name_pdb = pdb.replace('.pdb', '')
            pdb_list.append(name_pdb)
    # calculate the average SASA over all conformers for this probe radius
    sa_average = round(sum(sa_list) / len(sa_list), 4)
    atom_number = mol_polymer.GetNumAtoms()
    normalized_sa = round(sa_average / atom_number, 4)
    with open (file_dir + 'Average surface area of conformers.txt', 'a+') as Asa:
        Asa.write(f'The normalized surface area of {NAME}_{units} is ' + str(normalized_sa) + ' Å^2 with the probe size of ' + str(radius) + f'Å and chain length of {units}.\n')
    print ('The average surface area of the conformers is ' + str(sa_average) + ' Å^2 with the probe size of ' + str(radius) + 'Å.')

    # save each pdb file name and its surface area to a csv table
    dic = {p: s for p, s in zip(pdb_list, sa_list)}
    csv_path = new_dir + 'Solvent accessible surface area of ' + NAME + '.csv'
    with open(csv_path, 'w') as csv_file:
        csv_file.write("Polymer_name, SASA\n")
        for polymer_name, sasa in dic.items():
            csv_file.write(polymer_name + "," + str(sasa) + "\n")
    print ('Normalized solvent accessible surface area is '+ str(normalized_sa) + ' Å^2 with the probe size of ' + str(radius) + 'Å.')
Example #20
    def __init__(self, comb, pdb_acc_code, chain, **kwargs):
        """ :comb: arg: instance of cls Comb with attributes pdbchain_dict, ifg_selection_info
        :pdb_acc_code: type: str: 4 character pdb accession code
        :param kwargs: 
            path_to_pdb
            path_to_dssp 
        """
        #search for acc code in input_dir_pdb from comb object.
        assert isinstance(pdb_acc_code,
                          str), 'PDB accession code needs to be a string'
        pdb_file = [
            file.name for file in os.scandir(comb.input_dir_pdb)
            if pdb_acc_code in file.name
        ]
        try:
            if pdb_file:
                pdb_file = pdb_file[0]
                self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + pdb_file,
                                             altloc='A',
                                             model=1)
            elif 'path_to_pdb' in kwargs:
                self.prody_pdb = pr.parsePDB(kwargs.get('path_to_pdb'),
                                             altloc='A',
                                             model=1)
            else:  # NEED TO UPDATE: note if going to fetch pdb, it should be sent through Reduce first...
                try:
                    os.mkdir(comb.input_dir_pdb + 'raw')
                    os.mkdir(comb.input_dir_pdb + 'reduce')
                except:
                    pass
                pr.fetchPDB(pdb_acc_code,
                            compressed=False,
                            folder=comb.input_dir_pdb + 'raw')
                os.system(comb.path_to_reduce + comb.reduce +
                          ' -FLIP -Quiet -DB ' + comb.path_to_reduce +
                          'reduce_wwPDB_het_dict.txt ' + comb.input_dir_pdb +
                          'raw/' + pdb_acc_code.lower() + '.pdb > ' +
                          comb.input_dir_pdb + 'reduce/' +
                          pdb_acc_code.lower() + 'H.pdb')
                self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + 'reduce/' +
                                             pdb_acc_code.lower() + 'H.pdb',
                                             altloc='A',
                                             model=1)
        except NameError:
            raise NameError(
                'ParsePDB instance needs a pdb file path or a valid pdb accession code.'
            )

        self.pdb_acc_code = pdb_acc_code.lower()
        self.pdb_chain = chain
        if len(self.prody_pdb) == len(self.prody_pdb.select('icode _')) \
                and self.prody_pdb.select('protein and chain ' + self.pdb_chain) is not None:
            self.contacts = pr.Contacts(self.prody_pdb)
            self.set_bonds()

            if pdb_file:
                self.fs_struct = freesasa.Structure(comb.input_dir_pdb +
                                                    pdb_file)
            elif 'path_to_pdb' in kwargs:
                self.fs_struct = freesasa.Structure(kwargs.get('path_to_pdb'))
            else:
                path = comb.input_dir_pdb + 'reduce/'
                self.fs_struct = freesasa.Structure(path + next(
                    file.name for file in os.scandir(path)
                    if self.pdb_acc_code in file.name))

            self.fs_result = freesasa.calc(self.fs_struct)

            self.fs_result_cb_3A = self.freesasa_cb(probe_radius=3)
            self.fs_result_cb_4A = self.freesasa_cb(probe_radius=4)
            self.fs_result_cb_5A = self.freesasa_cb(probe_radius=5)
            self.prody_pdb_bb_cb_atom_ind = self.prody_pdb.select(
                'protein and (backbone or name CB) '
                'and not element H D').getIndices()

            dssp_file = [
                file.name for file in os.scandir(comb.input_dir_dssp)
                if pdb_acc_code in file.name
            ]
            if dssp_file:
                dssp_file = dssp_file[0]
                self.dssp = pr.parseDSSP(comb.input_dir_dssp + dssp_file,
                                         self.prody_pdb)
            elif 'path_to_dssp' in kwargs:
                self.dssp = pr.parseDSSP(kwargs.get('path_to_dssp'),
                                         self.prody_pdb)
            else:
                if pdb_file:
                    pr.execDSSP(comb.input_dir_pdb + pdb_file,
                                outputdir=comb.input_dir_dssp)
                elif 'path_to_pdb' in kwargs:
                    pr.execDSSP(kwargs.get('path_to_pdb'),
                                outputdir=comb.input_dir_dssp)
                else:
                    path = comb.input_dir_pdb + 'reduce/' + next(
                        file.name
                        for file in os.scandir(comb.input_dir_pdb + 'reduce')
                        if pdb_acc_code in file.name)
                    pr.execDSSP(path, outputdir=comb.input_dir_dssp)

                self.dssp = pr.parseDSSP(
                    comb.input_dir_dssp +
                    next(file.name for file in os.scandir(comb.input_dir_dssp)
                         if pdb_acc_code in file.name), self.prody_pdb)
            self.possible_ifgs = self.find_possible_ifgs(comb)
        else:
            self.possible_ifgs = None
        # valence and hydrogen bond data for vandermers and iFGs of ParsedPDB protein instance
        # iFG specific:
        self._ifg_pdb_info = []
        self._ifg_atom_density = []
        self._ifg_contact_water = []
        self._ifg_contact_ligand = []
        self._ifg_contact_metal = []
        # vdM specific:
        self._vdm_pdb_info = []
        self._vdm_sasa_info = []
        self._ifg_contact_vdm = []
        self._ifg_hbond_vdm = []
        self._ifg_hbond_water = []
        self._ifg_hbond_ligand = []
        self._ifg_ca_hbond_vdm = []
Example #21
def openfile():
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())

    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    for i in range(0, f):
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)

    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    logacc = [math.log(y, 10) for y in l]

    print(logacc)
Example #22
    def handle(self, *args, **options):

        failed = []

        # get preferred chain for PDB-code
        references = Structure.objects.all().prefetch_related(
            'pdb_code', 'pdb_data')

        for reference in references:

            preferred_chain = reference.preferred_chain.split(',')[0]
            pdb_code = reference.pdb_code.index
            try:
                print(pdb_code)
                if "refined" in pdb_code:
                    failed.append(pdb_code)
                    continue
                #structure = self.load_pdb(pdb_code)
                structure = self.load_pdb_var(pdb_code, reference.pdb_data.pdb)
                # grab residues with the generic numbering for this structure
                db_reslist = list(
                    Residue.objects.exclude(
                        generic_number__isnull=True).filter(
                            protein_conformation__protein=reference.
                            protein_conformation.protein).prefetch_related(
                                'generic_number'))

                #######################################################################
                ############################# filter  pdb #############################

                db_tmlist = [[] for i in range(TMNUM)]
                db_set = set()
                for r in db_reslist:
                    if r.generic_number.label[:2] in [
                            "1x", "2x", "3x", "4x", "5x", "6x", "7x"
                    ]:  # and r.generic_number in pchain
                        db_tmlist[int(r.generic_number.label[0]) - 1].append(
                            r.sequence_number)
                        db_set.add((' ', r.sequence_number, ' '))

                def recurse(entity, slist):
                    for subenty in entity.get_list():
                        if not subenty.id in slist[0]:
                            entity.detach_child(subenty.id)
                        elif slist[1:]:
                            recurse(subenty, slist[1:])

                recurse(structure, [[0], preferred_chain])
                hse_struct = deepcopy(structure)
                recurse(structure, [[0], preferred_chain, db_set])

                pchain = structure[0][preferred_chain]

                #######################################################################
                ############### Calculate the axes through the helices ################
                #######################################################################
                N = 3

                hres_list = [
                    np.asarray([pchain[r]["CA"].get_coord() for r in sl],
                               dtype=float) for sl in db_tmlist
                ]
                h_cb_list = [
                    np.asarray([
                        pchain[r]["CB"].get_coord()
                        if "CB" in pchain[r] else np.array([None, None, None])
                        for r in sl
                    ],
                               dtype=float) for sl in db_tmlist
                ]

                # fast and fancy way to take the average of N consecutive elements
                hres_three = np.asarray([
                    sum([h[i:-(len(h) % N) or None:N] for i in range(N)]) / N
                    for h in hres_list
                ])
                helices_mn = np.asarray(
                    [np.mean(h, axis=0) for h in hres_three])
                self.save_pseudo(hres_three, pdb_code + "helper")

                #######################################################################
                ################################# PCA #################################
                #######################################################################

                def pca_line(pca, h, r=0):
                    if ((not r) if pca.fit_transform(h)[0][0] < 0 else r):
                        return pca.inverse_transform(
                            np.asarray([[-20, 0, 0], [20, 0, 0]]))
                    else:
                        return pca.inverse_transform(
                            np.asarray([[20, 0, 0], [-20, 0, 0]]))

                helix_pcas = [PCA() for i in range(7)]
                pos_list = np.asarray([
                    pca_line(helix_pcas[i], h, i % 2)
                    for i, h in enumerate(hres_three)
                ])
                #self.write_cgo_arrow_pml(pdb_code, "pca",pos_list)

                pos_list = np.mean(pos_list, axis=0)
                #self.write_cgo_arrow_pml(pdb_code, "pca_mean",[pos_list])

                pca = PCA()
                pos_list = pca_line(pca, np.vstack(hres_three))
                #self.write_cgo_arrow_pml(pdb_code, "pca_all",[pos_list])

                pos_list = np.asarray([
                    pca_line(PCA(), h[:len(h) // 2:(-(i % 2) or 1)])
                    for i, h in enumerate(hres_three)
                ])
                pos_list = pos_list - (np.mean(pos_list, axis=1) -
                                       helices_mn).reshape(-1, 1, 3)
                #self.write_cgo_arrow_pml(pdb_code, "pca_extra",pos_list)
                #self.write_cgo_arrow_pml(pdb_code, "pca_extra_mean",[np.mean(pos_list,axis=0)])

                pca_extra = PCA()
                pos_list = pca_line(pca_extra, np.vstack(pos_list))

                #self.write_cgo_arrow_pml(pdb_code, "pca_extra_pca",[pos_list])

                #######################################################################
                ################################ Angles ###############################
                #######################################################################

                def calc_angle(b, c):
                    ba = -b
                    bc = c + ba
                    ba[:, 0] = 0
                    return np.degrees(
                        np.arccos(
                            inner1d(ba, bc) / (np.linalg.norm(ba, axis=1) *
                                               np.linalg.norm(bc, axis=1))))

                def ca_cb_calc(i, pca):
                    fin = np.isfinite(h_cb_list[i][:, 0])
                    return calc_angle(pca.transform(hres_list[i][fin]),
                                      pca.transform(h_cb_list[i][fin]))

                def axes_calc(i, pca_list, pca):
                    p = pca_list[i]
                    h = hres_list[i]
                    a = (np.roll(np.vstack((h, h[0])), 1, axis=0)[:-1] + h +
                         np.roll(np.vstack((h, h[-1])), -1, axis=0)[:-1]) / 3
                    b = p.transform(h)
                    b[:, 1:] = p.transform(a)[:, 1:]
                    b = p.inverse_transform(b)
                    return calc_angle(pca.transform(b), pca.transform(h))

                def set_bfactor(structure, angles):
                    for r, an in zip(structure[0][preferred_chain].get_list(),
                                     angles):
                        for a in r:
                            a.set_bfactor(an)

                centerpca = pca

                ########################### Axis to CA to CB ##########################

                tv = np.isfinite(np.concatenate(h_cb_list)[:, 0])
                angle = np.full_like(tv, -1, dtype=float)
                angle[tv] = np.concatenate(
                    [ca_cb_calc(i, centerpca) for i in range(TMNUM)])
                set_bfactor(structure, angle)

                self.save_pdb(structure, pdb_code + 'angle_colored_ca_cb.pdb')

                ######################### Axis to Axis to CA ##########################

                angle2 = np.concatenate([
                    axes_calc(i, helix_pcas, centerpca) for i in range(TMNUM)
                ])

                set_bfactor(structure, angle2)
                self.save_pdb(structure, pdb_code + 'angle_colored_axes.pdb')

                ### ASA

                pdbstruct = freesasa.Structure("pymol_output/" + pdb_code +
                                               'angle_colored_axes.pdb')
                res = freesasa.calc(pdbstruct)

                asa_list = []
                oldnum = -1
                for i in range(res.nAtoms()):
                    resnum = pdbstruct.residueNumber(i)
                    if resnum == oldnum:
                        asa_list[-1] += res.atomArea(i)
                    else:
                        asa_list.append(res.atomArea(i))
                        oldnum = resnum

                set_bfactor(structure, asa_list)
                self.save_pdb(structure, pdb_code + 'asa_colored.pdb')

                reslist = [r.id[1] for r in pchain.get_list()]

                ### HSE
                model = hse_struct[0]
                exp_ca = pdb.HSExposure.HSExposureCA(model)
                [[a.set_bfactor(x[1][1]) for a in x[0]] for x in exp_ca]
                recurse(hse_struct, [[0], preferred_chain, db_set])
                r = [x[0] for x in exp_ca]
                #x = model["A"].get_list()
                x = pchain.get_list()
                for r in (set(x) - set(r)):
                    for a in r:
                        a.set_bfactor(-1)

                exp_ca = [
                    a["CA"].get_bfactor()
                    for a in hse_struct[0][preferred_chain].get_list()
                ]
                self.save_pdb(hse_struct, pdb_code + 'hsea_colored.pdb')

                with open('pymol_output/' + pdb_code + '_measures.pickle',
                          'wb') as handle:
                    pickle.dump((np.array(reslist), np.array(asa_list),
                                 np.array(exp_ca), angle, angle2), handle)

            except Exception as e:
                print("ERROR!!", pdb_code, e)
                failed.append(pdb_code)
                continue

        print(len(failed), "of", len(references), "failed:", failed)
Example #23
def parse_pdb_coordinates(pdb_path: str,
                          start_position: int,
                          end_position: int,
                          position_correction: int,
                          chain: str,
                          sasa: bool = False) -> DataFrame:
    """
    Parse coordinate of CA atoms. Will also return the bfactor and SASA using freesasa.
    If PDB is missing atoms, it can handle it.
    """

    # Get structure from PDB
    structure = PDBParser().get_structure('pdb', pdb_path)

    coordinates = []
    commands = []
    bfactors = []
    positions_worked = []  # positions present in pdb

    # Iterate over each CA atom and get its coordinates
    for i in np.arange(start_position + position_correction,
                       end_position + position_correction):
        # first check if atom exists
        try:
            structure[0][chain][int(i)].has_id("CA")
            # Get the atom from the pdb and get its coordinates
            atom = list(structure[0][chain][int(i)]["CA"].get_vector()) + [i]
            coordinates.append(atom)
            # Get SASA command for each residue and bfactor
            residue = "s{}, chain {} and resi {}".format(str(i), chain, str(i))
            commands.append(residue)
            bfactor = (structure[0][chain][int(i)]["CA"].get_bfactor())
            bfactors.append(np.log10(bfactor))
            positions_worked.append(i)
        except:
            print("residue {} not found".format(str(i)))
            coordinates.append([np.nan, np.nan, np.nan, i])

    # Convert to df
    df_coordinates = DataFrame(columns=['x', 'y', 'z', 'Position'],
                               data=coordinates)

    # Center data
    x, y, z = centroid(df_coordinates)
    df_coordinates['x_cent'] = (df_coordinates['x'] - x).abs()**2
    df_coordinates['y_cent'] = (df_coordinates['y'] - y).abs()**2
    df_coordinates['z_cent'] = (df_coordinates['z'] - z).abs()**2
    df_coordinates['Distance'] = df_coordinates['x_cent'] + df_coordinates[
        'y_cent'] + df_coordinates['z_cent']

    # Add sasa values
    if sasa:
        # Get structure for SASA
        structure_sasa = freesasa.Structure(pdb_path)
        result = freesasa.calc(structure_sasa)
        # Calculate sasa
        sasa_area = freesasa.selectArea(commands, structure_sasa, result)
        df_sasa: DataFrame = DataFrame(columns=['SASA'],
                                       data=sasa_area.values())
        df_sasa['log B-factor'] = bfactors
        df_sasa['Position'] = positions_worked

        # Merge
        df_coordinates = df_coordinates.merge(df_sasa,
                                              how='outer',
                                              on='Position')

    return df_coordinates
Example #24
def get_DNA_H_SASA(pdb_file, csvfileout, chain=None, resids=[], seq=None, probe_radius=1.4, slicen=100, vdw_set=None, Hcontrib=[1.0]*7, n_threads=1, verbose=False):
	"""
	Function is a warapper to the FREESASA library to calculate the Surface Accessible Surface Area out
	atoms in pdb_file, then expreacts the SASA deoxiribose hydrogen atoms and sums it up
	for every nucleotide with coefficients Hcontrib.
	chain - name of the DNA chain of interest in pdb_file, if chain has no name leave blank ('')
	resids - a list of resids to calculate H-SASA values.
	seq - seqeunce of the DNA strand, string or biopython Seq object.
	Hcontrib - coefficients for individual SASA of deoxyribose hydrogens for summing them up into H-SASA profile,
		order [H1' H2' H2'' H3' H4' H5' H5'']
	Note: chain, resids, seq, Hcontrib - can be also a list of two or more instances,
			to make calculation for several chains, spans of resids or combinations of Hcontrib at once.
			In this case number of elements in chain, resids, Hcontrib should be the same,
			and the algorithm will iterate through all list simultaneously (i.e. no combination will be tried).
			Chains should be of the same length.
	probe_radius - size of probe to roll.
	slicen - number of slices per atom, controls precision of the calculation.
	vdw_set - seleting the set of VdW radii:
		None - default for FREESASA used
		charmm36-rmin - rmin from charmm36 forcefield
		abmer10-rmin - rmin from AMBER10 forcefield


	Return
	--------
	CSV file csvfileout with columns of H-SASA profiles along the sequence.
	"""
	chain = [chain] if isinstance(chain, str) else list(chain)
	if len(chain)>1:
		assert len(chain)==len(resids)
		assert len(chain)==len(seq)
		assert len(chain)==len(Hcontrib)
	else:
		resids=[resids]
		seq=[seq]
		Hcontrib=[Hcontrib]

	if not verbose:
		freesasa.setVerbosity(freesasa.nowarnings)
	hatoms = ['H1\'', 'H2\'', 'H2\'\'', 'H3\'', 'H4\'', 'H5\'', 'H5\'\'']

	if vdw_set=='charmm36-rmin':
		#Open config from package in a tricky way, independent of package installation mode
		temp2 = tempfile.NamedTemporaryFile(delete=False)
		conffile = pkgutil.get_data('hydroid', 'pkgdata/charmm36_rmin.config')
		temp2.write(conffile)
		temp2.seek(0)
		temp2.close()
		classifier = freesasa.Classifier(temp2.name)
		os.remove(temp2.name)
		####
		structure = freesasa.Structure(pdb_file,classifier, options={'hydrogen' : True,'hetatm' : True})
	elif vdw_set=='amber10-rmin':
		#Open config from package in a tricky way, independent of package installation mode
		temp2 = tempfile.NamedTemporaryFile(delete=False)
		conffile = pkgutil.get_data('hydroid', 'pkgdata/amber10_rmin.config')
		temp2.write(conffile)
		temp2.seek(0)
		temp2.close()

		classifier = freesasa.Classifier(temp2.name)
		os.remove(temp2.name)
		
		####
		structure = freesasa.Structure(pdb_file,classifier, options={'hydrogen' : True,'hetatm' : True})
	else:
		structure = freesasa.Structure(pdb_file, options={'hydrogen' : True,'hetatm' : True})
	print "Launching FreeSASA calculation..."
	result = freesasa.calc(structure,freesasa.Parameters({'algorithm' : freesasa.LeeRichards,'n-slices' : slicen,'probe-radius':probe_radius,'n-threads':n_threads}))
	# result = freesasa.calc(structure,freesasa.Parameters({'algorithm' : freesasa.ShrakeRupley,'n-slices' : slicen,'n-threads':n_threads}))
	print "Calculation done"
	
	print "Extracting SASA values ..."
	
	res=dict()
	
	for ch,rids,Hcont,i in zip(chain,resids,Hcontrib,range(len(chain))):
		res[i]=pd.Series()

		if (np.array(Hcont)==1.0).all():
		#simplified procedure, we can do it faster: we need to calculate all H-SASA at once
			sels=[]
			for resid in rids:
				if len(ch)>0:
					sels.append('%d,(chain %s) and (resi %s%d) and (name %s)'%(resid, ch,'\\' if resid<0 else '', resid, '+'.join(hatoms)))
				else:
					sels.append('%d,(resi %s%d) and (name %s)'%(resid,'\\' if resid<0 else '', resid, '+'.join(hatoms)))
			selections = freesasa.selectArea(sels,structure, result)
			res[i]=res[i].add(pd.Series(selections)*1.0,fill_value=0)
		else:
		#regular procedure
			for hat,hcont in zip(hatoms,Hcont):
				sels=[]
				if hcont!=0:
					for resid in rids:
						if len(ch)>0:
							sels.append('%d,(chain %s) and (resi %s%d) and (name %s)'%(resid, ch,'\\' if resid<0 else '', resid, hat))
						else:
							sels.append('%d,(resi %s%d) and (name %s)'%(resid,'\\' if resid<0 else '', resid, hat))
				selections = freesasa.selectArea(sels,structure, result)
				res[i]=res[i].add(pd.Series(selections)*float(hcont),fill_value=0)

	for i in range(len(chain)):
		res[i].index=res[i].index.map(int)
		res[i]=res[i].sort_index()
	if len(chain)==1:
		df=pd.DataFrame({'resid':res[0].index,'Site':['%d%s'%(n,l) for n,l in zip(range(1,1+len(seq[0])),seq[0])],'H-SASA':res[0].values})
	else:
		df=pd.DataFrame()
		for ch,i in zip(chain,range(len(chain))):
			# print res[i]
			# print seq[i]
			ndf=pd.DataFrame({'resid_%d'%i:res[i].index,'Site_%d'%i:['%d%s'%(n,l) for n,l in zip(range(1,1+len(seq[i])),seq[i])],'H-SASA_%d'%i:res[i].values})
			df=pd.concat([df,ndf],axis=1)
	print "Outputting H-SASA profile to %s"%csvfileout
	df.to_csv(csvfileout)
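A hedged usage sketch of the wrapper above; the file names, chain label, residue range and sequence are placeholders, and Hcontrib=[1.0]*7 simply sums all seven deoxyribose hydrogens:

# hypothetical call: H-SASA profile for residues 1-147 of DNA chain 'I'
get_DNA_H_SASA('nucleosome.pdb', 'h_sasa.csv',
               chain='I',
               resids=list(range(1, 148)),
               seq='A' * 147,
               probe_radius=1.4, slicen=100,
               vdw_set='charmm36-rmin',
               Hcontrib=[1.0] * 7)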
Example #25
def calculate_surface_points(receptor,
                             ligand,
                             num_points,
                             rec_translation,
                             surface_density,
                             seed=STARTING_POINTS_SEED,
                             has_membrane=False,
                             num_sphere_points=100):
    """Calculates the position of num_points on the surface of the given protein"""
    if num_points < 0:
        raise SetupError(
            "Invalid number of points to generate over the surface")

    receptor_atom_coordinates = receptor.representative(has_membrane)

    distances_matrix_rec = distance.pdist(receptor_atom_coordinates)
    receptor_max_diameter = np.max(distances_matrix_rec)
    distances_matrix_lig = distance.pdist(ligand.representative())
    ligand_max_diameter = np.max(distances_matrix_lig)
    surface_distance = ligand_max_diameter / 4.0

    # Surface
    pdb_file_name = Path(
        receptor.structure_file_names[receptor.representative_id])
    molecule = parsePDB(pdb_file_name).select('protein or nucleic')
    if has_membrane:
        pdb_no_membrane = str(
            pdb_file_name.absolute().parent /
            f"{pdb_file_name.stem}_no_membrane{pdb_file_name.suffix}")
        writePDB(pdb_no_membrane, molecule)
    surface = molecule.select('protein and surface or nucleic and name P')
    coords = surface.getCoords()

    # SASA
    if num_points == 0:
        if has_membrane:
            structure = freesasa.Structure(pdb_no_membrane)
        else:
            structure = freesasa.Structure(str(pdb_file_name))
        result = freesasa.calc(structure)
        total_sasa = result.totalArea()
        num_points = ceil(total_sasa / surface_density)

    # Surface clusters
    if len(coords) > num_points:
        # Extremely important to set seed in order to get reproducible results
        np.random.seed(seed)
        surface_clusters = kmeans2(data=coords,
                                   k=num_points,
                                   minit='points',
                                   iter=100)
        surface_centroids = surface_clusters[0]
    else:
        surface_centroids = coords

    # Create points over the surface of each surface cluster
    sampling = []
    for sc in surface_centroids:
        sphere_points = np.array(points_on_sphere(num_sphere_points))
        surface_points = sphere_points * surface_distance + sc
        sampling.append(surface_points)

    # Filter out not compatible points
    centroids_kd_tree = KDTree(surface_centroids)
    for i_centroid in range(len(sampling)):
        # print('.', end="", flush=True)
        centroid = surface_centroids[i_centroid]
        # Search for this centroid neighbors
        centroid_neighbors = centroids_kd_tree.query_ball_point(centroid,
                                                                r=20.)
        # For each neighbor, remove points too close
        for n in centroid_neighbors:
            points_to_remove = []
            if n != i_centroid:
                for i_p, p in enumerate(sampling[i_centroid]):
                    if np.linalg.norm(
                            p - surface_centroids[n]) <= surface_distance:
                        points_to_remove.append(i_p)
                points_to_remove = list(set(points_to_remove))
                sampling[i_centroid] = [sampling[i_centroid][i_p] \
                    for i_p in range(len(sampling[i_centroid])) if i_p not in points_to_remove]

    s = []
    for points in sampling:
        s.extend(points)

    # Final cluster of points
    if len(s) > num_points:
        # Extremely important to set seed in order to get reproducible results
        np.random.seed(seed)
        s_clusters = kmeans2(data=s, k=num_points, minit='points', iter=100)
        s = s_clusters[0]

    for p in s:
        p += rec_translation

    return s, receptor_max_diameter, ligand_max_diameter
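For example, when num_points is 0 and freesasa reports a total SASA of 15,000 Å², a surface_density of 50 Å² per point gives ceil(15000 / 50) = 300 surface points.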
Example #26
def calculate_sasa(pdbfile, chain, multichain=True, relative_type='sidechain'):
    """

    :param pdbfile: String of PDB file name.
    :param chain: String or List of chain identifiers.
    :param multichain: Boolean. True to separate chains. This allows SASA calculation for a single unattached monomer.
    False if you want to calculate SASA for the structure 'as-is'.
    :return: Pandas Dataframe of residue number, types, and sasa values as columns.
    """
    import freesasa as fs
    dict_max_acc = {
        # Miller max acc: Miller et al. 1987 https://doi.org/10.1016/0022-2836(87)90038-6
        # Wilke: Tien et al. 2013 https://doi.org/10.1371/journal.pone.0080635
        # Sander: Sander & Rost 1994 https://doi.org/10.1002/prot.340200303
        "Miller": {
            "ALA": 113.0,
            "ARG": 241.0,
            "ASN": 158.0,
            "ASP": 151.0,
            "CYS": 140.0,
            "GLN": 189.0,
            "GLU": 183.0,
            "GLY": 85.0,
            "HIS": 194.0,
            "ILE": 182.0,
            "LEU": 180.0,
            "LYS": 211.0,
            "MET": 204.0,
            "PHE": 218.0,
            "PRO": 143.0,
            "SER": 122.0,
            "THR": 146.0,
            "TRP": 259.0,
            "TYR": 229.0,
            "VAL": 160.0,
        },
        "Wilke": {
            "ALA": 129.0,
            "ARG": 274.0,
            "ASN": 195.0,
            "ASP": 193.0,
            "CYS": 167.0,
            "GLN": 225.0,
            "GLU": 223.0,
            "GLY": 104.0,
            "HIS": 224.0,
            "ILE": 197.0,
            "LEU": 201.0,
            "LYS": 236.0,
            "MET": 224.0,
            "PHE": 240.0,
            "PRO": 159.0,
            "SER": 155.0,
            "THR": 172.0,
            "TRP": 285.0,
            "TYR": 263.0,
            "VAL": 174.0,
            "MSE": 224.0,
            "SEC": 167.0,
        },
        "Sander": {
            "ALA": 106.0,
            "ARG": 248.0,
            "ASN": 157.0,
            "ASP": 163.0,
            "CYS": 135.0,
            "GLN": 198.0,
            "GLU": 194.0,
            "GLY": 84.0,
            "HIS": 184.0,
            "ILE": 169.0,
            "LEU": 164.0,
            "LYS": 205.0,
            "MET": 188.0,
            "PHE": 197.0,
            "PRO": 136.0,
            "SER": 130.0,
            "THR": 142.0,
            "TRP": 227.0,
            "TYR": 222.0,
            "VAL": 142.0,
        },
    }
    theoreticalMaxASA = dict_max_acc["Wilke"]

    # Calculates SASA for unseparated chains.
    if not multichain:
        structure = fs.Structure(pdbfile)
    else:
        # Separate chains if multichain structure. This allows SASA calculation for a single unattached monomer.
        structures = fs.structureArray(pdbfile, options={"separate-chains": True})
        chains = []
        for c in range(len(structures)):
            chains.append(structures[c].chainLabel(1))
        structure = structures[chains.index(chain)]
        print("using {} separating chains {}".format(chains.index(chain), chains))

    print("Number of atoms of {}: {}".format(pdbfile, structure.nAtoms()))
    result = fs.calc(structure, fs.Parameters({'algorithm': fs.ShrakeRupley, 'n-points': 10000}))
    res = result.residueAreas()
    residue = []
    resnum = []
    total = []
    apolar = []
    mainchain = []
    sidechain = []
    ratio = []

    for idx, v in res[chain].items():
        residue.append(v.residueType)
        resnum.append(v.residueNumber)
        total.append(v.total)
        apolar.append(v.apolar)
        mainchain.append(v.mainChain)
        sidechain.append(v.sideChain)
        if v.residueType == 'GLY':
            ratio.append(100 * v.mainChain / theoreticalMaxASA[v.residueType])
        elif v.residueType not in theoreticalMaxASA.keys():
            possibleSASA = []
            for i, maxSASA in enumerate(theoreticalMaxASA.values()):
                # If the residue is unknown but has a SASA,
                # calculate the rSASA dividing by theoretical maxSASA and then use the average of that value
                possibleSASA.append(100 * v.sideChain / maxSASA)
            ratio.append(np.average(possibleSASA))
        else:
            if relative_type == 'sidechain':
                ratio.append(100 * v.sideChain / theoreticalMaxASA[v.residueType])
            else:
                ratio.append(100 * v.total / theoreticalMaxASA[v.residueType])

        # if v.hasRelativeAreas:
        #     ratio.append(v.relativeSideChain)
        # else:
        #     ratio.append(np.nan)

    df_sasa = pd.DataFrame({'Residue': residue, 'Residue_num': resnum, 'Chain': chain, 'Total': total, 'Apolar': apolar,
                            'Backbone': mainchain, 'Sidechain': sidechain, 'Ratio': ratio})
    area_class = fs.classifyResults(result, structure)
    print("Total : %.2f A2" % result.totalArea())
    for key in area_class:
        print(key, ": %.2f A2" % area_class[key])

    return df_sasa
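# Hedged standalone sketch (not part of the original example) of the relative-SASA idea
# used above: divide each residue's side-chain area by the Wilke theoretical maximum to
# get a percentage. "1abc.pdb" and chain "A" are placeholder inputs, and WILKE_MAX is
# only an excerpt of the table defined earlier.
import freesasa

WILKE_MAX = {"ALA": 129.0, "GLY": 104.0, "LEU": 201.0, "TRP": 285.0}

demo_structure = freesasa.Structure("1abc.pdb")
demo_result = freesasa.calc(demo_structure)
for resnum, area in demo_result.residueAreas()["A"].items():
    max_asa = WILKE_MAX.get(area.residueType)
    if max_asa is not None:
        print(resnum, area.residueType, "%.1f%%" % (100.0 * area.sideChain / max_asa))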
import freesasa

savedData = open('SASA.txt', 'w+')
structure = freesasa.Structure("3lau.pdb")

result = freesasa.calc(
    structure,
    freesasa.Parameters({
        'algorithm': freesasa.LeeRichards,
        'n-slices': 100
    }))
print(result.nAtoms())

# freesasa atom indices are 0-based, so iterate over range(result.nAtoms())
for i in range(result.nAtoms()):
    details = '(' + structure.atomName(i) + ',' + str(result.atomArea(i)) + ')'
    print(details)
    savedData.write(details + '\n')
savedData.close()

area_classes = freesasa.classifyResults(result, structure)
print(area_classes)
print("Total : %.2f A2" % result.totalArea())
for key in area_classes:
    print(key, ": %.2f A2" % area_classes[key])
char_at_neutral = []
char_at_base = []

parser = argparse.ArgumentParser()
parser.add_argument("--infile", type=str, default="data/test.zip")
parser.add_argument("--model", type=str, default="model.pkl")
args = parser.parse_args()

#protein_parser = PDBParser()

# feature lists filled in the loop below
polar_area, apolar_area, total_area = [], [], []

with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)
        for test_pdb in tmpdir.path.glob("*.pdb"):
            struct = freesasa.Structure(str(test_pdb))
            result = freesasa.calc(struct)
            areas_classes = freesasa.classifyResults(result, struct)
            # index by class name rather than relying on dict value order
            polar_area.append(areas_classes["Polar"])
            apolar_area.append(areas_classes["Apolar"])
            total_area.append(result.totalArea())

print('done')
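# Hedged sketch (not in the original snippet): the three lists collected above could be
# assembled into a feature table, e.g. as input for the model referenced by args.model.
# The column names are illustrative assumptions.
import pandas as pd

df_features = pd.DataFrame({
    "polar": polar_area,
    "apolar": apolar_area,
    "total": total_area,
})
print(df_features.describe())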
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)
Exemplo n.º 29
0
    def handle(self, *args, **options):
        def recurse(entity, slist):
            """
            filter a pdb structure in a recursive way
            
            entity: the pdb entity, a structure should be given on the top level
            
            slist: the list of filter criteria, one entry per level.
            """
            # iterate over a copy, since detach_child mutates the underlying child list
            for subentity in list(entity.get_list()):
                if subentity.id not in slist[0]: entity.detach_child(subentity.id)
                elif slist[1:]: recurse(subentity, slist[1:])

        def cal_pseudo_CB(r):
            """
            Calculate a pseudo-CB position for glycine
            (adapted from the Biopython PDB FAQ)
            """
            a = r['CA'].get_vector()
            n = r['N'].get_vector() - a
            c = r['C'].get_vector() - a
            rot = pdb.rotaxis(-np.pi * 120.0 / 180.0, c)
            b = n.left_multiply(rot) + a
            return b.get_array()

        def pca_line(pca, h, r=0):
            """
            Fit a PCA to h and return the first principal component, as two
            endpoints transformed back to the original coordinate system
            """
            if ((not r) if pca.fit_transform(h)[0][0] < 0 else r):
                return pca.inverse_transform(
                    np.asarray([[-20, 0, 0], [20, 0, 0]]))
            else:
                return pca.inverse_transform(
                    np.asarray([[20, 0, 0], [-20, 0, 0]]))

        def calc_angle(b, c):
            """
            Calculate the angle between c, b and the orthogonal projection of b
            to the x axis.
            """
            ba = -b
            bc = c + ba
            ba[:, 0] = 0
            return np.degrees(
                np.arccos(
                    inner1d(ba, bc) /
                    (np.linalg.norm(ba, axis=1) * np.linalg.norm(bc, axis=1))))

        def ca_cb_calc(ca, cb, pca):
            """
            Calculate the angles between CA, CB and the center axis
            """
            return calc_angle(pca.transform(ca), pca.transform(cb))

        def axes_calc(h, p, pca):
            """
            Calculate the orthogonal projection of each CA onto the helix axis,
            with the axis shifted to the mean of three consecutive amino acids
            """
            a = (np.roll(np.vstack((h, h[0])), 1, axis=0)[:-1] + h +
                 np.roll(np.vstack((h, h[-1])), -1, axis=0)[:-1]) / 3
            b = p.transform(h)
            b[:, 1:] = p.transform(a)[:, 1:]
            b = p.inverse_transform(b)
            return calc_angle(pca.transform(b), pca.transform(h))

        def set_bfactor(chain, angles):
            """
            Simple helper that sets the B-factor of every atom in each residue
            to the corresponding value from a list
            """
            for r, an in zip(chain.get_list(), angles):
                for a in r:
                    a.set_bfactor(an)

        def qgen(x):
            """
            Helper that slices the residues belonging to one protein out of the
            combined residue list (qset) of all proteins
            """
            start = False
            for i in range(len(qset) - 1, 0, -1):
                if not start and qset[i].protein_conformation.protein == x:
                    start = i
                if start and qset[i].protein_conformation.protein != x:
                    if start != len(qset) - 1:
                        del qset[start + 1:]
                        return qset[i + 1:]
                    return qset[i + 1:]
            del qset[start + 1:]
            return qset

        failed = []

        # get preferred chain for PDB-code
        references = Structure.objects.filter(
            protein_conformation__protein__family__slug__startswith="001"
        ).exclude(refined=True).prefetch_related(
            'pdb_code', 'pdb_data',
            'protein_conformation').order_by('protein_conformation__protein')
        references = list(references)

        pids = [ref.protein_conformation.protein.id for ref in references]

        qset = Residue.objects.filter(
            protein_conformation__protein__id__in=pids)
        qset = qset.filter(
            generic_number__label__regex=r'^[1-7]x[0-9]+').order_by(
                '-protein_conformation__protein', '-generic_number__label')
        qset = list(
            qset.prefetch_related('generic_number', 'protein_conformation'))

        res_dict = {
            ref.pdb_code.index: qgen(ref.protein_conformation.protein)
            for ref in references
        }

        #######################################################################
        ######################### Start of main loop ##########################
        #######################################################################

        for reference in references:

            preferred_chain = reference.preferred_chain.split(',')[0]
            pdb_code = reference.pdb_code.index
            state_id = reference.protein_conformation.state.id

            try:

                print(pdb_code)

                structure = self.load_pdb_var(pdb_code, reference.pdb_data.pdb)
                pchain = structure[0][preferred_chain]

                #######################################################################
                ###################### prepare and evaluate query #####################

                db_reslist = res_dict[pdb_code]

                #######################################################################
                ######################### filter data from db #########################

                def reslist_gen(x):
                    try:
                        while db_reslist[-1].generic_number.label[0] == x:
                            yield db_reslist.pop()
                    except IndexError:
                        pass

                # when gdict is not needed the helper can be removed
                #db_tmlist = [[(' ',r.sequence_number,' ') for r in reslist_gen(x) if r.sequence_number in pchain and r.sequence_number < 1000] for x in ["1","2","3","4","5","6","7"]]
                db_helper = [[
                    (r.generic_number.label, r.sequence_number)
                    for r in reslist_gen(x)
                    if r.sequence_number in pchain and r.sequence_number < 1000
                ] for x in ["1", "2", "3", "4", "5", "6", "7"]]
                gdict = {r[1]: r[0] for hlist in db_helper for r in hlist}
                db_tmlist = [[(' ', r[1], ' ') for r in sl]
                             for sl in db_helper]
                db_set = set(db_tmlist[0] + db_tmlist[1] + db_tmlist[2] +
                             db_tmlist[3] + db_tmlist[4] + db_tmlist[5] +
                             db_tmlist[6])

                #######################################################################
                ############################# filter  pdb #############################

                recurse(structure, [[0], preferred_chain, db_set])

                #######################################################################
                ############### Calculate the axes through the helices ################
                #######################################################################
                N = 3

                hres_list = [
                    np.asarray([pchain[r]["CA"].get_coord() for r in sl],
                               dtype=float) for sl in db_tmlist
                ]
                h_cb_list = [
                    np.asarray([
                        pchain[r]["CB"].get_coord()
                        if "CB" in pchain[r] else cal_pseudo_CB(pchain[r])
                        for r in sl
                    ],
                               dtype=float) for sl in db_tmlist
                ]

                # fast and fancy way to take the average of N consecutive elements
                hres_three = np.asarray([
                    sum([h[i:-(len(h) % N) or None:N] for i in range(N)]) / N
                    for h in hres_list
                ])

                #######################################################################
                ################################# PCA #################################
                #######################################################################

                helix_pcas = [PCA() for i in range(7)]
                [
                    pca_line(helix_pcas[i], h, i % 2)
                    for i, h in enumerate(hres_three)
                ]

                # extracellular part
                if extra_pca:
                    helices_mn = np.asarray(
                        [np.mean(h, axis=0) for h in hres_three])
                    pos_list = np.asarray([
                        pca_line(PCA(), h[:len(h) // 2:(-(i % 2) or 1)])
                        for i, h in enumerate(hres_three)
                    ])
                    pos_list = pos_list - (np.mean(pos_list, axis=1) -
                                           helices_mn).reshape(-1, 1, 3)

                    pca = PCA()
                    pca_line(pca, np.vstack(pos_list))
                else:
                    pca = PCA()
                    pca_line(pca, np.vstack(hres_three))

                #######################################################################
                ################################ Angles ###############################
                #######################################################################

                ########################### Axis to CA to CB ##########################

                angle = np.concatenate([
                    ca_cb_calc(ca, cb, pca)
                    for ca, cb in zip(hres_list, h_cb_list)
                ])

                set_bfactor(pchain, angle)

                if print_pdb:
                    self.save_pdb(structure,
                                  pdb_code + 'angle_colored_ca_cb.pdb')

                ######################### Axis to Axis to CA ##########################

                angle2 = np.concatenate([
                    axes_calc(h, p, pca)
                    for h, p in zip(hres_list, helix_pcas)
                ])

                set_bfactor(pchain, angle2)
                if print_pdb:
                    self.save_pdb(structure,
                                  pdb_code + 'angle_colored_axes.pdb')

                ################################ SASA #################################
                if SASA:
                    pdbstruct = freesasa.Structure("pymol_output/" + pdb_code +
                                                   'angle_colored_axes.pdb')
                    res = freesasa.calc(pdbstruct)

                    asa_list = []
                    oldnum = -1
                    for i in range(res.nAtoms()):
                        resnum = pdbstruct.residueNumber(i)
                        if resnum == oldnum:
                            asa_list[-1] += res.atomArea(i)
                        else:
                            asa_list.append(res.atomArea(i))
                            oldnum = resnum

                    set_bfactor(pchain, asa_list)
                    if print_pdb:
                        self.save_pdb(structure, pdb_code + 'asa_colored.pdb')

                ################################# HSE #################################
                if HSE:
                    hse = pdb.HSExposure.HSExposureCB(structure[0])
                    [[a.set_bfactor(x[1][1]) for a in x[0]] for x in hse]

                    if print_pdb:
                        self.save_pdb(structure, pdb_code + 'hsea_colored.pdb')

                ############################### pickle ################################
                if HSE and SASA:
                    reslist = []
                    grslist = []
                    hse = []
                    for r in pchain:
                        reslist.append(r.id[1])
                        grslist.append(gdict[r.id[1]])
                        hse.append(r["CA"].get_bfactor())
                    with open('pymol_output/' + pdb_code + '_measures.pickle',
                              'wb') as handle:
                        pickle.dump(
                            (np.array(reslist), grslist, np.array(asa_list),
                             np.array(hse), angle, angle2, state_id), handle)

                #Angle.objects.bulk_create([Angle(residue=gdict[res.id[1]], angle=res["CA"].get_bfactor(), structure=reference) for res in pchain])

            except Exception as e:
                print("ERROR!!", pdb_code, e)
                failed.append(pdb_code)
                continue

        print(len(failed), "of", len(references), "failed:", failed)
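# Hedged standalone sketch of the per-residue SASA aggregation used in the SASA block
# above: sum freesasa atom areas by (chain, residue number). "example.pdb" is a
# placeholder path, not a file from the original code.
import freesasa

demo_structure = freesasa.Structure("example.pdb")
demo_result = freesasa.calc(demo_structure)

per_residue = {}
for i in range(demo_result.nAtoms()):
    key = (demo_structure.chainLabel(i), demo_structure.residueNumber(i))
    per_residue[key] = per_residue.get(key, 0.0) + demo_result.atomArea(i)

for (chain_label, resnum), area in sorted(per_residue.items()):
    print(chain_label, resnum, "%.2f A2" % area)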
Exemplo n.º 30
0
    def _get_item_src(self, decoy):
        """
        decoy: str, path to the decoy
        """
        atom_to_num = {
            "C": 1,
            "N": 2,
            "O": 3,
            "S": 4
        }
        residues = []
        atom_positions = self.create_atom_positions()
        residue = self.build_residue()
        structure = fs.Structure(decoy)
        solvent_access = fs.calc(structure)
        with open(decoy, "r") as f:
            line = f.readline().rstrip()
            while not line.startswith("ATOM"):
                line = f.readline().rstrip()
            cur_resi = int(line[22:26])

            # PDB file standard format
            # COLUMNS   DATA  TYPE    FIELD
            # -------------------------------------------
            #  1 -  6   Record name   "ATOM  "
            #  7 - 11   Integer       Atom serial #
            # 13 - 16   Atom          Atom name
            # 17        Character     Alternate location
            # 18 - 20   Residue name  resName
            # 22        Character     chainID
            # 23 - 26   Integer       resSeq
            # 27        AChar         Code for insertion of residues
            # 31 - 38   Real(8.3)     x
            # 39 - 46   Real(8.3)     y
            # 47 - 54   Real(8.3)     z
            # 55 - 60   Real(6.2)     occupancy
            # 61 - 66   Real(6.2)     tempFactor
            # 77 - 78   LString(2)    element
            # 79 - 80   LString(2)    Charge  on the atom

            while line:
                if line.startswith("TER"):
                    break
                if not line.startswith("ATOM"):
                    line = f.readline().rstrip()
                    continue

                # ignore hydrogens
                atom_type = line[-1]
                if atom_type == "H":
                    line = f.readline().rstrip()
                    continue

                resi_num = int(line[22:26])
                if resi_num > cur_resi:
                    residues.append(residue)
                    if len(residues) == 400:
                        break
                    residue = self.build_residue()
                    cur_resi = resi_num
                residue = self._put_atom_src(
                    line.rstrip(), residue, solvent_access, atom_positions, atom_to_num)
                line = f.readline().rstrip()

        # normalize residues
        pc = np.ones((self.npoints, self.num_channel())) * float("-inf")
        residues = np.array(residues)
        logging.debug("decoy shape: {}".format(residues.shape))
        x_mean = np.mean(residues[:, 1])
        y_mean = np.mean(residues[:, 2])
        z_mean = np.mean(residues[:, 3])
        for i in range(self.num_channel() // self.ATTRIBUTES_EACH_ATOM):
            residues[:, self.ATTRIBUTES_EACH_ATOM*i+1] -= x_mean
            residues[:, self.ATTRIBUTES_EACH_ATOM*i+2] -= y_mean
            residues[:, self.ATTRIBUTES_EACH_ATOM*i+3] -= z_mean
        pc[0:residues.shape[0], :] = residues

        target_path = os.path.dirname(decoy)
        gdt_ts = 0.0
        with open(os.path.join(target_path, "list.dat"), "r") as lst:
            info = lst.readline()
            while info:
                if info.startswith(os.path.basename(decoy)):
                    gdt_ts = float(
                        info.split()[CASPDataset.list_dat["gdt_ts"]])
                    break
                info = lst.readline()

        return pc, gdt_ts
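# Hedged sketch of the fixed-column ATOM parsing documented in the comment block above.
# The slices follow the PDB standard quoted there; the helper name and returned field
# names are illustrative, not part of the original class.
def parse_atom_line(line):
    """Split one PDB ATOM record into its fixed-width fields."""
    return {
        "serial": int(line[6:11]),
        "name": line[12:16].strip(),
        "resName": line[17:20].strip(),
        "chainID": line[21],
        "resSeq": int(line[22:26]),
        "x": float(line[30:38]),
        "y": float(line[38:46]),
        "z": float(line[46:54]),
        "element": line[76:78].strip(),
    }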