示例#1
0
    def _calc_residue_dist(self, residue_one, residue_two, dist_atoms='CA'):
        """Return the distance between one selected atom of each residue.

        For each residue the atom named ``dist_atoms`` is used when the
        residue contains it; otherwise the residue's ``'CA'`` atom is used.

        Returns ``np.nan`` when either residue is rejected by
        ``Polypeptide.is_aa`` or when a selected atom cannot be looked up.
        """
        both_are_aa = (Polypeptide.is_aa(residue_one)
                       and Polypeptide.is_aa(residue_two))
        if not both_are_aa:
            return np.nan

        # Fall back to the C-alpha atom residue by residue.
        atom_name_one = 'CA' if dist_atoms not in residue_one else dist_atoms
        atom_name_two = 'CA' if dist_atoms not in residue_two else dist_atoms

        try:
            delta = (residue_one[atom_name_one].coord
                     - residue_two[atom_name_two].coord)
        except KeyError:
            # Not even the fallback atom exists in one of the residues.
            return np.nan
        return np.sqrt(np.sum(delta * delta))
def MutationsDict(file, positions=None):
    """Map each amino-acid residue of a pdb file to its point mutations.

    Residues for which ``pp.is_aa`` is false are ignored.

    Parameters:
        file (string): pdb file to get mutations from
        positions: list of tuples of the form (chain, first, last); only
                   residues of that chain numbered from first to last
                   (inclusive) are mutated. If None or empty, every residue
                   of every chain is mutated.

    Returns:
        dict with keys :aa:chain:position, each containing the list of
        :aa:chain:position:mutated_aa for all other amino acids

    Raises:
        StopIteration: if a chain id given in ``positions`` is not found in
        the model.

    """
    # One-letter amino acid codes, in the order of Bio.PDB.Polypeptide.aa1
    amino_acids = list(Bio.PDB.Polypeptide.aa1)
    # Model of the original pdb file
    model = it.Pmolecule(file).model
    mutations = dict()

    def _add_residue(residue, chain_id, bounds=None):
        # Store every substitution of one residue; with bounds=(first, last)
        # the residue is stored only if its number lies in that range.
        if not pp.is_aa(residue):
            return
        wild_type = pp.three_to_one(residue.get_resname())
        number = residue.id[1]
        key = wild_type + chain_id + str(number)
        if bounds is not None:
            first, last = bounds
            if number not in range(first, last + 1):
                return
        mutations[key] = [key + aa for aa in amino_acids if aa != wild_type]

    if positions:
        for chain_id, first, last in positions:
            # Pick the chain whose id matches the requested one
            target = next(candidate for candidate in model.get_chains()
                          if candidate.id == chain_id)
            for residue in target:
                _add_residue(residue, chain_id, (first, last))
    else:
        for chain in model.get_chains():
            for residue in chain:
                _add_residue(residue, chain.id)
    return mutations
示例#3
0
def get_chain_to_valid_residues(structure, pdb_name=None):
    """Return a list of (chain key, valid residues) tuples.

    ``structure`` is either a ``Bio.PDB.Structure.Structure`` or a tabular
    object exposing ``columns``, boolean indexing and ``groupby``
    (presumably a pandas DataFrame of atoms; confirm against the callers).

    For a Biopython structure a residue is valid when its name is a standard
    amino acid and it contains a ``'CA'`` atom; the key is
    ``(str(model.serial_num), chain_id)``. For the tabular form the C-alpha
    rows whose ``resname`` is not ``'UNK'`` are grouped by
    ``['model', 'chain']`` and the group key is used.

    When ``pdb_name`` is given it is prepended to every key. Chains without
    valid residues are omitted.
    """
    key_prefix = () if pdb_name is None else (pdb_name, )
    pairs = []

    if type(structure) is Bio.PDB.Structure.Structure:
        for model in structure:
            for chain in model:
                valid = [
                    res for res in chain
                    if poly.is_aa(res.get_resname(), standard=True)
                    and 'CA' in res
                ]
                if valid:
                    chain_key = key_prefix + (str(model.serial_num),
                                              chain.get_id())
                    pairs.append((chain_key, valid))
        return pairs

    # The two supported column layouts label the C-alpha atom differently.
    if 'atom_name' in structure.columns:
        name_column, ca_label = 'atom_name', 'CA'
    else:
        name_column, ca_label = 'maestro_atom_name', ' CA '
    calphas = structure[structure[name_column] == ca_label]
    calphas = calphas[calphas['resname'] != 'UNK']

    for group_key, group_rows in calphas.groupby(['model', 'chain']):
        rows = [row for _, row in group_rows.iterrows()]
        if rows:
            pairs.append((key_prefix + group_key, rows))
    return pairs
示例#4
0
	def definePeptideChain(self):
		"""Find the peptide chain and store its standard residues.

		If ``self.__table['chain_antigen'][0]`` is empty, the chain from
		``self.__chains`` whose first polypeptide (as built by ``PPBuilder``)
		is shortest is selected and written to the table; on ties the last
		such chain wins. Otherwise the chain stated in the table is used.

		The standard amino-acid residues of the selected chain are stored in
		``self.__peptide`` and under ``'peptide'`` in ``self.__regions_res``.

		Returns:
			1 on success; 0 (after reporting through ``self.printerr``) when
			no chain with a polypeptide is found or the selected chain holds
			more than 30 residues.
		"""
		chid = None
		if not self.__table['chain_antigen'][0]:
			# Numeric sentinel: ordering an int against a str raises
			# TypeError on Python 3.
			shortest = float('inf')
			builder = PPBuilder()
			for i in self.__chains:
				peptides = builder.build_peptides(self.__struct[0][i])
				if not peptides:
					# No polypeptide could be built for this chain.
					continue
				buf = len(peptides[0])
				if buf <= shortest:
					shortest = buf
					chid = i
			if chid is None:
				line = self.__name + '\t;NO PEPTIDE CHAIN FOUND :(\n'
				self.printerr('definePeptideChain(): ' + line)
				return 0
			# NOTE(review): this addresses the *row* labelled 'chain_antigen'
			# while the reads use the 'chain_antigen' *column* - confirm that
			# .loc[:, 'chain_antigen'] was not intended.
			self.__table.loc['chain_antigen', :] = chid
		else:
			chid = self.__table['chain_antigen'][0]

		pp = list(self.__struct[0][chid])

		if len(pp) > 30:
			line = self.__name + '\t;TOO MANY AMINO ACIDS (' + str(len(pp)) + ') TO BE A PEPTIDE :(\n'
			self.printerr('definePeptideChain(): ' + line)
			return 0

		pep_res = [r for r in pp if Polypeptide.is_aa(r.get_resname(), standard=True)]
		self.__peptide = pep_res
		self.__regions_res.update({'peptide': pep_res})
		return 1
def aa_to_index(aa):
    """
    Map a three-letter amino acid name to its BioPython index.

    :param aa: Three character amino acid name.
    :returns: Integer index as per BioPython; 20 for unknown/non-standard amino acids.
    """
    if not Polypeptide.is_aa(aa, standard=True):
        return 20
    return Polypeptide.three_to_index(aa)
示例#6
0
def get_chain_sequences(df):
    """Return a list of (id, sequence) tuples, one per chain in ``df``.

    Only rows whose ``name`` is ``'CA'`` and whose ``resname`` is a standard
    amino acid contribute. Rows are grouped by
    ``['ensemble', 'subunit', 'structure', 'model', 'chain']``; the id is the
    group key with every component converted to ``str`` and the sequence is
    the concatenation of the one-letter codes of the group's rows.

    The input dataframe is not modified. An input without rows yields ``[]``.
    """
    group_columns = ['ensemble', 'subunit', 'structure', 'model', 'chain']

    # Keep only CA of standard residues
    calphas = df[df['name'] == 'CA'].drop_duplicates()
    # astype(bool) keeps the mask boolean even when there are no rows.
    is_standard = calphas['resname'].map(
        lambda name: Poly.is_aa(name, standard=True)).astype(bool)
    calphas = calphas[is_standard]
    # assign() builds a new frame rather than writing into a filtered slice,
    # which would trigger pandas' SettingWithCopyWarning.
    calphas = calphas.assign(
        resname=[Poly.three_to_one(name) for name in calphas['resname']])

    return [
        (tuple(str(part) for part in key), ''.join(chain['resname']))
        for key, chain in calphas.groupby(group_columns)
    ]
示例#7
0
def standard_residue_filter(df):
    """Keep only the rows of ``df`` that belong to standard residues.

    The standard-residue test is evaluated once per distinct
    (structure, model, chain, residue, resname) combination and the verdict
    is then looked up for every row of ``df``.
    """
    key_columns = ['structure', 'model', 'chain', 'residue', 'resname']

    # One row per distinct residue.
    unique_residues = df[key_columns].drop_duplicates()
    unique_residues['to_keep'] = unique_residues['resname'].apply(
        lambda name: Poly.is_aa(name, standard=True))

    # Verdict indexed by the residue key, then expanded back to every row.
    verdict_by_key = unique_residues.set_index(key_columns)['to_keep']
    row_keys = df.set_index(key_columns).index
    row_verdicts = verdict_by_key.loc[row_keys]
    return df[row_verdicts.values]
示例#8
0
def get_all_chain_sequences_df(df):
    """Return a list of (struct_name, chain_sequences) tuples for ``df``.

    Only rows whose ``name`` is ``'CA'`` and whose ``resname`` is a standard
    amino acid contribute. ``struct_name`` is the
    ``(ensemble, subunit, structure)`` group key; ``chain_sequences`` is a
    list of ``((model, chain), sequence)`` tuples in which the sequence is
    the concatenation of the one-letter codes of that chain's rows.

    The input dataframe is not modified. An input without rows yields ``[]``.
    """
    # Keep only CA of standard residues
    calphas = df[df['name'] == 'CA'].drop_duplicates()
    # astype(bool) keeps the mask boolean even when there are no rows.
    is_standard = calphas['resname'].map(
        lambda name: Poly.is_aa(name, standard=True)).astype(bool)
    calphas = calphas[is_standard]
    # assign() builds a new frame rather than writing into a filtered slice,
    # which would trigger pandas' SettingWithCopyWarning.
    calphas = calphas.assign(
        resname=[Poly.three_to_one(name) for name in calphas['resname']])

    all_chain_sequences = []
    for struct_key, structure in calphas.groupby(
            ['ensemble', 'subunit', 'structure']):
        chain_sequences = [
            (chain_key, ''.join(chain['resname']))
            for chain_key, chain in structure.groupby(['model', 'chain'])
        ]
        all_chain_sequences.append((struct_key, chain_sequences))
    return all_chain_sequences
示例#9
0
def write_FASTAs(PDB_ID, chains):
    """Write one FASTA file per polypeptide chain and return the IDs written.

    ``chains`` maps a chain ID to a sequence of
    ``(resname, resseq, icode)`` tuples. A chain is written only when it has
    residues and the name of its first residue passes ``Polypeptide.is_aa``.
    Residue names for which ``Polypeptide.three_to_one`` raises ``KeyError``
    are written as ``'X'``.

    Each file is named ``<PDB_ID>_<chain_ID>.fasta`` (relative path) and the
    same ``<PDB_ID>_<chain_ID>`` string is used as the FASTA header.
    """
    written_IDs = []
    for chain_ID, residues in chains.items():
        # Skip empty chains and chains that do not start with an amino acid.
        if not residues or not Polypeptide.is_aa(residues[0][0]):
            continue

        record_ID = '{}_{}'.format(PDB_ID, chain_ID)
        written_IDs.append(record_ID)

        letters = []
        for resname, _resseq, _icode in residues:
            try:
                letter = Polypeptide.three_to_one(resname)
            except KeyError:
                letter = 'X'
            letters.append(letter)

        with open('{}.fasta'.format(record_ID), mode='w') as handle:
            handle.write('>{}\n'.format(record_ID))
            handle.write('{}\n'.format(''.join(letters)))
    return written_IDs
    def get_bfactors( self, chain_id ):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Standard scores (zero mean, unit variance) of the per-residue
                B-Factors of a chain in the first model, as an integer numpy array.

                Only residues accepted by Polypeptide.is_aa are considered.
                The B-Factor of a residue is the mean of the B-Factors of its atoms.
                The scores are computed with np.nanmean / np.nanstd on float32
                values and then converted with astype(int), which truncates
                towards zero.
                A chain without amino-acid residues yields an empty integer array.
        '''
        chain = self.structure.child_list[0].child_dict[chain_id]
        # Drop everything that is not an amino acid.
        residues = [res for res in chain.get_list() if Polypeptide.is_aa(res)]
        if not residues:
            # Nothing to normalise; also avoids numpy's empty-slice warning.
            return np.zeros(0, dtype=int)

        # Mean B-Factor per residue.
        mean_bfactors = np.zeros(len(residues), dtype=np.float32)
        for idx, residue in enumerate(residues):
            atoms = residue.get_list()
            mean_bfactors[idx] = sum(atom.bfactor for atom in atoms) / len(atoms)

        # Normalise to standard scores.
        mean = np.nanmean(mean_bfactors)
        std = np.nanstd(mean_bfactors)
        b_factors = (mean_bfactors - mean) / std
        # The builtin int replaces the np.int alias removed from NumPy.
        return b_factors.astype( int )
    def get_sequence( self, chain_id ):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                The amino acid sequence (single-letter alphabet) of the given
                chain in the first model, as a string. Residues rejected by
                Polypeptide.is_aa are skipped.
        '''
        chain = self.structure.child_list[0].child_dict[chain_id]
        return "".join(
            Polypeptide.three_to_one(residue.get_resname())
            for residue in chain
            if Polypeptide.is_aa(residue)
        )
示例#12
0
def clean_pdb(structure, pdb_name, clean_dir):
    '''
    Write a pdb file restricted by the SelectAA selector, provided the
    structure holds more than 30 standard amino-acid residues.

    The output path is clean_dir + pdb_name + '.clean.pdb'.
    Returns True when the file was saved, False otherwise.
    Called by: clean_pdb_files()
               clean_and_sort()
    '''
    clean_name = clean_dir + pdb_name + '.clean.pdb'
    standard_names = [
        res.resname for res in structure.get_residues()
        if bpp_poly.is_aa(res.get_resname(), standard=True)
    ]
    if len(standard_names) <= 30:
        return False

    io.set_structure(structure)
    io.save(clean_name, SelectAA())
    return True
示例#13
0
def standard_residue_filter(df):
    """
    Drop the atoms of non-standard residues.

    The standard-residue test runs once per distinct
    (structure, model, chain, residue, resname) combination; the outcome is
    then looked up for each row of ``df``.

    :param df: dataframe to filter against.
    :type df: atoms dataframe.

    :return: the rows of ``df`` whose residue is a standard residue.
    :rtype: atoms dataframe.
    """
    residue_key = ['structure', 'model', 'chain', 'residue', 'resname']

    distinct = df[residue_key].drop_duplicates()
    distinct['to_keep'] = distinct['resname'].apply(
        lambda resname: Poly.is_aa(resname, standard=True))

    # Series of verdicts addressed by the full residue key.
    keep_lookup = distinct.set_index(residue_key)['to_keep']
    per_row_key = df.set_index(residue_key).index
    keep_rows = keep_lookup.loc[per_row_key]
    return df[keep_rows.values]
示例#14
0
def extract_seqs(structure, defmodel):
    '''
    Uses Biopython to count the number of chains of the model whose id is
    defmodel and to extract each chain's sequence.

    Standard amino acids are written with their one-letter code, every other
    residue as 'X'.

    Returns (nchains, seqs, chain_ids), where seqs and chain_ids are parallel
    lists. When no model has id defmodel the result is (0, [], []).
    Called by: clean_and_sort()
    '''
    nchains = 0
    # Bound before the loop so a structure lacking the requested model
    # returns empty results instead of raising UnboundLocalError.
    seqs = []
    chain_ids = []
    for model in structure:
        if model.id != defmodel:
            continue
        for chain in model:
            nchains += 1
            resnames = (residue.get_resname() for residue in chain)
            seq = ''.join(
                bpp_poly.three_to_one(resname)
                if bpp_poly.is_aa(resname, standard=True) else 'X'
                for resname in resnames
            )
            seqs.append(seq)
            chain_ids.append(chain.id)
    return nchains, seqs, chain_ids
 def get_contact_map( self, chain_id ):
     '''
         Input:
             self: Use Biopython.PDB structure which has been stored in an object variable
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             A complete contact map for the given chain of the first model,
             as a square integer numpy array with one row and one column per
             residue accepted by Polypeptide.is_aa.
             Entry [i][j] is self.get_residue_distance(residue_i, residue_j),
             stored as float32 and converted with astype(int), which
             truncates towards zero.
             A chain without amino-acid residues yields an array of shape (0, 0).
     '''
     chain = self.structure.child_list[0].child_dict[chain_id]
     residues = [residue for residue in chain if Polypeptide.is_aa(residue)]
     # Size the matrix from the residues that are actually indexed below.
     length = len(residues)
     contact_map = np.zeros( (length, length), dtype=np.float32 )
     for row, residue_a in enumerate(residues):
         for col, residue_b in enumerate(residues):
             contact_map[row, col] = self.get_residue_distance(residue_a, residue_b)
     # The builtin int replaces the np.int alias removed from NumPy.
     return contact_map.astype( int )
示例#16
0
    def _aa_mask(self):
        """Return one boolean per residue of ``self._bio_chain``.

        The chain is wrapped in a ``Polypeptide.Polypeptide`` and each of its
        residues is tested with ``Polypeptide.is_aa``.
        """
        residues = Polypeptide.Polypeptide(self._bio_chain)
        return list(map(Polypeptide.is_aa, residues))
示例#17
0
def get_structure_seqrecords(model):
    """Build one SeqRecord per chain of a PDB model.

    Only standard amino acids are written with their one-letter code. A jump
    in residue numbering is filled with one "X" per skipped number, so
    non-standard residues (e.g. selenomethionine) and unresolved positions
    show up as "X" when the next standard residue is reached.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - HETATMs. Currently written as an "X", or unknown amino acid.

    Each record carries a ``structure_resnums`` letter annotation with the
    residue number of every letter; filled-in "X" positions get
    ``float("Inf")``.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords, one per chain, with the chain ID as record id

    """
    records = []

    for chain in model:
        previous_num = 0
        sequence = ''
        resnums = []

        for res in chain.get_residues():
            res_num = res.id[1]
            res_icode = res.id[2]

            # Non-standard residues are skipped here and become "X" when the
            # numbering gap is filled at the next standard residue.
            if not Polypeptide.is_aa(res, standard=True):
                continue

            one_letter = Polypeptide.three_to_one(res.get_resname())

            if res_num != previous_num + 1:
                if res_icode != ' ':
                    # Residue with an insertion code: written without gap
                    # filling.
                    sequence += one_letter
                    resnums.append(res_num)
                    previous_num = res_num + 1
                    continue
                gap = res_num - previous_num - 1
                sequence += 'X' * gap
                # Residue numbers for unresolved or nonstandard residues are Infinite
                resnums.extend([float("Inf")] * gap)

            sequence += one_letter
            resnums.append(res_num)
            previous_num = res_num

        record = SeqRecord(Seq(sequence, IUPAC.protein), id=chain.get_id())
        record.letter_annotations['structure_resnums'] = resnums
        records.append(record)

    return records
示例#18
0
def build_matrix(
        path: str,
        filename: str,
        truncate_log: Union[tqdm.tqdm, None] = None) -> BuildMatrixDict:
    """Build the input matrix for one protein.

    The chains of the first model are walked residue by residue and every
    residue takes one column of a 10 x 4000 matrix. For a standard amino
    acid the column holds its code, the rounded mean x/y/z of its atoms and
    the properties looked up in ``amino_acid``; any other residue leaves its
    column at 0. Filling stops at the first amino acid that falls outside the
    4000 columns, which is reported through ``truncate_log`` when given.

    Args:
        path: path of the pdb file.
        filename: name of the file (without extension).
        truncate_log: tqdm logger

    Returns:
        Build matrix dictionary: each matrix row wrapped in a one-element
        pyarrow array, stored under the key ``col_name[row]``.
    """
    PROTEIN_SEQ_MAX_LEN = 4000
    row_count = 10
    protein_matrix = [[0] * PROTEIN_SEQ_MAX_LEN for _ in range(row_count)]

    structure = PDBParser().get_structure(filename, path)
    first_model = list(structure.get_models())[0]
    chains = list(first_model.get_chains())

    col = 0

    try:
        for chain in chains:
            for residue in list(chain.get_residues()):
                resname = residue.get_resname()
                if Polypeptide.is_aa(resname, standard=True):
                    xs = []
                    ys = []
                    zs = []
                    for atom in list(residue.get_atoms()):
                        vec = atom.get_vector()
                        xs.append(vec[0])
                        ys.append(vec[1])
                        zs.append(vec[2])

                    # Residue position: rounded mean of its atom coordinates
                    x_pos = round(mean(xs))
                    y_pos = round(mean(ys))
                    z_pos = round(mean(zs))

                    # Properties are keyed by the one-letter code
                    aa = amino_acid[Polypeptide.three_to_one(resname)]

                    # The first write raises IndexError once col is past the
                    # last column, which ends the fill below.
                    protein_matrix[0][col] = aa["code"]
                    protein_matrix[1][col] = x_pos
                    protein_matrix[2][col] = y_pos
                    protein_matrix[3][col] = z_pos
                    protein_matrix[4][col] = aa["hydropathy"]
                    protein_matrix[5][col] = aa["hydropathy_index"]
                    protein_matrix[6][col] = aa["acidity_basicity"]
                    protein_matrix[7][col] = aa["mass"]
                    protein_matrix[8][col] = aa["isoelectric_point"]
                    protein_matrix[9][col] = aa["charge"]

                # Non-amino-acid residues also take a column; it stays 0.
                col += 1

    except IndexError:
        if truncate_log is not None:
            truncate_log.set_description_str(
                f"Protein {filename} is truncated.")

    # Prepare dict so it can be loaded into a vaex dataframe
    field_names = (
        "seq",
        "x_pos",
        "y_pos",
        "z_pos",
        "hydropathy",
        "hydropathy_index",
        "acidity_basicity",
        "mass",
        "isoelectric_point",
        "charge",
    )
    dic: BuildMatrixDict = {name: [[]] for name in field_names}

    for row_index, row in enumerate(protein_matrix):
        dic[col_name[row_index]] = pyarrow.array([list(row)])

    return dic
示例#19
0
def get_structure_seqs(pdb_file, file_type):
    """Get a dictionary of a PDB file's sequences.

    Only the first model is read. Standard amino acids are written with
    their one-letter code; a jump in residue numbering is filled with one
    "X" per skipped number, so non-standard residues (e.g. selenomethionine)
    and unresolved positions show up as "X" when the next standard residue
    is reached.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - HETATMs. Currently written as an "X", or unknown amino acid.

    Args:
        pdb_file: Path to PDB file
        file_type: not used by this function

    Returns:
        dict: Dictionary of:
        {chain_id: sequence}

    """

    # TODO: Please check out capitalization of chain IDs in mmcif files. example: 5afi - chain "l" is present but
    # it seems like biopython capitalizes it to chain L

    model = StructureIO(pdb_file).first_model

    sequences = {}

    for chain in model:
        sequence = ''
        previous_num = 0

        for res in chain.get_residues():
            # Non-standard residues are skipped here and become "X" when the
            # numbering gap is filled at the next standard residue.
            if not Polypeptide.is_aa(res, standard=True):
                continue

            residue_id = res.get_full_id()[3]
            res_num = residue_id[1]
            i_code = residue_id[2]
            one_letter = Polypeptide.three_to_one(res.get_resname())

            if res_num != previous_num + 1:
                if i_code != ' ':
                    # Residue with an insertion code: written without gap
                    # filling.
                    sequence += one_letter
                    previous_num = res_num + 1
                    continue
                sequence += 'X' * (res_num - previous_num - 1)

            sequence += one_letter
            previous_num = res_num

        sequences[chain.get_id()] = sequence

    return sequences
示例#20
0
def get_knots(pdb, cutoff, cluster_cutoff, genpdb, verbosity):
    '''
    Main routine, uses biopython and pandas to detect knots and cluster them
    through the implementation of the average linkage algorithm
    Called by: main()

    Parameters:
        pdb: name of the structure file. It is processed only when it ends in
             .ent, .pdb, .pdb1 (optionally followed by .gz) and does not start
             with 'CONTACTS-'.
        cutoff: radius of the neighbour search around every CA atom; CA pairs
                at most this far apart are reported as clashes.
        cluster_cutoff: clustering stops once the smallest remaining pairwise
                        distance is greater than this value.
        genpdb: when truthy and clashes were found, the residues involved are
                saved to 'CONTACTS-' + pdb.
        verbosity: forwarded to printv().

    Returns:
        (clusterdict, nknots, maxklength) when more than one CA atom takes
        part in a clash, where clusterdict maps residue number to cluster id.
        None in every other case (no clashes, 'CONTACTS-' file, or a name
        that is not a pdb-related format).

    Side effects:
        Prints progress to stdout and appends records to 'KnotScope.log'.
    '''
    pdb_suffixes = (".ent", ".pdb", ".ent.gz", ".pdb1", ".pdb1.gz", ".pdb.gz")
    if pdb.endswith(pdb_suffixes) and not pdb.startswith('CONTACTS-'):
        _, structure, _ = strtools.parse_pdb_structure(pdb)
        print(str('\n' + clrs['p'] + pdb + clrs['n']))
        with open('KnotScope.log', 'a') as log:
            log.write(str('\n[STRUCTURE],' + pdb + '\n'))
        # CA atoms of the standard amino acids of the first model
        mainchain = [
            atom for atom in bpp.Selection.unfold_entities(structure[0], 'A')
            if bpp_poly.is_aa(atom.get_parent(), standard=True) and (
                atom.id == 'CA')
        ]  # or atom.id == 'N' or atom.id == 'O')]
        contacts = []
        core = []
        # The neighbour index depends only on mainchain, so it is built once.
        # It is not built for an empty atom list, which the loop never uses.
        ns = bpp.NeighborSearch(mainchain) if mainchain else None
        for atom in mainchain:
            center = atom.get_coord()
            # Only partners more than 3 residue numbers further along are
            # kept, so a pair is reported at most once, from its
            # lower-numbered member.
            # NOTE(review): the difference is signed; confirm that
            # abs(difference) > 3 was not intended.
            neighbors = [
                neighbor for neighbor in ns.search(center, cutoff)
                if (neighbor.get_parent().id[1] -
                    atom.get_parent().id[1]) > 3
            ]
            for neighbor in neighbors:
                d = neighbor - atom
                if d <= cutoff:
                    printv(
                        clrs['y'] + 'Unlikely proximity' + clrs['n'] +
                        ' between residues ' + clrs['y'] +
                        str(atom.get_parent().id[1]) + clrs['n'] +
                        ' and ' + clrs['y'] +
                        str(neighbor.get_parent().id[1]) + clrs['n'] + '!',
                        verbosity)
                    printv(str(d), verbosity)
                    with open('KnotScope.log', 'a') as log:
                        log.write('[CLASH],' +
                                  str(atom.get_parent().id[1]) + ',' +
                                  str(neighbor.get_parent().id[1]) + ',' +
                                  str(d) + '\n')
                    contacts.append(neighbor.get_parent())
                    contacts.append(atom.get_parent())
                    if atom not in core:
                        core.append(atom)
                    if neighbor not in core:
                        core.append(neighbor)
        # Save contacts to pdb file if they exist
        if contacts and genpdb:
            io.set_structure(structure)
            io.save('CONTACTS-' + pdb, strtools.SelectResidues(contacts))
        # Start cluster analysis to separate knots
        pairwisedist = []
        # Measure pairwise distances of every CA involved in knots and record in vertical list
        if len(core) > 1:
            for a, b in it.combinations(core, 2):
                d = a - b
                pairwisedist.append(
                    [a.get_parent().id[1],
                     b.get_parent().id[1], d])
            # Add values for diagonal
            for entry in range(len(core)):
                line = make_diagonal(core, entry)
                pairwisedist.append([line[0], line[1], line[2]])
            # Create pandas dataframe, make it a square and symmetric matrix
            df = pd.DataFrame(pairwisedist, index=None, columns=None)
            df = pd.crosstab(index=df[0],
                             columns=df[1],
                             values=df[2],
                             aggfunc='sum',
                             dropna=True).fillna(0)
            df = df + df.T
            # Start average linkage algorithm
            reslist = list(df.columns)
            clusters = []
            row_index = -1
            col_index = -1
            # array[n] is the cluster id currently assigned to matrix index n
            array = list(range(df.shape[0]))
            clusters.append(array.copy())
            for k in range(1, df.shape[0]):
                # Locate the smallest entry of the distance matrix
                min_val = sys.maxsize
                for i in range(0, df.shape[0]):
                    for j in range(0, df.shape[1]):
                        if type(df.iloc[i, j]) != str:
                            if (df.iloc[i, j] <= min_val):
                                min_val = df.iloc[i, j]
                                row_index = i
                                col_index = j

                # Merge row_index into col_index: average their distances to
                # every other entry ...
                for i in range(0, df.shape[0]):
                    if (i != col_index and i != row_index):
                        temp = (df.iloc[col_index, i] +
                                df.iloc[row_index, i]) / 2
                        df.iloc[col_index, i] = temp
                        df.iloc[i, col_index] = temp
                # ... and overwrite the merged row/column with a large value
                for i in range(0, df.shape[0]):
                    df.iloc[row_index, i] = sys.maxsize
                    df.iloc[i, row_index] = sys.maxsize
                minimum = min(row_index, col_index)
                maximum = max(row_index, col_index)
                for n in range(len(array)):
                    if (array[n] == maximum):
                        array[n] = minimum
                clusters.append(array.copy())
                # Stop iterations when minimum pairwise distance in the matrix is greater than cluster_cutoff
                if min_val > cluster_cutoff:
                    break
            # Get the clusters from last iteration and 'count' elements
            clustered_res = clusters[-1]
            counter = collections.Counter(clustered_res)
            # Combine residue and cluster information and print them user-friendly
            clusterdict = dict(zip(reslist, clustered_res))
            print(clrs['y'] + '\nLikely ' + str(len(set(clusters[-1]))) +
                  ' knot(s) found in structure under chosen criteria...' +
                  clrs['n'])
            n = 0
            k_lengths = []
            for cl in set(clusterdict.values()):
                n += 1
                cluster_residues = []
                # The count is looked up by cluster id; the order of
                # Counter.values() need not match the set iteration order.
                print('\nKnot ' + clrs['y'] + str(n) + clrs['n'] +
                      ' (Cluster id: ' + str(cl) + ') involves ' + clrs['y'] +
                      str(counter[cl]) + clrs['n'] +
                      ' residues:')
                for res in clusterdict:
                    if clusterdict[res] == cl:
                        cluster_residues.append(res)
                k_lengths.append(len(cluster_residues))
                print(clrs['y'] + ', '.join([str(a)
                                             for a in cluster_residues]) +
                      clrs['n'])
                with open('KnotScope.log', 'a') as log:
                    log.write('[K' + str(n) + '-RES],' +
                              ','.join([str(a)
                                        for a in cluster_residues]) + '\n')
                    log.write('[K' + str(n) + '-LEN],' +
                              str(len(cluster_residues)) + '\n')
            if clusterdict:
                nknots = len(set(list(clusterdict.values())))
            else:
                nknots = 0
            maxklength = max(k_lengths)
            with open('KnotScope.log', 'a') as log:
                log.write('[SUM],str,' + pdb + ',ca_clash,' + str(len(core)) +
                          ',nknots,' + str(nknots) + ',maxklength,' +
                          str(maxklength) + '\n')
            return clusterdict, nknots, maxklength
        else:
            print(clrs['g'] + 'No CA distances under ' + str(cutoff) +
                  ' angstrons found' + clrs['n'] + '!\n')
            with open('KnotScope.log', 'a') as log:
                log.write('[SUM],str,' + pdb +
                          ',ca_clash,0,nknots,0,maxklength,0\n')
    elif pdb.startswith('CONTACTS-'):
        # Files written by this routine itself are skipped silently.
        pass
    else:
        print(clrs['y'] + pdb + clrs['n'] +
              ' not a pdb-related structure format.' + clrs['r'] +
              ' SKIPPING!' + clrs['n'])
def aa_to_index(aa):
    """Return the index of a standard amino acid name, or 20 otherwise.

    ``aa`` is tested with ``Polypeptide.is_aa(aa, standard=True)``; accepted
    names are converted with ``Polypeptide.three_to_index``.
    """
    is_standard = Polypeptide.is_aa(aa, standard=True)
    return Polypeptide.three_to_index(aa) if is_standard else 20
示例#22
0
 def accept_residue(self, residue):
     """Return 1 if the residue name is a standard amino acid, else 0."""
     is_standard = bpp_poly.is_aa(residue.get_resname(), standard=True)
     return 1 if is_standard else 0