示例#1
0
def slice_pdb( arguments ):
    """Write the selected residues of a pdb into a new "_sliced" pdb file.

    arguments is one tuple ( pdb, resnum_to_slice_dict, save_hetatm ):
        pdb                  - handed to pdb_util.create_xyzDict_bychain and
                               pdb_util.get_uniq_outpdbname (presumably the
                               path of the input pdb; confirm against caller)
        resnum_to_slice_dict - residue selection, resolved chain by chain with
                               pdb_util.get_resnums_from_resnum_dict
        save_hetatm          - forwarded to pdb_util.create_xyzDict_bychain

    Chains are emitted in the order of pdb_util.chnids, each selected chain is
    closed with a "TER" record and the file is closed with "END".

    Returns False, without writing a file, when a requested residue number is
    larger than the largest residue number of its chain.  A requested residue
    that is missing but lies within the chain's range only prints a warning.
    Returns None once the file has been written.

    Raises AssertionError when not a single residue was selected.
    """
    pdb, resnum_to_slice_dict, save_hetatm = arguments
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, save_hetatm )[1]

    out_chunks = []
    n_sliced = 0 # number of residues actually copied to the output
    for chain in pdb_util.chnids:
        if chain not in pdbline_dict:
            continue
        resnums = pdb_util.get_resnums_from_resnum_dict( resnum_to_slice_dict, pdbline_dict, chain )
        if not resnums:
            continue

        chain_lines = pdbline_dict[ chain ]
        for resnum in resnums:
            try:
                out_chunks.append( chain_lines[ resnum ] )
                n_sliced += 1
            except KeyError:
                if resnum <= max( chain_lines ):
                    # a hole inside the chain: tolerated
                    # (single-argument print: same output on python 2 and 3)
                    print( "WARNING: %s is not present in the pdb" % ( resnum, ) )
                    continue
                else:
                    # selection runs past the end of the chain: give up
                    print( "NOT" )
                    return False
        out_chunks.append( "TER\n" ) # end of a chain

    # Checked before "END" is appended: the original tested the final string,
    # which always contained "END\n", so the check could never fail.
    assert n_sliced, "nothing is going to be sliced, check your chain definition while selecting residues"

    out_chunks.append( "END\n" )

    outfn = pdb_util.get_uniq_outpdbname( pdb, "_sliced")
    with open( outfn, "w" ) as buf:
        buf.write( "".join( out_chunks ) )
示例#2
0
def trim_pdb( arguments ):
    """Remove the selected residues from a pdb and write a new "_trim" pdb file.

    arguments is one tuple ( pdb, resnum_to_trim_dict, save_hetatm ):
        pdb                 - handed to pdb_util.create_xyzDict_bychain and
                              pdb_util.get_uniq_outpdbname (presumably the
                              path of the input pdb; confirm against caller)
        resnum_to_trim_dict - residue selection, resolved chain by chain with
                              pdb_util.get_resnums_from_resnum_dict
        save_hetatm         - forwarded to pdb_util.create_xyzDict_bychain

    The output starts with a "REMARK trim residues:" line listing every
    removed residue as <resnum><chain>, then the remaining residues of each
    chain in the order of pdb_util.chnids (residues sorted, "TER" after each
    chain) and a final "END".

    Returns False, without writing a file, when a requested residue number is
    larger than the largest residue number of its chain.  A requested residue
    that is missing but lies within the chain's range only prints a warning.
    Returns None once the file has been written.
    """
    pdb, resnum_to_trim_dict, save_hetatm = arguments
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, save_hetatm )[1]

    trim_res_list = []
    for chain in sorted( pdbline_dict.keys() ):
        resnums = pdb_util.get_resnums_from_resnum_dict( resnum_to_trim_dict, pdbline_dict, chain )
        if not resnums:
            continue

        chain_lines = pdbline_dict[ chain ]
        # Take the chain's upper bound BEFORE deleting anything.  Computing it
        # after deletions made the warning/error decision depend on the order
        # of the deletions and crashed on max() of an emptied chain.
        last_resnum = max( chain_lines ) if chain_lines else None

        for resnum in resnums:
            try:
                del chain_lines[ resnum ]
                trim_res_list.append( str(resnum)+chain )
            except KeyError:
                if last_resnum is not None and resnum <= last_resnum:
                    # a hole inside the chain: tolerated
                    # (single-argument print: same output on python 2 and 3)
                    print( "WARNING: %s not in pdb" % ( resnum, ) )
                    continue
                else:
                    stderr.write("ERROR: residue %s not present in the file\n" %(str(resnum)+chain))
                    return False

    out_chunks = [ "REMARK trim residues: %s\n" % (" ".join( trim_res_list )) ]
    for chain in pdb_util.chnids:
        if chain in pdbline_dict:
            chain_lines = pdbline_dict[ chain ]
            out_chunks.extend( chain_lines[ rsn ] for rsn in sorted( chain_lines ) )
            out_chunks.append( "TER\n" )
    out_chunks.append( "END\n" )

    outfn = pdb_util.get_uniq_outpdbname(pdb, "_trim")
    with open( outfn, "w" ) as buf:
        buf.write( "".join( out_chunks ) )
示例#3
0
def whats_in_pdb( pdb ):
    """Print a summary of the chains and residues found in a pdb.

    For every chain (sorted by chain id) one line reports the smallest and
    largest residue number and the number of residues.  After that the result
    of pdb_util.detect_chainbreaks_in_pdb( pdb ) is printed, followed by a
    one-line summary with the chain count, the chain ids and the total number
    of residues.

    pdb is handed to pdb_util.create_xyzDict_bychain and
    pdb_util.detect_chainbreaks_in_pdb and echoed in the printed lines.
    Nothing is returned.
    """
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb )[1]
    chains = sorted( pdbline_dict.keys() )

    total_rsds = 0
    for chain in chains:
        res_list = pdbline_dict[ chain ]
        n_rsd = len( res_list )
        # single-argument print: same output on python 2 and 3
        print( "%s: chain:%s, from %4s to %4s, %4s rsds" %( pdb, chain, min( res_list ), max( res_list ), n_rsd ) )
        total_rsds += n_rsd

    # The chainbreak report is delegated entirely to pdb_util; the previous
    # local "first + count - 1 != last" flag was computed but never read.
    print( "" )
    print( "found chainbreak(s):" )
    print( pdb_util.detect_chainbreaks_in_pdb( pdb ) )
    print( "[Summary] %s: %s chains (%s) and %s rsds" %( pdb, len(chains), " ".join(chains), total_rsds ) )
def _gather_chain_lines( pdbline_dict, chain_ids ):
    """Return the pdb lines of the given chains as one string, residues sorted within each chain."""
    chunks = []
    for chain_id in chain_ids:
        residues = pdbline_dict[ chain_id ]
        chunks.extend( residues[ rsn ] for rsn in sorted( residues ) )
    return "".join( chunks )


def _write_chain_pdb( outfn, pdblines ):
    """Write pdblines followed by a "TER" record to outfn."""
    with open( outfn, "w" ) as outpdb:
        outpdb.write( pdblines )
        outpdb.write( "TER\n" )


def chain_extractor( pdb ):
    """Extract chains of a pdb into separate files named <pdbtag><delimiter><chain>.pdb.

    pdbtag is the basename of pdb up to its first ".pdb".  Behaviour is driven
    by the module-level opts:
        opts.chains      - empty: every chain of the pdb is written to its own
                           file.  Otherwise each entry is one selection; an
                           entry longer than one character (say "ABC") is a
                           batch whose chains are written, in the given order,
                           into a single file named after the whole entry.
        opts.delimiter   - placed between pdbtag and the chain id(s)
        opts.save_hetatm,
        opts.stripH      - forwarded to pdb_util.create_xyzDict_bychain

    Progress messages go to stderr; when extracting all chains the chain ids
    are additionally printed to stdout on one line.

    Returns False as soon as a requested chain is missing from the pdb (files
    of earlier selections have already been written), otherwise None.

    Raises AssertionError if pdb does not exist.
    """
    assert exists( pdb )
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, opts.save_hetatm, opts.stripH )[1]
    pdbtag = basename(pdb).split(".pdb")[0]

    if not opts.chains:
        # extract all chains, in sorted order so the output is deterministic
        stderr.write("extracting all chains from %s: " %( pdb ))
        all_chains = sorted( pdbline_dict )
        for chain in all_chains:
            _write_chain_pdb( pdbtag + opts.delimiter + chain + ".pdb",
                              _gather_chain_lines( pdbline_dict, [ chain ] ) )
        # single-argument print: same output on python 2 and 3
        print( " ".join( all_chains ) )
    else:
        for chain in opts.chains:
            stderr.write("extracting chain %s from %s\n" %( chain, pdb ))

            # a batch selection, say ABC, is split into its chain ids;
            # anything else is looked up as-is
            members = chain if len(chain) > 1 else [ chain ]

            # Only a missing chain is reported as such; any other failure now
            # propagates instead of being hidden by a bare except.
            for c in members:
                if c not in pdbline_dict:
                    stderr.write("ERROR: %s does not have chain %s\n" %(pdb, c))
                    return False

            _write_chain_pdb( pdbtag + opts.delimiter + chain + ".pdb",
                              _gather_chain_lines( pdbline_dict, members ) )
示例#5
0
def correct_alignment_using_pdb( alignment, sbj_pdbfile, remove_temppdb=True ):
    """ Fix an alignment issue caused by the gap-opening penalty in dynamic programming
    eg:
            123456789
            NKTTTTTKG <- ref_seq_aligned
            NK------G <- mistaken sbj_seq_aligned (doesn't like a gap)
            N------KG <- correct sbj_seq_aligned (from the pdb, KG are connected)
            1      23
            seq_map = { 1:1,
                        2:2,  <- should be 2:8
                        3:9 }

    alignment       - alignment[0] is the aligned reference sequence,
                      alignment[1] the aligned subject sequence; the whole
                      object is also handed to seq_mapping()
    sbj_pdbfile     - subject pdb; it is renumbered from 1 into "temp.pdb" in
                      the current directory by pdb_util.pdb_idx1
    remove_temppdb  - delete "temp.pdb" once it has been read

    Wherever the alignment puts a gap between two residues that are consecutive
    in the pdb and whose CA atoms are at most 4.5 apart, the first residue is
    moved to the reference position right before the second one.

    Returns ( ref_seq_aligned, corrected_sbj_seq_aligned ).  The corrected
    subject string is rebuilt from the reference characters at the aligned
    positions, with "-" everywhere else.

    Raises AssertionError if the pdb does not contain exactly one chain, or if
    the reference residues at the old and the corrected position differ.
    Raises KeyError if a pdb residue number is missing from the sequence map.
    """
    ref_seq_aligned = alignment[0]

    # to have a standard, number residues from 1 to the end, continuously
    pdb_util.pdb_idx1( sbj_pdbfile, "temp.pdb" )
    try:
        xyz_dict, pdbline_dict, _ = pdb_util.create_xyzDict_bychain( "temp.pdb" )
    finally:
        # clean up even when parsing fails, so no stale temp.pdb is left behind
        if remove_temppdb and os.path.exists( "temp.pdb" ):
            os.remove("temp.pdb")

    chains = list( pdbline_dict.keys() )
    assert len( chains ) == 1, "this script does not deal with pdbs containing multiple chains (%s)" % chains
    chain = chains[0]

    xyz_dict = xyz_dict[ chain ]
    sbj_pdb_idx1_res_nums = sorted( pdbline_dict[chain].keys() )

    seq_map = seq_mapping( alignment )
    # work on a copy so the map being read is never the one being corrected
    corrected_seq_map = dict( seq_map )

    last_idx = len( sbj_pdb_idx1_res_nums ) - 1
    for idx, rsn in enumerate( sbj_pdb_idx1_res_nums ):
        # the last residue has no successor: pair it with itself so that no gap
        # is seen (also covers a pdb with a single residue)
        if idx < last_idx:
            next_rsn = sbj_pdb_idx1_res_nums[idx+1]
        else:
            next_rsn = rsn

        # newrsn means the rsn you would like to number from the reference (ref_seq_aligned)
        newrsn = seq_map[rsn]

        try:
            next_newrsn = seq_map[next_rsn]
        except KeyError:
            sys.stderr.write("ERROR: couldn't find the next rsn in the old\n")
            raise

        # detect chain break from alignment
        if ( next_newrsn - newrsn ) <= 1:
            continue # no gap in the alignment here

        dist = pdb_util.cal_dist( xyz_dict[rsn]["CA"], xyz_dict[next_rsn]["CA"] ) # dist from old numbering
        sys.stderr.write("Chainbreak (from alignment) at %s(%s)-%s(%s) with dist %.3f\n" %(rsn, newrsn, next_rsn, next_newrsn, dist))

        # if no physical chainbreak is detected, this is possibly an error of the
        # dynamic programming: try fixing it by looking for the same residue name
        # at next_newrsn-1 and, if found, renumber the current residue to next_newrsn-1
        if ( dist <= 4.5 ):
            sys.stderr.write("WARNING: no gap physically detected from %s-%s; caught an error in dynamic programming...\n" %(newrsn, next_newrsn))
            # index in seq, thus -1
            # just to make sure this error is caused by the dynamic programming (must be same residue name)
            assert ( ref_seq_aligned[newrsn-1] == ref_seq_aligned[next_newrsn-2] ), "ERROR: failing to looking for same residue name as pos:%s in pos:%s of fasta file could not fix the error in dynamic programming\n%s\n%s\n%s\nIs this a partial thread?" %( newrsn, next_newrsn-1, alignment[0], alignment[1], sbj_pdbfile )
            sys.stderr.write("Correcting an alignment problem caused by DP: %s(%s) -> %s(%s)\n"%( ref_seq_aligned[newrsn-1], newrsn, ref_seq_aligned[next_newrsn-2], next_newrsn-1 ) )

            corrected_seq_map[rsn] = next_newrsn-1

    # make new sbj_seq_aligned
    #
    # make dict[ ref_numbering ] = old_numbering; was dict[ old_numbering ] = ref_numbering
    #     seq_map = { 1:1,
    #                 2:8,
    #                 3:9 }
    #     thus the keys will be residues that are aligned (in reference numbering)
    #     revert_seq_map = { 1:1,
    #                        8:2,
    #                        9:3 }
    res_numbers_aligned = set( python_util.invert_dict( corrected_seq_map ).keys() )
    corrected_sbj_seq_aligned = "".join(
        rsd if ref_rsn in res_numbers_aligned else "-"
        for ref_rsn, rsd in enumerate( ref_seq_aligned, 1 ) )

    return (ref_seq_aligned, corrected_sbj_seq_aligned)
示例#6
0
if __name__=="__main__":
    parser = ArgumentParser()
    parser.add_argument("pdbs", nargs="+", help="")
    parser.add_argument("-o", "--outfn", default="merged", help="")
    parser.add_argument("-p", "--prefix", default="", help="")
    parser.add_argument("--debug", action="store_true", help="")
    parser.add_argument("--save_hetatm", action="store_true", help="")
    args = parser.parse_args()
    # merge different pdbs in a chainA-Z order and resnum

    all_pdb_dict = {}
    input_res_ctr = 0
    output_res_ctr = 0
    for pdb in args.pdbs:
        pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, args.save_hetatm )[1]

        for chain in pdbline_dict.keys():
            if chain not in all_pdb_dict.keys():
                all_pdb_dict[ chain ] = pdbline_dict[ chain]
                input_res_ctr += len( pdbline_dict[ chain ].keys() ) # serve as a sanity check to prevent two same residues present in pdbs to merge
            else:
                for resnum in pdbline_dict[ chain ].keys():
                    if args.debug:
                        # this will be really slow to do a check everytime
                        assert resnum not in all_pdb_dict[ chain ].keys(), "ERROR: residue %s has multiple copies" % resnum
                    all_pdb_dict[ chain ][ resnum ] = pdbline_dict[ chain ][ resnum ]
                input_res_ctr += len( pdbline_dict[ chain ].keys() ) # serve as a sanity check to prevent two same residues present in pdbs to merge

            # update the line number everytime