def main(pdb):

    ## speed up by not reading atom section...
    d_mmCIF = parse_mmCIF.main(pdb)

    a = float(d_mmCIF['_cell.length_a'][0])
    b = float(d_mmCIF['_cell.length_b'][0])
    c = float(d_mmCIF['_cell.length_c'][0])
    alpha = float(d_mmCIF['_cell.angle_alpha'][0])
    beta = float(d_mmCIF['_cell.angle_beta'][0])
    gamma = float(d_mmCIF['_cell.angle_gamma'][0])
    Z = int(d_mmCIF['_cell.Z_PDB'][0])
    mw = 0
    for i in range(len(d_mmCIF['_entity.id'])):
        if d_mmCIF['_entity.type'][i] == 'polymer':
            mw += float(d_mmCIF['_entity.formula_weight'][i])
    VM = calc(
        a,
        b,
        c,
        alpha,
        beta,
        gamma,
        mw,
        Z,
    )
    print pdb, VM

    return VM
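## calc() is not included in this snippet (it presumably wraps the same
## calculation as matthews_coefficient.main() used further below). A minimal
## sketch, assuming it returns the Matthews coefficient V_M = V_cell/(Z*MW)
## from the general triclinic unit cell volume:
def calc(a, b, c, alpha, beta, gamma, mw, Z):
    import math
    ## cell angles from degrees to radians
    alpha *= math.pi/180.
    beta *= math.pi/180.
    gamma *= math.pi/180.
    ## triclinic unit cell volume
    V = a*b*c*math.sqrt(
        1-math.cos(alpha)**2-math.cos(beta)**2-math.cos(gamma)**2
        +2*math.cos(alpha)*math.cos(beta)*math.cos(gamma)
        )
    ## Matthews coefficient (Angstrom**3 per Dalton)
    return V/(Z*mw)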
def one_polypeptide(pdb,):

    l_data_categories = ['_entity_poly',]
    d = parse_mmCIF.main(
        pdb,
        l_data_categories = l_data_categories,
        )

    bool_append = False

    ## make sure a polymer is present (not e.g. vancomycin, 1aa5)
    if '_entity_poly.type' in d.keys():
        ## one polypeptide?
        if d['_entity_poly.type'].count('polypeptide(L)') == 1:
            bool_append = True
##            if not ',' in ''.join(d['_entity_poly.pdbx_strand_id']):
##                bool_append = True
##            list_entity.pdbx_number_of_molecules__1.txt
    
    return bool_append
def modres_not_MSE(pdb,):

    l_data_categories = ['_pdbx_struct_mod_residue']
    d = parse_mmCIF.main(
        pdb,
        l_data_categories = l_data_categories,
        )

    bool_append = False

    ## has MODRES
    if '_pdbx_struct_mod_residue.id' in d.keys():
        if d['_pdbx_struct_mod_residue.label_comp_id'] != d['_pdbx_struct_mod_residue.auth_comp_id']:
            print pdb
            stop ## undefined name; deliberate NameError to halt on unexpected data
        ## at least one MODRES is different from MSE
        if d['_pdbx_struct_mod_residue.auth_comp_id'] != len(d['_pdbx_struct_mod_residue.auth_comp_id'])*['MSE']:
            bool_append = True
    
    return bool_append
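## The predicates above each return bool_append, presumably used to filter a
## list of PDB codes. A minimal usage sketch (the exact selection criteria are
## an assumption):
def filter_pdbs(l_pdbs):
    l_pdbs_out = []
    for pdb in l_pdbs:
        ## keep entries with exactly one polypeptide(L) entity
        if one_polypeptide(pdb) == False:
            continue
        ## skip entries with modified residues other than MSE
        if modres_not_MSE(pdb) == True:
            continue
        l_pdbs_out += [pdb]
    return l_pdbs_out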
def parse_coords(pdb):

    d_mmCIF = parse_mmCIF.main(pdb, )
    d_coords, l_coords_alpha = mmCIF2coords.main(pdb, d_mmCIF)

    return d_mmCIF, l_coords_alpha
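## mmCIF2coords.main() is not included in this snippet. A minimal sketch of the
## assumed behaviour (first model only, primary altlocs only, per-chain coords
## plus a flat list of CA coords); the real module may differ:
def mmCIF2coords_sketch(pdb, d_mmCIF):
    import numpy
    d_coords = {}
    l_coords_alpha = []
    for i in range(len(d_mmCIF['_atom_site.id'])):
        if d_mmCIF['_atom_site.pdbx_PDB_model_num'][i] != '1':
            continue
        if d_mmCIF['_atom_site.label_alt_id'][i] not in ['.','A','1',]:
            continue
        chain = d_mmCIF['_atom_site.label_asym_id'][i]
        atom_name = d_mmCIF['_atom_site.label_atom_id'][i]
        x = float(d_mmCIF['_atom_site.Cartn_x'][i])
        y = float(d_mmCIF['_atom_site.Cartn_y'][i])
        z = float(d_mmCIF['_atom_site.Cartn_z'][i])
        coord = numpy.array([x,y,z,])
        if not chain in d_coords.keys():
            d_coords[chain] = []
        d_coords[chain] += [coord]
        if atom_name == 'CA':
            l_coords_alpha += [coord]
    return d_coords, l_coords_alpha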
def parse_dihedrals():

    import sys

    path = '/data/mmCIF'

    d_phipsi_res = {
        'ALA':[],'CYS':[],'ASP':[],'GLU':[],'PHE':[],
        'GLY':[],'HIS':[],'ILE':[],'LYS':[],'LEU':[],
        'MET':[],'ASN':[],'PRO':[],'GLN':[],'ARG':[],
        'SER':[],'THR':[],'VAL':[],'TRP':[],'TYR':[],
        'prePRO':[],'prePRO_notGLY':[],'prePRO_GLY':[],
        'cisPro':[],'transPro':[],
        'all_notgly_notpro_notprepro':[],
        }

    d_phipsi_ss = {
        'sheet':[], ## _struct_sheet_order.sense
        ##_struct_conf.pdbx_PDB_helix_class
        'helix_alpha':[], ## i+4 # 1
        'helix_pi':[], ## i+5 # 3
        'helix_310':[], ## i+3 # 5
        'Turn':[], ## i+?
        ##
        'turns_notgly_notpro_notprepro':[],
        }

    d_counts = {
        'cisProALA':0,
        'cisProCYS':0,
        'cisProASP':0,
        'cisProGLU':0,
        'cisProPHE':0,
        'cisProGLY':0,
        'cisProHIS':0,
        'cisProILE':0,
        'cisProLYS':0,
        'cisProLEU':0,
        'cisProMET':0,
        'cisProASN':0,
        'cisProPRO':0,
        'cisProGLN':0,
        'cisProARG':0,
        'cisProSER':0,
        'cisProTHR':0,
        'cisProVAL':0,
        'cisProTRP':0,
        'cisProTYR':0,
        'cisPro_helix':0,
        'cisPro_sheet':0,
        'cisPro_turn':0,
        'cisPro_random':0,
        }

    l_dn = os.listdir(path)
    l_dn.sort()
    l_dn.remove('mmCIF.py')
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        print '*',dn
        l_fn = os.listdir('%s/%s' %(path,dn,))
        l_fn.sort()
        for fn in l_fn:
            pdb = fn[:4]
            print pdb
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':['SOLUTION NMR']},
                l_data_categories = [
                    '_exptl',
                    '_refine',

                    '_struct_conf', ## HELIX
                    '_struct_sheet_range', ## SHEET

                    '_entity',
                    '_entity_poly',
                    '_entity_poly_seq',

                    '_atom_site',
                    ],
                )

            ## skip NMR models
            if ''.join(d_mmCIF['_exptl.method']) in [
                'SOLUTION NMR',
                'POWDER DIFFRACTION',
                'ELECTRON MICROSCOPY',
                ]:
                continue

            if not '_refine.ls_d_res_high' in d_mmCIF.keys():
                print d_mmCIF['_exptl.method']
                continue

            ## skip if multiple resolutions
            if len(d_mmCIF['_refine.ls_d_res_high']) > 1:
                continue

            ## skip if no resolution
            if ''.join(d_mmCIF['_refine.ls_d_res_high']) == '?':
                continue

            ## skip low resolution structures
            if float(''.join(d_mmCIF['_refine.ls_d_res_high'])) > 2:
                continue

            if not 'polymer' in d_mmCIF['_entity.type']:
                continue
            if not '_entity_poly.type' in d_mmCIF.keys(): ## e.g. 1hhu
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide/polyribonucleotide hybrid']:
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide']:
                continue

            d_sequence = {}
            for i_entity_poly_seq in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
                entity_id = int(d_mmCIF['_entity_poly_seq.entity_id'][i_entity_poly_seq])
                if not entity_id in d_sequence.keys():
                    d_sequence[entity_id] = []
                res_no = int(d_mmCIF['_entity_poly_seq.num'][i_entity_poly_seq])
                res_name = d_mmCIF['_entity_poly_seq.mon_id'][i_entity_poly_seq]
                d_sequence[entity_id] += [{'res_no':res_no,'res_name':res_name,}]

            l_entities_poly = []
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                ## skip if not polypeptide
                entity_poly_type = d_mmCIF['_entity_poly.type'][i_entity_poly]
                if entity_poly_type != 'polypeptide(L)':
                    continue
                ## skip if nonstd linkages
                if d_mmCIF['_entity_poly.nstd_linkage'][i_entity_poly] == 'yes':
                    print pdb
                    stop
                    continue
                ## parse entity_id and chains
                entity_id = int(d_mmCIF['_entity_poly.entity_id'][i_entity_poly])
                l_entities_poly += [entity_id]
            ## skip if no polypeptide chains
            if l_entities_poly == []:
                continue

            d_coords = {}
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                entity_id = int(d_mmCIF['_atom_site.label_entity_id'][i_atom_site])
                ## not a polymer
                if not entity_id in l_entities_poly:
                    continue
                ## polymer, append
                elif not entity_id in d_coords.keys():
                    d_coords[entity_id] = {}

                model = int(d_mmCIF['_atom_site.pdbx_PDB_model_num'][i_atom_site])
                if model > 1:
                    continue

                chain = d_mmCIF['_atom_site.label_asym_id'][i_atom_site]
                if not chain in d_coords[entity_id].keys():
                    d_coords[entity_id][chain] = {}
                res_no = int(d_mmCIF['_atom_site.label_seq_id'][i_atom_site])
                if not res_no in d_coords[entity_id][chain].keys():
                    d_coords[entity_id][chain][res_no] = {}
                atom_name = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]

                altloc = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if altloc not in ['.','A','1',]:
                    continue

                ## skip if zero occupancy
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if altloc == '.' and occupancy == 0:
                    continue

                if atom_name in ['CA','C','O','N',] and atom_name in d_coords[entity_id][chain][res_no].keys():
                    print pdb, chain, res_no, atom_name
                    print d_mmCIF['_atom_site.Cartn_x'][i_atom_site], d_mmCIF['_atom_site.Cartn_y'][i_atom_site]
                    print d_coords[entity_id][chain][res_no][atom_name]
                    stop
                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                coord = numpy.array([x,y,z,])
                d_coords[entity_id][chain][res_no][atom_name] = coord

            d_helices = {}
            ## helices or turns present?
            if '_struct_conf.id' in d_mmCIF.keys():
                for i_struct_conf in range(len(d_mmCIF['_struct_conf.id'])):
                    chain1 = d_mmCIF['_struct_conf.beg_label_asym_id'][i_struct_conf]
                    chain2 = d_mmCIF['_struct_conf.end_label_asym_id'][i_struct_conf]
                    res_no1 = int(d_mmCIF['_struct_conf.beg_label_seq_id'][i_struct_conf])
                    res_no2 = int(d_mmCIF['_struct_conf.end_label_seq_id'][i_struct_conf])
                    conf_type_id = d_mmCIF['_struct_conf.conf_type_id'][i_struct_conf]
                    if chain1 != chain2:
                        print chain1, chain2, pdb
                        stop
                    if conf_type_id == 'HELX_P':
                        helix_class = int(d_mmCIF['_struct_conf.pdbx_PDB_helix_class'][i_struct_conf])
                    elif conf_type_id == 'TURN_P':
                        helix_class = 99
                    else:
                        print conf_type_id
                        print pdb
                        stop
                    l_res_nos = range(res_no1,res_no2+1,)
                    if not chain1 in d_helices.keys():
                        d_helices[chain1] = {}
                    for res_no in l_res_nos:
                        d_helices[chain1][res_no] = helix_class

            d_sheets = {}
            ## sheet present?
            if '_struct_sheet_range.sheet_id' in d_mmCIF.keys():
                for i_struct_sheet_range in range(len(d_mmCIF['_struct_sheet_range.sheet_id'])):
                    chain1 = d_mmCIF['_struct_sheet_range.beg_label_asym_id'][i_struct_sheet_range]
                    chain2 = d_mmCIF['_struct_sheet_range.end_label_asym_id'][i_struct_sheet_range]
                    res_no1 = int(d_mmCIF['_struct_sheet_range.beg_label_seq_id'][i_struct_sheet_range])
                    res_no2 = int(d_mmCIF['_struct_sheet_range.end_label_seq_id'][i_struct_sheet_range])
                    l_res_nos = range(res_no1,res_no2+1,)
                    if chain1 != chain2:
                        print chain1, chain2, pdb
                        stop
                    if not chain1 in d_sheets.keys():
                        d_sheets[chain1] = []
                    for res_no in l_res_nos:
                        d_sheets[chain1] += [res_no]

            for entity_id in l_entities_poly:
                for chain in d_coords[entity_id].keys():
                    ## skip if short peptide (e.g. 13gs)
                    if len(d_sequence[entity_id]) <= 3:
                        continue
                    for i_res_no in range(1,len(d_sequence[entity_id])-1):
                        res_no_prev = int(d_sequence[entity_id][i_res_no-1]['res_no'])
                        res_no = int(d_sequence[entity_id][i_res_no]['res_no'])
                        res_no_next = int(d_sequence[entity_id][i_res_no+1]['res_no'])
                        res_name = d_sequence[entity_id][i_res_no]['res_name']
                        if res_name == 'MSE':
                            res_name = 'MET'
                        res_name_next = d_sequence[entity_id][i_res_no+1]['res_name']

                        ## not a standard residue
                        if not res_name in d_phipsi_res.keys():
                            continue

                        ## residue not observed
                        if not res_no_prev in d_coords[entity_id][chain].keys():
                            continue
                        if not res_no in d_coords[entity_id][chain].keys():
                            continue
                        if not res_no_next in d_coords[entity_id][chain].keys():
                            continue

                        ## atom not observed
                        if not 'C' in d_coords[entity_id][chain][res_no_prev]:
                            continue
                        if not 'N' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'CA' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'C' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'N' in d_coords[entity_id][chain][res_no_next]:
                            continue
                        
                        C_prev = d_coords[entity_id][chain][res_no_prev]['C']
                        N = d_coords[entity_id][chain][res_no]['N']
                        CA = d_coords[entity_id][chain][res_no]['CA']
                        C = d_coords[entity_id][chain][res_no]['C']
                        N_next = d_coords[entity_id][chain][res_no_next]['N']
                        phi = calc_dihedral(C_prev,N,CA,C,)
                        psi = calc_dihedral(N,CA,C,N_next,)

                        if 'CA' in d_coords[entity_id][chain][res_no_prev].keys():
                            CA_prev = d_coords[entity_id][chain][res_no_prev]['CA']
                            omega = calc_dihedral(CA_prev,C_prev,N,CA,)
                        else:
                            omega = None

                        
                        ## classify omega as cis/trans; discard ambiguous values
                        if omega:
                            if omega < 150 and omega > -150: ## 12e8, PRO44D
                                if abs(omega) > 30: ## 12e8 PRO196D, 1a44 GLU82A
                                    omega = None
                                ## cis
                                else:
                                    omega = 'cis'
                            ## trans
                            else:
                                omega = 'trans'
                        else:
                            omega = None
                        
                        bool_helix = False
                        if chain in d_helices.keys():
                            if res_no in d_helices[chain].keys():
                                bool_helix = True
                                helix_class = d_helices[chain][res_no]

                        bool_sheet = False
                        if chain in d_sheets.keys():
                            if res_no in d_sheets[chain]:
                                bool_sheet = True

##                        if bool_helix == True and bool_sheet == True and helix_class != 99:
##                            print pdb, chain, res_no, 'sheet and helix'
####                            stop
                        
                        if res_name_next == 'PRO':
                            d_phipsi_res['prePRO'] += [[phi,psi,]]
                            if res_name != 'GLY':
                                d_phipsi_res['prePRO_notGLY'] += [[phi,psi,]]
                            else:
                                d_phipsi_res['prePRO_GLY'] += [[phi,psi,]]
                        else:
                            d_phipsi_res[res_name] += [[phi,psi,]]
                            if res_name not in ['GLY','PRO',]:
                                d_phipsi_res['all_notgly_notpro_notprepro'] += [[phi,psi,]]
                            elif res_name == 'PRO' and omega:
                                d_phipsi_res['%sPro' %(omega)] += [[phi,psi,]]
                                if omega == 'cis':
                                    d_counts['cisPro%s' %(res_name)] += 1
                                    if bool_helix == True:
                                        if helix_class == 1:
                                            d_counts['cisPro_helix'] += 1
                                        elif helix_class == 99:
                                            d_counts['cisPro_turn'] += 1
                                    elif bool_sheet == True:
                                        d_counts['cisPro_sheet'] += 1
                                    else:
                                        d_counts['cisPro_random'] += 1
                                        

                        if bool_helix == True:
##                            if helix_class not in [1,3,5,99,]:
##                                print pdb, chain, res_no, helix_class
##                                print 'unexpected helix class'
####                                stop_helix_class
                            if helix_class == 1:
                                d_phipsi_ss['helix_alpha'] += [[phi,psi,]]
                            elif helix_class == 3:
                                d_phipsi_ss['helix_pi'] += [[phi,psi,]]
                            elif helix_class == 5:
                                d_phipsi_ss['helix_310'] += [[phi,psi,]]
                            elif helix_class == 99:
                                d_phipsi_ss['Turn'] += [[phi,psi,]]
                                if (
                                    res_name_next != 'PRO'
                                    and
                                    res_name not in ['GLY','PRO',]
                                    ):
                                    d_phipsi_ss['turns_notgly_notpro_notprepro'] += [[phi,psi,]]
                        if bool_sheet == True:
                            d_phipsi_ss['sheet'] += [[phi,psi,]]

    l = []
    for k in d_counts.keys():
        count = d_counts[k]
        l += ['%s %s\n' %(k,count,)]
    fd = open('count.txt','w')
    fd.writelines(l)
    fd.close()

    return d_phipsi_res, d_phipsi_ss
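## calc_dihedral() is not included in this snippet. A minimal sketch of a
## standard four-point dihedral calculation returning degrees in -180..180,
## which is what the omega cis/trans classification above assumes (the sign
## convention of the real function may differ):
def calc_dihedral(c1, c2, c3, c4):
    import math
    import numpy
    b1 = c2-c1
    b2 = c3-c2
    b3 = c4-c3
    n1 = numpy.cross(b1,b2)
    n2 = numpy.cross(b2,b3)
    m1 = numpy.cross(n1,b2/numpy.linalg.norm(b2))
    x = numpy.dot(n1,n2)
    y = numpy.dot(m1,n2)
    return math.degrees(math.atan2(y,x))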
def unobs_nonterminal_residues():

    ##
    ## find unobserved or zero-occupancy residues that are NOT at the terminals (or a combination...)
    ## e.g. don't exclude 200l with 163,164 missing
    ## don't exclude 201l with 163,164 missing either, even though they appear internally in _pdbx_poly_seq_scheme because there are 2 chains
    ##
    category = fn = '_pdbx_unobs_or_zero_occ_residues'
    fd = open('%s/list%s.txt' %(path,fn))
    s = fd.read()
    fd.close()
    l_pdbs_in = s.split()
    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_residues',
        '_entity_poly',
        ]

    fn_out = 'list_pdbx_unobs_residues__NONTERMINAL'

    loop_residues(category,fn_out,)

    l_pdbs_out = []
    for pdb in l_pdbs_in:

##        if pdb[1:3] < 'oa':
##            continue
##        if pdb != '2hub':
##            continue

        ## no residues are present! (e.g. 1oax, 1oay)
        if pdb in ['1oax','1oay',]:
            continue

        d = parse_mmCIF.main(pdb,l_data_categories=l_data_categories,)

##        print pdb

        if not category in d.keys():
            continue

        bool_append = False
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for chains in d['_entity_poly.pdbx_strand_id']:
            for chain in chains.split(','):
                index1 = s.index(chain)
                index2 = s.rindex(chain)
##                print chain
                l_auth_seq_num = d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:index2+1]
                while l_auth_seq_num[0] == '?':
                    l_auth_seq_num = l_auth_seq_num[1:]
                while l_auth_seq_num[-1] == '?':
                    l_auth_seq_num = l_auth_seq_num[:-1]
                ## non-terminal residues missing?
                if '?' in l_auth_seq_num:
                    print '****', pdb
                    bool_append = True
                    break
            if bool_append == True:
                break
        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            ## continue

    fd = open('%s/%s' %(path,fn_out,),'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
def main():

    d = {}

    if os.path.isfile('db_resolution.txt'):

        fd = open('db_resolution.txt', 'r')
        lines = fd.readlines()
        fd.close()

        for line in lines:
            l = line.strip().split()
            pdb = l[0]
            v = l[1]
            d[pdb] = v

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()

    lines_out = []

    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d.keys():
                continue

            print pdb

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                l_data_categories=[
                    '_refine',
                    '_refine_hist',
                ],  ## parse selected data categories
                l_data_categories_break=[
                    '_refine',
                    ##                    '_refine_hist',
                ],
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                })

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            resolution = d_mmCIF['_refine.ls_d_res_high']

            line = '%s %s\n' % (
                pdb,
                resolution,
            )
            lines_out += [line]

            fd = open('db_resolution.txt', 'a')
            fd.write(line)
            fd.close()

            d[pdb] = resolution

    ##
    ## write to file
    ##
    lines_out = []
    for pdb, resolution in d.items():
        line = '%s %s\n' % (
            pdb,
            resolution,
        )
        lines_out += [line]
    fd = open('db_resolution.txt', 'w')
    fd.writelines(lines_out)
    fd.close()

    d = {}
    fd = open('db_resolution.txt', 'r')
    lines = fd.readlines()
    fd.close()

    lines_out = []
    for line in lines:
        resolution = line.strip().split()[1][2:-2] ## stored value is a list repr, e.g. ['1.80']; strip the ['...'] wrapper
        if resolution == '.':
            continue
        resolution = float(resolution)
        resolution = round(resolution, 2)
        if not resolution in d.keys():
            d[resolution] = 0
        d[resolution] += 1
        lines_out += ['%s\n' % (resolution, )]
    fd = open('histogram_resolution.txt', 'w')
    fd.writelines(lines_out)
    fd.close()
    stop

    lines_out = []
    l_resolutions = d.keys()
    l_resolutions.sort()
    ##    for resolution,count in d.items():
    for resolution in l_resolutions:
        count = d[resolution]
        lines_out += ['%s %s\n' % (
            resolution,
            count,
        )]
    fd = open('histogram_resolution.txt', 'w')
    fd.writelines(lines_out)
    fd.close()

    return
def main():

    fd = open('radius_of_gyration.txt','r')
    lines = fd.readlines()
    fd.close()
    d_radii = {}
    for line in lines:
        l = line.strip().split()
        pdb = l[0]
        r = l[1]
        d_radii[pdb] = r

    lines_out = []

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print '%s/%s %s' %(i+1,len(l_dns), dn)
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d_radii.keys():
                continue

            print pdb

            fd = open('%s/%s/%s' %(path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks = {
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id':'2',
##                    '_exptl.method':'SOLUTION NMR', ## break if e.g. _exptl.method = SOLUTION NMR
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id':',',
                    }, 
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details':'monomeric',
                    },
                l_data_categories = [
                    '_atom_site',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                    ], ## parse selected data categories
                )

            ## some unknown temporary error... or break before reaching this part when parsing...
            if not '_pdbx_struct_assembly.oligomeric_details' in d_mmCIF.keys():
                continue

            ## NMR structure?
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                stop2
                continue

            ## no polymers in structure?
            if not '_entity_poly.entity_id' in d_mmCIF.keys():
                continue

            ## polymer(s) is/are not polypeptide(s)
            if d_mmCIF['_entity_poly.type'] != len(d_mmCIF['_entity_poly.type'])*['polypeptide(L)']:
                continue

            ## biounit not monomeric
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != len(d_mmCIF['_pdbx_struct_assembly.oligomeric_details'])*['monomeric']:
                continue

            ## one polymer in the asymmetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            print pdb

            ##
            ## calculate center of mass
            ##
            center_of_mass = numpy.array([0.,0.,0.,])
            l_coords = []
            l_masses = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                if d_mmCIF['_atom_site.label_entity_id'][i_atom_site] not in d_mmCIF['_entity_poly.entity_id']:
                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]

                ## only do heavy atoms
                if element == 'H':
                    continue
                if element not in d_mass.keys():
                    print pdb, d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                    continue

                mass = d_mass[element]
                l_masses += [mass]

                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                coord = numpy.array([x,y,z,])
                l_coords += [coord]

                center_of_mass += mass*coord

            center_of_mass /= sum(l_masses)

            ##
            ## calculate radius of gyration
            ##
            sum_r = 0
            for i_coord in range(len(l_coords)):
                coord = l_coords[i_coord]
                mass = l_masses[i_coord]
                sq_dist_from_center_of_mass = sum((coord-center_of_mass)**2)
                sum_r += mass*sq_dist_from_center_of_mass
            radius_of_gyration = math.sqrt(sum_r/sum(l_masses))

            print pdb, center_of_mass, radius_of_gyration

            line = '%s %s\n' %(pdb,radius_of_gyration,)
            lines_out += [line]

            fd = open('radius_of_gyration.txt','a')
            fd.write(line)
            fd.close()

            d_radii[pdb] = radius_of_gyration

    ##
    ## write calculated radii of gyration to file
    ##
    lines_out = []
    for pdb,radius_of_gyration in d_radii.items():
        line = '%s %s\n' %(pdb,radius_of_gyration,)
        lines_out += [line]
    fd = open('radius_of_gyration.txt','w')
    fd.writelines(lines_out)
    fd.close()

    return
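## d_mass is not defined in this snippet. A minimal sketch of the assumed
## element-symbol-to-mass lookup (standard atomic weights in Dalton; hydrogens
## are skipped above, so only heavy atoms are needed; extend as required):
d_mass = {
    'C':12.011,'N':14.007,'O':15.999,'S':32.06,'P':30.974,
    'SE':78.97,'FE':55.845,'ZN':65.38,'MG':24.305,'CA':40.078,
    }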
def one_polysaccharide(pdb,):

    l_data_categories = [
        '_entity',
        '_chem_comp',
        '_entity_poly',
        ]
    d = parse_mmCIF.main(
        pdb,
        l_data_categories = l_data_categories,
        )

    bool_append = False

    bool_polysaccharide = False
    if '_chem_comp.type' in d.keys():
        for chem_comp_type in d['_chem_comp.type']:
            if chem_comp_type.lower() in [
                'd-saccharide 1,4 and 1,4 linking', # 3amm
                'l-saccharide','d-saccharide','saccharide'
                ]:
                bool_polysaccharide = True
                break
##            elif 'acchar' in chem_comp_type.lower():
##                print d
##                print chem_comp_type
##                print pdb
##                print set(['D-saccharide','saccharide'])&set(d['_chem_comp.type'])
##                stop
##    else:
##        print pdb
##        stop

    count_polymer_sugar = 0
    bool_monosaccharide = False ## included to exclude 1a14 which contains polymers and monomers
    for i in range(len(d['_entity.type'])):
        entity_type = d['_entity.type'][i]
        if entity_type in [
            'polymer',
            ]:
            if d['_entity.pdbx_description'][i][:7] == 'SUGAR (':
                count_polymer_sugar += int(d['_entity.pdbx_number_of_molecules'][i])
                continue
##            ## polypeptide or polynucleotide (just a check)
##            elif d['_entity.pdbx_description'][i][:5] == 'SUGAR': ## eg 2c49
##                if d['_entity.id'][i] not in d['_entity_poly.entity_id']:
##                    print pdb
##                    stop
        elif entity_type == 'non-polymer' and d['_entity.pdbx_description'][i][:5] == 'SUGAR':
            bool_monosaccharide = True
##            ## just a check
##            if d['_entity.pdbx_description'][i][:7] != 'SUGAR (' and pdb not in ['1iuc',]:
##                print pdb
##                print d['_entity.pdbx_description'][i]
##                stop
##        ## anything else named SUGAR? just a check
##        elif entity_type != 'non-polymer' and d['_entity.pdbx_description'][i][:5] == 'SUGAR':
##            print d
##            print pdb
##            print entity_type
##            print d['_entity.pdbx_description'][i]
##            stop

    if bool_monosaccharide == False and bool_polysaccharide == True and count_polymer_sugar == 1:
        bool_append = True
##    elif pdb in ['3gvj','3gvk','3gvl','3hmy','3msg','1v0f',]:
##        bool_append = False
##    ## error check
##    elif bool_polysaccharide == False and count_polymer_sugar > 0:
##        print d
##        print bool_polysaccharide
##        print d['_entity.pdbx_description']
##        print count_polymer_sugar
##        print pdb
##        stop_no_poly_but_poly

    if pdb == '1dl2':
        print count_polymer_sugar
        print bool_append
        stop

    return bool_append
def main():

    fd = open('remediation_negativeBiso.txt', 'r')
    lines = fd.readlines()
    fd.close()
    l_pdbs = []
    for line in lines:
        if line.strip() == '':
            continue
        if line[0] == '#':
            continue
        l = line.strip().split()
        pdb = l[0]
        l_pdbs += [pdb]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if not pdb in l_pdbs:
                continue

            print pdb

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
                l_data_categories=[
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_computing',
                    '_atom_site',
                    '_refine'
                ],
            )

            ##            ## no polymers in structure?
            ##            if not '_entity_poly.entity_id' in d_mmCIF.keys():
            ##                continue

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            print pdb

            ##
            ## parse bfactors
            ##
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                bfactor = float(
                    d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])

                ##                if bfactor == '?':
                ##                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]

                if float(bfactor) < -0.01:
                    if (element != 'H' and comp_id in [
                            'ALA',
                            'CYS',
                            'ASP',
                            'GLU',
                            'PHE',
                            'GLY',
                            'HIS',
                            'ILE',
                            'LYS',
                            'MET',
                            'ASN',
                            'PRO',
                            'GLN',
                            'ARG',
                            'SER',
                            'THR',
                            'VAL',
                            'TRP',
                            'TYR',
                    ]):

                        print
                        print 'negative'
                        print

                        year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
                        atom_id = int(d_mmCIF['_atom_site.id'][i_atom_site])
                        refinement = ''.join(
                            d_mmCIF['_computing.structure_refinement'])
                        solution = ''.join(
                            d_mmCIF['_computing.structure_solution'])
                        resolution = float(''.join(
                            d_mmCIF['_refine.ls_d_res_high']))

                        fd = open('remediation_negativeBiso.txt', 'a')
                        fd.write(
                            ##                            '%4s %4i %4i %3s %2s %6.2f %30s %20s\n' %(
                            '%4s\t%4i\t%4i\t%3s\t%2s\t%6.2f\t%6.2f\t%30s\t%20s\n'
                            % (
                                pdb,
                                year,
                                atom_id,
                                comp_id,
                                element,
                                bfactor,
                                resolution,
                                solution.ljust(30),
                                refinement.ljust(20),
                            ))
                        fd.close()
                        break

    return
def main():

    d_MV = {}

    path = '/data/mmCIF'
    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        l_fn = os.listdir('%s/%s' % (path, dn))
        for fn in l_fn:
            pdb = fn[:4]
            ##            if pdb.upper() not in s_pdbs:
            ##                continue
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': 'SOLUTION NMR'},
                l_data_categories=[
                    '_cell',
                    '_entity',
                    '_exptl',
                    '_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                ],
            )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if not '_entity_poly.type' in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            if d_mmCIF['_entity_poly.type'] != len(
                    d_mmCIF['_entity_poly.type']) * ['polypeptide(L)']:
                continue

            if not '_pdbx_struct_assembly.oligomeric_count' in d_mmCIF.keys():
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] == len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['?']:
                continue

            ## virus
            if int(d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
                   [0]) % 60 == 0:
                continue

            ## not monomer
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] != len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print pdb
                    stop

            if not '_cell.Z_PDB' in d_mmCIF.keys():
                continue

            if pdb in [
                    ## threshold
                    '1e54',
                    '1e9i',
                    ## difference between calculated MV and MV in mmCIF
                    '3eiq',
                    ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
                    ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
                    ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
                    ## Toscana has published with Hellinga...
                    '2cjf',
                    '2bt4',
            ]:
                continue

##            if not ''.join(d_mmCIF['_symmetry.space_group_name_H-M']) in [
##                'P 1','P 43 21 2','P 21 3','P 42 3 2','C 1 2 1','F 2 3','P 64 2 2','H 3',
##                ]:
##                continue ## tmp!!!

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                ##                if d_mmCIF['_entity.type'][i] == 'polymer':
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a, b, c, alpha, beta, gamma, mw, Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in [
                    'F 4 3 2',
                    'F 41 3 2',
                    'I 41 3 2',
            ]:
                continue  ## tmp!!!

            if MV > 10:
                print pdb
                print 'mw', mw
                print 'MV', MV, d_mmCIF['_exptl_crystal.density_Matthews']
                print 'Z', Z
                import math
                alpha *= math.pi / 180.
                beta *= math.pi / 180.
                gamma *= math.pi / 180.
                V = a * b * c * math.sqrt(
                    1 - math.cos(alpha)**2 - math.cos(beta)**2 -
                    math.cos(gamma)**2 + 2 *
                    (math.cos(alpha) * math.cos(beta) * math.cos(gamma)))
                print 'V', V
                continue
                stop_treshold
                stop
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                if d_mmCIF['_exptl_crystal.density_Matthews'] not in [
                    ['?'],
                        len(d_mmCIF['_exptl_crystal.density_Matthews']) *
                    ['?'],
                ]:
                    if abs(MV -
                           float(d_mmCIF['_exptl_crystal.density_Matthews'][0])
                           ) > 1:
                        print 'MV', MV
                        print 'MV', d_mmCIF['_exptl_crystal.density_Matthews']
                        print 'mw', mw
                        print 'Z', Z
                        continue
                        stop_difference

            if not spacegroup in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup] += [MV]

            print pdb, round(MV, 2), spacegroup


##    fd = open('MV_v_spacegroup.txt','w')
##    fd.write(str(d_MV))
##    fd.close()

    l = ['# MV_average MV_stderr n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
        ##        l += ['%s %s %s %s\n' %(average,stddev,len(l_MV),spacegroup,)]
        l += ['%s %s %s %s\n' % (
            average,
            stderr,
            len(l_MV),
            spacegroup,
        )]

    fd = open('MV_v_spacegroup.txt', 'w')
    fd.writelines(l)
    fd.close()

    return
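## statistics.do_stddev() and statistics.do_stderr() are not included in this
## snippet. A minimal sketch, assuming they return the average paired with the
## sample standard deviation and the standard error of the mean, respectively:
def do_stddev(l):
    import math
    n = len(l)
    average = sum(l)/float(n)
    SS = sum([(x-average)**2 for x in l])
    stddev = math.sqrt(SS/float(n-1))
    return average, stddev

def do_stderr(l):
    import math
    average, stddev = do_stddev(l)
    stderr = stddev/math.sqrt(len(l))
    return average, stderr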
def get_position_ligand(pdb,pdb_apo,d_apo2holo,):

    pdb_holo = d_apo2holo[pdb_apo]['holo']
    d_mmCIF_holo = parse_mmCIF.main(pdb_holo,)
    d_coords, l_coords_alpha_holo = mmCIF2coords.main(pdb_holo,d_mmCIF_holo)

    ##
    ##
    ##
    ligand = d_apo2holo[pdb_apo]['ligand']

    l_residues = []
    for i in range(len(d_mmCIF_holo['_struct_site.id'])):
        if not 'BINDING SITE FOR RESIDUE %s' %(ligand) in d_mmCIF_holo['_struct_site.details'][i]:
            continue
        if len(l_residues) > 0:
            print pdb, pdb_apo, pdb_holo
            print l_residues
            print d_mmCIF_holo['_struct_site.details'][i]
            stop
        struct_site_ID = d_mmCIF_holo['_struct_site.id'][i]
        for j in range(len(d_mmCIF_holo['_struct_site_gen.site_id'])):
            struct_site_gen_ID = d_mmCIF_holo['_struct_site_gen.site_id'][j]
            if struct_site_ID == struct_site_gen_ID:
                residue = int(d_mmCIF_holo['_struct_site_gen.auth_seq_id'][j])
##                l_residues += [residue]
                ## include neighboring residues
                l_residues += [residue-1,residue,residue+1]
    l_residues = list(set(l_residues))
    if len(l_residues) == 0:
        print pdb
        stop

    ## 
    l_coords_ligand = []
    for i in range(len(d_mmCIF_holo['_atom_site.id'])):
        if (
            d_mmCIF_holo['_atom_site.group_PDB'][i] == 'HETATM'
            and
            d_mmCIF_holo['_atom_site.label_comp_id'][i] == ligand
            ):
            x = float(d_mmCIF_holo['_atom_site.Cartn_x'][i])
            y = float(d_mmCIF_holo['_atom_site.Cartn_y'][i])
            z = float(d_mmCIF_holo['_atom_site.Cartn_z'][i])
            coord = numpy.array([x,y,z,])
            l_coords_ligand += [coord]


    d_mmCIF_apo = parse_mmCIF.main(pdb_apo,)
    d_coords, l_coords_alpha_apo = mmCIF2coords.main(pdb_apo,d_mmCIF_apo)   

    ## structural alignment
    ## solution that works in all cases
    ## also for 2d59 and 2d5a, which have residues missing at the Nterm and Cterm, respectively
    ## first non-?
    index1_seq_apo = next((i for i,v in enumerate(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
    index1_seq_holo = next((i for i,v in enumerate(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
    ## last non-?
    index2_seq_apo = len(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
    index2_seq_holo = len(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
    ## first common non-?
    index1_coord_apo = max(0,index1_seq_holo-index1_seq_apo)
    index1_coord_holo = max(0,index1_seq_apo-index1_seq_holo)
    ## last common non-?
    index2_coord_apo = len(l_coords_alpha_apo)+min(0,index2_seq_holo-index2_seq_apo)
    index2_coord_holo = len(l_coords_alpha_holo)+min(0,index2_seq_apo-index2_seq_holo)
    l_coords_alpha_apo = l_coords_alpha_apo[index1_coord_apo:index2_coord_apo]
    l_coords_alpha_holo = l_coords_alpha_holo[index1_coord_holo:index2_coord_holo]


    if pdb == pdb_apo:
        l_seq_num = d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_seq_num'][index1_coord_apo:index2_coord_apo]
        chain = ''.join(d_mmCIF_apo['_entity_poly.pdbx_strand_id'])
        n_residues = len(l_coords_alpha_apo)
        l_coords_alpha = l_coords_alpha_apo
    else:
        l_seq_num = d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_seq_num'][index1_coord_holo:index2_coord_holo]
        chain = ''.join(d_mmCIF_holo['_entity_poly.pdbx_strand_id'])
        n_residues = len(l_coords_alpha_holo)
        l_coords_alpha = l_coords_alpha_holo

    overlap_site = 1.
##    ##
##    ## eigenvector
##    ##
##    cutoff = 10
##    matrix_hessian = NMA.hessian_calculation(l_coords_alpha,cutoff,)
##    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
##
##    ## apply transformation matrix
##    if pdb == pdb_apo:
##        instance_geometry = geometry.geometry()
##        rmsd = instance_geometry.superpose(l_coords_alpha_apo,l_coords_alpha_holo,)
##        tv1 = instance_geometry.fitcenter
##        rm = instance_geometry.rotation
##        tv2 = instance_geometry.refcenter
##        for i_coord in range(len(l_coords_ligand)):
##            l_coords_ligand[i_coord] = numpy.dot(l_coords_ligand[i_coord]-tv1,rm)+tv2
##
##    ##
##    ## apo/holo eigenvector
##    ##
##    vector_apo2holo = []
##    for i in range(len(l_coords_alpha_holo)):
##        vector_apo2holo += [
##            l_coords_alpha_holo[i][0]-l_coords_alpha_apo[i][0],
##            l_coords_alpha_holo[i][1]-l_coords_alpha_apo[i][1],
##            l_coords_alpha_holo[i][2]-l_coords_alpha_apo[i][2],
##            ]
##    vector_apo2holo = numpy.array(vector_apo2holo)
##
##    ##
##    ## calculate overlap between normal modes and difference vector
##    ## in the ligand binding site!!!
##    ##
##    vector_apo2holo_site = []
##    eigenvector_site = []
##    ## exclude coordinate not at the ligand binding site
##    for i_seq_num in range(len(l_seq_num)):
##        seq_num = int(l_seq_num[i_seq_num])
##        if seq_num in l_residues:
##            eigenvector_site += list(eigenvectors[6][3*i_seq_num:3*i_seq_num+3])
##            vector_apo2holo_site += list(vector_apo2holo[3*i_seq_num:3*i_seq_num+3])
##    ## calculate overlap
##    vector_apo2holo_site = numpy.array(vector_apo2holo_site)
##    eigenvector_site = numpy.array(eigenvector_site)
##    overlap_site = abs(
##        numpy.dot(eigenvector_site,vector_apo2holo_site)
##        /
##        math.sqrt(
##            numpy.dot(eigenvector_site,eigenvector_site)
##            *
##            numpy.dot(vector_apo2holo_site,vector_apo2holo_site)
##            )
##        )
##    if overlap_site > 0.8:
##        print vector_apo2holo_site
##        print eigenvector_site
##        print pdb
##        print l_residues

    position_ligand = sum(l_coords_ligand)/len(l_coords_ligand)

    n_atoms = len(l_coords_ligand)

    return position_ligand, chain, n_residues, n_atoms, ligand, overlap_site
def main():

    set_pdbs = exclude_include()
    l_pdbs_remove = [
        '4a3h','2wf5','1arl','1ee3', ## incorrect _struct_ref_seq.pdbx_db_accession
        '1uyd','1uye','1uyf','2byh','2byi', ## remediation _struct_ref_seq_dif
        '2xdu','3dn8','3dna','1ps3','1ouf','1l35','2eun','1rtc','1zon', ## _struct_ref_seq_dif missing
        '1pwl','1pwm','2fz8','2fz9', ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
        ]
    set_pdbs.remove('1f92') ## remediation _struct_ref_seq_dif incorrect residue number
    set_pdbs.remove('2f6f') ## remediation _pdbx_poly_seq_scheme.auth_mon_id wrong
    set_pdbs.remove('3a5j') ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be MET
    set_pdbs.remove('2rhx') ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be SER
    set_pdbs.remove('2fzb') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('2fzd') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('3dn5') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1x96') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1x97') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1x98') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1z3n') ## GenBank DBref - not an error...
    set_pdbs.remove('1z8a') ## GenBank DBref - not an error...
    set_pdbs.remove('1z89') ## GenBank DBref - not an error...
    set_pdbs.remove('2pf8') ## stupid use of alt_ids (C for highest occupancy and only altloc)
    set_pdbs.remove('2pyr') ## stupid use of alt_ids (G and R)
    set_pdbs.remove('3pdn') ## stupid use of alt_ids (B and C)
    set_pdbs.remove('2v4c') ## alt_id B used for 100% occupancy atoms
    set_pdbs.remove('1jxt') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxu') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxw') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxx') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxy') ## weird alt_id microheterogeneity...
##    set_pdbs.remove('1ac4') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('1ac8') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('1aeb') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('2rbt') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
##    set_pdbs.remove('2rbu') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
##    set_pdbs.remove('2rbv') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
    for pdb in l_pdbs_remove:
        set_pdbs.remove(pdb)

    fd = open('%s/bc-100.out' %(path_mmCIF),'r')
    lines = fd.readlines()
    fd.close()

    for i_line in range(len(lines)):
        cluster = i_line
        if cluster < 4816:
            continue
##        if cluster not in [5,]:
##            continue
        line = lines[i_line]
        l_pdbs = line.lower().split()
        l_pdbs.sort()
        for i_pdb in range(len(l_pdbs)):
            l_pdbs[i_pdb] = l_pdbs[i_pdb][:4]

        for i_pdb1 in range(0,len(l_pdbs)-1):

            pdb1 = l_pdbs[i_pdb1]

##            if pdb1 != '1t49': ## tmp!!!
##                continue

            if not pdb1 in set_pdbs:
                continue

            print pdb1
            stop

            d_mmCIF1 = parse_mmCIF.main(pdb1,)

            bool_monomeric = check_monomeric(d_mmCIF1)
            if bool_monomeric == False:
                if i_pdb1 == 0:
                    break
                else:
                    continue

            bool_remediation_modres = check_modres(d_mmCIF1,pdb1,)
            if bool_remediation_modres == True:
                continue

            if '_struct_ref_seq_dif.details' in d_mmCIF1.keys():
                if 'DELETION' in d_mmCIF1['_struct_ref_seq_dif.details']:
                    continue

            for i_entity in range(len(d_mmCIF1['_entity.id'])):
                if d_mmCIF1['_entity.type'][i_entity] == 'polymer':
                    if int(d_mmCIF1['_entity.pdbx_number_of_molecules'][i_entity]) != 1:
                        print d_mmCIF1['_entity.pdbx_number_of_molecules']
                        print pdb1, cluster
                        stop

            SG1 = d_mmCIF1['_symmetry.space_group_name_H-M']

            for i_pdb2 in range(i_pdb1+1,len(l_pdbs)):

                pdb2 = l_pdbs[i_pdb2]

##                if pdb2 != '2pf8': ## tmp!!!
##                    continue

##                if pdb1 != '3fui' or pdb2 != '3fuj':
##                    continue

                if not pdb2 in set_pdbs:
                    continue

                d_mmCIF2 = parse_mmCIF.main(pdb2,)

                bool_monomeric = check_monomeric(d_mmCIF2)
                if bool_monomeric == False:
                    continue

                bool_remediation_modres = check_modres(d_mmCIF2,pdb2,)
                if bool_remediation_modres == True:
                    continue

                if '_struct_ref_seq_dif.details' in d_mmCIF2.keys():
                    if 'DELETION' in d_mmCIF2['_struct_ref_seq_dif.details']:
                        continue

                ## biounit monomeric?
                bool_not_monomeric = False
                for i_entity in range(len(d_mmCIF2['_entity.id'])):
                    if d_mmCIF2['_entity.type'][i_entity] == 'polymer':
                        if int(d_mmCIF2['_entity.pdbx_number_of_molecules'][i_entity]) != 1:
                            bool_not_monomeric = True
                            break
                if bool_not_monomeric == True:
                    continue

                SG2 = d_mmCIF2['_symmetry.space_group_name_H-M']

                if SG1 != SG2:
                    continue

                ## parse coordinates again after being shortened in previous loop
                try:
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                except:
                    fd = open('remediation_atom_site.label_alt_id.txt','a')
                    fd.write('%s\n' %(pdb1,))
                    fd.close()
                    break ## pdb1 cannot be parsed; skip to the next pdb1
                try:
                    d_coords2, l_coords_alpha2 = mmCIF2coords.main(pdb2, d_mmCIF2)
                except:
                    fd = open('remediation_atom_site.label_alt_id.txt','a')
                    fd.write('%s\n' %(pdb2,))
                    fd.close()
                    continue ## pdb2 cannot be parsed; skip to the next pdb2

                ## align sequences/coordinates
                try:
                    l_coords_alpha1, l_coords_alpha2 = create_apo_holo_dataset.sequential_alignment_of_coordinates(
                        l_coords_alpha1, l_coords_alpha2,
                        d_mmCIF1, d_mmCIF2,
                        pdb1, pdb2,
                        )
                except:
                    fd = open('remediation_struct_ref_seq_dif.txt','a')
                    fd.write(
                        '%s %s %s %s\n' %(
                            pdb1,pdb2,
                            d_mmCIF1['_struct_ref_seq.pdbx_db_accession'],
                            d_mmCIF2['_struct_ref_seq.pdbx_db_accession'],
                            )
                        )
                    fd.close()
                    continue
                if len(l_coords_alpha1) != len(l_coords_alpha2):
                    print d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id']
                    print d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id']
                    print 'coords', len(l_coords_alpha1), len(l_coords_alpha2)
                    print 'seq', len(d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print 'seq', len(d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print pdb1, pdb2
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                    d_coords2, l_coords_alpha2 = mmCIF2coords.main(pdb2, d_mmCIF2)
                    print len(l_coords_alpha1), len(l_coords_alpha2)
                    stop
                    continue

                ##
                ## align structure 1 and 2
                ##
                instance_geometry = geometry.geometry()
                rmsd = instance_geometry.superpose(l_coords_alpha1,l_coords_alpha2)
                tv1 = instance_geometry.fitcenter
                rm = instance_geometry.rotation
                tv2 = instance_geometry.refcenter

                ## structural alignment
                for i_coord in range(len(l_coords_alpha2)):
                    l_coords_alpha2[i_coord] = numpy.dot(l_coords_alpha2[i_coord]-tv1,rm)+tv2

                ##
                ## vector from structure 1 to 2
                ##
                vector = []
                for i in range(len(l_coords_alpha1)):
                    vector += [
                        l_coords_alpha1[i][0]-l_coords_alpha2[i][0],
                        l_coords_alpha1[i][1]-l_coords_alpha2[i][1],
                        l_coords_alpha1[i][2]-l_coords_alpha2[i][2],
                        ]
                vector = numpy.array(vector)

                ##
                ## calculate normal modes of structure 1
                ##
                cutoff = 10
                try:
                    matrix_hessian1 = NMA.hessian_calculation(l_coords_alpha1, cutoff, verbose = False)
                    eigenvectors1, eigenvalues1 = NMA.diagonalize_hessian(matrix_hessian1, verbose = False)
                    matrix_hessian2 = NMA.hessian_calculation(l_coords_alpha2, cutoff, verbose = False)
                    eigenvectors2, eigenvalues2 = NMA.diagonalize_hessian(matrix_hessian2, verbose = False)
                except:
                    continue

                ##
                ## calculate overlap between normal modes and difference vector
                ##
                eigenvector1 = eigenvectors1[6]
                eigenvector2 = eigenvectors2[6]

                overlap1 = calc_overlap(eigenvector1,vector)
                overlap2 = calc_overlap(eigenvector2,vector)
                overlap3a = calc_overlap(eigenvector1,eigenvector2)
                overlap3b = calc_overlap(eigenvectors1[6],eigenvectors2[7])
                overlap3c = calc_overlap(eigenvectors1[7],eigenvectors2[6])
                overlap3 = max(overlap3a,overlap3b,overlap3c)

                fd = open('rmsd_v_overlap2/cluster%i.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap1))
                fd.close()
                fd = open('rmsd_v_overlap2/cluster%i.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap2))
                fd.close()
                fd = open('rmsd_v_overlap2/cluster%i_ev_v_ev.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap3a))
                fd.close()
                fd = open('rmsd_v_overlap2/cluster%i_ev_v_ev_max.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap3))
                fd.close()
                print pdb1, pdb2, 'cluster', i_line, 'size', len(l_pdbs),
                print 'overlap', '%4.2f' %(round(overlap1,2)), '%4.2f' %(round(overlap2,2)), '%4.2f' %(round(overlap3,2)), 'rmsd', '%4.2f' %(round(rmsd,2))

    return
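## calc_overlap() is used above but not defined in this snippet. A minimal sketch,
## assuming the usual definition of overlap between a normal mode and a conformational
## difference vector (absolute normalized dot product); only the name and call signature
## are taken from the calls above, the body itself is an assumption.
## (eigenvectors1[6] is used above because the first six modes of the Hessian are the
## zero-frequency rigid-body translations and rotations.)
def calc_overlap_sketch(eigenvector, vector,):

    import math
    import numpy

    overlap = abs(numpy.dot(eigenvector, vector))
    overlap /= math.sqrt(numpy.dot(eigenvector, eigenvector))
    overlap /= math.sqrt(numpy.dot(vector, vector))

    return overlap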
def main():

    fn_out = 'db_MatthewsCoefficient.txt'

    fd = open(fn_out,'r')
    lines = fd.readlines()
    fd.close()

    d = {}
    for line in lines:
        l = line.strip().split()
        pdb = l[0]
        v = l[1]
        if pdb == '2p51':
            v = '1.72610466393'
        d[pdb] = v

    lines_out = []

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print '%s/%s %s' %(i+1,len(l_dns), dn)
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d.keys():
                continue

            ## Matthews Coefficient not calculated...
            if pdb in [
                '1vh7','1vho','1vhu','1vi3','1vi4','1vis',
                ]:
                continue

            ## Matthews Coefficient *wrong*
            if pdb in [
                '2p51',
                ## too high
                '1c5v','1q9i','1ut6','1x6x','1x6y','1xdn','1y63','1zix',
                ## too low
                '1t95','1jih','1d5t','1c7k',
                '1dbo','1d9x','1qt9','1ia5','1dcq',
                ]:
                continue

            fd = open('%s/%s/%s' %(path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks = {
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id':'2',
##                    '_exptl.method':'SOLUTION NMR', ## break if e.g. _exptl.method = SOLUTION NMR
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id':',',
                    }, 
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details':'monomeric',
                    },
                l_data_categories = [
                    '_exptl_crystal',
                    ], ## parse selected data categories
                l_data_categories_break = ['_exptl_crystal']
                )

            ## some unknown temporary error... or break before reaching this part when parsing...
            if not '_pdbx_struct_assembly.oligomeric_details' in d_mmCIF.keys():
                continue

            ## NMR structure?
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                stop2
                continue

            ## no polymers in structure?
            if not '_entity_poly.entity_id' in d_mmCIF.keys():
                continue

            ## polymer(s) is/are not polypeptide(s)
            if d_mmCIF['_entity_poly.type'] != len(d_mmCIF['_entity_poly.type'])*['polypeptide(L)']:
                continue

            ## biounit not monomeric
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != len(d_mmCIF['_pdbx_struct_assembly.oligomeric_details'])*['monomeric']:
                continue

            ## one polymer in assymetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            if d_mmCIF['_exptl_crystal.density_Matthews'] == ['?']:
                v = VM = calc_matthews_coefficient.main(pdb)
##                continue
            else:
                v = float(''.join(d_mmCIF['_exptl_crystal.density_Matthews']))

            line = '%s %s\n' %(pdb,v,)

            fd = open(fn_out,'a')
            fd.write(line)
            fd.close()

            d[pdb] = v

    ##
    ## write calculated Matthews coefficients to file
    ##
    lines_out = []
    for pdb,v in d.items():
        line = '%s %s\n' %(pdb,v,)
        lines_out += [line]
    fd = open(fn_out,'w')
    fd.writelines(lines_out)
    fd.close()

    return
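## calc_matthews_coefficient.main()/matthews_coefficient.main() are called in this file
## but their body is not shown here. A minimal sketch, assuming the textbook definition
## V_M = V_cell/(Z*MW) in Angstrom**3/Da and the same triclinic cell volume expression
## that appears further below in this file; the helper name is hypothetical.
def matthews_coefficient_sketch(a, b, c, alpha, beta, gamma, mw, Z,):

    import math

    ## cell angles in radians
    alpha *= math.pi/180.
    beta *= math.pi/180.
    gamma *= math.pi/180.

    ## general (triclinic) unit cell volume
    V = a*b*c*math.sqrt(
        1.
        -math.cos(alpha)**2-math.cos(beta)**2-math.cos(gamma)**2
        +2*math.cos(alpha)*math.cos(beta)*math.cos(gamma)
        )

    ## Matthews coefficient (unit cell volume per Dalton of polymer)
    VM = V/(Z*mw)

    return VM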
def main():

    d_MV = {}

    path = '/data/mmCIF'
    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        l_fn = os.listdir('%s/%s' %(path,dn))
        for fn in l_fn:
            pdb = fn[:4]
##            if pdb.upper() not in s_pdbs:
##                continue
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':'SOLUTION NMR'},
                l_data_categories = [
                    '_cell','_entity','_exptl','_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                    ],
                )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if not '_entity_poly.type' in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            if d_mmCIF['_entity_poly.type'] != len(d_mmCIF['_entity_poly.type'])*['polypeptide(L)']:
                continue

            if not '_pdbx_struct_assembly.oligomeric_count' in d_mmCIF.keys():
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] == len(d_mmCIF['_pdbx_struct_assembly.oligomeric_count'])*['?']:
                continue
            
            ## virus
            if int(d_mmCIF['_pdbx_struct_assembly.oligomeric_count'][0]) % 60 == 0:
                continue

            ## not monomer
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] != len(d_mmCIF['_pdbx_struct_assembly.oligomeric_count'])*['1']:
                continue

            ## split structure
            if '_pdbx_database_related.content_type' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related.content_type']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related.content_type']:
                    print pdb
                    stop

            if not '_cell.Z_PDB' in d_mmCIF.keys():
                continue

            if pdb in [
                ## threshold
                '1e54','1e9i',
                ## difference between calculated MV and MV in mmCIF
                '3eiq',
                ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
                ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
                ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
                ## Toscana has published with Hellinga...
                '2cjf','2bt4',
                ]:
                continue

##            if not ''.join(d_mmCIF['_symmetry.space_group_name_H-M']) in [
##                'P 1','P 43 21 2','P 21 3','P 42 3 2','C 1 2 1','F 2 3','P 64 2 2','H 3',
##                ]:
##                continue ## tmp!!!

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
##                if d_mmCIF['_entity.type'][i] == 'polymer':
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a,b,c,alpha,beta,gamma,mw,Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in [
                'F 4 3 2',
                'F 41 3 2',
                'I 41 3 2',
                ]:
                continue ## tmp!!!

            if MV > 10:
                print pdb
                print 'mw', mw
                print 'MV', MV, d_mmCIF['_exptl_crystal.density_Matthews']
                print 'Z', Z
                import math
                alpha *= math.pi/180.
                beta *= math.pi/180.
                gamma *= math.pi/180.
                V = a*b*c*math.sqrt(1-math.cos(alpha)**2-math.cos(beta)**2-math.cos(gamma)**2+2*(math.cos(alpha)*math.cos(beta)*math.cos(gamma)))
                print 'V', V
                continue
                stop_treshold
                stop
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                if d_mmCIF['_exptl_crystal.density_Matthews'] not in [['?'],len(d_mmCIF['_exptl_crystal.density_Matthews'])*['?'],]:
                    if abs(MV-float(d_mmCIF['_exptl_crystal.density_Matthews'][0])) > 1:
                        print 'MV', MV
                        print 'MV', d_mmCIF['_exptl_crystal.density_Matthews']
                        print 'mw', mw
                        print 'Z', Z
                        continue
                        stop_difference


            if not spacegroup in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup] += [MV]

            print pdb, round(MV,2), spacegroup

##    fd = open('MV_v_spacegroup.txt','w')
##    fd.write(str(d_MV))
##    fd.close()

    l = ['# MV_average MV_stderr n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
##        l += ['%s %s %s %s\n' %(average,stddev,len(l_MV),spacegroup,)]
        l += ['%s %s %s %s\n' %(average,stderr,len(l_MV),spacegroup,)]

    fd = open('MV_v_spacegroup.txt','w')
    fd.writelines(l)
    fd.close()

    return
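## statistics.do_stddev()/do_stderr() are called above but not included in this snippet.
## A minimal sketch of what they presumably compute (sample standard deviation and
## standard error of the mean); the (average, spread) return order is inferred from the
## calls above, the implementations themselves are assumptions.
def do_stddev_sketch(l_values,):

    import math

    n = len(l_values)
    average = sum(l_values)/float(n)
    SS = sum([(value-average)**2 for value in l_values])
    stddev = math.sqrt(SS/(n-1))

    return average, stddev

def do_stderr_sketch(l_values,):

    import math

    average, stddev = do_stddev_sketch(l_values)
    stderr = stddev/math.sqrt(len(l_values))

    return average, stderr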
Example #19
def main():

    fd = open('db_authors.txt','r')
    lines = fd.readlines()
    fd.close()

    d_authors = {}
    for line in lines:
        l = line.strip().split()
        pdb = l[0]
        s_authors = l[1:]
        d_authors[pdb] = s_authors

    lines_out = []

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print '%s/%s %s' %(i+1,len(l_dns), dn)
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d_authors.keys():
                continue

            print pdb

            fd = open('%s/%s/%s' %(path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                l_data_categories = [
                    '_audit_author',
                    '_citation_author',
                    ], ## parse selected data categories
                l_data_categories_break = [
                    '_citation_author',
                    ],
                )

            l_authors = d_mmCIF['_audit_author.name']
            s_authors = ';'.join(l_authors)

            if d_mmCIF['_audit_author.name'] == []:
                print d_mmCIF['_citation_author.name']
                print d_mmCIF['_audit_author.name']
                stop

            line = '%s %s\n' %(pdb,s_authors,)
            lines_out += [line]

            fd = open('db_authors.txt','a')
            fd.write(line)
            fd.close()

            d_authors[pdb] = s_authors

    ##
    ## write to file
    ##
    lines_out = []
    for pdb,s_authors in d_authors.items():
        line = '%s %s\n' %(pdb,s_authors,)
        lines_out += [line]
    fd = open('db_authors.txt','w')
    fd.writelines(lines_out)
    fd.close()

    return
def main():

    fd = open('remediation_negativeBiso.txt','r')
    lines = fd.readlines()
    fd.close()
    l_pdbs = []
    for line in lines:
        if line.strip() == '':
            continue
        if line[0] == '#':
            continue
        l = line.strip().split()
        pdb = l[0]
        l_pdbs += [pdb]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print '%s/%s %s' %(i+1,len(l_dns), dn)
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if not pdb in l_pdbs:
                continue

            print pdb

            fd = open('%s/%s/%s' %(path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    },
                l_data_categories = [
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_computing',
                    '_atom_site',
                    '_refine'
                    ],
                )

##            ## no polymers in structure?
##            if not '_entity_poly.entity_id' in d_mmCIF.keys():
##                continue

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            print pdb

            ##
            ## parse bfactors
            ##
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                bfactor = float(d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])

##                if bfactor == '?':
##                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]

                if float(bfactor) < -0.01:
                    if (
                        element != 'H'
                        and
                        comp_id in ['ALA','CYS','ASP','GLU','PHE','GLY','HIS','ILE','LYS','MET','ASN','PRO','GLN','ARG','SER','THR','VAL','TRP','TYR',]
                        ):

                        print
                        print 'negative'
                        print

                        year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
                        atom_id = int(d_mmCIF['_atom_site.id'][i_atom_site])
                        refinement = ''.join(d_mmCIF['_computing.structure_refinement'])
                        solution = ''.join(d_mmCIF['_computing.structure_solution'])
                        resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))
                        
                        fd = open('remediation_negativeBiso.txt','a')
                        fd.write(
##                            '%4s %4i %4i %3s %2s %6.2f %30s %20s\n' %(
                            '%4s\t%4i\t%4i\t%3s\t%2s\t%6.2f\t%6.2f\t%30s\t%20s\n' %(
                                pdb,year,atom_id,
                                comp_id,element,bfactor,resolution,
                                solution.ljust(30),refinement.ljust(20),
                                )
                            )
                        fd.close()
                        break

    return
Example #21
def parse_dihedrals():

    import sys

    path = '/data/mmCIF'

    d_phipsi_res = {
        'ALA': [],
        'CYS': [],
        'ASP': [],
        'GLU': [],
        'PHE': [],
        'GLY': [],
        'HIS': [],
        'ILE': [],
        'LYS': [],
        'LEU': [],
        'MET': [],
        'ASN': [],
        'PRO': [],
        'GLN': [],
        'ARG': [],
        'SER': [],
        'THR': [],
        'VAL': [],
        'TRP': [],
        'TYR': [],
        'prePRO': [],
        'prePRO_notGLY': [],
        'prePRO_GLY': [],
        'cisPro': [],
        'transPro': [],
        'all_notgly_notpro_notprepro': [],
    }

    d_phipsi_ss = {
        'sheet': [],  ## _struct_sheet_order.sense
        ##_struct_conf.pdbx_PDB_helix_class
        'helix_alpha': [],  ## i+4 # 1
        'helix_pi': [],  ## i+5 # 3
        'helix_310': [],  ## i+3 # 5
        'Turn': [],  ## i+?
        ##
        'turns_notgly_notpro_notprepro': [],
    }

    d_counts = {
        'cisProALA': 0,
        'cisProCYS': 0,
        'cisProASP': 0,
        'cisProGLU': 0,
        'cisProPHE': 0,
        'cisProGLY': 0,
        'cisProHIS': 0,
        'cisProILE': 0,
        'cisProLYS': 0,
        'cisProLEU': 0,
        'cisProMET': 0,
        'cisProASN': 0,
        'cisProPRO': 0,
        'cisProGLN': 0,
        'cisProARG': 0,
        'cisProSER': 0,
        'cisProTHR': 0,
        'cisProVAL': 0,
        'cisProTRP': 0,
        'cisProTYR': 0,
        'cisPro_helix': 0,
        'cisPro_sheet': 0,
        'cisPro_turn': 0,
        'cisPro_random': 0,
    }

    l_dn = os.listdir(path)
    l_dn.sort()
    l_dn.remove('mmCIF.py')
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        print '*', dn
        l_fn = os.listdir('%s/%s' % (
            path,
            dn,
        ))
        l_fn.sort()
        for fn in l_fn:
            pdb = fn[:4]
            print pdb
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': ['SOLUTION NMR']},
                l_data_categories=[
                    '_exptl',
                    '_refine',
                    '_struct_conf',  ## HELIX
                    '_struct_sheet_range',  ## SHEET
                    '_entity',
                    '_entity_poly',
                    '_entity_poly_seq',
                    '_atom_site',
                ],
            )

            ## skip non-crystallographic structures (NMR, powder diffraction, EM)
            if ''.join(d_mmCIF['_exptl.method']) in [
                    'SOLUTION NMR',
                    'POWDER DIFFRACTION',
                    'ELECTRON MICROSCOPY',
            ]:
                continue

            if not '_refine.ls_d_res_high' in d_mmCIF.keys():
                print d_mmCIF['_exptl.method']
                continue

            ## skip if multiple resolutions
            if len(d_mmCIF['_refine.ls_d_res_high']) > 1:
                continue

            ## skip if no resolution
            if ''.join(d_mmCIF['_refine.ls_d_res_high']) == '?':
                continue

            ## skip low resolution structures
            if float(''.join(d_mmCIF['_refine.ls_d_res_high'])) > 2:
                continue

            if not 'polymer' in d_mmCIF['_entity.type']:
                continue
            if not '_entity_poly.type' in d_mmCIF.keys():  ## e.g. 1hhu
                continue
            if d_mmCIF['_entity_poly.type'] == [
                    'polydeoxyribonucleotide/polyribonucleotide hybrid'
            ]:
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide']:
                continue

            d_sequence = {}
            for i_entity_poly_seq in range(
                    len(d_mmCIF['_entity_poly_seq.entity_id'])):
                entity_id = int(
                    d_mmCIF['_entity_poly_seq.entity_id'][i_entity_poly_seq])
                if not entity_id in d_sequence.keys():
                    d_sequence[entity_id] = []
                res_no = int(
                    d_mmCIF['_entity_poly_seq.num'][i_entity_poly_seq])
                res_name = d_mmCIF['_entity_poly_seq.mon_id'][
                    i_entity_poly_seq]
                d_sequence[entity_id] += [{
                    'res_no': res_no,
                    'res_name': res_name,
                }]

            l_entities_poly = []
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                ## skip if not polypeptide
                entity_poly_type = d_mmCIF['_entity_poly.type'][i_entity_poly]
                if entity_poly_type != 'polypeptide(L)':
                    continue
                ## skip if nonstd linkages
                if d_mmCIF['_entity_poly.nstd_linkage'][
                        i_entity_poly] == 'yes':
                    print pdb
                    stop
                    continue
                ## parse entity_id and chains
                entity_id = int(
                    d_mmCIF['_entity_poly.entity_id'][i_entity_poly])
                l_entities_poly += [entity_id]
            ## skip if no polypeptide chains
            if l_entities_poly == []:
                continue

            d_coords = {}
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                entity_id = int(
                    d_mmCIF['_atom_site.label_entity_id'][i_atom_site])
                ## not a polymer
                if not entity_id in l_entities_poly:
                    continue
                ## polymer, append
                elif not entity_id in d_coords.keys():
                    d_coords[entity_id] = {}

                model = int(
                    d_mmCIF['_atom_site.pdbx_PDB_model_num'][i_atom_site])
                if model > 1:
                    continue

                chain = d_mmCIF['_atom_site.label_asym_id'][i_atom_site]
                if not chain in d_coords[entity_id].keys():
                    d_coords[entity_id][chain] = {}
                res_no = int(d_mmCIF['_atom_site.label_seq_id'][i_atom_site])
                if not res_no in d_coords[entity_id][chain].keys():
                    d_coords[entity_id][chain][res_no] = {}
                atom_name = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]

                altloc = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if altloc not in [
                        '.',
                        'A',
                        '1',
                ]:
                    continue

                ## skip if zero occupancy
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if altloc == '.' and occupancy == 0:
                    continue

                if atom_name in [
                        'CA',
                        'C',
                        'O',
                        'N',
                ] and atom_name in d_coords[entity_id][chain][res_no].keys():
                    print pdb, chain, res_no, atom_name
                    print d_mmCIF['_atom_site.Cartn_x'][i_atom_site], d_mmCIF[
                        '_atom_site.Cartn_y'][i_atom_site]
                    print d_coords[entity_id][chain][res_no][atom_name]
                    stop
                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                coord = numpy.array([
                    x,
                    y,
                    z,
                ])
                d_coords[entity_id][chain][res_no][atom_name] = coord

            d_helices = {}
            ## helices or turns present?
            if '_struct_conf.id' in d_mmCIF.keys():
                for i_struct_conf in range(len(d_mmCIF['_struct_conf.id'])):
                    chain1 = d_mmCIF['_struct_conf.beg_label_asym_id'][
                        i_struct_conf]
                    chain2 = d_mmCIF['_struct_conf.end_label_asym_id'][
                        i_struct_conf]
                    res_no1 = int(d_mmCIF['_struct_conf.beg_label_seq_id']
                                  [i_struct_conf])
                    res_no2 = int(d_mmCIF['_struct_conf.end_label_seq_id']
                                  [i_struct_conf])
                    conf_type_id = d_mmCIF['_struct_conf.conf_type_id'][
                        i_struct_conf]
                    if chain1 != chain2:
                        print chain1, chain2, pdb
                        stop
                    if conf_type_id == 'HELX_P':
                        helix_class = int(
                            d_mmCIF['_struct_conf.pdbx_PDB_helix_class']
                            [i_struct_conf])
                    elif conf_type_id == 'TURN_P':
                        helix_class = 99
                    else:
                        print conf_type_id
                        print pdb
                        stop
                    l_res_nos = range(
                        res_no1,
                        res_no2 + 1,
                    )
                    if not chain1 in d_helices.keys():
                        d_helices[chain1] = {}
                    for res_no in l_res_nos:
                        d_helices[chain1][res_no] = helix_class

            d_sheets = {}
            ## sheet present?
            if '_struct_sheet_range.sheet_id' in d_mmCIF.keys():
                for i_struct_sheet_range in range(
                        len(d_mmCIF['_struct_sheet_range.sheet_id'])):
                    chain1 = d_mmCIF['_struct_sheet_range.beg_label_asym_id'][
                        i_struct_sheet_range]
                    chain2 = d_mmCIF['_struct_sheet_range.end_label_asym_id'][
                        i_struct_sheet_range]
                    res_no1 = int(
                        d_mmCIF['_struct_sheet_range.beg_label_seq_id']
                        [i_struct_sheet_range])
                    res_no2 = int(
                        d_mmCIF['_struct_sheet_range.end_label_seq_id']
                        [i_struct_sheet_range])
                    l_res_nos = range(
                        res_no1,
                        res_no2 + 1,
                    )
                    if chain1 != chain2:
                        print chain1, chain2, pdb
                        stop
                    if not chain1 in d_sheets.keys():
                        d_sheets[chain1] = []
                    for res_no in l_res_nos:
                        d_sheets[chain1] += l_res_nos

            for entity_id in l_entities_poly:
                for chain in d_coords[entity_id].keys():
                    ## skip if short peptide (e.g. 13gs)
                    if len(d_sequence[entity_id]) <= 3:
                        continue
                    for i_res_no in range(1, len(d_sequence[entity_id]) - 1):
                        res_no_prev = int(d_sequence[entity_id][i_res_no -
                                                                1]['res_no'])
                        res_no = int(d_sequence[entity_id][i_res_no]['res_no'])
                        res_no_next = int(d_sequence[entity_id][i_res_no +
                                                                1]['res_no'])
                        res_name = d_sequence[entity_id][i_res_no]['res_name']
                        if res_name == 'MSE':
                            res_name = 'MET'
                        res_name_next = d_sequence[entity_id][i_res_no +
                                                              1]['res_name']

                        ## not a standard residue
                        if not res_name in d_phipsi_res.keys():
                            continue

                        ## residue not observed
                        if not res_no_prev in d_coords[entity_id][chain].keys(
                        ):
                            continue
                        if not res_no in d_coords[entity_id][chain].keys():
                            continue
                        if not res_no_next in d_coords[entity_id][chain].keys(
                        ):
                            continue

                        ## atom not observed
                        if not 'C' in d_coords[entity_id][chain][res_no_prev]:
                            continue
                        if not 'N' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'CA' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'C' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'N' in d_coords[entity_id][chain][res_no_next]:
                            continue

                        C_prev = d_coords[entity_id][chain][res_no_prev]['C']
                        N = d_coords[entity_id][chain][res_no]['N']
                        CA = d_coords[entity_id][chain][res_no]['CA']
                        C = d_coords[entity_id][chain][res_no]['C']
                        N_next = d_coords[entity_id][chain][res_no_next]['N']
                        phi = calc_dihedral(
                            C_prev,
                            N,
                            CA,
                            C,
                        )
                        psi = calc_dihedral(
                            N,
                            CA,
                            C,
                            N_next,
                        )

                        if 'CA' in d_coords[entity_id][chain][
                                res_no_prev].keys():
                            CA_prev = d_coords[entity_id][chain][res_no_prev][
                                'CA']
                            omega = calc_dihedral(
                                CA_prev,
                                C_prev,
                                N,
                                CA,
                            )
                        else:
                            omega = None

                        ## classify the peptide bond from omega
                        ## (borderline cases: 12e8 PRO44D, 12e8 PRO196D, 1a44 GLU82A)
                        if omega is None:
                            pass
                        elif abs(omega) <= 30:
                            omega = 'cis'
                        elif abs(omega) >= 150:
                            omega = 'trans'
                        else:
                            ## distorted omega; leave unclassified
                            omega = None

                        bool_helix = False
                        if chain in d_helices.keys():
                            if res_no in d_helices[chain].keys():
                                bool_helix = True
                                helix_class = d_helices[chain][res_no]

                        bool_sheet = False
                        if chain in d_sheets.keys():
                            if res_no in d_sheets[chain]:
                                bool_sheet = True


##                        if bool_helix == True and bool_sheet == True and helix_class != 99:
##                            print pdb, chain, res_no, 'sheet and helix'
####                            stop

                        if res_name_next == 'PRO':
                            d_phipsi_res['prePRO'] += [[
                                phi,
                                psi,
                            ]]
                            if res_name != 'GLY':
                                d_phipsi_res['prePRO_notGLY'] += [[
                                    phi,
                                    psi,
                                ]]
                            else:
                                d_phipsi_res['prePRO_GLY'] += [[
                                    phi,
                                    psi,
                                ]]
                        else:
                            d_phipsi_res[res_name] += [[
                                phi,
                                psi,
                            ]]
                            if res_name not in [
                                    'GLY',
                                    'PRO',
                            ]:
                                d_phipsi_res[
                                    'all_notgly_notpro_notprepro'] += [[
                                        phi,
                                        psi,
                                    ]]
                            elif res_name == 'PRO' and omega:
                                d_phipsi_res['%sPro' % (omega)] += [[
                                    phi,
                                    psi,
                                ]]
                                if omega == 'cis':
                                    d_counts['cisPro%s' % (res_name)] += 1
                                    if bool_helix == True:
                                        if helix_class == 1:
                                            d_counts['cisPro_helix'] += 1
                                        elif helix_class == 99:
                                            d_counts['cisPro_turn'] += 1
                                    elif bool_sheet == True:
                                        d_counts['cisPro_sheet'] += 1
                                    else:
                                        d_counts['cisPro_random'] += 1

                        if bool_helix == True:
                            ##                            if helix_class not in [1,3,5,99,]:
                            ##                                print pdb, chain, res_no, helix_class
                            ##                                print 'unexpected helix class'
                            ####                                stop_helix_class
                            if helix_class == 1:
                                d_phipsi_ss['helix_alpha'] += [[
                                    phi,
                                    psi,
                                ]]
                            elif helix_class == 3:
                                d_phipsi_ss['helix_pi'] += [[
                                    phi,
                                    psi,
                                ]]
                            elif helix_class == 5:
                                d_phipsi_ss['helix_310'] += [[
                                    phi,
                                    psi,
                                ]]
                            elif helix_class == 99:
                                d_phipsi_ss['Turn'] += [[
                                    phi,
                                    psi,
                                ]]
                                if (res_name_next != 'PRO'
                                        and res_name not in [
                                            'GLY',
                                            'PRO',
                                        ]):
                                    d_phipsi_ss[
                                        'turns_notgly_notpro_notprepro'] += [[
                                            phi,
                                            psi,
                                        ]]
                        if bool_sheet == True:
                            d_phipsi_ss['sheet'] += [[
                                phi,
                                psi,
                            ]]

    l = []
    for k in d_counts.keys():
        count = d_counts[k]
        l += ['%s %s\n' % (
            k,
            count,
        )]
    fd = open('count.txt', 'w')
    fd.writelines(l)
    fd.close()

    return d_phipsi_res, d_phipsi_ss
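## calc_dihedral() is called repeatedly above but not defined in this snippet. A minimal
## sketch of a standard four-point dihedral in degrees (range -180..180, which matches
## the omega/phi/psi thresholds used above); the atan2 formulation and the helper name
## are assumptions, not necessarily the original implementation.
def calc_dihedral_sketch(c1, c2, c3, c4,):

    import math
    import numpy

    v1 = c2-c1
    v2 = c3-c2
    v3 = c4-c3

    ## normals of the planes spanned by (v1,v2) and (v2,v3)
    n1 = numpy.cross(v1, v2)
    n2 = numpy.cross(v2, v3)

    ## atan2 formulation is numerically stable near 0 and 180 degrees
    x = numpy.dot(n1, n2)
    y = numpy.dot(numpy.cross(n1, n2), v2/math.sqrt(numpy.dot(v2, v2)))
    dihedral = math.atan2(y, x)*180./math.pi

    return dihedral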
def main():

    l_pdbs = []
    fd = open('Biso_v_resolution.gnuplotdata','r')
    lines = fd.readlines()
    fd.close()
    for line in lines:
        l = line.split()
        resolution = float(l[1])
        Biso = float(l[0])
        if resolution > 3.5 and Biso < 10:
            print line
        if resolution > 2.5 and Biso < 10:
            print line
        if resolution > 2.0 and Biso < 5:
            print line
##        if resolution > 1.5 and Biso < 5:
##            print line
        pdb = l[2]
        l_pdbs += [pdb]

    Biso_average_prev = 0

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' %(path,dn)):
            continue
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]

            if pdb in l_pdbs:
                continue

            if pdb in [
                '3bfn', ## PISA left out chains from biological unit
                '2jjg','1qjb', ## _pdbx_struct_assembly missing
                ]:
                continue

            ##
            ## parse header
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories = [
                    '_pdbx_struct_assembly',
                    '_entity_poly',
                    '_citation',
                    '_pdbx_database_related',
                    ], ## parse selected data categories
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    }
                )
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if '_pdbx_database_related.content_type' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related.content_type']:
                    continue

            try:
                if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != ['monomeric']:
                    continue
            except:
                print pdb
                stop

            if not '_citation.id' in d_mmCIF.keys():
                continue

            ##
            ## parse coordinate section
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories = [
                    '_database_PDB_rev',
                    '_refine',
                    '_refine_hist',
                    '_atom_site',
                    '_software',
                    '_entity','_entity_poly',
                    '_pdbx_struct_assembly',
                    '_pdbx_database_status',
                    ], ## parse selected data categories
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    }
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != ['monomeric']:
                continue

            resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

            if (
                int(d_mmCIF['_entity.pdbx_number_of_molecules'][0]) != 1
                or
                len(d_mmCIF['_entity_poly.pdbx_strand_id']) > 1
                or
                len(''.join(d_mmCIF['_entity_poly.pdbx_strand_id']).split(',')) > 1
                or
                len(d_mmCIF['_entity_poly.entity_id']) > 1
                ):
                print pdb
                print d_mmCIF['_entity.pdbx_number_of_molecules']
                print d_mmCIF['_entity_poly.pdbx_strand_id']
                stop

            entity_poly_id = ''.join(d_mmCIF['_entity_poly.entity_id'])
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                entity_poly_id = d_mmCIF['_entity_poly.entity_id'][i_entity_poly]
                entity_poly_type = d_mmCIF['_entity_poly.type'][i_entity_poly]

            l_Biso = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if occupancy != 1:
                    continue
                alt_id = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if alt_id != '.':
                    continue
                entity_id = d_mmCIF['_atom_site.label_entity_id'][i_atom_site]
                if entity_id != entity_poly_id:
                    continue
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]
                if not comp_id in ['MSE','ALA','CYS','ASP','GLU','PHE','GLY','HIS','ILE','LYS','LEU','MET','ASN','PRO','GLN','ARG','SER','THR','VAL','TRP','TYR',]:
                    continue
                type_symbol = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                if type_symbol == 'H':
                    continue
                atom_id = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]
                if not atom_id in ['N','CA','C',]:
                    continue

                Biso = float(d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])
                l_Biso += [Biso]

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])

            if len(l_Biso) == 0:
                continue

##            if l_Biso == len(l_Biso)*[l_Biso[0]]:
##                print pdb, year, l_Biso[0:3]
##                if year >= 2010:
##                    stop
##                continue

            Biso_average = sum(l_Biso)/len(l_Biso)

            bool_continue = False
            for Biso in set(l_Biso):
                count = l_Biso.count(Biso)
                if count > 20:
                    if '_software.name' in d_mmCIF.keys():
                        print pdb, Biso_average, Biso, count, d_mmCIF['_software.name']
                        s = '%s %6.2f %4i %6.2f %4i %s %s\n' %(
                            pdb,Biso,count,Biso_average,year,site,
                            str(d_mmCIF['_software.name']),
                            )
                    else:
                        print pdb, Biso_average, Biso, count
                        s = '%s %6.2f %4i %6.2f %4i %s\n' %(
                            pdb,Biso,count,Biso_average, year, site,
                            )
                    bool_continue = True
                    fd = open('remediation_Biso_duplicates.txt','a')
                    fd.write(s)
                    fd.close()
                    break
            if bool_continue == True:
                continue

##            if Biso_average in [2,3,4,5,6,7,8,9,99,90,50,20,25,1,100,10,0]:
            if Biso_average in range(0,100+1):
                print l_Biso
                print Biso_average
                print pdb
                print year
                stop

            if '_refine.pdbx_TLS_residual_ADP_flag' in d_mmCIF.keys():
                if ''.join(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in ['UNVERIFIED','LIKELY RESIDUAL',]:
                    continue
                elif ''.join(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in ['?',]:
                    pass
                else:
                    print d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']
                    print pdb, Biso_average
                    stop

            if round(Biso_average,4) == round(Biso_average_prev,4):
                print pdb, Biso_average, Biso_average_prev
                stop
            Biso_average_prev = Biso_average

            print pdb, round(Biso_average,2), resolution
            fd = open('Biso_v_resolution.gnuplotdata','a')
            fd.write('%s %s %s %s\n' %(Biso_average,resolution,pdb,year,))
            fd.close()

    plot()
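## plot() is called above but not defined in this snippet. A minimal sketch, assuming
## gnuplot is installed and Biso_v_resolution.gnuplotdata holds the four columns written
## above (Biso_average, resolution, pdb, year); the output file name and plot settings
## are assumptions.
def plot_sketch():

    import os

    lines = [
        'set terminal postscript\n',
        'set output "Biso_v_resolution.ps"\n',
        'set xlabel "resolution (Angstrom)"\n',
        'set ylabel "average backbone Biso"\n',
        'plot "Biso_v_resolution.gnuplotdata" u 2:1 pt 7 ps 1 t ""\n',
        ]
    fd = open('Biso_v_resolution.plt','w')
    fd.writelines(lines)
    fd.close()
    os.system('gnuplot Biso_v_resolution.plt')

    return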
Example #23
import matthews_coefficient, parse_mmCIF

for pdb in [
        '2hhb',
        '1hho',
        '1hv4',
        '3hl9',
        '3hlb',
        '3hlc',
        '3hld',
        '3hle',
        '3hlf',
        '3hlg',
]:

    d_mmCIF = parse_mmCIF.main(pdb)

    a = float(d_mmCIF['_cell.length_a'][0])
    b = float(d_mmCIF['_cell.length_b'][0])
    c = float(d_mmCIF['_cell.length_c'][0])
    alpha = float(d_mmCIF['_cell.angle_alpha'][0])
    beta = float(d_mmCIF['_cell.angle_beta'][0])
    gamma = float(d_mmCIF['_cell.angle_gamma'][0])
    Z = int(d_mmCIF['_cell.Z_PDB'][0])  ## number of polymers in unit cell
    mw = 0
    for i in range(len(d_mmCIF['_entity.id'])):
        if d_mmCIF['_entity.type'][i] == 'polymer':
            mw += float(d_mmCIF['_entity.formula_weight'][i])
    MV = matthews_coefficient.main(
        a,
        b,
        c,
        alpha,
        beta,
        gamma,
        mw,
        Z,
    )
Example #24
    l_fn = os.listdir('%s/%s' % (
        path,
        dn,
    ))
    l_fn.sort()
    for fn in l_fn:
        pdb = fn[:4]
        if fn[-3:] == '.gz':
            continue
########        if pdb in ['2fl9','3gau','3gav','3gaw',]: ## tmp!!!
########            continue
##        print pdb
        fd = open('%s/%s/%s' % (path, dn, fn), 'r')
        lines = fd.readlines()
        fd.close()
        d = parse_mmCIF.main(
            pdb,
            lines,
            l_data_categories=l_data_categories,
            d_breaks=d_breaks,
        )

        if d_exclude_subset:
            bool_continue = False
            for item_exclude, l_values_exclude in d_exclude_subset.items():
                if not item_exclude in d.keys():
                    bool_continue = True
                    fd = open('%s/remediation_%s.txt' % (
                        path,
                        item_exclude,
                    ), 'a')
                    fd.write('%s\n' % (pdb))
                    fd.close()
                    continue
Example #25
def parse_cifs(
    l_pdbs,
    ref_seq, l_db_codes,
    n_mutations_max,
    resolution_min,
    bool_multiple_entities = False,
    ):

    print 'parse cifs'

    n_mutants = 0
    l_wts = []
    l_wts_cysfree = []
    d_mutants = {}

    d_mmCIF_main = {}
    for pdb in l_pdbs:

        if pdb[:4].lower() in d_mmCIF_main.keys():
            continue

        d_mmCIF = parse_mmCIF.main(pdb[:4].lower(),)

        ## not an x-ray structure
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            print pdb, d_mmCIF['_exptl.method']
            continue

        ## more than one type of polymer present
        n_entities = len(d_mmCIF['_entity_poly.entity_id'])
        if bool_multiple_entities == False:
            if n_entities > 1:
                print pdb, 'entities', n_entities #, d_mmCIF['_struct.title']
                continue

        ## _refine and _refine_hist should report the same resolution
        if d_mmCIF['_refine.ls_d_res_high'] != d_mmCIF['_refine_hist.d_res_high']:
            print d_mmCIF['_refine.ls_d_res_high']
            print d_mmCIF['_refine_hist.d_res_high']
            stop
        ## low resolution
        if resolution_min:
##            if float(d_mmCIF['_refine.ls_d_res_high'][0]) >= resolution_min:
            if float(d_mmCIF['_refine.ls_d_res_high'][0]) > resolution_min:
                print pdb, 'resolution', d_mmCIF['_refine.ls_d_res_high']
                continue

        ## get entity ID from chain ID
        for i_entity in range(len(d_mmCIF['_entity_poly.entity_id'])):
            entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
            s_chain_ids = d_mmCIF['_entity_poly.pdbx_strand_id'][i_entity]
            if pdb[-1] in s_chain_ids:
                break
        if pdb[-1] not in s_chain_ids:
            print pdb
            print s_chain_ids
            stop
        ## get sequence from entity ID
        seq = []
        for i in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
            if d_mmCIF['_entity_poly_seq.entity_id'][i] == entity_id:
                mon_id = d_mmCIF['_entity_poly_seq.mon_id'][i]
                if pdb[:4] == '1RCM' and i == 126:
                    if mon_id != 'CYS':
                        stop
                    mon_id = 'CCS'
                seq += [mon_id]

        ## wrong chain length
        if ref_seq:
            if len(seq) != len(ref_seq):
                if ''.join(ref_seq) in ''.join(seq):
                    print ref_seq
                    print seq
                    stop
                ## unobserved atoms not in seqres
                elif ''.join(seq) in ''.join(ref_seq):
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and pdb in [
                    '1KS3_A','1KW5_A','1KW7_A','1KY0_A','1KY1_A','1L0J_A','1LOK_A','1LPY_A','1LW9_A','1LWG_A','1LWK_A',
                    ]:
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and seq[-1] == 'LYS':
                    pass
                else:
                    print pdb, 'seqlen', len(seq)
                    continue

        ## not from Gallus gallus
        ## check not necessary, because sequence checked against ref seq
        entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
        db_code = d_mmCIF['_struct_ref.db_code'][d_mmCIF['_struct_ref.entity_id'].index(entity_id)]
        if db_code not in l_db_codes:
            print pdb, 'uniprot', db_code
            continue

        ## more than 1 mutation?
        if n_mutations_max != None:
            l_mutations = []
            for i_seq in range(len(seq)):
                res_id_mmCIF = seq[i_seq]
                res_id_uniprot = ref_seq[i_seq]
                if res_id_mmCIF != res_id_uniprot:
                    l_mutations += ['%3s%i%3s' %(res_id_uniprot,i_seq+1,res_id_mmCIF,)]
##            if len(l_mutations) == 1:
            if len(l_mutations) > n_mutations_max:
                print pdb, 'muts', len(l_mutations)
                continue
            elif len(l_mutations) > 0:
                n_mutants += 1
                startmodel = parse_mmCIF_item(d_mmCIF,'_refine.pdbx_starting_model',pdb,)
                    

        ## append to lists and dictionaries
        d_mmCIF_main[pdb[:4]] = d_mmCIF
        if len(l_mutations) > 0:
            if l_mutations == ['CYS54THR', 'CYS97ALA']:
                l_wts_cysfree += [pdb]
            d_mutants[pdb] = {'mutations':l_mutations,'startmodel':startmodel}
        else:
            l_wts += [pdb]

##    print 'd_mutants', d_mutants

    return d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree
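## parse_mmCIF_item() is called above but not defined in this snippet. A minimal sketch
## that returns a single scalar value for an mmCIF item from the parsed dictionary and
## tolerates missing items and '?'/'.' placeholders; the exact behaviour of the original
## helper is an assumption.
def parse_mmCIF_item_sketch(d_mmCIF, item, pdb,):

    if not item in d_mmCIF.keys():
        return None

    value = ''.join(d_mmCIF[item])
    if value in ['?','.',]:
        return None

    return value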
示例#26
l_pdbs = [
    '1u3fA',
    '1agyA',
    '1zioA',
    '1pa9A',
    '2tpsA',
    '2plcA',
    '1qk2A',
    '1j53A',
    '1m21A',
]

cutoff = 10

for pdb in l_pdbs:

    chain = pdb[4:]
    pdb = pdb[:4]

    d = parse_mmCIF.main(pdb, )

    d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain=chain)

    matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose=False)

    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian,
                                                        verbose=False)

    visualization.vmd_arrows(pdb, l_coords, eigenvectors)

    print pdb
    stop

l_pdbs = [
    '1czfA', '1thgA', '1booA', '1iu4A', '1bqcA', '206lA', '1cdeA', '1snzA',
    '1gq8A', '1aqlA', '1ps1A', '1s95A', '1pylA', '1ra2A', '1b6bA', '1pntA',
    '1e1aA', '2f9rA', '1v04A', '2nlrA', '1n29A', '1pbgA', '5cpaA', '1agmA',
    '1byaA', '1r76A', '1u5uA', '1vidA', '1h4gA', '1akdA', '1fy2A', '1xqdA',
    '1d6oA', '1qv0A', '1qjeA', '1fvaA', '1bp2A', '1ah7A', '2pthA', '2engA',
    '2acyA', '1qazA', '2a0nA', '1dl2A', '1gp5A', '1onrA', '1cwyA', '1pudA',
    '1bs9A', '1dinA', '1xyzA', '1bwlA', '1eugA', '1idjA', '1g24A', '1oygA',
    '1hzfA', '9papA', '1eb6A', '1ghsA', '1rbnA', '1bixA', '1bs4A', '1celA',
    '1hkaA', '1b02A', '1qibA', '1u3fA', '1agyA', '1zioA', '1pa9A', '2tpsA',
    '2plcA', '1qk2A', '1j53A', '1m21A',
    ]

cutoff = 10

for pdb in l_pdbs:

    chain = pdb[4:]
    pdb = pdb[:4]

    d = parse_mmCIF.main(pdb,)

    d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain = chain)

    matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose = False)

    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, verbose = False)

    visualization.vmd_arrows(pdb, l_coords, eigenvectors)

    print pdb
    stop
def unobs_nonterminal_atoms_alpha():

    ## this method is not entirely correct... e.g. 1kwr...

    category = fn = '_pdbx_unobs_or_zero_occ_atoms'

    fd = open('%s/list%s.txt' %(path,fn))
    s = fd.read()
    fd.close()
    l_pdbs_include = s.split()

    ## if a whole residue is missing, then all of its atoms are also missing
    fd = open('%s/list_pdbx_unobs_residues__NONTERMINAL.txt' %(path))
    s = fd.read()
    fd.close()
    l_pdbs_exclude = s.split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_atoms',
        '_entity_poly',
        '_struct', ## .pdbx_model_type_details
        '_exptl',
        ]
    d_breaks = {'_exptl.method':['SOLUTION NMR','SOLID-STATE NMR']}

    fn_out = 'list_pdbx_unobs_atoms__CA.txt'

    l_pdbs_out = []
    for pdb in l_pdbs_include:

##        if pdb[1:3] < 'fe':
##            continue
##        if pdb == '2kzt': ## takes too long...
##            continue
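        ## debug: restrict the loop to a single entry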
        if pdb != '3e3d':
            continue

        if pdb in l_pdbs_exclude:
            continue

        print pdb

        d = parse_mmCIF.main(pdb,l_data_categories=l_data_categories,d_breaks=d_breaks,)

        ## something has to be missing in the first place for it to be terminal/nonterminal
        if not category in d.keys():
            continue
        ## it has to be a polymer in the first place for anything to be terminal/nonterminal
        if not '_pdbx_poly_seq_scheme' in d.keys():
            continue
        ## don't deal with NMR models for now... (too many unobs records when hydrogen...)
        if d['_exptl.method'] != ['X-RAY DIFFRACTION']:
            continue
        if '_struct.pdbx_model_type_details' in d.keys():
            if d['_struct.pdbx_model_type_details'] in [
                ['?'],
                ['minimized average'],
                ['MINIMIZED AVERAGE'],
                ]:
                pass
            ## if residues are not missing, and model is CA only, then no CA are missing!!!
            elif 'CA ATOMS ONLY' in d['_struct.pdbx_model_type_details'][0]:
                continue
            else:
                print d['_struct.pdbx_model_type_details']
                stop
##        if not 'CA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
##            continue

        bool_append = False
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):
            if (
                d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] == 'CA'
                and
                d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'][i_unobs] == 'Y'
                and
                ## unobs (1), zero_occ (0)
                d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs] == '1'
                ):
                l_pdbs_out += [pdb]
                print '***', pdb
                break

        continue

    print l_pdbs_out
    stop
    fd = open('%s/%s' %(path,fn_out,),'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

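    ## disabled: the per-residue bookkeeping below is never executed (empty loop)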
    for x in []:

        l_indexes_unobs = []
        bool_append = False
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):

            ## skip if not alpha carbon
            if d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] != 'CA':
                continue
            ## skip if zero occupancy
            if d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs] == '0':
                continue

            if 'HA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop2
            if d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'].count('Y') > 800:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop1

            asymID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_asym_id'][i_unobs]
            seqID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'][i_unobs]

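            ## first/last position of the chain ID in the joined strand-ID string,
            ## i.e. the slice of this chain in _pdbx_poly_seq_scheme
            ## (assumes single-character chain IDs and contiguous per-chain records)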
            index1 = s.index(asymID_unobs)
            index2 = s.rindex(asymID_unobs)+1
            for i_poly in range(index1,index2,):

                asymID_poly = d['_pdbx_poly_seq_scheme.pdb_strand_id'][i_poly]
                seqID_poly = d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly]

                if seqID_poly == seqID_unobs:

                    if d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == '.' and d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] == '?':
                        pass
                    elif d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs]:
                        pass
                    elif d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] == '?' and d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] != '.':
                        continue
                    elif not d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] in d['_pdbx_poly_seq_scheme.pdb_ins_code']:
                        print d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly]
                        print d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs]
                        print pdb
                        print seqID_unobs, asymID_unobs
                        stop
                    else:
                        continue

                    if asymID_unobs != asymID_poly:
                        stop_add_with_check_of_identiiical_seqID

                    ## tmp!!! check!!!
                    if d['_pdbx_unobs_or_zero_occ_atoms.auth_comp_id'][i_unobs] != d['_pdbx_poly_seq_scheme.pdb_mon_id'][i_poly]:
                        print pdb
                        stop

##                    ## last residue
##                    if index2-i_poly == 0:
##                        pass ## should append...
##                    ## first residue
##                    elif i_poly-index1 == 0:
##                        pass ## should append...
####                    elif i_poly-index1 > 1 and bool_unobs_prev == False:
####                        bool_append = True
                    ## previous residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:i_poly] == (i_poly-index1)*['?']:
                        bool_append = True
                    ## next residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly+1:index2] == (index2-i_poly-1)*['?']:
                        bool_append = True
                    ## zero occupancy residue prior to residue with unobserved atom(s)
                    elif pdb in ['7adh']:
                        bool_append = False
                        pass
                    else:
                        if len( set(range(index1,i_poly)) - set(l_indexes_unobs) ) == 0:
                            l_indexes_unobs += [i_poly]
                            stop1
                            pass
                        elif len( set(range(i_poly+1,index2)) - set(l_indexes_unobs) ) == 0:
                            l_indexes_unobs += [i_poly]
                            print pdb
                            print l_indexes_unobs
                            print i_poly, index1, index2
                            stop2
                            pass
                        else:
                            ## this method is not entirely correct... e.g. 1kwr...
                            if i_poly-index1 < 10 or index2-i_poly < 10:
                                print pdb
                                print i_poly-index1
##                        print index2-i_poly
                                print seqID_unobs
                                print pdb
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:i_poly]
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly:index2]
                                print pdb
##                                stop
                            bool_append = True
                            break

            if bool_append == True:
                break

        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            continue

        if l_indexes_unobs != []:
            print l_indexes_unobs
            stop

    fd = open('%s/%s' %(path,fn_out,),'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
def main():

    l_fn_out = [
        '_exptl_crystal_grow',
        '_exptl_crystal_grow_comp',
        ]

    d = {}
    for fn_out in l_fn_out:
        fd = open('db%s.txt' %(fn_out),'r')
        lines = fd.readlines()
        fd.close()
        d[fn_out] = {}
        for line in lines:
            if line == '\n':
                continue
            pdb = line[:4]
            s = line[5:]
            d[fn_out][pdb] = s

    fd = open('remediation_exptl_crystal_grow.pH.txt','r')
    lines = fd.readlines()
    fd.close()
    l_pdbs = [line[:4] for line in lines]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print '%s/%s %s' %(i+1,len(l_dns), dn)
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## continue if already in txt file from previous attempt to run loop
##            if pdb in d['_exptl_crystal_grow_comp'].keys():
##                continue
##            if pdb in d['_exptl_crystal_grow'].keys():
##                continue

##            print pdb

            if not pdb in l_pdbs:
                continue

            fd = open('%s/%s/%s' %(path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    },
                l_data_categories_break = [
##                    '_atom',
                    '_diffrn',
                    ],
                l_data_categories = [
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_pdbx_database_status',
                    '_exptl',
                    '_exptl_crystal_grow',
                    '_exptl_crystal_grow_comp',
                    ],
                )

##            ## no polymers in structure?
##            if not '_entity_poly.entity_id' in d_mmCIF.keys():
##                continue

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

##            print pdb

            ##
            ##
            ##
            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            process_site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])
            if (
                not '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys()
                and
                not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys()
##                ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip() == '?'
                ):
                if process_site != '?':
                    print pdb, year, process_site
                continue

            ##
            if '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys():

                s_grow = ' '.join(d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip()

                if (
                    ## pH not given
                    d_mmCIF['_exptl_crystal_grow.pH'] in [['?'],[''],['.'],]
                    and
                    d_mmCIF['_exptl_crystal_grow.pdbx_pH_range'] in [['?'],[''],['.'],]
                    and
                    ## but pH in growth details
                    (
                        ' PH ' in s_grow.upper()
                        or
                        'PH=' in s_grow.upper()
                        or
                        ',PH ' in s_grow.upper()
                        )
                    ):
                    fd = open('remediation_exptl_crystal_grow.pH.txt','a')
                    fd.write('%s\t%s\t%s\t%4i\t%s\t%s\n' %(
                        pdb,
                        ''.join(d_mmCIF['_exptl_crystal_grow.pH']),
                        ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_pH_range']),
                        year,
                        process_site,
                        s_grow,
                        )
                             )
                    fd.close()

                if (
                    not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys()
                    or
                    ''.join(d_mmCIF['_exptl_crystal_grow_comp.name']) in ['.','','?',]
                    ):

                    if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                        name = ''.join(d_mmCIF['_exptl_crystal_grow_comp.name'])
                    else:
                        name = 'N/A'

##                    ## remove end punctuation
##                    s = s_grow[:-1]+s_grow[-1].replace('.','')

                    ## split
##                    l_grow_punctuation = s_grow.upper().split('. ')
##                    l_grow = l_grow_comma = [s_grow.upper().split(',') for s in l_grow_punctuation]
                    l_grow = s_grow.upper().split(',')
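                    ## split the free-text growth details on commas; e.g. a (hypothetical)
                    ## 'PEG 8000, 0.1M HEPES PH 7.5, VAPOR DIFFUSION' becomes three items,
                    ## which are stripped and cleaned below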

                    ## strip space
                    l_grow = [x.strip() for x in l_grow]
                    ## remove empty
                    if '' in l_grow:
                        l_grow.remove('')
                    ## remove end punctuation
                    l_grow = [x[:-1]+x[-1].replace('.','') for x in l_grow]

                    ## remove selected words from elements of list
                    for x in [
                        'CRYSTALS OBTAINED BY CO-CRYSTALLIZATION AT ',
                        'PROTEIN SOLUTION (',
                        ]:
                        for i_grow in range(len(l_grow)):
                            l_grow[i_grow] = l_grow[i_grow].replace(x,'')

                    ## replace abbreviations
                    for i_grow in range(len(l_grow)):
                        l_grow[i_grow] = l_grow[i_grow].replace('MILLIMOLAR','MM')
                        
                    
                    ## remove selected words from list
                    l_remove = []
                    for x in [
                        'VAPOR DIFFUSION',
                        'VAPOUR DIFFUSION',
                        'HANGING DROP',
                        'SITTING DROP',
                        ]:
                        if x in l_grow:
                            l_remove += [x]
                            
                    ## remove other selected words from list
                    for i_grow in range(len(list(l_grow))):

                        ## remove physical conditions
                        bool_continue = False
                        for x in [
                            'TEMPERATURE',
                            'PH=',
                            'PH ',
                            'AT PH ',
                            ]:
                            if l_grow[i_grow][:len(x)] == x:
                                l_remove += [l_grow[i_grow]]
                                bool_continue = True
                                break
                        if bool_continue == True:
                            continue

                        ## remove long words (sentences)
                        if len(l_grow[i_grow]) > 50:
                            l_remove += [l_grow[i_grow]]
                            break
                    for remove in l_remove:
                        l_grow.remove(remove)
                    if len(l_grow) > 0:
                        ## write to file
                        line = '%s\t%s\t%s\t%4i\t%s\t%s\n' %(
                            pdb,
                            name,
                            l_grow,
                            year,
                            process_site,
                            s_grow,
                            )
                        fd = open('remediation_exptl_crystal_grow_comp.name.txt','a')
                        fd.write(line)
                        fd.close()

            else:
                s_grow = ''

            ##
            if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                l_grow_comp = d_mmCIF['_exptl_crystal_grow_comp.name']
            else:
                l_grow_comp = []

##            lines_out += [line]

            ## append to txt file in case loop doesn't finish
            d_lines = {}
            line = '%s %s\n' %(pdb,s_grow,)
            d_lines['_exptl_crystal_grow'] = line
            line = '%s %s\n' %(pdb,l_grow_comp,)
            d_lines['_exptl_crystal_grow_comp'] = line
            for fn_out in l_fn_out:
                fd = open('db%s.txt' %(fn_out),'a')
                fd.write(d_lines[fn_out])
                fd.close()

            ## append to dic for when loop finishes
            d['_exptl_crystal_grow'][pdb] = s_grow
            d['_exptl_crystal_grow_comp'][pdb] = l_grow_comp

    ## rewrite the complete per-category files when the loop finishes
    for fn_out in l_fn_out:
        lines_out = []
        for pdb,s in d[fn_out].items():
            line = '%s %s\n' %(pdb,s,)
            lines_out += [line]
        fd = open('db%s.txt' %(fn_out),'w')
        fd.writelines(lines_out)
        fd.close()

    return
示例#30
import sys
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF, mmCIF2coords
sys.path.append('/home/tc/svn/GoodVibes')
import NMA, visualization

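## pipeline: parse the 2lzm mmCIF, extract C-alpha coordinates, build a
## (presumably elastic-network) Hessian with a cutoff of 10, diagonalize it,
## and write a VMD trajectory of the normal modes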
d_mmCIF = parse_mmCIF.main('2lzm', )
d_coords, l_coords_alpha = mmCIF2coords.main('2lzm', d_mmCIF)

cutoff = 10
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, )
visualization.vmd_trajectory('2lzm', l_coords_alpha, eigenvectors)
示例#31
def main():

    fd = open('db_authors.txt', 'r')
    lines = fd.readlines()
    fd.close()

    d_authors = {}
    for line in lines:
        l = line.strip().split()
        pdb = l[0]
        s_authors = l[1:]
        d_authors[pdb] = s_authors

    lines_out = []

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d_authors.keys():
                continue

            print pdb

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                l_data_categories=[
                    '_audit_author',
                    '_citation_author',
                ],  ## parse selected data categories
                l_data_categories_break=[
                    '_citation_author',
                ],
            )

            l_authors = d_mmCIF['_audit_author.name']
            s_authors = ';'.join(l_authors)

            if d_mmCIF['_audit_author.name'] == []:
                print d_mmCIF['_citation_author.name']
                print d_mmCIF['_audit_author.name']
                stop

            line = '%s %s\n' % (
                pdb,
                s_authors,
            )
            lines_out += [line]

            fd = open('db_authors.txt', 'a')
            fd.write(line)
            fd.close()

            d_authors[pdb] = s_authors

    ##
    ## write to file
    ##
    lines_out = []
    for pdb, s_authors in d_authors.items():
        line = '%s %s\n' % (
            pdb,
            s_authors,
        )
        lines_out += [line]
    fd = open('db_authors.txt', 'w')
    fd.writelines(lines_out)
    fd.close()

    return
示例#32
for i_line in range(len(lines)):
    if i_line % 100 == 0:
        d_coordinates = {}
    line = lines[i_line]
    l = line.split()
    pdb1 = l[0]
    pdb2 = l[1]
    chain1 = l[4]
    chain2 = l[5]

    for pdb,chain in [[pdb1,chain1,],[pdb2,chain2,],]:

        if pdb in d_coordinates.keys():
            continue

        d_mmCIF = parse_mmCIF.main(pdb)


        if d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num'] != d_mmCIF['_pdbx_poly_seq_scheme.auth_seq_num']:
            print d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num']
            print d_mmCIF['_pdbx_poly_seq_scheme.auth_seq_num']
            stop

        d_coords = {}
        d_ndb_seq_num = {}
        for i_seq in range(len(d_mmCIF['_pdbx_poly_seq_scheme.ndb_seq_num'])):
            if d_mmCIF['_pdbx_poly_seq_scheme.pdb_strand_id'][i_seq] != chain:
                continue
            ndb_seq_num = d_mmCIF['_pdbx_poly_seq_scheme.ndb_seq_num'][i_seq]
            pdb_seq_num = d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num'][i_seq]
            d_ndb_seq_num[pdb_seq_num] = ndb_seq_num
示例#33
def unobs_nonterminal_atoms_alpha():

    ## this method is not entirely correct... e.g. 1kwr...

    category = fn = '_pdbx_unobs_or_zero_occ_atoms'

    fd = open('%s/list%s.txt' % (path, fn))
    s = fd.read()
    fd.close()
    l_pdbs_include = s.split()

    ## if a whole residue is missing, then all of its atoms are also missing
    fd = open('%s/list_pdbx_unobs_residues__NONTERMINAL.txt' % (path))
    s = fd.read()
    fd.close()
    l_pdbs_exclude = s.split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_atoms',
        '_entity_poly',
        '_struct',  ## .pdbx_model_type_details
        '_exptl',
    ]
    d_breaks = {'_exptl.method': ['SOLUTION NMR', 'SOLID-STATE NMR']}

    fn_out = 'list_pdbx_unobs_atoms__CA.txt'

    l_pdbs_out = []
    for pdb in l_pdbs_include:

        ##        if pdb[1:3] < 'fe':
        ##            continue
        ##        if pdb == '2kzt': ## takes too long...
        ##            continue
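        ## debug: restrict the loop to a single entry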
        if pdb != '3e3d':
            continue

        if pdb in l_pdbs_exclude:
            continue

        print pdb

        d = parse_mmCIF.main(
            pdb,
            l_data_categories=l_data_categories,
            d_breaks=d_breaks,
        )

        ## something has to be missing in the first place for it to be terminal/nonterminal
        if not category in d.keys():
            continue
        ## it has to be a polymer in the first place for anything to be terminal/nonterminal
        if not '_pdbx_poly_seq_scheme' in d.keys():
            continue
        ## don't deal with NMR models for now... (too many unobs records when hydrogen...)
        if d['_exptl.method'] != ['X-RAY DIFFRACTION']:
            continue
        if '_struct.pdbx_model_type_details' in d.keys():
            if d['_struct.pdbx_model_type_details'] in [
                ['?'],
                ['minimized average'],
                ['MINIMIZED AVERAGE'],
            ]:
                pass
            ## if residues are not missing, and model is CA only, then no CA are missing!!!
            elif 'CA ATOMS ONLY' in d['_struct.pdbx_model_type_details'][0]:
                continue
            else:
                print d['_struct.pdbx_model_type_details']
                stop
##        if not 'CA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
##            continue

        bool_append = False
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):
            if (d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs]
                    == 'CA' and
                    d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'][i_unobs]
                    == 'Y' and
                    ## unobs (1), zero_occ (0)
                    d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs]
                    == '1'):
                l_pdbs_out += [pdb]
                print '***', pdb
                break

        continue

    print l_pdbs_out
    stop
    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

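    ## disabled: the per-residue bookkeeping below is never executed (empty loop)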
    for x in []:

        l_indexes_unobs = []
        bool_append = False
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):

            ## skip if not alpha carbon
            if d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] != 'CA':
                continue
            ## skip if zero occupancy
            if d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][
                    i_unobs] == '0':
                continue

            if 'HA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop2
            if d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'].count(
                    'Y') > 800:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop1

            asymID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_asym_id'][
                i_unobs]
            seqID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'][
                i_unobs]

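            ## first/last position of the chain ID in the joined strand-ID string,
            ## i.e. the slice of this chain in _pdbx_poly_seq_scheme
            ## (assumes single-character chain IDs and contiguous per-chain records)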
            index1 = s.index(asymID_unobs)
            index2 = s.rindex(asymID_unobs) + 1
            for i_poly in range(
                    index1,
                    index2,
            ):

                asymID_poly = d['_pdbx_poly_seq_scheme.pdb_strand_id'][i_poly]
                seqID_poly = d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly]

                if seqID_poly == seqID_unobs:

                    if d['_pdbx_poly_seq_scheme.pdb_ins_code'][
                            i_poly] == '.' and d[
                                '_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                                    i_unobs] == '?':
                        pass
                    elif d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == d[
                            '_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                                i_unobs]:
                        pass
                    elif d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                            i_unobs] == '?' and d[
                                '_pdbx_poly_seq_scheme.pdb_ins_code'][
                                    i_poly] != '.':
                        continue
                    elif not d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                            i_unobs] in d['_pdbx_poly_seq_scheme.pdb_ins_code']:
                        print d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly]
                        print d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs]
                        print pdb
                        print seqID_unobs, asymID_unobs
                        stop
                    else:
                        continue

                    if asymID_unobs != asymID_poly:
                        stop_add_with_check_of_identiiical_seqID

                    ## tmp!!! check!!!
                    if d['_pdbx_unobs_or_zero_occ_atoms.auth_comp_id'][
                            i_unobs] != d['_pdbx_poly_seq_scheme.pdb_mon_id'][
                                i_poly]:
                        print pdb
                        stop

##                    ## last residue
##                    if index2-i_poly == 0:
##                        pass ## should append...
##                    ## first residue
##                    elif i_poly-index1 == 0:
##                        pass ## should append...
####                    elif i_poly-index1 > 1 and bool_unobs_prev == False:
####                        bool_append = True
## previous residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][
                            index1:i_poly] == (i_poly - index1) * ['?']:
                        bool_append = True
                    ## next residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][
                            i_poly +
                            1:index2] == (index2 - i_poly - 1) * ['?']:
                        bool_append = True
                    ## zero occupancy residue prior to residue with unobserved atom(s)
                    elif pdb in ['7adh']:
                        bool_append = False
                        pass
                    else:
                        if len(
                                set(range(index1, i_poly)) -
                                set(l_indexes_unobs)) == 0:
                            l_indexes_unobs += [i_poly]
                            stop1
                            pass
                        elif len(
                                set(range(i_poly + 1, index2)) -
                                set(l_indexes_unobs)) == 0:
                            l_indexes_unobs += [i_poly]
                            print pdb
                            print l_indexes_unobs
                            print i_poly, index1, index2
                            stop2
                            pass
                        else:
                            ## this method is not entirely correct... e.g. 1kwr...
                            if i_poly - index1 < 10 or index2 - i_poly < 10:
                                print pdb
                                print i_poly - index1
                                ##                        print index2-i_poly
                                print seqID_unobs
                                print pdb
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][
                                    index1:i_poly]
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][
                                    i_poly:index2]
                                print pdb


##                                stop
                            bool_append = True
                            break

            if bool_append == True:
                break

        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            continue

        if l_indexes_unobs != []:
            print l_indexes_unobs
            stop

    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
示例#34
import sys
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF, mmCIF2coords
sys.path.append('/home/tc/svn/GoodVibes')
import NMA, visualization

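## pipeline: parse the 2lzm mmCIF, extract C-alpha coordinates, build a
## (presumably elastic-network) Hessian with a cutoff of 10, diagonalize it,
## and write a VMD trajectory of the normal modes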
d_mmCIF = parse_mmCIF.main('2lzm',)
d_coords, l_coords_alpha = mmCIF2coords.main('2lzm',d_mmCIF)

cutoff = 10
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian,)
visualization.vmd_trajectory('2lzm',l_coords_alpha,eigenvectors)
def parse_coords(pdb):

    d_mmCIF = parse_mmCIF.main(pdb,)
    d_coords, l_coords_alpha = mmCIF2coords.main(pdb,d_mmCIF)

    return d_mmCIF, l_coords_alpha
示例#36
def main():

    fd = open('radius_of_gyration.txt', 'r')
    lines = fd.readlines()
    fd.close()
    d_radii = {}
    for line in lines:
        l = line.strip().split()
        pdb = l[0]
        r = l[1]
        d_radii[pdb] = r

    lines_out = []

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d_radii.keys():
                continue

            print pdb

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks={
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id': '2',
                    ##                    '_exptl.method':'SOLUTION NMR', ## break if e.g. _exptl.method = SOLUTION NMR
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id': ',',
                },
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details': 'monomeric',
                },
                l_data_categories=[
                    '_atom_site',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                ],  ## parse selected data categories
            )

            ## some unknown temporary error... or break before reaching this part when parsing...
            if not '_pdbx_struct_assembly.oligomeric_details' in d_mmCIF.keys(
            ):
                continue

            ## NMR structure?
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                stop2
                continue

            ## no polymers in structure?
            if not '_entity_poly.entity_id' in d_mmCIF.keys():
                continue

            ## polymer(s) is/are not polypeptide(s)
            if d_mmCIF['_entity_poly.type'] != len(
                    d_mmCIF['_entity_poly.type']) * ['polypeptide(L)']:
                continue

            ## biounit not monomeric
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_details']) * [
                        'monomeric'
                    ]:
                continue

            ## one polymer in assymetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            print pdb

            ##
            ## calculate center of mass
            ##
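            ## r_com = sum_i(m_i*r_i) / sum_i(m_i), heavy atoms of the polymer only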
            center_of_mass = numpy.array([
                0.,
                0.,
                0.,
            ])
            l_coords = []
            l_masses = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                if d_mmCIF['_atom_site.label_entity_id'][
                        i_atom_site] not in d_mmCIF['_entity_poly.entity_id']:
                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]

                ## only do heavy atoms
                if element == 'H':
                    continue
                if element not in d_mass.keys():
                    print pdb, d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                    continue

                mass = d_mass[element]
                l_masses += [mass]

                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                coord = numpy.array([
                    x,
                    y,
                    z,
                ])
                l_coords += [coord]

                center_of_mass += mass * coord

            center_of_mass /= sum(l_masses)

            ##
            ## calculate radius of gyration
            ##
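            ## Rg = sqrt( sum_i( m_i*|r_i - r_com|**2 ) / sum_i(m_i) )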
            sum_r = 0
            for i_coord in range(len(l_coords)):
                coord = l_coords[i_coord]
                mass = l_masses[i_coord]
                sq_dist_from_center_of_mass = sum((coord - center_of_mass)**2)
                sum_r += mass * sq_dist_from_center_of_mass
            radius_of_gyration = math.sqrt(sum_r / sum(l_masses))

            print pdb, center_of_mass, radius_of_gyration

            line = '%s %s\n' % (
                pdb,
                radius_of_gyration,
            )
            lines_out += [line]

            fd = open('radius_of_gyration.txt', 'a')
            fd.write(line)
            fd.close()

            d_radii[pdb] = radius_of_gyration

    ##
    ## write calculated radii of gyration to file
    ##
    lines_out = []
    for pdb, radius_of_gyration in d_radii.items():
        line = '%s %s\n' % (
            pdb,
            radius_of_gyration,
        )
        lines_out += [line]
    fd = open('radius_of_gyration.txt', 'w')
    fd.writelines(lines_out)
    fd.close()

    return
def main():

    l_fn_out = [
        '_exptl_crystal_grow',
        '_exptl_crystal_grow_comp',
    ]

    d = {}
    for fn_out in l_fn_out:
        fd = open('db%s.txt' % (fn_out), 'r')
        lines = fd.readlines()
        fd.close()
        d[fn_out] = {}
        for line in lines:
            if line == '\n':
                continue
            pdb = line[:4]
            s = line[5:]
            d[fn_out][pdb] = s

    fd = open('remediation_exptl_crystal_grow.pH.txt', 'r')
    lines = fd.readlines()
    fd.close()
    l_pdbs = [line[:4] for line in lines]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## continue if already in txt file from previous attempt to run loop
            ##            if pdb in d['_exptl_crystal_grow_comp'].keys():
            ##                continue
            ##            if pdb in d['_exptl_crystal_grow'].keys():
            ##                continue

            ##            print pdb

            if not pdb in l_pdbs:
                continue

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
                l_data_categories_break=[
                    ##                    '_atom',
                    '_diffrn',
                ],
                l_data_categories=[
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_pdbx_database_status',
                    '_exptl',
                    '_exptl_crystal_grow',
                    '_exptl_crystal_grow_comp',
                ],
            )

            ##            ## no polymers in structure?
            ##            if not '_entity_poly.entity_id' in d_mmCIF.keys():
            ##                continue

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

##            print pdb

##
##
##
            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            process_site = ''.join(
                d_mmCIF['_pdbx_database_status.process_site'])
            if (not '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys()
                    and not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys()
                    ##                ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip() == '?'
                ):
                if process_site != '?':
                    print pdb, year, process_site
                continue

            ##
            if '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys():

                s_grow = ' '.join(
                    d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip()

                if (
                        ## pH not given
                        d_mmCIF['_exptl_crystal_grow.pH'] in [
                            ['?'],
                            [''],
                            ['.'],
                        ] and d_mmCIF['_exptl_crystal_grow.pdbx_pH_range'] in [
                            ['?'],
                            [''],
                            ['.'],
                        ] and
                        ## but pH in growth details
                    (' PH ' in s_grow.upper() or 'PH=' in s_grow.upper()
                     or ',PH ' in s_grow.upper())):
                    fd = open('remediation_exptl_crystal_grow.pH.txt', 'a')
                    fd.write('%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                        pdb,
                        ''.join(d_mmCIF['_exptl_crystal_grow.pH']),
                        ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_pH_range']),
                        year,
                        process_site,
                        s_grow,
                    ))
                    fd.close()

                if (not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys() or
                        ''.join(d_mmCIF['_exptl_crystal_grow_comp.name']) in [
                            '.',
                            '',
                            '?',
                        ]):

                    if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                        name = ''.join(
                            d_mmCIF['_exptl_crystal_grow_comp.name'])
                    else:
                        name = 'N/A'

##                    ## remove end punctuation
##                    s = s_grow[:-1]+s_grow[-1].replace('.','')

## split
##                    l_grow_punctuation = s_grow.upper().split('. ')
##                    l_grow = l_grow_comma = [s_grow.upper().split(',') for s in l_grow_punctuation]
                    l_grow = s_grow.upper().split(',')
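                    ## split the free-text growth details on commas; e.g. a (hypothetical)
                    ## 'PEG 8000, 0.1M HEPES PH 7.5, VAPOR DIFFUSION' becomes three items,
                    ## which are stripped and cleaned below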

                    ## strip space
                    l_grow = [x.strip() for x in l_grow]
                    ## remove empty
                    if '' in l_grow:
                        l_grow.remove('')
                    ## remove end punctuation
                    l_grow = [x[:-1] + x[-1].replace('.', '') for x in l_grow]

                    ## remove selected words from elements of list
                    for x in [
                            'CRYSTALS OBTAINED BY CO-CRYSTALLIZATION AT ',
                            'PROTEIN SOLUTION (',
                    ]:
                        for i_grow in range(len(l_grow)):
                            l_grow[i_grow] = l_grow[i_grow].replace(x, '')

                    ## replace abbreviations
                    for i_grow in range(len(l_grow)):
                        l_grow[i_grow] = l_grow[i_grow].replace(
                            'MILLIMOLAR', 'MM')

                    ## remove selected words from list
                    l_remove = []
                    for x in [
                            'VAPOR DIFFUSION',
                            'VAPOUR DIFFUSION',
                            'HANGING DROP',
                            'SITTING DROP',
                    ]:
                        if x in l_grow:
                            l_remove += [x]

                    ## remove other selected words from list
                    for i_grow in range(len(list(l_grow))):

                        ## remove physical conditions
                        bool_continue = False
                        for x in [
                                'TEMPERATURE',
                                'PH=',
                                'PH ',
                                'AT PH ',
                        ]:
                            if l_grow[i_grow][:len(x)] == x:
                                l_remove += [l_grow[i_grow]]
                                bool_continue = True
                                break
                        if bool_continue == True:
                            continue

                        ## remove long words (sentences)
                        if len(l_grow[i_grow]) > 50:
                            l_remove += [l_grow[i_grow]]
                            break
                    for remove in l_remove:
                        l_grow.remove(remove)
                    if len(l_grow) > 0:
                        ## write to file
                        line = '%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                            pdb,
                            name,
                            l_grow,
                            year,
                            process_site,
                            s_grow,
                        )
                        fd = open(
                            'remediation_exptl_crystal_grow_comp.name.txt',
                            'a')
                        fd.write(line)
                        fd.close()

            else:
                s_grow = ''

            ##
            if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                l_grow_comp = d_mmCIF['_exptl_crystal_grow_comp.name']
            else:
                l_grow_comp = []


##            lines_out += [line]

## append to txt file in case loop doesn't finish
            d_lines = {}
            line = '%s %s\n' % (
                pdb,
                s_grow,
            )
            d_lines['_exptl_crystal_grow'] = line
            line = '%s %s\n' % (
                pdb,
                l_grow_comp,
            )
            d_lines['_exptl_crystal_grow_comp'] = line
            for fn_out in l_fn_out:
                fd = open('db%s.txt' % (fn_out), 'a')
                fd.write(d_lines[fn_out])
                fd.close()

            ## append to dic for when loop finishes
            d['_exptl_crystal_grow'][pdb] = s_grow
            d['_exptl_crystal_grow_comp'][pdb] = l_grow_comp

    ## rewrite the complete per-category files when the loop finishes
    for fn_out in l_fn_out:
        lines_out = []
        for pdb, s in d[fn_out].items():
            line = '%s %s\n' % (
                pdb,
                s,
            )
            lines_out += [line]
        fd = open('db%s.txt' % (fn_out), 'w')
        fd.writelines(lines_out)
        fd.close()

    return
def exclude(l_chainIDs):

    ##
    ## exclude obsolete structures and theoretical structures
    ##
    print 'obsolete/theoretical'
    print len(l_chainIDs)
    l_exclude = []
    for chainID in l_chainIDs:
        if not os.path.isfile('/data/mmCIF/%s/%s.cif' % (
                chainID[1:3],
                chainID[0:4],
        )):
            l_exclude += [chainID]
    for chainID in l_exclude:
        l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    ##
    ## exclude multidomain structures
    ##
    print 'multidomain'
    print len(l_chainIDs)
    fd = open('../CathDomall', 'r')
    lines = fd.readlines()
    fd.close()
    l_single_domain_chains = []
    for line in lines:
        chainID = line[:5]
        if not chainID in l_chainIDs:
            continue
        n_domains = int(line[7:9])
        if n_domains == 1:
            l_single_domain_chains += [chainID]
    l_chainIDs = list(set(l_chainIDs) & set(l_single_domain_chains))
    print len(l_chainIDs)
    print

    ##
    ## exclude multichain biological units
    ## exclude non-x-ray structures
    ##
    print 'multichain'
    print len(l_chainIDs)
    l_exclude = []
    l_pdbs_parsed = []
    d_resolutions = {}
    for i_chainID in range(len(l_chainIDs)):
        chainID = l_chainIDs[i_chainID]
        print i_chainID, len(l_chainIDs), chainID
        pdbID = chainID[:4]
        if pdbID in l_pdbs_parsed:
            continue
        d_mmCIF = parse_mmCIF.main(pdbID)

        l_pdbs_parsed += [pdbID]

        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            l_exclude += [pdbID]
            continue

        try:
            l_oligomeric_counts = d_mmCIF[
                '_pdbx_struct_assembly.oligomeric_count']
        except:
            print chainID
            continue
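        ## exclude the entry if any biological assembly contains more than one chain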
        if l_oligomeric_counts != len(l_oligomeric_counts) * ['1']:
            l_exclude += [pdbID]

        try:
            d_resolutions[pdbID] = float(''.join(
                d_mmCIF['_refine_hist.d_res_high']))
        except:
            print chainID
            stop

    for chainID in list(l_chainIDs):
        if chainID[:4] in l_exclude:
            l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    ##
    ## exclude redundancies
    ##
    print 'redundant'
    print len(l_chainIDs)
    fd = open('../bc-50.out', 'r')
    lines = fd.readlines()
    fd.close()
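    ## ../bc-50.out presumably holds one 50% sequence-identity cluster per line;
    ## keep only the best-resolution chain from each cluster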
    d = {}
    for i_line in range(len(lines)):
        line = lines[i_line]
        l_cluster = line.split()
        for i in range(len(l_cluster)):
            l_cluster[i] = l_cluster[i][:4].lower() + l_cluster[i][-1]
        l = list(set(l_cluster) & set(l_chainIDs))
        if len(l) > 1:
            ## keep the chain determined at the best (lowest) d_res_high
            max_resolution = [
                float('inf'),
                None,
            ]
            l.sort()
            for chainID in l:
                pdbID = chainID[:4]
                resolution = d_resolutions[pdbID]
                if resolution < max_resolution[0]:
                    max_resolution = [
                        resolution,
                        chainID,
                    ]
            for chainID in l:
                if chainID != max_resolution[1]:
                    l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    return l_chainIDs
def parse_GoodVibes_exclude_flexible(pdb,path,):

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4],)
    d_coords, l_coords_alpha = mmCIF2coords.main(pdb[:4],d_mmCIF,query_chain=pdb[-1])
    print len(l_coords_alpha)
    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha,cutoff,)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
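    ## per-residue displacement amplitude of eigenvector 6, the first internal mode
    ## (assuming modes sorted by eigenvalue; modes 0-5 are the rigid-body modes)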
    l_amplitudes = [
        math.sqrt(
            eigenvectors[6][i]**2+eigenvectors[6][i+1]**2+eigenvectors[6][i+2]**2
            )
        for i in range(0,len(eigenvectors[6]),3)
        ]

##    ## write pdb (color by bfactor)
##    l_bfactors = [100*(l_amplitudes[i]-min(l_amplitudes))/(max(l_amplitudes)-min(l_amplitudes)) for i in range(len(l_amplitudes))]
##    fd = open('output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],),'r')
##    lines = fd.readlines()
##    fd.close()
##    index = [-1,None,]
##    lines_out = []
##    for line in lines:
##        record = line[:6].strip()
##        if record != 'ATOM':
##            lines_out += [line]
##        else:
##            res_no = int(line[22:26])
##            if res_no != index[1]:
##                index = [index[0]+1,res_no,]
##                bfactor = l_bfactors[index[0]]
##            line_out = '%s%6.2f%s' %(line[:60],bfactor,line[66:],)
##            lines_out += [line_out]
##    fd = open('output/%s/%s_%s_probe_color_by_amplitude.pdb' %(path,pdb[:4],pdb[-1],),'w')
##    fd.writelines(lines_out)
##    fd.close()

    ## average amplitude and standard deviation
    average,stddev = statistics.do_stddev(l_amplitudes)
    ##
    l_coords_rigid = []
    for i in range(len(l_coords_alpha)):
        if l_amplitudes[i] < average:
            l_coords_rigid += [l_coords_alpha[i]]
    l_coords_flexible = []
    for i in range(len(l_coords_alpha)):
        if l_amplitudes[i] > average+0.5*stddev:
            l_coords_flexible += [l_coords_alpha[i]]

    ## parse output
    fd = open('output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],),'r')
    lines = fd.readlines()
    fd.close()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in ['ATOM','HETATM',]:
            continue
        res_name = line[17:20]
        if res_name != 'EXT':
            continue

        bfactor = float(line[60:66])

        if bfactor > max_bfactor:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])

##            coord_tmp = numpy.array([x,y,z,])

##            bool_vicinal_to_rigid = False
##            for coord_rigid in l_coords_rigid:
##                dist_from_rigid = math.sqrt(sum((coord_rigid-coord_tmp)**2))
##                if dist_from_rigid < 6:
##                    bool_vicinal_to_rigid = True
##                    break
##            if bool_vicinal_to_rigid == False:
##                continue

##            bool_vicinal_to_flexible = False
##            for coord_flexible in l_coords_flexible:
##                dist_from_flexible = math.sqrt(sum((coord_flexible-coord_tmp)**2))
##                if dist_from_flexible < 6:
##                    bool_vicinal_to_flexible = True
##                    break
##            if bool_vicinal_to_flexible == True:
##                continue

##            min_dist = [1000.,None,]
##            for i_coord_alpha in range(len(l_coords_alpha)):
##                coord_alpha = l_coords_alpha[i_coord_alpha]
##                dist_from_alpha = math.sqrt(sum((coord_alpha-coord_tmp)**2))
##                if dist_from_alpha < min_dist[0]:
##                    min_dist = [dist_from_alpha,i_coord_alpha,]
##            if l_amplitudes[min_dist[1]] > average+stddev:
##                continue

            coord = numpy.array([x,y,z,])
            max_bfactor = bfactor

    return coord
示例#40
def one_polysaccharide(pdb, ):

    l_data_categories = [
        '_entity',
        '_chem_comp',
        '_entity_poly',
    ]
    d = parse_mmCIF.main(
        pdb,
        l_data_categories=l_data_categories,
    )

    bool_append = False

    bool_polysaccharide = False
    if '_chem_comp.type' in d.keys():
        for chem_comp_type in d['_chem_comp.type']:
            if chem_comp_type.lower() in [
                    'd-saccharide 1,4 and 1,4 linking',  # 3amm
                    'l-saccharide',
                    'd-saccharide',
                    'saccharide'
            ]:
                bool_polysaccharide = True
                break
##            elif 'acchar' in chem_comp_type.lower():
##                print d
##                print chem_comp_type
##                print pdb
##                print set(['D-saccharide','saccharide'])&set(d['_chem_comp.type'])
##                stop
##    else:
##        print pdb
##        stop

    count_polymer_sugar = 0
    bool_monosaccharide = False  ## included to exclude 1a14 which contains polymers and monomers
    for i in range(len(d['_entity.type'])):
        entity_type = d['_entity.type'][i]
        if entity_type in [
                'polymer',
        ]:
            if d['_entity.pdbx_description'][i][:7] == 'SUGAR (':
                count_polymer_sugar += int(
                    d['_entity.pdbx_number_of_molecules'][i])
                continue
##            ## polypeptide or polynucleotide (just a check)
##            elif d['_entity.pdbx_description'][i][:5] == 'SUGAR': ## eg 2c49
##                if d['_entity.id'][i] not in d['_entity_poly.entity_id']:
##                    print pdb
##                    stop
        elif entity_type == 'non-polymer' and d['_entity.pdbx_description'][
                i][:5] == 'SUGAR':
            bool_monosaccharide = True
##            ## just a check
##            if d['_entity.pdbx_description'][i][:7] != 'SUGAR (' and pdb not in ['1iuc',]:
##                print pdb
##                print d['_entity.pdbx_description'][i]
##                stop
##        ## anything else named SUGAR? just a check
##        elif entity_type != 'non-polymer' and d['_entity.pdbx_description'][i][:5] == 'SUGAR':
##            print d
##            print pdb
##            print entity_type
##            print d['_entity.pdbx_description'][i]
##            stop

    if bool_monosaccharide == False and bool_polysaccharide == True and count_polymer_sugar == 1:
        bool_append = True


##    elif pdb in ['3gvj','3gvk','3gvl','3hmy','3msg','1v0f',]:
##        bool_append = False
##    ## error check
##    elif bool_polysaccharide == False and count_polymer_sugar > 0:
##        print d
##        print bool_polysaccharide
##        print d['_entity.pdbx_description']
##        print count_polymer_sugar
##        print pdb
##        stop_no_poly_but_poly

    if pdb == '1dl2':
        print count_polymer_sugar
        print bool_append
        stop

    return bool_append
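
## A minimal usage sketch (assumed, not part of the original code): filter
## functions such as one_polysaccharide() return True when a PDB entry should
## be appended to an output list, so a hypothetical driver could look like
## build_list() below.
def build_list(l_pdbs, filter_func, fn_out):

    l_out = []
    for pdb in l_pdbs:
        ## append the entry if the filter function accepts it
        if filter_func(pdb) == True:
            l_out += [pdb]

    fd = open(fn_out, 'w')
    fd.write('\n'.join(l_out))
    fd.close()

    return l_out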
def main():

    l_pdbs = []
    fd = open('Biso_v_resolution.gnuplotdata', 'r')
    lines = fd.readlines()
    fd.close()
    for line in lines:
        l = line.split()
        resolution = float(l[1])
        Biso = float(l[0])
        if resolution > 3.5 and Biso < 10:
            print line
        if resolution > 2.5 and Biso < 10:
            print line
        if resolution > 2.0 and Biso < 5:
            print line
##        if resolution > 1.5 and Biso < 5:
##            print line
        pdb = l[2]
        l_pdbs += [pdb]

    Biso_average_prev = 0

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' % (path, dn)):
            continue
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]

            if pdb in l_pdbs:
                continue

            if pdb in [
                    '3bfn',  ## PISA left out chains from biological unit
                    '2jjg',
                    '1qjb',  ## _pdbx_struct_assembly missing
            ]:
                continue

            ##
            ## parse header
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories=[
                    '_pdbx_struct_assembly',
                    '_entity_poly',
                    '_citation',
                    '_pdbx_database_related',
                ],  ## parse selected data categories
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                })
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if '_pdbx_database_related.content_type' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related.content_type']:
                    continue

            try:
                if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != [
                        'monomeric'
                ]:
                    continue
            except:
                print pdb
                stop

            if not '_citation.id' in d_mmCIF.keys():
                continue

            ##
            ## parse coordinate section
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories=[
                    '_database_PDB_rev',
                    '_refine',
                    '_refine_hist',
                    '_atom_site',
                    '_software',
                    '_entity',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                    '_pdbx_database_status',
                ],  ## parse selected data categories
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                })

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != [
                    'monomeric'
            ]:
                continue

            resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

            if (int(d_mmCIF['_entity.pdbx_number_of_molecules'][0]) != 1
                    or len(d_mmCIF['_entity_poly.pdbx_strand_id']) > 1
                    or len(''.join(
                        d_mmCIF['_entity_poly.pdbx_strand_id']).split(',')) > 1
                    or len(d_mmCIF['_entity_poly.entity_id']) > 1):
                print pdb
                print d_mmCIF['_entity.pdbx_number_of_molecules']
                print d_mmCIF['_entity_poly.pdbx_strand_id']
                stop

            entity_poly_id = int(''.join(d_mmCIF['_entity_poly.entity_id']))
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                entity_poly_id = d_mmCIF['_entity_poly.entity_id'][
                    i_entity_poly]
                entity_poly_type = d_mmCIF['_entity_poly.type'][
                    i_entity_poly]

            l_Biso = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if occupancy != 1:
                    continue
                alt_id = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if alt_id != '.':
                    continue
                entity_id = d_mmCIF['_atom_site.label_entity_id'][i_atom_site]
                if entity_id != entity_poly_id:
                    continue
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]
                if not comp_id in [
                        'MSE',
                        'ALA',
                        'CYS',
                        'ASP',
                        'GLU',
                        'PHE',
                        'GLY',
                        'HIS',
                        'ILE',
                        'LYS',
                        'LEU',
                        'MET',
                        'ASN',
                        'PRO',
                        'GLN',
                        'ARG',
                        'SER',
                        'THR',
                        'VAL',
                        'TRP',
                        'TYR',
                ]:
                    continue
                type_symbol = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                if type_symbol == 'H':
                    continue
                atom_id = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]
                if not atom_id in [
                        'N',
                        'CA',
                        'C',
                ]:
                    continue

                Biso = float(d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])
                l_Biso += [Biso]

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])

            if len(l_Biso) == 0:
                continue

##            if l_Biso == len(l_Biso)*[l_Biso[0]]:
##                print pdb, year, l_Biso[0:3]
##                if year >= 2010:
##                    stop
##                continue

            Biso_average = sum(l_Biso) / len(l_Biso)

            bool_continue = False
            for Biso in set(l_Biso):
                count = l_Biso.count(Biso)
                if count > 20:
                    if '_software.name' in d_mmCIF.keys():
                        print pdb, Biso_average, Biso, count, d_mmCIF[
                            '_software.name']
                        s = '%s %6.2f %4i %6.2f %4i %s %s\n' % (
                            pdb,
                            Biso,
                            count,
                            Biso_average,
                            year,
                            site,
                            str(d_mmCIF['_software.name']),
                        )
                    else:
                        print pdb, Biso_average, Biso, count
                        s = '%s %6.2f %4i %6.2f %4i %s\n' % (
                            pdb,
                            Biso,
                            count,
                            Biso_average,
                            year,
                            site,
                        )
                    bool_continue = True
                    fd = open('remediation_Biso_duplicates.txt', 'a')
                    fd.write(s)
                    fd.close()
                    break
            if bool_continue == True:
                continue


##            if Biso_average in [2,3,4,5,6,7,8,9,99,90,50,20,25,1,100,10,0]:
            if Biso_average in range(0, 100 + 1):
                print l_Biso
                print Biso_average
                print pdb
                print year
                stop

            if '_refine.pdbx_TLS_residual_ADP_flag' in d_mmCIF.keys():
                if ''.join(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in [
                        'UNVERIFIED',
                        'LIKELY RESIDUAL',
                ]:
                    continue
                elif ''.join(
                        d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in [
                            '?',
                        ]:
                    pass
                else:
                    print d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']
                    print pdb, Biso_average
                    stop

            ## same rounded B-factor average as the previous structure? (suspicious)
            if round(Biso_average, 4) == round(Biso_average_prev, 4):
                print pdb, Biso_average, Biso_average_prev
                stop
            Biso_average_prev = Biso_average

            print pdb, round(Biso_average, 2), resolution
            fd = open('Biso_v_resolution.gnuplotdata', 'a')
            fd.write('%s %s %s %s\n' % (
                Biso_average,
                resolution,
                pdb,
                year,
            ))
            fd.close()

    plot()
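
## Sketch (assumed equivalent, not part of the original script): the duplicate
## B-factor check in main() above flags entries in which any single Biso value
## occurs more than 20 times; the same test can be written with
## collections.Counter, as in the hypothetical helper below.
import collections

def has_duplicated_Biso(l_Biso, max_count=20):
    ## True if any single B-factor value occurs more than max_count times
    if len(l_Biso) == 0:
        return False
    Biso, count = collections.Counter(l_Biso).most_common(1)[0]
    return count > max_count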
Example #42
def unobs_nonterminal_residues():

    ##
    ## unobserved or zero-occupancy residues not at the terminals!!! (combination...)
    ## e.g. don't exclude 200l with 163,164 missing
    ## don't exclude 201l with 163,164 missing, which only appear internal in _pdbx_poly_seq_scheme because there are 2 chains
    ##
    category = fn = '_pdbx_unobs_or_zero_occ_residues'
    fd = open('%s/list%s.txt' % (path, fn))
    s = fd.read()
    fd.close()
    l_pdbs_in = s.split()
    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_residues',
        '_entity_poly',
    ]

    fn_out = 'list_pdbx_unobs_residues__NONTERMINAL'

    loop_residues(
        category,
        fn_out,
    )

    l_pdbs_out = []
    for pdb in l_pdbs_in:

        ##        if pdb[1:3] < 'oa':
        ##            continue
        ##        if pdb != '2hub':
        ##            continue

        ## no residues are present! (e.g. 1oax, 1oay)
        if pdb in [
                '1oax',
                '1oay',
        ]:
            continue

        d = parse_mmCIF.main(
            pdb,
            l_data_categories=l_data_categories,
        )

        ##        print pdb

        if not category in d.keys():
            continue

        bool_append = False
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for chains in d['_entity_poly.pdbx_strand_id']:
            for chain in chains.split(','):
                index1 = s.index(chain)
                index2 = s.rindex(chain)
                ##                print chain
                l_auth_seq_num = d['_pdbx_poly_seq_scheme.auth_seq_num'][
                    index1:index2 + 1]
                while l_auth_seq_num[0] == '?':
                    l_auth_seq_num = l_auth_seq_num[1:]
                while l_auth_seq_num[-1] == '?':
                    l_auth_seq_num = l_auth_seq_num[:-1]
                ## non-terminal residues missing?
                if '?' in l_auth_seq_num:
                    print '****', pdb
                    bool_append = True
                    break
            if bool_append == True:
                break
        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            ## continue

    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
def main():

    fn_out = 'db_MatthewsCoefficient.txt'

    fd = open(fn_out, 'r')
    lines = fd.readlines()
    fd.close()

    d = {}
    for line in lines:
        l = line.strip().split()
        pdb = l[0]
        v = l[1]
        if pdb == '2p51':
            v = '1.72610466393'
        d[pdb] = v

    lines_out = []

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d.keys():
                continue

            ## Matthews Coefficient not calculated...
            if pdb in [
                    '1vh7',
                    '1vho',
                    '1vhu',
                    '1vi3',
                    '1vi4',
                    '1vis',
            ]:
                continue

            ## Matthews Coefficient *wrong*
            if pdb in [
                    '2p51',
                    ## too high
                    '1c5v',
                    '1q9i',
                    '1ut6',
                    '1x6x',
                    '1x6y',
                    '1xdn',
                    '1y63',
                    '1zix',
                    ## too low
                    '1t95',
                    '1jih',
                    '1t95',
                    '1d5t',
                    '1c7k',
                    '1dbo',
                    '1d9x',
                    '1qt9',
                    '1ia5',
                    '1dcq',
            ]:
                continue

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks={
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id': '2',
                    ##                    '_exptl.method':'SOLUTION NMR', ## break if e.g. _exptl.method = SOLUTION NMR
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id': ',',
                },
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details': 'monomeric',
                },
                l_data_categories=[
                    '_exptl_crystal',
                ],  ## parse selected data categories
                l_data_categories_break=['_exptl_crystal'])

            ## some unknown temporary error... or break before reaching this part when parsing...
            if not '_pdbx_struct_assembly.oligomeric_details' in d_mmCIF.keys(
            ):
                continue

            ## NMR structure?
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                stop2
                continue

            ## no polymers in structure?
            if not '_entity_poly.entity_id' in d_mmCIF.keys():
                continue

            ## polymer(s) is/are not polypeptide(s)
            if d_mmCIF['_entity_poly.type'] != len(
                    d_mmCIF['_entity_poly.type']) * ['polypeptide(L)']:
                continue

            ## biounit not monomeric
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_details']) * [
                        'monomeric'
                    ]:
                continue

            ## one polymer in the asymmetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            if d_mmCIF['_exptl_crystal.density_Matthews'] == ['?']:
                v = VM = calc_matthews_coefficient.main(pdb)


##                continue
            else:
                v = float(''.join(d_mmCIF['_exptl_crystal.density_Matthews']))

            line = '%s %s\n' % (
                pdb,
                v,
            )

            fd = open(fn_out, 'a')
            fd.write(line)
            fd.close()

            d[pdb] = v

    ##
    ## write calculated Matthews coefficients to file
    ##
    lines_out = []
    for pdb, v in d.items():
        line = '%s %s\n' % (
            pdb,
            v,
        )
        lines_out += [line]
    fd = open(fn_out, 'w')
    fd.writelines(lines_out)
    fd.close()

    return
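
## Sketch (assumption, not from the original code): when
## _exptl_crystal.density_Matthews is '?', calc_matthews_coefficient.main(pdb)
## is expected to return the standard Matthews coefficient
## VM = V_cell / (Z * MW) in Angstrom**3/Da; calc_VM() below is a hypothetical
## stand-in showing that calculation with the general (triclinic) cell volume.
import math

def calc_VM(a, b, c, alpha, beta, gamma, mw, Z):
    cos_alpha = math.cos(math.radians(alpha))
    cos_beta = math.cos(math.radians(beta))
    cos_gamma = math.cos(math.radians(gamma))
    ## general unit cell volume (valid for all crystal systems)
    V = a*b*c*math.sqrt(
        1 - cos_alpha**2 - cos_beta**2 - cos_gamma**2
        + 2*cos_alpha*cos_beta*cos_gamma)
    ## cell volume per Dalton of polymer in the unit cell
    VM = V/(Z*mw)
    return VM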
Example #44
def main():

    d = {}

    if os.path.isfile('db_resolution.txt'):
        
        fd = open('db_resolution.txt','r')
        lines = fd.readlines()
        fd.close()

        for line in lines:
            l = line.strip().split()
            pdb = l[0]
            v = l[1]
            d[pdb] = v

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()

    lines_out = []

    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print '%s/%s %s' %(i+1,len(l_dns), dn)
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d.keys():
                continue

            print pdb

            fd = open('%s/%s/%s' %(path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                l_data_categories = [
                    '_refine',
                    '_refine_hist',
                    ], ## parse selected data categories
                l_data_categories_break = [
                    '_refine',
##                    '_refine_hist',
                    ],
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    }
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            resolution = d_mmCIF['_refine.ls_d_res_high']

            line = '%s %s\n' %(pdb,resolution,)
            lines_out += [line]

            fd = open('db_resolution.txt','a')
            fd.write(line)
            fd.close()

            d[pdb] = resolution

    ##
    ## write to file
    ##
    lines_out = []
    for pdb,resolution in d.items():
        line = '%s %s\n' %(pdb,resolution,)
        lines_out += [line]
    fd = open('db_resolution.txt','w')
    fd.writelines(lines_out)
    fd.close()

    d = {}
    fd = open('db_resolution.txt','r')
    lines = fd.readlines()
    fd.close()

    lines_out = []
    for line in lines:
        resolution = line.strip().split()[1][2:-2]
        if resolution == '.':
            continue
        resolution = float(resolution)
        resolution = round(resolution,2)
        if not resolution in d.keys():
            d[resolution] = 0
        d[resolution] += 1
        lines_out += ['%s\n' %(resolution,)]
    fd = open('histogram_resolution.txt','w')
    fd.writelines(lines_out)
    fd.close()
    stop

    lines_out = []
    l_resolutions = d.keys()
    l_resolutions.sort()
##    for resolution,count in d.items():
    for resolution in l_resolutions:
        count = d[resolution]
        lines_out += ['%s %s\n' %(resolution,count,)]
    fd = open('histogram_resolution.txt','w')
    fd.writelines(lines_out)
    fd.close()

    return
def exclude(l_chainIDs):

    ##
    ## exclude obsolete structures and theoretical structures
    ##
    print 'obsolete/theoretical'
    print len(l_chainIDs)
    l_exclude = []
    for chainID in l_chainIDs:
        if not os.path.isfile('/data/mmCIF/%s/%s.cif' %(chainID[1:3],chainID[0:4],)):
            l_exclude += [chainID]
    for chainID in l_exclude:
        l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    ##
    ## exclude multidomain structures
    ##
    print 'multidomain'
    print len(l_chainIDs)
    fd = open('../CathDomall','r')
    lines = fd.readlines()
    fd.close()
    l_single_domain_chains = []
    for line in lines:
        chainID = line[:5]
        if not chainID in l_chainIDs:
            continue
        n_domains = int(line[7:9])
        if n_domains == 1:
            l_single_domain_chains += [chainID]
    l_chainIDs = list( set(l_chainIDs) & set(l_single_domain_chains) )
    print len(l_chainIDs)
    print

    ##
    ## exclude multichain biological units
    ## exclude non-x-ray structures
    ##
    print 'multichain'
    print len(l_chainIDs)
    l_exclude = []
    l_pdbs_parsed = []
    d_resolutions = {}
    for i_chainID in range(len(l_chainIDs)):
        chainID = l_chainIDs[i_chainID]
        print i_chainID, len(l_chainIDs), chainID
        pdbID = chainID[:4]
        if pdbID in l_pdbs_parsed:
            continue
        d_mmCIF = parse_mmCIF.main(pdbID)

        l_pdbs_parsed += [pdbID]
          
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            l_exclude += [pdbID]
            continue

        try:
            l_oligomeric_counts = d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
        except:
            print chainID
            continue
        if l_oligomeric_counts != len(l_oligomeric_counts)*['1']:
            l_exclude += [pdbID]

        try:
            d_resolutions[pdbID] = float(''.join(d_mmCIF['_refine_hist.d_res_high']))
        except:
            print chainID
            stop

    for chainID in list(l_chainIDs):
        if chainID[:4] in l_exclude:
            l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    ##
    ## exclude redundancies
    ##
    print 'redundant'
    print len(l_chainIDs)
    fd = open('../bc-50.out','r')
    lines = fd.readlines()
    fd.close()
    d = {}
    for i_line in range(len(lines)):
        line = lines[i_line]
        l_cluster = line.split()
        for i in range(len(l_cluster)):
            l_cluster[i] = l_cluster[i][:4].lower()+l_cluster[i][-1]
        l = list( set(l_cluster) & set(l_chainIDs) )
        if len(l) > 1:
            max_resolution = ['',None,]
            l.sort()
            for chainID in l:
                pdbID = chainID[:4]
                resolution = d_resolutions[pdbID]
                if resolution < max_resolution[0]:
                    max_resolution = [resolution,chainID,]
            for chainID in l:
                if chainID != max_resolution[1]:
                    l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    return l_chainIDs
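
## Sketch (assumed equivalent of the redundancy filter above, not part of the
## original script): within each sequence cluster, keep only the chain whose
## parent structure has the lowest (best) d_res_high, looked up in the
## d_resolutions dictionary built earlier; pick_best_chain() is hypothetical.
def pick_best_chain(l_cluster_chains, d_resolutions):
    return min(
        l_cluster_chains,
        key=lambda chainID: d_resolutions[chainID[:4]],
        )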
Example #46
    print dn
    l_fn = os.listdir('%s/%s' %(path,dn,))
    l_fn.sort()
    for fn in l_fn:
        pdb = fn[:4]
        if fn[-3:] == '.gz':
            continue
########        if pdb in ['2fl9','3gau','3gav','3gaw',]: ## tmp!!!
########            continue
##        print pdb
        fd = open('%s/%s/%s' %(path,dn,fn), 'r')
        lines = fd.readlines()
        fd.close()
        d = parse_mmCIF.main(
            pdb,lines,
            l_data_categories = l_data_categories,
            d_breaks = d_breaks,
            )

        if d_exclude_subset:
            bool_continue = False
            for item_exclude,l_values_exclude in d_exclude_subset.items():
                if not item_exclude in d.keys():
                    bool_continue = True
                    fd = open('%s/remediation_%s.txt' %(path,item_exclude,),'a')
                    fd.write('%s\n' %(pdb))
                    fd.close()
                    continue
                if len( set(d[item_exclude]) & set(l_values_exclude) ) > 0:
                    bool_continue = True
                    break
Example #47
def parse_cifs(
    l_pdbs,
    ref_seq,
    l_db_codes,
    n_mutations_max,
    resolution_min,
    bool_multiple_entities=False,
):

    print 'parse cifs'

    n_mutants = 0
    l_wts = []
    l_wts_cysfree = []
    d_mutants = {}

    d_mmCIF_main = {}
    for pdb in l_pdbs:

        if pdb[:4].lower() in d_mmCIF_main.keys():
            continue

        d_mmCIF = parse_mmCIF.main(pdb[:4].lower(), )

        ## not an x-ray structure
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            print pdb, d_mmCIF['_exptl.method']
            continue

        ## more than one type of polymer present
        n_entities = len(d_mmCIF['_entity_poly.entity_id'])
        if bool_multiple_entities == False:
            if n_entities > 1:
                print pdb, 'entities', n_entities  #, d_mmCIF['_struct.title']
                continue

        ## low resolution
        if d_mmCIF['_refine.ls_d_res_high'] != d_mmCIF[
                '_refine_hist.d_res_high']:
            print d_mmCIF['_refine.ls_d_res_high']
            print d_mmCIF['_refine_hist.d_res_high']
            stop
        if resolution_min:
            ##            if float(d_mmCIF['_refine.ls_d_res_high'][0]) >= resolution_min:
            if float(d_mmCIF['_refine.ls_d_res_high'][0]) > resolution_min:
                print pdb, 'resolution', d_mmCIF['_refine.ls_d_res_high']
                continue

        ## get entity ID from chain ID
        for i_entity in range(len(d_mmCIF['_entity_poly.entity_id'])):
            entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
            s_chain_ids = d_mmCIF['_entity_poly.pdbx_strand_id'][i_entity]
            if pdb[-1] in s_chain_ids:
                break
        if pdb[-1] not in s_chain_ids:
            print pdb
            print s_chain_ids
            stop
        ## get sequence from entity ID
        seq = []
        for i in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
            if d_mmCIF['_entity_poly_seq.entity_id'][i] == entity_id:
                mon_id = d_mmCIF['_entity_poly_seq.mon_id'][i]
                if pdb[:4] == '1RCM' and i == 126:
                    if mon_id != 'CYS':
                        stop
                    mon_id = 'CCS'
                seq += [mon_id]

        ## wrong chain length
        if ref_seq:
            if len(seq) != len(ref_seq):
                if ''.join(ref_seq) in ''.join(seq):
                    print ref_seq
                    print seq
                    stop
                ## unobserved atoms not in seqres
                elif ''.join(seq) in ''.join(ref_seq):
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and pdb in [
                        '1KS3_A',
                        '1KW5_A',
                        '1KW7_A',
                        '1KY0_A',
                        '1KY1_A',
                        '1L0J_A',
                        '1LOK_A',
                        '1LPY_A',
                        '1LW9_A',
                        '1LWG_A',
                        '1LWK_A',
                ]:
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and seq[-1] == 'LYS':
                    pass
                else:
                    print pdb, 'seqlen', len(seq)
                    continue

        ## not from Gallus gallus
        ## check not necessary, because sequence checked against ref seq
        entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
        db_code = d_mmCIF['_struct_ref.db_code'][
            d_mmCIF['_struct_ref.entity_id'].index(entity_id)]
        if db_code not in l_db_codes:
            print pdb, 'uniprot', db_code
            continue

        ## more than 1 mutation?
        l_mutations = []  ## initialised here so the check further below also works if n_mutations_max is None
        if n_mutations_max != None:
            for i_seq in range(len(seq)):
                res_id_mmCIF = seq[i_seq]
                res_id_uniprot = ref_seq[i_seq]
                if res_id_mmCIF != res_id_uniprot:
                    l_mutations += [
                        '%3s%i%3s' % (
                            res_id_uniprot,
                            i_seq + 1,
                            res_id_mmCIF,
                        )
                    ]
##            if len(l_mutations) == 1:
            if len(l_mutations) > n_mutations_max:
                print pdb, 'muts', len(l_mutations)
                continue
            elif len(l_mutations) > 0:
                n_mutants += 1
                startmodel = parse_mmCIF_item(
                    d_mmCIF,
                    '_refine.pdbx_starting_model',
                    pdb,
                )

        ## append to lists and dictionaries
        d_mmCIF_main[pdb[:4]] = d_mmCIF
        if len(l_mutations) > 0:
            if l_mutations == ['CYS54THR', 'CYS97ALA']:
                l_wts_cysfree += [pdb]
            d_mutants[pdb] = {
                'mutations': l_mutations,
                'startmodel': startmodel
            }
        else:
            l_wts += [pdb]


##    print 'd_mutants', d_mutants

    return d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree
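
## Sketch (hypothetical helper, same convention as the l_mutations loop above):
## mutations are recorded as '%3s%i%3s' strings, e.g. 'CYS54THR', by
## position-wise comparison of two lists of three-letter residue codes.
def list_mutations(ref_seq, seq):
    l_mutations = []
    for i_seq in range(min(len(ref_seq), len(seq))):
        if seq[i_seq] != ref_seq[i_seq]:
            l_mutations += [
                '%3s%i%3s' % (ref_seq[i_seq], i_seq+1, seq[i_seq])]
    return l_mutations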
Example #48
def parse_GoodVibes_exclude_flexible(
    pdb,
    path,
):

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4], )
    d_coords, l_coords_alpha = mmCIF2coords.main(pdb[:4],
                                                 d_mmCIF,
                                                 query_chain=pdb[-1])
    print len(l_coords_alpha)
    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(
        l_coords_alpha,
        cutoff,
    )
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    l_amplitudes = [
        math.sqrt(eigenvectors[6][i]**2 + eigenvectors[6][i + 1]**2 +
                  eigenvectors[6][i + 2]**2)
        for i in range(0, len(eigenvectors[6]), 3)
    ]

    ##    ## write pdb (color by bfactor)
    ##    l_bfactors = [100*(l_amplitudes[i]-min(l_amplitudes))/(max(l_amplitudes)-min(l_amplitudes)) for i in range(len(l_amplitudes))]
    ##    fd = open('output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],),'r')
    ##    lines = fd.readlines()
    ##    fd.close()
    ##    index = [-1,None,]
    ##    lines_out = []
    ##    for line in lines:
    ##        record = line[:6].strip()
    ##        if record != 'ATOM':
    ##            lines_out += [line]
    ##        else:
    ##            res_no = int(line[22:26])
    ##            if res_no != index[1]:
    ##                index = [index[0]+1,res_no,]
    ##                bfactor = l_bfactors[index[0]]
    ##            line_out = '%s%6.2f%s' %(line[:60],bfactor,line[66:],)
    ##            lines_out += [line_out]
    ##    fd = open('output/%s/%s_%s_probe_color_by_amplitude.pdb' %(path,pdb[:4],pdb[-1],),'w')
    ##    fd.writelines(lines_out)
    ##    fd.close()

    ## average amplitude
    average = sum(l_amplitudes) / len(l_amplitudes)
    average, stddev = statistics.do_stddev(l_amplitudes)
    ##
    l_coords_rigid = []
    for i in range(len(l_coords_alpha)):
        if l_amplitudes[i] < average:
            l_coords_rigid += [l_coords_alpha[i]]
    l_coords_flexible = []
    for i in range(len(l_coords_alpha)):
        if l_amplitudes[i] > average + 0.5 * stddev:
            l_coords_flexible += [l_coords_alpha[i]]

    ## parse output
    fd = open('output/%s/%s_%s_probe.pdb' % (
        path,
        pdb[:4],
        pdb[-1],
    ), 'r')
    lines = fd.readlines()
    fd.close()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in [
                'ATOM',
                'HETATM',
        ]:
            continue
        res_name = line[17:20]
        if res_name != 'EXT':
            continue

        bfactor = float(line[60:66])

        if bfactor > max_bfactor:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])

            ##            coord_tmp = numpy.array([x,y,z,])

            ##            bool_vicinal_to_rigid = False
            ##            for coord_rigid in l_coords_rigid:
            ##                dist_from_rigid = math.sqrt(sum((coord_rigid-coord_tmp)**2))
            ##                if dist_from_rigid < 6:
            ##                    bool_vicinal_to_rigid = True
            ##                    break
            ##            if bool_vicinal_to_rigid == False:
            ##                continue

            ##            bool_vicinal_to_flexible = False
            ##            for coord_flexible in l_coords_flexible:
            ##                dist_from_flexible = math.sqrt(sum((coord_flexible-coord_tmp)**2))
            ##                if dist_from_flexible < 6:
            ##                    bool_vicinal_to_flexible = True
            ##                    break
            ##            if bool_vicinal_to_flexible == True:
            ##                continue

            ##            min_dist = [1000.,None,]
            ##            for i_coord_alpha in range(len(l_coords_alpha)):
            ##                coord_alpha = l_coords_alpha[i_coord_alpha]
            ##                dist_from_alpha = math.sqrt(sum((coord_alpha-coord_tmp)**2))
            ##                if dist_from_alpha < min_dist[0]:
            ##                    min_dist = [dist_from_alpha,i_coord_alpha,]
            ##            if l_amplitudes[min_dist[1]] > average+stddev:
            ##                continue

            coord = numpy.array([
                x,
                y,
                z,
            ])
            max_bfactor = bfactor

    return coord
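
## Sketch of the amplitude calculation used above (assumed helper, not part of
## the original script): eigenvector 6, the first mode after the six rigid-body
## modes of the elastic network model, is read as consecutive (x,y,z) triplets,
## and the Euclidean norm of each triplet is the displacement amplitude of the
## corresponding C-alpha atom.
import math

def amplitudes_from_eigenvector(eigenvector):
    l_amplitudes = []
    for i in range(0, len(eigenvector), 3):
        amplitude = math.sqrt(
            eigenvector[i]**2 + eigenvector[i+1]**2 + eigenvector[i+2]**2)
        l_amplitudes += [amplitude]
    return l_amplitudes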
Example #49
def identify_CH_bonds():
    
    ##
    ## identify all C-H single bonds in the standard residues
    ##
    d_atoms = {}
    for residue in [
        'ALA',
##        'ALA','CYS','ASP','GLU','PHE',
##        'GLY','HIS','ILE','LYS','LEU',
##        'MET','ASN','PRO','GLN','ARG',
##        'SER','THR','VAL','TRP','TYR',
        ]:
        lines = urllib2.urlopen('http://www.pdb.org/pdb/files/ligand/%s.cif' %(residue)).readlines()
        d = parse_mmCIF.main(residue,lines)
        d_atoms[residue] = []
        for i in range(len(d['_chem_comp_bond.comp_id'])):
            if d['_chem_comp_bond.value_order'][i] != 'SING':
                continue
            atom1 = d['_chem_comp_bond.atom_id_1'][i]
            atom2 = d['_chem_comp_bond.atom_id_2'][i]
            ## heavy element is always listed before hydrogen
            if atom1[0] != 'C' or atom2[0] != 'H':
                continue
            print residue, d['_chem_comp_bond.atom_id_1'][i], d['_chem_comp_bond.atom_id_2'][i]
            d_atoms[residue] += [atom1]

    return d_atoms
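
## Usage sketch (assumed, not from the original): the dictionary returned by
## identify_CH_bonds() holds, per residue, one carbon atom name per C-H single
## bond found in the chemical component definition, so for alanine one would
## expect roughly ['CA', 'CB', 'CB', 'CB'].
##
## d_atoms = identify_CH_bonds()
## print d_atoms['ALA']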