def main(pdb):
    """Print and return the ``calc`` result (VM) for one mmCIF entry.

    The unit cell edges and angles, ``_cell.Z_PDB`` and the summed
    ``_entity.formula_weight`` of every entity typed 'polymer' are read from
    the parsed mmCIF and handed to ``calc`` (defined elsewhere).

    NOTE(review): VM is presumably the Matthews coefficient - confirm
    against ``calc``.

    pdb -- entry identifier, passed unchanged to ``parse_mmCIF.main``
    """

    ## speed up by not reading atom section...
    d_mmCIF = parse_mmCIF.main(pdb)

    ## cell edges first, then cell angles (the order calc expects)
    l_cell_items = (
        '_cell.length_a', '_cell.length_b', '_cell.length_c',
        '_cell.angle_alpha', '_cell.angle_beta', '_cell.angle_gamma',
        )
    a, b, c, alpha, beta, gamma = [
        float(d_mmCIF[item][0]) for item in l_cell_items
        ]
    Z = int(d_mmCIF['_cell.Z_PDB'][0])

    ## total formula weight of the polymer entities only
    mw = sum(
        float(d_mmCIF['_entity.formula_weight'][i_entity])
        for i_entity in range(len(d_mmCIF['_entity.id']))
        if d_mmCIF['_entity.type'][i_entity] == 'polymer'
        )

    VM = calc(a, b, c, alpha, beta, gamma, mw, Z)
    print('%s %s' % (pdb, VM))

    return VM
def one_polypeptide(pdb,):
    """Return True if the entry lists exactly one 'polypeptide(L)' entity.

    Only the ``_entity_poly`` category is parsed.  False is returned when
    the category is missing or when the number of ``_entity_poly.type``
    values equal to 'polypeptide(L)' differs from one.
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories = ['_entity_poly',],
        )

    ## make sure polymer is present (not vacomycin 1aa5)
    if '_entity_poly.type' not in d:
        return False

    ## one polypeptide?
    ## (a stricter, disabled criterion additionally required that
    ##  _entity_poly.pdbx_strand_id holds no ',' i.e. a single chain;
    ##  cf. list_entity.pdbx_number_of_molecules__1.txt)
    return d['_entity_poly.type'].count('polypeptide(L)') == 1
def modres_not_MSE(pdb,):
    """Tell whether an entry has a modified residue other than MSE.

    Only the ``_pdbx_struct_mod_residue`` category is parsed.  Returns False
    when the category is absent or every ``auth_comp_id`` equals 'MSE', and
    True otherwise.  If the label and auth component id columns disagree,
    the entry id is printed and execution is aborted through the undefined
    name ``stop`` (NameError).
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories = ['_pdbx_struct_mod_residue'],
        )

    ## no MODRES records at all
    if '_pdbx_struct_mod_residue.id' not in d:
        return False

    l_label_ids = d['_pdbx_struct_mod_residue.label_comp_id']
    l_auth_ids = d['_pdbx_struct_mod_residue.auth_comp_id']

    ## sanity check; deliberately halts on entries where the columns differ
    if l_label_ids != l_auth_ids:
        print(pdb)
        stop

    ## at least one MODRES is different from MSE
    return l_auth_ids != ['MSE'] * len(l_auth_ids)
def main(pdb):
    """Evaluate ``calc`` (VM) for one entry, print it and return it.

    Inputs taken from the parsed mmCIF: the six ``_cell`` length/angle
    items, ``_cell.Z_PDB`` and the formula weights of the entities whose
    ``_entity.type`` is 'polymer' (added up into one molecular weight).

    NOTE(review): VM presumably stands for the Matthews coefficient -
    verify in ``calc``, which is defined outside this block.
    """

    ## speed up by not reading atom section...
    d_mmCIF = parse_mmCIF.main(pdb)

    def cell_item(name, convert = float):
        ## first (only) value of the requested _cell item
        return convert(d_mmCIF['_cell.%s' % (name,)][0])

    a = cell_item('length_a')
    b = cell_item('length_b')
    c = cell_item('length_c')
    alpha = cell_item('angle_alpha')
    beta = cell_item('angle_beta')
    gamma = cell_item('angle_gamma')
    Z = cell_item('Z_PDB', convert = int)

    ## add up the weight of polymers; other entity types are ignored
    mw = 0
    for i_entity, _entity_id in enumerate(d_mmCIF['_entity.id']):
        if d_mmCIF['_entity.type'][i_entity] != 'polymer':
            continue
        mw += float(d_mmCIF['_entity.formula_weight'][i_entity])

    VM = calc(a, b, c, alpha, beta, gamma, mw, Z)
    print('%s %s' % (pdb, VM))

    return VM
## Example #5
## 0
def modres_not_MSE(pdb, ):
    """Return True if any modified residue of the entry is not MSE.

    The ``_pdbx_struct_mod_residue`` category is the only one parsed.  An
    entry without that category, or whose ``auth_comp_id`` values are all
    'MSE', gives False.  Entries whose ``label_comp_id`` and
    ``auth_comp_id`` columns differ are printed and then abort the run via
    the undefined name ``stop`` (NameError).
    """

    key_id = '_pdbx_struct_mod_residue.id'
    key_label = '_pdbx_struct_mod_residue.label_comp_id'
    key_auth = '_pdbx_struct_mod_residue.auth_comp_id'

    d = parse_mmCIF.main(pdb, l_data_categories=['_pdbx_struct_mod_residue'])

    bool_append = False

    ## has MODRES
    if key_id in d:
        ## the two component id columns are expected to agree
        if not d[key_label] == d[key_auth]:
            print(pdb)
            stop
        ## at least one MODRES is different from MSE
        l_all_mse = ['MSE'] * len(d[key_auth])
        bool_append = d[key_auth] != l_all_mse

    return bool_append
## Example #6
## 0
def one_polypeptide(pdb, ):
    """Check that exactly one entity of the entry is a 'polypeptide(L)'.

    Parses the ``_entity_poly`` category only.  Returns True when
    ``_entity_poly.type`` is present and contains 'polypeptide(L)' exactly
    once; returns False in every other case.
    """

    d = parse_mmCIF.main(pdb, l_data_categories=['_entity_poly'])

    ## make sure polymer is present (not vacomycin 1aa5)
    l_types = d['_entity_poly.type'] if '_entity_poly.type' in d else None
    if l_types is None:
        return False

    ## one polypeptide?
    n_polypeptides = l_types.count('polypeptide(L)')

    ## disabled alternative: also demand a single chain, i.e. no ',' in
    ## _entity_poly.pdbx_strand_id
    ## (list_entity.pdbx_number_of_molecules__1.txt)

    return n_polypeptides == 1
def parse_coords(pdb):
    """Parse an entry and return ``(d_mmCIF, l_coords_alpha)``.

    ``d_mmCIF`` is whatever ``parse_mmCIF.main`` returns for ``pdb``;
    ``l_coords_alpha`` is the second of the two values returned by
    ``mmCIF2coords.main`` (the first one is discarded).
    """

    d_mmCIF = parse_mmCIF.main(pdb)

    ## only the second element of the pair is needed by callers
    _d_coords_unused, l_coords_alpha = mmCIF2coords.main(pdb, d_mmCIF)

    return (d_mmCIF, l_coords_alpha)
## Example #8
## 0
def _classify_omega(omega):
    """Classify a peptide bond dihedral as 'cis', 'trans' or None.

    None is returned when omega is None (preceding CA not observed) or when
    the bond is strongly twisted (30 < |omega| < 150; e.g. 12e8 PRO196D,
    1a44 GLU82A).  An omega of exactly 0.0 counts as 'cis'.

    assumes omega is given in degrees - TODO confirm against calc_dihedral
    """

    if omega is None:
        return None
    if -150 < omega < 150:
        if abs(omega) > 30:
            return None
        return 'cis'
    return 'trans'


def parse_dihedrals():
    """Collect backbone [phi, psi] pairs from the entries below /data/mmCIF.

    Sub directories whose names lie between ``sys.argv[-2]`` and
    ``sys.argv[-1]`` (inclusive, plain string comparison) are visited and
    each file name's first four characters are used as the entry id.

    An entry is skipped when
    - its method is SOLUTION NMR, POWDER DIFFRACTION or ELECTRON MICROSCOPY,
    - it has no, several or an unknown ('?') ``_refine.ls_d_res_high``,
      or that value is larger than 2,
    - it has no polymer entity, no ``_entity_poly.type``, only
      polydeoxyribonucleotide (hybrid) polymers, or no 'polypeptide(L)'.

    Only model 1, alternate locations '.', 'A' and '1' and atoms that do not
    combine altloc '.' with zero occupancy are used.  The first and the
    last residue of a sequence never contribute, because phi/psi need both
    neighbours.

    Side effects: progress is printed and the cis-proline counters are
    written to ``count.txt`` in the working directory.  Unexpected data
    (non-standard linkage, duplicate backbone atom, secondary structure
    element spanning two chains, unknown ``conf_type_id``) deliberately
    aborts the run through the undefined name ``stop`` (NameError).

    Returns ``(d_phipsi_res, d_phipsi_ss)``: two dictionaries mapping a
    residue / secondary structure class to a list of ``[phi, psi]`` pairs
    as returned by ``calc_dihedral`` (defined elsewhere).
    """

    import sys

    path = '/data/mmCIF'

    d_phipsi_res = {
        'ALA':[],'CYS':[],'ASP':[],'GLU':[],'PHE':[],
        'GLY':[],'HIS':[],'ILE':[],'LYS':[],'LEU':[],
        'MET':[],'ASN':[],'PRO':[],'GLN':[],'ARG':[],
        'SER':[],'THR':[],'VAL':[],'TRP':[],'TYR':[],
        'prePRO':[],'prePRO_notGLY':[],'prePRO_GLY':[],
        'cisPro':[],'transPro':[],
        'all_notgly_notpro_notprepro':[],
        }

    d_phipsi_ss = {
        'sheet':[], ## _struct_sheet_order.sense
        ##_struct_conf.pdbx_PDB_helix_class
        'helix_alpha':[], ## i+4 # 1
        'helix_pi':[], ## i+5 # 3
        'helix_310':[], ## i+3 # 5
        'Turn':[], ## i+?
        ##
        'turns_notgly_notpro_notprepro':[],
        }

    d_counts = {
        'cisProALA':0,
        'cisProCYS':0,
        'cisProASP':0,
        'cisProGLU':0,
        'cisProPHE':0,
        'cisProGLY':0,
        'cisProHIS':0,
        'cisProILE':0,
        'cisProLYS':0,
        'cisProLEU':0,
        'cisProMET':0,
        'cisProASN':0,
        'cisProPRO':0,
        'cisProGLN':0,
        'cisProARG':0,
        'cisProSER':0,
        'cisProTHR':0,
        'cisProVAL':0,
        'cisProTRP':0,
        'cisProTYR':0,
        'cisPro_helix':0,
        'cisPro_sheet':0,
        'cisPro_turn':0,
        'cisPro_random':0,
        }

    l_dn = os.listdir(path)
    l_dn.sort()
    ## mmCIF.py is not a data directory; tolerate its absence
    if 'mmCIF.py' in l_dn:
        l_dn.remove('mmCIF.py')
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        print('* %s' % (dn,))
        l_fn = os.listdir('%s/%s' %(path,dn,))
        l_fn.sort()
        for fn in l_fn:
            pdb = fn[:4]
            print(pdb)
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':['SOLUTION NMR']},
                l_data_categories = [
                    '_exptl',
                    '_refine',

                    '_struct_conf', ## HELIX
                    '_struct_sheet_range', ## SHEET

                    '_entity',
                    '_entity_poly',
                    '_entity_poly_seq',

                    '_atom_site',
                    ],
                )

            ## skip NMR models
            if ''.join(d_mmCIF['_exptl.method']) in [
                'SOLUTION NMR',
                'POWDER DIFFRACTION',
                'ELECTRON MICROSCOPY',
                ]:
                continue

            if '_refine.ls_d_res_high' not in d_mmCIF:
                print(d_mmCIF['_exptl.method'])
                continue

            ## skip if multiple resolutions
            if len(d_mmCIF['_refine.ls_d_res_high']) > 1:
                continue

            ## skip if no resolution
            if ''.join(d_mmCIF['_refine.ls_d_res_high']) == '?':
                continue

            ## skip low resolution structures
            if float(''.join(d_mmCIF['_refine.ls_d_res_high'])) > 2:
                continue

            if 'polymer' not in d_mmCIF['_entity.type']:
                continue
            if '_entity_poly.type' not in d_mmCIF: ## e.g. 1hhu
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide/polyribonucleotide hybrid']:
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide']:
                continue

            ##
            ## sequence of each entity (from _entity_poly_seq)
            ##
            d_sequence = {}
            for i_entity_poly_seq in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
                entity_id = int(d_mmCIF['_entity_poly_seq.entity_id'][i_entity_poly_seq])
                res_no = int(d_mmCIF['_entity_poly_seq.num'][i_entity_poly_seq])
                res_name = d_mmCIF['_entity_poly_seq.mon_id'][i_entity_poly_seq]
                d_sequence.setdefault(entity_id, []).append(
                    {'res_no':res_no,'res_name':res_name,}
                    )

            ##
            ## entity ids of the polypeptides
            ##
            l_entities_poly = []
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                ## skip if not polypeptide
                entity_poly_type = d_mmCIF['_entity_poly.type'][i_entity_poly]
                if entity_poly_type != 'polypeptide(L)':
                    continue
                ## halt if nonstd linkages
                if d_mmCIF['_entity_poly.nstd_linkage'][i_entity_poly] == 'yes':
                    print(pdb)
                    stop
                    continue
                ## parse entity_id and chains
                entity_id = int(d_mmCIF['_entity_poly.entity_id'][i_entity_poly])
                l_entities_poly += [entity_id]
            ## skip if no polypeptide chains
            if not l_entities_poly:
                continue

            ##
            ## coordinates: d_coords[entity_id][chain][res_no][atom_name]
            ##
            d_coords = {}
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                entity_id = int(d_mmCIF['_atom_site.label_entity_id'][i_atom_site])
                ## not a polymer
                if entity_id not in l_entities_poly:
                    continue
                ## polymer, append
                d_entity = d_coords.setdefault(entity_id, {})

                model = int(d_mmCIF['_atom_site.pdbx_PDB_model_num'][i_atom_site])
                if model > 1:
                    continue

                chain = d_mmCIF['_atom_site.label_asym_id'][i_atom_site]
                res_no = int(d_mmCIF['_atom_site.label_seq_id'][i_atom_site])
                ## the residue is registered even if this atom is skipped below
                d_residue = d_entity.setdefault(chain, {}).setdefault(res_no, {})
                atom_name = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]

                altloc = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if altloc not in ['.','A','1',]:
                    continue

                ## skip if zero occupancy
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if altloc == '.' and occupancy == 0:
                    continue

                ## a backbone atom seen twice is unexpected; halt
                if atom_name in ['CA','C','O','N',] and atom_name in d_residue:
                    print('%s %s %s %s' % (pdb, chain, res_no, atom_name))
                    print('%s %s' % (
                        d_mmCIF['_atom_site.Cartn_x'][i_atom_site],
                        d_mmCIF['_atom_site.Cartn_y'][i_atom_site],
                        ))
                    print(d_residue[atom_name])
                    stop
                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                d_residue[atom_name] = numpy.array([x,y,z,])

            ##
            ## helices and turns: d_helices[chain][res_no] = helix class
            ##
            d_helices = {}
            ## helices or turns present?
            if '_struct_conf.id' in d_mmCIF:
                for i_struct_conf in range(len(d_mmCIF['_struct_conf.id'])):
                    chain1 = d_mmCIF['_struct_conf.beg_label_asym_id'][i_struct_conf]
                    chain2 = d_mmCIF['_struct_conf.end_label_asym_id'][i_struct_conf]
                    res_no1 = int(d_mmCIF['_struct_conf.beg_label_seq_id'][i_struct_conf])
                    res_no2 = int(d_mmCIF['_struct_conf.end_label_seq_id'][i_struct_conf])
                    conf_type_id = d_mmCIF['_struct_conf.conf_type_id'][i_struct_conf]
                    if chain1 != chain2:
                        print('%s %s %s' % (chain1, chain2, pdb))
                        stop
                    if conf_type_id == 'HELX_P':
                        helix_class = int(d_mmCIF['_struct_conf.pdbx_PDB_helix_class'][i_struct_conf])
                    elif conf_type_id == 'TURN_P':
                        ## turns have no helix class; 99 is used as marker
                        helix_class = 99
                    else:
                        print(conf_type_id)
                        print(pdb)
                        stop
                    d_helix_chain = d_helices.setdefault(chain1, {})
                    for res_no in range(res_no1,res_no2+1,):
                        d_helix_chain[res_no] = helix_class

            ##
            ## sheets: d_sheets[chain] = set of residue numbers
            ##
            d_sheets = {}
            ## sheet present?
            if '_struct_sheet_range.sheet_id' in d_mmCIF:
                for i_struct_sheet_range in range(len(d_mmCIF['_struct_sheet_range.sheet_id'])):
                    chain1 = d_mmCIF['_struct_sheet_range.beg_label_asym_id'][i_struct_sheet_range]
                    chain2 = d_mmCIF['_struct_sheet_range.end_label_asym_id'][i_struct_sheet_range]
                    res_no1 = int(d_mmCIF['_struct_sheet_range.beg_label_seq_id'][i_struct_sheet_range])
                    res_no2 = int(d_mmCIF['_struct_sheet_range.end_label_seq_id'][i_struct_sheet_range])
                    if chain1 != chain2:
                        print('%s %s %s' % (chain1, chain2, pdb))
                        stop
                    ## each strand is added once; only membership is tested later
                    d_sheets.setdefault(chain1, set()).update(
                        range(res_no1,res_no2+1,)
                        )

            ##
            ## dihedrals
            ##
            for entity_id in l_entities_poly:
                l_sequence = d_sequence.get(entity_id, [])
                ## skip if short peptide (e.g. 13gs)
                if len(l_sequence) <= 3:
                    continue
                ## .get: a polymer entity may have no atom records at all
                for chain, d_chain in d_coords.get(entity_id, {}).items():
                    for i_res_no in range(1,len(l_sequence)-1):
                        res_no_prev = int(l_sequence[i_res_no-1]['res_no'])
                        res_no = int(l_sequence[i_res_no]['res_no'])
                        res_no_next = int(l_sequence[i_res_no+1]['res_no'])
                        res_name = l_sequence[i_res_no]['res_name']
                        ## selenomethionine is treated as methionine
                        if res_name == 'MSE':
                            res_name = 'MET'
                        res_name_next = l_sequence[i_res_no+1]['res_name']

                        ## not a standard residue
                        if res_name not in d_phipsi_res:
                            continue

                        ## residue not observed
                        if (
                            res_no_prev not in d_chain
                            or
                            res_no not in d_chain
                            or
                            res_no_next not in d_chain
                            ):
                            continue
                        d_res_prev = d_chain[res_no_prev]
                        d_res = d_chain[res_no]
                        d_res_next = d_chain[res_no_next]

                        ## atom not observed
                        if 'C' not in d_res_prev:
                            continue
                        if 'N' not in d_res or 'CA' not in d_res or 'C' not in d_res:
                            continue
                        if 'N' not in d_res_next:
                            continue

                        C_prev = d_res_prev['C']
                        N = d_res['N']
                        CA = d_res['CA']
                        C = d_res['C']
                        N_next = d_res_next['N']
                        phi = calc_dihedral(C_prev,N,CA,C,)
                        psi = calc_dihedral(N,CA,C,N_next,)

                        ## omega needs the CA of the preceding residue
                        if 'CA' in d_res_prev:
                            omega = _classify_omega(
                                calc_dihedral(d_res_prev['CA'],C_prev,N,CA,)
                                )
                        else:
                            omega = None

                        bool_helix = False
                        if chain in d_helices and res_no in d_helices[chain]:
                            bool_helix = True
                            helix_class = d_helices[chain][res_no]

                        bool_sheet = chain in d_sheets and res_no in d_sheets[chain]

                        if res_name_next == 'PRO':
                            d_phipsi_res['prePRO'] += [[phi,psi,]]
                            if res_name != 'GLY':
                                d_phipsi_res['prePRO_notGLY'] += [[phi,psi,]]
                            else:
                                d_phipsi_res['prePRO_GLY'] += [[phi,psi,]]
                        else:
                            d_phipsi_res[res_name] += [[phi,psi,]]
                            if res_name not in ['GLY','PRO',]:
                                d_phipsi_res['all_notgly_notpro_notprepro'] += [[phi,psi,]]
                            elif res_name == 'PRO' and omega is not None:
                                d_phipsi_res['%sPro' %(omega)] += [[phi,psi,]]
                                if omega == 'cis':
                                    ## NOTE(review): res_name is always 'PRO'
                                    ## here, so only cisProPRO is ever
                                    ## incremented; the preceding residue's
                                    ## name may have been intended - confirm
                                    d_counts['cisPro%s' %(res_name)] += 1
                                    if bool_helix:
                                        ## other helix classes are not counted
                                        if helix_class == 1:
                                            d_counts['cisPro_helix'] += 1
                                        elif helix_class == 99:
                                            ## one occurrence, one count
                                            d_counts['cisPro_turn'] += 1
                                    elif bool_sheet:
                                        d_counts['cisPro_sheet'] += 1
                                    else:
                                        d_counts['cisPro_random'] += 1

                        if bool_helix:
                            if helix_class == 1:
                                d_phipsi_ss['helix_alpha'] += [[phi,psi,]]
                            elif helix_class == 3:
                                d_phipsi_ss['helix_pi'] += [[phi,psi,]]
                            elif helix_class == 5:
                                d_phipsi_ss['helix_310'] += [[phi,psi,]]
                            elif helix_class == 99:
                                d_phipsi_ss['Turn'] += [[phi,psi,]]
                                if (
                                    res_name_next != 'PRO'
                                    and
                                    res_name not in ['GLY','PRO',]
                                    ):
                                    d_phipsi_ss['turns_notgly_notpro_notprepro'] += [[phi,psi,]]
                        if bool_sheet:
                            d_phipsi_ss['sheet'] += [[phi,psi,]]

    ## one "<counter> <value>" line per counter
    l = ['%s %s\n' %(k,count,) for k,count in d_counts.items()]
    with open('count.txt','w') as fd:
        fd.writelines(l)

    return d_phipsi_res, d_phipsi_ss
def _has_nonterminal_gap(l_auth_seq_num):
    """Return True if a '?' lies between the first and last observed residue.

    Leading and trailing '?' (unobserved termini) are ignored; a chain that
    consists of '?' only has no internal gap.
    """

    l_observed = [
        i for i, seq_num in enumerate(l_auth_seq_num) if seq_num != '?'
        ]
    if not l_observed:
        return False
    return '?' in l_auth_seq_num[l_observed[0]:l_observed[-1]+1]


def unobs_nonterminal_residues():
    """List the entries in which a chain misses non-terminal residues.

    Entry ids are read from ``<path>/list_pdbx_unobs_or_zero_occ_residues.txt``
    (``path`` is a module level name).  For every chain named in
    ``_entity_poly.pdbx_strand_id`` the ``auth_seq_num`` values of
    ``_pdbx_poly_seq_scheme`` between the first and last row of that chain
    are inspected; a '?' that is neither leading nor trailing marks the
    entry.  The marked ids are printed and written, newline separated, to
    ``<path>/list_pdbx_unobs_residues__NONTERMINAL``.

    ##
    ## unobs or zero occup not at terminals!!! (combination...)
    ## eg dont exlude 200l w 163,164 missing
    ## dont exclude 201l w 163,164 missing, but internally in
    ## _pdbx_poly_seq_scheme because 2 chains
    ##

    ``loop_residues`` (defined elsewhere) is called first with the category
    name and the output file name.
    """

    category = fn = '_pdbx_unobs_or_zero_occ_residues'
    with open('%s/list%s.txt' %(path,fn)) as fd:
        l_pdbs_in = fd.read().split()
    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_residues',
        '_entity_poly',
        ]

    fn_out = 'list_pdbx_unobs_residues__NONTERMINAL'

    loop_residues(category,fn_out,)

    l_pdbs_out = []
    for pdb in l_pdbs_in:

        ## no residues are present! (e.g. 1oax, 1oay)
        if pdb in ['1oax','1oay',]:
            continue

        d = parse_mmCIF.main(pdb,l_data_categories=l_data_categories,)

        ## NOTE(review): the bare category name is tested here although
        ## other lookups use 'category.item' keys - confirm that
        ## parse_mmCIF also stores a key per category
        if category not in d:
            continue

        ## rows are located by list position; chain ids can be longer than
        ## one character, so positions in a joined string would not match
        l_strand_ids = list(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        l_seq_nums = d['_pdbx_poly_seq_scheme.auth_seq_num']
        n_rows = len(l_strand_ids)
        l_strand_ids_reversed = l_strand_ids[::-1]

        bool_append = False
        for chains in d['_entity_poly.pdbx_strand_id']:
            for chain in chains.split(','):
                ## first and last row of this chain
                index1 = l_strand_ids.index(chain)
                index2 = n_rows-1-l_strand_ids_reversed.index(chain)
                ## non-terminal residues missing?
                if _has_nonterminal_gap(l_seq_nums[index1:index2+1]):
                    print('**** %s' % (pdb,))
                    bool_append = True
                    break
            if bool_append:
                break
        if bool_append:
            print(pdb)
            l_pdbs_out += [pdb]

    with open('%s/%s' %(path,fn_out,),'w') as fd:
        fd.write('\n'.join(l_pdbs_out))

    return
## Example #10
## 0
def main():
    """Build the resolution database and dump the rounded resolutions.

    1) ``db_resolution.txt`` (if present) is read as "<pdb> <value>" lines.
    2) Every directory below the mmCIF path whose name is not smaller than
       ``sys.argv[-1]`` is scanned; files ending in '.gz' and entries that
       are already known are skipped.  For X-RAY DIFFRACTION entries the
       ``_refine.ls_d_res_high`` list is appended to ``db_resolution.txt``
       right away.
    3) ``db_resolution.txt`` is rewritten from the collected dictionary and
       read back; each value has the list decoration ("['" and "']")
       stripped, is rounded to two decimals and written on its own line to
       ``histogram_resolution.txt``.  Values equal to '.' are skipped.

    The function then halts through the undefined name ``stop``
    (NameError); the binned histogram code underneath is never reached.
    """

    d = {}

    ## resolutions from a previous run
    if os.path.isfile('db_resolution.txt'):
        with open('db_resolution.txt', 'r') as fd:
            lines = fd.readlines()
        for line in lines:
            l_fields = line.strip().split()
            pdb = l_fields[0]
            v = l_fields[1]
            d[pdb] = v

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = sorted(os.listdir(path))
    n_dns = len(l_dns)

    for i_dn, dn in enumerate(l_dns):

        ## resume at the directory given on the command line
        if dn < sys.argv[-1]:
            continue

        dn_path = '%s/%s' % (path, dn)
        if not os.path.isdir(dn_path):
            continue

        print('%s/%s %s' % (i_dn + 1, n_dns, dn))
        for fn in sorted(os.listdir(dn_path)):
            if fn.endswith('.gz'):
                continue

            pdb = fn[0:4]

            ## already in the database
            if pdb in d:
                continue

            print(pdb)

            with open('%s/%s/%s' % (path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                ## parse selected data categories
                l_data_categories=['_refine', '_refine_hist'],
                l_data_categories_break=['_refine'],
                ## break if not x-ray diffraction
                d_breaks_negation={'_exptl.method': 'X-RAY DIFFRACTION'},
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            resolution = d_mmCIF['_refine.ls_d_res_high']

            ## checkpoint: append immediately so an aborted run loses nothing
            with open('db_resolution.txt', 'a') as fd:
                fd.write('%s %s\n' % (pdb, resolution))

            d[pdb] = resolution

    ##
    ## write to file
    ##
    lines_out = [
        '%s %s\n' % (pdb, resolution) for pdb, resolution in d.items()
        ]
    with open('db_resolution.txt', 'w') as fd:
        fd.writelines(lines_out)

    ##
    ## read back and count the rounded resolutions
    ##
    d = {}
    with open('db_resolution.txt', 'r') as fd:
        lines = fd.readlines()

    lines_out = []
    for line in lines:
        ## second column looks like ['1.80']; drop [' and ']
        resolution = line.strip().split()[1][2:-2]
        if resolution == '.':
            continue
        resolution = round(float(resolution), 2)
        d[resolution] = d.get(resolution, 0) + 1
        lines_out.append('%s\n' % (resolution, ))
    with open('histogram_resolution.txt', 'w') as fd:
        fd.writelines(lines_out)
    ## deliberate halt (undefined name); nothing below is executed
    stop

    lines_out = []
    for resolution in sorted(d.keys()):
        lines_out.append('%s %s\n' % (resolution, d[resolution]))
    with open('histogram_resolution.txt', 'w') as fd:
        fd.writelines(lines_out)

    return
def main():
    """Calculate the mass weighted radius of gyration of monomeric entries.

    Known values are read from ``radius_of_gyration.txt`` ("<pdb> <radius>"
    per line; the file must exist).  Every directory below the mmCIF path
    whose name is not smaller than ``sys.argv[-1]`` is scanned; '.gz' files
    and known entries are skipped.  An entry is used when it has
    ``_pdbx_struct_assembly.oligomeric_details`` consisting of 'monomeric'
    only and exactly one polymer entity of type 'polypeptide(L)'.  A
    non X-RAY DIFFRACTION entry that gets this far halts the run through
    the undefined name ``stop2`` (NameError).

    Hydrogens and elements missing from ``d_mass`` (module level) are left
    out.  Each new value is printed, appended to ``radius_of_gyration.txt``
    immediately, and the whole file is rewritten at the end.
    """

    with open('radius_of_gyration.txt','r') as fd:
        lines = fd.readlines()
    d_radii = {}
    for line in lines:
        l_fields = line.strip().split()
        pdb = l_fields[0]
        r = l_fields[1]
        d_radii[pdb] = r

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = sorted(os.listdir(path))
    n_dns = len(l_dns)
    for i_dn, dn in enumerate(l_dns):

        ## resume at the directory given on the command line
        if dn < sys.argv[-1]:
            continue

        dn_path = '%s/%s' %(path,dn)
        if not os.path.isdir(dn_path):
            continue

        print('%s/%s %s' %(i_dn+1,n_dns, dn))
        for fn in sorted(os.listdir(dn_path)):
            if fn.endswith('.gz'):
                continue

            pdb = fn[0:4]

            ## radius already known
            if pdb in d_radii:
                continue

            print(pdb)

            with open('%s/%s/%s' %(path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks = {
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id':'2',
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id':',',
                    },
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details':'monomeric',
                    },
                ## parse selected data categories
                l_data_categories = [
                    '_atom_site',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                    ],
                )

            ## some unknown temporary error... or break before reaching this part when parsing...
            if '_pdbx_struct_assembly.oligomeric_details' not in d_mmCIF:
                continue

            ## NMR structure? (not expected here; deliberate halt)
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                stop2
                continue

            ## no polymers in structure?
            if '_entity_poly.entity_id' not in d_mmCIF:
                continue

            ## polymer(s) is/are not polypeptide(s)
            l_poly_types = d_mmCIF['_entity_poly.type']
            if l_poly_types != ['polypeptide(L)']*len(l_poly_types):
                continue

            ## biounit not monomeric
            l_oligomeric = d_mmCIF['_pdbx_struct_assembly.oligomeric_details']
            if l_oligomeric != ['monomeric']*len(l_oligomeric):
                continue

            ## one polymer in assymetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            print(pdb)

            ##
            ## calculate center of mass
            ##
            center_of_mass = numpy.array([0.,0.,0.,])
            l_coords = []
            l_masses = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                ## atom does not belong to the polymer
                if d_mmCIF['_atom_site.label_entity_id'][i_atom_site] not in d_mmCIF['_entity_poly.entity_id']:
                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]

                ## only do heavy atoms
                if element == 'H':
                    continue
                ## unknown element: report it and leave the atom out
                if element not in d_mass:
                    print('%s %s' % (pdb, d_mmCIF['_atom_site.type_symbol'][i_atom_site]))
                    continue

                mass = d_mass[element]
                l_masses.append(mass)

                coord = numpy.array([
                    float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site]),
                    float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site]),
                    float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site]),
                    ])
                l_coords.append(coord)

                center_of_mass += mass*coord

            center_of_mass /= sum(l_masses)

            ##
            ## calculate radius of gyration
            ##
            sum_r = 0
            for coord, mass in zip(l_coords, l_masses):
                sq_dist_from_center_of_mass = sum((coord-center_of_mass)**2)
                sum_r += mass*sq_dist_from_center_of_mass
            radius_of_gyration = math.sqrt(sum_r/sum(l_masses))

            print('%s %s %s' % (pdb, center_of_mass, radius_of_gyration))

            ## checkpoint: append immediately so an aborted run loses nothing
            with open('radius_of_gyration.txt','a') as fd:
                fd.write('%s %s\n' %(pdb,radius_of_gyration,))

            d_radii[pdb] = radius_of_gyration

    ##
    ## write calculated radii of gyration to file
    ##
    lines_out = [
        '%s %s\n' %(pdb,radius_of_gyration,)
        for pdb,radius_of_gyration in d_radii.items()
        ]
    with open('radius_of_gyration.txt','w') as fd:
        fd.writelines(lines_out)

    return
## Example #12
## 0
def one_polysaccharide(pdb,):

    '''Return True if the entry contains exactly one polymeric sugar molecule.

    All of the following must hold:
    - at least one _chem_comp.type is one of the saccharide types below
    - the 'polymer' entities whose description starts with 'SUGAR (' add up
      to exactly one molecule (_entity.pdbx_number_of_molecules)
    - no 'non-polymer' entity has a description starting with 'SUGAR'
      (excludes e.g. 1a14, which contains polymers and monomers)

    pdb is passed on unchanged to parse_mmCIF.main.
    '''

    l_data_categories = [
        '_entity',
        '_chem_comp',
        '_entity_poly',
        ]
    d = parse_mmCIF.main(
        pdb,
        l_data_categories = l_data_categories,
        )

    s_saccharide_types = set([
        'd-saccharide 1,4 and 1,4 linking', # 3amm
        'l-saccharide','d-saccharide','saccharide',
        ])

    ## any saccharide component present? (category might be absent)
    bool_polysaccharide = False
    if '_chem_comp.type' in d.keys():
        bool_polysaccharide = any(
            chem_comp_type.lower() in s_saccharide_types
            for chem_comp_type in d['_chem_comp.type']
            )

    count_polymer_sugar = 0
    ## included to exclude 1a14 which contains polymers and monomers
    bool_monosaccharide = False
    for i, entity_type in enumerate(d['_entity.type']):
        if entity_type == 'polymer':
            if d['_entity.pdbx_description'][i].startswith('SUGAR ('):
                count_polymer_sugar += int(d['_entity.pdbx_number_of_molecules'][i])
        elif entity_type == 'non-polymer':
            if d['_entity.pdbx_description'][i].startswith('SUGAR'):
                bool_monosaccharide = True

    bool_append = (
        bool_monosaccharide == False
        and bool_polysaccharide == True
        and count_polymer_sugar == 1
        )

    return bool_append
def main():

    fd = open('remediation_negativeBiso.txt', 'r')
    lines = fd.readlines()
    fd.close()
    l_pdbs = []
    for line in lines:
        if line.strip() == '':
            continue
        if line[0] == '#':
            continue
        l = line.strip().split()
        pdb = l[0]
        l_pdbs += [pdb]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if not pdb in l_pdbs:
                continue

            print pdb

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
                l_data_categories=[
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_computing',
                    '_atom_site',
                    '_refine'
                ],
            )

            ##            ## no polymers in structure?
            ##            if not '_entity_poly.entity_id' in d_mmCIF.keys():
            ##                continue

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            print pdb

            ##
            ## parse bfactors
            ##
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                bfactor = float(
                    d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])

                ##                if bfactor == '?':
                ##                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]

                if float(bfactor) < -0.01:
                    if (element != 'H' and comp_id in [
                            'ALA',
                            'CYS',
                            'ASP',
                            'GLU',
                            'PHE',
                            'GLY',
                            'HIS',
                            'ILE',
                            'LYS',
                            'MET',
                            'ASN',
                            'PRO',
                            'GLN',
                            'ARG',
                            'SER',
                            'THR',
                            'VAL',
                            'TRP',
                            'TYR',
                    ]):

                        print
                        print 'negative'
                        print

                        year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
                        atom_id = int(d_mmCIF['_atom_site.id'][i_atom_site])
                        refinement = ''.join(
                            d_mmCIF['_computing.structure_refinement'])
                        solution = ''.join(
                            d_mmCIF['_computing.structure_solution'])
                        resolution = float(''.join(
                            d_mmCIF['_refine.ls_d_res_high']))

                        fd = open('remediation_negativeBiso.txt', 'a')
                        fd.write(
                            ##                            '%4s %4i %4i %3s %2s %6.2f %30s %20s\n' %(
                            '%4s\t%4i\t%4i\t%3s\t%2s\t%6.2f\t%6.2f\t%30s\t%20s\n'
                            % (
                                pdb,
                                year,
                                atom_id,
                                comp_id,
                                element,
                                bfactor,
                                resolution,
                                solution.ljust(30),
                                refinement.ljust(20),
                            ))
                        fd.close()
                        break

    return
def main():

    d_MV = {}

    path = '/data/mmCIF'
    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        l_fn = os.listdir('%s/%s' % (path, dn))
        for fn in l_fn:
            pdb = fn[:4]
            ##            if pdb.upper() not in s_pdbs:
            ##                continue
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': 'SOLUTION NMR'},
                l_data_categories=[
                    '_cell',
                    '_entity',
                    '_exptl',
                    '_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                ],
            )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if not '_entity_poly.type' in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            if d_mmCIF['_entity_poly.type'] != len(
                    d_mmCIF['_entity_poly.type']) * ['polypeptide(L)']:
                continue

            if not '_pdbx_struct_assembly.oligomeric_count' in d_mmCIF.keys():
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] == len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['?']:
                continue

            ## virus
            if int(d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
                   [0]) % 60 == 0:
                continue

            ## not monomer
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] != len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print pdb
                    stop

            if not '_cell.Z_PDB' in d_mmCIF.keys():
                continue

            if pdb in [
                    ## treshold
                    '1e54',
                    '1e9i',
                    ## difference between calculated MV and MV in mmCIF
                    '3eiq',
                    ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
                    ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
                    ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
                    ## Toscana has published with Hellinga...
                    '2cjf',
                    '2bt4',
            ]:
                continue

##            if not ''.join(d_mmCIF['_symmetry.space_group_name_H-M']) in [
##                'P 1','P 43 21 2','P 21 3','P 42 3 2','C 1 2 1','F 2 3','P 64 2 2','H 3',
##                ]:
##                continue ## tmp!!!

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                ##                if d_mmCIF['_entity.type'][i] == 'polymer':
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a, b, c, alpha, beta, gamma, mw, Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in [
                    'F 4 3 2',
                    'F 41 3 2',
                    'I 41 3 2',
            ]:
                continue  ## tmp!!!

            if MV > 10:
                print pdb
                print 'mw', mw
                print 'MV', MV, d_mmCIF['_exptl_crystal.density_Matthews']
                print 'Z', Z
                import math
                alpha *= math.pi / 180.
                beta *= math.pi / 180.
                gamma *= math.pi / 180.
                V = a * b * c * math.sqrt(
                    1 - math.cos(alpha)**2 - math.cos(beta)**2 -
                    math.cos(gamma)**2 + 2 *
                    (math.cos(alpha) * math.cos(beta) * math.cos(gamma)))
                print 'V', V
                continue
                stop_treshold
                stop
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                if d_mmCIF['_exptl_crystal.density_Matthews'] not in [
                    ['?'],
                        len(d_mmCIF['_exptl_crystal.density_Matthews']) *
                    ['?'],
                ]:
                    if abs(MV -
                           float(d_mmCIF['_exptl_crystal.density_Matthews'][0])
                           ) > 1:
                        print 'MV', MV
                        print 'MV', d_mmCIF['_exptl_crystal.density_Matthews']
                        print 'mw', mw
                        print 'Z', Z
                        continue
                        stop_difference

            if not spacegroup in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup] += [MV]

            print pdb, round(MV, 2), spacegroup


##    fd = open('MV_v_spacegroup.txt','w')
##    fd.write(str(d_MV))
##    fd.close()

    l = ['# MV_average MV_stddev n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
        ##        l += ['%s %s %s %s\n' %(average,stddev,len(l_MV),spacegroup,)]
        l += ['%s %s %s %s\n' % (
            average,
            stderr,
            len(l_MV),
            spacegroup,
        )]

    fd = open('MV_v_spacegroup.txt', 'w')
    fd.writelines(l)
    fd.close()

    return
## Example #15
## 0
def get_position_ligand(pdb,pdb_apo,d_apo2holo,):

    pdb_holo = d_apo2holo[pdb_apo]['holo']
    d_mmCIF_holo = parse_mmCIF.main(pdb_holo,)
    d_coords, l_coords_alpha_holo = mmCIF2coords.main(pdb_holo,d_mmCIF_holo)

    ##
    ##
    ##
    ligand = d_apo2holo[pdb_apo]['ligand']

    l_residues = []
    for i in range(len(d_mmCIF_holo['_struct_site.id'])):
        if not 'BINDING SITE FOR RESIDUE %s' %(ligand) in d_mmCIF_holo['_struct_site.details'][i]:
            continue
        if len(l_residues) > 0:
            print pdb, pdb_apo, pdb_holo
            print l_residues
            print d_mmCIF_holo['_struct_site.details'][i]
            stop
        struct_site_ID = d_mmCIF_holo['_struct_site.id'][i]
        for j in range(len(d_mmCIF_holo['_struct_site_gen.site_id'])):
            struct_site_gen_ID = d_mmCIF_holo['_struct_site_gen.site_id'][j]
            if struct_site_ID == struct_site_gen_ID:
                residue = int(d_mmCIF_holo['_struct_site_gen.auth_seq_id'][j])
##                l_residues += [residue]
                ## include neighboring residues
                l_residues += [residue-1,residue,residue+1]
    l_residues = list(set(l_residues))
    if len(l_residues) == 0:
        print pdb
        stop

    ## 
    l_coords_ligand = []
    for i in range(len(d_mmCIF_holo['_atom_site.id'])):
        if (
            d_mmCIF_holo['_atom_site.group_PDB'][i] == 'HETATM'
            and
            d_mmCIF_holo['_atom_site.label_comp_id'][i] == ligand
            ):
            x = float(d_mmCIF_holo['_atom_site.Cartn_x'][i])
            y = float(d_mmCIF_holo['_atom_site.Cartn_y'][i])
            z = float(d_mmCIF_holo['_atom_site.Cartn_z'][i])
            coord = numpy.array([x,y,z,])
            l_coords_ligand += [coord]


    d_mmCIF_apo = parse_mmCIF.main(pdb_apo,)
    d_coords, l_coords_alpha_apo = mmCIF2coords.main(pdb_apo,d_mmCIF_apo)   

    ## structural alignment
    ## solution that works in all cases
    ## also for 2d59 and 2d5a, which have residues missing at the Nterm and Cterm, respectively
    ## first non-?
    index1_seq_apo = next((i for i,v in enumerate(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
    index1_seq_holo = next((i for i,v in enumerate(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
    ## last non-?
    index2_seq_apo = len(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
    index2_seq_holo = len(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
    ## first common non-?
    index1_coord_apo = max(0,index1_seq_holo-index1_seq_apo)
    index1_coord_holo = max(0,index1_seq_apo-index1_seq_holo)
    ## last common non-?
    index2_coord_apo = len(l_coords_alpha_apo)+min(0,index2_seq_holo-index2_seq_apo)
    index2_coord_holo = len(l_coords_alpha_holo)+min(0,index2_seq_apo-index2_seq_holo)
    l_coords_alpha_apo = l_coords_alpha_apo[index1_coord_apo:index2_coord_apo]
    l_coords_alpha_holo = l_coords_alpha_holo[index1_coord_holo:index2_coord_holo]


    if pdb == pdb_apo:
        l_seq_num = d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_seq_num'][index1_coord_apo:index2_coord_apo]
        chain = ''.join(d_mmCIF_apo['_entity_poly.pdbx_strand_id'])
        n_residues = len(l_coords_alpha_apo)
        l_coords_alpha = l_coords_alpha_apo
    else:
        l_seq_num = d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_seq_num'][index1_coord_holo:index2_coord_holo]
        chain = ''.join(d_mmCIF_holo['_entity_poly.pdbx_strand_id'])
        n_residues = len(l_coords_alpha_holo)
        l_coords_alpha = l_coords_alpha_holo

    overlap_site = 1.
##    ##
##    ## eigenvector
##    ##
##    cutoff = 10
##    matrix_hessian = NMA.hessian_calculation(l_coords_alpha,cutoff,)
##    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
##
##    ## apply transformation matrix
##    if pdb == pdb_apo:
##        instance_geometry = geometry.geometry()
##        rmsd = instance_geometry.superpose(l_coords_alpha_apo,l_coords_alpha_holo,)
##        tv1 = instance_geometry.fitcenter
##        rm = instance_geometry.rotation
##        tv2 = instance_geometry.refcenter
##        for i_coord in range(len(l_coords_ligand)):
##            l_coords_ligand[i_coord] = numpy.dot(l_coords_ligand[i_coord]-tv1,rm)+tv2
##
##    ##
##    ## apo/holo eigenvector
##    ##
##    vector_apo2holo = []
##    for i in range(len(l_coords_alpha_holo)):
##        vector_apo2holo += [
##            l_coords_alpha_holo[i][0]-l_coords_alpha_apo[i][0],
##            l_coords_alpha_holo[i][1]-l_coords_alpha_apo[i][1],
##            l_coords_alpha_holo[i][2]-l_coords_alpha_apo[i][2],
##            ]
##    vector_apo2holo = numpy.array(vector_apo2holo)
##
##    ##
##    ## calculate overlap between normal modes and difference vector
##    ## in the ligand binding site!!!
##    ##
##    vector_apo2holo_site = []
##    eigenvector_site = []
##    ## exclude coordinate not at the ligand binding site
##    for i_seq_num in range(len(l_seq_num)):
##        seq_num = int(l_seq_num[i_seq_num])
##        if seq_num in l_residues:
##            eigenvector_site += list(eigenvectors[6][3*i_seq_num:3*i_seq_num+3])
##            vector_apo2holo_site += list(vector_apo2holo[3*i_seq_num:3*i_seq_num+3])
##    ## calculate overlap
##    vector_apo2holo_site = numpy.array(vector_apo2holo_site)
##    eigenvector_site = numpy.array(eigenvector_site)
##    overlap_site = abs(
##        numpy.dot(eigenvector_site,vector_apo2holo_site)
##        /
##        math.sqrt(
##            numpy.dot(eigenvector_site,eigenvector_site)
##            *
##            numpy.dot(vector_apo2holo_site,vector_apo2holo_site)
##            )
##        )
##    if overlap_site > 0.8:
##        print vector_apo2holo_site
##        print eigenvector_site
##        print pdb
##        print l_residues

    position_ligand = sum(l_coords_ligand)/len(l_coords_ligand)

    n_atoms = len(l_coords_ligand)

    return position_ligand, chain, n_residues, n_atoms, ligand, overlap_site
def main():

    set_pdbs = exclude_include()
    l_pdbs_remove = [
        '4a3h','2wf5','1arl','1ee3', ## incorrect _struct_ref_seq.pdbx_db_accession
        '1uyd','1uye','1uyf','2byh','2byi', ## remediation _struct_ref_seq_dif
        '2xdu','3dn8','3dna','1ps3','1ouf','1l35','2eun','1rtc','1zon', ## _struct_ref_seq_dif missing
        '1pwl','1pwm','2fz8','2fz9', ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
        ]
    set_pdbs.remove('1f92') ## remediation _struct_ref_seq_dif incorrect residue number
    set_pdbs.remove('2f6f') ## remediation _pdbx_poly_seq_scheme.auth_mon_id wrong
    set_pdbs.remove('3a5j') ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be MET
    set_pdbs.remove('2rhx') ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be SER
    set_pdbs.remove('2fzb') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('2fzd') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('3dn5') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1x96') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1x97') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1x98') ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
    set_pdbs.remove('1z3n') ## GenBank DBref - not an error...
    set_pdbs.remove('1z8a') ## GenBank DBref - not an error...
    set_pdbs.remove('1z89') ## GenBank DBref - not an error...
    set_pdbs.remove('2pf8') ## stupid use of alt_ids (C for highest occupancy and only altloc)
    set_pdbs.remove('2pyr') ## stupid use of alt_ids (G and R)
    set_pdbs.remove('3pdn') ## stupid use of alt_ids (B and C)
    set_pdbs.remove('2v4c') ## alt_id B used for 100% occupancy atoms
    set_pdbs.remove('1jxt') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxu') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxw') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxx') ## weird alt_id microheterogeneity...
    set_pdbs.remove('1jxy') ## weird alt_id microheterogeneity...
##    set_pdbs.remove('1ac4') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('1ac8') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('1aeb') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('2rbt') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
##    set_pdbs.remove('2rbu') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
##    set_pdbs.remove('2rbv') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
    for pdb in l_pdbs_remove:
        set_pdbs.remove(pdb)

    fd = open('%s/bc-100.out' %(path_mmCIF),'r')
    lines = fd.readlines()
    fd.close()

    for i_line in range(len(lines)):
        cluster = i_line
        if cluster < 4816:
            continue
##        if cluster not in [5,]:
##            continue
        line = lines[i_line]
        l_pdbs = line.lower().split()
        l_pdbs.sort()
        for i_pdb in range(len(l_pdbs)):
            l_pdbs[i_pdb] = l_pdbs[i_pdb][:4]

        for i_pdb1 in range(0,len(l_pdbs)-1):

            pdb1 = l_pdbs[i_pdb1]

##            if pdb1 != '1t49': ## tmp!!!
##                continue

            if not pdb1 in set_pdbs:
                continue

            print pdb1
            stop

            d_mmCIF1 = parse_mmCIF.main(pdb1,)

            bool_monomeric = check_monomeric(d_mmCIF1)
            if bool_monomeric == False:
                if i_pdb1 == 0:
                    break
                else:
                    continue

            bool_remediation_modres = check_modres(d_mmCIF1,pdb1,)
            if bool_remediation_modres == True:
                continue

            if '_struct_ref_seq_dif.details' in d_mmCIF1.keys():
                if 'DELETION' in d_mmCIF1['_struct_ref_seq_dif.details']:
                    continue

            for i_entity in range(len(d_mmCIF1['_entity.id'])):
                if d_mmCIF1['_entity.type'][i_entity] == 'polymer':
                    if int(d_mmCIF1['_entity.pdbx_number_of_molecules'][i_entity]) != 1:
                        print d_mmCIF1['_entity.pdbx_number_of_molecules']
                        print pdb1, cluster
                        stop

            SG1 = d_mmCIF1['_symmetry.space_group_name_H-M']

            for i_pdb2 in range(i_pdb1+1,len(l_pdbs)):

                pdb2 = l_pdbs[i_pdb2]

##                if pdb2 != '2pf8': ## tmp!!!
##                    continue

##                if pdb1 != '3fui' or pdb2 != '3fuj':
##                    continue

                if not pdb2 in set_pdbs:
                    continue

                d_mmCIF2 = parse_mmCIF.main(pdb2,)

                bool_monomeric = check_monomeric(d_mmCIF2)
                if bool_monomeric == False:
                    continue

                bool_remediation_modres = check_modres(d_mmCIF2,pdb2,)
                if bool_remediation_modres == True:
                    continue

                if '_struct_ref_seq_dif.seq_num' in d_mmCIF2.keys():
                    if 'DELETION' in d_mmCIF2['_struct_ref_seq_dif.details']:
                        continue

                ## biounit monomeric?
                for i_entity in range(len(d_mmCIF2['_entity.id'])):
                    if d_mmCIF2['_entity.type'][i_entity] == 'polymer':
                        if int(d_mmCIF2['_entity.pdbx_number_of_molecules'][i_entity]) != 1:
                            continue

                SG2 = d_mmCIF2['_symmetry.space_group_name_H-M']

                if SG1 != SG2:
                    continue

                ## parse coordinates again after being shortened in previous loop
                try:
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                except:
                    fd = open('remediation_atom_site.label_alt_id.txt','a')
                    fd.write('%s\n' %(pdb1,))
                    fd.close()
                try:
                    d_coords2, l_coords_alpha2 = mmCIF2coords.main(pdb2, d_mmCIF2)
                except:
                    fd = open('remediation_atom_site.label_alt_id.txt','a')
                    fd.write('%s\n' %(pdb2,))
                    fd.close()

                ## align sequences/coordinates
                try:
                    l_coords_alpha1, l_coords_alpha2 = create_apo_holo_dataset.sequential_alignment_of_coordinates(
                        l_coords_alpha1, l_coords_alpha2,
                        d_mmCIF1, d_mmCIF2,
                        pdb1, pdb2,
                        )
                except:
                    fd = open('remediation_struct_ref_seq_dif.txt','a')
                    fd.write(
                        '%s %s %s %s\n' %(
                            pdb1,pdb2,
                            d_mmCIF1['_struct_ref_seq.pdbx_db_accession'],
                            d_mmCIF2['_struct_ref_seq.pdbx_db_accession'],
                            )
                        )
                    fd.close()
                    continue
                if len(l_coords_alpha1) != len(l_coords_alpha2):
                    print d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id']
                    print d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id']
                    print 'coords', len(l_coords_alpha1), len(l_coords_alpha2)
                    print 'seq', len(d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print 'seq', len(d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print pdb1, pdb2
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                    d_coords1, l_coords_alpha2 = mmCIF2coords.main(pdb1, d_mmCIF2)
                    print len(l_coords_alpha1), len(l_coords_alpha2)
                    stop
                    continue

                ##
                ## align structure 1 and 2
                ##
                instance_geometry = geometry.geometry()
                rmsd = instance_geometry.superpose(l_coords_alpha1,l_coords_alpha2)
                tv1 = instance_geometry.fitcenter
                rm = instance_geometry.rotation
                tv2 = instance_geometry.refcenter

                ## structural alignment
                for i_coord in range(len(l_coords_alpha2)):
                    l_coords_alpha2[i_coord] = numpy.dot(l_coords_alpha2[i_coord]-tv1,rm)+tv2

                ##
                ## vector from structure 1 to 2
                ##
                vector = []
                for i in range(len(l_coords_alpha1)):
                    vector += [
                        l_coords_alpha1[i][0]-l_coords_alpha2[i][0],
                        l_coords_alpha1[i][1]-l_coords_alpha2[i][1],
                        l_coords_alpha1[i][2]-l_coords_alpha2[i][2],
                        ]
                vector = numpy.array(vector)

                ##
                ## calculate normal modes of structure 1
                ##
                cutoff = 10
                try:
                    matrix_hessian1 = NMA.hessian_calculation(l_coords_alpha1, cutoff, verbose = False)
                    eigenvectors1, eigenvalues1 = NMA.diagonalize_hessian(matrix_hessian1, verbose = False)
                    matrix_hessian2 = NMA.hessian_calculation(l_coords_alpha2, cutoff, verbose = False)
                    eigenvectors2, eigenvalues2 = NMA.diagonalize_hessian(matrix_hessian2, verbose = False)
                except:
                    continue

                ##
                ## calculate overlap between normal modes and difference vector
                ##
                eigenvector1 = eigenvectors1[6]
                eigenvector2 = eigenvectors2[6]

                overlap1 = calc_overlap(eigenvector1,vector)
                overlap2 = calc_overlap(eigenvector2,vector)
                overlap3a = calc_overlap(eigenvector1,eigenvector2)
                overlap3b = calc_overlap(eigenvectors1[6],eigenvectors2[7])
                overlap3c = calc_overlap(eigenvectors1[7],eigenvectors2[6])
                overlap3 = max(overlap3a,overlap3b,overlap3c)

                fd = open('rmsd_v_overlap2/cluster%i.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap1))
                fd.close()
                fd = open('rmsd_v_overlap2/cluster%i.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap2))
                fd.close()
                fd = open('rmsd_v_overlap2/cluster%i_ev_v_ev.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap3a))
                fd.close()
                fd = open('rmsd_v_overlap2/cluster%i_ev_v_ev_max.txt' %(i_line),'a')
                fd.write('%s %s\n' %(rmsd,overlap3))
                fd.close()
                print pdb1, pdb2, 'cluster', i_line, 'size', len(l_pdbs),
                print 'overlap', '%4.2f' %(round(overlap1,2)), '%4.2f' %(round(overlap2,2)), '%4.2f' %(round(overlap3,2)), 'rmsd', '%4.2f' %(round(rmsd,2))

    return
def main():
    """Extend the PDB ID -> Matthews coefficient table in db_MatthewsCoefficient.txt.

    Previously stored values are read from the table first. Every mmCIF file
    below the archive directory is then visited. Entries that are already in
    the table, that are on one of the two exclusion lists, or that fail one of
    the filters (x-ray structure, only polypeptide(L) polymers, monomeric
    biounit, a single polymer entity) are skipped. For the remaining entries
    the value is taken from _exptl_crystal.density_Matthews, or obtained from
    calc_matthews_coefficient.main when the file only holds '?'. Each new value
    is appended to the table immediately, and the whole table is rewritten once
    all directories have been processed.

    sys.argv[-1] is compared against the subdirectory names; directories that
    sort before it are skipped.

    Raises RuntimeError if a parsed entry is not an X-ray structure (the
    original halted here through an undefined name).
    """

    fn_out = 'db_MatthewsCoefficient.txt'
    path = '/media/WDMyBook1TB/2TB/mmCIF'

    ## Matthews Coefficient not calculated...
    s_not_calculated = set([
        '1vh7','1vho','1vhu','1vi3','1vi4','1vis',
        ])

    ## Matthews Coefficient *wrong*
    s_wrong = set([
        '2p51',
        ## too high
        '1c5v','1q9i','1ut6','1x6x','1x6y','1xdn','1y63','1zix',
        ## too low
        '1t95','1jih','1d5t','1c7k',
        '1dbo','1d9x','1qt9','1ia5','1dcq',
        ])

    ##
    ## read previously stored values (one "pdb value" pair per line)
    ##
    d = {}
    with open(fn_out,'r') as fd:
        for line in fd:
            l = line.split()
            ## tolerate blank lines
            if not l:
                continue
            pdb = l[0]
            v = l[1]
            ## stored value for 2p51 is replaced by this hand-entered one
            if pdb == '2p51':
                v = '1.72610466393'
            d[pdb] = v

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        path_dn = '%s/%s' %(path,dn)
        if not os.path.isdir(path_dn):
            continue

        ## single-argument print(...) behaves the same as the print statement
        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## dict membership instead of a scan of d.keys() for every file
            if pdb in d:
                continue

            if pdb in s_not_calculated or pdb in s_wrong:
                continue

            with open('%s/%s' %(path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks = {
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id':'2',
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id':',',
                    },
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details':'monomeric',
                    },
                l_data_categories = [
                    '_exptl_crystal',
                    ], ## parse selected data categories
                l_data_categories_break = ['_exptl_crystal'],
                )

            ## some unknown temporary error... or break before reaching this part when parsing...
            if not '_pdbx_struct_assembly.oligomeric_details' in d_mmCIF.keys():
                continue

            ## NMR structure? not expected this far into the parsed data; halt
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                raise RuntimeError(
                    'unexpected _exptl.method for %s: %s' %(
                        pdb, d_mmCIF['_exptl.method'],
                        )
                    )

            ## no polymers in structure?
            if not '_entity_poly.entity_id' in d_mmCIF.keys():
                continue

            ## polymer(s) is/are not polypeptide(s)
            l_types = d_mmCIF['_entity_poly.type']
            if l_types != len(l_types)*['polypeptide(L)']:
                continue

            ## biounit not monomeric
            l_details = d_mmCIF['_pdbx_struct_assembly.oligomeric_details']
            if l_details != len(l_details)*['monomeric']:
                continue

            ## more than one polymer in asymmetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            ## value not given in the mmCIF file; calculate it instead
            if d_mmCIF['_exptl_crystal.density_Matthews'] == ['?']:
                v = calc_matthews_coefficient.main(pdb)
            else:
                v = float(''.join(d_mmCIF['_exptl_crystal.density_Matthews']))

            ## append immediately so that an interrupted run loses nothing
            with open(fn_out,'a') as fd:
                fd.write('%s %s\n' %(pdb,v,))

            d[pdb] = v

    ##
    ## write Matthews coefficients to file
    ##
    lines_out = ['%s %s\n' %(pdb,v,) for pdb,v in d.items()]
    with open(fn_out,'w') as fd:
        fd.writelines(lines_out)

    return
def main():
    """Collect Matthews coefficients per space group and write MV_v_spacegroup.txt.

    Walks the mmCIF archive for the subdirectories whose names lie between
    sys.argv[-2] and sys.argv[-1] (inclusive). Only x-ray entries are kept
    whose polymers are all polypeptide(L), whose assemblies all have an
    oligomeric count of 1 and which have _cell.Z_PDB. The Matthews coefficient
    is calculated with matthews_coefficient.main from the cell parameters, the
    summed formula weights of all entities and Z. Values above 10, and values
    differing by more than 1 from the value stored in the mmCIF file, are
    printed and discarded.

    For every space group with more than one accepted value a line
    "average stderr n spacegroup" is written to MV_v_spacegroup.txt. The header
    line names the second column MV_stderr, because that is the value written
    (the original header said MV_stddev).

    Raises RuntimeError if 'SPLIT' is found in the '_pdbx_database_related'
    item (the original halted here through an undefined name).
    """

    ## function-scope import; the original imported inside the loop body
    import math

    path = '/data/mmCIF'

    l_pdbs_skip = [
        ## treshold
        '1e54','1e9i',
        ## difference between calculated MV and MV in mmCIF
        '3eiq',
        ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
        ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
        ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
        ## Toscana has published with Hellinga...
        '2cjf','2bt4',
        ]

    ## tmp!!! only these space groups are currently collected
    l_spacegroups = [
        'F 4 3 2',
        'F 41 3 2',
        'I 41 3 2',
        ]

    d_MV = {}

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        l_fn = os.listdir('%s/%s' %(path,dn))
        for fn in l_fn:
            pdb = fn[:4]
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':'SOLUTION NMR'},
                l_data_categories = [
                    '_cell','_entity','_exptl','_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                    ],
                )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if not '_entity_poly.type' in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            l_types = d_mmCIF['_entity_poly.type']
            if l_types != len(l_types)*['polypeptide(L)']:
                continue

            if not '_pdbx_struct_assembly.oligomeric_count' in d_mmCIF.keys():
                continue

            l_counts = d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
            if l_counts == len(l_counts)*['?']:
                continue

            ## virus
            if int(l_counts[0]) % 60 == 0:
                continue

            ## not monomer
            if l_counts != len(l_counts)*['1']:
                continue

            ## split structure
            ## NOTE(review): the key tested here is a bare category name, unlike
            ## the 'category.item' keys used everywhere else - confirm that
            ## parse_mmCIF ever returns it
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print(pdb)
                    raise RuntimeError('unexpected SPLIT entry for %s' %(pdb,))

            if not '_cell.Z_PDB' in d_mmCIF.keys():
                continue

            if pdb in l_pdbs_skip:
                continue

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])

            ## sum formula weights over all entities (not only polymers)
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a,b,c,alpha,beta,gamma,mw,Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in l_spacegroups:
                continue ## tmp!!!

            ## value stored in the mmCIF file, if the item was parsed at all
            l_MV_mmCIF = None
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                l_MV_mmCIF = d_mmCIF['_exptl_crystal.density_Matthews']

            ## implausibly large value; report (incl. unit cell volume) and skip
            if MV > 10:
                cos_alpha = math.cos(alpha*(math.pi/180.))
                cos_beta = math.cos(beta*(math.pi/180.))
                cos_gamma = math.cos(gamma*(math.pi/180.))
                V = a*b*c*math.sqrt(1-cos_alpha**2-cos_beta**2-cos_gamma**2+2*(cos_alpha*cos_beta*cos_gamma))
                print(pdb)
                print('mw %s' %(mw,))
                print('MV %s %s' %(MV, l_MV_mmCIF,))
                print('Z %s' %(Z,))
                print('V %s' %(V,))
                continue

            ## calculated value disagrees with stored value; report and skip
            if l_MV_mmCIF is not None:
                if l_MV_mmCIF not in [['?'],len(l_MV_mmCIF)*['?'],]:
                    if abs(MV-float(l_MV_mmCIF[0])) > 1:
                        print('MV %s' %(MV,))
                        print('MV %s' %(l_MV_mmCIF,))
                        print('mw %s' %(mw,))
                        print('Z %s' %(Z,))
                        continue

            d_MV.setdefault(spacegroup,[]).append(MV)

            print('%s %s %s' %(pdb, round(MV,2), spacegroup,))

    l = ['# MV_average MV_stderr n spacegroup\n']
    for spacegroup, l_MV in d_MV.items():
        ## skip space groups with a single value
        if len(l_MV) <= 1:
            continue
        average, stderr = statistics.do_stderr(l_MV)
        l += ['%s %s %s %s\n' %(average,stderr,len(l_MV),spacegroup,)]

    with open('MV_v_spacegroup.txt','w') as fd:
        fd.writelines(l)

    return
## Example #19
## 0
def main():
    """Extend the PDB ID -> authors table in db_authors.txt.

    Each line of the table is "pdb authors", where authors is the list of
    _audit_author.name values joined with ';'. Previously stored lines are
    read first. Every mmCIF file below the archive directory that is not in
    the table yet is then parsed, and its line is appended to the table
    immediately. The whole table is rewritten once all directories have been
    processed.

    sys.argv[-1] is compared against the subdirectory names; directories that
    sort before it are skipped.

    Raises RuntimeError if an entry has an empty _audit_author.name list (the
    original halted here through an undefined name).
    """

    fn_db = 'db_authors.txt'
    path = '/media/WDMyBook1TB/2TB/mmCIF'

    ##
    ## read previously stored authors
    ##
    d_authors = {}
    with open(fn_db,'r') as fd:
        for line in fd:
            ## split off the pdb ID only; the author field is kept as one
            ## string so that the rewrite at the end reproduces the line
            ## (splitting on all whitespace stored a list, whose repr was
            ## then written back to the table)
            l = line.strip().split(None,1)
            ## tolerate blank lines
            if not l:
                continue
            pdb = l[0]
            if len(l) > 1:
                s_authors = l[1]
            else:
                s_authors = ''
            d_authors[pdb] = s_authors

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        path_dn = '%s/%s' %(path,dn)
        if not os.path.isdir(path_dn):
            continue

        ## single-argument print(...) behaves the same as the print statement
        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## dict membership instead of a scan of d_authors.keys()
            if pdb in d_authors:
                continue

            print(pdb)

            with open('%s/%s' %(path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                l_data_categories = [
                    '_audit_author',
                    '_citation_author',
                    ], ## parse selected data categories
                l_data_categories_break = [
                    '_citation_author',
                    ],
                )

            l_authors = d_mmCIF['_audit_author.name']

            ## no authors; report what was parsed and halt
            if l_authors == []:
                print(d_mmCIF['_citation_author.name'])
                print(l_authors)
                raise RuntimeError('no _audit_author.name for %s' %(pdb,))

            s_authors = ';'.join(l_authors)

            ## append immediately so that an interrupted run loses nothing
            with open(fn_db,'a') as fd:
                fd.write('%s %s\n' %(pdb,s_authors,))

            d_authors[pdb] = s_authors

    ##
    ## write to file
    ##
    lines_out = [
        '%s %s\n' %(pdb,s_authors,) for pdb,s_authors in d_authors.items()
        ]
    with open(fn_db,'w') as fd:
        fd.writelines(lines_out)

    return
def main():
    """Report the first negative B-factor of selected x-ray entries.

    The PDB IDs to check are the first column of the non-blank, non-comment
    lines of remediation_negativeBiso.txt. For each of those entries found in
    the mmCIF archive, the atoms are scanned for a B-factor below -0.01 on a
    non-hydrogen atom of a standard amino acid residue. The first such atom is
    appended as a tab-separated line (pdb, year, atom id, residue, element,
    B-factor, resolution, solution program, refinement program) to
    remediation_negativeBiso.txt, i.e. to the same file the IDs were read from.

    Differences from the original: 'LEU' was missing from the list of standard
    residues and has been added, and atoms whose B-factor is '?' or '.' are
    skipped instead of raising ValueError.

    sys.argv[-1] is compared against the subdirectory names; directories that
    sort before it are skipped.
    """

    fn_db = 'remediation_negativeBiso.txt'
    path = '/media/WDMyBook1TB/2TB/mmCIF'

    ## the 20 standard amino acid residues
    l_res_names = [
        'ALA','CYS','ASP','GLU','PHE','GLY','HIS','ILE','LYS','LEU',
        'MET','ASN','PRO','GLN','ARG','SER','THR','VAL','TRP','TYR',
        ]

    ##
    ## read PDB IDs to be checked
    ##
    s_pdbs = set()
    with open(fn_db,'r') as fd:
        for line in fd:
            if line.strip() == '':
                continue
            if line[0] == '#':
                continue
            s_pdbs.add(line.split()[0])

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        path_dn = '%s/%s' %(path,dn)
        if not os.path.isdir(path_dn):
            continue

        ## single-argument print(...) behaves the same as the print statement
        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if not pdb in s_pdbs:
                continue

            print(pdb)

            with open('%s/%s' %(path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    },
                l_data_categories = [
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_computing',
                    '_atom_site',
                    '_refine'
                    ],
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            print(pdb)

            ##
            ## parse bfactors
            ##
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                s_bfactor = d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site]

                ## B-factor not given; check before converting to float
                if s_bfactor in ['?','.',]:
                    continue

                bfactor = float(s_bfactor)

                ## B-factor not negative (small tolerance)
                if not bfactor < -0.01:
                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]

                ## hydrogen atom
                if element == 'H':
                    continue
                ## not a standard residue
                if comp_id not in l_res_names:
                    continue

                print('')
                print('negative')
                print('')

                year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
                atom_id = int(d_mmCIF['_atom_site.id'][i_atom_site])
                refinement = ''.join(d_mmCIF['_computing.structure_refinement'])
                solution = ''.join(d_mmCIF['_computing.structure_solution'])
                resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

                with open(fn_db,'a') as fd:
                    fd.write(
                        '%4s\t%4i\t%4i\t%3s\t%2s\t%6.2f\t%6.2f\t%30s\t%20s\n' %(
                            pdb,year,atom_id,
                            comp_id,element,bfactor,resolution,
                            solution.ljust(30),refinement.ljust(20),
                            )
                        )

                ## only the first negative B-factor of an entry is reported
                break

    return
## Example #21
## 0
def parse_dihedrals():

    import sys

    path = '/data/mmCIF'

    d_phipsi_res = {
        'ALA': [],
        'CYS': [],
        'ASP': [],
        'GLU': [],
        'PHE': [],
        'GLY': [],
        'HIS': [],
        'ILE': [],
        'LYS': [],
        'LEU': [],
        'MET': [],
        'ASN': [],
        'PRO': [],
        'GLN': [],
        'ARG': [],
        'SER': [],
        'THR': [],
        'VAL': [],
        'TRP': [],
        'TYR': [],
        'prePRO': [],
        'prePRO_notGLY': [],
        'prePRO_GLY': [],
        'cisPro': [],
        'transPro': [],
        'all_notgly_notpro_notprepro': [],
    }

    d_phipsi_ss = {
        'sheet': [],  ## _struct_sheet_order.sense
        ##_struct_conf.pdbx_PDB_helix_class
        'helix_alpha': [],  ## i+4 # 1
        'helix_pi': [],  ## i+5 # 3
        'helix_310': [],  ## i+3 # 5
        'Turn': [],  ## i+?
        ##
        'turns_notgly_notpro_notprepro': [],
    }

    d_counts = {
        'cisProALA': 0,
        'cisProCYS': 0,
        'cisProASP': 0,
        'cisProGLU': 0,
        'cisProPHE': 0,
        'cisProGLY': 0,
        'cisProHIS': 0,
        'cisProILE': 0,
        'cisProLYS': 0,
        'cisProLEU': 0,
        'cisProMET': 0,
        'cisProASN': 0,
        'cisProPRO': 0,
        'cisProGLN': 0,
        'cisProARG': 0,
        'cisProSER': 0,
        'cisProTHR': 0,
        'cisProVAL': 0,
        'cisProTRP': 0,
        'cisProTYR': 0,
        'cisPro_helix': 0,
        'cisPro_sheet': 0,
        'cisPro_turn': 0,
        'cisPro_random': 0,
    }

    l_dn = os.listdir(path)
    l_dn.sort()
    l_dn.remove('mmCIF.py')
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        print '*', dn
        l_fn = os.listdir('%s/%s' % (
            path,
            dn,
        ))
        l_fn.sort()
        for fn in l_fn:
            pdb = fn[:4]
            print pdb
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': ['SOLUTION NMR']},
                l_data_categories=[
                    '_exptl',
                    '_refine',
                    '_struct_conf',  ## HELIX
                    '_struct_sheet_range',  ## SHEET
                    '_entity',
                    '_entity_poly',
                    '_entity_poly_seq',
                    '_atom_site',
                ],
            )

            ## skip NMR models
            if ''.join(d_mmCIF['_exptl.method']) in [
                    'SOLUTION NMR',
                    'POWDER DIFFRACTION',
                    'ELECTRON MICROSCOPY',
            ]:
                continue

            if not '_refine.ls_d_res_high' in d_mmCIF.keys():
                print d_mmCIF['_exptl.method']
                continue

            ## skip if multiple resolutions
            if len(d_mmCIF['_refine.ls_d_res_high']) > 1:
                continue

            ## skip if no resolution
            if ''.join(d_mmCIF['_refine.ls_d_res_high']) == '?':
                continue

            ## skip low resolution structures
            if float(''.join(d_mmCIF['_refine.ls_d_res_high'])) > 2:
                continue

            if not 'polymer' in d_mmCIF['_entity.type']:
                continue
            if not '_entity_poly.type' in d_mmCIF.keys():  ## e.g. 1hhu
                continue
            if d_mmCIF['_entity_poly.type'] == [
                    'polydeoxyribonucleotide/polyribonucleotide hybrid'
            ]:
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide']:
                continue

            d_sequence = {}
            for i_entity_poly_seq in range(
                    len(d_mmCIF['_entity_poly_seq.entity_id'])):
                entity_id = int(
                    d_mmCIF['_entity_poly_seq.entity_id'][i_entity_poly_seq])
                if not entity_id in d_sequence.keys():
                    d_sequence[entity_id] = []
                res_no = int(
                    d_mmCIF['_entity_poly_seq.num'][i_entity_poly_seq])
                res_name = d_mmCIF['_entity_poly_seq.mon_id'][
                    i_entity_poly_seq]
                d_sequence[entity_id] += [{
                    'res_no': res_no,
                    'res_name': res_name,
                }]

            l_entities_poly = []
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                ## skip if not polypeptide
                entity_poly_type = d_mmCIF['_entity_poly.type'][i_entity_poly]
                if entity_poly_type != 'polypeptide(L)':
                    continue
                ## skip if nonstd linkages
                if d_mmCIF['_entity_poly.nstd_linkage'][
                        i_entity_poly] == 'yes':
                    print pdb
                    stop
                    continue
                ## parse entity_id and chains
                entity_id = int(
                    d_mmCIF['_entity_poly.entity_id'][i_entity_poly])
                l_entities_poly += [entity_id]
            ## skip if no polypeptide chains
            if l_entities_poly == []:
                continue

            d_coords = {}
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                entity_id = int(
                    d_mmCIF['_atom_site.label_entity_id'][i_atom_site])
                ## not a polymer
                if not entity_id in l_entities_poly:
                    continue
                ## polymer, append
                elif not entity_id in d_coords.keys():
                    d_coords[entity_id] = {}

                model = int(
                    d_mmCIF['_atom_site.pdbx_PDB_model_num'][i_atom_site])
                if model > 1:
                    continue

                chain = d_mmCIF['_atom_site.label_asym_id'][i_atom_site]
                if not chain in d_coords[entity_id].keys():
                    d_coords[entity_id][chain] = {}
                res_no = int(d_mmCIF['_atom_site.label_seq_id'][i_atom_site])
                if not res_no in d_coords[entity_id][chain].keys():
                    d_coords[entity_id][chain][res_no] = {}
                atom_name = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]

                altloc = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if altloc not in [
                        '.',
                        'A',
                        '1',
                ]:
                    continue

                ## skip if zero occupancy
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if altloc == '.' and occupancy == 0:
                    continue

                if atom_name in [
                        'CA',
                        'C',
                        'O',
                        'N',
                ] and atom_name in d_coords[entity_id][chain][res_no].keys():
                    print pdb, chain, res_no, atom_name
                    print d_mmCIF['_atom_site.Cartn_x'][i_atom_site], d_mmCIF[
                        '_atom_site.Cartn_y'][i_atom_site]
                    print d_coords[entity_id][chain][res_no][atom_name]
                    stop
                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                coord = numpy.array([
                    x,
                    y,
                    z,
                ])
                d_coords[entity_id][chain][res_no][atom_name] = coord

            d_helices = {}
            ## helices or turns present?
            if '_struct_conf.id' in d_mmCIF.keys():
                for i_struct_conf in range(len(d_mmCIF['_struct_conf.id'])):
                    chain1 = d_mmCIF['_struct_conf.beg_label_asym_id'][
                        i_struct_conf]
                    chain2 = d_mmCIF['_struct_conf.end_label_asym_id'][
                        i_struct_conf]
                    res_no1 = int(d_mmCIF['_struct_conf.beg_label_seq_id']
                                  [i_struct_conf])
                    res_no2 = int(d_mmCIF['_struct_conf.end_label_seq_id']
                                  [i_struct_conf])
                    conf_type_id = d_mmCIF['_struct_conf.conf_type_id'][
                        i_struct_conf]
                    if chain1 != chain2:
                        print chain1, chain2, pdb
                        stop
                    if conf_type_id == 'HELX_P':
                        helix_class = int(
                            d_mmCIF['_struct_conf.pdbx_PDB_helix_class']
                            [i_struct_conf])
                    elif conf_type_id == 'TURN_P':
                        helix_class = 99
                    else:
                        print conf_type_id
                        print pdb
                        stop
                    l_res_nos = range(
                        res_no1,
                        res_no2 + 1,
                    )
                    if not chain1 in d_helices.keys():
                        d_helices[chain1] = {}
                    for res_no in l_res_nos:
                        d_helices[chain1][res_no] = helix_class

            d_sheets = {}
            ## sheet present?
            if '_struct_sheet_range.sheet_id' in d_mmCIF.keys():
                for i_struct_sheet_range in range(
                        len(d_mmCIF['_struct_sheet_range.sheet_id'])):
                    chain1 = d_mmCIF['_struct_sheet_range.beg_label_asym_id'][
                        i_struct_sheet_range]
                    chain2 = d_mmCIF['_struct_sheet_range.end_label_asym_id'][
                        i_struct_sheet_range]
                    res_no1 = int(
                        d_mmCIF['_struct_sheet_range.beg_label_seq_id']
                        [i_struct_sheet_range])
                    res_no2 = int(
                        d_mmCIF['_struct_sheet_range.end_label_seq_id']
                        [i_struct_sheet_range])
                    l_res_nos = range(
                        res_no1,
                        res_no2 + 1,
                    )
                    if chain1 != chain2:
                        print chain1, chain2, pdb
                        stop
                    if not chain1 in d_sheets.keys():
                        d_sheets[chain1] = []
                    for res_no in l_res_nos:
                        d_sheets[chain1] += l_res_nos

            for entity_id in l_entities_poly:
                for chain in d_coords[entity_id].keys():
                    ## skip if short peptide (e.g. 13gs)
                    if len(d_sequence[entity_id]) <= 3:
                        continue
                    for i_res_no in range(1, len(d_sequence[entity_id]) - 1):
                        res_no_prev = int(d_sequence[entity_id][i_res_no -
                                                                1]['res_no'])
                        res_no = int(d_sequence[entity_id][i_res_no]['res_no'])
                        res_no_next = int(d_sequence[entity_id][i_res_no +
                                                                1]['res_no'])
                        res_name = d_sequence[entity_id][i_res_no]['res_name']
                        if res_name == 'MSE':
                            res_name = 'MET'
                        res_name_next = d_sequence[entity_id][i_res_no +
                                                              1]['res_name']

                        ## not a standard residue
                        if not res_name in d_phipsi_res.keys():
                            continue

                        ## residue not observed
                        if not res_no_prev in d_coords[entity_id][chain].keys(
                        ):
                            continue
                        if not res_no in d_coords[entity_id][chain].keys():
                            continue
                        if not res_no_next in d_coords[entity_id][chain].keys(
                        ):
                            continue

                        ## atom not observed
                        if not 'C' in d_coords[entity_id][chain][res_no_prev]:
                            continue
                        if not 'N' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'CA' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'C' in d_coords[entity_id][chain][res_no]:
                            continue
                        if not 'N' in d_coords[entity_id][chain][res_no_next]:
                            continue

                        C_prev = d_coords[entity_id][chain][res_no_prev]['C']
                        N = d_coords[entity_id][chain][res_no]['N']
                        CA = d_coords[entity_id][chain][res_no]['CA']
                        C = d_coords[entity_id][chain][res_no]['C']
                        N_next = d_coords[entity_id][chain][res_no_next]['N']
                        phi = calc_dihedral(
                            C_prev,
                            N,
                            CA,
                            C,
                        )
                        psi = calc_dihedral(
                            N,
                            CA,
                            C,
                            N_next,
                        )

                        if 'CA' in d_coords[entity_id][chain][
                                res_no_prev].keys():
                            CA_prev = d_coords[entity_id][chain][res_no_prev][
                                'CA']
                            omega = calc_dihedral(
                                CA_prev,
                                C_prev,
                                N,
                                CA,
                            )
                        else:
                            omega = None

                        if omega:
                            if (omega and omega < 150
                                    and omega > -150):  ## 12e8, PRO44D
                                if abs(omega
                                       ) > 30:  ## 12e8 PRO196D, 1a44 GLU82A
                                    omega = None
                                ## cis
                                else:
                                    omega = 'cis'
                                    pass
                            ## trans
                            else:
                                omega = 'trans'
                                pass
                        else:
                            omega = None

                        bool_helix = False
                        if chain in d_helices.keys():
                            if res_no in d_helices[chain].keys():
                                bool_helix = True
                                helix_class = d_helices[chain][res_no]

                        bool_sheet = False
                        if chain in d_sheets.keys():
                            if res_no in d_sheets[chain]:
                                bool_sheet = True


##                        if bool_helix == True and bool_sheet == True and helix_class != 99:
##                            print pdb, chain, res_no, 'sheet and helix'
####                            stop

                        if res_name_next == 'PRO':
                            d_phipsi_res['prePRO'] += [[
                                phi,
                                psi,
                            ]]
                            if res_name != 'GLY':
                                d_phipsi_res['prePRO_notGLY'] += [[
                                    phi,
                                    psi,
                                ]]
                            else:
                                d_phipsi_res['prePRO_GLY'] += [[
                                    phi,
                                    psi,
                                ]]
                        else:
                            d_phipsi_res[res_name] += [[
                                phi,
                                psi,
                            ]]
                            if res_name not in [
                                    'GLY',
                                    'PRO',
                            ]:
                                d_phipsi_res[
                                    'all_notgly_notpro_notprepro'] += [[
                                        phi,
                                        psi,
                                    ]]
                            elif res_name == 'PRO' and omega:
                                d_phipsi_res['%sPro' % (omega)] += [[
                                    phi,
                                    psi,
                                ]]
                                if omega == 'cis':
                                    d_counts['cisPro%s' % (res_name)] += 1
                                    if bool_helix == True:
                                        if helix_class == 1:
                                            d_counts['cisPro_helix'] += 1
                                        elif helix_class == 99:
                                            d_counts['cisPro_turn'] += 99
                                    elif bool_sheet == True:
                                        d_counts['cisPro_sheet'] += 1
                                    else:
                                        d_counts['cisPro_random'] += 1

                        if bool_helix == True:
                            ##                            if helix_class not in [1,3,5,99,]:
                            ##                                print pdb, chain, res_no, helix_class
                            ##                                print 'unexpected helix class'
                            ####                                stop_helix_class
                            if helix_class == 1:
                                d_phipsi_ss['helix_alpha'] += [[
                                    phi,
                                    psi,
                                ]]
                            elif helix_class == 3:
                                d_phipsi_ss['helix_pi'] += [[
                                    phi,
                                    psi,
                                ]]
                            elif helix_class == 5:
                                d_phipsi_ss['helix_310'] += [[
                                    phi,
                                    psi,
                                ]]
                            elif helix_class == 99:
                                d_phipsi_ss['Turn'] += [[
                                    phi,
                                    psi,
                                ]]
                                if (res_name_next != 'PRO'
                                        and res_name not in [
                                            'GLY',
                                            'PRO',
                                        ]):
                                    d_phipsi_ss[
                                        'turns_notgly_notpro_notprepro'] += [[
                                            phi,
                                            psi,
                                        ]]
                        if bool_sheet == True:
                            d_phipsi_ss['sheet'] += [[
                                phi,
                                psi,
                            ]]

    l = []
    for k in d_counts.keys():
        count = d_counts[k]
        l += ['%s %s\n' % (
            k,
            count,
        )]
    fd = open('count.txt', 'w')
    fd.writelines(l)
    fd.close()

    return d_phipsi_res, d_phipsi_ss
예제 #22
0
def main():

    l_pdbs = []
    fd = open('Biso_v_resolution.gnuplotdata','r')
    lines = fd.readlines()
    fd.close()
    for line in lines:
        l = line.split()
        resolution = float(l[1])
        Biso = float(l[0])
        if resolution > 3.5 and Biso < 10:
            print line
        if resolution > 2.5 and Biso < 10:
            print line
        if resolution > 2.0 and Biso < 5:
            print line
##        if resolution > 1.5 and Biso < 5:
##            print line
        pdb = l[2]
        l_pdbs += [pdb]

    Biso_average_prev = 0

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' %(path,dn)):
            continue
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]

            if pdb in l_pdbs:
                continue

            if pdb in [
                '3bfn', ## PISA left out chains from biological unit
                '2jjg','1qjb', ## _pdbx_struct_assembly missing
                ]:
                continue

            ##
            ## parse header
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories = [
                    '_pdbx_struct_assembly',
                    '_entity_poly',
                    '_citation',
                    '_pdbx_database_related',
                    ], ## parse selected data categories
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    }
                )
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if '_pdbx_database_related.content_type' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related.content_type']:
                    continue

            try:
                if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != ['monomeric']:
                    continue
            except:
                print pdb
                stop

            if not '_citation.id' in d_mmCIF.keys():
                continue

            ##
            ## parse coordinate section
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories = [
                    '_database_PDB_rev',
                    '_refine',
                    '_refine_hist',
                    '_atom_site',
                    '_software',
                    '_entity','_entity_poly',
                    '_pdbx_struct_assembly',
                    '_pdbx_database_status',
                    ], ## parse selected data categories
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    }
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != ['monomeric']:
                continue

            resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

            if (
                int(d_mmCIF['_entity.pdbx_number_of_molecules'][0]) != 1
                or
                len(d_mmCIF['_entity_poly.pdbx_strand_id']) > 1
                or
                len(''.join(d_mmCIF['_entity_poly.pdbx_strand_id']).split(',')) > 1
                or
                len(d_mmCIF['_entity_poly.entity_id']) > 1
                ):
                print pdb
                print d_mmCIF['_entity.pdbx_number_of_molecules']
                print d_mmCIF['_entity_poly.pdbx_strand_id']
                stop

            entity_poly_id = int(''.join(d_mmCIF['_entity_poly.entity_id']))
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                entity_poly_id = d_mmCIF['_entity_poly.entity_id'][i_entity_poly]
                entity_poly_type = d_mmCIF['_entity_poly.entity_id'][i_entity_poly]

            l_Biso = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if occupancy != 1:
                    continue
                alt_id = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if alt_id != '.':
                    continue
                entity_id = d_mmCIF['_atom_site.label_entity_id'][i_atom_site]
                if entity_id != entity_poly_id:
                    continue
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]
                if not comp_id in ['MSE','ALA','CYS','ASP','GLU','PHE','GLY','HIS','ILE','LYS','LEU','MET','ASN','PRO','GLN','ARG','SER','THR','VAL','TRP','TYR',]:
                    continue
                type_symbol = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                if type_symbol == 'H':
                    continue
                atom_id = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]
                if not atom_id in ['N','CA','C',]:
                    continue

                Biso = float(d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])
                l_Biso += [Biso]

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])

            if len(l_Biso) == 0:
                continue

##            if l_Biso == len(l_Biso)*[l_Biso[0]]:
##                print pdb, year, l_Biso[0:3]
##                if year >= 2010:
##                    stop
##                continue

            Biso_average = sum(l_Biso)/len(l_Biso)

            bool_continue = False
            for Biso in set(l_Biso):
                count = l_Biso.count(Biso)
                if count > 20:
                    if '_software.name' in d_mmCIF.keys():
                        print pdb, Biso_average, Biso, count, d_mmCIF['_software.name']
                        s = '%s %6.2f %4i %6.2f %4i %s %s\n' %(
                            pdb,Biso,count,Biso_average,year,site,
                            str(d_mmCIF['_software.name']),
                            )
                    else:
                        print pdb, Biso_average, Biso, count
                        s = '%s %6.2f %4i %6.2f %4i %s\n' %(
                            pdb,Biso,count,Biso_average, year, site,
                            )
                    bool_continue = True
                    fd = open('remediation_Biso_duplicates.txt','a')
                    fd.write(s)
                    fd.close()
                    break
            if bool_continue == True:
                continue

##            if Biso_average in [2,3,4,5,6,7,8,9,99,90,50,20,25,1,100,10,0]:
            if Biso_average in range(0,100+1):
                print l_Biso
                print Biso_average
                print pdb
                print year
                stop

            if '_refine.pdbx_TLS_residual_ADP_flag' in d_mmCIF.keys():
                if ''.join(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in ['UNVERIFIED','LIKELY RESIDUAL',]:
                    continue
                elif ''.join(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in ['?',]:
                    pass
                else:
                    print d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']
                    print pdb, Biso_average
                    stop

            if round(Biso_average,4) == round(Biso_average_prev,4):
                print pdb, Biso_average, Biso_average_prev
                stop

            print pdb, round(Biso_average,2), resolution
            fd = open('Biso_v_resolution.gnuplotdata','a')
            fd.write('%s %s %s %s\n' %(Biso_average,resolution,pdb,year,))
            fd.close()

    plot()
예제 #23
0
import matthews_coefficient, parse_mmCIF

for pdb in [
        '2hhb',
        '1hho',
        '1hv4',
        '3hl9',
        '3hlb',
        '3hlc',
        '3hld',
        '3hle',
        '3hlf',
        '3hlg',
]:

    d_mmCIF = parse_mmCIF.main(pdb)

    a = float(d_mmCIF['_cell.length_a'][0])
    b = float(d_mmCIF['_cell.length_b'][0])
    c = float(d_mmCIF['_cell.length_c'][0])
    alpha = float(d_mmCIF['_cell.angle_alpha'][0])
    beta = float(d_mmCIF['_cell.angle_beta'][0])
    gamma = float(d_mmCIF['_cell.angle_gamma'][0])
    Z = int(d_mmCIF['_cell.Z_PDB'][0])  ## number of polymers in unit cell
    mw = 0
    for i in range(len(d_mmCIF['_entity.id'])):
        if d_mmCIF['_entity.type'][i] == 'polymer':
            mw += float(d_mmCIF['_entity.formula_weight'][i])
    MV = matthews_coefficient.main(
        a,
        b,
예제 #24
0
        dn,
    ))
    l_fn.sort()
    for fn in l_fn:
        pdb = fn[:4]
        if fn[-3:] == '.gz':
            continue
########        if pdb in ['2fl9','3gau','3gav','3gaw',]: ## tmp!!!
########            continue
##        print pdb
        fd = open('%s/%s/%s' % (path, dn, fn), 'r')
        lines = fd.readlines()
        fd.close()
        d = parse_mmCIF.main(
            pdb,
            lines,
            l_data_categories=l_data_categories,
            d_breaks=d_breaks,
        )

        if d_exclude_subset:
            bool_continue = False
            for item_exclude, l_values_exclude in d_exclude_subset.items():
                if not item_exclude in d.keys():
                    bool_continue = True
                    fd = open('%s/remediation_%s.txt' % (
                        path,
                        item_exclude,
                    ), 'a')
                    fd.write('%s\n' % (pdb))
                    fd.close()
                    continue
예제 #25
0
def parse_cifs(
    l_pdbs,
    ref_seq, l_db_codes,
    n_mutations_max,
    resolution_min,
    bool_multiple_entities = False,
    ):

    print 'parse cifs'

    n_mutants = 0
    l_wts = []
    l_wts_cysfree = []
    d_mutants = {}

    d_mmCIF_main = {}
    for pdb in l_pdbs:

        if pdb[:4].lower() in d_mmCIF_main.keys():
            continue

        d_mmCIF = parse_mmCIF.main(pdb[:4].lower(),)

        ## not an x-ray structure
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            print pdb, d_mmCIF['_exptl.method']
            continue

        ## more than one type of polymer present
        n_entities = len(d_mmCIF['_entity_poly.entity_id'])
        if bool_multiple_entities == False:
            if n_entities > 1:
                print pdb, 'entities', n_entities #, d_mmCIF['_struct.title']
                continue

        ## low resolution
        if d_mmCIF['_refine.ls_d_res_high'] != d_mmCIF['_refine_hist.d_res_high']:
            print d_mmCIF['_refine.ls_d_res_high']
            print d_mmCIF['_refine_hist.d_res_high']
            stop
        if resolution_min:
##            if float(d_mmCIF['_refine.ls_d_res_high'][0]) >= resolution_min:
            if float(d_mmCIF['_refine.ls_d_res_high'][0]) > resolution_min:
                print pdb, 'resolution', d_mmCIF['_refine.ls_d_res_high']
                continue

        ## get entity ID from chain ID
        for i_entity in range(len(d_mmCIF['_entity_poly.entity_id'])):
            entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
            s_chain_ids = d_mmCIF['_entity_poly.pdbx_strand_id'][i_entity]
            if pdb[-1] in s_chain_ids:
                break
        if pdb[-1] not in s_chain_ids:
            print pdb
            print s_chain_ids
            stop
        ## get sequence from entity ID
        seq = []
        for i in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
            if d_mmCIF['_entity_poly_seq.entity_id'][i] == entity_id:
                mon_id = d_mmCIF['_entity_poly_seq.mon_id'][i]
                if pdb[:4] == '1RCM' and i == 126:
                    if mon_id != 'CYS':
                        stop
                    mon_id = 'CCS'
                seq += [mon_id]

        ## wrong chain length
        if ref_seq:
            if len(seq) != len(ref_seq):
                if ''.join(ref_seq) in ''.join(seq):
                    print ref_seq
                    print seq
                    stop
                ## unobserved atoms not in seqres
                elif ''.join(seq) in ''.join(ref_seq):
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and pdb in [
                    '1KS3_A','1KW5_A','1KW7_A','1KY0_A','1KY1_A','1L0J_A','1LOK_A','1LPY_A','1LW9_A','1LWG_A','1LWK_A',
                    ]:
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and seq[-1] == 'LYS':
                    pass
                else:
                    print pdb, 'seqlen', len(seq)
                    continue

        ## not from Gallus gallus
        ## check not necessary, because sequence checked against ref seq
        entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
        db_code = d_mmCIF['_struct_ref.db_code'][d_mmCIF['_struct_ref.entity_id'].index(entity_id)]
        if db_code not in l_db_codes:
            print pdb, 'uniprot', db_code
            continue

        ## more than 1 mutation?
        if n_mutations_max != None:
            l_mutations = []
            for i_seq in range(len(seq)):
                res_id_mmCIF = seq[i_seq]
                res_id_uniprot = ref_seq[i_seq]
                if res_id_mmCIF != res_id_uniprot:
                    l_mutations += ['%3s%i%3s' %(res_id_uniprot,i_seq+1,res_id_mmCIF,)]
##            if len(l_mutations) == 1:
            if len(l_mutations) > n_mutations_max:
                print pdb, 'muts', len(l_mutations)
                continue
            elif len(l_mutations) > 0:
                n_mutants += 1
                startmodel = parse_mmCIF_item(d_mmCIF,'_refine.pdbx_starting_model',pdb,)
                    

        ## append to lists and dictionaries
        d_mmCIF_main[pdb[:4]] = d_mmCIF
        if len(l_mutations) > 0:
            if l_mutations == ['CYS54THR', 'CYS97ALA']:
                l_wts_cysfree += [pdb]
            d_mutants[pdb] = {'mutations':l_mutations,'startmodel':startmodel}
        else:
            l_wts += [pdb]

##    print 'd_mutants', d_mutants

    return d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree
예제 #26
0
    '1u3fA',
    '1agyA',
    '1zioA',
    '1pa9A',
    '2tpsA',
    '2plcA',
    '1qk2A',
    '1j53A',
    '1m21A',
]

cutoff = 10

for pdb in l_pdbs:

    pdb = pdb[:4]

    d = parse_mmCIF.main(pdb, )

    d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain=pdb[4:])

    matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose=False)

    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian,
                                                        verbose=False)

    visualization.vmd_arrows(pdb, l_coords, eigenvectors)

    print pdb
    stop
예제 #27
0
    '1czfA', '1thgA', '1booA', '1iu4A', '1bqcA', '206lA', '1cdeA', '1snzA',
    '1gq8A', '1aqlA', '1ps1A', '1s95A', '1pylA', '1ra2A', '1b6bA', '1pntA',
    '1e1aA', '2f9rA', '1v04A', '2nlrA', '1n29A', '1pbgA', '5cpaA', '1agmA',
    '1byaA', '1r76A', '1u5uA', '1vidA', '1h4gA', '1akdA', '1fy2A', '1xqdA',
    '1d6oA', '1qv0A', '1qjeA', '1fvaA', '1bp2A', '1ah7A', '2pthA', '2engA',
    '2acyA', '1qazA', '2a0nA', '1dl2A', '1gp5A', '1onrA', '1cwyA', '1pudA',
    '1bs9A', '1dinA', '1xyzA', '1bwlA', '1eugA', '1idjA', '1g24A', '1oygA',
    '1hzfA', '9papA', '1eb6A', '1ghsA', '1rbnA', '1bixA', '1bs4A', '1celA',
    '1hkaA', '1b02A', '1qibA', '1u3fA', '1agyA', '1zioA', '1pa9A', '2tpsA',
    '2plcA', '1qk2A', '1j53A', '1m21A',
    ]

cutoff = 10

for pdb in l_pdbs:

    pdb = pdb[:4]

    d = parse_mmCIF.main(pdb,)

    d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain = pdb[4:])

    matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose = False)

    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, verbose = False)

    visualization.vmd_arrows(pdb, l_coords, eigenvectors)

    print pdb
    stop
예제 #28
0
def unobs_nonterminal_atoms_alpha():
    """List PDB entries that have an unobserved polymer CA atom.

    Candidate IDs are read from 'list_pdbx_unobs_or_zero_occ_atoms.txt' and
    IDs found in 'list_pdbx_unobs_residues__NONTERMINAL.txt' are excluded
    (both files live under the module-level `path`).  Each remaining entry
    is parsed and kept if any _pdbx_unobs_or_zero_occ_atoms row is a CA atom
    with polymer_flag 'Y' and occupancy_flag '1'.  The IDs are meant to be
    written to 'list_pdbx_unobs_atoms__CA.txt'.

    NOTE(review): the function is in a work-in-progress state:
    - only the entry '3e3d' passes the hard-coded filter in the first loop;
    - `stop`, `stop1`, `stop2` and similar bare names are not defined here;
      unless they are defined elsewhere in the file, evaluating them raises
      NameError, i.e. they act as deliberate halts.  The `stop` after the
      first loop means the output file is not written;
    - the second loop iterates over an empty list and never runs.

    Returns None.
    """

    ## this method is not entirely correct... e.g. 1kwr...

    category = fn = '_pdbx_unobs_or_zero_occ_atoms'

    ## entries that have unobserved or zero-occupancy atoms at all
    fd = open('%s/list%s.txt' %(path,fn))
    s = fd.read()
    fd.close()
    l_pdbs_include = s.split()

    ## if a whole residue is missing, then all of its atoms are also missing
    fd = open('%s/list_pdbx_unobs_residues__NONTERMINAL.txt' %(path))
    s = fd.read()
    fd.close()
    l_pdbs_exclude = s.split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_atoms',
        '_entity_poly',
        '_struct', ## .pdbx_model_type_details
        '_exptl',
        ]
    ## passed to parse_mmCIF.main as d_breaks; presumably stops parsing of
    ## NMR entries early -- confirm against parse_mmCIF
    d_breaks = {'_exptl.method':['SOLUTION NMR','SOLID-STATE NMR']}

    fn_out = 'list_pdbx_unobs_atoms__CA.txt'

    l_pdbs_out = []
    for pdb in l_pdbs_include:

##        if pdb[1:3] < 'fe':
##            continue
##        if pdb == '2kzt': ## takes too long...
##            continue
        ## NOTE(review): debugging filter, only 3e3d is processed
        if pdb != '3e3d':
            continue

        if pdb in l_pdbs_exclude:
            continue

        print pdb

        d = parse_mmCIF.main(pdb,l_data_categories=l_data_categories,d_breaks=d_breaks,)

        ## something has to be missing in the first place for it to be terminal/nonterminal
        if not category in d.keys():
            continue
        ## it has to be a polymer in the first place for anything to be terminal/nonterminal
        if not '_pdbx_poly_seq_scheme' in d.keys():
            continue
        ## don't deal with NMR models for now... (too many unobs records when hydrogen...)
        if d['_exptl.method'] != ['X-RAY DIFFRACTION']:
            continue
        if '_struct.pdbx_model_type_details' in d.keys():
            if d['_struct.pdbx_model_type_details'] in [
                ['?'],
                ['minimized average'],
                ['MINIMIZED AVERAGE'],
                ]:
                pass
            ## if residues are not missing, and model is CA only, then no CA are missing!!!
            elif 'CA ATOMS ONLY' in d['_struct.pdbx_model_type_details'][0]:
                continue
            ## unknown model type: report and halt
            else:
                print d['_struct.pdbx_model_type_details']
                stop
##        if not 'CA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
##            continue

        ## bool_append is not used in this loop
        bool_append = False
        ## NOTE(review): the row count is taken from the bare category key,
        ## whereas all columns are read via 'category.item' keys -- confirm
        ## that parse_mmCIF provides an entry of matching length here
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):
            if (
                d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] == 'CA'
                and
                d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'][i_unobs] == 'Y'
                and
                ## unobs (1), zero_occ (0)
                d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs] == '1'
                ):
                ## one unobserved polymer CA is enough to list the entry
                l_pdbs_out += [pdb]
                print '***', pdb
                break

        continue

    print l_pdbs_out
    ## NOTE(review): halts here, so the write below is not reached
    stop
    fd = open('%s/%s' %(path,fn_out,),'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    ## NOTE(review): disabled earlier implementation; the loop body never
    ## runs because the iterable is empty.  It relies on `pdb` and `d` left
    ## over from the loop above.
    for x in []:

        l_indexes_unobs = []
        bool_append = False
        ## concatenated chain IDs of all polymer residues; index()/rindex()
        ## below locate the first and last row of a chain (this presumes
        ## single-character chain IDs -- TODO confirm)
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):

            ## skip if not alpha carbon
            if d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] != 'CA':
                continue
            ## skip if zero occupancy
            if d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs] == '0':
                continue

            ## halt on entries listing unobserved HA atoms
            if 'HA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop2
            ## halt on entries with more than 800 polymer records
            if d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'].count('Y') > 800:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop1

            asymID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_asym_id'][i_unobs]
            seqID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'][i_unobs]

            ## rows [index1,index2) of the sequence scheme belong to the chain
            index1 = s.index(asymID_unobs)
            index2 = s.rindex(asymID_unobs)+1
            for i_poly in range(index1,index2,):

                asymID_poly = d['_pdbx_poly_seq_scheme.pdb_strand_id'][i_poly]
                seqID_poly = d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly]

                if seqID_poly == seqID_unobs:

                    ## insertion codes must agree ('.' and '?' both mean none)
                    if d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == '.' and d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] == '?':
                        pass
                    elif d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs]:
                        pass
                    elif d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] == '?' and d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] != '.':
                        continue
                    elif not d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] in d['_pdbx_poly_seq_scheme.pdb_ins_code']:
                        print d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly]
                        ## NOTE(review): insCode_unobs is not assigned anywhere
                        ## in this function
                        print insCode_unobs
                        print pdb
                        print seqID_unobs, asymID_unobs
                        stop
                    else:
                        continue

                    ## halt if the matched row belongs to another chain
                    if asymID_unobs != asymID_poly:
                        stop_add_with_check_of_identiiical_seqID

                    ## tmp!!! check!!!
                    ## halt if the residue names of the two records differ
                    if d['_pdbx_unobs_or_zero_occ_atoms.auth_comp_id'][i_unobs] != d['_pdbx_poly_seq_scheme.pdb_mon_id'][i_poly]:
                        print pdb
                        stop

                    ## NOTE(review): with the branches below commented out, the
                    ## following elif chain continues the residue-name check
                    ## above
##                    ## last residue
##                    if index2-i_poly == 0:
##                        pass ## should append...
##                    ## first residue
##                    elif i_poly-index1 == 0:
##                        pass ## should append...
####                    elif i_poly-index1 > 1 and bool_unobs_prev == False:
####                        bool_append = True
                    ## previous residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:i_poly] == (i_poly-index1)*['?']:
                        bool_append = True
                    ## next residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly+1:index2] == (index2-i_poly-1)*['?']:
                        bool_append = True
                    ## zero occupancy residue prior to residue with unobserved atom(s)
                    elif pdb in ['7adh']:
                        bool_append = False
                        pass
                    else:
                        ## all rows before this one already recorded as unobserved
                        if len( set(range(index1,i_poly)) - set(l_indexes_unobs) ) == 0:
                            l_indexes_unobs += [i_poly]
                            stop1
                            pass
                        ## all rows after this one already recorded as unobserved
                        elif len( set(range(i_poly+1,index2)) - set(l_indexes_unobs) ) == 0:
                            l_indexes_unobs += [i_poly]
                            print pdb
                            print l_indexes_unobs
                            print i_poly, index1, index2
                            stop2
                            pass
                        else:
                            ## this method is not entirely correct... e.g. 1kwr...
                            ## print diagnostics for rows within 10 of a chain end
                            if i_poly-index1 < 10 or index2-i_poly < 10:
                                print pdb
                                print i_poly-index1
##                        print index2-i_poly
                                print seqID_unobs
                                print pdb
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:i_poly]
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly:index2]
                                print pdb
##                                stop
                            bool_append = True
                            break

            if bool_append == True:
                break

        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            continue

        ## recorded indexes without a decision: report and halt
        if l_indexes_unobs != []:
            print l_indexes_unobs
            stop

    fd = open('%s/%s' %(path,fn_out,),'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
def _split_growth_details(s_grow):
    """Split free-text crystal growth details into candidate component names.

    The text is upper-cased, split on commas and each element is stripped.
    Empty elements are dropped, one trailing '.' is removed per element,
    two boilerplate phrases are deleted and 'MILLIMOLAR' is abbreviated to
    'MM'.  Elements that equal a crystallization method name or that start
    with a physical condition (temperature, pH) are then removed.

    Returns the remaining elements as a list (possibly empty).
    """

    l_grow = [x.strip() for x in s_grow.upper().split(',')]
    ## drop *every* empty element (e.g. from ",," or a trailing comma);
    ## an empty string would raise IndexError in the x[-1] look-up below
    l_grow = [x for x in l_grow if x != '']
    ## remove end punctuation
    l_grow = [x[:-1]+x[-1].replace('.','') for x in l_grow]

    ## remove selected words from elements of list
    for phrase in [
        'CRYSTALS OBTAINED BY CO-CRYSTALLIZATION AT ',
        'PROTEIN SOLUTION (',
        ]:
        l_grow = [x.replace(phrase,'') for x in l_grow]

    ## replace abbreviations
    l_grow = [x.replace('MILLIMOLAR','MM') for x in l_grow]

    ## remove selected words from list
    l_remove = [
        x for x in [
            'VAPOR DIFFUSION',
            'VAPOUR DIFFUSION',
            'HANGING DROP',
            'SITTING DROP',
            ]
        if x in l_grow
        ]

    ## remove other selected elements from list
    t_prefixes = ('TEMPERATURE','PH=','PH ','AT PH ',)
    for s in l_grow:
        ## remove physical conditions
        if s.startswith(t_prefixes):
            l_remove += [s]
            continue
        ## remove long words (sentences)
        if len(s) > 50:
            l_remove += [s]
            ## NOTE(review): the scan stops at the first long element, so
            ## physical conditions listed after it are kept - confirm that
            ## this is intended (behaviour kept as it was)
            break

    for remove in l_remove:
        l_grow.remove(remove)

    return l_grow


def main():
    """Collect crystal growth annotation from local mmCIF files.

    Only PDB IDs listed in 'remediation_exptl_crystal_grow.pH.txt' are
    processed.  Directories below the hard-coded mmCIF path are visited in
    sorted order, skipping directory names that sort before sys.argv[-1].

    For each X-ray entry:
    - if no pH / pH range is given but the growth details mention a pH,
      a line is appended to 'remediation_exptl_crystal_grow.pH.txt';
    - if no growth component name is given but the growth details can be
      split into candidate components, a line is appended to
      'remediation_exptl_crystal_grow_comp.name.txt';
    - the growth details and component names are appended to
      'db_exptl_crystal_grow.txt' and 'db_exptl_crystal_grow_comp.txt'.

    When the loop finishes, both db files are rewritten with one line per
    PDB ID (entries from previous runs plus the ones found in this run).
    """

    l_fn_out = [
        '_exptl_crystal_grow',
        '_exptl_crystal_grow_comp',
        ]

    ## entries written to the db files by previous runs
    d = {}
    for fn_out in l_fn_out:
        with open('db%s.txt' %(fn_out),'r') as fd:
            lines = fd.readlines()
        d[fn_out] = {}
        for line in lines:
            if line == '\n':
                continue
            pdb = line[:4]
            ## the line terminator is added again when the file is rewritten
            s = line[5:].rstrip('\n')
            d[fn_out][pdb] = s

    with open('remediation_exptl_crystal_grow.pH.txt','r') as fd:
        lines = fd.readlines()
    ## a set, because membership is tested once per mmCIF file
    set_pdbs = set([line[:4] for line in lines])

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        path_dn = '%s/%s' %(path,dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb not in set_pdbs:
                continue

            with open('%s/%s' %(path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    },
                l_data_categories_break = [
                    '_diffrn',
                    ],
                l_data_categories = [
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_pdbx_database_status',
                    '_exptl',
                    '_exptl_crystal_grow',
                    '_exptl_crystal_grow_comp',
                    ],
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            process_site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])
            bool_details = '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys()
            bool_comp = '_exptl_crystal_grow_comp.name' in d_mmCIF.keys()

            ## neither growth details nor growth components given
            if not bool_details and not bool_comp:
                if process_site != '?':
                    print('%s %s %s' %(pdb, year, process_site))
                continue

            if bool_details:

                s_grow = ' '.join(d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip()
                s_grow_upper = s_grow.upper()
                l_not_given = [['?'],[''],['.'],]

                if (
                    ## pH not given
                    d_mmCIF['_exptl_crystal_grow.pH'] in l_not_given
                    and
                    d_mmCIF['_exptl_crystal_grow.pdbx_pH_range'] in l_not_given
                    and
                    ## but pH in growth details
                    (
                        ' PH ' in s_grow_upper
                        or
                        'PH=' in s_grow_upper
                        or
                        ',PH ' in s_grow_upper
                        )
                    ):
                    line = '%s\t%s\t%s\t%4i\t%s\t%s\n' %(
                        pdb,
                        ''.join(d_mmCIF['_exptl_crystal_grow.pH']),
                        ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_pH_range']),
                        year,
                        process_site,
                        s_grow,
                        )
                    with open('remediation_exptl_crystal_grow.pH.txt','a') as fd:
                        fd.write(line)

                if bool_comp:
                    name = ''.join(d_mmCIF['_exptl_crystal_grow_comp.name'])
                else:
                    name = 'N/A'

                ## component name not given; try to derive it from the details
                if not bool_comp or name in ['.','','?',]:
                    l_grow = _split_growth_details(s_grow)
                    if len(l_grow) > 0:
                        ## write to file
                        line = '%s\t%s\t%s\t%4i\t%s\t%s\n' %(
                            pdb,
                            name,
                            l_grow,
                            year,
                            process_site,
                            s_grow,
                            )
                        with open('remediation_exptl_crystal_grow_comp.name.txt','a') as fd:
                            fd.write(line)

            else:
                s_grow = ''

            if bool_comp:
                l_grow_comp = d_mmCIF['_exptl_crystal_grow_comp.name']
            else:
                l_grow_comp = []

            ## append to txt file in case loop doesn't finish
            d_lines = {
                '_exptl_crystal_grow':'%s %s\n' %(pdb,s_grow,),
                '_exptl_crystal_grow_comp':'%s %s\n' %(pdb,l_grow_comp,),
                }
            for fn_out in l_fn_out:
                with open('db%s.txt' %(fn_out),'a') as fd:
                    fd.write(d_lines[fn_out])

            ## append to dic for when loop finishes
            d['_exptl_crystal_grow'][pdb] = s_grow
            d['_exptl_crystal_grow_comp'][pdb] = l_grow_comp

    ## rewrite each db file with one line per PDB ID
    ## (previously the leaked loop variable fn_out was used as the file name
    ## and the two category dictionaries themselves were written to it)
    for fn_out in l_fn_out:
        lines_out = [
            '%s %s\n' %(pdb_out,s,)
            for pdb_out,s in sorted(d[fn_out].items())
            ]
        with open('db%s.txt' %(fn_out),'w') as fd:
            fd.writelines(lines_out)

    return
예제 #30
0
import sys
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF, mmCIF2coords
sys.path.append('/home/tc/svn/GoodVibes')
import NMA, visualization

## cutoff handed to NMA.hessian_calculation
## (presumably a distance; the unit is not visible here)
cutoff = 10

## parse mmCIF entry 2lzm and extract its coordinates
d_mmCIF = parse_mmCIF.main('2lzm')
d_coords, l_coords_alpha = mmCIF2coords.main('2lzm', d_mmCIF)

## Hessian from the alpha coordinates, followed by its diagonalization
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)

## hand coordinates and eigenvectors to the VMD trajectory routine
visualization.vmd_trajectory('2lzm', l_coords_alpha, eigenvectors)
예제 #31
0
def main():
    """Add the audit authors of local mmCIF entries to 'db_authors.txt'.

    Entries already present in 'db_authors.txt' are skipped.  Directories
    below the hard-coded mmCIF path are visited in sorted order, skipping
    directory names that sort before sys.argv[-1].  For each new entry the
    '_audit_author.name' values are joined with ';' and appended to the
    file immediately, so an interrupted run loses nothing.  When the loop
    finishes the file is rewritten with one line per PDB ID.

    Raises RuntimeError if an entry has no '_audit_author.name' values.
    """

    ## authors found by previous runs; the author string is kept in one
    ## piece (names contain spaces), so rewriting the file below reproduces
    ## the stored lines instead of the repr of a token list
    d_authors = {}
    with open('db_authors.txt', 'r') as fd:
        for line in fd:
            l = line.strip().split(None, 1)
            ## skip blank lines
            if not l:
                continue
            pdb = l[0]
            if len(l) > 1:
                s_authors = l[1]
            else:
                s_authors = ''
            d_authors[pdb] = s_authors

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## already in txt file from a previous run
            if pdb in d_authors:
                continue

            print(pdb)

            with open('%s/%s' % (path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                l_data_categories=[
                    '_audit_author',
                    '_citation_author',
                ],  ## parse selected data categories
                l_data_categories_break=[
                    '_citation_author',
                ],
            )

            l_authors = d_mmCIF['_audit_author.name']
            s_authors = ';'.join(l_authors)

            ## unexpected: halt so the entry can be inspected
            ## (explicit exception instead of the former undefined name)
            if l_authors == []:
                print(d_mmCIF['_citation_author.name'])
                print(l_authors)
                raise RuntimeError(
                    '%s: no _audit_author.name values' % (pdb,))

            line = '%s %s\n' % (
                pdb,
                s_authors,
            )

            ## append to txt file in case loop doesn't finish
            with open('db_authors.txt', 'a') as fd:
                fd.write(line)

            d_authors[pdb] = s_authors

    ##
    ## write to file
    ##
    lines_out = [
        '%s %s\n' % (pdb_out, s_authors_out)
        for pdb_out, s_authors_out in sorted(d_authors.items())
    ]
    with open('db_authors.txt', 'w') as fd:
        fd.writelines(lines_out)

    return
예제 #32
0
## each line names a pair of PDB entries (columns 0 and 1)
## and their chains (columns 4 and 5)
for i_line, line in enumerate(lines):
    ## start again with an empty per-entry cache every 100 lines
    if i_line % 100 == 0:
        d_coordinates = {}
    l = line.split()
    pdb1, pdb2, chain1, chain2 = l[0], l[1], l[4], l[5]

    for pdb, chain in ((pdb1, chain1), (pdb2, chain2)):

        ## entry already cached
        if pdb in d_coordinates:
            continue

        d_mmCIF = parse_mmCIF.main(pdb)

        ## halt (bare undefined name) if the two numbering columns differ
        ## NOTE(review): other code in this file reads
        ## '_pdbx_poly_seq_scheme.auth_seq_num'; confirm that the key
        ## 'author_seq_num' exists in what parse_mmCIF returns
        if d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num'] != d_mmCIF['_pdbx_poly_seq_scheme.author_seq_num']:
            print(d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num'])
            print(d_mmCIF['_pdbx_poly_seq_scheme.author_seq_num'])
            stop

        ## map pdb_seq_num to ndb_seq_num for the residues of this chain
        d_coords = {}
        d_ndb_seq_num = {}
        for i_seq in range(len(d_mmCIF['_pdbx_poly_seq_scheme.ndb_seq_num'])):
            if d_mmCIF['_pdbx_poly_seq_scheme.pdb_strand_id'][i_seq] == chain:
                ndb_seq_num = d_mmCIF['_pdbx_poly_seq_scheme.ndb_seq_num'][i_seq]
                pdb_seq_num = d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num'][i_seq]
                d_ndb_seq_num[pdb_seq_num] = ndb_seq_num
예제 #33
0
def unobs_nonterminal_atoms_alpha():
    """List PDB entries that have an unobserved polymer CA atom (unfinished).

    Candidate IDs are read from
    '<path>/list_pdbx_unobs_or_zero_occ_atoms.txt'; IDs listed in
    '<path>/list_pdbx_unobs_residues__NONTERMINAL.txt' are skipped.  For
    each remaining X-ray entry the selected mmCIF categories are parsed and
    the ID is collected when any '_pdbx_unobs_or_zero_occ_atoms' record is
    a CA atom with polymer_flag 'Y' and occupancy_flag '1'.

    `path` is not defined in this function; it has to be provided by the
    enclosing module.

    NOTE(review): in its current state the function is a debugging
    snapshot.  Only the entry '3e3d' passes the hard-coded filter, and the
    bare name `stop` after the main loop (not defined in this function;
    presumably undefined at module level too, i.e. a NameError) halts
    before the output file is written.  Everything after it, including the
    never-executed `for x in []` loop, is kept for reference only.
    """

    ## this method is not entirely correct... e.g. 1kwr...

    ## the same string is the category key and part of the list file name
    category = fn = '_pdbx_unobs_or_zero_occ_atoms'

    fd = open('%s/list%s.txt' % (path, fn))
    s = fd.read()
    fd.close()
    l_pdbs_include = s.split()

    ## if a whole residue is missing, then all of it's atoms are also missing
    fd = open('%s/list_pdbx_unobs_residues__NONTERMINAL.txt' % (path))
    s = fd.read()
    fd.close()
    l_pdbs_exclude = s.split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_atoms',
        '_entity_poly',
        '_struct',  ## .pdbx_model_type_details
        '_exptl',
    ]
    ## passed to parse_mmCIF.main below; presumably makes the parser stop
    ## early for NMR entries - confirm against parse_mmCIF
    d_breaks = {'_exptl.method': ['SOLUTION NMR', 'SOLID-STATE NMR']}

    fn_out = 'list_pdbx_unobs_atoms__CA.txt'

    l_pdbs_out = []
    for pdb in l_pdbs_include:

        ##        if pdb[1:3] < 'fe':
        ##            continue
        ##        if pdb == '2kzt': ## takes too long...
        ##            continue
        ## NOTE(review): hard-coded filter - every entry except 3e3d is
        ## skipped; looks like a debugging leftover
        if pdb != '3e3d':
            continue

        if pdb in l_pdbs_exclude:
            continue

        print pdb

        d = parse_mmCIF.main(
            pdb,
            l_data_categories=l_data_categories,
            d_breaks=d_breaks,
        )

        ## something has to be missing in the first place for it to be terminal/nonterminal
        if not category in d.keys():
            continue
        ## it has to be a polymer in the first place for anything to be terminal/nonterminal
        if not '_pdbx_poly_seq_scheme' in d.keys():
            continue
        ## don't deal with NMR models for now... (too many unobs records when hydrogen...)
        if d['_exptl.method'] != ['X-RAY DIFFRACTION']:
            continue
        if '_struct.pdbx_model_type_details' in d.keys():
            if d['_struct.pdbx_model_type_details'] in [
                ['?'],
                ['minimized average'],
                ['MINIMIZED AVERAGE'],
            ]:
                pass
            ## if residues are not missing, and model is CA only, then no CA are missing!!!
            elif 'CA ATOMS ONLY' in d['_struct.pdbx_model_type_details'][0]:
                continue
            else:
                ## unknown model type: print it and halt on the bare name
                print d['_struct.pdbx_model_type_details']
                stop
##        if not 'CA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
##            continue

        ## bool_append is not read again before the next assignment
        bool_append = False
        ## the loop length comes from the value stored under the bare
        ## category key; presumably one element per record - confirm
        ## against parse_mmCIF
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):
            if (d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs]
                    == 'CA' and
                    d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'][i_unobs]
                    == 'Y' and
                    ## unobs (1), zero_occ (0)
                    d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs]
                    == '1'):
                ## one matching record is enough to list the entry
                l_pdbs_out += [pdb]
                print '***', pdb
                break

        continue

    print l_pdbs_out
    ## bare name used as a halt: unless `stop` is defined at module level
    ## this raises NameError, so nothing below is executed
    stop
    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    ## never executes (iteration over an empty list); the body relies on
    ## `pdb` and `d` from the loop above and on names such as
    ## insCode_unobs that are not assigned anywhere in this function
    for x in []:

        l_indexes_unobs = []
        bool_append = False
        ## one character per residue, assuming single-character strand IDs;
        ## used to find the first/last index of a chain
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):

            ## skip if not alpha carbon
            if d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] != 'CA':
                continue
            ## skip if zero occupancy
            if d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][
                    i_unobs] == '0':
                continue

            if 'HA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop2
            if d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'].count(
                    'Y') > 800:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop1

            asymID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_asym_id'][
                i_unobs]
            seqID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'][
                i_unobs]

            ## index range [index1, index2) of the chain in the seq scheme
            index1 = s.index(asymID_unobs)
            index2 = s.rindex(asymID_unobs) + 1
            for i_poly in range(
                    index1,
                    index2,
            ):

                asymID_poly = d['_pdbx_poly_seq_scheme.pdb_strand_id'][i_poly]
                seqID_poly = d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly]

                if seqID_poly == seqID_unobs:

                    ## match insertion codes ('.' in the seq scheme and '?'
                    ## in the unobs record are treated as equal)
                    if d['_pdbx_poly_seq_scheme.pdb_ins_code'][
                            i_poly] == '.' and d[
                                '_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                                    i_unobs] == '?':
                        pass
                    elif d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == d[
                            '_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                                i_unobs]:
                        pass
                    elif d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                            i_unobs] == '?' and d[
                                '_pdbx_poly_seq_scheme.pdb_ins_code'][
                                    i_poly] != '.':
                        continue
                    elif not d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                            i_unobs] in d['_pdbx_poly_seq_scheme.pdb_ins_code']:
                        print d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly]
                        print insCode_unobs
                        print pdb
                        print seqID_unobs, asymID_unobs
                        stop
                    else:
                        continue

                    if asymID_unobs != asymID_poly:
                        stop_add_with_check_of_identiiical_seqID

                    ## tmp!!! check!!!
                    ## NOTE(review): the elif branches below belong to this
                    ## `if`, because the original leading `if` of the chain
                    ## is commented out
                    if d['_pdbx_unobs_or_zero_occ_atoms.auth_comp_id'][
                            i_unobs] != d['_pdbx_poly_seq_scheme.pdb_mon_id'][
                                i_poly]:
                        print pdb
                        stop

##                    ## last residue
##                    if index2-i_poly == 0:
##                        pass ## should append...
##                    ## first residue
##                    elif i_poly-index1 == 0:
##                        pass ## should append...
####                    elif i_poly-index1 > 1 and bool_unobs_prev == False:
####                        bool_append = True
## previous residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][
                            index1:i_poly] == (i_poly - index1) * ['?']:
                        bool_append = True
                    ## next residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][
                            i_poly +
                            1:index2] == (index2 - i_poly - 1) * ['?']:
                        bool_append = True
                    ## zero occupancy residue prior to residue with unobserved atom(s)
                    elif pdb in ['7adh']:
                        bool_append = False
                        pass
                    else:
                        ## all preceding residues of the chain already
                        ## recorded in l_indexes_unobs
                        if len(
                                set(range(index1, i_poly)) -
                                set(l_indexes_unobs)) == 0:
                            l_indexes_unobs += [i_poly]
                            stop1
                            pass
                        ## all following residues of the chain already
                        ## recorded in l_indexes_unobs
                        elif len(
                                set(range(i_poly + 1, index2)) -
                                set(l_indexes_unobs)) == 0:
                            l_indexes_unobs += [i_poly]
                            print pdb
                            print l_indexes_unobs
                            print i_poly, index1, index2
                            stop2
                            pass
                        else:
                            ## this method is not entirely correct... e.g. 1kwr...
                            if i_poly - index1 < 10 or index2 - i_poly < 10:
                                print pdb
                                print i_poly - index1
                                ##                        print index2-i_poly
                                print seqID_unobs
                                print pdb
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][
                                    index1:i_poly]
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][
                                    i_poly:index2]
                                print pdb


##                                stop
                            bool_append = True
                            break

            if bool_append == True:
                break

        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            continue

        if l_indexes_unobs != []:
            print l_indexes_unobs
            stop

    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
예제 #34
0
import sys
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF, mmCIF2coords
sys.path.append('/home/tc/svn/GoodVibes')
import NMA, visualization

## cutoff used when the Hessian is built
## (presumably a distance; the unit is not visible here)
cutoff = 10

## read mmCIF entry 2lzm and derive the coordinate containers from it
d_mmCIF = parse_mmCIF.main('2lzm')
d_coords, l_coords_alpha = mmCIF2coords.main('2lzm', d_mmCIF)

## build the Hessian from the alpha coordinates and diagonalize it
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)

## pass coordinates and eigenvectors on to the VMD trajectory routine
visualization.vmd_trajectory('2lzm', l_coords_alpha, eigenvectors)
def parse_coords(pdb):
    """Parse mmCIF entry *pdb* and return its data and alpha coordinates.

    Returns a 2-tuple: whatever parse_mmCIF.main(pdb) returns, and the
    second of the two values returned by mmCIF2coords.main for that entry.
    The first value returned by mmCIF2coords.main is discarded.
    """

    parsed = parse_mmCIF.main(pdb)
    _coords_unused, alpha_coords = mmCIF2coords.main(pdb, parsed)

    return parsed, alpha_coords
예제 #36
0
def _center_of_mass_and_radius_of_gyration(l_coords, l_masses):
    """Return (center of mass, mass-weighted radius of gyration).

    l_coords is a sequence of numpy coordinate arrays and l_masses the
    matching sequence of masses; the total mass must not be zero.
    """

    mass_total = sum(l_masses)

    center_of_mass = numpy.array([
        0.,
        0.,
        0.,
    ])
    for coord, mass in zip(l_coords, l_masses):
        center_of_mass += mass * coord
    center_of_mass /= mass_total

    sum_r = 0
    for coord, mass in zip(l_coords, l_masses):
        sq_dist_from_center_of_mass = sum((coord - center_of_mass)**2)
        sum_r += mass * sq_dist_from_center_of_mass
    radius_of_gyration = math.sqrt(sum_r / mass_total)

    return center_of_mass, radius_of_gyration


def main():
    """Calculate radii of gyration for local mmCIF entries.

    Entries already present in 'radius_of_gyration.txt' are skipped.
    Directories below the hard-coded mmCIF path are visited in sorted
    order, skipping directory names that sort before sys.argv[-1].  An
    entry is used only if it has a single polymer entity, all polymers are
    'polypeptide(L)' and every assembly is 'monomeric'.  The radius is
    calculated from the polymer atoms other than hydrogen whose element is
    a key of the module-level `d_mass`, and appended to the file
    immediately.  When the loop finishes the file is rewritten with one
    line per PDB ID.

    Raises RuntimeError if a parsed entry is not an X-ray structure.
    """

    ## radii calculated by previous runs
    d_radii = {}
    with open('radius_of_gyration.txt', 'r') as fd:
        for line in fd:
            l = line.strip().split()
            ## skip blank lines
            if not l:
                continue
            d_radii[l[0]] = l[1]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## already in txt file from a previous run
            if pdb in d_radii:
                continue

            print(pdb)

            with open('%s/%s' % (path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks={
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id': '2',
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id': ',',
                },
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details': 'monomeric',
                },
                l_data_categories=[
                    '_atom_site',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                ],  ## parse selected data categories
            )

            ## some unknown temporary error... or break before reaching this part when parsing...
            if not '_pdbx_struct_assembly.oligomeric_details' in d_mmCIF.keys():
                continue

            ## NMR structure? not expected at this point, so halt
            ## (explicit exception instead of the former undefined name)
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                raise RuntimeError(
                    '%s: unexpected _exptl.method %s' % (
                        pdb,
                        d_mmCIF['_exptl.method'],
                    ))

            ## no polymers in structure?
            if not '_entity_poly.entity_id' in d_mmCIF.keys():
                continue

            ## polymer(s) is/are not polypeptide(s)
            l_types = d_mmCIF['_entity_poly.type']
            if l_types != len(l_types) * ['polypeptide(L)']:
                continue

            ## biounit not monomeric
            l_details = d_mmCIF['_pdbx_struct_assembly.oligomeric_details']
            if l_details != len(l_details) * ['monomeric']:
                continue

            ## one polymer in assymetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            print(pdb)

            ##
            ## collect coordinates and masses of heavy polymer atoms
            ##
            l_coords = []
            l_masses = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                if d_mmCIF['_atom_site.label_entity_id'][
                        i_atom_site] not in d_mmCIF['_entity_poly.entity_id']:
                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]

                ## only do heavy atoms
                if element == 'H':
                    continue
                if element not in d_mass.keys():
                    print('%s %s' % (pdb, element))
                    continue

                l_masses += [d_mass[element]]
                l_coords += [numpy.array([
                    float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site]),
                    float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site]),
                    float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site]),
                ])]

            ## nothing to average over; dividing by the zero total mass
            ## would otherwise abort the whole run
            if not l_masses:
                print('%s no heavy polymer atoms with known mass' % (pdb,))
                continue

            ##
            ## calculate center of mass and radius of gyration
            ##
            center_of_mass, radius_of_gyration = (
                _center_of_mass_and_radius_of_gyration(l_coords, l_masses))

            print('%s %s %s' % (pdb, center_of_mass, radius_of_gyration))

            line = '%s %s\n' % (
                pdb,
                radius_of_gyration,
            )

            ## append to txt file in case loop doesn't finish
            with open('radius_of_gyration.txt', 'a') as fd:
                fd.write(line)

            d_radii[pdb] = radius_of_gyration

    ##
    ## write calculated radii of gyration to file
    ##
    lines_out = [
        '%s %s\n' % (pdb_out, radius_out)
        for pdb_out, radius_out in sorted(d_radii.items())
    ]
    with open('radius_of_gyration.txt', 'w') as fd:
        fd.writelines(lines_out)

    return
def main():

    l_fn_out = [
        '_exptl_crystal_grow',
        '_exptl_crystal_grow_comp',
    ]

    d = {}
    for fn_out in l_fn_out:
        fd = open('db%s.txt' % (fn_out), 'r')
        lines = fd.readlines()
        fd.close()
        d[fn_out] = {}
        for line in lines:
            if line == '\n':
                continue
            pdb = line[:4]
            s = line[5:]
            d[fn_out][pdb] = s

    fd = open('remediation_exptl_crystal_grow.pH.txt', 'r')
    lines = fd.readlines()
    fd.close()
    l_pdbs = [line[:4] for line in lines]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = os.listdir(path)
    l_dns.sort()
    for i in range(len(l_dns)):
        dn = l_dns[i]

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## continue if already in txt file from previous attempt to run loop
            ##            if pdb in d['_exptl_crystal_grow_comp'].keys():
            ##                continue
            ##            if pdb in d['_exptl_crystal_grow'].keys():
            ##                continue

            ##            print pdb

            if not pdb in l_pdbs:
                continue

            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
                l_data_categories_break=[
                    ##                    '_atom',
                    '_diffrn',
                ],
                l_data_categories=[
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_pdbx_database_status',
                    '_exptl',
                    '_exptl_crystal_grow',
                    '_exptl_crystal_grow_comp',
                ],
            )

            ##            ## no polymers in structure?
            ##            if not '_entity_poly.entity_id' in d_mmCIF.keys():
            ##                continue

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

##            print pdb

##
##
##
            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            process_site = ''.join(
                d_mmCIF['_pdbx_database_status.process_site'])
            if (not '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys()
                    and not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys()
                    ##                ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip() == '?'
                ):
                if process_site != '?':
                    print pdb, year, process_site
                continue

            ##
            if '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys():

                s_grow = ' '.join(
                    d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip()

                if (
                        ## pH not given
                        d_mmCIF['_exptl_crystal_grow.pH'] in [
                            ['?'],
                            [''],
                            ['.'],
                        ] and d_mmCIF['_exptl_crystal_grow.pdbx_pH_range'] in [
                            ['?'],
                            [''],
                            ['.'],
                        ] and
                        ## but pH in growth details
                    (' PH ' in s_grow.upper() or 'PH=' in s_grow.upper()
                     or ',PH ' in s_grow.upper())):
                    fd = open('remediation_exptl_crystal_grow.pH.txt', 'a')
                    fd.write('%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                        pdb,
                        ''.join(d_mmCIF['_exptl_crystal_grow.pH']),
                        ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_pH_range']),
                        year,
                        process_site,
                        s_grow,
                    ))
                    fd.close()

                if (not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys() or
                        ''.join(d_mmCIF['_exptl_crystal_grow_comp.name']) in [
                            '.',
                            '',
                            '?',
                        ]):

                    if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                        name = ''.join(
                            d_mmCIF['_exptl_crystal_grow_comp.name'])
                    else:
                        name = 'N/A'

##                    ## remove end punctuation
##                    s = s_grow[:-1]+s_grow[-1].replace('.','')

## split
##                    l_grow_punctuation = s_grow.upper().split('. ')
##                    l_grow = l_grow_comma = [s_grow.upper().split(',') for s in l_grow_punctuation]
                    l_grow = s_grow.upper().split(',')

                    ## strip space
                    l_grow = [x.strip() for x in l_grow]
                    ## remove empty
                    if '' in l_grow:
                        l_grow.remove('')
                    ## remove end punctuation
                    l_grow = [x[:-1] + x[-1].replace('.', '') for x in l_grow]

                    ## remove selected words from elements of list
                    for x in [
                            'CRYSTALS OBTAINED BY CO-CRYSTALLIZATION AT ',
                            'PROTEIN SOLUTION (',
                    ]:
                        for i_grow in range(len(l_grow)):
                            l_grow[i_grow] = l_grow[i_grow].replace(x, '')

                    ## replace abbreviations
                    for i_grow in range(len(l_grow)):
                        l_grow[i_grow] = l_grow[i_grow].replace(
                            'MILLIMOLAR', 'MM')

                    ## remove selected words from list
                    l_remove = []
                    for x in [
                            'VAPOR DIFFUSION',
                            'VAPOUR DIFFUSION',
                            'HANGING DROP',
                            'SITTING DROP',
                    ]:
                        if x in l_grow:
                            l_remove += [x]

                    ## removed other selected words from list
                    for i_grow in range(len(list(l_grow))):

                        ## remove physical conditions
                        bool_continue = False
                        for x in [
                                'TEMPERATURE',
                                'PH=',
                                'PH ',
                                'AT PH ',
                        ]:
                            if l_grow[i_grow][:len(x)] == x:
                                l_remove += [l_grow[i_grow]]
                                bool_continue = True
                                break
                        if bool_continue == True:
                            continue

                        ## remove long words (sentences)
                        if len(l_grow[i_grow]) > 50:
                            l_remove += [l_grow[i_grow]]
                            break
                    for remove in l_remove:
                        l_grow.remove(remove)
                    if len(l_grow) > 0:
                        ## write to file
                        line = '%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                            pdb,
                            name,
                            l_grow,
                            year,
                            process_site,
                            s_grow,
                        )
                        fd = open(
                            'remediation_exptl_crystal_grow_comp.name.txt',
                            'a')
                        fd.write(line)
                        fd.close()

            else:
                s_grow = ''

            ##
            if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                l_grow_comp = d_mmCIF['_exptl_crystal_grow_comp.name']
            else:
                l_grow_comp = []


##            lines_out += [line]

## append to txt file in case loop doesn't finish
            d_lines = {}
            line = '%s %s\n' % (
                pdb,
                s_grow,
            )
            d_lines['_exptl_crystal_grow'] = line
            line = '%s %s\n' % (
                pdb,
                l_grow_comp,
            )
            d_lines['_exptl_crystal_grow_comp'] = line
            for fn_out in l_fn_out:
                fd = open('db%s.txt' % (fn_out), 'a')
                fd.write(d_lines[fn_out])
                fd.close()

            ## append to dic for when loop finishes
            d['_exptl_crystal_grow'][pdb] = s_grow
            d['_exptl_crystal_grow_comp'][pdb] = l_grow_comp

    lines_out = []
    for pdb, s in d.items():
        line = '%s %s\n' % (
            pdb,
            s,
        )
        lines_out += [line]
    fd = open(fn_out, 'w')
    fd.writelines(lines_out)
    fd.close()

    return
예제 #38
0
def exclude(l_chainIDs):

    ##
    ## exclude obsolete structures and theoretical structures
    ##
    print 'obsolete/theoretical'
    print len(l_chainIDs)
    l_exclude = []
    for chainID in l_chainIDs:
        if not os.path.isfile('/data/mmCIF/%s/%s.cif' % (
                chainID[1:3],
                chainID[0:4],
        )):
            l_exclude += [chainID]
    for chainID in l_exclude:
        l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    ##
    ## exclude multidomain structures
    ##
    print 'multidomain'
    print len(l_chainIDs)
    fd = open('../CathDomall', 'r')
    lines = fd.readlines()
    fd.close()
    l_single_domain_chains = []
    for line in lines:
        chainID = line[:5]
        if not chainID in l_chainIDs:
            continue
        n_domains = int(line[7:9])
        if n_domains == 1:
            l_single_domain_chains += [chainID]
    l_chainIDs = list(set(l_chainIDs) & set(l_single_domain_chains))
    print len(l_chainIDs)
    print

    ##
    ## exclude multichain biological units
    ## exclude non-x-ray structures
    ##
    print 'multichain'
    print len(l_chainIDs)
    l_exclude = []
    l_pdbs_parsed = []
    d_resolutions = {}
    for i_chainID in range(len(l_chainIDs)):
        chainID = l_chainIDs[i_chainID]
        print i_chainID, len(l_chainIDs), chainID
        pdbID = chainID[:4]
        if pdbID in l_pdbs_parsed:
            continue
        d_mmCIF = parse_mmCIF.main(pdbID)

        l_pdbs_parsed += [pdbID]

        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            l_exclude += [pdbID]
            continue

        try:
            l_oligomeric_counts = d_mmCIF[
                '_pdbx_struct_assembly.oligomeric_count']
        except:
            print chainID
            continue
        if l_oligomeric_counts != len(l_oligomeric_counts) * ['1']:
            l_exclude += [pdbID]

        try:
            d_resolutions[pdbID] = float(''.join(
                d_mmCIF['_refine_hist.d_res_high']))
        except:
            print chainID
            stop

    for chainID in list(l_chainIDs):
        if chainID[:4] in l_exclude:
            l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    ##
    ## exclude redundancies
    ##
    print 'redunant'
    print len(l_chainIDs)
    fd = open('../bc-50.out', 'r')
    lines = fd.readlines()
    fd.close()
    d = {}
    for i_line in range(len(lines)):
        line = lines[i_line]
        l_cluster = line.split()
        for i in range(len(l_cluster)):
            l_cluster[i] = l_cluster[i][:4].lower() + l_cluster[i][-1]
        l = list(set(l_cluster) & set(l_chainIDs))
        if len(l) > 1:
            max_resolution = [
                '',
                None,
            ]
            l.sort()
            for chainID in l:
                pdbID = chainID[:4]
                resolution = d_resolutions[pdbID]
                if resolution < max_resolution[0]:
                    max_resolution = [
                        resolution,
                        chainID,
                    ]
            for chainID in l:
                if chainID != max_resolution[1]:
                    l_chainIDs.remove(chainID)
    print len(l_chainIDs)
    print

    return l_chainIDs
예제 #39
0
def parse_GoodVibes_exclude_flexible(pdb,path,):

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4],)
    d_coords, l_coords_alpha = mmCIF2coords.main(pdb[:4],d_mmCIF,query_chain=pdb[-1])
    print len(l_coords_alpha)
    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha,cutoff,)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    l_amplitudes = [
        math.sqrt(
            eigenvectors[6][i]**2+eigenvectors[6][i+1]**2+eigenvectors[6][i+2]**2
            )
        for i in range(0,len(eigenvectors[6]),3)
        ]

##    ## write pdb (color by bfactor)
##    l_bfactors = [100*(l_amplitudes[i]-min(l_amplitudes))/(max(l_amplitudes)-min(l_amplitudes)) for i in range(len(l_amplitudes))]
##    fd = open('output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],),'r')
##    lines = fd.readlines()
##    fd.close()
##    index = [-1,None,]
##    lines_out = []
##    for line in lines:
##        record = line[:6].strip()
##        if record != 'ATOM':
##            lines_out += [line]
##        else:
##            res_no = int(line[22:26])
##            if res_no != index[1]:
##                index = [index[0]+1,res_no,]
##                bfactor = l_bfactors[index[0]]
##            line_out = '%s%6.2f%s' %(line[:60],bfactor,line[66:],)
##            lines_out += [line_out]
##    fd = open('output/%s/%s_%s_probe_color_by_amplitude.pdb' %(path,pdb[:4],pdb[-1],),'w')
##    fd.writelines(lines_out)
##    fd.close()

    ## average amplitude
    average = sum(l_amplitudes)/len(l_amplitudes)
    average,stddev = statistics.do_stddev(l_amplitudes)
    ##
    l_coords_rigid = []
    for i in range(len(l_coords_alpha)):
        if l_amplitudes[i] < average:
            l_coords_rigid += [l_coords_alpha[i]]
    l_coords_flexible = []
    for i in range(len(l_coords_alpha)):
        if l_amplitudes[i] > average+0.5*stddev:
            l_coords_flexible += [l_coords_alpha[i]]

    ## parse output
    fd = open('output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],),'r')
    lines = fd.readlines()
    fd.close()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in ['ATOM','HETATM',]:
            continue
        res_name = line[17:20]
        if res_name != 'EXT':
            continue

        bfactor = float(line[60:66])

        if bfactor > max_bfactor:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])

##            coord_tmp = numpy.array([x,y,z,])

##            bool_vicinal_to_rigid = False
##            for coord_rigid in l_coords_rigid:
##                dist_from_rigid = math.sqrt(sum((coord_rigid-coord_tmp)**2))
##                if dist_from_rigid < 6:
##                    bool_vicinal_to_rigid = True
##                    break
##            if bool_vicinal_to_rigid == False:
##                continue

##            bool_vicinal_to_flexible = False
##            for coord_flexible in l_coords_flexible:
##                dist_from_flexible = math.sqrt(sum((coord_flexible-coord_tmp)**2))
##                if dist_from_flexible < 6:
##                    bool_vicinal_to_flexible = True
##                    break
##            if bool_vicinal_to_flexible == True:
##                continue

##            min_dist = [1000.,None,]
##            for i_coord_alpha in range(len(l_coords_alpha)):
##                coord_alpha = l_coords_alpha[i_coord_alpha]
##                dist_from_alpha = math.sqrt(sum((coord_alpha-coord_tmp)**2))
##                if dist_from_alpha < min_dist[0]:
##                    min_dist = [dist_from_alpha,i_coord_alpha,]
##            if l_amplitudes[min_dist[1]] > average+stddev:
##                continue

            coord = numpy.array([x,y,z,])
            max_bfactor = bfactor

    return coord
예제 #40
0
def one_polysaccharide(pdb, ):
    """Return True if the entry contains exactly one polymeric sugar molecule.

    All three conditions must be met:
    1) at least one _chem_comp.type is a saccharide type
    2) the numbers of molecules of the polymer entities whose description
       starts with "SUGAR (" add up to one
    3) no non-polymer entity has a description starting with "SUGAR"
       (excludes e.g. 1a14, which contains polymers and monomers)

    A leftover debugging trap, which printed the intermediate results for
    1dl2 and then aborted with a NameError, has been removed; 1dl2 is now
    evaluated like any other entry.
    """

    l_data_categories = [
        '_entity',
        '_chem_comp',
        '_entity_poly',
    ]
    d = parse_mmCIF.main(
        pdb,
        l_data_categories=l_data_categories,
    )

    ##
    ## any saccharide among the chemical components?
    ##
    l_saccharide_types = [
        'd-saccharide 1,4 and 1,4 linking',  # 3amm
        'l-saccharide',
        'd-saccharide',
        'saccharide',
    ]
    bool_polysaccharide = False
    if '_chem_comp.type' in d.keys():
        for chem_comp_type in d['_chem_comp.type']:
            if chem_comp_type.lower() in l_saccharide_types:
                bool_polysaccharide = True
                break

    ##
    ## count polymeric sugar molecules and look for monomeric sugars
    ##
    count_polymer_sugar = 0
    bool_monosaccharide = False  ## included to exclude 1a14 which contains polymers and monomers
    for i, entity_type in enumerate(d['_entity.type']):
        if entity_type == 'polymer':
            if d['_entity.pdbx_description'][i][:7] == 'SUGAR (':
                count_polymer_sugar += int(
                    d['_entity.pdbx_number_of_molecules'][i])
        elif entity_type == 'non-polymer':
            if d['_entity.pdbx_description'][i][:5] == 'SUGAR':
                bool_monosaccharide = True

    bool_append = (
        bool_monosaccharide == False and bool_polysaccharide == True
        and count_polymer_sugar == 1)

    return bool_append
예제 #41
0
def main():

    l_pdbs = []
    fd = open('Biso_v_resolution.gnuplotdata', 'r')
    lines = fd.readlines()
    fd.close()
    for line in lines:
        l = line.split()
        resolution = float(l[1])
        Biso = float(l[0])
        if resolution > 3.5 and Biso < 10:
            print line
        if resolution > 2.5 and Biso < 10:
            print line
        if resolution > 2.0 and Biso < 5:
            print line
##        if resolution > 1.5 and Biso < 5:
##            print line
        pdb = l[2]
        l_pdbs += [pdb]

    Biso_average_prev = 0

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' % (path, dn)):
            continue
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]

            if pdb in l_pdbs:
                continue

            if pdb in [
                    '3bfn',  ## PISA left out chains from biological unit
                    '2jjg',
                    '1qjb',  ## _pdbx_struct_assembly missing
            ]:
                continue

            ##
            ## parse header
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories=[
                    '_pdbx_struct_assembly',
                    '_entity_poly',
                    '_citation',
                    '_pdbx_database_related',
                ],  ## parse selected data categories
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                })
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if '_pdbx_database_related.content_type' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related.content_type']:
                    continue

            try:
                if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != [
                        'monomeric'
                ]:
                    continue
            except:
                print pdb
                stop

            if not '_citation.id' in d_mmCIF.keys():
                continue

            ##
            ## parse coordinate section
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                l_data_categories=[
                    '_database_PDB_rev',
                    '_refine',
                    '_refine_hist',
                    '_atom_site',
                    '_software',
                    '_entity',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                    '_pdbx_database_status',
                ],  ## parse selected data categories
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                })

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != [
                    'monomeric'
            ]:
                continue

            resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

            if (int(d_mmCIF['_entity.pdbx_number_of_molecules'][0]) != 1
                    or len(d_mmCIF['_entity_poly.pdbx_strand_id']) > 1
                    or len(''.join(
                        d_mmCIF['_entity_poly.pdbx_strand_id']).split(',')) > 1
                    or len(d_mmCIF['_entity_poly.entity_id']) > 1):
                print pdb
                print d_mmCIF['_entity.pdbx_number_of_molecules']
                print d_mmCIF['_entity_poly.pdbx_strand_id']
                stop

            entity_poly_id = int(''.join(d_mmCIF['_entity_poly.entity_id']))
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                entity_poly_id = d_mmCIF['_entity_poly.entity_id'][
                    i_entity_poly]
                entity_poly_type = d_mmCIF['_entity_poly.entity_id'][
                    i_entity_poly]

            l_Biso = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if occupancy != 1:
                    continue
                alt_id = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if alt_id != '.':
                    continue
                entity_id = d_mmCIF['_atom_site.label_entity_id'][i_atom_site]
                if entity_id != entity_poly_id:
                    continue
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]
                if not comp_id in [
                        'MSE',
                        'ALA',
                        'CYS',
                        'ASP',
                        'GLU',
                        'PHE',
                        'GLY',
                        'HIS',
                        'ILE',
                        'LYS',
                        'LEU',
                        'MET',
                        'ASN',
                        'PRO',
                        'GLN',
                        'ARG',
                        'SER',
                        'THR',
                        'VAL',
                        'TRP',
                        'TYR',
                ]:
                    continue
                type_symbol = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                if type_symbol == 'H':
                    continue
                atom_id = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]
                if not atom_id in [
                        'N',
                        'CA',
                        'C',
                ]:
                    continue

                Biso = float(d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])
                l_Biso += [Biso]

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])

            if len(l_Biso) == 0:
                continue

##            if l_Biso == len(l_Biso)*[l_Biso[0]]:
##                print pdb, year, l_Biso[0:3]
##                if year >= 2010:
##                    stop
##                continue

            Biso_average = sum(l_Biso) / len(l_Biso)

            bool_continue = False
            for Biso in set(l_Biso):
                count = l_Biso.count(Biso)
                if count > 20:
                    if '_software.name' in d_mmCIF.keys():
                        print pdb, Biso_average, Biso, count, d_mmCIF[
                            '_software.name']
                        s = '%s %6.2f %4i %6.2f %4i %s %s\n' % (
                            pdb,
                            Biso,
                            count,
                            Biso_average,
                            year,
                            site,
                            str(d_mmCIF['_software.name']),
                        )
                    else:
                        print pdb, Biso_average, Biso, count
                        s = '%s %6.2f %4i %6.2f %4i %s\n' % (
                            pdb,
                            Biso,
                            count,
                            Biso_average,
                            year,
                            site,
                        )
                    bool_continue = True
                    fd = open('remediation_Biso_duplicates.txt', 'a')
                    fd.write(s)
                    fd.close()
                    break
            if bool_continue == True:
                continue


##            if Biso_average in [2,3,4,5,6,7,8,9,99,90,50,20,25,1,100,10,0]:
            if Biso_average in range(0, 100 + 1):
                print l_Biso
                print Biso_average
                print pdb
                print year
                stop

            if '_refine.pdbx_TLS_residual_ADP_flag' in d_mmCIF.keys():
                if ''.join(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in [
                        'UNVERIFIED',
                        'LIKELY RESIDUAL',
                ]:
                    continue
                elif ''.join(
                        d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']) in [
                            '?',
                        ]:
                    pass
                else:
                    print d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag']
                    print pdb, Biso_average
                    stop

            if round(Biso_average, 4) == round(Biso_average_prev, 4):
                print pdb, Biso_average, Biso_average_prev
                stop

            print pdb, round(Biso_average, 2), resolution
            fd = open('Biso_v_resolution.gnuplotdata', 'a')
            fd.write('%s %s %s %s\n' % (
                Biso_average,
                resolution,
                pdb,
                year,
            ))
            fd.close()

    plot()
예제 #42
0
def unobs_nonterminal_residues():

    ##
    ## unobs or zero occup not at terminals!!! (combination...)
    ## eg dont exlude 200l w 163,164 missing
    ## dont exclude 201l w 163,164 missing, but internally in _pdbx_poly_seq_scheme because 2 chains
    ##
    category = fn = '_pdbx_unobs_or_zero_occ_residues'
    fd = open('%s/list%s.txt' % (path, fn))
    s = fd.read()
    fd.close()
    l_pdbs_in = s.split()
    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_residues',
        '_entity_poly',
    ]

    fn_out = 'list_pdbx_unobs_residues__NONTERMINAL'

    loop_residues(
        category,
        fn_out,
    )

    l_pdbs_out = []
    for pdb in l_pdbs_in:

        ##        if pdb[1:3] < 'oa':
        ##            continue
        ##        if pdb != '2hub':
        ##            continue

        ## no residues are present! (e.g. 1oax, 1oay)
        if pdb in [
                '1oax',
                '1oay',
        ]:
            continue

        d = parse_mmCIF.main(
            pdb,
            l_data_categories=l_data_categories,
        )

        ##        print pdb

        if not category in d.keys():
            continue

        bool_append = False
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for chains in d['_entity_poly.pdbx_strand_id']:
            for chain in chains.split(','):
                index1 = s.index(chain)
                index2 = s.rindex(chain)
                ##                print chain
                l_auth_seq_num = d['_pdbx_poly_seq_scheme.auth_seq_num'][
                    index1:index2 + 1]
                while l_auth_seq_num[0] == '?':
                    l_auth_seq_num = l_auth_seq_num[1:]
                while l_auth_seq_num[-1] == '?':
                    l_auth_seq_num = l_auth_seq_num[:-1]
                ## non-terminal residues missing?
                if '?' in l_auth_seq_num:
                    print '****', pdb
                    bool_append = True
                    break
            if bool_append == True:
                break
        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            ## continue

    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
def main():
    """Build or refresh the table of Matthews coefficients.

    Previously determined values are loaded from
    ``db_MatthewsCoefficient.txt`` (if it exists). Every uncompressed file
    below the mmCIF directory tree whose PDB ID is not yet in the table is
    parsed, and an entry is added when all filters below are passed. The
    value is taken from ``_exptl_crystal.density_Matthews`` or, when that
    item is ``?``, from ``calc_matthews_coefficient.main``.

    The last command line argument (``sys.argv[-1]``) is used as the first
    subdirectory name to process; subdirectories sorting before it are
    skipped.

    Each new value is appended to the table immediately, and the whole
    table is rewritten (sorted by PDB ID) at the end.

    Raises RuntimeError if a parsed entry unexpectedly is not an X-ray
    structure.
    """

    fn_out = 'db_MatthewsCoefficient.txt'
    path = '/media/WDMyBook1TB/2TB/mmCIF'

    ## Matthews Coefficient not calculated...
    set_not_calculated = set([
        '1vh7',
        '1vho',
        '1vhu',
        '1vi3',
        '1vi4',
        '1vis',
    ])

    ## Matthews Coefficient *wrong*
    set_wrong = set([
        '2p51',
        ## too high
        '1c5v',
        '1q9i',
        '1ut6',
        '1x6x',
        '1x6y',
        '1xdn',
        '1y63',
        '1zix',
        ## too low
        '1t95',
        '1jih',
        '1d5t',
        '1c7k',
        '1dbo',
        '1d9x',
        '1qt9',
        '1ia5',
        '1dcq',
    ])

    ##
    ## read previously determined values (a missing table means a first run)
    ##
    d = {}
    if os.path.isfile(fn_out):
        with open(fn_out, 'r') as fd:
            for line in fd:
                l = line.split()
                ## skip blank or truncated lines (e.g. interrupted append)
                if len(l) < 2:
                    continue
                pdb = l[0]
                v = l[1]
                ## hard-coded replacement value for this entry
                if pdb == '2p51':
                    v = '1.72610466393'
                d[pdb] = v

    dn_first = sys.argv[-1]

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < dn_first:
            continue

        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## already in table (dictionary lookup instead of list scan)
            if pdb in d:
                continue

            if pdb in set_not_calculated or pdb in set_wrong:
                continue

            with open('%s/%s' % (path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks={
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id': '2',
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id': ',',
                },
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details': 'monomeric',
                },
                ## parse selected data categories
                l_data_categories=['_exptl_crystal'],
                l_data_categories_break=['_exptl_crystal'],
            )

            ## some unknown temporary error...
            ## or break before reaching this part when parsing...
            if '_pdbx_struct_assembly.oligomeric_details' not in d_mmCIF:
                continue

            ## NMR structure? not expected at this point; halt explicitly
            ## (this used to be a halt via an undefined name)
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                raise RuntimeError(
                    '%s: expected X-RAY DIFFRACTION but found %s' % (
                        pdb,
                        d_mmCIF['_exptl.method'],
                    ))

            ## no polymers in structure?
            if '_entity_poly.entity_id' not in d_mmCIF:
                continue

            ## polymer(s) is/are not polypeptide(s)
            if any(
                    s != 'polypeptide(L)'
                    for s in d_mmCIF['_entity_poly.type']):
                continue

            ## biounit not monomeric
            if any(
                    s != 'monomeric' for s in
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_details']):
                continue

            ## more than one polymer in asymmetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            if d_mmCIF['_exptl_crystal.density_Matthews'] == ['?']:
                ## value not given in the file; calculate it
                v = calc_matthews_coefficient.main(pdb)
            else:
                v = float(''.join(d_mmCIF['_exptl_crystal.density_Matthews']))

            ## append immediately, so progress survives an interruption
            with open(fn_out, 'a') as fd:
                fd.write('%s %s\n' % (pdb, v))

            d[pdb] = v

    ##
    ## write Matthews coefficients to file (sorted by PDB ID)
    ##
    lines_out = ['%s %s\n' % (pdb, v) for pdb, v in sorted(d.items())]
    with open(fn_out, 'w') as fd:
        fd.writelines(lines_out)

    return
예제 #44
0
def main():
    """Build or refresh the resolution table and write resolution values.

    1) Load ``db_resolution.txt`` if it exists (PDB ID and value per line).
    2) Parse every uncompressed file below the mmCIF directory tree whose
       PDB ID is not yet in the table; for X-ray structures the value of
       ``_refine.ls_d_res_high`` (a list, written in its ``str()`` form,
       e.g. ``['2.00']``) is appended to the table. The last command line
       argument (``sys.argv[-1]``) is the first subdirectory to process.
    3) Rewrite the table (sorted by PDB ID).
    4) Re-read the table and write one rounded resolution per line to
       ``histogram_resolution.txt``. The mmCIF placeholders ``.`` and
       ``?`` are skipped.

    NOTE(review): after step 4 the function halts on purpose through the
    undefined name ``stop`` (NameError). The code after it, which would
    overwrite the same file with "resolution count" lines, is therefore
    never executed. It is kept as found because the intended final output
    cannot be determined from here.
    """

    fn_db = 'db_resolution.txt'
    fn_histogram = 'histogram_resolution.txt'
    path = '/media/WDMyBook1TB/2TB/mmCIF'

    ##
    ## read previously parsed values
    ##
    d = {}
    if os.path.isfile(fn_db):
        with open(fn_db, 'r') as fd:
            for line in fd:
                l = line.strip().split()
                ## skip blank or truncated lines (e.g. interrupted append)
                if len(l) < 2:
                    continue
                d[l[0]] = l[1]

    dn_first = sys.argv[-1]

    l_dns = os.listdir(path)
    l_dns.sort()

    for i, dn in enumerate(l_dns):

        if dn < dn_first:
            continue

        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        l_fns = os.listdir(path_dn)
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## already in table (dictionary lookup instead of list scan)
            if pdb in d:
                continue

            print(pdb)

            with open('%s/%s' % (path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                ## parse selected data categories
                l_data_categories=[
                    '_refine',
                    '_refine_hist',
                ],
                l_data_categories_break=[
                    '_refine',
                ],
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
            )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## list of strings; written to the table in its str() form
            resolution = d_mmCIF['_refine.ls_d_res_high']

            ## append immediately, so progress survives an interruption
            with open(fn_db, 'a') as fd:
                fd.write('%s %s\n' % (pdb, resolution,))

            d[pdb] = resolution

    ##
    ## write to file (sorted by PDB ID)
    ##
    lines_out = [
        '%s %s\n' % (pdb, resolution,)
        for pdb, resolution in sorted(d.items())
    ]
    with open(fn_db, 'w') as fd:
        fd.writelines(lines_out)

    ##
    ## rounded resolution of each structure
    ##
    with open(fn_db, 'r') as fd:
        lines = fd.readlines()

    d_counts = {}
    lines_out = []
    for line in lines:
        ## strip the leading [' and trailing '] of the str() form of the list
        resolution = line.strip().split()[1][2:-2]
        ## mmCIF placeholders: "." (not applicable) and "?" (unknown)
        if resolution in ('.', '?'):
            continue
        resolution = round(float(resolution), 2)
        d_counts[resolution] = d_counts.get(resolution, 0) + 1
        lines_out += ['%s\n' % (resolution,)]
    with open(fn_histogram, 'w') as fd:
        fd.writelines(lines_out)

    ## NOTE(review): deliberate halt (undefined name raises NameError);
    ## everything below is currently unreachable - see docstring
    stop

    lines_out = []
    for resolution in sorted(d_counts):
        count = d_counts[resolution]
        lines_out += ['%s %s\n' % (resolution, count,)]
    with open(fn_histogram, 'w') as fd:
        fd.writelines(lines_out)

    return
예제 #45
0
def exclude(l_chainIDs):
    """Return the chain IDs that pass four successive filters.

    A chain ID is a string whose first four characters are the PDB ID and
    whose fifth character is the chain (it is matched against the first
    five characters of each ``../CathDomall`` line).

    Filters, in order:
    1) no file ``/data/mmCIF/<chars 2-3 of ID>/<PDB ID>.cif``
       (obsolete/theoretical structures)
    2) not listed in ``../CathDomall`` with a domain count of 1
       (columns 8-9 of the line)
    3) not X-ray diffraction, or any ``oligomeric_count`` different from 1
    4) redundancy: of the remaining chains sharing a line (cluster) in
       ``../bc-50.out`` only the one with the numerically lowest
       ``_refine_hist.d_res_high`` is kept (first in sorted order on ties)

    The list passed in is not modified; a new, sorted list is returned.

    A structure without ``_pdbx_struct_assembly.oligomeric_count`` is
    reported and kept; it has no stored resolution and therefore ranks
    last within its cluster. If the resolution of a structure cannot be
    read, the chain ID is printed and the underlying exception is
    re-raised.
    """

    ## work on a copy; do not partially modify the caller's list
    l_chainIDs = list(l_chainIDs)

    ##
    ## exclude obsolete structures and theoretical structures
    ##
    print('obsolete/theoretical')
    print(len(l_chainIDs))
    l_chainIDs = [
        chainID for chainID in l_chainIDs
        if os.path.isfile(
            '/data/mmCIF/%s/%s.cif' % (chainID[1:3], chainID[0:4],))
    ]
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude multidomain structures
    ##
    print('multidomain')
    print(len(l_chainIDs))
    with open('../CathDomall', 'r') as fd:
        lines = fd.readlines()
    set_chainIDs = set(l_chainIDs)
    set_single_domain_chains = set()
    for line in lines:
        chainID = line[:5]
        if chainID not in set_chainIDs:
            continue
        n_domains = int(line[7:9])
        if n_domains == 1:
            set_single_domain_chains.add(chainID)
    ## sorted to make the processing order and the result deterministic
    l_chainIDs = sorted(set_single_domain_chains)
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude multichain biological units
    ## exclude non-x-ray structures
    ##
    print('multichain')
    print(len(l_chainIDs))
    set_exclude = set()
    set_pdbs_parsed = set()
    d_resolutions = {}
    n_chainIDs = len(l_chainIDs)
    for i_chainID, chainID in enumerate(l_chainIDs):
        print('%s %s %s' % (i_chainID, n_chainIDs, chainID))
        pdbID = chainID[:4]
        ## parse each structure once, even if several chains are listed
        if pdbID in set_pdbs_parsed:
            continue
        d_mmCIF = parse_mmCIF.main(pdbID)
        set_pdbs_parsed.add(pdbID)

        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            set_exclude.add(pdbID)
            continue

        ## no assembly information: report, keep the structure, move on
        if '_pdbx_struct_assembly.oligomeric_count' not in d_mmCIF:
            print(chainID)
            continue
        l_oligomeric_counts = d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
        if any(count != '1' for count in l_oligomeric_counts):
            set_exclude.add(pdbID)

        try:
            d_resolutions[pdbID] = float(
                ''.join(d_mmCIF['_refine_hist.d_res_high']))
        except Exception:
            ## report the offending chain and halt with the real error
            print(chainID)
            raise

    l_chainIDs = [
        chainID for chainID in l_chainIDs
        if chainID[:4] not in set_exclude
    ]
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude redundancies
    ##
    print('redunant')
    print(len(l_chainIDs))
    with open('../bc-50.out', 'r') as fd:
        lines = fd.readlines()
    set_remaining = set(l_chainIDs)
    for line in lines:
        ## first four characters lower-cased + last character,
        ## e.g. a token "1ABC_A" becomes "1abcA"
        l_cluster = [s[:4].lower() + s[-1] for s in line.split()]
        l = sorted(set(l_cluster) & set_remaining)
        if len(l) > 1:
            ## lowest value wins; unknown resolution ranks last
            chainID_best = min(
                l,
                key=lambda s: d_resolutions.get(s[:4], float('inf')),
            )
            for chainID in l:
                if chainID != chainID_best:
                    set_remaining.discard(chainID)
    l_chainIDs = [
        chainID for chainID in l_chainIDs if chainID in set_remaining
    ]
    print(len(l_chainIDs))
    print('')

    return l_chainIDs
예제 #46
0
    print dn
    l_fn = os.listdir('%s/%s' %(path,dn,))
    l_fn.sort()
    for fn in l_fn:
        pdb = fn[:4]
        if fn[-3:] == '.gz':
            continue
########        if pdb in ['2fl9','3gau','3gav','3gaw',]: ## tmp!!!
########            continue
##        print pdb
        fd = open('%s/%s/%s' %(path,dn,fn), 'r')
        lines = fd.readlines()
        fd.close()
        d = parse_mmCIF.main(
            pdb,lines,
            l_data_categories = l_data_categories,
            d_breaks = d_breaks,
            )

        if d_exclude_subset:
            bool_continue = False
            for item_exclude,l_values_exclude in d_exclude_subset.items():
                if not item_exclude in d.keys():
                    bool_continue = True
                    fd = open('%s/remediation_%s.txt' %(path,item_exclude,),'a')
                    fd.write('%s\n' %(pdb))
                    fd.close()
                    continue
                if len( set(d[item_exclude]) & set(l_values_exclude) ) > 0:
                    bool_continue = True
                    break
예제 #47
0
def parse_cifs(
    l_pdbs,
    ref_seq,
    l_db_codes,
    n_mutations_max,
    resolution_min,
    bool_multiple_entities=False,
):
    """Parse mmCIF files and sort chains into wild types and mutants.

    Arguments:
    l_pdbs -- chain identifiers; the first four characters are the PDB ID
        and the last character is the chain ID (e.g. '1KS3_A')
    ref_seq -- reference sequence (sequence of three-letter residue
        names) or a false value to skip the chain length check
    l_db_codes -- accepted values of ``_struct_ref.db_code``
    n_mutations_max -- maximum number of differences from ref_seq; if
        None, no comparison with ref_seq is made and every accepted chain
        is reported as wild type
    resolution_min -- if true, structures whose
        ``_refine.ls_d_res_high`` is greater than this value are skipped
    bool_multiple_entities -- if False, structures with more than one
        polymer entity are skipped

    Returns the tuple (d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree):
    d_mmCIF_main -- parsed mmCIF dictionary per accepted PDB ID
    l_wts -- accepted chains without mutations
    d_mutants -- per mutant chain a dictionary with the keys 'mutations'
        and 'startmodel'
    l_wts_cysfree -- mutant chains whose mutations are exactly
        CYS54THR and CYS97ALA

    Raises RuntimeError when one of the sanity checks fails (these were
    halts via an undefined name before).
    """

    print('parse cifs')

    l_wts = []
    l_wts_cysfree = []
    d_mutants = {}
    d_mmCIF_main = {}

    ## chains accepted with a sequence length of 162
    ## (original author's note: last two residues unobserved)
    set_pdbs_162 = set([
        '1KS3_A',
        '1KW5_A',
        '1KW7_A',
        '1KY0_A',
        '1KY1_A',
        '1L0J_A',
        '1LOK_A',
        '1LPY_A',
        '1LW9_A',
        '1LWG_A',
        '1LWK_A',
    ])

    for pdb in l_pdbs:

        pdbID = pdb[:4]
        chain_id = pdb[-1]

        ## NOTE(review): the lookup is lower-cased, but the key is stored
        ## with its original case below, so this only matches lower-case
        ## input. Left unchanged, because matching upper-case IDs as well
        ## would skip further chains of an already parsed structure.
        if pdbID.lower() in d_mmCIF_main:
            continue

        d_mmCIF = parse_mmCIF.main(pdbID.lower(), )

        ## not an x-ray structure
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            print('%s %s' % (pdb, d_mmCIF['_exptl.method']))
            continue

        ## more than one type of polymer present
        l_entity_ids = d_mmCIF['_entity_poly.entity_id']
        n_entities = len(l_entity_ids)
        if bool_multiple_entities == False:
            if n_entities > 1:
                print('%s entities %s' % (pdb, n_entities))
                continue

        ## the two resolution items are expected to agree
        l_resolution = d_mmCIF['_refine.ls_d_res_high']
        if l_resolution != d_mmCIF['_refine_hist.d_res_high']:
            print(l_resolution)
            print(d_mmCIF['_refine_hist.d_res_high'])
            raise RuntimeError(
                '%s: _refine.ls_d_res_high differs from '
                '_refine_hist.d_res_high' % (pdb,))

        ## low resolution
        if resolution_min:
            if float(l_resolution[0]) > resolution_min:
                print('%s resolution %s' % (pdb, l_resolution))
                continue

        ## get entity ID from chain ID
        s_chain_ids = None
        for i_entity, entity_id in enumerate(l_entity_ids):
            s_chain_ids = d_mmCIF['_entity_poly.pdbx_strand_id'][i_entity]
            if chain_id in s_chain_ids:
                break
        else:
            ## chain not found in any polymer entity
            print(pdb)
            print(s_chain_ids)
            raise RuntimeError(
                '%s: chain %s not found in _entity_poly.pdbx_strand_id' % (
                    pdb,
                    chain_id,
                ))

        ## get sequence from entity ID
        seq = []
        l_seq_entity_ids = d_mmCIF['_entity_poly_seq.entity_id']
        for i in range(len(l_seq_entity_ids)):
            if l_seq_entity_ids[i] != entity_id:
                continue
            mon_id = d_mmCIF['_entity_poly_seq.mon_id'][i]
            ## hard-coded replacement of one residue of 1RCM
            if pdbID == '1RCM' and i == 126:
                if mon_id != 'CYS':
                    raise RuntimeError(
                        '%s: expected CYS at index 126 but found %s' % (
                            pdb,
                            mon_id,
                        ))
                mon_id = 'CCS'
            seq += [mon_id]

        ## wrong chain length
        if ref_seq and len(seq) != len(ref_seq):
            s_seq = ''.join(seq)
            s_ref_seq = ''.join(ref_seq)
            ## sequence longer than and containing the reference sequence
            if s_ref_seq in s_seq:
                print(ref_seq)
                print(seq)
                raise RuntimeError(
                    '%s: sequence contains the reference sequence but '
                    'has a different length' % (pdb,))
            bool_accept = (
                ## unobserved atoms not in seqres
                s_seq in s_ref_seq
                ## last two residues unobserved
                or (len(seq) == 162 and pdb in set_pdbs_162)
                or (len(seq) == 162 and seq[-1] == 'LYS')
            )
            if not bool_accept:
                print('%s seqlen %s' % (pdb, len(seq)))
                continue

        ## reference database code not among the accepted ones
        db_code = d_mmCIF['_struct_ref.db_code'][
            d_mmCIF['_struct_ref.entity_id'].index(entity_id)]
        if db_code not in l_db_codes:
            print('%s uniprot %s' % (pdb, db_code))
            continue

        ## reset for every structure; otherwise values of a previous
        ## structure (or no values at all) would be used further down
        l_mutations = []
        startmodel = None

        ## too many mutations?
        if n_mutations_max is not None:
            for i_seq in range(len(seq)):
                res_id_mmCIF = seq[i_seq]
                res_id_uniprot = ref_seq[i_seq]
                if res_id_mmCIF != res_id_uniprot:
                    l_mutations += [
                        '%3s%i%3s' % (
                            res_id_uniprot,
                            i_seq + 1,
                            res_id_mmCIF,
                        )
                    ]
            if len(l_mutations) > n_mutations_max:
                print('%s muts %s' % (pdb, len(l_mutations)))
                continue
            elif len(l_mutations) > 0:
                startmodel = parse_mmCIF_item(
                    d_mmCIF,
                    '_refine.pdbx_starting_model',
                    pdb,
                )

        ## append to lists and dictionaries
        d_mmCIF_main[pdbID] = d_mmCIF
        if len(l_mutations) > 0:
            if l_mutations == ['CYS54THR', 'CYS97ALA']:
                l_wts_cysfree += [pdb]
            d_mutants[pdb] = {
                'mutations': l_mutations,
                'startmodel': startmodel
            }
        else:
            l_wts += [pdb]

    return d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree
예제 #48
0
def parse_GoodVibes_exclude_flexible(
    pdb,
    path,
):
    """Return the coordinate of the 'EXT' record with the highest B-factor.

    Arguments:
    pdb -- chain identifier; first four characters are the PDB ID, the
        last character is the chain ID
    path -- subdirectory of ``output/`` containing
        ``<PDB ID>_<chain ID>_probe.pdb``

    Returns a numpy array [x, y, z] read from the ATOM/HETATM record with
    residue name 'EXT' that has the highest value in the B-factor columns
    (the first such record on ties), or None if there is no such record.

    NOTE(review): per-residue amplitudes are calculated from
    ``eigenvectors[6]`` and split into a rigid and a flexible set, but the
    filters that used these sets (probe position near a rigid residue,
    not near a flexible residue) are disabled. The calculation currently
    has no influence on the returned coordinate; it is kept so that the
    filters can be re-enabled.
    """

    pdbID = pdb[:4]
    chain_id = pdb[-1]

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdbID, )
    d_coords, l_coords_alpha = mmCIF2coords.main(
        pdbID,
        d_mmCIF,
        query_chain=chain_id,
    )
    print(len(l_coords_alpha))

    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(
        l_coords_alpha,
        cutoff,
    )
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    ## presumably index 6 is the first non-rigid-body mode - TODO confirm
    eigenvector = eigenvectors[6]
    ## length of each consecutive (x,y,z) triplet of the eigenvector
    l_amplitudes = [
        math.sqrt(
            eigenvector[i]**2 + eigenvector[i + 1]**2 +
            eigenvector[i + 2]**2)
        for i in range(0, len(eigenvector), 3)
    ]

    ## average amplitude and standard deviation
    average, stddev = statistics.do_stddev(l_amplitudes)

    ## currently unused (see NOTE in docstring)
    l_coords_rigid = [
        l_coords_alpha[i] for i in range(len(l_coords_alpha))
        if l_amplitudes[i] < average
    ]
    l_coords_flexible = [
        l_coords_alpha[i] for i in range(len(l_coords_alpha))
        if l_amplitudes[i] > average + 0.5 * stddev
    ]

    ##
    ## parse output
    ##
    fn = 'output/%s/%s_%s_probe.pdb' % (
        path,
        pdbID,
        chain_id,
    )
    with open(fn, 'r') as fd:
        lines = fd.readlines()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in ('ATOM', 'HETATM',):
            continue
        res_name = line[17:20]
        if res_name != 'EXT':
            continue

        bfactor = float(line[60:66])

        ## explicit check for the first record, instead of relying on the
        ## Python 2 ordering of numbers relative to None
        if max_bfactor is None or bfactor > max_bfactor:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])
            coord = numpy.array([
                x,
                y,
                z,
            ])
            max_bfactor = bfactor

    return coord
예제 #49
0
def identify_CH_bonds():
    """Collect the carbon atoms taking part in C-H single bonds.

    For every residue in the list below, the ligand mmCIF file is
    downloaded and parsed, and each bond with value order 'SING' whose
    first atom name starts with 'C' and whose second atom name starts
    with 'H' is printed. Returns a dictionary with a list of the first
    atom names of those bonds per residue name.
    """

    ## only ALA at present; the other 19 standard residues
    ## (CYS, ASP, GLU, PHE, GLY, HIS, ILE, LYS, LEU, MET,
    ## ASN, PRO, GLN, ARG, SER, THR, VAL, TRP, TYR) are disabled
    l_residues = [
        'ALA',
    ]

    d_atoms = {}
    for residue in l_residues:
        url = 'http://www.pdb.org/pdb/files/ligand/%s.cif' % (residue)
        lines = urllib2.urlopen(url).readlines()
        d = parse_mmCIF.main(residue, lines)
        l_carbons = []
        d_atoms[residue] = l_carbons
        n_bonds = len(d['_chem_comp_bond.comp_id'])
        for i_bond in range(n_bonds):
            ## single bonds only
            if d['_chem_comp_bond.value_order'][i_bond] == 'SING':
                atom_heavy = d['_chem_comp_bond.atom_id_1'][i_bond]
                atom_light = d['_chem_comp_bond.atom_id_2'][i_bond]
                ## heavy element is always listed before hydrogen
                if atom_heavy[0] == 'C' and atom_light[0] == 'H':
                    print('%s %s %s' % (residue, atom_heavy, atom_light))
                    l_carbons.append(atom_heavy)

    return d_atoms