Пример #1
0
def parse_GoodVibes_exclude_flexible(
    pdb,
    path,
):
    """Return the coordinate of the ``EXT`` probe record with the largest B-factor field.

    Two things happen here:

    1. Per-residue amplitudes are derived from eigenvector index 6 of the
       Hessian built from the alpha coordinates of the query chain (cutoff 10).
    2. ``output/<path>/<pdb[:4]>_<pdb[-1]>_probe.pdb`` is scanned for ATOM and
       HETATM records whose residue name is ``EXT``; the record with the
       largest value in columns 61-66 wins (first one wins on ties).

    NOTE(review): the amplitude-based filters that used step 1 (keep probes
    within 6 of a rigid residue, drop probes within 6 of a flexible residue,
    drop probes whose nearest residue has amplitude > average + stddev) were
    disabled in the original code. The amplitudes and the rigid/flexible lists
    are still computed so the filters can be re-enabled, but they do not
    influence the returned coordinate at the moment. Confirm whether the
    filters should be restored or the computation removed.

    Args:
        pdb: identifier; ``pdb[:4]`` is passed to the mmCIF parsers and
            ``pdb[-1]`` is used as the query chain.
        path: sub-directory of ``output/`` that holds the probe file.

    Returns:
        ``numpy.array([x, y, z])`` of the selected EXT record, or None when
        the probe file contains no EXT record.
    """

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4])
    d_coords, l_coords_alpha = mmCIF2coords.main(
        pdb[:4],
        d_mmCIF,
        query_chain=pdb[-1],
    )
    ## one parenthesised argument prints identically under Python 2 and 3
    print(len(l_coords_alpha))

    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    ## presumably the first non-rigid-body mode - TODO confirm against NMA
    mode = eigenvectors[6]
    ## one amplitude per consecutive (x, y, z) triple of the eigenvector
    l_amplitudes = [
        math.sqrt(mode[i] ** 2 + mode[i + 1] ** 2 + mode[i + 2] ** 2)
        for i in range(0, len(mode), 3)
    ]

    ## average amplitude (do_stddev returns the average as well, so the
    ## separate sum()/len() that used to precede it was a dead store)
    average, stddev = statistics.do_stddev(l_amplitudes)

    ## inputs of the currently disabled vicinity filters (see docstring)
    l_coords_rigid = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude < average
    ]
    l_coords_flexible = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude > average + 0.5 * stddev
    ]

    ##
    ## parse output
    ##
    fn_probe = 'output/%s/%s_%s_probe.pdb' % (path, pdb[:4], pdb[-1])
    ## context manager closes the file even if reading fails
    with open(fn_probe, 'r') as fd:
        lines = fd.readlines()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in ('ATOM', 'HETATM'):
            continue
        if line[17:20] != 'EXT':
            continue

        bfactor = float(line[60:66])

        ## explicit None test: comparing a float with None only worked through
        ## Python 2's arbitrary ordering and raises TypeError on Python 3
        if max_bfactor is None or bfactor > max_bfactor:
            coord = numpy.array([
                float(line[30:38]),
                float(line[38:46]),
                float(line[46:54]),
            ])
            max_bfactor = bfactor

    return coord
def main():
    """Write Matthews-coefficient statistics per space group to ``MV_v_spacegroup.txt``.

    Walks the sub-directories of ``/data/mmCIF`` whose names lie between
    ``sys.argv[-2]`` and ``sys.argv[-1]`` (inclusive, string comparison),
    parses every entry with ``parse_mmCIF.main`` and keeps entries that are
    X-ray structures, contain only ``polypeptide(L)`` polymers, have an
    oligomeric count of 1 for every assembly, carry ``_cell.Z_PDB`` and are
    not on the hard-coded exclusion list.  For the entries kept, the value
    returned by ``matthews_coefficient.main`` (presumably the Matthews
    coefficient - confirm against that module) is collected per space group.

    Entries are reported on stdout and skipped when the calculated value
    exceeds 10 or differs by more than 1 from the value stored in
    ``_exptl_crystal.density_Matthews``.

    The output file holds one line per space group with more than one entry:
    average, standard error, number of entries, space group name.

    Raises:
        RuntimeError: if an entry has ``'SPLIT'`` in ``_pdbx_database_related``.
            The original halted here through an undefined name (NameError);
            the halt is kept but made explicit.
    """

    ## hoisted out of the per-entry loop
    import math

    key_count = '_pdbx_struct_assembly.oligomeric_count'
    key_density = '_exptl_crystal.density_Matthews'

    l_pdbs_excluded = [
        ## treshold
        '1e54',
        '1e9i',
        ## difference between calculated MV and MV in mmCIF
        '3eiq',
        ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
        ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
        ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
        ## Toscana has published with Hellinga...
        '2cjf',
        '2bt4',
    ]

    ## temporary restriction of the analysis to three cubic space groups
    l_spacegroups = [
        'F 4 3 2',
        'F 41 3 2',
        'I 41 3 2',
    ]

    d_MV = {}

    path = '/data/mmCIF'
    for dn in sorted(os.listdir(path)):
        if dn == 'mmCIF.py':
            continue
        ## directory range given by the last two command line arguments
        if dn < sys.argv[-2] or dn > sys.argv[-1]:
            continue
        for fn in os.listdir('%s/%s' % (path, dn)):
            pdb = fn[:4]
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': 'SOLUTION NMR'},
                l_data_categories=[
                    '_cell',
                    '_entity',
                    '_exptl',
                    '_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                ],
            )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if '_entity_poly.type' not in d_mmCIF:
                continue

            ## only polymer present is protein
            l_types = d_mmCIF['_entity_poly.type']
            if l_types != len(l_types) * ['polypeptide(L)']:
                continue

            if key_count not in d_mmCIF:
                continue
            l_counts = d_mmCIF[key_count]

            ## oligomeric count unknown
            if l_counts == len(l_counts) * ['?']:
                continue

            ## virus
            if int(l_counts[0]) % 60 == 0:
                continue

            ## not monomer
            if l_counts != len(l_counts) * ['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF:
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print(pdb)
                    ## deliberate halt; previously an undefined name
                    raise RuntimeError(
                        'unexpected SPLIT entry in _pdbx_database_related: %s'
                        % (pdb,))

            if '_cell.Z_PDB' not in d_mmCIF:
                continue

            if pdb in l_pdbs_excluded:
                continue

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])

            ## summed formula weight of all entities (not only polymers)
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a, b, c, alpha, beta, gamma, mw, Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in l_spacegroups:
                continue  ## tmp!!!

            ## None when the category is absent; the diagnostic below used to
            ## index the key unconditionally and could raise KeyError
            l_MV_mmCIF = d_mmCIF.get(key_density)

            if MV > 10:
                ## implausibly large value: report the inputs and skip
                print(pdb)
                print('mw %s' % (mw,))
                print('MV %s %s' % (MV, l_MV_mmCIF))
                print('Z %s' % (Z,))
                cos_alpha = math.cos(alpha * (math.pi / 180.))
                cos_beta = math.cos(beta * (math.pi / 180.))
                cos_gamma = math.cos(gamma * (math.pi / 180.))
                ## unit cell volume from cell lengths and angles
                V = a * b * c * math.sqrt(
                    1 - cos_alpha ** 2 - cos_beta ** 2 - cos_gamma ** 2 +
                    2 * (cos_alpha * cos_beta * cos_gamma))
                print('V %s' % (V,))
                continue

            ## skip entries whose stored value disagrees with the calculated one
            if l_MV_mmCIF is not None:
                if l_MV_mmCIF != len(l_MV_mmCIF) * ['?']:
                    if abs(MV - float(l_MV_mmCIF[0])) > 1:
                        print('MV %s' % (MV,))
                        print('MV %s' % (l_MV_mmCIF,))
                        print('mw %s' % (mw,))
                        print('Z %s' % (Z,))
                        continue

            d_MV.setdefault(spacegroup, []).append(MV)

            print('%s %s %s' % (pdb, round(MV, 2), spacegroup))

    ## the second column is the standard error; the header used to say stddev
    l_lines = ['# MV_average MV_stderr n spacegroup\n']
    for spacegroup, l_MV in d_MV.items():
        ## statistics need more than one observation
        if len(l_MV) <= 1:
            continue
        average, stderr = statistics.do_stderr(l_MV)
        l_lines.append('%s %s %s %s\n' % (
            average,
            stderr,
            len(l_MV),
            spacegroup,
        ))

    with open('MV_v_spacegroup.txt', 'w') as fd:
        fd.writelines(l_lines)
Пример #3
0
def parse_GoodVibes_exclude_flexible(pdb, path):
    """Return the coordinate of the ``EXT`` probe record with the largest B-factor field.

    Two things happen here:

    1. Per-residue amplitudes are derived from eigenvector index 6 of the
       Hessian built from the alpha coordinates of the query chain (cutoff 10).
    2. ``output/<path>/<pdb[:4]>_<pdb[-1]>_probe.pdb`` is scanned for ATOM and
       HETATM records whose residue name is ``EXT``; the record with the
       largest value in columns 61-66 wins (first one wins on ties).

    NOTE(review): the amplitude-based filters that used step 1 (keep probes
    within 6 of a rigid residue, drop probes within 6 of a flexible residue,
    drop probes whose nearest residue has amplitude > average + stddev) were
    disabled in the original code. The amplitudes and the rigid/flexible lists
    are still computed so the filters can be re-enabled, but they do not
    influence the returned coordinate at the moment. Confirm whether the
    filters should be restored or the computation removed.

    Args:
        pdb: identifier; ``pdb[:4]`` is passed to the mmCIF parsers and
            ``pdb[-1]`` is used as the query chain.
        path: sub-directory of ``output/`` that holds the probe file.

    Returns:
        ``numpy.array([x, y, z])`` of the selected EXT record, or None when
        the probe file contains no EXT record.
    """

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4])
    d_coords, l_coords_alpha = mmCIF2coords.main(
        pdb[:4], d_mmCIF, query_chain=pdb[-1])
    ## one parenthesised argument prints identically under Python 2 and 3
    print(len(l_coords_alpha))

    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    ## presumably the first non-rigid-body mode - TODO confirm against NMA
    mode = eigenvectors[6]
    ## one amplitude per consecutive (x, y, z) triple of the eigenvector
    l_amplitudes = [
        math.sqrt(mode[i]**2 + mode[i+1]**2 + mode[i+2]**2)
        for i in range(0, len(mode), 3)
        ]

    ## average amplitude (do_stddev returns the average as well, so the
    ## separate sum()/len() that used to precede it was a dead store)
    average, stddev = statistics.do_stddev(l_amplitudes)

    ## inputs of the currently disabled vicinity filters (see docstring)
    l_coords_rigid = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude < average
        ]
    l_coords_flexible = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude > average + 0.5*stddev
        ]

    ##
    ## parse output
    ##
    fn_probe = 'output/%s/%s_%s_probe.pdb' % (path, pdb[:4], pdb[-1])
    ## context manager closes the file even if reading fails
    with open(fn_probe, 'r') as fd:
        lines = fd.readlines()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in ('ATOM', 'HETATM'):
            continue
        if line[17:20] != 'EXT':
            continue

        bfactor = float(line[60:66])

        ## explicit None test: comparing a float with None only worked through
        ## Python 2's arbitrary ordering and raises TypeError on Python 3
        if max_bfactor is None or bfactor > max_bfactor:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])
            coord = numpy.array([x, y, z])
            max_bfactor = bfactor

    return coord
def main():
    """Write Matthews-coefficient statistics per space group to ``MV_v_spacegroup.txt``.

    Walks the sub-directories of ``/data/mmCIF`` whose names lie between
    ``sys.argv[-2]`` and ``sys.argv[-1]`` (inclusive, string comparison),
    parses every entry with ``parse_mmCIF.main`` and keeps entries that are
    X-ray structures, contain only ``polypeptide(L)`` polymers, have an
    oligomeric count of 1 for every assembly, carry ``_cell.Z_PDB`` and are
    not on the hard-coded exclusion list.  For the entries kept, the value
    returned by ``matthews_coefficient.main`` (presumably the Matthews
    coefficient - confirm against that module) is collected per space group.

    Entries are reported on stdout and skipped when the calculated value
    exceeds 10 or differs by more than 1 from the value stored in
    ``_exptl_crystal.density_Matthews``.

    The output file holds one line per space group with more than one entry:
    average, standard error, number of entries, space group name.

    Raises:
        RuntimeError: if an entry has ``'SPLIT'`` in ``_pdbx_database_related``.
            The original halted here through an undefined name (NameError);
            the halt is kept but made explicit.
    """

    ## hoisted out of the per-entry loop
    import math

    key_count = '_pdbx_struct_assembly.oligomeric_count'
    key_density = '_exptl_crystal.density_Matthews'

    l_pdbs_excluded = [
        ## treshold
        '1e54','1e9i',
        ## difference between calculated MV and MV in mmCIF
        '3eiq',
        ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
        ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
        ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
        ## Toscana has published with Hellinga...
        '2cjf','2bt4',
        ]

    ## temporary restriction of the analysis to three cubic space groups
    l_spacegroups = [
        'F 4 3 2',
        'F 41 3 2',
        'I 41 3 2',
        ]

    d_MV = {}

    path = '/data/mmCIF'
    for dn in sorted(os.listdir(path)):
        if dn == 'mmCIF.py':
            continue
        ## directory range given by the last two command line arguments
        if dn < sys.argv[-2] or dn > sys.argv[-1]:
            continue
        for fn in os.listdir('%s/%s' % (path, dn)):
            pdb = fn[:4]
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':'SOLUTION NMR'},
                l_data_categories = [
                    '_cell','_entity','_exptl','_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                    ],
                )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if '_entity_poly.type' not in d_mmCIF:
                continue

            ## only polymer present is protein
            l_types = d_mmCIF['_entity_poly.type']
            if l_types != len(l_types)*['polypeptide(L)']:
                continue

            if key_count not in d_mmCIF:
                continue
            l_counts = d_mmCIF[key_count]

            ## oligomeric count unknown
            if l_counts == len(l_counts)*['?']:
                continue

            ## virus
            if int(l_counts[0]) % 60 == 0:
                continue

            ## not monomer
            if l_counts != len(l_counts)*['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF:
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print(pdb)
                    ## deliberate halt; previously an undefined name
                    raise RuntimeError(
                        'unexpected SPLIT entry in _pdbx_database_related: %s'
                        % (pdb,))

            if '_cell.Z_PDB' not in d_mmCIF:
                continue

            if pdb in l_pdbs_excluded:
                continue

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])

            ## summed formula weight of all entities (not only polymers)
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a,b,c,alpha,beta,gamma,mw,Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in l_spacegroups:
                continue ## tmp!!!

            ## None when the category is absent; the diagnostic below used to
            ## index the key unconditionally and could raise KeyError
            l_MV_mmCIF = d_mmCIF.get(key_density)

            if MV > 10:
                ## implausibly large value: report the inputs and skip
                print(pdb)
                print('mw %s' % (mw,))
                print('MV %s %s' % (MV, l_MV_mmCIF))
                print('Z %s' % (Z,))
                cos_alpha = math.cos(alpha * (math.pi/180.))
                cos_beta = math.cos(beta * (math.pi/180.))
                cos_gamma = math.cos(gamma * (math.pi/180.))
                ## unit cell volume from cell lengths and angles
                V = a*b*c*math.sqrt(
                    1 - cos_alpha**2 - cos_beta**2 - cos_gamma**2
                    + 2*(cos_alpha*cos_beta*cos_gamma)
                    )
                print('V %s' % (V,))
                continue

            ## skip entries whose stored value disagrees with the calculated one
            if l_MV_mmCIF is not None:
                if l_MV_mmCIF != len(l_MV_mmCIF)*['?']:
                    if abs(MV - float(l_MV_mmCIF[0])) > 1:
                        print('MV %s' % (MV,))
                        print('MV %s' % (l_MV_mmCIF,))
                        print('mw %s' % (mw,))
                        print('Z %s' % (Z,))
                        continue

            d_MV.setdefault(spacegroup, []).append(MV)

            print('%s %s %s' % (pdb, round(MV,2), spacegroup))

    ## the second column is the standard error; the header used to say stddev
    l_lines = ['# MV_average MV_stderr n spacegroup\n']
    for spacegroup, l_MV in d_MV.items():
        ## statistics need more than one observation
        if len(l_MV) <= 1:
            continue
        average, stderr = statistics.do_stderr(l_MV)
        l_lines.append('%s %s %s %s\n' % (average, stderr, len(l_MV), spacegroup))

    with open('MV_v_spacegroup.txt', 'w') as fd:
        fd.writelines(l_lines)
Пример #5
0
            if d['spacegroup'] == 'different':
                d_statistics['diffSG']['alpha'] += [rmsd_alpha]
                d_statistics['diffSG']['heavy'] += [rmsd_heavy]
                d_statistics['diffSG']['chi1'] += [chi1_diff_average]
            else:
                d_statistics['sameSG']['alpha'] += [rmsd_alpha]
                d_statistics['sameSG']['heavy'] += [rmsd_heavy]
                d_statistics['sameSG']['chi1'] += [chi1_diff_average]

    prefix = 'CA_v_%s_%s_%s' %(y_property,protein,suffix_exclusion,)

    fd = open('%s.gnuplotdata' %(prefix),'w')
    fd.writelines(lines)
    fd.close()

    average_alpha, stddev_alpha = statistics.do_stddev(l_rmsds_alpha)
    average_heavy, stddev_heavy = statistics.do_stddev(l_rmsds_heavy)
    print 'alpha rmsd', len(l_rmsds_alpha), 'average', average_alpha, 'stddev', stddev_alpha
    print 'heavy rmsd', len(l_rmsds_heavy), 'average', average_heavy, 'stddev', stddev_heavy

    ################################################################################

    #### mutants
    ##for i in range(n_columns):
    ##    l_columns += [[l_columns[i][0]+n_columns,'',l_columns[i][2],]]
    #### put mutants in the background behind wts
    ##l_columns = l_columns[n_columns:]+l_columns[:n_columns]

    l_colors = []
    l_pointsizes = []
    l_pointtypes = []