Example #1
    def filter_z_clusters_2(self, z_clusters, dataset, min_contact_dist=6):
        """Find and remove clusters more than a minimum distance from the protein"""

        # min_contact_dist - blobs are rejected if they are more than this distance from the protein

        # Extract the protein sites in the reference frame
        ref_sites_cart = dataset.model.alignment.nat2ref(protein(dataset.model.hierarchy).atoms().extract_xyz())
        # Save time - calculate the square of the contact distance
        min_contact_dist_sq = min_contact_dist ** 2

        # Remove any clusters that are more than min_contact_dist from the protein
        filtered_c_idxs = []
        for c_idx, (c_gps, c_val) in enumerate(z_clusters):
            # Extract points in cluster
            cluster_points_cart = self.grid.grid2cart(c_gps)
            # Calculate minimum distance to protein
            for r_site_cart in ref_sites_cart:
                diff_vecs_cart = cluster_points_cart - r_site_cart
                # Keep cluster if minimum distance is less than min_contact_dist
                if min(diff_vecs_cart.dot()) < min_contact_dist_sq:
                    filtered_c_idxs.append(c_idx)
                    break

        filt_z_clusters = [z_clusters[i] for i in filtered_c_idxs]

        return len(filt_z_clusters), filt_z_clusters
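The keep/reject test above works on squared distances so that no square roots are needed inside the double loop. A minimal numpy sketch of the same test, independent of the cctbx/flex types used by PanDDA (the array and function names are illustrative):

import numpy as np

def keep_clusters_near_sites(clusters, ref_sites_cart, min_contact_dist=6.0):
    """Keep clusters that have at least one point within min_contact_dist
    of any reference site (brute force, squared distances)."""
    min_dist_sq = min_contact_dist ** 2
    kept = []
    for points in clusters:  # each cluster: an (N, 3) array of points
        # (N, M) matrix of squared distances between cluster points and sites
        d_sq = ((points[:, None, :] - ref_sites_cart[None, :, :]) ** 2).sum(axis=-1)
        if d_sq.min() < min_dist_sq:
            kept.append(points)
    return kept

# Tiny usage example: one cluster near the sites, one far away
sites = np.array([[0.0, 0.0, 0.0], [10.0, 0.0, 0.0]])
near = np.zeros((5, 3)) + [0.0, 2.0, 0.0]
far = np.zeros((5, 3)) + [50.0, 50.0, 50.0]
assert len(keep_clusters_near_sites([near, far], sites)) == 1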
Example #2
    def filter_z_clusters_2(self, z_clusters, dataset, min_contact_dist=6):
        """Find and remove clusters more than a minimum distance from the protein"""

        # min_contact_dist - blobs are rejected if they are more than this distance from the protein

        self.log('----------------------------------->>>')
        self.log('Filtering by minimum distance from protein')

        # Extract the protein sites in the reference frame
        ref_sites_cart = dataset.model.alignment.nat2ref(
            protein(dataset.model.hierarchy).atoms().extract_xyz())
        # Save time - calculate the square of the contact distance
        min_contact_dist_sq = min_contact_dist**2

        # Remove any clusters that are more than min_contact_dist from the protein
        filtered_c_idxs = []
        for c_idx, (c_gps, c_val) in enumerate(z_clusters):
            # Extract points in cluster
            cluster_points_cart = self.grid.grid2cart(c_gps)
            # Calculate minimum distance to protein
            for r_site_cart in ref_sites_cart:
                diff_vecs_cart = cluster_points_cart - r_site_cart
                # Keep cluster if minimum distance is less than min_contact_dist
                if min(diff_vecs_cart.dot()) < min_contact_dist_sq:
                    filtered_c_idxs.append(c_idx)
                    break
            # Report
            # if self.log.verbose:
            #     if filtered_c_idxs and (filtered_c_idxs[-1] == c_idx):
            #         print('KEEPING CLUSTER:', c_idx)
            #     else:
            #         print('REJECTING CLUSTER:', c_idx)

        # Select filtered clusters
        filt_z_clusters = [z_clusters[i] for i in filtered_c_idxs]

        self.log('Filtered {!s} Clusters to {!s} Clusters'.format(
            len(z_clusters), len(filt_z_clusters)))
        return len(filt_z_clusters), filt_z_clusters
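The inner loop above is a brute-force all-versus-all distance check. Where both the cluster and the protein are large, the same decision can be reached with a nearest-neighbour query; a hedged alternative using scipy (not what the method above does, just an equivalent test):

import numpy as np
from scipy.spatial import cKDTree

def cluster_touches_protein(cluster_points_cart, ref_sites_cart, min_contact_dist=6.0):
    """True if any cluster point lies within min_contact_dist of a protein atom."""
    tree = cKDTree(np.asarray(ref_sites_cart))      # build once per dataset
    nearest_dist, _ = tree.query(np.asarray(cluster_points_cart), k=1)
    return bool((nearest_dist < min_contact_dist).any())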
Example #3
    def from_pdb(cls, pdb_input=None, pdb_hierarchy=None):
        """Calculate the b-factor statistics of a model"""

        assert [pdb_input, pdb_hierarchy].count(None) == 1, 'Provide pdb_input OR pdb_hierarchy'
        if pdb_input: pdb_hierarchy = pdb_input.construct_hierarchy()

        cache = pdb_hierarchy.atom_selection_cache()

        all_b = non_h(hierarchy=pdb_hierarchy, cache=cache,
                      copy=True).atoms().extract_b()
        protein_b = protein(hierarchy=pdb_hierarchy, cache=cache,
                            copy=True).atoms().extract_b()
        backbone_b = backbone(hierarchy=pdb_hierarchy, cache=cache,
                              copy=True).atoms().extract_b()
        sidechain_b = sidechains(hierarchy=pdb_hierarchy,
                                 cache=cache,
                                 copy=True).atoms().extract_b()

        return cls(all=basic_statistics(all_b),
                   protein=basic_statistics(protein_b),
                   backbone=basic_statistics(backbone_b),
                   sidechain=basic_statistics(sidechain_b))
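For reference, the quantities that basic_statistics provides for an array of B-factors (and that are read back as .mean, .bias_corrected_standard_deviation, .skew and .kurtosis elsewhere in this listing) can be reproduced with numpy/scipy; this is a sketch of the statistics themselves, not of the cctbx API:

import numpy as np
from scipy import stats

def bfactor_summary(b_values):
    """Summary moments of a 1-D array of B-factors."""
    b = np.asarray(b_values, dtype=float)
    return {
        'mean': b.mean(),
        'bias_corrected_standard_deviation': b.std(ddof=1),
        'skew': stats.skew(b),
        'kurtosis': stats.kurtosis(b),
    }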
Example #4
    def run(self):
        """Process the dataset"""

        dataset, dataset_map, grid, map_analyser, args, verbose = self.data

        # TODO Hardcoded check - to be removed? TODO
        assert dataset_map.is_sparse()

        # ============================================================================>
        # Prepare output objects
        # ============================================================================>
        log_strs = []
        log_file = dataset.file_manager.get_file('dataset_log')
        log = Log(log_file=log_file, verbose=False, silent=True)

        # ============================================================================>
        # Build new blob search object
        # ============================================================================>
        blob_finder = PanddaZMapAnalyser(params=args.params.z_map_analysis,
                                         grid=grid,
                                         log=log)
        print('Writing log for dataset {!s} to ...{}'.format(
            dataset.tag, log_file[log_file.index('processed'):]))

        # ============================================================================>
        # Extract the global mask object from the grid
        # ============================================================================>
        dset_total_temp = grid.global_mask().total_mask_binary().copy()

        # ============================================================================>
        # Generate symmetry masks for this dataset
        # ============================================================================>
        log.bar()
        log('Masking symmetry contacts from Z-map.')
        # Generate symmetry contacts for this dataset and align to reference frame
        dataset_sym_copies = dataset.model.crystal_contacts(
            distance_cutoff=args.params.masks.outer_mask + 5,
            combine_copies=True)
        dataset_sym_copies.atoms().set_xyz(
            dataset.model.alignment.nat2ref(
                dataset_sym_copies.atoms().extract_xyz()))
        # Only need to write if writing reference frame maps
        if args.output.developer.write_reference_frame_maps:
            dataset_sym_copies.write_pdb_file(
                dataset.file_manager.get_file('symmetry_copies'))
        # Extract protein atoms from the symmetry copies
        dataset_sym_sites_cart = non_water(
            dataset_sym_copies).atoms().extract_xyz()
        # Generate symmetry contacts grid mask
        dataset_mask = GridMask(parent=grid,
                                sites_cart=dataset_sym_sites_cart,
                                max_dist=args.params.masks.outer_mask,
                                min_dist=args.params.masks.inner_mask_symmetry)
        # Combine with the total mask to generate custom mask for this dataset
        dset_total_temp.put(dataset_mask.inner_mask_indices(), 0)
        dset_total_idxs = numpy.where(dset_total_temp)[0]
        log('After masking with symmetry contacts: {} points for Z-map analysis'
            .format(len(dset_total_idxs)))
        # Write map of grid + symmetry mask
        if args.output.developer.write_reference_frame_grid_masks:
            grid.write_indices_as_map(
                indices=dset_total_idxs,
                f_name=dataset.file_manager.get_file('grid_mask'),
                origin_shift=True)

        # ============================================================================>
        # Generate custom masks for this dataset
        # ============================================================================>
        if args.params.z_map_analysis.masks.selection_string is not None:
            log.bar()
            log('Applying custom mask to the Z-map: "{}"'.format(
                args.params.z_map_analysis.masks.selection_string))
            cache = dataset.model.hierarchy.atom_selection_cache()
            custom_mask_selection = cache.selection(
                args.params.z_map_analysis.masks.selection_string)
            custom_mask_sites = dataset.model.hierarchy.select(
                custom_mask_selection).atoms().extract_xyz()
            log('Masking with {} atoms'.format(len(custom_mask_sites)))
            # Generate custom grid mask
            dataset_mask = GridMask(
                parent=grid,
                sites_cart=custom_mask_sites,
                max_dist=args.params.z_map_analysis.masks.outer_mask,
                min_dist=args.params.z_map_analysis.masks.inner_mask)
            # Combine with the total mask to generate custom mask for this dataset
            dset_total_temp *= dataset_mask.total_mask_binary()
            dset_total_idxs = numpy.where(dset_total_temp)[0]
            log('After masking with custom mask: {} points for Z-map analysis'.
                format(len(dset_total_idxs)))
            # Write out mask
            grid.write_indices_as_map(
                indices=dset_total_idxs,
                f_name=dataset.file_manager.get_file('z_map_mask'),
                origin_shift=True)

        # ============================================================================>
        #####
        # CALCULATE Z-MAPS AND LOOK FOR LARGE BLOBS
        #####
        # ============================================================================>
        # Check maps and that all maps are sparse
        # ============================================================================>
        assert dataset_map.data is not None, 'Something has gone wrong - this dataset has no loaded map'
        assert dataset_map.is_sparse() is map_analyser.statistical_maps.mean_map.is_sparse()
        assert dataset_map.is_sparse() is map_analyser.statistical_maps.medn_map.is_sparse()
        assert dataset_map.is_sparse() is map_analyser.statistical_maps.stds_map.is_sparse()
        assert dataset_map.is_sparse() is map_analyser.statistical_maps.sadj_map.is_sparse()
        # ============================================================================>
        # CALCULATE MEAN-DIFF MAPS
        # ============================================================================>
        mean_diff_map = map_analyser.calculate_z_map(map=dataset_map,
                                                     method='none')
        #        # ============================================================================>
        #        # NAIVE Z-MAP - NOT USING UNCERTAINTY ESTIMATION OR ADJUSTED STDS
        #        # ============================================================================>
        #        z_map_naive = map_analyser.calculate_z_map(map=dataset_map, method='naive')
        #        z_map_naive_normalised = z_map_naive.normalised_copy()
        # ============================================================================>
        # UNCERTAINTY Z-MAP - NOT USING ADJUSTED STDS
        # ============================================================================>
        z_map_uncty = map_analyser.calculate_z_map(
            map=dataset_map,
            uncertainty=dataset_map.meta.map_uncertainty,
            method='uncertainty')
        z_map_uncty_normalised = z_map_uncty.normalised_copy()
        # ============================================================================>
        # ADJUSTED+UNCERTAINTY Z-MAP
        # ============================================================================>
        z_map_compl = map_analyser.calculate_z_map(
            map=dataset_map,
            uncertainty=dataset_map.meta.map_uncertainty,
            method='adjusted+uncertainty')
        z_map_compl_normalised = z_map_compl.normalised_copy()

        # ============================================================================>
        # SELECT WHICH MAP TO DO THE BLOB SEARCHING ON
        # ============================================================================>
        #        if args.params.statistical_maps.z_map_type == 'naive':
        #            z_map = z_map_naive_normalised
        #            z_map_stats = basic_statistics(flex.double(z_map_naive.data))
        if args.params.statistical_maps.z_map_type == 'uncertainty':
            z_map = z_map_uncty_normalised
            z_map_stats = basic_statistics(flex.double(z_map_uncty.data))
        elif args.params.statistical_maps.z_map_type == 'adjusted+uncertainty':
            z_map = z_map_compl_normalised
            z_map_stats = basic_statistics(flex.double(z_map_compl.data))
        else:
            raise Exception('Invalid Z-map type')

        # ============================================================================>
        # RECORD Z-MAP FOR STATISTICS
        # ============================================================================>
        # Calculate statistics of z-maps
        dataset_map.meta.z_mean = z_map_stats.mean
        dataset_map.meta.z_stdv = z_map_stats.bias_corrected_standard_deviation
        dataset_map.meta.z_skew = z_map_stats.skew
        dataset_map.meta.z_kurt = z_map_stats.kurtosis
        # ============================================================================>
        z_map.meta.type = 'z-map'
        # ============================================================================>

        # ============================================================================>
        #####
        # WRITE ALL MAP DISTRIBUTIONS (THESE DON'T USE MUCH SPACE)
        #####
        # ============================================================================>
        # Sampled Map
        analyse_graphs.map_value_distribution(
            f_name=dataset.file_manager.get_file('s_map_png'),
            plot_vals=dataset_map.get_map_data(sparse=True))
        # Mean-Difference
        analyse_graphs.map_value_distribution(
            f_name=dataset.file_manager.get_file('d_mean_map_png'),
            plot_vals=mean_diff_map.get_map_data(sparse=True))
        #        # Naive Z-Map
        #        analyse_graphs.map_value_distribution(f_name      = dataset.file_manager.get_file('z_map_naive_png'),
        #                                              plot_vals   = z_map_naive.get_map_data(sparse=True),
        #                                              plot_normal = True)
        #        # Normalised Naive Z-Map
        #        analyse_graphs.map_value_distribution(f_name      = dataset.file_manager.get_file('z_map_naive_normalised_png'),
        #                                              plot_vals   = z_map_naive_normalised.get_map_data(sparse=True),
        #                                              plot_normal = True)
        # Uncertainty Z-Map
        analyse_graphs.map_value_distribution(
            f_name=dataset.file_manager.get_file('z_map_uncertainty_png'),
            plot_vals=z_map_uncty.get_map_data(sparse=True),
            plot_normal=True)
        # Normalised Uncertainty Z-Map
        analyse_graphs.map_value_distribution(
            f_name=dataset.file_manager.get_file(
                'z_map_uncertainty_normalised_png'),
            plot_vals=z_map_uncty_normalised.get_map_data(sparse=True),
            plot_normal=True)
        # Corrected Z-Map
        analyse_graphs.map_value_distribution(
            f_name=dataset.file_manager.get_file('z_map_corrected_png'),
            plot_vals=z_map_compl.get_map_data(sparse=True),
            plot_normal=True)
        # Normalised Corrected Z-Map
        analyse_graphs.map_value_distribution(
            f_name=dataset.file_manager.get_file(
                'z_map_corrected_normalised_png'),
            plot_vals=z_map_compl_normalised.get_map_data(sparse=True),
            plot_normal=True)
        # Plot Q-Q Plot of Corrected Z-Map to see how normal it is
        analyse_graphs.qq_plot_against_normal(
            f_name=dataset.file_manager.get_file('z_map_qq_plot_png'),
            plot_vals=z_map_compl_normalised.get_map_data(sparse=True))

        # ============================================================================>
        #####
        # LOOK FOR CLUSTERS OF LARGE Z-SCORES
        #####
        # ============================================================================>
        # Contour the grid at a particular Z-Value
        # ============================================================================>
        num_clusters, z_clusters = blob_finder.cluster_high_z_values(
            z_map_data=z_map.get_map_data(sparse=False),
            point_mask_idx=dset_total_idxs)
        # ============================================================================>
        # Too many points to cluster -- probably a bad dataset
        # ============================================================================>
        if num_clusters == -1:
            # This dataset is too noisy to analyse - flag!
            log_strs.append(
                'Z-Map too noisy to analyse -- not sure what has gone wrong here...'
            )
            return dataset, dataset_map.meta, log_strs

        # ============================================================================>
        #####
        # FILTER/SELECT CLUSTERS OF Z-SCORES
        #####
        # ============================================================================>
        # Filter the clusters by size and peak height
        # ============================================================================>
        if num_clusters > 0:
            num_clusters, z_clusters = blob_finder.filter_z_clusters_1(
                z_clusters=z_clusters)
            blob_finder.validate_clusters(z_clusters)
            if num_clusters == 0:
                log_strs.append('===> Minimum cluster peak/size not reached.')
        # ============================================================================>
        # Filter the clusters by distance from protein
        # ============================================================================>
        if num_clusters > 0:
            num_clusters, z_clusters = blob_finder.filter_z_clusters_2(
                z_clusters=z_clusters, dataset=dataset)
            blob_finder.validate_clusters(z_clusters)
            if num_clusters == 0:
                log_strs.append('===> Clusters too far from protein.')
        # ============================================================================>
        # Group Nearby Clusters Together
        # ============================================================================>
        if num_clusters > 0:
            num_clusters, z_clusters = blob_finder.group_clusters(
                z_clusters=z_clusters)
            blob_finder.validate_clusters(z_clusters)
        # ============================================================================>
        # Filter the clusters by symmetry equivalence
        # ============================================================================>
        if num_clusters > 0:
            num_clusters, z_clusters = blob_finder.filter_z_clusters_3(
                z_clusters=z_clusters, dataset=dataset)
            blob_finder.validate_clusters(z_clusters)

        # ============================================================================>
        #####
        # WRITE MAPS
        #####
        # ============================================================================>
        # write dataset maps in the reference frame
        # ============================================================================>
        if args.output.developer.write_reference_frame_maps:
            dataset_map.to_file(
                filename=dataset.file_manager.get_file('sampled_map'),
                space_group=grid.space_group())
            mean_diff_map.to_file(
                filename=dataset.file_manager.get_file('mean_diff_map'),
                space_group=grid.space_group())
            z_map.to_file(filename=dataset.file_manager.get_file('z_map'),
                          space_group=grid.space_group())
        # ============================================================================>
        # Write out mask of the high z-values
        # ============================================================================>
        if args.output.developer.write_reference_frame_grid_masks:
            # Write map of where the blobs are (high-Z mask)
            highz_points = []
            for x in z_clusters:
                highz_points.extend(list(x[0]))
            highz_points = [map(int, v) for v in highz_points]
            highz_indices = map(grid.indexer(), list(highz_points))
            grid.write_indices_as_map(
                indices=highz_indices,
                f_name=dataset.file_manager.get_file('high_z_mask'),
                origin_shift=True)
        # ============================================================================>
        # Write different Z-Maps? (Probably only needed for testing)
        # ============================================================================>
        if args.output.developer.write_reference_frame_all_z_map_types:
            #            z_map_naive.to_file(filename=dataset.file_manager.get_file('z_map_naive'), space_group=grid.space_group())
            #            z_map_naive_normalised.to_file(filename=dataset.file_manager.get_file('z_map_naive_normalised'), space_group=grid.space_group())
            z_map_uncty.to_file(
                filename=dataset.file_manager.get_file('z_map_uncertainty'),
                space_group=grid.space_group())
            z_map_uncty_normalised.to_file(
                filename=dataset.file_manager.get_file(
                    'z_map_uncertainty_normalised'),
                space_group=grid.space_group())
            z_map_compl.to_file(
                filename=dataset.file_manager.get_file('z_map_corrected'),
                space_group=grid.space_group())
            z_map_compl_normalised.to_file(
                filename=dataset.file_manager.get_file(
                    'z_map_corrected_normalised'),
                space_group=grid.space_group())

        # ============================================================================>
        # Skip to next dataset if no clusters found
        # ============================================================================>
        if num_clusters > 0:
            log_strs.append('===> {!s} Cluster(s) found.'.format(num_clusters))
        else:
            log_strs.append('===> No Clusters found.')
            return (dataset, dataset_map.meta, log_strs)
        assert num_clusters > 0, 'NUMBER OF CLUSTERS AFTER FILTERING == 0!'

        # ============================================================================>
        # Extract the map data in non-sparse format
        # ============================================================================>
        dset_map_data = dataset_map.get_map_data(sparse=False)
        avrg_map_data = map_analyser.average_map().get_map_data(sparse=False)
        # ============================================================================>
        # Process the identified features
        # ============================================================================>
        for event_idx, (event_points, event_values) in enumerate(z_clusters):
            # Number events from 1
            event_num = event_idx + 1
            # Create a unique identifier for this event
            event_key = (dataset.tag, event_num)
            # ============================================================================>
            # Create a point cluster object
            # ============================================================================>
            point_cluster = PointCluster(id=event_key,
                                         points=event_points,
                                         values=event_values)
            # ============================================================================>
            # Estimate the background correction of the detected feature
            # ============================================================================>
            # Extract sites for this cluster and estimate the background correction for the event
            log_strs.append('----------------------------------->>>')
            log_strs.append(
                'Estimating Event {!s} Background Correction'.format(
                    event_num))
            # Generate custom grid mask for this dataset
            event_mask = GridMask(parent=grid,
                                  sites_cart=grid.grid2cart(
                                      point_cluster.points, origin_shift=True),
                                  max_dist=2.0,
                                  min_dist=0.0)
            log_strs.append(
                '=> Event sites ({!s} points) expanded to {!s} points'.format(
                    len(point_cluster.points),
                    len(event_mask.outer_mask_indices())))
            # Select masks to define regions for bdc calculation
            exp_event_idxs = flex.size_t(event_mask.outer_mask_indices())
            reference_idxs = flex.size_t(
                grid.global_mask().inner_mask_indices())
            # ============================================================================>
            # Generate BDC-estimation curve and estimate BDC
            # ============================================================================>
            event_remains, event_corrs, global_corrs = calculate_varying_bdc_correlations(
                ref_map_data=avrg_map_data,
                query_map_data=dset_map_data,
                feature_idxs=exp_event_idxs,
                reference_idxs=reference_idxs,
                min_remain=1.0 - args.params.background_correction.max_bdc,
                max_remain=1.0 - args.params.background_correction.min_bdc,
                bdc_increment=args.params.background_correction.increment,
                verbose=verbose)
            event_remain_est = calculate_maximum_series_discrepancy(
                labels=event_remains,
                series_1=global_corrs,
                series_2=event_corrs)
            analyse_graphs.write_occupancy_graph(
                f_name=dataset.file_manager.get_file('bdc_est_png').format(
                    event_num),
                x_values=event_remains,
                global_values=global_corrs,
                local_values=event_corrs)
            log_strs.append(
                '=> Event Background Correction estimated as {!s}'.format(
                    1 - event_remain_est))
            # Reporting (log is normally silenced)
            blob_finder.log('Min-Max: {} {}'.format(
                1.0 - args.params.background_correction.max_bdc,
                1.0 - args.params.background_correction.min_bdc))
            blob_finder.log('Event number: {}'.format(event_num))
            blob_finder.log('Event Remains: {}'.format(','.join(
                map(str, event_remains))))
            blob_finder.log('Event Corrs:  {}'.format(','.join(
                map(str, event_corrs))))
            blob_finder.log('Global Corrs: {}'.format(','.join(
                map(str, global_corrs))))
            # Apply multiplier if provided
            blob_finder.log('Applying multiplier to output 1-BDC: {}'.format(
                args.params.background_correction.output_multiplier))
            event_remain_est = min(
                event_remain_est *
                args.params.background_correction.output_multiplier,
                1.0 - args.params.background_correction.min_bdc)
            # ============================================================================>
            # Calculate the map correlations at the selected BDC
            # ============================================================================>
            event_map_data = calculate_bdc_subtracted_map(
                ref_map_data=avrg_map_data,
                query_map_data=dset_map_data,
                bdc=1.0 - event_remain_est)
            global_corr = numpy.corrcoef(
                event_map_data.select(reference_idxs),
                avrg_map_data.select(reference_idxs))[0, 1]
            local_corr = numpy.corrcoef(
                event_map_data.select(exp_event_idxs),
                avrg_map_data.select(exp_event_idxs))[0, 1]
            # ============================================================================>
            # Write out EVENT map (in the reference frame) and grid masks
            # ============================================================================>
            if args.output.developer.write_reference_frame_maps:
                event_map = dataset_map.new_from_template(event_map_data,
                                                          sparse=False)
                event_map.to_file(
                    filename=dataset.file_manager.get_file('event_map').format(
                        event_num, event_remain_est),
                    space_group=grid.space_group())
            if args.output.developer.write_reference_frame_grid_masks:
                grid.write_indices_as_map(
                    indices=event_mask.outer_mask_indices(),
                    f_name=dataset.file_manager.get_file('grid_mask').replace(
                        '.ccp4', '') + '-event-mask-{}.ccp4'.format(event_num))

            # ============================================================================>
            # Find the nearest atom to the event
            # ============================================================================>
            atm = find_nearest_atoms(
                atoms=list(protein(dataset.model.hierarchy).atoms_with_labels()),
                query=dataset.model.alignment.ref2nat(
                    grid.grid2cart(sites_grid=[map(int, point_cluster.centroid)],
                                   origin_shift=True)))[0]
            log_strs.append(
                '=> Nearest Residue to event: Chain {}, Residue {} {}'.format(
                    atm.chain_id, atm.resname, atm.resid()))
            # ============================================================================>
            # Create an event object
            # ============================================================================>
            event_obj = Event(id=point_cluster.id, cluster=point_cluster)
            event_obj.info.estimated_pseudo_occupancy = event_remain_est
            event_obj.info.estimated_bdc = 1.0 - event_remain_est
            event_obj.info.global_correlation = global_corr
            event_obj.info.local_correlation = local_corr
            # ============================================================================>
            # Append to dataset handler
            # ============================================================================>
            dataset.events.append(event_obj)

        # ============================================================================>
        # Write out pymol script to load all of the maps easily
        # ============================================================================>
        pml = PythonScript()
        pml.set_normalise_maps(False)
        # Load Structures
        name = pml.load_pdb(
            f_name=dataset.file_manager.get_file('aligned_model'))
        pml.repr_as(obj=name, style='sticks')
        name = pml.load_pdb(
            f_name=dataset.file_manager.get_file('symmetry_copies'))
        pml.repr_hide(obj=name)
        # Load Sampled Map
        name = pml.load_map(
            f_name=dataset.file_manager.get_file('sampled_map'))
        mesh = pml.make_mesh(obj=name, contour_level=1.0, colour='blue')
        # Load Z-maps
        name = pml.load_map(f_name=dataset.file_manager.get_file('z_map'))
        mesh = pml.make_mesh(obj=name,
                             mesh_suffix='.plus',
                             contour_level=3.0,
                             colour='green')
        mesh = pml.make_mesh(obj=name,
                             mesh_suffix='.mins',
                             contour_level=-3.0,
                             colour='red')
        # Load Event maps
        for f in sorted(
                glob.glob(
                    dataset.file_manager.get_file('event_map').format(
                        '*', '*'))):
            name = pml.load_map(f_name=f)
            mesh = pml.make_mesh(obj=name,
                                 contour_level=float(f.split('_')[-2]),
                                 colour='hotpink')
        # Load Miscellaneous maps (e.g. masks)
        for f in sorted(
                glob.glob(
                    os.path.join(dataset.file_manager.get_dir('root'),
                                 '*mask*.ccp4'))):
            name = pml.load_map(f_name=f)
            mesh = pml.make_mesh(obj=name, contour_level=0.0, colour='grey')

        pml.write_script(f_name=dataset.file_manager.get_file('pymol_script'),
                         overwrite=True)

        return (dataset, dataset_map.meta, log_strs)
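The Z-maps computed in this pipeline are per-voxel standardisations of the dataset map against the mean map; in the 'uncertainty' variant the denominator is the estimated map uncertainty, and the 'adjusted+uncertainty' variant presumably also folds in the adjusted per-voxel spread. A schematic numpy sketch under that assumption (not the map_analyser implementation):

import numpy as np

def z_map(dataset_map, mean_map, map_uncertainty, sigma_adjusted=None):
    """Standardise a dataset map against the mean map.

    With sigma_adjusted=None this mirrors the 'uncertainty' mode; combining
    the two terms in quadrature is an assumption about what the
    'adjusted+uncertainty' mode does.
    """
    if sigma_adjusted is None:
        denom = map_uncertainty
    else:
        denom = np.sqrt(np.square(sigma_adjusted) + map_uncertainty ** 2)
    return (np.asarray(dataset_map) - np.asarray(mean_map)) / denom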
Example #5
    def mask_reference_grid(self, dataset, selection=None):
        """Create masks for the reference grid based on distances from atoms in the reference structure"""

        # ============================================================================>
        # Get main and neighbouring symmetry copies of the masking structure
        # ============================================================================>
        ref_h = dataset.model.hierarchy
        sym_h = dataset.model.crystal_contacts(distance_cutoff=self.outer_mask+5.0, combine_copies=True)
        # ============================================================================>
        # Apply mask (protein=default if selection is not given)
        # ============================================================================>
        if selection:
            ref_h = ref_h.select(ref_h.atom_selection_cache().selection(selection), copy_atoms=True)
        else:
            ref_h = protein(ref_h)
        # ============================================================================>
        # Always generate symmetry mask using all non-water atoms - TODO also allow custom definitions? TODO
        # ============================================================================>
        sym_h = non_water(sym_h)
        # ============================================================================>
        # Check that these contain atoms
        # ============================================================================>
        if len(ref_h.atoms()) == 0: raise Sorry('Zero atoms have been selected to mask the grid')
        if len(sym_h.atoms()) == 0: raise Sorry('Zero atoms have been selected to mask the grid')
        # ============================================================================>
        # Extract coordinates
        # ============================================================================>
        ref_sites_cart = dataset.model.alignment.nat2ref(ref_h.atoms().extract_xyz())
        sym_sites_cart = dataset.model.alignment.nat2ref(sym_h.atoms().extract_xyz())
        # ============================================================================>
        # Global mask used for removing points in the bulk solvent regions
        # ============================================================================>
        if self.grid.global_mask() is None:
            global_mask = AtomicMask(parent     = self.grid,
                                     sites_cart = ref_sites_cart,
                                     max_dist   = self.outer_mask,
                                     min_dist   = self.inner_mask)
            self.grid.set_global_mask(global_mask)
        # ============================================================================>
        # Global mask used for removing points close to symmetry copies of the protein
        # ============================================================================>
        if self.grid.symmetry_mask() is None:

            symmetry_mask = GridMask(parent     = self.grid,
                                     sites_cart = sym_sites_cart,
                                     max_dist   = self.outer_mask,
                                     min_dist   = self.inner_mask_symmetry)
            self.grid.set_symmetry_mask(symmetry_mask)
        # ============================================================================>
        # Write masked maps
        # ============================================================================>
        # # Write protein masked map
        # indices = self.grid.global_mask().total_mask_indices()
        # f_name = self.file_manager.get_file('reference_dataset').replace('.mtz','.totalmask.ccp4')
        # if self.args.output.developer.write_grid_frame_masks:
        #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'grid', position=-1), origin_shift=False)
        # if 1 or self.args.output.developer.write_reference_frame_common_masks_and_maps:
        #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'ref', position=-1), origin_shift=True)
        #
        # # Write symmetry masked map
        # indices = self.grid.symmetry_mask().total_mask_indices()
        # f_name = self.file_manager.get_file('reference_dataset').replace('.mtz','.symmask.ccp4')
        # if self.args.output.developer.write_grid_frame_masks:
        #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'grid', position=-1), origin_shift=False)
        # if 1 or self.args.output.developer.write_reference_frame_common_masks_and_maps:
        #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'ref', position=-1), origin_shift=True)

        return self.grid
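Conceptually, the AtomicMask/GridMask objects select grid points whose distance to the nearest masking atom lies between min_dist and max_dist. A small numpy/scipy sketch of that selection on a plain array of grid points (illustrative only; the real classes operate on the reference grid object):

import numpy as np
from scipy.spatial import cKDTree

def distance_band_mask(grid_points_cart, sites_cart, max_dist, min_dist):
    """Boolean mask: within max_dist of some site, but not closer than
    min_dist to any site."""
    nearest = cKDTree(np.asarray(sites_cart)).query(np.asarray(grid_points_cart), k=1)[0]
    return (nearest <= max_dist) & (nearest >= min_dist)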
Example #6
    def filter_z_clusters_3(self, z_clusters, dataset, max_contact_dist=8):
        """Find and remove symmetry equivalent clusters"""

        if len(z_clusters) == 1:
            return 1, z_clusters
        else:
            self.log('----------------------------------->>>')
            self.log('Filtering symmetry equivalent clusters')

        # Extract the protein sites in the reference frame
        d_sites_cart = protein(dataset.model.hierarchy).atoms().extract_xyz()
        d_unit_cell = dataset.model.unit_cell
        d_sym_ops = dataset.model.crystal_contact_operators()

        # Cartesianise and fractionalise the points in each of the clusters (in the crystallographic frame)
        points_cart = [None] * len(z_clusters)
        points_frac = [None] * len(z_clusters)
        for c_idx, (c_gps, c_val) in enumerate(z_clusters):
            # Extract points in cluster
            points_cart[c_idx] = dataset.model.alignment.ref2nat(
                self.grid.grid2cart(c_gps))
            # Fractionalise them to the unit cell of the dataset
            points_frac[c_idx] = d_unit_cell.fractionalize(points_cart[c_idx])
        # Find the sets of clusters that are symmetry related
        sym_equiv_groups = find_symmetry_equivalent_groups(
            points_frac=points_frac,
            sym_ops=d_sym_ops,
            unit_cell=d_unit_cell,
            cutoff_cart=1.05 * 1.7321 * self.grid_spacing)
        # max_contact_dist - a point contacts an atom if the atoms is within this distance of it
        # Save time - calculate the square of the contact distance
        max_contact_dist_sq = max_contact_dist**2
        # Iterate through and chose one from each group to keep
        filt_z_clusters = []
        for g_id, g_idxs in generate_group_idxs(sym_equiv_groups):
            # Count the number of contact for each cluster in the group
            c_contacts = []
            # Iterate through cluster in the group
            for c_idx in g_idxs:
                # Initialise contact counter
                contacts = 0
                # Get the cartesian points for the cluster
                c_points_cart = points_cart[c_idx]
                # Again, use the brute force all-v-all method
                for rp in d_sites_cart:
                    diffs_cart = c_points_cart - rp
                    # Check to see if site closer to cluster than minimum
                    if min(diffs_cart.dot()) < max_contact_dist_sq:
                        contacts += 1
                # Record the number of contacts (over size of cluster)
                c_contacts.append(1.0 * contacts / len(c_points_cart))
                # if self.log.verbose:
                #     print('CLUSTER:', c_idx, ', CONTACTS PER POINT:', round(c_contacts[-1], 3))

            # Find the cluster with the most contacts
            max_contacts = max(c_contacts)
            if max_contacts == 0:
                raise Exception('MAX CONTACTS IS 0!')
            else:
                cluster_to_keep = g_idxs[c_contacts.index(max_contacts)]
                filt_z_clusters.append(z_clusters[cluster_to_keep])
                # if self.log.verbose:
                #     print('KEEPING CLUSTER', cluster_to_keep)
        assert len(filt_z_clusters) == max(sym_equiv_groups), \
            'NUMBER OF UNIQUE GROUPS AND GROUPS TO BE RETURNED NOT THE SAME'

        self.log('Filtered {!s} Clusters to {!s} Clusters'.format(
            len(z_clusters), len(filt_z_clusters)))
        return len(filt_z_clusters), filt_z_clusters
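The cutoff passed to find_symmetry_equivalent_groups, 1.05 * 1.7321 * grid_spacing, is just over the body diagonal of one grid cell (sqrt(3) ≈ 1.7321), presumably so that clusters whose symmetry images land on adjacent grid points are still grouped together. Once group ids are assigned, the selection step keeps the member of each group with the most protein contacts per point; a compact sketch of that step with plain Python lists:

def pick_representatives(group_ids, contacts_per_point, clusters):
    """Keep one cluster per symmetry-equivalence group: the member with the
    highest number of protein contacts per cluster point."""
    best = {}  # group id -> (score, cluster index)
    for c_idx, (g_id, score) in enumerate(zip(group_ids, contacts_per_point)):
        if (g_id not in best) or (score > best[g_id][0]):
            best[g_id] = (score, c_idx)
    return [clusters[c_idx] for _, c_idx in best.values()]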
Example #7
def score_model(params, pdb1, mtz1, pdb2=None, mtz2=None, label_prefix='', verbose=False):
    """
    Score residues against density, and generate other model quality indicators.
    Identified residues in pdb1 are scored against mtz1 (and mtz2, if provided) using edstats.
    Identified residues in pdb1 are compared to the equivalent residues in pdb2, if provided.
    B-factors ratios of identified residues to surrounding sidechains are calculated.
    """

    if label_prefix: label_prefix = label_prefix + '-'

    # Extract the residues to look for
    res_names = params.selection.res_names_list

    print 'Reading input structure:', pdb1

    # Extract Structure
    h1_all = non_h(strip_pdb_to_input(pdb1, remove_ter=True, remove_end=True).hierarchy)
    # Normalise hierarchy (standardise atomic naming, etc...)
    sanitise_hierarchy(h1_all)
    h1_pro = protein(h1_all)
    h1_bck = backbone(h1_all)
    h1_sch = sidechains(h1_all)

    # Pull out residues to analyse
    if res_names:
        rg_for_analysis = [rg for rg in h1_all.residue_groups() if [n for n in rg.unique_resnames() if n in res_names]]
        print 'Selecting residues named {}: {} residue(s)'.format(' or '.join(res_names), len(rg_for_analysis))
    else:
        rg_for_analysis = h1_all.residue_groups()
        print 'Analysing all residues ({} residues)'.format(len(rg_for_analysis))

    # Check residues to analyse or skip
    if not rg_for_analysis:
        raise Exception('There are no residues called {} in {}'.format(' or '.join(params.selection.res_names_list), pdb1))

    # Extract PDB2
    if pdb2 is not None:
        print 'Reading input structure:', pdb2
        h2_all = non_h(strip_pdb_to_input(pdb2, remove_ter=True, remove_end=True).hierarchy)
        sanitise_hierarchy(h2_all)

    # Score MTZ1
    if mtz1 is not None:
        print 'Scoring model against mtz file'
        print 'Scoring {} >>> {}'.format(pdb1, mtz1)
        mtz1_edstats_scores = Edstats(mtz_file=mtz1, pdb_file=pdb1, f_label=params.input.f_label)
    else:
        mtz1_edstats_scores = None
    # Score MTZ2
    if mtz2 is not None:
        print 'Scoring model against mtz file'
        print 'Scoring {} >>> {}'.format(pdb1, mtz2)
        mtz2_edstats_scores = Edstats(mtz_file=mtz2, pdb_file=pdb1, f_label=params.input.f_label)
    else:
        mtz2_edstats_scores = None

    # Prepare output table
    data_table = prepare_table()

    for rg_sel in rg_for_analysis:

        # Create label for the output table
        #rg_label = (label_prefix+rg_sel.unique_resnames()[0]+'-'+rg_sel.parent().id+'-'+rg_sel.resseq+rg_sel.icode).replace(' ','')
        #rg_label = (label_prefix+rg_sel.parent().id+'-'+rg_sel.resseq+rg_sel.icode).replace(' ','')
        rg_label = ShortLabeller.format(rg_sel).replace(' ','')
        tab_label = label_prefix + rg_label

        if len(rg_sel.unique_resnames()) != 1:
            raise Exception(tab_label+': More than one residue name associated with residue group -- cannot process')

        # Append empty row to output table
        data_table.loc[tab_label] = None

        data_table.set_value(index = tab_label,
                             col   = 'PDB',
                             value = pdb1 )
        data_table.set_value(index = tab_label,
                             col   = 'Occupancy',
                             value = calculate_residue_group_occupancy(residue_group=rg_sel) )

        data_table = calculate_residue_group_bfactor_ratio(residue_group = rg_sel,
                                                           hierarchy     = h1_sch,
                                                           data_table    = data_table,
                                                           rg_label      = tab_label)

        if pdb2 is not None:
            data_table.set_value(index = tab_label,
                                 col   = 'PDB-2',
                                 value = pdb2 )

            # Extract the equivalent residue in pdb2
            rg_sel_2 = [rg for rg in h2_all.residue_groups() if ShortLabeller.format(rg).replace(' ','') == rg_label]

            assert rg_sel_2, 'Residue is not present in pdb file: {} not in {}'.format(rg_label, pdb2)
            assert len(rg_sel_2) == 1, 'More than one residue has been selected for {} in {}'.format(rg_label, pdb2)

            # Extract occupancy
            data_table.set_value(index = tab_label,
                                 col   = 'Occupancy-2',
                                 value = calculate_residue_group_occupancy(residue_group=rg_sel_2[0]) )

            # Calculate the RMSD between the models
            try:
                confs1, confs2, rmsds = zip(*calculate_paired_conformer_rmsds(conformers_1=rg_sel.conformers(), conformers_2=rg_sel_2[0].conformers()))
                data_table.set_value(index=tab_label, col='Model RMSD', value=min(rmsds))
            except:
                print 'Could not calculate RMSD between pdb_1 and pdb_2 for residue {}'.format(rg_label)
                raise

        # Extract Density Scores - MTZ 1
        if mtz1 is not None:
            data_table.set_value(index=tab_label, col='MTZ', value=mtz1)
        if mtz1_edstats_scores is not None:
            data_table = mtz1_edstats_scores.extract_residue_group_scores(  residue_group  = rg_sel,
                                                                            data_table     = data_table,
                                                                            rg_label       = tab_label )
            # Normalise the RSZO by the Occupancy of the ligand
            data_table['RSZO/OCC'] = data_table['RSZO']/data_table['Occupancy']

        # Extract Density Scores - MTZ 2
        if mtz2 is not None:
            data_table.set_value(index=tab_label, col='MTZ-2', value=mtz2)
        if mtz2_edstats_scores is not None:
            data_table = mtz2_edstats_scores.extract_residue_group_scores(  residue_group  = rg_sel,
                                                                            data_table     = data_table,
                                                                            rg_label       = tab_label,
                                                                            column_suffix  = '-2' )
            # Normalise the RSZO by the Occupancy of the ligand
            data_table['RSZO/OCC-2'] = data_table['RSZO-2']/data_table['Occupancy-2']

    return data_table
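The final normalisation in the loop is a plain column division in pandas; a minimal, self-contained illustration with made-up values (the column names mirror the ones used above):

import pandas as pd

data_table = pd.DataFrame({'RSZO': [1.2, 0.8], 'Occupancy': [1.0, 0.5]},
                          index=['LIG-A-1101', 'LIG-B-1201'])
# Divide the density score by the refined occupancy so that partially
# occupied ligands are not penalised for having weaker density
data_table['RSZO/OCC'] = data_table['RSZO'] / data_table['Occupancy']
print(data_table)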
Example #8
    def __call__(
        self,
        dataset,
        dataset_map,
        ref_map,
        events,
        grid,
    ):
        # ============================================================================>
        # Extract the map data in non-sparse format
        # ============================================================================>
        dset_map_data = dataset_map.get_map_data(sparse=False)
        ref_map_data = ref_map.get_map_data(sparse=False)
        # ============================================================================>
        # Unpack cluster
        # ============================================================================>
        event_stats = OrderedDict()
        for event in events[2]:
            # ============================================================================>
            # Estimate the background correction of the detected feature
            # ============================================================================>
            # Extract sites for this cluster and estimate the background correction for the event

            # Generate custom grid mask for this dataset
            event_mask = GridMask(
                parent=grid,
                sites_cart=grid.grid2cart(event.cluster.points,
                                          origin_shift=True),
                max_dist=2.0,
                min_dist=0.0,
            )

            # Select masks to define regions for bdc calculation
            exp_event_idxs = flex.size_t(event_mask.outer_mask_indices())
            reference_idxs = flex.size_t(
                grid.global_mask().inner_mask_indices())
            # ============================================================================>
            # Generate BDC-estimation curve and estimate BDC
            # ============================================================================>
            event_remains, event_corrs, global_corrs = calculate_varying_bdc_correlations(
                ref_map_data=ref_map_data,
                query_map_data=dset_map_data,
                feature_idxs=exp_event_idxs,
                reference_idxs=reference_idxs,
                min_remain=1.0 - self.max_bdc,
                max_remain=1.0 - self.min_bdc,
                bdc_increment=self.increment,
                verbose=True)
            event_remain_est = calculate_maximum_series_discrepancy(
                labels=event_remains,
                series_1=global_corrs,
                series_2=event_corrs)

            event_remain_est = min(event_remain_est * self.output_multiplier,
                                   1.0 - self.min_bdc)
            # ============================================================================>
            # Calculate the map correlations at the selected BDC
            # ============================================================================>
            event_map_data = calculate_bdc_subtracted_map(
                ref_map_data=ref_map_data,
                query_map_data=dset_map_data,
                bdc=1.0 - event_remain_est)
            global_corr = numpy.corrcoef(
                event_map_data.select(reference_idxs),
                ref_map_data.select(reference_idxs))[0, 1]
            local_corr = numpy.corrcoef(
                event_map_data.select(exp_event_idxs),
                ref_map_data.select(exp_event_idxs))[0, 1]
            # ============================================================================>
            # Update event parameters
            # ============================================================================>
            event.info.estimated_pseudo_occupancy = event_remain_est
            event.info.estimated_bdc = 1.0 - event_remain_est
            event.info.global_correlation = global_corr
            event.info.local_correlation = local_corr

            # ============================================================================>
            # Find the nearest atom to the event
            # ============================================================================>
            # TODO: restore this?
            atm = find_nearest_atoms(
                atoms=list(protein(dataset.model.hierarchy).atoms_with_labels()),
                query=dataset.model.alignment.ref2nat(
                    grid.grid2cart(sites_grid=[map(int, event.cluster.centroid)],
                                   origin_shift=True)))[0]

            event_stats[event.id] = OrderedDict()
            event_stats[event.id]["estimated_pseudo_occupancy"] = event_remain_est
            event_stats[event.id]["estimated_bdc"] = 1.0 - event_remain_est
            event_stats[event.id]["global_corr"] = global_corr
            event_stats[event.id]["local_corr"] = local_corr

        return event_stats
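calculate_bdc_subtracted_map can be read as removing a fraction BDC of the ground-state (reference) density and rescaling by the remaining fraction; a numpy sketch of that formula, offered as an assumption about the helper rather than its actual code:

import numpy as np

def bdc_subtracted_map(query_map_data, ref_map_data, bdc):
    """Event map: subtract a fraction `bdc` of the reference density and
    rescale by the remaining fraction (1 - bdc)."""
    remain = 1.0 - bdc
    return (np.asarray(query_map_data) - bdc * np.asarray(ref_map_data)) / remain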
Example #9
def align_structures_rigid(mov_hier, ref_hier):
    """Extract c-alpha sites from the structures and align"""
    lsq_rt, alignment_sites, reference_sites = align_chains_rigid(mov_chain=protein(mov_hier, copy=True).models()[0].only_chain(),
                                                                  ref_chain=protein(ref_hier, copy=True).models()[0].only_chain())
    return GlobalAlignment(alignment_mx=lsq_rt, alignment_sites=alignment_sites, reference_sites=reference_sites, id=None)
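align_chains_rigid boils down to a least-squares superposition of matched C-alpha coordinates. The standard Kabsch algorithm produces the same rotation and translation for two matched coordinate sets; a self-contained numpy sketch (not the cctbx implementation):

import numpy as np

def kabsch_fit(mov_sites, ref_sites):
    """Least-squares rotation R and translation t such that
    mov_sites @ R.T + t best matches ref_sites (both (N, 3) arrays)."""
    mov_c = mov_sites - mov_sites.mean(axis=0)
    ref_c = ref_sites - ref_sites.mean(axis=0)
    u, _, vt = np.linalg.svd(mov_c.T @ ref_c)
    # Guard against an improper rotation (reflection)
    d = np.sign(np.linalg.det(vt.T @ u.T))
    rot = vt.T @ np.diag([1.0, 1.0, d]) @ u.T
    trans = ref_sites.mean(axis=0) - rot @ mov_sites.mean(axis=0)
    return rot, trans

# Usage: aligned = mov_sites @ rot.T + trans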