def scale_all(self):
  t1 = time.time()
  self.read_all_mysql()
  self.millers = self.millers_mysql
  self.frames = self.frames_mysql
  self._frames = self._frames_mysql
  self.observations = self.observations_mysql
  self._observations = self._observations_mysql
  if self.params.model is None:
    self.n_accepted = len(self.frames["cc"])
    self.n_low_corr = 0
    # with no reference model every frame is accepted
    self.those_accepted = flex.bool(self.n_accepted, True)
  else:
    self.n_accepted = (self.frames["cc"] > self.params.min_corr).count(True)
    self.n_low_corr = (self.frames["cc"] > self.params.min_corr).count(False)
    self.those_accepted = (self.frames["cc"] > self.params.min_corr)
  statsy = flex.mean_and_variance(self.frames["cc"])
  print("%5d images, individual image correlation coefficients are %6.3f +/- %5.3f" % (
    len(self.frames["cc"]), statsy.mean(),
    statsy.unweighted_sample_standard_deviation()), file=self.log)
  if self.params.scaling.report_ML and "half_mosaicity_deg" in self.frames:
    mosaic = self.frames["half_mosaicity_deg"].select(self.those_accepted)
    Mstat = flex.mean_and_variance(mosaic)
    print("%5d images, half mosaicity is %6.3f +/- %5.3f degrees" % (
      len(mosaic), Mstat.mean(),
      Mstat.unweighted_sample_standard_deviation()), file=self.log)
    domain = self.frames["domain_size_ang"].select(self.those_accepted)
    Dstat = flex.mean_and_variance(domain)
    print("%5d images, domain size is %6.0f +/- %5.0f Angstroms" % (
      len(domain), Dstat.mean(),
      Dstat.unweighted_sample_standard_deviation()), file=self.log)
    invdomain = 1. / domain
    Dstat = flex.mean_and_variance(invdomain)
    print("%5d images, inverse domain size is %f +/- %f 1/Angstrom" % (
      len(domain), Dstat.mean(),
      Dstat.unweighted_sample_standard_deviation()), file=self.log)
    # approximate back-transform: reciprocals of the mean and sd of 1/domain
    print("%5d images, domain size is %6.0f +/- %5.0f Angstroms" % (
      len(domain), 1. / Dstat.mean(),
      1. / Dstat.unweighted_sample_standard_deviation()), file=self.log)
  t2 = time.time()
  print("", file=self.log)
  print("#" * 80, file=self.log)
  print("FINISHED MERGING", file=self.log)
  print("  Elapsed time: %.1fs" % (t2 - t1), file=self.log)
  print("  %d integration files were accepted" % self.n_accepted, file=self.log)
  print("  %d rejected due to poor correlation" % self.n_low_corr, file=self.log)
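# --- Example (not from the source): the mean +/- sigma logging pattern used
# above reduces to a few lines.  A minimal sketch assuming only scitbx is
# installed; the CC values are invented.
from scitbx.array_family import flex

cc = flex.double([0.91, 0.88, 0.95, 0.79])  # hypothetical per-image CCs
stats = flex.mean_and_variance(cc)
print("%5d images, CC %6.3f +/- %5.3f" % (
  len(cc), stats.mean(), stats.unweighted_sample_standard_deviation()))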
def print_table(self):
  from libtbx import table_utils
  from libtbx.str_utils import format_value
  table_header = ["Tile", "Dist", "Nobs", "aRmsd", "Rmsd", "delx", "dely",
                  "disp", "rotdeg", "Rsigma", "Tsigma"]
  table_data = []
  table_data.append(table_header)
  sort_radii = flex.sort_permutation(flex.double(self.radii))
  tile_rmsds = flex.double()
  radial_sigmas = flex.double(len(self.tiles) // 4)
  tangen_sigmas = flex.double(len(self.tiles) // 4)
  for idx in range(len(self.tiles) // 4):
    x = sort_radii[idx]
    if self.tilecounts[x] < 3:
      wtaveg = 0.0
      radial = (0, 0)
      tangential = (0, 0)
      rmean, tmean, rsigma, tsigma = (0, 0, 1, 1)
    else:
      wtaveg = self.weighted_average_angle_deg_from_tile(x)
      radial, tangential, rmean, tmean, rsigma, tsigma = \
        get_radial_tangential_vectors(self, x)
    radial_sigmas[x] = rsigma
    tangen_sigmas[x] = tsigma
    table_data.append([
      format_value("%3d", x),
      format_value("%7.2f", self.radii[x]),
      format_value("%6d", self.tilecounts[x]),
      format_value("%5.2f", self.asymmetric_tile_rmsd[x]),
      format_value("%5.2f", self.tile_rmsd[x]),
      format_value("%5.2f", self.mean_cv[x][0]),
      format_value("%5.2f", self.mean_cv[x][1]),
      format_value("%5.2f", matrix.col(self.mean_cv[x]).length()),
      format_value("%6.2f", wtaveg),
      format_value("%6.2f", rsigma),
      format_value("%6.2f", tsigma),
    ])
  table_data.append([""] * len(table_header))
  # tile-count-weighted means of the radial and tangential sigmas
  rstats = flex.mean_and_variance(radial_sigmas, self.tilecounts.as_double())
  tstats = flex.mean_and_variance(tangen_sigmas, self.tilecounts.as_double())
  table_data.append([
    format_value("%3s", "ALL"),
    format_value("%s", ""),
    format_value("%6d", self.overall_N),
    format_value("%5.2f", math.sqrt(flex.mean(self.delrsq))),
    format_value("%5.2f", self.overall_rmsd),
    format_value("%5.2f", self.overall_cv[0]),
    format_value("%5.2f", self.overall_cv[1]),
    format_value("%5.2f", flex.mean(
      flex.double([matrix.col(cv).length() for cv in self.mean_cv]))),
    format_value("%s", ""),
    format_value("%6.2f", rstats.mean()),
    format_value("%6.2f", tstats.mean()),
  ])
  print()
  print(table_utils.format(table_data, has_header=1, justify='center', delim=" "))
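# --- Example (not from the source): the two-argument form of
# flex.mean_and_variance, as used for rstats/tstats above, computes a
# weighted mean.  A minimal sketch with invented per-tile numbers.
from scitbx.array_family import flex

sigmas = flex.double([0.8, 1.2, 0.5])       # hypothetical per-tile sigmas
counts = flex.double([100.0, 30.0, 250.0])  # hypothetical observation counts (weights)
wstats = flex.mean_and_variance(sigmas, counts)
print("weighted mean sigma %6.2f" % wstats.mean())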
def load_data(self, chi_angles=None):
  # reorganize data from
  #   a list of all dihedrals for each atom_group per model
  # to
  #   atom_group -> list of grouped dihedral angles (e.g. all chi1 in one list)
  if (chi_angles is not None):
    n_models = len(chi_angles)
    n_groups = len(chi_angles[0]['id_str'])
    id_str = chi_angles[0]['id_str']
    xyz = chi_angles[0]['xyz']
    id_str_map = dict()
    all_angles = [list() for i in range(n_groups)]
    for i in range(n_groups):  # loop over atom_groups
      n_dihedrals = len(chi_angles[0]['chi_angles'][i])
      angles = [list() for j in range(n_dihedrals)]
      for j in range(n_models):
        dihedrals = chi_angles[j]['chi_angles'][i]
        for k in range(n_dihedrals):
          dihedral = dihedrals[k]
          if (dihedral is not None):
            angles[k].append(dihedral)
      # check if adding 360 to negative angles is helpful; this avoids
      # artificially large spreads when angles cluster only near 0 and 360
      for j in range(len(angles)):
        if (len(angles[j]) > 0):
          var_a = flex.mean_and_variance(
            flex.double(angles[j])).unweighted_sample_variance()
          dihedrals_b = flex.double(angles[j])
          for k in range(len(dihedrals_b)):
            if (dihedrals_b[k] < 0.0):
              dihedrals_b[k] += 360
          var_b = flex.mean_and_variance(
            dihedrals_b).unweighted_sample_variance()
          # keep the shifted representation if it has the smaller variance
          if ((var_b < var_a) or (max(angles[j]) < 0.0)):
            angles[j] = list(dihedrals_b)
      # store reorganized data
      all_angles[i] = angles
      # build map matching id_str with row index for fast random access
      id_str_map[id_str[i]] = i
    self.chi_angles = {
      'id_str': id_str,
      'id_str_map': id_str_map,
      'xyz': xyz,
      'values': all_angles
    }
    self.meets_threshold = [
      False for i in range(len(self.chi_angles['id_str']))
    ]
  self.UpdateTable()
  self.UpdatePlot()
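# --- Example (not from the source): the +360-degree wrap test above is
# easiest to see on a toy sample straddling the 0/360 seam; the angles are
# invented.
from scitbx.array_family import flex

angles = flex.double([-179.0, -178.5, 179.0, 178.0])  # hypothetical chi values
var_raw = flex.mean_and_variance(angles).unweighted_sample_variance()
shifted = flex.double([a + 360 if a < 0.0 else a for a in angles])
var_shifted = flex.mean_and_variance(shifted).unweighted_sample_variance()
assert var_shifted < var_raw  # the shifted representation is the tighter one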
def exercise_optimise_shelxl_weights():
  def calc_goof(fo2, fc, w, k, n_params):
    fc2 = fc.as_intensity_array()
    w = w(fo2.data(), fo2.sigmas(), fc2.data(), k)
    return math.sqrt(
      flex.sum(w * flex.pow2(fo2.data() - k * fc2.data()))
      / (fo2.size() - n_params))
  xs = smtbx.development.sucrose()
  k = 0.05 + 10 * flex.random_double()
  fc = xs.structure_factors(anomalous_flag=False, d_min=0.7).f_calc()
  fo = fc.as_amplitude_array()
  fo = fo.customized_copy(data=fo.data() * math.sqrt(k))
  fo = fo.customized_copy(sigmas=0.03 * fo.data())
  sigmas = fo.sigmas()
  for i in range(fo.size()):
    fo.data()[i] += 2 * scitbx.random.variate(
      scitbx.random.normal_distribution(sigma=sigmas[i]))() \
      + 0.5 * random.random()
  fo2 = fo.as_intensity_array()
  fc2 = fc.as_intensity_array()
  w = least_squares.mainstream_shelx_weighting(a=0.1)
  s = calc_goof(fo2, fc, w, k, xs.n_parameters())
  w2 = w.optimise_parameters(fo2, fc2, k, xs.n_parameters())
  s2 = calc_goof(fo2, fc, w2, k, xs.n_parameters())
  # sort data and setup binning by fc/fc_max
  fc_sq = fc.as_intensity_array()
  fc_sq_over_fc_sq_max = fc_sq.data() / flex.max(fc_sq.data())
  permutation = flex.sort_permutation(fc_sq_over_fc_sq_max)
  fc_sq_over_fc_sq_max = fc_sq.customized_copy(
    data=fc_sq_over_fc_sq_max).select(permutation)
  fc_sq = fc_sq.select(permutation)
  fo_sq = fo2.select(permutation)
  n_bins = 10
  bin_limits = flex.size_t(1, 0)
  bin_count = flex.size_t()
  for i in range(n_bins):
    bin_limits.append(int(math.ceil((i + 1) * fc_sq.size() / n_bins)))
    bin_count.append(bin_limits[i + 1] - bin_limits[i])
  goofs_w = flex.double()
  goofs_w2 = flex.double()
  for i_bin in range(n_bins):
    sel = flex.size_t_range(bin_limits[i_bin], bin_limits[i_bin + 1])
    goofs_w2.append(calc_goof(fo_sq.select(sel), fc_sq.select(sel),
                              w2, k, xs.n_parameters()))
    goofs_w.append(calc_goof(fo_sq.select(sel), fc_sq.select(sel),
                             w, k, xs.n_parameters()))
  a = flex.mean_and_variance(goofs_w).unweighted_sample_variance()
  b = flex.mean_and_variance(goofs_w2).unweighted_sample_variance()
  assert a > b                     # flat analysis of variance
  assert abs(1 - s) > abs(1 - s2)  # GooF close to 1
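# --- Example (not from the source): calc_goof above evaluates the SHELX
# goodness of fit, GooF = sqrt(sum(w*(Fo^2 - k*Fc^2)^2) / (n - p)).  A plain
# arithmetic sketch with invented numbers, no cctbx required.
import math

fo2 = [10.2, 4.8, 7.5]   # hypothetical observed intensities
fc2 = [10.0, 5.0, 7.0]   # hypothetical calculated intensities
w = [1.0, 0.8, 1.2]      # hypothetical weights
k, n_params = 1.0, 1
goof = math.sqrt(sum(wi * (o - k * c) ** 2
                     for wi, o, c in zip(w, fo2, fc2)) / (len(fo2) - n_params))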
def calculate_solvent_mask(self):
  # calculate mask
  lsd = local_standard_deviation_map(
    self.map_coeffs,
    self.radius,
    mean_solvent_density=self.mean_solvent_density,
    symmetry_flags=maptbx.use_space_group_symmetry,
    resolution_factor=self.params.grid_resolution_factor,
    method=2)
  self.rms_map = lsd.map
  self.mask = lsd.mask(self.params.solvent_fraction)
  # setup solvent/protein selections
  self.solvent_selection = (self.mask == 1)
  self.protein_selection = (self.mask == 0)
  self.solvent_iselection = self.solvent_selection.iselection()
  self.protein_iselection = self.protein_selection.iselection()
  self.n_solvent_grid_points = self.mask.count(1)
  self.n_protein_grid_points = self.mask.count(0)
  # map statistics
  self.mean_protein_density = self.mean_protein_density_start = flex.mean(
    self.map.select(self.protein_iselection))
  self.mean_solvent_density = self.mean_solvent_density_start = flex.mean(
    self.map.select(self.solvent_iselection))
  self.mask_percent = self.n_solvent_grid_points / self.mask.size() * 100
  self.f000_over_v = (((1 / self.params.protein_solvent_ratio)
                       * self.mean_protein_density)
                      - self.mean_solvent_density) \
    * (self.params.protein_solvent_ratio
       / (self.params.protein_solvent_ratio - 1))
  self.rms_protein_density = rms(self.map.select(self.protein_iselection))
  self.rms_solvent_density = rms(self.map.select(self.solvent_iselection))
  self.standard_deviation_local_rms = flex.mean_and_variance(
    lsd.map.as_1d()).unweighted_sample_standard_deviation()
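# --- Example (not from the source): the f000_over_v expression above,
# spelled out with invented numbers; r stands in for protein_solvent_ratio.
r = 1.31              # hypothetical protein:solvent density ratio
mean_protein = 0.40   # hypothetical mean protein density
mean_solvent = 0.10   # hypothetical mean solvent density
f000_over_v = ((1 / r) * mean_protein - mean_solvent) * (r / (r - 1))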
def UpdateTable(self, event=None):
  '''
  Construct table of residues that satisfy threshold
  '''
  # check threshold
  threshold = self.threshold_control.GetValue()
  threshold_error = \
    'Please enter a fraction, between 0.0 and 1.0, for the threshold.'
  try:
    threshold = float(threshold)
  except ValueError:
    raise Sorry(threshold_error)
  if ((threshold < 0.0) or (threshold > 1.0)):
    raise Sorry(threshold_error)
  threshold = 1.0 - threshold
  # check if dihedrals lie within 2 standard deviations of the mean.
  # for Gaussian distributions, 95% of the sample is within 2 standard
  # deviations of the mean (default threshold value)
  # set atom_group to be displayed if fraction is below threshold.
  # actual check is n_outliers/n_total > (1 - input_threshold)
  for i in range(len(self.chi_angles['id_str'])):  # loop over model
    self.meets_threshold[i] = False
    for j in range(len(self.chi_angles['values'][i])):  # loop over group
      dihedrals = flex.double(self.chi_angles['values'][i][j])
      if (len(dihedrals) > 0):
        mean_stddev = flex.mean_and_variance(dihedrals)
        mean = mean_stddev.mean()
        stddev = mean_stddev.unweighted_sample_standard_deviation()
        n_outliers = 0
        chi_min = mean - 2.0 * stddev
        chi_max = mean + 2.0 * stddev
        for k in range(len(dihedrals)):
          if ((dihedrals[k] < chi_min) or (dihedrals[k] > chi_max)):
            n_outliers += 1
        is_outlier = (float(n_outliers) / len(dihedrals) > threshold)
        if (is_outlier):
          self.meets_threshold[i] = True
          break
  # refresh table
  table_data = list()
  for i in range(len(self.meets_threshold)):
    if (self.meets_threshold[i]):
      row = ['---' for j in range(8)]
      row[0] = self.chi_angles['id_str'][i]
      row[6] = None                       # sel_str for model viewer
      row[7] = self.chi_angles['xyz'][i]  # xyz for model viewer
      for j in range(len(self.chi_angles['values'][i])):
        chi = self.chi_angles['values'][i][j]
        if ((None not in chi) and (len(chi) > 0)):
          row[j + 1] = flex.mean(flex.double(chi))
      table_data.append(row)
  self.table.ReloadData(table_data)
  self.table.Refresh()
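# --- Example (not from the source): the 2-sigma outlier-fraction test above,
# on an invented chi sample.
from scitbx.array_family import flex

dihedrals = flex.double([60.0, 62.0, 58.0, 61.0, 170.0])  # hypothetical chi1 values
mv = flex.mean_and_variance(dihedrals)
mean = mv.mean()
stddev = mv.unweighted_sample_standard_deviation()
n_outliers = sum(1 for d in dihedrals
                 if d < mean - 2.0 * stddev or d > mean + 2.0 * stddev)
fraction = float(n_outliers) / len(dihedrals)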
def show_summary(self):
  w = flex.double([e.beam.get_wavelength() for e in self.experiments])
  stats = flex.mean_and_variance(w)
  print("Wavelength mean and standard deviation:", stats.mean(),
        stats.unweighted_sample_standard_deviation())
  uc = [e.crystal.get_unit_cell().parameters() for e in self.experiments]
  a = flex.double([u[0] for u in uc])
  stats = flex.mean_and_variance(a)
  print("Unit cell a mean and standard deviation:", stats.mean(),
        stats.unweighted_sample_standard_deviation())
  b = flex.double([u[1] for u in uc])
  stats = flex.mean_and_variance(b)
  print("Unit cell b mean and standard deviation:", stats.mean(),
        stats.unweighted_sample_standard_deviation())
  c = flex.double([u[2] for u in uc])
  stats = flex.mean_and_variance(c)
  print("Unit cell c mean and standard deviation:", stats.mean(),
        stats.unweighted_sample_standard_deviation())
  d = flex.double([e.crystal.domain_size for e in self.experiments])
  stats = flex.mean_and_variance(d)
  # NOTE XXX FIXME: cxi.index seems to record the half-domain size;
  # report here the full domain size
  print("Domain size mean and standard deviation:", 2. * stats.mean(),
        2. * stats.unweighted_sample_standard_deviation())
def compute_functional_and_gradients(self):
  """The compute_functional_and_gradients() function

  @return Two-tuple of the value of the functional, and an
          <code>n</code>-long vector with the values of the
          gradients at the current position
  """
  #from libtbx.development.timers import Profiler
  from xfel import compute_functional_and_gradients
  n_frames = len(self._subset)
  #p = Profiler("compute_functional_and_gradients [C++]")
  (f, g) = compute_functional_and_gradients(
    self.x, self.w, n_frames, self._observations)
  #del p
  # XXX Only output this every 100 iterations or so.
  scales = self.x[0:len(self._subset)]
  stats = flex.mean_and_variance(scales)
  print("* f =% 10.4e, g =% f+/-%f" % (
    math.sqrt(f), stats.mean(),
    stats.unweighted_sample_standard_deviation()))
  # Warn if there are non-positive per-frame scaling factors.
  scales_non_positive = scales.select(scales <= 1e-6)  # XXX Or just zero!
  if len(scales_non_positive) > 0:
    stats = flex.mean_and_variance(scales_non_positive)
    if len(scales_non_positive) > 1:
      sigma = stats.unweighted_sample_standard_deviation()
    else:
      sigma = 0
    print("Have %d non-positive per-frame scaling factors: "
          "%f+/-%f [%f, %f]" % (
            len(scales_non_positive), stats.mean(), sigma,
            flex.min(scales_non_positive), flex.max(scales_non_positive)))
  return (f, g)
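# --- Example (not from the source): the guard above against a one-element
# sample, whose sample standard deviation is undefined.  Values are invented.
from scitbx.array_family import flex

scales = flex.double([1.2, 0.9, -0.1])  # hypothetical per-frame scale factors
bad = scales.select(scales <= 1e-6)
if len(bad) > 1:
  sigma = flex.mean_and_variance(bad).unweighted_sample_standard_deviation()
elif len(bad) == 1:
  sigma = 0  # cannot estimate a spread from a single value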
def _show_each(edges):
  for edge, ref_edge, label in zip(edges, ref_edges, labels):
    h = flex.histogram(edge, n_slots=n_slots)
    smin, smax = flex.min(edge), flex.max(edge)
    stats = flex.mean_and_variance(edge)
    print(" %s edge" % label, file=out)
    print(" range: %6.2f - %.2f" % (smin, smax), file=out)
    print(" mean: %6.2f +/- %6.2f on N = %d" % (
      stats.mean(), stats.unweighted_sample_standard_deviation(),
      edge.size()), file=out)
    print(" reference: %6.2f" % ref_edge, file=out)
    h.show(f=out, prefix=" ", format_cutoffs="%6.2f")
    print("", file=out)
def get_gaussian_rho(Dij, d_c):
  # Gaussian-kernel local density: rho_i = sum_j exp(-z_ij^2), where z_ij is
  # the z-score of the pairwise distance Dij[i, j] over the whole matrix.
  # d_c is accepted for interface parity but not used by this kernel.
  NN = Dij.focus()[0]
  rho = flex.double(NN)
  mu = flex.mean(Dij.as_1d())
  sigma = flex.mean_and_variance(
    Dij.as_1d()).unweighted_sample_standard_deviation()
  for i in range(NN):
    for j in range(NN):
      z = (Dij[i * NN + j] - mu) / sigma
      rho[i] += math.exp(-z * z)
  return rho
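# --- Example (not from the source): the same Gaussian-kernel density written
# with numpy, assuming a square distance matrix D; ddof=1 matches the
# unweighted sample standard deviation used above.
import numpy as np

def gaussian_rho_np(D):
  z = (D - D.mean()) / D.std(ddof=1)
  return np.exp(-z * z).sum(axis=1)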
def __init__(self, datadir, work_params, plot=False, esd_plot=False,
             half_data_flag=0):
  casetag = work_params.output.prefix
  # read the ground truth values back in
  import pickle
  # it is assumed (for now) that the reference millers contain a complete
  # asymmetric unit of indices, within the (d_max,d_min) region of interest
  # and possibly outside the region.
  reference_millers = pickle.load(
    open(os.path.join(datadir, casetag + "_miller.pickle"), "rb"))
  experiment_manager = read_experiments(work_params)
  obs = pickle.load(
    open(os.path.join(datadir, casetag + "_observation.pickle"), "rb"))
  print("Read in %d observations" % (len(obs["observed_intensity"])))
  reference_millers.show_summary(prefix="Miller index file ")
  print(len(obs["frame_lookup"]), len(obs["observed_intensity"]),
        flex.max(obs['miller_lookup']), flex.max(obs['frame_lookup']))
  max_frameno = flex.max(obs["frame_lookup"])
  from iotbx import mtz
  mtz_object = mtz.object(file_name=work_params.scaling.mtz_file)
  #for array in mtz_object.as_miller_arrays():
  #  this_label = array.info().label_string()
  #  print(this_label, array.observation_type())
  I_sim = mtz_object.as_miller_arrays()[0].as_intensity_array()
  I_sim.show_summary()
  MODEL_REINDEX_OP = work_params.model_reindex_op
  I_sim = I_sim.change_basis(MODEL_REINDEX_OP).map_to_asu()
  # match up isomorphous (the simulated fake F's) with experimental unique set
  matches = miller.match_multi_indices(
    miller_indices_unique=reference_millers.indices(),
    miller_indices=I_sim.indices())
  print("original unique", len(reference_millers.indices()))
  print("isomorphous set", len(I_sim.indices()))
  print("pairs", len(matches.pairs()))
  iso_data = flex.double(len(reference_millers.indices()))
  for pair in matches.pairs():
    iso_data[pair[0]] = I_sim.data()[pair[1]]
  reference_data = miller.array(miller_set=reference_millers, data=iso_data)
  reference_data.set_observation_type_xray_intensity()
  FOBS = prepare_observations_for_scaling(
    work_params, obs=obs, reference_intensities=reference_data,
    files=experiment_manager.get_files(), half_data_flag=half_data_flag)
  I, I_visited, G, G_visited = I_and_G_base_estimate(FOBS, params=work_params)
  print("I length", len(I), "G length", len(G),
        "(Reference set; entire asymmetric unit)")
  assert len(reference_data.data()) == len(I)
  # presumably these assertions fail when half data are taken for CC1/2
  # or d_min is cut
  model_I = reference_data.data()[0:len(I)]
  T = Timer("%d frames" % (len(G),))
  mapper = mapper_factory(xscale6e)
  minimizer = mapper(I, G, I_visited, G_visited, FOBS, params=work_params,
                     experiments=experiment_manager.get_experiments())
  del T
  minimizer.show_summary()
  Fit = minimizer.e_unpack()
  Gstats = flex.mean_and_variance(Fit["G"].select(G_visited == 1))
  print("G mean and standard deviation:", Gstats.mean(),
        Gstats.unweighted_sample_standard_deviation())
  if "Bfactor" in work_params.levmar.parameter_flags:
    Bstats = flex.mean_and_variance(Fit["B"].select(G_visited == 1))
    print("B mean and standard deviation:", Bstats.mean(),
          Bstats.unweighted_sample_standard_deviation())
  show_correlation(Fit["I"], model_I, I_visited, "Correlation of I:")
  Fit_stddev = minimizer.e_unpack_stddev()
  # XXX FIXME known bug: the length of Fit["G"] could be smaller than the
  # length of experiment_manager.get_files().  Not sure if this has any
  # operational drawbacks.  It's a result of half-dataset selection.
  if plot:
    plot_it(Fit["I"], model_I, mode="I")
  if "Rxy" in work_params.levmar.parameter_flags:
    show_histogram(Fit["Ax"], "Histogram of x rotation (degrees)")
    show_histogram(Fit["Ay"], "Histogram of y rotation (degrees)")
  print()
  if esd_plot:
    minimizer.esd_plot()
  from cctbx.examples.merging.show_results import show_overall_observations
  table1, self.n_bins, self.d_min = show_overall_observations(
    Fit["I"], Fit_stddev["I"], I_visited, reference_data, FOBS,
    title="Statistics for all reflections", work_params=work_params)
  self.FSIM = FOBS
  self.ordered_intensities = reference_data
  self.reference_millers = reference_millers
  self.Fit_I = Fit["I"]
  self.Fit_I_stddev = Fit_stddev["I"]
  self.I_visited = I_visited
  self.Fit = Fit
  self.experiments = experiment_manager
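# --- Example (not from the source): the Gstats pattern above, selecting a
# flagged subset before taking mean and sigma; flags and values are invented.
from scitbx.array_family import flex

G = flex.double([1.0, 0.0, 1.4, 0.8])  # hypothetical per-frame scale factors
G_visited = flex.int([1, 0, 1, 1])     # hypothetical refinement-visit flags
stats = flex.mean_and_variance(G.select(G_visited == 1))
print("G mean and standard deviation:", stats.mean(),
      stats.unweighted_sample_standard_deviation())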
def run_correction_vector_plot(working_phil):
  L = lines(working_phil)
  for line in L.vectors():
    pass  # pull out the information; the lines class does all the work
  close_x = flex.double()
  close_y = flex.double()
  far_x = flex.double()
  far_y = flex.double()
  master_coords = L.master_coords
  master_cv = L.master_cv
  master_tiles = L.master_tiles
  for idx in range(0, len(master_coords), 10):
    if matrix.col(master_cv[idx]).length() < L.tile_rmsd[master_tiles[idx]]:
      pass
      #close_x.append(master_coords[idx][0])
      #close_y.append(master_coords[idx][1])
    else:
      far_x.append(master_coords[idx][0])
      far_y.append(master_coords[idx][1])
      close_x.append(master_coords[idx][0] + master_cv[idx][0])
      close_y.append(master_coords[idx][1] + master_cv[idx][1])
  if working_phil.show_plots is True:
    from matplotlib import pyplot as plt
    plt.plot(close_x, close_y, "r.")
    plt.plot(far_x, far_y, "g.")
    plt.axes().set_aspect("equal")
    plt.show()
  sort_radii = flex.sort_permutation(flex.double(L.radii))
  tile_rmsds = flex.double()
  radial_sigmas = flex.double(64)
  tangen_sigmas = flex.double(64)
  for idx in range(64):
    x = sort_radii[idx]
    print("Tile %2d: radius %7.2f, %6d observations, delx %5.2f dely %5.2f, rmsd = %5.2f" % (
      x, L.radii[x], L.tilecounts[x], L.mean_cv[x][0], L.mean_cv[x][1],
      L.tile_rmsd[x]), end=' ')
    if L.tilecounts[x] < 3:
      print()
      radial = (0, 0)
      tangential = (0, 0)
      rmean, tmean, rsigma, tsigma = (0, 0, 1, 1)
    else:
      wtaveg = L.weighted_average_angle_deg_from_tile(x)
      print("Tile rotation %6.2f deg" % wtaveg, end=' ')
      radial, tangential, rmean, tmean, rsigma, tsigma = \
        get_radial_tangential_vectors(L, x)
      print("%6.2f %6.2f" % (rsigma, tsigma))
    radial_sigmas[x] = rsigma
    tangen_sigmas[x] = tsigma
  # tile-count-weighted means of the per-tile sigmas
  rstats = flex.mean_and_variance(radial_sigmas, L.tilecounts.as_double())
  tstats = flex.mean_and_variance(tangen_sigmas, L.tilecounts.as_double())
  print("\nOverall %8d observations, delx %5.2f dely %5.2f, rmsd = %5.2f" % (
    L.overall_N, L.overall_cv[0], L.overall_cv[1], L.overall_rmsd))
  print("Average tile rmsd %5.2f" % flex.mean(flex.double(L.tile_rmsd)))
  print("Average tile displacement %5.2f" % (flex.mean(
    flex.double([matrix.col(cv).length() for cv in L.mean_cv]))))
  print("Weighted average radial sigma %6.2f" % rstats.mean())
  print("Weighted average tangential sigma %6.2f" % tstats.mean())
  if working_phil.show_plots is True:
    plt.plot([(L.tiles[4 * x + 0] + L.tiles[4 * x + 2]) / 2. for x in range(64)],
             [(L.tiles[4 * x + 1] + L.tiles[4 * x + 3]) / 2. for x in range(64)],
             "go")
    for x in range(64):
      plt.text(10 + (L.tiles[4 * x + 0] + L.tiles[4 * x + 2]) / 2.,
               10 + (L.tiles[4 * x + 1] + L.tiles[4 * x + 3]) / 2., "%d" % x)
    plt.show()
  for idx in range(64):
    x = sort_radii[idx]
    print("Tile %2d: radius %7.2f, %6d observations, delx %5.2f dely %5.2f, rmsd = %5.2f" % (
      x, L.radii[x], L.tilecounts[x], L.mean_cv[x][0], L.mean_cv[x][1],
      L.tile_rmsd[x]), end=' ')
    if L.tilecounts[x] < 3:
      print()
      radial = (0, 0)
      tangential = (0, 0)
      rmean, tmean, rsigma, tsigma = (0, 0, 1, 1)
    else:
      wtaveg = L.weighted_average_angle_deg_from_tile(x)
      print("Tile rotation %6.2f deg" % wtaveg, end=' ')
      radial, tangential, rmean, tmean, rsigma, tsigma = \
        get_radial_tangential_vectors(L, x)
      print("%6.2f %6.2f" % (rsigma, tsigma))
    if working_phil.colormap:
      from pylab import imshow, axes, colorbar, show
      import numpy
      xcv, ycv = get_correction_vector_xy(L, x)
      _min = min(min(xcv), min(ycv))
      _max = max(max(xcv), max(ycv))
      hist, xedges, yedges = numpy.histogram2d(
        xcv, ycv, bins=40, range=[[_min, _max], [_min, _max]])
      extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
      imshow(hist.T, extent=extent, interpolation='nearest', origin='lower')
      from matplotlib.patches import Ellipse
      ell = Ellipse(xy=(L.mean_cv[x][0], L.mean_cv[x][1]),
                    width=2. * rsigma, height=2. * tsigma,
                    angle=math.atan2(-(radial[1]), -(radial[0])) * 180. / math.pi,
                    edgecolor="y", linewidth=2, fill=False, zorder=100)
      axes().add_artist(ell)
      colorbar()
      show()
    else:
      from matplotlib import pyplot as plt
      xcv, ycv = get_correction_vector_xy(L, x)
      if len(xcv) == 0 or len(ycv) == 0:
        continue
      plt.plot(xcv, ycv, "r.")
      plt.plot([L.mean_cv[x][0]], [L.mean_cv[x][1]], "go")
      plt.plot([L.mean_cv[x][0] + radial[0]],
               [L.mean_cv[x][1] + radial[1]], "yo")
      plt.plot([L.mean_cv[x][0] + tangential[0]],
               [L.mean_cv[x][1] + tangential[1]], "bo")
      from matplotlib.patches import Ellipse
      ell = Ellipse(xy=(L.mean_cv[x][0], L.mean_cv[x][1]),
                    width=2. * rsigma, height=2. * tsigma,
                    angle=math.atan2(-(radial[1]), -(radial[0])) * 180. / math.pi,
                    edgecolor="y", linewidth=2, fill=False, zorder=100)
      plt.axes().add_artist(ell)
      plt.axes().set_aspect("equal")
      _min = min(min(xcv), min(ycv))
      _max = max(max(xcv), max(ycv))
      plt.axes().set_xlim(_min, _max)
      plt.axes().set_ylim(_min, _max)
      plt.show()
def _normalised_delta_cc_i(self):
  mav = flex.mean_and_variance(self.delta_cc)
  return (self.delta_cc - mav.mean()) / \
    mav.unweighted_sample_standard_deviation()
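# --- Example (not from the source): the method above is a plain z-score
# normalisation; a sketch on invented delta-CC values.
from scitbx.array_family import flex

delta_cc = flex.double([0.01, -0.02, 0.00, 0.05])  # hypothetical per-dataset values
mav = flex.mean_and_variance(delta_cc)
z = (delta_cc - mav.mean()) / mav.unweighted_sample_standard_deviation()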
def __init__(self, **kwargs):
  group_args.__init__(self, **kwargs)
  print('finished Dij, now calculating rho_i and density')
  from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
  R = RL(distance_matrix=self.Dij, d_c=self.d_c)
  #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
  #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
  if hasattr(self, 'strategy') is False:
    self.strategy = 'default'
  self.rho = rho = R.get_rho()
  ave_rho = flex.mean(rho.as_double())
  NN = self.Dij.focus()[0]
  i_max = flex.max_index(rho)
  delta_i_max = flex.max(flex.double([self.Dij[i_max, j] for j in range(NN)]))
  rho_order = flex.sort_permutation(rho, reverse=True)
  rho_order_list = list(rho_order)
  self.delta = delta = R.get_delta(rho_order=rho_order,
                                   delta_i_max=delta_i_max)
  cluster_id = flex.int(NN, -1)  # -1 means no cluster
  delta_order = flex.sort_permutation(delta, reverse=True)
  MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
  n_cluster = 0
  print('Z_DELTA = ', self.Z_delta)
  pick_top_solution = False
  rho_stdev = flex.mean_and_variance(
    rho.as_double()).unweighted_sample_standard_deviation()
  delta_stdev = flex.mean_and_variance(
    delta).unweighted_sample_standard_deviation()
  if rho_stdev != 0.0 and delta_stdev != 0:
    rho_z = (rho.as_double() - flex.mean(rho.as_double())) / rho_stdev
    delta_z = (delta - flex.mean(delta)) / delta_stdev
  else:
    pick_top_solution = True
    if rho_stdev == 0.0:
      centroids = [flex.first_index(delta, flex.max(delta))]
    elif delta_stdev == 0.0:
      centroids = [flex.first_index(rho, flex.max(rho))]
  significant_delta = []
  significant_rho = []
  # Define strategy to decide cluster center here.  Only one should be true.
  debug_fix_clustering = True
  strategy2 = strategy3 = False  # guard: flags must exist even when not selected
  if self.strategy == 'one_cluster':
    debug_fix_clustering = False
    strategy2 = True
  if self.strategy == 'strategy_3':
    debug_fix_clustering = False
    strategy3 = True
    strategy2 = False
  if debug_fix_clustering:
    if not pick_top_solution:
      delta_z_cutoff = min(1.0, max(delta_z))
      rho_z_cutoff = min(1.0, max(rho_z))
      for ic in range(NN):  # test the density & rho
        if delta_z[ic] >= delta_z_cutoff or delta_z[ic] <= -delta_z_cutoff:
          significant_delta.append(ic)
        if rho_z[ic] >= rho_z_cutoff or rho_z[ic] <= -rho_z_cutoff:
          significant_rho.append(ic)
      if True:
        # Use idea quoted in the Rodriguez-Laio 2014 paper:
        # "Thus, cluster centers are recognized as points for which the
        # value of delta is anomalously large."
        centroid_candidates = list(significant_delta)
        candidate_delta_z = flex.double()
        for ic in centroid_candidates:
          if ic == rho_order[0]:
            delta_z_of_rho_order_0 = delta_z[ic]
          candidate_delta_z.append(delta_z[ic])
        i_sorted = flex.sort_permutation(candidate_delta_z, reverse=True)
        # Check that, once sorted, the top one is not equal to the 2nd or 3rd
        # position; if there is a tie, assign the centroid to the first one
        # in rho order
        centroids = []
        # rho_order[0] has to be a centroid
        centroids.append(rho_order[0])
        #centroids.append(centroid_candidates[i_sorted[0]])
        for i in range(len(i_sorted)):
          if centroid_candidates[i_sorted[i]] == rho_order[0]:
            continue
          if delta_z_of_rho_order_0 - candidate_delta_z[i_sorted[i]] > 1.0:
            if i > 1:
              if (-candidate_delta_z[i_sorted[i - 1]]
                  + candidate_delta_z[i_sorted[0]]) > 1.0:
                centroids.append(centroid_candidates[i_sorted[i]])
            else:
              centroids.append(centroid_candidates[i_sorted[i]])
          else:
            break
      if False:
        centroid_candidates = list(
          set(significant_delta).intersection(set(significant_rho)))
        # Compare the relative orders of the max delta_z and max rho_z
        # to make sure they are within 1 stdev
        centroids = []
        max_delta_z_candidates = -999.9
        max_rho_z_candidates = -999.9
        for ic in centroid_candidates:
          if delta_z[ic] > max_delta_z_candidates:
            max_delta_z_candidates = delta_z[ic]
          if rho_z[ic] > max_rho_z_candidates:
            max_rho_z_candidates = rho_z[ic]
        for ic in centroid_candidates:
          if (max_delta_z_candidates - delta_z[ic] < 1.0 and
              max_rho_z_candidates - rho_z[ic] < 1.0):
            centroids.append(ic)
    #item_idxs = [delta_order[ic] for ic,centroid in enumerate(centroids)]
    item_idxs = centroids
    for item_idx in item_idxs:
      cluster_id[item_idx] = n_cluster
      print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
      n_cluster += 1
    ####
  elif strategy2:
    # Go through the list of points and see which one has the highest joint
    # rank in both the rho and delta lists.  This will only assign one
    # cluster center, based on the highest product of rho and delta ranks.
    product_list_of_ranks = []
    for ic in range(NN):
      rho_tmp = self.rho[ic]
      delta_tmp = self.delta[ic]
      product_list_of_ranks.append(rho_tmp * delta_tmp)
    import numpy as np
    item_idx = np.argmax(product_list_of_ranks)
    cluster_id[item_idx] = n_cluster  # only cluster assigned
    print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
    n_cluster += 1
  elif strategy3:
    # use the product of delta and rho and pick out the top candidates;
    # a significance z-score filters out the very best
    product_list_of_ranks = flex.double()
    for ic in range(NN):
      rho_tmp = self.rho[ic]
      delta_tmp = self.delta[ic]
      product_list_of_ranks.append(rho_tmp * delta_tmp)
    import numpy as np
    iid_sorted = flex.sort_permutation(product_list_of_ranks, reverse=True)
    cluster_id[iid_sorted[0]] = n_cluster  # first point always a cluster
    n_cluster += 1
    print('CLUSTERING_STATS S3', iid_sorted[0], cluster_id[iid_sorted[0]])
    #product_list_of_ranks[iid_sorted[0]]=0.0 # set this to 0.0 so that the mean/stdev does not get biased by one point
    stdev = np.std(product_list_of_ranks)
    mean = np.mean(product_list_of_ranks)
    n_sorted = 3
    #if stdev == 0.0:
    #  n_sorted=1
    z_critical = 3.0  # 2 sigma significance ?
    # Only go through, say, 3-4 datapoints: realistically there won't be
    # more than 2-3 lattices on an image
    for iid in iid_sorted[1:n_sorted]:
      z_score = (product_list_of_ranks[iid] - mean) / stdev
      if z_score > z_critical:
        cluster_id[iid] = n_cluster
        n_cluster += 1
        print('CLUSTERING_STATS S3', iid, cluster_id[iid])
      else:
        break  # no point going over all points once below the threshold z-score
  else:
    for ic in range(NN):
      item_idx = delta_order[ic]
      if ic != 0:
        if delta[item_idx] <= 0.25 * delta[delta_order[0]]:  # too low to be a medoid
          continue
      item_rho_order = rho_order_list.index(item_idx)
      if (item_rho_order) / NN < MAX_PERCENTILE_RHO:
        cluster_id[item_idx] = n_cluster
        print('CLUSTERING_STATS', ic, item_idx, item_rho_order,
              cluster_id[item_idx])
        n_cluster += 1
  ###
  print('Found %d clusters' % n_cluster)
  for x in range(NN):
    if cluster_id[x] >= 0:
      print("XC", x, cluster_id[x], rho[x], delta[x])
  self.cluster_id_maxima = cluster_id.deep_copy()
  R.cluster_assignment(rho_order, cluster_id, rho)
  self.cluster_id_full = cluster_id.deep_copy()
  #halo = flex.bool(NN,False)
  #border = R.get_border( cluster_id = cluster_id )
  #for ic in range(n_cluster): #loop thru all border regions; find highest density
  #  this_border = (cluster_id == ic) & (border==True)
  #  if this_border.count(True)>0:
  #    highest_density = flex.max(rho.select(this_border))
  #    halo_selection = (rho < highest_density) & (this_border==True)
  #    if halo_selection.count(True)>0:
  #      cluster_id.set_selected(halo_selection,-1)
  #    core_selection = (cluster_id == ic) & ~halo_selection
  #    highest_density = flex.max(rho.select(core_selection))
  #    too_sparse = core_selection & (rho.as_double() < highest_density/10.) # another heuristic
  #    if too_sparse.count(True)>0:
  #      cluster_id.set_selected(too_sparse,-1)
  self.cluster_id_final = cluster_id.deep_copy()
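# --- Example (not from the source): the centroid tests above z-score rho and
# delta and keep points that score high on both axes of the decision graph.
# A toy sketch on invented values, using the same min(1.0, max) cutoff.
from scitbx.array_family import flex

rho = flex.double([5.0, 1.0, 4.8, 0.5])    # hypothetical local densities
delta = flex.double([2.0, 0.1, 1.9, 0.2])  # hypothetical separations
rho_z = (rho - flex.mean(rho)) / \
  flex.mean_and_variance(rho).unweighted_sample_standard_deviation()
delta_z = (delta - flex.mean(delta)) / \
  flex.mean_and_variance(delta).unweighted_sample_standard_deviation()
rho_cut = min(1.0, flex.max(rho_z))
delta_cut = min(1.0, flex.max(delta_z))
centers = [i for i in range(len(rho))
           if rho_z[i] >= rho_cut and delta_z[i] >= delta_cut]  # here: [0]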
def _compute_normalised_delta_ccs(self):
  mav = flex.mean_and_variance(self.delta_cc_half)
  return (self.delta_cc_half - mav.mean()) / \
    mav.unweighted_sample_standard_deviation()
def get_uc_consensus(experiments_list, show_plot=False, return_only_first_indexed_model=False, finalize_method=None, clustering_params=None): ''' Uses the Rodriguez Laio 2014 method to do a clustering of the unit cells and then vote for the highest consensus unit cell. Input needs to be a list of experiments object. Clustering code taken from github.com/cctbx-xfel/cluster_regression Returns an experiment object with crystal unit cell from the cluster with the most points ''' if return_only_first_indexed_model: return [experiments_list[0].crystals()[0]], None cells = [] from xfel.clustering.singleframe import CellOnlyFrame save_plot = False # Flag for testing Lysozyme data from NKS.Make sure cluster_regression repository is present and configured # Program will exit after plots are displayed if this flag is true test_nks = False if test_nks: from cctbx import crystal import libtbx.load_env cluster_regression = libtbx.env.find_in_repositories( relative_path="cluster_regression", test=os.path.isdir) file_name = os.path.join(cluster_regression, 'examples', 'lysozyme1341.txt') for line in open(file_name, "r").xreadlines(): tokens = line.strip().split() unit_cell = tuple(float(x) for x in tokens[0:6]) space_group_symbol = tokens[6] crystal_symmetry = crystal.symmetry( unit_cell=unit_cell, space_group_symbol=space_group_symbol) cells.append(CellOnlyFrame(crystal_symmetry)) else: for experiment in experiments_list: if len(experiment.crystals()) > 1: print('IOTA:Should have only one crystal model') crystal_symmetry = experiment.crystals()[0].get_crystal_symmetry() cells.append(CellOnlyFrame(crystal_symmetry)) MM = [c.mm for c in cells] # metrical matrices MM_double = flex.double() for i in range(len(MM)): Tup = MM[i] for j in range(6): MM_double.append(Tup[j]) print('There are %d cells' % len(MM)) coord_x = flex.double([c.uc[0] for c in cells]) coord_y = flex.double([c.uc[1] for c in cells]) if show_plot or save_plot: import matplotlib if not show_plot: matplotlib.use('Agg') import matplotlib.pyplot as plt #from IPython import embed; embed(); exit() plt.plot([c.uc[0] for c in cells], [c.uc[1] for c in cells], "k.", markersize=3.) plt.axes().set_aspect("equal") if save_plot: plot_name = 'uc_cluster.png' plt.savefig(plot_name, size_inches=(10, 10), dpi=300, bbox_inches='tight') if show_plot: plt.show() print('Now constructing a Dij matrix: Starting Unit Cell clustering') NN = len(MM) from cctbx.uctbx.determine_unit_cell import NCDist_flatten Dij = NCDist_flatten(MM_double) d_c = flex.mean_and_variance( Dij.as_1d()).unweighted_sample_standard_deviation() #6.13 #FIXME should be a PHIL param if len(cells) < 5: return [experiments_list[0].crystals()[0]], None CM = clustering_manager(Dij=Dij, d_c=d_c, max_percentile_rho=0.95) n_cluster = 1 + flex.max(CM.cluster_id_final) print(len(cells), ' datapoints have been analyzed') print('%d CLUSTERS' % n_cluster) for i in range(n_cluster): item = flex.first_index(CM.cluster_id_maxima, i) print('Cluster %d central Unit cell = %d' % (i, item)) cells[item].crystal_symmetry.show_summary() # More plots for debugging appcolors = [ 'b', 'r', '#ff7f0e', '#2ca02c', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf' ] if show_plot: # Decision graph import matplotlib.pyplot as plt plt.plot(CM.rho, CM.delta, "r.", markersize=3.) 
    for x in range(NN):
      if CM.cluster_id_maxima[x] >= 0:
        # highlight the points chosen as cluster centers
        plt.plot([CM.rho[x]], [CM.delta[x]], "ro")
    plt.show()
  if show_plot:
    import matplotlib.pyplot as plt
    # unit-cell scatter colored by cluster membership, centers in yellow
    colors = [appcolors[i % 10] for i in CM.cluster_id_full]
    plt.scatter(coord_x, coord_y, marker='o', color=colors,
                linewidth=0.4, edgecolor='k')
    for i in range(n_cluster):
      item = flex.first_index(CM.cluster_id_maxima, i)
      plt.plot([cells[item].uc[0]], [cells[item].uc[1]], 'y.')
    plt.axes().set_aspect("equal")
    plt.show()
  if test_nks:
    exit()

  # Now look at each unit-cell cluster for orientational clustering: the idea
  # is to cluster the orientational component within each unit-cell cluster.
  do_orientational_clustering = not return_only_first_indexed_model  # temporary
  dxtbx_crystal_models = []
  if do_orientational_clustering:
    print('IOTA: Starting orientational clustering')
    Dij_ori = {}  # dictionary to store Dij for each cluster
    uc_experiments_list = {}  # dictionary to store experiments lists for each cluster
    from collections import Counter
    uc_cluster_count = Counter(list(CM.cluster_id_final))
    from scitbx.matrix import sqr
    from cctbx_orientation_ext import crystal_orientation
    # Put all experiments from the same unit-cell cluster together
    for i in range(len(experiments_list)):
      if CM.cluster_id_full[i] not in uc_experiments_list:
        uc_experiments_list[CM.cluster_id_full[i]] = []
      uc_experiments_list[CM.cluster_id_full[i]].append(experiments_list[i])
    for cluster in uc_cluster_count:
      # Make sure there are at least a minimum number of samples in the cluster
      if uc_cluster_count[cluster] < 5:
        continue
      # n x n flex array for this cluster, addressed below with flattened
      # (n*i + j) indices
      Dij_ori[cluster] = flex.double(
        [[0.0] * uc_cluster_count[cluster]] * uc_cluster_count[cluster])
      # Now populate the symmetric Dij_ori array
      N_samples_in_cluster = len(uc_experiments_list[cluster])
      for i in range(N_samples_in_cluster - 1):
        for j in range(i + 1, N_samples_in_cluster):
          dij_ori = get_dij_ori(
            uc_experiments_list[cluster][i].crystals()[0],
            uc_experiments_list[cluster][j].crystals()[0])
          Dij_ori[cluster][N_samples_in_cluster * i + j] = dij_ori
          Dij_ori[cluster][N_samples_in_cluster * j + i] = dij_ori

    # Now do the orientational cluster analysis
    d_c_ori = 0.13  # default; recomputed per cluster below
    from exafel_project.ADSE13_25.clustering.plot_with_dimensional_embedding \
      import plot_with_dimensional_embedding
    #plot_with_dimensional_embedding(1-Dij_ori[1]/flex.max(Dij_ori[1]), show_plot=True)
    for cluster in Dij_ori:
      d_c_ori = flex.mean_and_variance(
        Dij_ori[cluster].as_1d()).unweighted_sample_standard_deviation()
      # FIXME max_percentile_rho should be a PHIL param
      CM_ori = clustering_manager(Dij=Dij_ori[cluster], d_c=d_c_ori,
                                  max_percentile_rho=0.85)
      n_cluster_ori = 1 + flex.max(CM_ori.cluster_id_final)
      for i in range(n_cluster_ori):
        # skip sparsely populated orientational clusters
        if len([zz for zz in CM_ori.cluster_id_final if zz == i]) < 5:
          continue
        item = flex.first_index(CM_ori.cluster_id_maxima, i)
        dxtbx_crystal_model = uc_experiments_list[cluster][item].crystals()[0]
        dxtbx_crystal_models.append(dxtbx_crystal_model)
        # use a distinct name so the crystal_orientation class imported
        # above is not shadowed on later iterations
        ori = crystal_orientation(dxtbx_crystal_model.get_A(), True)
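        # Comment added for clarity: crystal_orientation stores the
        # reciprocal-space A* matrix; transpose-inverse of it (below) gives
        # the direct-space A matrix, whose first element is printed as a
        # quick fingerprint of this cluster representative.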
        A_direct = sqr(ori.reciprocal_matrix()).transpose().inverse()
        print("IOTA: Direct A matrix 1st element of orientational cluster %d = %12.6f"
              % (i, A_direct[0]))
      if show_plot:
        # Decision graph for the orientational clustering of this cluster
        stretch_plot_factor = 1.05  # (1 + fraction by which xlim/ylim are stretched)
        import matplotlib.pyplot as plt
        plt.plot(CM_ori.rho, CM_ori.delta, "r.", markersize=3.)
        for x in range(len(list(CM_ori.cluster_id_final))):
          if CM_ori.cluster_id_maxima[x] >= 0:
            plt.plot([CM_ori.rho[x]], [CM_ori.delta[x]], "ro")
        plt.xlim([-10, stretch_plot_factor * flex.max(CM_ori.rho)])
        plt.ylim([-10, stretch_plot_factor * flex.max(CM_ori.delta)])
        plt.show()

  # Make sure the crystal models are not too close to each other
  # FIXME min_angle should be a PHIL param
  min_angle = 5.0  # degrees, taken from indexer.py
  close_models_list = []
  if len(dxtbx_crystal_models) > 1:
    from dials.algorithms.indexing.compare_orientation_matrices \
      import difference_rotation_matrix_axis_angle
    for i_a in range(0, len(dxtbx_crystal_models) - 1):
      # start at i_a + 1 so a model is never compared with itself; the
      # self-comparison angle of 0 would otherwise flag every model as close
      for i_b in range(i_a + 1, len(dxtbx_crystal_models)):
        cryst_a = dxtbx_crystal_models[i_a]
        cryst_b = dxtbx_crystal_models[i_b]
        R_ab, axis, angle, cb_op_ab = difference_rotation_matrix_axis_angle(
          cryst_a, cryst_b)
        if abs(angle) < min_angle:  # degrees
          close_models_list.append((i_a, i_b))

  # Now prune the dxtbx_crystal_models list, dropping one model of each
  # close pair
  for close_models in close_models_list:
    i_a, i_b = close_models
    if dxtbx_crystal_models[i_a] is not None and \
       dxtbx_crystal_models[i_b] is not None:
      dxtbx_crystal_models[i_a] = None
  dxtbx_crystal_models = [x for x in dxtbx_crystal_models if x is not None]

  if len(dxtbx_crystal_models) > 0:
    return dxtbx_crystal_models, None
  else:
    # If nothing works, at least return the 1st crystal model that was found
    return [experiments_list[0].crystals()[0]], None
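
# --- Illustrative sketch (not part of the original module) ------------------
# The pruning loop above keeps one representative out of every pair of
# crystal models related by less than min_angle degrees.  The same idea in a
# generic, self-contained form; angle_fn is a hypothetical stand-in for
# difference_rotation_matrix_axis_angle (returning degrees).
def demo_prune_close(models, angle_fn, min_angle=5.0):
  keep = list(models)
  for i_a in range(len(keep) - 1):
    if keep[i_a] is None:
      continue
    # start at i_a + 1: comparing a model with itself always yields 0 degrees
    for i_b in range(i_a + 1, len(keep)):
      if keep[i_b] is None:
        continue
      if abs(angle_fn(keep[i_a], keep[i_b])) < min_angle:
        keep[i_b] = None  # drop the later near-duplicate, keep the first
  return [m for m in keep if m is not None]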