def run(self):
    from mmtbx.scaling import absolute_scaling
    # ML estimates of the Wilson B for the observed and calculated arrays
    iso_scale_and_b_obs = absolute_scaling.ml_iso_absolute_scaling(
        miller_array=self.obs, n_residues=100)
    iso_scale_and_b_calc = absolute_scaling.ml_iso_absolute_scaling(
        miller_array=self.calc, n_residues=100)
    b_obs = iso_scale_and_b_obs.b_wilson
    b_calc = iso_scale_and_b_calc.b_wilson
    # relative B-factor between the calculated and the observed data
    B = -2 * (b_calc - b_obs)
    k = kBdecider2.get_linear_scale(self.obs, self.calc, B)
    print "k,B=", k, B
    return k, B
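
# The sign convention in run() follows the Wilson plot for intensities:
# <I> falls off as exp(-2*B*(sin(theta)/lambda)^2), with
# (sin(theta)/lambda)^2 = d_star_sq/4 = 1/(4*d^2).  A minimal standalone
# sketch of the scale such a relative B implies (helper names are
# illustrative, not part of the source):
import math

def relative_b_factor(b_obs, b_calc):
    # same convention as run() above
    return -2.0 * (b_calc - b_obs)

def intensity_scale(B, d):
    # Debye-Waller-style intensity falloff at resolution d (Angstrom)
    return math.exp(-2.0 * B / (4.0 * d * d))

# a relative B of 40 A**2 roughly halves intensities near d ~ 5.4 A:
print(intensity_scale(relative_b_factor(b_obs=40.0, b_calc=20.0), d=5.4))
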
def ml_normalisation(self, aniso=False):
    # estimate number of residues per unit cell
    mr = matthews.matthews_rupp(self.intensities.crystal_symmetry())
    n_residues = mr.n_residues

    # estimate B-factor and scale factors for normalisation
    if aniso:
        normalisation = absolute_scaling.ml_aniso_absolute_scaling(
            self.intensities, n_residues=n_residues)
        u_star = normalisation.u_star
    else:
        normalisation = absolute_scaling.ml_iso_absolute_scaling(
            self.intensities, n_residues=n_residues)
        u_star = adptbx.b_as_u(
            adptbx.u_iso_as_u_star(
                self.intensities.unit_cell(), normalisation.b_wilson))

    # apply scales
    self.intensities = self.intensities.customized_copy(
        data=scaling.ml_normalise_aniso(
            self.intensities.indices(), self.intensities.data(),
            normalisation.p_scale, self.intensities.unit_cell(), u_star),
        sigmas=scaling.ml_normalise_aniso(
            self.intensities.indices(), self.intensities.sigmas(),
            normalisation.p_scale, self.intensities.unit_cell(), u_star)
    ).set_info(self.intensities.info())

    # record output in log file
    s = StringIO()
    mr.show(out=s)
    normalisation.show(out=s)
    logger.info(s.getvalue())
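
# In the isotropic branch the scalar b_wilson is packed into an isotropic
# u_star so that the single ml_normalise_aniso call handles both cases.
# b_as_u is the linear conversion U = B/(8*pi^2), and u_iso_as_u_star is
# also linear, so applying b_as_u after the u_star conversion (as above)
# matches converting B to U first.  A small consistency check, assuming a
# cctbx environment:
import math
from cctbx import uctbx, adptbx

uc = uctbx.unit_cell((40, 50, 60, 90, 90, 90))
b_wilson = 25.0
u_star_a = adptbx.b_as_u(adptbx.u_iso_as_u_star(uc, b_wilson))
u_star_b = adptbx.u_iso_as_u_star(uc, adptbx.b_as_u(b_wilson))
assert all(abs(x - y) < 1e-12 for x, y in zip(u_star_a, u_star_b))
assert abs(adptbx.b_as_u(b_wilson) - b_wilson / (8 * math.pi ** 2)) < 1e-12
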
def wilson_scaling(F, n_residues, n_bases):
    from mmtbx.scaling import absolute_scaling
    iso_scale = absolute_scaling.ml_iso_absolute_scaling(
        miller_array=F, n_residues=n_residues, n_bases=n_bases)
    aniso_scale = absolute_scaling.ml_aniso_absolute_scaling(
        miller_array=F, n_residues=n_residues, n_bases=n_bases)
    return iso_scale, aniso_scale
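
# Typical use of this wrapper (a hedged sketch; attribute names per
# mmtbx.scaling.absolute_scaling as used in the snippets here):
def report_wilson_b(F, n_residues, n_bases=0):
    # F: amplitude miller.array, e.g. loaded via iotbx.reflection_file_reader
    iso_scale, aniso_scale = wilson_scaling(F, n_residues, n_bases)
    print("Wilson B (isotropic): %.2f A**2" % iso_scale.b_wilson)
    print("B_cart (anisotropic): %s" % (aniso_scale.b_cart,))
    return iso_scale.b_wilson, aniso_scale.b_cart
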
def _ml_normalisation(intensities, aniso):
    # estimate number of residues per unit cell
    mr = matthews.matthews_rupp(intensities.crystal_symmetry())
    n_residues = mr.n_residues

    # estimate B-factor and scale factors for normalisation
    if aniso:
        normalisation = absolute_scaling.ml_aniso_absolute_scaling(
            intensities, n_residues=n_residues
        )
        u_star = normalisation.u_star
    else:
        normalisation = absolute_scaling.ml_iso_absolute_scaling(
            intensities, n_residues=n_residues
        )
        u_star = adptbx.b_as_u(
            adptbx.u_iso_as_u_star(intensities.unit_cell(), normalisation.b_wilson)
        )

    # record output in log file
    if aniso:
        b_cart = normalisation.b_cart
        logger.info("ML estimate of overall B_cart value:")
        logger.info(
            """\
%5.2f, %5.2f, %5.2f
%12.2f, %5.2f
%19.2f"""
            % (b_cart[0], b_cart[3], b_cart[4], b_cart[1], b_cart[5], b_cart[2])
        )
    else:
        logger.info("ML estimate of overall B value:")
        logger.info(" %5.2f A**2" % normalisation.b_wilson)
    logger.info("ML estimate of -log of scale factor:")
    logger.info(" %5.2f" % normalisation.p_scale)

    s = StringIO()
    mr.show(out=s)
    normalisation.show(out=s)
    logger.debug(s.getvalue())

    # apply scales
    return intensities.customized_copy(
        data=scaling.ml_normalise_aniso(
            intensities.indices(),
            intensities.data(),
            normalisation.p_scale,
            intensities.unit_cell(),
            u_star,
        ),
        sigmas=scaling.ml_normalise_aniso(
            intensities.indices(),
            intensities.sigmas(),
            normalisation.p_scale,
            intensities.unit_cell(),
            u_star,
        ),
    )
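
# The triangular logger output above packs cctbx's sym_mat3 component
# order (B11, B22, B33, B12, B13, B23): the rows (b[0] b[3] b[4]),
# (b[1] b[5]), (b[2]) are the upper triangle of the symmetric tensor.
# A small helper (not in the source) to expand it to a full matrix:
def sym_mat3_as_matrix(b):
    return [[b[0], b[3], b[4]],
            [b[3], b[1], b[5]],
            [b[4], b[5], b[2]]]

print(sym_mat3_as_matrix([18.3, 20.1, 25.7, 0.0, -1.2, 0.0]))
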
def set_chunk_stats(chunk, stats, stat_choice,
                    n_residues=None, ref_cell=None, space_group=None,
                    d_min=None, ref_data=None):
    if "reslimit" in stat_choice:
        stats["reslimit"].append(chunk.res_lim)
    else:
        stats["reslimit"].append(float("nan"))

    if "pr" in stat_choice:
        stats["pr"].append(chunk.profile_radius)
    else:
        stats["pr"].append(float("nan"))

    stats["ccref"].append(float("nan"))

    if set(["ioversigma", "resnatsnr1", "ccref"]).intersection(stat_choice):
        iobs = chunk.data_array(space_group, False)
        iobs = iobs.select(iobs.sigmas() > 0).merge_equivalents(
            use_internal_variance=False).array()
        binner = iobs.setup_binner(auto_binning=True)

        if "resnatsnr1" in stat_choice:
            # resolution at which <I/sigma> drops below 1
            res = float("nan")
            for i_bin in binner.range_used():
                sel = binner.selection(i_bin)
                tmp = iobs.select(sel)
                if tmp.size() == 0:
                    continue
                sn = flex.mean(tmp.data() / tmp.sigmas())
                if sn <= 1:
                    res = binner.bin_d_range(i_bin)[1]
                    break
            stats["resnatsnr1"].append(res)
        else:
            stats["resnatsnr1"].append(float("nan"))

        if d_min:
            iobs = iobs.resolution_filter(d_min=d_min)

        if "ccref" in stat_choice:
            corr = iobs.correlation(ref_data, assert_is_similar_symmetry=False)
            if corr.is_well_defined():
                stats["ccref"][-1] = corr.coefficient()

        if "ioversigma" in stat_choice:
            stats["ioversigma"].append(flex.mean(iobs.data() / iobs.sigmas()))
        else:
            stats["ioversigma"].append(float("nan"))
    else:
        stats["ioversigma"].append(float("nan"))
        stats["resnatsnr1"].append(float("nan"))

    if "abdist" in stat_choice:
        from cctbx.uctbx.determine_unit_cell import NCDist
        G6a, G6b = make_G6(ref_cell), make_G6(chunk.cell)
        abdist = NCDist(G6a, G6b)
        stats["abdist"].append(abdist)
    else:
        stats["abdist"].append(float("nan"))

    if "wilsonb" in stat_choice:
        # note: relies on iobs prepared in the branch above, so "wilsonb"
        # only works together with one of ioversigma/resnatsnr1/ccref
        iso_scale_and_b = ml_iso_absolute_scaling(iobs, n_residues, 0)
        stats["wilsonb"].append(iso_scale_and_b.b_wilson)
    else:
        stats["wilsonb"].append(float("nan"))
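
# make_G6 is not defined in this snippet.  NCDist expects the standard G6
# representation of a (reduced) cell; a sketch of such a helper, assuming
# cell is a 6-tuple (a, b, c, alpha, beta, gamma) with angles in degrees:
import math

def make_G6(cell):
    # G6 = (a^2, b^2, c^2, 2bc*cos(alpha), 2ac*cos(beta), 2ab*cos(gamma))
    a, b, c, alpha, beta, gamma = cell
    ca, cb, cg = (math.cos(math.radians(x)) for x in (alpha, beta, gamma))
    return [a * a, b * b, c * c,
            2 * b * c * ca, 2 * a * c * cb, 2 * a * b * cg]
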
def test_scaling_on_random_data(B_add):
    miller_array = random_data(B_add, n_residues=100.0)
    scale_object_iso = absolute_scaling.ml_iso_absolute_scaling(
        miller_array, n_residues=100.0)
    ## compare the results please
    assert approx_equal(B_add, scale_object_iso.b_wilson, eps=5)
    scale_object_aniso = absolute_scaling.ml_aniso_absolute_scaling(
        miller_array, n_residues=100.0)
    assert approx_equal(B_add, scale_object_aniso.b_cart[0], eps=5)
    assert approx_equal(B_add, scale_object_aniso.b_cart[1], eps=5)
    assert approx_equal(B_add, scale_object_aniso.b_cart[2], eps=5)
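
# random_data is an external helper.  A plausible reconstruction using
# cctbx's random-structure tools (parameter names assumed; treat this as
# a sketch, not the test's actual generator):
from cctbx import adptbx, sgtbx
from cctbx.development import random_structure

def random_data(B_add, n_residues=100.0):
    # ~8 non-H atoms per residue; give every atom the overall B that the
    # ML scaling is expected to recover
    xs = random_structure.xray_structure(
        space_group_info=sgtbx.space_group_info("P 21 21 21"),
        elements=["C"] * int(n_residues * 8),
        volume_per_atom=50.0,
        u_iso=adptbx.b_as_u(B_add))
    f_calc = xs.structure_factors(d_min=2.0, anomalous_flag=False).f_calc()
    return abs(f_calc)  # amplitudes, as expected by the scaling classes
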
def do_clustering(self, nproc=1, b_scale=False, use_normalized=False,
                  html_maker=None):
    self.clusters = {}
    prefix = os.path.join(self.wdir, "cctable")
    assert (b_scale, use_normalized).count(True) <= 1

    if len(self.arrays) < 2:
        print "WARNING: less than two data! can't do cc-based clustering"
        self.clusters[1] = [float("nan"), [0]]
        return

    # Absolute scaling using Wilson-B factor
    if b_scale:
        from mmtbx.scaling.matthews import p_vm_calculator
        from mmtbx.scaling.absolute_scaling import ml_iso_absolute_scaling

        ofs_wilson = open("%s_wilson_scales.dat" % prefix, "w")
        n_residues = p_vm_calculator(self.arrays.values()[0], 1, 0).best_guess
        ofs_wilson.write("# guessed n_residues= %d\n" % n_residues)
        ofs_wilson.write("file wilsonB\n")
        for f in self.arrays:
            arr = self.arrays[f]
            iso_scale_and_b = ml_iso_absolute_scaling(arr, n_residues, 0)
            wilson_b = iso_scale_and_b.b_wilson
            ofs_wilson.write("%s %.3f\n" % (f, wilson_b))
            if wilson_b > 0:  # Ignoring data with B<0? is a bad idea.. but how..?
                tmp = flex.exp(-2. * wilson_b
                               * arr.unit_cell().d_star_sq(arr.indices()) / 4.)
                self.arrays[f] = arr.customized_copy(data=arr.data() * tmp,
                                                     sigmas=arr.sigmas() * tmp)
        ofs_wilson.close()
    elif use_normalized:
        from mmtbx.scaling.absolute_scaling import kernel_normalisation
        for f in self.arrays:
            arr = self.arrays[f]
            normaliser = kernel_normalisation(arr, auto_kernel=True)
            self.arrays[f] = arr.customized_copy(
                data=arr.data() / normaliser.normalizer_for_miller_array,
                sigmas=arr.sigmas() / normaliser.normalizer_for_miller_array)

    # Prep
    args = []
    for i in xrange(len(self.arrays) - 1):
        for j in xrange(i + 1, len(self.arrays)):
            args.append((i, j))

    # Calc all CC
    if self.use_sfdist:
        worker = lambda x: calc_sfdist(self.arrays.values()[x[0]],
                                       self.arrays.values()[x[1]])
    else:
        worker = lambda x: calc_cc(self.arrays.values()[x[0]],
                                   self.arrays.values()[x[1]])
    results = easy_mp.pool_map(fixed_func=worker, args=args, processes=nproc)

    # Check NaN and decide which data to remove
    idx_bad = {}
    nans = []
    cc_data_for_html = []
    for (i, j), (cc, nref) in zip(args, results):
        cc_data_for_html.append((i, j, cc, nref))
        if cc == cc:
            continue  # not NaN
        idx_bad[i] = idx_bad.get(i, 0) + 1
        idx_bad[j] = idx_bad.get(j, 0) + 1
        nans.append([i, j])

    if html_maker is not None:
        html_maker.add_cc_clustering_details(cc_data_for_html)

    idx_bad = idx_bad.items()
    idx_bad.sort(key=lambda x: x[1])
    remove_idxes = set()

    for idx, badcount in reversed(idx_bad):
        if len(filter(lambda x: idx in x, nans)) == 0:
            continue
        remove_idxes.add(idx)
        nans = filter(lambda x: idx not in x, nans)
        if len(nans) == 0:
            break

    use_idxes = filter(lambda x: x not in remove_idxes, xrange(len(self.arrays)))

    # Make table: original index (in file list) -> new index (in matrix)
    count = 0
    org2now = collections.OrderedDict()
    for i in xrange(len(self.arrays)):
        if i in remove_idxes:
            continue
        org2now[i] = count
        count += 1

    if len(remove_idxes) > 0:
        open("%s_notused.lst" % prefix, "w").write("\n".join(
            map(lambda x: self.arrays.keys()[x], remove_idxes)))

    # Make matrix
    mat = numpy.zeros(shape=(len(use_idxes), len(use_idxes)))
    for (i, j), (cc, nref) in zip(args, results):
        if i in remove_idxes or j in remove_idxes:
            continue
        mat[org2now[j], org2now[i]] = cc

    open("%s.matrix" % prefix, "w").write(" ".join(
        map(lambda x: "%.4f" % x, mat.flatten())))

    ofs = open("%s.dat" % prefix, "w")
    ofs.write(" i j cc nref\n")
    for (i, j), (cc, nref) in zip(args, results):
        ofs.write("%4d %4d %.4f %4d\n" % (i, j, cc, nref))

    open("%s_ana.R" % prefix, "w").write("""\
treeToList2 <- function(htree) { # stolen from $CCP4/share/blend/R/blend0.R
 groups <- list()
 itree <- dim(htree$merge)[1]
 for (i in 1:itree) {
  il <- htree$merge[i,1]
  ir <- htree$merge[i,2]
  if (il < 0) lab1 <- htree$labels[-il]
  if (ir < 0) lab2 <- htree$labels[-ir]
  if (il > 0) lab1 <- groups[[il]]
  if (ir > 0) lab2 <- groups[[ir]]
  lab <- c(lab1,lab2)
  lab <- as.integer(lab)
  groups <- c(groups,list(lab))
 }
 return(groups)
}

cc<-scan("%(prefix)s.matrix")
md<-matrix(1-cc, ncol=%(ncol)d, byrow=TRUE)
hc <- hclust(as.dist(md),method="ward")
pdf("tree.pdf")
plot(hc)
dev.off()
png("tree.png",height=1000,width=1000)
plot(hc)
dev.off()

hc$labels <- c(%(hclabels)s)
groups <- treeToList2(hc)
cat("ClNumber Nds Clheight IDs\\n",file="./CLUSTERS.txt")
for (i in 1:length(groups)) {
 sorted_groups <- sort(groups[[i]])
 linea <- sprintf("%%04d %%4d %%7.3f %%s\\n",
                  i,length(groups[[i]]),hc$height[i],
                  paste(sorted_groups,collapse=" "))
 cat(linea, file="./CLUSTERS.txt", append=TRUE)
}

# reference: http://www.coppelia.io/2014/07/converting-an-r-hclust-object-into-a-d3-js-dendrogram/
library(rjson)
HCtoJSON<-function(hc){
  labels<-hc$labels
  merge<-data.frame(hc$merge)
  for (i in (1:nrow(merge))) {
    if (merge[i,1]<0 & merge[i,2]<0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(list(name=labels[-merge[i,1]]),list(name=labels[-merge[i,2]])))")))}
    else if (merge[i,1]>0 & merge[i,2]<0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(node", merge[i,1], ", list(name=labels[-merge[i,2]])))")))}
    else if (merge[i,1]<0 & merge[i,2]>0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(list(name=labels[-merge[i,1]]), node", merge[i,2],"))")))}
    else if (merge[i,1]>0 & merge[i,2]>0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(node",merge[i,1] , ", node" , merge[i,2]," ))")))}
  }
  eval(parse(text=paste0("JSON<-toJSON(node",nrow(merge), ")")))
  return(JSON)
}

JSON<-HCtoJSON(hc)
cat(JSON, file="dendro.json")
q(save="yes")
""" % dict(prefix=os.path.basename(prefix), ncol=len(self.arrays),
           hclabels=",".join(map(lambda x: "%d" % (x + 1), org2now.keys()))))

    call(cmd="Rscript", arg="%s_ana.R" % os.path.basename(prefix),
         wdir=self.wdir)

    output = open(os.path.join(self.wdir, "CLUSTERS.txt")).readlines()
    for l in output[1:]:
        sp = l.split()
        clid, clheight, ids = sp[0], sp[2], sp[3:]
        self.clusters[int(clid)] = [float(clheight), map(int, ids)]
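
# The generated R script clusters on 1-cc with hclust.  The same tree can
# be built in-process with scipy (an alternative sketch, not part of the
# source; note that R's legacy method="ward" is not numerically identical
# to scipy's "ward"):
import numpy
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

def cluster_cc_matrix(cc_mat, method="average"):
    # cc_mat: full symmetric matrix of pairwise CCs, NaN-free
    # (the pruning above removes datasets that produce NaN CCs)
    dist = 1.0 - cc_mat
    numpy.fill_diagonal(dist, 0.0)
    return hierarchy.linkage(squareform(dist, checks=False), method=method)

# e.g. assign cluster ids by cutting the tree at height 0.4:
# labels = hierarchy.fcluster(Z, t=0.4, criterion="distance")
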
def do_clustering(self, nproc=1, b_scale=False, use_normalized=False,
                  cluster_method="ward", distance_eqn="sqrt(1-cc)",
                  min_common_refs=3, html_maker=None):
    """
    Using correlation as distance metric (for hierarchical clustering)
      https://stats.stackexchange.com/questions/165194/using-correlation-as-distance-metric-for-hierarchical-clustering
    Correlation "Distances" and Hierarchical Clustering
      http://research.stowers.org/mcm/efg/R/Visualization/cor-cluster/index.htm
    """
    self.clusters = {}
    prefix = os.path.join(self.wdir, "cctable")
    assert (b_scale, use_normalized).count(True) <= 1

    distance_eqns = {
        "sqrt(1-cc)": lambda x: numpy.sqrt(1. - x),
        "1-cc": lambda x: 1. - x,
        "sqrt(1-cc^2)": lambda x: numpy.sqrt(1. - x**2),
    }
    cc_to_distance = distance_eqns[distance_eqn]  # fails when an unknown option is given
    # available methods in scipy
    assert cluster_method in ("single", "complete", "average", "weighted",
                              "centroid", "median", "ward")

    if len(self.arrays) < 2:
        print "WARNING: less than two data! can't do cc-based clustering"
        self.clusters[1] = [float("nan"), [0]]
        return

    # Absolute scaling using Wilson-B factor
    if b_scale:
        from mmtbx.scaling.matthews import p_vm_calculator
        from mmtbx.scaling.absolute_scaling import ml_iso_absolute_scaling

        ofs_wilson = open("%s_wilson_scales.dat" % prefix, "w")
        n_residues = p_vm_calculator(self.arrays.values()[0], 1, 0).best_guess
        ofs_wilson.write("# guessed n_residues= %d\n" % n_residues)
        ofs_wilson.write("file wilsonB\n")
        for f in self.arrays:
            arr = self.arrays[f]
            iso_scale_and_b = ml_iso_absolute_scaling(arr, n_residues, 0)
            wilson_b = iso_scale_and_b.b_wilson
            ofs_wilson.write("%s %.3f\n" % (f, wilson_b))
            if wilson_b > 0:  # Ignoring data with B<0? is a bad idea.. but how..?
                tmp = flex.exp(-2. * wilson_b
                               * arr.unit_cell().d_star_sq(arr.indices()) / 4.)
                self.arrays[f] = arr.customized_copy(data=arr.data() * tmp,
                                                     sigmas=arr.sigmas() * tmp)
        ofs_wilson.close()
    elif use_normalized:
        from mmtbx.scaling.absolute_scaling import kernel_normalisation
        failed = {}
        for f in self.arrays:
            arr = self.arrays[f]
            try:
                normaliser = kernel_normalisation(arr, auto_kernel=True)
                self.arrays[f] = arr.customized_copy(
                    data=arr.data() / normaliser.normalizer_for_miller_array,
                    sigmas=arr.sigmas() / normaliser.normalizer_for_miller_array)
            except Exception, e:
                failed.setdefault(e.message, []).append(f)

        if failed:
            msg = ""
            for r in failed:
                msg += " %s\n%s\n" % (r, "\n".join(
                    map(lambda x: "  %s" % x, failed[r])))
            raise Sorry(
                "intensity normalization failed by following reason(s):\n%s" % msg)
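
# Of the supported equations, sqrt(1-cc) is proportional to the Euclidean
# distance between standardized data vectors (hence a true metric), while
# plain 1-cc is not.  sqrt(1-cc) and 1-cc order pairs identically;
# sqrt(1-cc^2) instead treats cc and -cc alike:
import numpy

cc = numpy.array([0.99, 0.9, 0.5, 0.0, -0.5])
for name, d in [("sqrt(1-cc)", numpy.sqrt(1.0 - cc)),
                ("1-cc", 1.0 - cc),
                ("sqrt(1-cc^2)", numpy.sqrt(1.0 - cc ** 2))]:
    print("%s %s" % (name, numpy.round(d, 3)))
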
def __init__(self, miller_array, phil_object, out=None, out_plot=None,
             miller_calc=None, original_intensities=None,
             completeness_as_non_anomalous=None, verbose=0):
    if out is None:
        out = sys.stdout
    if verbose > 0:
        print >> out
        print >> out
        print >> out, "Matthews coefficient and Solvent content statistics"
    n_copies_solc = 1.0
    self.nres_known = False
    if (phil_object.scaling.input.asu_contents.n_residues is not None or
            phil_object.scaling.input.asu_contents.n_bases is not None):
        self.nres_known = True
        if (phil_object.scaling.input.asu_contents.sequence_file is not None):
            print >> out, "  warning: ignoring sequence file"
    elif (phil_object.scaling.input.asu_contents.sequence_file is not None):
        print >> out, "  determining composition from sequence file %s" % \
            phil_object.scaling.input.asu_contents.sequence_file
        seq_comp = iotbx.bioinformatics.composition_from_sequence_file(
            file_name=phil_object.scaling.input.asu_contents.sequence_file,
            log=out)
        if (seq_comp is not None):
            phil_object.scaling.input.asu_contents.n_residues = seq_comp.n_residues
            phil_object.scaling.input.asu_contents.n_bases = seq_comp.n_bases
            self.nres_known = True
    matthews_results = matthews.matthews_rupp(
        crystal_symmetry=miller_array,
        n_residues=phil_object.scaling.input.asu_contents.n_residues,
        n_bases=phil_object.scaling.input.asu_contents.n_bases,
        out=out, verbose=1)
    phil_object.scaling.input.asu_contents.n_residues = matthews_results[0]
    phil_object.scaling.input.asu_contents.n_bases = matthews_results[1]
    n_copies_solc = matthews_results[2]
    self.matthews_results = matthews_results

    if phil_object.scaling.input.asu_contents.n_copies_per_asu is not None:
        n_copies_solc = phil_object.scaling.input.asu_contents.n_copies_per_asu
        self.defined_copies = n_copies_solc
        if verbose > 0:
            print >> out, "Number of copies per asymmetric unit provided"
            print >> out, " Will use user specified value of ", n_copies_solc
    else:
        phil_object.scaling.input.asu_contents.n_copies_per_asu = n_copies_solc
        self.guessed_copies = n_copies_solc

    # first report on I over sigma
    miller_array_new = miller_array
    self.data_strength = None
    miller_array_intensities = miller_array
    if (original_intensities is not None):
        assert original_intensities.is_xray_intensity_array()
        miller_array_intensities = original_intensities
    if miller_array_intensities.sigmas() is not None:
        data_strength = data_statistics.i_sigi_completeness_stats(
            miller_array_intensities,
            isigi_cut=phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.isigi_cut,
            completeness_cut=phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.completeness_cut,
            completeness_as_non_anomalous=completeness_as_non_anomalous)
        data_strength.show(out)
        self.data_strength = data_strength
        if phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.high_resolution is None:
            if data_strength.resolution_cut > data_strength.resolution_at_least:
                phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.high_resolution = data_strength.resolution_at_least
            else:
                phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.high_resolution = data_strength.resolution_cut

    ## Isotropic wilson scaling
    if verbose > 0:
        print >> out
        print >> out
        print >> out, "Maximum likelihood isotropic Wilson scaling "

    n_residues = phil_object.scaling.input.asu_contents.n_residues
    n_bases = phil_object.scaling.input.asu_contents.n_bases
    if n_residues is None:
        n_residues = 0
    if n_bases is None:
        n_bases = 0
    if n_bases + n_residues == 0:
        raise Sorry("No scatterers available")
    iso_scale_and_b = absolute_scaling.ml_iso_absolute_scaling(
        miller_array=miller_array_new,
        n_residues=n_residues * miller_array.space_group().order_z() * n_copies_solc,
        n_bases=n_bases * miller_array.space_group().order_z() * n_copies_solc)
    iso_scale_and_b.show(out=out, verbose=verbose)
    self.iso_scale_and_b = iso_scale_and_b
    ## Store the b and scale values from isotropic ML scaling
    self.iso_p_scale = iso_scale_and_b.p_scale
    self.iso_b_wilson = iso_scale_and_b.b_wilson

    ## Anisotropic ml wilson scaling
    if verbose > 0:
        print >> out
        print >> out
        print >> out, "Maximum likelihood anisotropic Wilson scaling "
    aniso_scale_and_b = absolute_scaling.ml_aniso_absolute_scaling(
        miller_array=miller_array_new,
        n_residues=n_residues * miller_array.space_group().order_z() * n_copies_solc,
        n_bases=n_bases * miller_array.space_group().order_z() * n_copies_solc)
    aniso_scale_and_b.show(out=out, verbose=1)
    self.aniso_scale_and_b = aniso_scale_and_b
    try:
        b_cart = aniso_scale_and_b.b_cart
    except AttributeError, e:
        print >> out, "*** ERROR ***"
        print >> out, str(e)
        show_exception_info_if_full_testing()
        return
def __init__(self, miller_array, phil_object, out=None, out_plot=None,
             miller_calc=None, original_intensities=None,
             completeness_as_non_anomalous=None, verbose=0):
    if out is None:
        out = sys.stdout
    if verbose > 0:
        print(file=out)
        print(file=out)
        print("Matthews coefficient and Solvent content statistics", file=out)
    n_copies_solc = 1.0
    self.nres_known = False
    if (phil_object.scaling.input.asu_contents.n_residues is not None or
            phil_object.scaling.input.asu_contents.n_bases is not None):
        self.nres_known = True
        if (phil_object.scaling.input.asu_contents.sequence_file is not None):
            print("  warning: ignoring sequence file", file=out)
    elif (phil_object.scaling.input.asu_contents.sequence_file is not None):
        print("  determining composition from sequence file %s" %
              phil_object.scaling.input.asu_contents.sequence_file, file=out)
        seq_comp = iotbx.bioinformatics.composition_from_sequence_file(
            file_name=phil_object.scaling.input.asu_contents.sequence_file,
            log=out)
        if (seq_comp is not None):
            phil_object.scaling.input.asu_contents.n_residues = seq_comp.n_residues
            phil_object.scaling.input.asu_contents.n_bases = seq_comp.n_bases
            self.nres_known = True
    matthews_results = matthews.matthews_rupp(
        crystal_symmetry=miller_array,
        n_residues=phil_object.scaling.input.asu_contents.n_residues,
        n_bases=phil_object.scaling.input.asu_contents.n_bases,
        out=out, verbose=1)
    phil_object.scaling.input.asu_contents.n_residues = matthews_results[0]
    phil_object.scaling.input.asu_contents.n_bases = matthews_results[1]
    n_copies_solc = matthews_results[2]
    self.matthews_results = matthews_results

    if phil_object.scaling.input.asu_contents.n_copies_per_asu is not None:
        n_copies_solc = phil_object.scaling.input.asu_contents.n_copies_per_asu
        self.defined_copies = n_copies_solc
        if verbose > 0:
            print("Number of copies per asymmetric unit provided", file=out)
            print(" Will use user specified value of ", n_copies_solc, file=out)
    else:
        phil_object.scaling.input.asu_contents.n_copies_per_asu = n_copies_solc
        self.guessed_copies = n_copies_solc

    # first report on I over sigma
    miller_array_new = miller_array
    self.data_strength = None
    miller_array_intensities = miller_array
    if (original_intensities is not None):
        assert original_intensities.is_xray_intensity_array()
        miller_array_intensities = original_intensities
    if miller_array_intensities.sigmas() is not None:
        data_strength = data_statistics.i_sigi_completeness_stats(
            miller_array_intensities,
            isigi_cut=phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.isigi_cut,
            completeness_cut=phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.completeness_cut,
            completeness_as_non_anomalous=completeness_as_non_anomalous)
        data_strength.show(out)
        self.data_strength = data_strength
        if phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.high_resolution is None:
            if data_strength.resolution_cut > data_strength.resolution_at_least:
                phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.high_resolution = data_strength.resolution_at_least
            else:
                phil_object.scaling.input.parameters.misc_twin_parameters.twin_test_cuts.high_resolution = data_strength.resolution_cut

    ## Isotropic wilson scaling
    if verbose > 0:
        print(file=out)
        print(file=out)
        print("Maximum likelihood isotropic Wilson scaling ", file=out)

    n_residues = phil_object.scaling.input.asu_contents.n_residues
    n_bases = phil_object.scaling.input.asu_contents.n_bases
    if n_residues is None:
        n_residues = 0
    if n_bases is None:
        n_bases = 0
    if n_bases + n_residues == 0:
        raise Sorry("No scatterers available")
    iso_scale_and_b = absolute_scaling.ml_iso_absolute_scaling(
        miller_array=miller_array_new,
        n_residues=n_residues * miller_array.space_group().order_z() * n_copies_solc,
        n_bases=n_bases * miller_array.space_group().order_z() * n_copies_solc)
    iso_scale_and_b.show(out=out, verbose=verbose)
    self.iso_scale_and_b = iso_scale_and_b
    ## Store the b and scale values from isotropic ML scaling
    self.iso_p_scale = iso_scale_and_b.p_scale
    self.iso_b_wilson = iso_scale_and_b.b_wilson

    ## Anisotropic ml wilson scaling
    if verbose > 0:
        print(file=out)
        print(file=out)
        print("Maximum likelihood anisotropic Wilson scaling ", file=out)
    aniso_scale_and_b = absolute_scaling.ml_aniso_absolute_scaling(
        miller_array=miller_array_new,
        n_residues=n_residues * miller_array.space_group().order_z() * n_copies_solc,
        n_bases=n_bases * miller_array.space_group().order_z() * n_copies_solc)
    aniso_scale_and_b.show(out=out, verbose=1)
    self.aniso_scale_and_b = aniso_scale_and_b
    try:
        b_cart = aniso_scale_and_b.b_cart
    except AttributeError as e:
        print("*** ERROR ***", file=out)
        print(str(e), file=out)
        show_exception_info_if_full_testing()
        return

    self.aniso_p_scale = aniso_scale_and_b.p_scale
    self.aniso_u_star = aniso_scale_and_b.u_star
    self.aniso_b_cart = aniso_scale_and_b.b_cart
    # XXX: for GUI
    self.overall_b_cart = getattr(aniso_scale_and_b, "overall_b_cart", None)

    ## Correcting for anisotropy
    if verbose > 0:
        print("Correcting for anisotropy in the data", file=out)
        print(file=out)

    b_cart_observed = aniso_scale_and_b.b_cart
    b_trace_average = (b_cart_observed[0] +
                       b_cart_observed[1] +
                       b_cart_observed[2]) / 3.0
    b_trace_min = b_cart_observed[0]
    if b_cart_observed[1] < b_trace_min:
        b_trace_min = b_cart_observed[1]
    if b_cart_observed[2] < b_trace_min:
        b_trace_min = b_cart_observed[2]

    if phil_object.scaling.input.optional.aniso.final_b == "eigen_min":
        b_use = aniso_scale_and_b.eigen_values[2]
    elif phil_object.scaling.input.optional.aniso.final_b == "eigen_mean":
        b_use = flex.mean(aniso_scale_and_b.eigen_values)
    elif phil_object.scaling.input.optional.aniso.final_b == "user_b_iso":
        assert phil_object.scaling.input.optional.aniso.b_iso is not None
        b_use = phil_object.scaling.input.optional.aniso.b_iso
    else:
        b_use = 30

    b_cart_aniso_removed = [-b_use, -b_use, -b_use, 0, 0, 0]
    u_star_aniso_removed = adptbx.u_cart_as_u_star(
        miller_array.unit_cell(), adptbx.b_as_u(b_cart_aniso_removed))
    ## I do things in two steps, but it can easily be done in one step;
    ## just for clarity, that's all.
    self.no_aniso_array = absolute_scaling.anisotropic_correction(
        miller_array_new, 0.0, aniso_scale_and_b.u_star)
    self.no_aniso_array = absolute_scaling.anisotropic_correction(
        self.no_aniso_array, 0.0, u_star_aniso_removed)
    self.no_aniso_array = self.no_aniso_array.set_observation_type(miller_array)

    ## Make normalised structure factors please
    sel_big = self.no_aniso_array.data() > 1.e+50
    self.no_aniso_array = self.no_aniso_array.array(
        data=self.no_aniso_array.data().set_selected(sel_big, 0))
    self.no_aniso_array = self.no_aniso_array.set_observation_type(miller_array)
    normalisation = absolute_scaling.kernel_normalisation(
        self.no_aniso_array, auto_kernel=True)
    self.normalised_miller = normalisation.normalised_miller.deep_copy()

    self.phil_object = phil_object

    ## Some basic statistics and sanity checks follow
    if verbose > 0:
        print("Some basic intensity statistics follow.", file=out)
        print(file=out)
    basic_data_stats = data_statistics.basic_intensity_statistics(
        miller_array,
        aniso_scale_and_b.p_scale,
        aniso_scale_and_b.u_star,
        iso_scale_and_b.scat_info,
        out=out,
        out_plot=out_plot)
    self.basic_data_stats = basic_data_stats
    self.miller_array = basic_data_stats.new_miller

    # relative wilson plot
    self.rel_wilson = None
    if (miller_calc is not None) and (miller_calc.d_min() < 4.0):
        try:
            self.rel_wilson = relative_wilson.relative_wilson(
                miller_obs=miller_array, miller_calc=miller_calc)
        except RuntimeError as e:
            print("*** Error calculating relative Wilson plot - skipping.",
                  file=out)
            print("", file=out)

    if verbose > 0:
        print("Basic analyses completed", file=out)
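
# The final_b choices above pick the target isotropic B from the observed
# B_cart.  The eigen-analysis can be reproduced with numpy (a sketch; the
# source's eigen_values[2] for "eigen_min" implies descending order):
import numpy

def b_cart_eigenvalues(b_cart):
    # b_cart in sym_mat3 order (B11, B22, B33, B12, B13, B23)
    m = numpy.array([[b_cart[0], b_cart[3], b_cart[4]],
                     [b_cart[3], b_cart[1], b_cart[5]],
                     [b_cart[4], b_cart[5], b_cart[2]]])
    return sorted(numpy.linalg.eigvalsh(m), reverse=True)

print(b_cart_eigenvalues([30.0, 20.0, 10.0, 0.0, 0.0, 0.0])[2])  # 10.0
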
def calc_stats(xac_file, stat_choice, n_residues=None, ref_v6cell=None,
               min_peak=None, min_peak_percentile=None, correct_peak=None):
    # Open XDS_ASCII
    if xac_file.endswith(".pkl"):
        xac = pickle.load(open(xac_file))
    else:
        xac = xds_ascii.XDS_ASCII(xac_file)

    sel_remove = flex.bool(xac.iobs.size(), False)

    if min_peak is not None:
        sel = xac.peak < min_peak
        sel_remove |= sel
    elif min_peak_percentile is not None:
        q = numpy.percentile(xac.peak, min_peak_percentile)
        print "percentile %.2f %s" % (q, xac)
        sel = xac.peak < q
        sel_remove |= sel

    if correct_peak:
        sel_remove |= (xac.peak < 1)  # remove PEAK==0

    xac.remove_selection(sel_remove)

    if params.correct_peak:  # note: reads the module-level params, not the argument
        xac.iobs *= xac.peak * .01
        xac.sigma_iobs *= xac.peak * .01

    iobs = xac.i_obs(anomalous_flag=False)
    iobs = iobs.select(iobs.sigmas() > 0).merge_equivalents(
        use_internal_variance=False).array()

    stats = dict(filename=xac_file, cell=iobs.unit_cell().parameters())

    if iobs.size() == 0:
        return stats

    if "ioversigma" in stat_choice or "resnatsnr1" in stat_choice:
        binner = iobs.setup_binner(auto_binning=True)
        if "ioversigma" in stat_choice:
            stats["ioversigma"] = flex.mean(iobs.data() / iobs.sigmas())
        if "resnatsnr1" in stat_choice:
            # resolution at which <I/sigma> drops below 1
            res = float("nan")
            for i_bin in binner.range_used():
                sel = binner.selection(i_bin)
                tmp = iobs.select(sel)
                if tmp.size() == 0:
                    continue
                sn = flex.mean(tmp.data() / tmp.sigmas())
                if sn <= 1:
                    res = binner.bin_d_range(i_bin)[1]
                    break
            stats["resnatsnr1"] = res

    if "abdist" in stat_choice:
        from cctbx.uctbx.determine_unit_cell import NCDist
        G6a, G6b = ref_v6cell, v6cell(iobs.unit_cell().niggli_cell())
        abdist = NCDist(G6a, G6b)
        stats["abdist"] = abdist

    if "wilsonb" in stat_choice:
        iso_scale_and_b = ml_iso_absolute_scaling(iobs, n_residues, 0)
        stats["wilsonb"] = iso_scale_and_b.b_wilson

    print stats
    return stats
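
# Hedged usage sketch (file name and stat choices illustrative; requires
# the yamtbx environment that provides xds_ascii, flex, params, etc.):
stats = calc_stats("XDS_ASCII.HKL",
                   stat_choice=["ioversigma", "resnatsnr1", "wilsonb"],
                   n_residues=300)
print("%s %s" % (stats.get("wilsonb"), stats.get("resnatsnr1")))
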