def merge_datasets(params, workdir, xds_files, cells, space_group):
    """Merge the given XDS_ASCII files with xscale or aimless and summarize.

    Runs the rejection/merging cycles under `workdir`, writes a human-readable
    summary of used/unused files to workdir/merge.log, and collects per-cycle
    merging statistics.

    Args:
        params: phil-style parameter object (program, anomalous, d_min, ...).
        workdir: directory where merging runs and merge.log are created.
        xds_files: list of XDS_ASCII.HKL file paths to merge.
        cells: dict mapping file path -> unit cell (echoed in the summary).
        space_group: space group handed to the xscale cycles.

    Returns:
        For xscale: a list of [cycle_number, run_dir, num_files, stats_dict],
        one entry per cycle. For aimless: a single-entry list of the same
        shape. For an unknown program: [].
    """
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    out = open(os.path.join(workdir, "merge.log"), "w")

    if params.program == "xscale":
        cycles = multi_merging.xscale.XscaleCycles(
            workdir,
            anomalous_flag=params.anomalous,
            d_min=params.d_min,
            d_max=params.d_max,
            reject_method=params.reject_method,
            reject_params=params.rejection,
            xscale_params=params.xscale,
            res_params=params.resolution,
            reference_file=params.reference_file,
            space_group=space_group,
            ref_mtz=params.reference.data if params.reference.copy_test_flag else None,
            out=out,
            nproc=params.nproc,
            batch_params=params.batch)

        # run_cycles() returns the files rejected over all cycles and why.
        unused_files, reasons = cycles.run_cycles(xds_files)
        used_files = set(xds_files).difference(set(unused_files))

        print >> out
        print >> out, " SUMMARY "
        print >> out, "========================"
        for i, files in enumerate((used_files, unused_files)):
            print >> out, "\n%6s %4d files:\n" % (("Used", "Unused")[i], len(files))
            if len(files) == 0:
                continue
            # Width of the longest relative path, for column alignment.
            # NOTE(review): relpath is taken against params.workdir, not the
            # local `workdir` argument -- confirm this is intentional.
            maxlen_f = max(map(lambda f: len(os.path.relpath(f, params.workdir)), files))
            for f in files:
                cell = cells[f]
                merge_log = os.path.join(os.path.dirname(f), "merging_stats.log")
                try:
                    lines = open(merge_log).readlines()
                    resn = float(filter(lambda x: x.startswith("Resolution:"), lines)[0].split()[-1])
                    cmpl = float(filter(lambda x: x.startswith("Completeness:"), lines)[0].split()[-1].replace("%", ""))
                except Exception:  # was bare except: missing/garbled log falls back to nan
                    resn = float("nan")
                    cmpl = float("nan")
                if i == 1:  # print reason
                    print >> out, "%-15s" % reasons.get(f, "unknown"),
                print >> out, ("%-" + str(maxlen_f) + "s") % os.path.relpath(f, params.workdir), cell,
                #print >>out, "ISa=%5.1f" % correctlp.get_ISa(os.path.join(os.path.dirname(f), "CORRECT.LP")),
                print >> out, "Cmpl=%3.0f%%, Resn= %.1f" % (cmpl, resn)

        ret = []
        # XSCALE tables list shells inner->outer with overall last;
        # reorder as (overall, inner, outer).
        tkvals = lambda x: (x[-1], x[0], x[-2])  # overall, inner, outer
        for i in xrange(1, cycles.get_last_cycle_number() + 1):
            wd = os.path.join(workdir, "run_%.2d" % i)
            xscale_lp = os.path.join(wd, "XSCALE.LP")
            table = xscalelp.read_stats_table(xscale_lp)
            num_files = len(xscalelp.get_read_data(xscale_lp))
            xtriage_logfile = os.path.join(wd, "ccp4", "logfile.log")
            aniso = xds_aniso_analysis.parse_logfile(os.path.join(wd, "aniso.log"))
            cellinfo = cycles.cell_info_at_cycles[i]
            ret.append([i, wd, num_files,
                        dict(cmpl=tkvals(table["cmpl"]),
                             redundancy=tkvals(table["redundancy"]),
                             i_over_sigma=tkvals(table["i_over_sigma"]),
                             r_meas=tkvals(table["r_meas"]),
                             cc_half=tkvals(table["cc_half"]),
                             sig_ano=tkvals(table["sig_ano"]),
                             cc_ano=tkvals(table["cc_ano"]),
                             drange=tkvals(table["d_range"]),
                             lp=xscale_lp,
                             xtriage_log=xtriage.XtriageLogfile(xtriage_logfile),
                             aniso=aniso,
                             lcv=cellinfo[1],
                             alcv=cellinfo[2],
                             dmin_est=cycles.dmin_est_at_cycles.get(i, float("nan")))])

        xscale_lp = os.path.join(cycles.current_working_dir(), "XSCALE.LP")
        print >> out, "\nFinal statistics:\n"
        print >> out, xscalelp.snip_stats_table(xscale_lp)
        return ret

    elif params.program == "aimless":
        worker = Pointless()
        print >> out, "\nRunning pointless"
        runinfo = worker.run_copy(hklout="pointless.mtz",
                                  wdir=workdir,
                                  xdsin=xds_files,
                                  logout=os.path.join(workdir, "pointless.log"),
                                  tolerance=30)
        # Table of file name -> Batch range
        assert len(xds_files) == len(runinfo)
        batch_info = collections.OrderedDict(map(lambda x: (x[0], (x[1][1:3])), zip(xds_files, runinfo)))

        cycles = multi_merging.aimless.AimlessCycles(
            workdir,
            anomalous_flag=params.anomalous,
            d_min=params.d_min,
            d_max=params.d_max,
            reject_method=params.reject_method,
            cc_cutoff=params.rejection.lpstats.pwcc.abs_cutoff,
            delta_cchalf_bin=params.rejection.delta_cchalf.bin,
            mtzin=os.path.join(workdir, "pointless.mtz"),
            batch_info=batch_info,
            out=out,
            nproc=params.nproc,
            nproc_each=params.batch.nproc_each,
            batchjobs=None)  # FIXME batchjobs
        unused_files, reasons = cycles.run_cycles(xds_files)
        used_files = set(xds_files).difference(set(unused_files))

        print >> out
        print >> out, " SUMMARY "
        print >> out, "========================"
        for i, files in enumerate((used_files, unused_files)):
            print >> out, "\n%6s %4d files:\n" % (("Used", "Unused")[i], len(files))
            if len(files) == 0:
                continue
            maxlen_f = max(map(lambda f: len(os.path.relpath(f, params.workdir)), files))
            for f in files:
                cell = cells[f]
                merge_log = os.path.join(os.path.dirname(f), "merging_stats.log")
                try:
                    lines = open(merge_log).readlines()
                    resn = float(filter(lambda x: x.startswith("Resolution:"), lines)[0].split()[-1])
                    cmpl = float(filter(lambda x: x.startswith("Completeness:"), lines)[0].split()[-1].replace("%", ""))
                except Exception:  # was bare except: missing/garbled log falls back to nan
                    resn = float("nan")
                    cmpl = float("nan")
                if i == 1:  # print reason
                    print >> out, "%-15s" % reasons.get(f, "unknown"),
                print >> out, ("%-" + str(maxlen_f) + "s") % os.path.relpath(f, params.workdir), cell,
                print >> out, "ISa=%5.1f" % correctlp.get_ISa(os.path.join(os.path.dirname(f), "CORRECT.LP")),
                print >> out, "Cmpl=%3.0f%%, Resn= %.1f" % (cmpl, resn)

        aimless_log = os.path.join(cycles.current_working_dir(), "aimless.log")
        print >> out, "\nFinal statistics:\n"
        print >> out, aimless.snip_summary(aimless_log)

        # Write summary
        table = aimless.read_summary(aimless_log)
        tkvals = lambda x: (x[0], x[1], x[2])  # overall, inner, outer
        return [[cycles.get_last_cycle_number(),
                 cycles.current_working_dir(),
                 len(used_files),
                 dict(cmpl=tkvals(table["cmpl"]),
                      redundancy=tkvals(table["redundancy"]),
                      i_over_sigma=tkvals(table["i_over_sigma"]),
                      r_meas=tkvals(table["r_meas"]),
                      cc_half=tkvals(table["cc_half"]),
                      sig_ano=(float("nan"),) * 3,
                      cc_ano=tkvals(table["cc_ano"]))],
                ]
        #print >>out, "\nRunning aimless"
        #aimless.run_aimless(mtzin="pointless.mtz",
        #                    wdir=workdir,
        #                    anomalous=params.anomalous, d_min=params.d_min, prefix=None)
    else:
        print >> out, "Unknown program:", params.program
        return []
def run_cycle(self, xds_files, do_rejection=True):
    """Run one aimless merging cycle and optionally reject bad datasets.

    Runs aimless in self.workdir over `xds_files`; when delta-CC1/2 rejection
    removes files (or another rejection bin is queued), recurses into a fresh
    working directory with the surviving files.

    Args:
        xds_files: list of input file paths; batch ranges come from
            self.batch_info.
        do_rejection: when False, just run aimless once and return.
    """
    if len(xds_files) == 0:
        print >> self.out, "Error: no files given."
        return

    # Build the RUN/BATCH stdin assignments: one aimless RUN per input file.
    inp_str = ""
    for i, f in enumerate(xds_files):
        brange = self.batch_info[f]
        inp_str += "RUN %3d BATCH %4d to %4d\n" % (i + 1, brange[0], brange[1])

    print >> self.out, "DEBUG:: running aimless with %3d files.." % len(xds_files)
    aimless.run_aimless(mtzin=os.path.relpath(self.mtzin, self.workdir),
                        wdir=self.workdir,
                        anomalous=self.anomalous_flag,
                        d_min=self.d_min,
                        prefix="aimless",
                        add_stdin=inp_str)

    aimless_log = os.path.join(self.workdir, "aimless.log")

    # XXX Aimless error handling here.

    if not do_rejection:
        return

    # Remove bad data
    remove_idxes = []

    if self.reject_method == "delta_cc1/2":
        print >> self.out, "Rejection based on delta_CC1/2 in %s shell" % self.delta_cchalf_bin
        table = aimless.read_summary(aimless_log)
        # Column 0 is the overall shell, column 2 the outer shell.
        i_stat = 0 if self.delta_cchalf_bin == "total" else 2
        prev_cchalf = table["cc_half"][i_stat]
        prev_nuniq = table["nuniq"][i_stat]
        # file_name->idx table
        remaining_files = collections.OrderedDict(map(lambda x: x[::-1], enumerate(xds_files)))

        for i in xrange(len(xds_files) - 1):  # if only one file, cannot proceed.
            tmpdir = os.path.join(self.workdir, "reject_test_%.3d" % i)

            cchalf_list = aimless.calc_cchalf_by_removing(wdir=tmpdir,
                                                          mtzin=self.mtzin,
                                                          batch_info=self.batch_info,
                                                          inpfiles=remaining_files.keys(),
                                                          anomalous_flag=self.anomalous_flag,
                                                          d_min=self.d_min,
                                                          stat_bin=self.delta_cchalf_bin,
                                                          nproc=self.nproc,
                                                          nproc_each=self.nproc_each,
                                                          batchjobs=self.batchjobs)

            rem_idx, cc_i, nuniq_i = cchalf_list[0]  # First (largest) is worst one to remove.
            # Map the index within the shrinking table back to the original list.
            rem_idx_in_org = remaining_files[remaining_files.keys()[rem_idx]]

            # Decision making by CC1/2: keep removing while cc*nuniq improves.
            print >> self.out, "DEBUG:: remove %3d if %.4f*%d > %.4f*%d" % (rem_idx_in_org, cc_i, nuniq_i, prev_cchalf, prev_nuniq)
            if cc_i * nuniq_i <= prev_cchalf * prev_nuniq:
                break
            print >> self.out, "Removing idx= %3d gains CC1/2 by %.4f" % (rem_idx_in_org, cc_i - prev_cchalf)

            prev_cchalf, prev_nuniq = cc_i, nuniq_i
            remove_idxes.append(rem_idx_in_org)
            del remaining_files[remaining_files.keys()[rem_idx]]  # remove file from table
    else:
        # BUG FIX: original referenced the undefined local `reject_method`
        # (NameError when this branch is reached); the attribute is
        # self.reject_method.
        print >> self.out, "ERROR:: Unsupported reject_method (%s)" % self.reject_method

    if len(remove_idxes) > 0:
        print >> self.out, "DEBUG:: Need to remove %d files" % len(remove_idxes)
        for i in sorted(remove_idxes):
            print >> self.out, " %.3d %s" % (i + 1, xds_files[i])
            self.removed_files.append(xds_files[i])
            self.removed_reason[xds_files[i]] = "badcc"

    # Queue the next delta-CC1/2 bin (if any) before deciding to recurse.
    if self.next_delta_cchalf_bin != []:
        self.delta_cchalf_bin = self.next_delta_cchalf_bin.pop(0)
        do_rejection = True
    else:
        do_rejection = False

    if do_rejection or len(remove_idxes) > 0:
        keep_idxes = filter(lambda x: x not in remove_idxes, xrange(len(xds_files)))
        self.workdir = self.request_next_workdir()
        self.run_cycle(map(lambda i: xds_files[i], keep_idxes), do_rejection=do_rejection)
def merge_datasets(params, workdir, xds_files, cells, batchjobs):
    """Merge the given XDS_ASCII files with xscale or aimless and summarize.

    Runs the rejection/merging cycles under `workdir`, writes a human-readable
    summary of used/unused files to workdir/merge.log, and collects per-cycle
    merging statistics.

    Args:
        params: phil-style parameter object (program, anomalous, d_min, ...).
        workdir: directory where merging runs and merge.log are created.
        xds_files: list of XDS_ASCII.HKL file paths to merge.
        cells: dict mapping file path -> unit cell (echoed in the summary).
        batchjobs: batch-job runner, passed to the cycles only when
            "deltacchalf" is enabled in params.batch.par_run.

    Returns:
        For xscale: a list of [cycle_number, run_dir, num_files, stats_dict],
        one entry per cycle. For aimless: a single-entry list of the same
        shape. For an unknown program: [].
    """
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    out = open(os.path.join(workdir, "merge.log"), "w")

    if params.program == "xscale":
        cycles = multi_merging.xscale.XscaleCycles(
            workdir,
            anomalous_flag=params.anomalous,
            d_min=params.d_min,
            d_max=params.d_max,
            reject_method=params.reject_method,
            reject_params=params.rejection,
            xscale_params=params.xscale,
            reference_file=params.reference_file,
            out=out,
            nproc=params.nproc,
            nproc_each=params.batch.nproc_each,
            batchjobs=batchjobs if "deltacchalf" in params.batch.par_run else None)

        # run_cycles() returns the files rejected over all cycles and why.
        unused_files, reasons = cycles.run_cycles(xds_files)
        used_files = set(xds_files).difference(set(unused_files))

        print >>out
        print >>out, " SUMMARY "
        print >>out, "========================"
        for i, files in enumerate((used_files, unused_files)):
            print >>out, "\n%6s %4d files:\n" % (("Used", "Unused")[i], len(files))
            if len(files) == 0:
                continue
            # Width of the longest relative path, for column alignment.
            # NOTE(review): relpath is taken against params.workdir, not the
            # local `workdir` argument -- confirm this is intentional.
            maxlen_f = max(map(lambda f: len(os.path.relpath(f, params.workdir)), files))
            for f in files:
                cell = cells[f]
                merge_log = os.path.join(os.path.dirname(f), "merging_stats.log")
                try:
                    lines = open(merge_log).readlines()
                    resn = float(filter(lambda x:x.startswith("Resolution:"), lines)[0].split()[-1])
                    cmpl = float(filter(lambda x:x.startswith("Completeness:"), lines)[0].split()[-1].replace("%",""))
                except Exception:  # was bare except: missing/garbled log falls back to nan
                    resn = float("nan")
                    cmpl = float("nan")
                if i == 1:  # print reason
                    print >>out, "%-15s"%reasons.get(f, "unknown"),
                print >>out, ("%-"+str(maxlen_f)+"s")%os.path.relpath(f, params.workdir), cell,
                #print >>out, "ISa=%5.1f" % correctlp.get_ISa(os.path.join(os.path.dirname(f), "CORRECT.LP")),
                print >>out, "Cmpl=%3.0f%%, Resn= %.1f" % (cmpl, resn)

        ret = []
        # XSCALE tables list shells inner->outer with overall last;
        # reorder as (overall, inner, outer).
        tkvals = lambda x: (x[-1], x[0], x[-2])  # overall, inner, outer
        for i in xrange(1, cycles.get_last_cycle_number()+1):
            wd = os.path.join(workdir, "run_%.2d"%i)
            xscale_lp = os.path.join(wd, "XSCALE.LP")
            table = xscalelp.read_stats_table(xscale_lp)
            num_files = len(xscalelp.get_read_data(xscale_lp))
            xtriage_logfile = os.path.join(wd, "ccp4", "logfile.log")
            ret.append([i, wd, num_files,
                        dict(cmpl=tkvals(table["cmpl"]),
                             redundancy=tkvals(table["redundancy"]),
                             i_over_sigma=tkvals(table["i_over_sigma"]),
                             r_meas=tkvals(table["r_meas"]),
                             cc_half=tkvals(table["cc_half"]),
                             sig_ano=tkvals(table["sig_ano"]),
                             cc_ano=tkvals(table["cc_ano"]),
                             drange=tkvals(table["d_range"]),
                             lp=xscale_lp,
                             xtriage_log=xtriage.XtriageLogfile(xtriage_logfile))
                        ])

        xscale_lp = os.path.join(cycles.current_working_dir(), "XSCALE.LP")
        print >>out, "\nFinal statistics:\n"
        print >>out, xscalelp.snip_stats_table(xscale_lp)
        return ret

    elif params.program == "aimless":
        worker = Pointless()
        print >>out, "\nRunning pointless"
        runinfo = worker.run_copy(hklout="pointless.mtz",
                                  wdir=workdir,
                                  xdsin=xds_files,
                                  logout=os.path.join(workdir, "pointless.log"),
                                  tolerance=30)
        # Table of file name -> Batch range
        assert len(xds_files) == len(runinfo)
        batch_info = collections.OrderedDict(map(lambda x: (x[0], (x[1][1:3])), zip(xds_files, runinfo)))

        cycles = multi_merging.aimless.AimlessCycles(
            workdir,
            anomalous_flag=params.anomalous,
            d_min=params.d_min,
            d_max=params.d_max,
            reject_method=params.reject_method,
            cc_cutoff=params.rejection.lpstats.pwcc.abs_cutoff,
            delta_cchalf_bin=params.rejection.delta_cchalf.bin,
            mtzin=os.path.join(workdir, "pointless.mtz"),
            batch_info=batch_info,
            out=out,
            nproc=params.nproc,
            nproc_each=params.batch.nproc_each,
            batchjobs=batchjobs if "deltacchalf" in params.batch.par_run else None)
        unused_files, reasons = cycles.run_cycles(xds_files)
        used_files = set(xds_files).difference(set(unused_files))

        print >>out
        print >>out, " SUMMARY "
        print >>out, "========================"
        for i, files in enumerate((used_files, unused_files)):
            print >>out, "\n%6s %4d files:\n" % (("Used", "Unused")[i], len(files))
            if len(files) == 0:
                continue
            maxlen_f = max(map(lambda f: len(os.path.relpath(f, params.workdir)), files))
            for f in files:
                cell = cells[f]
                merge_log = os.path.join(os.path.dirname(f), "merging_stats.log")
                try:
                    lines = open(merge_log).readlines()
                    resn = float(filter(lambda x:x.startswith("Resolution:"), lines)[0].split()[-1])
                    cmpl = float(filter(lambda x:x.startswith("Completeness:"), lines)[0].split()[-1].replace("%",""))
                except Exception:  # was bare except: missing/garbled log falls back to nan
                    resn = float("nan")
                    cmpl = float("nan")
                if i == 1:  # print reason
                    print >>out, "%-15s"%reasons.get(f, "unknown"),
                print >>out, ("%-"+str(maxlen_f)+"s")%os.path.relpath(f, params.workdir), cell,
                print >>out, "ISa=%5.1f" % correctlp.get_ISa(os.path.join(os.path.dirname(f), "CORRECT.LP")),
                print >>out, "Cmpl=%3.0f%%, Resn= %.1f" % (cmpl, resn)

        aimless_log = os.path.join(cycles.current_working_dir(), "aimless.log")
        print >>out, "\nFinal statistics:\n"
        print >>out, aimless.snip_summary(aimless_log)

        # Write summary
        table = aimless.read_summary(aimless_log)
        tkvals = lambda x: (x[0], x[1], x[2])  # overall, inner, outer
        return [[cycles.get_last_cycle_number(),
                 cycles.current_working_dir(),
                 len(used_files),
                 dict(cmpl=tkvals(table["cmpl"]),
                      redundancy=tkvals(table["redundancy"]),
                      i_over_sigma=tkvals(table["i_over_sigma"]),
                      r_meas=tkvals(table["r_meas"]),
                      cc_half=tkvals(table["cc_half"]),
                      sig_ano=(float("nan"),)*3,
                      cc_ano=tkvals(table["cc_ano"]))],
                ]
        #print >>out, "\nRunning aimless"
        #aimless.run_aimless(mtzin="pointless.mtz",
        #                    wdir=workdir,
        #                    anomalous=params.anomalous, d_min=params.d_min, prefix=None)
    else:
        print >>out, "Unknown program:", params.program
        return []
def run_cycle(self, xds_files, do_rejection=True):
    """Run one aimless merging cycle and optionally reject bad datasets.

    Runs aimless in self.workdir over `xds_files`; when delta-CC1/2 rejection
    removes files (or another rejection bin is queued), recurses into a fresh
    working directory with the surviving files.

    Args:
        xds_files: list of input file paths; batch ranges come from
            self.batch_info.
        do_rejection: when False, just run aimless once and return.
    """
    if len(xds_files) == 0:
        print >> self.out, "Error: no files given."
        return

    # Build the RUN/BATCH stdin assignments: one aimless RUN per input file.
    inp_str = ""
    for i, f in enumerate(xds_files):
        brange = self.batch_info[f]
        inp_str += "RUN %3d BATCH %4d to %4d\n" % (i + 1, brange[0], brange[1])

    print >> self.out, "DEBUG:: running aimless with %3d files.." % len(
        xds_files)
    aimless.run_aimless(mtzin=os.path.relpath(self.mtzin, self.workdir),
                        wdir=self.workdir,
                        anomalous=self.anomalous_flag,
                        d_min=self.d_min,
                        prefix="aimless",
                        add_stdin=inp_str)

    aimless_log = os.path.join(self.workdir, "aimless.log")

    # XXX Aimless error handling here.

    if not do_rejection:
        return

    # Remove bad data
    remove_idxes = []

    if self.reject_method == "delta_cc1/2":
        print >> self.out, "Rejection based on delta_CC1/2 in %s shell" % self.delta_cchalf_bin
        table = aimless.read_summary(aimless_log)
        # Column 0 is the overall shell, column 2 the outer shell.
        i_stat = 0 if self.delta_cchalf_bin == "total" else 2
        prev_cchalf = table["cc_half"][i_stat]
        prev_nuniq = table["nuniq"][i_stat]
        # file_name->idx table
        remaining_files = collections.OrderedDict(
            map(lambda x: x[::-1], enumerate(xds_files)))

        for i in xrange(len(xds_files) - 1):  # if only one file, cannot proceed.
            tmpdir = os.path.join(self.workdir, "reject_test_%.3d" % i)

            cchalf_list = aimless.calc_cchalf_by_removing(
                wdir=tmpdir,
                mtzin=self.mtzin,
                batch_info=self.batch_info,
                inpfiles=remaining_files.keys(),
                anomalous_flag=self.anomalous_flag,
                d_min=self.d_min,
                stat_bin=self.delta_cchalf_bin,
                nproc=self.nproc,
                nproc_each=self.nproc_each,
                batchjobs=self.batchjobs)

            rem_idx, cc_i, nuniq_i = cchalf_list[0]  # First (largest) is worst one to remove.
            # Map the index within the shrinking table back to the original list.
            rem_idx_in_org = remaining_files[remaining_files.keys()[rem_idx]]

            # Decision making by CC1/2: keep removing while cc*nuniq improves.
            print >> self.out, "DEBUG:: remove %3d if %.4f*%d > %.4f*%d" % (
                rem_idx_in_org, cc_i, nuniq_i, prev_cchalf, prev_nuniq)
            if cc_i * nuniq_i <= prev_cchalf * prev_nuniq:
                break
            print >> self.out, "Removing idx= %3d gains CC1/2 by %.4f" % (
                rem_idx_in_org, cc_i - prev_cchalf)

            prev_cchalf, prev_nuniq = cc_i, nuniq_i
            remove_idxes.append(rem_idx_in_org)
            del remaining_files[remaining_files.keys()[rem_idx]]  # remove file from table
    else:
        # BUG FIX: original referenced the undefined local `reject_method`
        # (NameError when this branch is reached); the attribute is
        # self.reject_method.
        print >> self.out, "ERROR:: Unsupported reject_method (%s)" % self.reject_method

    if len(remove_idxes) > 0:
        print >> self.out, "DEBUG:: Need to remove %d files" % len(
            remove_idxes)
        for i in sorted(remove_idxes):
            print >> self.out, " %.3d %s" % (i + 1, xds_files[i])
            self.removed_files.append(xds_files[i])
            self.removed_reason[xds_files[i]] = "badcc"

    # Queue the next delta-CC1/2 bin (if any) before deciding to recurse.
    if self.next_delta_cchalf_bin != []:
        self.delta_cchalf_bin = self.next_delta_cchalf_bin.pop(0)
        do_rejection = True
    else:
        do_rejection = False

    if do_rejection or len(remove_idxes) > 0:
        keep_idxes = filter(lambda x: x not in remove_idxes,
                            xrange(len(xds_files)))
        self.workdir = self.request_next_workdir()
        self.run_cycle(map(lambda i: xds_files[i], keep_idxes),
                       do_rejection=do_rejection)