def score_set(
        set_id,
        bg_dist_mean,
        primers,
        chr_ends,
        score_fun,
        score_expression,
        max_fg_bind_dist,
        bg_genome_len=None,
        interactive=False):
    """Score one primer set and, if it qualifies, add it to the database.

    Args:
        set_id: Identifier to store the set under. Replaced by a freshly
            generated negative id when ``interactive`` is true and the user
            confirms the addition.
        bg_dist_mean: Value forwarded to ``score_fun`` as ``bg_dist_mean``.
            When falsy it is derived as
            ``bg_genome_len / sum(p.bg_freq for p in primers)``.
        primers: Iterable of primer objects exposing ``bg_freq``.
        chr_ends: Passed to ``swga.locate.linearize_binding_sites`` together
            with ``primers``.
        score_fun: Callable accepting the keyword arguments ``primer_set``,
            ``primer_locs``, ``max_dist`` and ``bg_dist_mean`` and returning
            ``(score, variables)`` where ``variables`` is a mapping.
        score_expression: Stored alongside the set as ``scoring_fn``.
        max_fg_bind_dist: Upper bound on ``max_dist``; only enforced for
            non-interactive calls.
        bg_genome_len: Used to derive ``bg_dist_mean`` when that is falsy.
        interactive: True for a user-supplied set. The statistics are shown
            and the user is asked whether to store the set.

    Returns:
        A ``(set_added, max_dist)`` tuple. ``set_added`` is False when the
        set was filtered out, declined by the user, or not stored by
        ``swga.database.add_set`` (which signals that by returning None).
    """
    binding_locations = swga.locate.linearize_binding_sites(primers, chr_ends)
    max_dist = max(swga.score.seq_diff(binding_locations))

    # Abort now if it's not passing filter (and it's not a user-supplied set)
    if not interactive and max_dist > max_fg_bind_dist:
        return False, max_dist

    if not bg_dist_mean and not bg_genome_len:
        swga.error("Neither background length nor ratio were provided, "
                   "cannot calculate bg_dist_mean")
    elif not bg_dist_mean:
        bg_dist_mean = float(bg_genome_len) / sum(p.bg_freq for p in primers)

    set_score, variables = score_fun(
        primer_set=primers,
        primer_locs=binding_locations,
        max_dist=max_dist,
        bg_dist_mean=bg_dist_mean)

    # If it's user-supplied, they have the option of not adding it to db
    if interactive:
        # Entries from `variables` win on key clashes, exactly as they did
        # when the two item lists were concatenated. Building the dict this
        # way does not rely on items() returning a list.
        set_dict = {'score': set_score, 'scoring_fn': score_expression}
        set_dict.update(variables)
        swga.message("Set statistics:\n - " + "\n - ".join(
            fmtkv(k, v) for k, v in set_dict.items()))

        if not click.confirm("Add set to database?"):
            # Declined: report "not added" instead of falling through to a
            # return of a name that was never bound.
            return False, max_dist

        # User-provided sets have negative numbers, so we find the smallest
        # existing id and go one below it. The aggregate is None on an empty
        # table, and clamping at zero keeps the new id negative even when
        # only positive ids exist so far.
        lowest_id = Set.select(fn.Min(Set._id)).scalar()
        set_id = min(lowest_id or 0, 0) - 1

    s = swga.database.add_set(
        _id=set_id,
        primers=primers,
        score=set_score,
        scoring_fn=score_expression,
        **variables)
    set_added = s is not None

    if interactive:
        if set_added:
            swga.message("Set {} added successfully.".format(set_id))
        else:
            swga.message("That primer set already exists.")

    return set_added, max_dist
def main(argv, cfg_file):
    """Run the set-finding command end to end.

    Parses ``argv`` for both the 'find_sets' and the 'score' command, removes
    any sets already stored in the primer database (asking for confirmation
    unless the force option is set), calls ``make_graph``, searches for
    candidate sets with one or several worker processes, and passes the
    results to ``score_sets``.
    """
    find_cmd = Command('find_sets', cfg_file=cfg_file)
    scoring_cmd = Command('score', cfg_file=cfg_file)
    for command in (find_cmd, scoring_cmd):
        command.parse_args(argv)

    init_db(find_cmd.primer_db)

    # Previously stored sets must go before new ones are added, otherwise the
    # uniqueness constraints on the table would be violated.
    existing_sets = Set.select()
    if existing_sets.count() > 0:
        if not find_cmd.force:
            click.confirm("Remove all previously-found sets?", abort=True)
        tracked_sets = progress.bar(
            existing_sets, expected_size=existing_sets.count())
        for stale_set in tracked_sets:
            stale_set.primers.clear()
            stale_set.delete_instance()

    make_graph(find_cmd.max_dimer_bp, graph_fname)

    swga.message("Now finding sets. If nothing appears, try relaxing your parameters.")

    single_process = find_cmd.workers <= 1
    min_bg_bind_dist = find_cmd.min_bg_bind_dist
    min_size = find_cmd.min_size
    max_size = find_cmd.max_size
    bg_genome_len = find_cmd.bg_genome_len

    if single_process:
        setlines = setfinder.find_sets(
            min_bg_bind_dist,
            min_size,
            max_size,
            bg_genome_len,
            graph_fp=graph_fname)
    else:
        setlines = setfinder.mp_find_sets(
            nprocesses=find_cmd.workers,
            graph_fp=graph_fname,
            min_bg_bind_dist=min_bg_bind_dist,
            min_size=min_size,
            max_size=max_size,
            bg_genome_len=bg_genome_len)

    score_sets(
        setlines,
        find_cmd.fg_genome_fp,
        scoring_cmd.score_expression,
        find_cmd.max_fg_bind_dist,
        find_cmd.max_sets)
def summary(primer_db, fg_length, bg_length):
    """Print a summary report of the primers and sets in a primer database.

    Args:
        primer_db: Location of the primer database; passed to
            ``swga.database.init_db`` and shown in the report as an absolute
            path.
        fg_length: Divisor for the average foreground binding count
            (converted with ``float``).
        bg_length: Divisor for the average background binding count
            (converted with ``float``).

    The report is written with ``puts`` inside ``indent(2)``, after being
    passed through ``max_width(..., 80)``; nothing is returned.

    NOTE: the message templates below are filled in with
    ``.format(**locals())``. Every ``{name}`` placeholder therefore refers to
    a local variable of this function: renaming a local, or changing the
    order in which locals are rebound, changes the rendered report.
    """
    db = swga.database.init_db(primer_db)
    db.connect()
    swga.database.create_tables(drop=False)
    # One aggregate query: mean fg/bg binding frequency and the primer count.
    avg_fg_bind, avg_bg_bind, nprimers = (
        Primer
        .select(fn.Avg(Primer.fg_freq),
                fn.Avg(Primer.bg_freq),
                fn.Count(Primer.seq))
        .scalar(as_tuple=True))
    if (avg_fg_bind is None) or (avg_bg_bind is None):
        # NOTE(review): elsewhere swga.error() is called without `raise`;
        # presumably it raises or exits on its own -- confirm that its return
        # value is never actually reached by this `raise`.
        raise swga.error(
            "Could not calculate summary statistics; database may be corrupt")
    fg_bind_ratio = avg_fg_bind / float(fg_length)
    bg_bind_ratio = avg_bg_bind / float(bg_length)
    nactive = Primer.select().where(Primer.active==True).count()
    # Melting-temperature statistics are restricted to active primers.
    min_tm, max_tm, avg_tm = (
        Primer
        .select(fn.Min(Primer.tm),
                fn.Max(Primer.tm),
                fn.Avg(Primer.tm))
        .where(Primer.active==True)
        .scalar(as_tuple=True))
    nsets = Set.select(fn.Count(Set._id)).scalar()
    if nsets > 0:
        # The "best" set is the first row when ordering by Set.score
        # (presumably ascending, i.e. the lowest score -- verify against the
        # scoring convention).
        bs = Set.select().order_by(Set.score).limit(1).get()
        bs_primers = ", ".join(swga.database.get_primers_for_set(bs._id)).strip()
        best_set = bs._id
        bs_size = bs.set_size
        bs_score = bs.score
        # Every stored field of the set except the ones already reported
        # separately.
        bs_stats = "- "+"\n - ".join(
            fmtkv(k, v)
            for k, v in bs.__dict__['_data'].items()
            if k not in ["_id", "pids", "score"]
        )
    version_header = (
        "---------------------\n"
        "==== SWGA v{version} ====\n"
        "---------------------\n"
        .format(version=swga.__version__)
    )
    # NOTE(review): the whitespace inside the triple-quoted templates below is
    # reproduced exactly as it appears in this view of the file; confirm the
    # intended line breaks against the original source.
    summary_msg = """ {version_header} PRIMER SUMMARY --------------- There are {nprimers} primers in the database. {nactive} are marked as active (i.e., they passed filter steps and will be used to find sets of compatible primers.) {ifzero_primers_msg} The average number of foreground genome binding sites is {avg_fg_bind:.0f}. (avg binding / genome_length = {fg_bind_ratio:05f}) The average number of background genome binding sites is {avg_bg_bind:.0f}. (avg binding / genome_length = {bg_bind_ratio:05f}) {melting_tmp_msg} SETS SUMMARY --------------- There are {nsets} sets in the database. {set_msg}--------------- Report generated from {primer_db} """
    # Hint shown only when no primer is active yet.
    ifzero_primers_msg = colored.green(
        "Run `swga filter` to identify primers to use."
        if nactive == 0 else "")
    melting_tmp_msg = (
        """The melting temp of the primers ranges between {min_tm:.2f}C and {max_tm:.2f}C with an average of {avg_tm:.2f}C."""
        if nactive > 0 and min_tm and max_tm
        else "No melting temps have been calculated yet.")
    ifzero_sets_msg = colored.green(
        "Run `swga find_sets` after identifying valid primers to begin collecting sets.\n")
    # The template choices above and below compare the raw numeric counts, so
    # they must run before the counts are rebound to colored values further
    # down.
    set_msg = (""" The best scoring set is #{best_set}, with {bs_size} primers and a score of {bs_score:03f}.\nVarious statistics: {bs_stats} The primers in Set {best_set} are: {bs_primers} """
               if nsets > 0 else ifzero_sets_msg)
    primer_db = os.path.abspath(primer_db)
    # From here on the counts are display values, no longer plain numbers.
    nprimers = colored.blue(nprimers, bold=True)
    nactive = colored.blue(nactive, bold=True)
    nsets = colored.blue(nsets, bold=True)
    # Render the nested fragments first so that the final template picks up
    # their formatted versions from locals().
    set_msg = set_msg.format(**locals())
    melting_tmp_msg = melting_tmp_msg.format(**locals())
    version_header = colored.green(version_header)
    summary_msg = summary_msg.format(**locals())
    with indent(2):
        puts(max_width(summary_msg, 80))