def make_graph(max_hetdimer_bind, outfile):
    '''
    Selects all active primers and outputs a primer compatibility graph.

    The active primers are fetched in order of descending ratio, re-numbered
    1..n in that order, and the new ids are saved with `update_in_chunks`.
    The edges returned by `graph.test_pairs` are then written to `outfile`
    with `graph.write_graph`.

    Arguments:
    - max_hetdimer_bind: passed through unchanged to `graph.test_pairs`
    - outfile: path of the graph file to write (opened in binary mode)
    '''
    # Reset all the primer IDs (as ids are only used for set_finder)
    Primer.update(_id=-1).execute()

    # `== True` is deliberate: peewee builds the SQL condition from the
    # overloaded operator, so it must not be replaced with `is True`.
    primers = list(
        Primer.select()
        .where(Primer.active == True)
        .order_by(Primer.ratio.desc())
        .execute())

    if not primers:
        # The query above looks for active *primers*, so report that rather
        # than "sets".
        swga.error("No active primers found. Run `swga filter` first.")

    for new_id, primer in enumerate(primers, 1):
        primer._id = new_id
    update_in_chunks(primers, show_progress=False)

    swga.message("Composing primer compatibility graph...")
    edges = graph.test_pairs(primers, max_hetdimer_bind)

    if not edges:
        swga.error(
            "No compatible primers. Try relaxing your parameters.",
            exception=False)

    with open(outfile, 'wb') as out:
        graph.write_graph(primers, edges, out)
def primers():
    '''
    Creates three active primers in the database and returns them as a list,
    in the order they were created.
    '''
    # (seq, fg_freq, bg_freq, ratio)
    rows = [
        ("ATGC", 1, 2, 1.0),
        ("GGCC", 1, 3, 0.5),
        ("CCTA", 2, 0, float('inf')),
    ]
    created = []
    for seq, fg_freq, bg_freq, ratio in rows:
        created.append(
            Primer.create(
                seq=seq,
                fg_freq=fg_freq,
                bg_freq=bg_freq,
                ratio=ratio,
                active=True))
    return created
def test_bad_add_set_function(self, initdb, tprimers):
    '''
    `database.add_set` must raise `swga.SWGAError` when it is given no
    primers at all, or a query for a sequence ("XX") that these tests never
    create.
    '''
    # Case 1: primers=None
    with pytest.raises(swga.SWGAError):
        database.add_set(_id=2, primers=None, score=100)

    # Case 2: a primer query built for the sequence "XX"
    with pytest.raises(swga.SWGAError):
        no_such_primers = Primer.select().where(Primer.seq == "XX")
        database.add_set(_id=3, primers=no_such_primers, score=100)
def test_create_tables(self):
    '''
    After `database.create_tables()` on an in-memory database it must be
    possible to create a primer and a set and to link the two.
    '''
    database.db.init(":memory:")
    database.create_tables()

    primer = Primer.create(seq="ATGC")
    primer_set = Set.create(_id=1, score=1)
    primer_set.primers.add(primer)

    database.db.close()
def test_update_in_chunks(self, initdb, tprimers, seqs):
    '''
    Changes made to in-memory primers must all reach the database once
    `database.update_in_chunks` has been called.
    '''
    expected_freq = 100
    for record in tprimers:
        record.fg_freq = expected_freq
    database.update_in_chunks(tprimers)

    # Re-read the primers from the database instead of trusting the
    # in-memory objects that were just modified.
    reloaded = Primer.select().where(Primer.seq << seqs)
    for record in reloaded:
        assert record.fg_freq == expected_freq
def update_Tms(primers):
    '''
    Fills in the melting temperature of the given primers that lack one.

    Only primers whose `tm` is NULL in the database are touched: `update_tm`
    is called on each, and the results are saved with
    `swga.database.update_in_chunks`.
    '''
    # peewee operators: `<<` is SQL IN, `>> None` is SQL IS NULL
    lacks_tm = (Primer.seq << primers) & (Primer.tm >> None)
    pending = list(Primer.select().where(lacks_tm))

    for record in pending:
        record.update_tm()

    swga.database.update_in_chunks(pending, label="Updating primer db... ")
def test_destroy_primer_sets(self, initdb, tprimers, tset):
    '''
    Deleting a set must leave its primers in place but remove the set from
    each primer's `sets` (the many-to-many relationship is updated).
    '''
    expected = 5
    linked = Primer.select().limit(expected)
    tset.primers.add(linked)

    tset.delete_instance()

    # The primers themselves survive the deletion of the set...
    assert linked.count() == expected
    # ...but none of them refers to the deleted set any more.
    for primer in linked:
        assert tset not in primer.sets
def test_create_primers_sets(self, initdb, tprimers, tset):
    '''
    After primers are added to a set, the link must be visible from both
    sides: the set is in each primer's `sets` and each primer is in the
    set's `primers`.
    '''
    expected = 5
    linked = Primer.select().limit(expected)
    tset.primers.add(linked)

    assert linked.count() == expected
    for primer in linked:
        assert tset in primer.sets
        assert primer in tset.primers
def filter_primers(
        primers,
        min_fg_bind,
        max_bg_bind,
        fg_length,
        bg_length,
        min_tm,
        max_tm,
        max_primers):
    """
    Looks up the given sequences in the database and returns a query for
    those that pass every filter.

    Filters, in order:
    1. foreground binding frequency >= `min_fg_bind`
    2. background binding frequency <= `max_bg_bind`
    3. melting temperature within [`min_tm`, `max_tm`] (missing melting
       temperatures are computed first via `swga.primers.update_Tms`)

    Of the survivors, the `max_primers` with the lowest background binding
    frequency are kept and returned ordered by ratio, highest first. A
    progress message is emitted after each filter.

    `fg_length` and `bg_length` are not used by this function.
    """
    requested = Primer.select().where(Primer.seq << primers)

    # Binding-frequency thresholds
    fg_ok = Primer.select().where(
        (Primer.seq << requested) & (Primer.fg_freq >= min_fg_bind))
    swga.message(
        "{} primers bind foreground genome with freq >= {} sites"
        .format(fg_ok.count(), min_fg_bind))

    bg_ok = Primer.select().where(
        (Primer.seq << requested) & (Primer.bg_freq <= max_bg_bind))
    swga.message(
        "{} primers bind background genome with freq <= {} sites"
        .format(bg_ok.count(), max_bg_bind))

    both_ok = Primer.select().where(
        (Primer.seq << requested) &
        (Primer.seq << fg_ok) &
        (Primer.seq << bg_ok))
    swga.message(
        "{} primers pass both fg and bg binding freq filters"
        .format(both_ok.count()))

    # Melting temperatures have to exist before they can be filtered on
    swga.primers.update_Tms(both_ok)

    tm_ok = Primer.select().where(
        (Primer.seq << both_ok) &
        (Primer.tm <= max_tm) &
        (Primer.tm >= min_tm))
    swga.message(
        "{} of those primers have a melting temp within given range"
        .format(tm_ok.count()))

    # Keep the `max_primers` primers with the lowest background binding
    # rate, then present them from highest to lowest ratio.
    lowest_bg = (
        Primer.select()
        .where(Primer.seq << tm_ok)
        .order_by(Primer.bg_freq)
        .limit(max_primers))
    by_ratio = (
        Primer.select()
        .where(Primer.seq << lowest_bg)
        .order_by(Primer.ratio.desc()))
    return by_ratio
def update_locations(primers, fg_genome_fp):
    '''
    Fills in location data for the given primers that lack it.

    Only primers whose `_locations` is NULL in the database are touched:
    `_update_locations(fg_genome_fp)` is called on each, and the results are
    saved with `swga.database.update_in_chunks`.
    '''
    # peewee operators: `<<` is SQL IN, `>> None` is SQL IS NULL
    lacks_locations = (Primer.seq << primers) & (Primer._locations >> None)
    pending = list(Primer.select().where(lacks_locations))

    for record in pending:
        record._update_locations(fg_genome_fp)

    swga.database.update_in_chunks(pending, label="Updating primer db... ")
def test_linearize_binding_sites(kmer, initdb, fastafile):
    '''
    The linearized binding sites must contain both ends of every chromosome
    and every binding site of the primer on that chromosome.
    '''
    primer = Primer.create(seq=kmer)
    primer._update_locations(fastafile)

    chr_ends = swga.locate.chromosome_ends(fastafile)
    sites = swga.locate.linearize_binding_sites([primer], chr_ends)

    # (number of sites + (2*number of chromosomes) - (any overlaps))
    assert len(sites) == 10

    for record, (start, end) in chr_ends.iteritems():
        assert start in sites
        assert end in sites
        for bound_at in primer.locations()[record]:
            assert bound_at in sites
def read_primer_list(lines, fg_genome_fp, bg_genome_fp):
    '''
    Reads in a list of primers, one per line, and returns the corresponding
    records from the primer database.

    Only the first whitespace-delimited field of each line is used as the
    primer sequence; blank lines are ignored. Sequences that have no record
    in the database are NOT created here: each one is reported with
    `swga.message` and left out of the result.

    Arguments:
    - lines: an iterable of text lines (e.g. an open file)
    - fg_genome_fp, bg_genome_fp: accepted for interface compatibility; not
      used by this function

    Returns a list of Primer records, in whatever order the database query
    yields them.
    '''
    seqs = []
    for line in lines:
        # split() with no argument drops leading/trailing whitespace
        # (including "\r\n" line endings) and never yields empty fields, so
        # blank lines cannot turn into a bogus empty sequence.
        fields = line.split()
        if fields:
            seqs.append(fields[0])

    if not seqs:
        return []

    primers = list(Primer.select().where(Primer.seq << seqs).execute())

    if len(primers) < len(seqs):
        # Set membership keeps this linear in the number of sequences.
        found = set(p.seq for p in primers)
        for seq in seqs:
            if seq not in found:
                swga.message(seq + " not in the database; skipping. Add it "
                             "manually with `swga count --input <file>` ")
    return primers
def primers(self, initdb):
    '''
    Creates six primers with ids 0-5, assigned in the order listed, and
    returns them as a list.
    '''
    seqs = [
        # reference primer
        "ATGCTC",
        # rev. complement has 4 bases overlapping
        "CAGCAT",
        # rev. complement has 3 bases overlapping
        "GAGGTA",
        "ATCGAG",
        # rev. complement has one base overlapping
        "TTCCAC",
        # substring of reference primer
        "ATGC",
    ]
    return [Primer.create(_id=i, seq=seq) for i, seq in enumerate(seqs)]
def main(argv, cfg_file):
    '''
    Runs the `filter` command: chooses the candidate primers, clears all
    existing active marks, optionally filters the candidates, makes sure
    they have location data, and marks them as active.

    Arguments:
    - argv: command-line arguments, handed to `Command.parse_args`
    - cfg_file: configuration file, handed to `Command`
    '''
    cmd = Command('filter', cfg_file=cfg_file)
    cmd.parse_args(argv)
    swga.database.init_db(cmd.primer_db)

    if not cmd.input:
        # No input file: start from every primer in the database, and never
        # skip the filters in that case.
        cmd.skip_filtering = False
        primers = Primer.select()
    else:
        with open(cmd.input, 'rb') as handle:
            primers = swga.primers.read_primer_list(
                handle,
                cmd.fg_genome_fp,
                cmd.bg_genome_fp)

    # Undo all active marks, if any
    deactivate_all_primers()

    if not cmd.skip_filtering:
        primers = filter_primers(
            primers,
            cmd.min_fg_bind,
            cmd.max_bg_bind,
            cmd.fg_length,
            cmd.bg_length,
            cmd.min_tm,
            cmd.max_tm,
            cmd.max_primers)

    swga.primers.update_locations(primers, cmd.fg_genome_fp)

    activated = activate_primers(primers)
    if activated < cmd.max_primers:
        swga.warn(
            "Fewer than {} primers were selected ({} passed all the filters). "
            "You may want to try less restrictive filtering parameters."
            .format(cmd.max_primers, activated))
def summary(primer_db, fg_length, bg_length):
    '''
    Prints a report about the primer database.

    The report covers: the number of primers (total and active), the average
    foreground/background binding counts and those averages divided by the
    supplied genome lengths, the melting temperature range of the active
    primers and, if any sets exist, the first set in ascending score order.

    Arguments:
    - primer_db: path to the primer database; passed to
      `swga.database.init_db` and shown as an absolute path in the report
    - fg_length: divisor applied to the average foreground binding count
    - bg_length: divisor applied to the average background binding count

    The message templates are filled in with `.format(**locals())`, so the
    names of the local variables in this function are part of the templates'
    contract: renaming a local means renaming its placeholder too.
    '''
    db = swga.database.init_db(primer_db)
    db.connect()
    swga.database.create_tables(drop=False)

    # One aggregate query: average fg/bg binding counts and primer count.
    avg_fg_bind, avg_bg_bind, nprimers = (
        Primer
        .select(fn.Avg(Primer.fg_freq),
                fn.Avg(Primer.bg_freq),
                fn.Count(Primer.seq))
        .scalar(as_tuple=True))

    if (avg_fg_bind is None) or (avg_bg_bind is None):
        # NOTE(review): this raises whatever swga.error() returns; confirm
        # that swga.error returns an exception here (or raises by itself).
        raise swga.error(
            "Could not calculate summary statistics; database may be corrupt")

    fg_bind_ratio = avg_fg_bind / float(fg_length)
    bg_bind_ratio = avg_bg_bind / float(bg_length)
    nactive = Primer.select().where(Primer.active==True).count()

    # Melting temperature statistics over the active primers only.
    min_tm, max_tm, avg_tm = (
        Primer
        .select(fn.Min(Primer.tm),
                fn.Max(Primer.tm),
                fn.Avg(Primer.tm))
        .where(Primer.active==True)
        .scalar(as_tuple=True))

    nsets = Set.select(fn.Count(Set._id)).scalar()

    # The bs_* names below are only bound when at least one set exists; the
    # only template that refers to them is likewise only chosen when
    # nsets > 0.
    if nsets > 0:
        # "Best" set = first row when ordered by score (ascending).
        bs = Set.select().order_by(Set.score).limit(1).get()
        bs_primers = ", ".join(swga.database.get_primers_for_set(bs._id)).strip()
        best_set = bs._id
        bs_size = bs.set_size
        bs_score = bs.score
        # Every entry of the instance's `_data` dict except the id, pids and
        # score, formatted with fmtkv.
        # NOTE(review): `_data` is presumably the ORM's per-instance field
        # storage — confirm against the peewee version in use.
        bs_stats = "- "+"\n - ".join(
            fmtkv(k, v)
            for k, v in bs.__dict__['_data'].items()
            if k not in ["_id", "pids", "score"]
        )

    version_header = (
        "---------------------\n"
        "==== SWGA v{version} ====\n"
        "---------------------\n"
        .format(version=swga.__version__)
    )

    # Main report template; placeholders are resolved from locals() below.
    summary_msg = """ {version_header} PRIMER SUMMARY --------------- There are {nprimers} primers in the database. {nactive} are marked as active (i.e., they passed filter steps and will be used to find sets of compatible primers.) {ifzero_primers_msg} The average number of foreground genome binding sites is {avg_fg_bind:.0f}. (avg binding / genome_length = {fg_bind_ratio:05f}) The average number of background genome binding sites is {avg_bg_bind:.0f}. (avg binding / genome_length = {bg_bind_ratio:05f}) {melting_tmp_msg} SETS SUMMARY --------------- There are {nsets} sets in the database. {set_msg}--------------- Report generated from {primer_db} """

    # The conditions below compare nactive/nsets as the values returned by
    # the queries above, so they must run before those names are rebound to
    # the results of colored.blue() further down.
    ifzero_primers_msg = colored.green(
        "Run `swga filter` to identify primers to use."
        if nactive == 0 else "")

    # A min_tm or max_tm of None (or 0) falls through to the "not
    # calculated" message.
    melting_tmp_msg = (
        """The melting temp of the primers ranges between {min_tm:.2f}C and {max_tm:.2f}C with an average of {avg_tm:.2f}C."""
        if nactive > 0 and min_tm and max_tm
        else "No melting temps have been calculated yet.")

    ifzero_sets_msg = colored.green(
        "Run `swga find_sets` after identifying valid primers to begin collecting sets.\n")

    set_msg = (""" The best scoring set is #{best_set}, with {bs_size} primers and a score of {bs_score:03f}.\nVarious statistics: {bs_stats} The primers in Set {best_set} are: {bs_primers} """
               if nsets > 0 else ifzero_sets_msg)

    # From here on the values are prepared for display only.
    primer_db = os.path.abspath(primer_db)
    nprimers = colored.blue(nprimers, bold=True)
    nactive = colored.blue(nactive, bold=True)
    nsets = colored.blue(nsets, bold=True)

    # The inner templates are formatted first because summary_msg embeds
    # their formatted results via {set_msg} and {melting_tmp_msg}.
    set_msg = set_msg.format(**locals())
    melting_tmp_msg = melting_tmp_msg.format(**locals())
    version_header = colored.green(version_header)
    summary_msg = summary_msg.format(**locals())

    with indent(2):
        puts(max_width(summary_msg, 80))
def test_add_primers(self, initdb):
    '''
    With `add_revcomp=True`, adding "AAAA" must also create exactly one
    record for its reverse complement, "TTTT".
    '''
    database.add_primers([{'seq': "AAAA"}], add_revcomp=True)

    revcomp_rows = Primer.select().where(Primer.seq == "TTTT")
    assert revcomp_rows.count() == 1
def tprimers(seqs):
    '''
    Creates one primer per sequence in `seqs` and returns the created
    primers as a list, in the same order.
    '''
    created = []
    for seq in seqs:
        created.append(Primer.create(seq=seq))
    return created
def validate_order_field(field, model):
    '''
    Ensures the given field exists in the model.

    Nothing is checked when `field` is empty or None. Otherwise `field` must
    be one of `model.fields()`; if it is not, `swga.error` is called with a
    message listing the valid field names of that same model.

    Arguments:
    - field: name of the field to order by (may be None or empty)
    - model: object whose `fields()` gives the valid field names
    '''
    if not field:
        return

    valid_fields = list(model.fields())
    if field not in valid_fields:
        # List the fields of the model being validated, which is not
        # necessarily Primer.
        swga.error(
            "Cannot order by '{}'. Valid choices are {}"
            .format(field, ", ".join(valid_fields)))
def activate(primers):
    '''
    Marks the given primers as active.

    Sets `active=True` on every primer whose sequence is in `primers` and
    returns the result of executing that update query.
    '''
    mark_active = Primer.update(active=True).where(Primer.seq << primers)
    return mark_active.execute()
def deactivate_all_primers():
    """Clears the active mark on every primer in the database."""
    # No where() clause: the update applies to all rows.
    clear_marks = Primer.update(active=False)
    clear_marks.execute()