def get_matches(lastz_file):
    """Collect lastz hits per UCE locus.

    Reads `lastz_file` (long format) and returns a two-tuple:
    - matches: dict mapping uce_name -> list of
      [contig_name_lowercased, strand, match_start, match_end]
    - probes: dict mapping uce_name -> highest probe number seen
      (used downstream to estimate expected locus span).

    NOTE(review): an identical duplicate definition of this function
    appears later in this file; the later one wins at import time.
    """
    matches = defaultdict(list)
    probes = defaultdict(int)
    for hit in lastz.Reader(lastz_file, long_format=True):
        locus = get_uce_name(hit.name2)
        num = get_uce_num(hit.name2)
        # track the largest probe index observed for this locus
        if num > probes[locus]:
            probes[locus] = num
        record = [get_name(hit.name1).lower(), hit.strand2, hit.zstart1, hit.end1]
        matches[locus].append(record)
    return matches, probes
def get_bgi_matches(lastz_file, stripnum):
    """Collect lastz hits per UCE locus for BGI-style probe names.

    Like get_matches(), but derives the locus name by substituting the
    `stripnum` pattern in name2 with 's' (lowercased), and takes the
    probe number from the final underscore-separated field of name2.

    Returns (matches, probes) with the same layout as get_matches().

    NOTE(review): an identical duplicate definition of this function
    appears later in this file; the later one wins at import time.
    """
    matches = defaultdict(list)
    probes = defaultdict(int)
    for hit in lastz.Reader(lastz_file, long_format=True):
        locus = re.sub(stripnum, 's', hit.name2).lower()
        num = int(hit.name2.split('_')[-1])
        # remember the highest probe index per locus
        if num > probes[locus]:
            probes[locus] = num
        matches[locus].append(
            [get_name(hit.name1).lower(), hit.strand2, hit.zstart1, hit.end1]
        )
    return matches, probes
def main(): args = get_args() uces = set([ get_name(read.identifier, "|", 1) for read in fasta.FastaReader(args.query) ]) files = glob.glob(os.path.join(args.lastz, '*.lastz')) # this prob. needs to be more robust organisms = [ os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace( '-', "_") for f in files ] conn, c = create_match_database(args.db, organisms, uces) if args.dupefile: dupes = get_dupes(args.dupefile) else: dupes = None #pdb.set_trace() for f in files: critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0] matches, probes = get_matches(f, args.splitchar, args.components) count = 0 for k, v in matches.iteritems(): skip = False if len(v) > 1: if run_checks(k, v, probes, args.verbose): # sort by match position v_sort = sorted(v, key=itemgetter(2)) start, end = v_sort[0][2], v_sort[-1][3] diff = end - start # ensure our range is less than N(probes) * probe_length - this # still gives us a little wiggle room because probes are ~ 2X tiled if diff > (probes[k] * 120): skip = True if args.verbose: print "range longer than expected" else: skip = True elif args.dupefile and k in dupes: skip = True if args.verbose: print "{0} is in dupefile".format(k) else: pass if not skip: store_lastz_results_in_db(c, critter, k) count += 1 print "Entered {} matches for {}".format(count, critter) conn.commit() c.close() conn.close()
def get_matches(lastz_file):
    """Parse a long-format lastz file into per-UCE match lists.

    Returns:
        matches: uce_name -> list of [contig (lowercase), strand,
                 match start, match end] for every hit on that locus.
        probes:  uce_name -> maximum probe number seen for that locus.

    NOTE(review): this duplicates an earlier definition in this file;
    being later, this one is the definition in effect at runtime.
    """
    matches = defaultdict(list)
    probes = defaultdict(int)
    reader = lastz.Reader(lastz_file, long_format=True)
    for lz in reader:
        uce = get_uce_name(lz.name2)
        number = get_uce_num(lz.name2)
        # keep the running maximum probe index for this locus
        probes[uce] = max(probes[uce], number)
        matches[uce].append([
            get_name(lz.name1).lower(),
            lz.strand2,
            lz.zstart1,
            lz.end1,
        ])
    return matches, probes
def get_bgi_matches(lastz_file, stripnum):
    """Parse a long-format lastz file of BGI-named probes into matches.

    The locus name is name2 with the `stripnum` pattern replaced by 's'
    (lowercased); the probe number is the trailing '_'-delimited field.

    Returns (matches, probes) in the same layout as get_matches().

    NOTE(review): this duplicates an earlier definition in this file;
    being later, this one is the definition in effect at runtime.
    """
    matches = defaultdict(list)
    probes = defaultdict(int)
    reader = lastz.Reader(lastz_file, long_format=True)
    for lz in reader:
        uce = re.sub(stripnum, 's', lz.name2).lower()
        number = int(lz.name2.split('_')[-1])
        # keep the running maximum probe index for this locus
        probes[uce] = max(probes[uce], number)
        matches[uce].append([
            get_name(lz.name1).lower(),
            lz.strand2,
            lz.zstart1,
            lz.end1,
        ])
    return matches, probes
def main(): args = get_args() uces = set([get_name(read.identifier, "|", 1) for read in fasta.FastaReader(args.query)]) files = glob.glob(os.path.join(args.lastz, '*.lastz')) # this prob. needs to be more robust organisms = [os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace('-',"_") for f in files] conn, c = create_match_database(args.db, organisms, uces) if args.dupefile: dupes = get_dupes(args.dupefile) else: dupes = None #pdb.set_trace() for f in files: critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0] matches, probes = get_matches(f, args.splitchar, args.components) count = 0 for k,v in matches.iteritems(): skip = False if len(v) > 1: if run_checks(k, v, probes, args.verbose): # sort by match position v_sort = sorted(v, key = itemgetter(2)) start, end = v_sort[0][2], v_sort[-1][3] diff = end - start # ensure our range is less than N(probes) * probe_length - this # still gives us a little wiggle room because probes are ~ 2X tiled if diff > (probes[k] * 120): skip = True if args.verbose: print "range longer than expected" else: skip = True elif args.dupefile and k in dupes: skip = True if args.verbose:print "{0} is in dupefile".format(k) else: pass if not skip: store_lastz_results_in_db(c, critter, k) count += 1 print "Entered {} matches for {}".format(count, critter) conn.commit() c.close() conn.close()
def main(): args = get_args() config = ConfigParser.RawConfigParser(allow_no_value=True) config.read(args.config) conn = sqlite3.connect(args.db) c = conn.cursor() if args.extend_db: query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db) c.execute(query) organisms = get_names_from_config(config, "Organisms") uces = get_names_from_config(config, "Loci") # pdb.set_trace() uce_fasta_out = fasta.FastaWriter(args.output) regex = re.compile("[N,n]{1,21}") for organism in organisms: print "Getting {0} reads...".format(organism) written = [] # going to need to do something more generic w/ suffixes # pdb.set_trace() name = organism.replace("_", "-") if args.notstrict: if not organism.endswith("*"): reads = find_file(args.contigs, name) node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True) elif args.extend_dir: # remove the asterisk name = name.rstrip("*") reads = find_file(args.extend_dir, name) node_dict, missing = get_nodes_for_uces(c, organism.rstrip("*"), uces, extend=True, notstrict=True) else: if not name.endswith("*"): reads = find_file(args.contigs, name) node_dict, missing = get_nodes_for_uces(c, organism, uces) elif name.endswith("*") and args.extend_dir: # remove the asterisk name = name.rstrip("*") reads = find_file(args.extend_dir, name) node_dict, missing = get_nodes_for_uces(c, organism.rstrip("*"), uces, extend=True) for read in fasta.FastaReader(reads): name = get_name(read.identifier).lower() coverage = get_coverage(read.identifier) if name in node_dict.keys(): uce_seq = fasta.FastaSequence() uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(node_dict[name][0], organism.rstrip("*"), coverage) # deal with strandedness because aligners dont, which # is annoying if node_dict[name][1] == "-": uce_seq.sequence = transform.DNA_reverse_complement(read.sequence) else: uce_seq.sequence = read.sequence # replace any occurrences of <21 Ns in a given sequence with # blanks. These should gap out during alignment. 
if regex.search(uce_seq.sequence): uce_seq.sequence = re.sub(regex, "", uce_seq.sequence) print "\tReplaced < 20 ambiguous bases in {0}".format(uce_seq.identifier.split(" ")[0]) # Replace and leading/trailing lowercase bases from velvet # assemblies. Lowercase bases indicate low coverage, and these # have been problematic in downstream alignments). uce_seq.sequence = re.sub("^[acgtn]+", "", uce_seq.sequence) uce_seq.sequence = re.sub("[acgtn]+$", "", uce_seq.sequence) uce_fasta_out.write(uce_seq) written.append(str(node_dict[name][0])) else: pass # pdb.set_trace() if args.notstrict and missing: args.notstrict.write("[{0}]\n".format(organism)) for name in missing: args.notstrict.write("{0}\n".format(name)) written.append(name) assert set(written) == set(uces), "UCE names do not match" # assert set(written) == set(uces), pdb.set_trace() uce_fasta_out.close()
def main(): args = get_args() config = ConfigParser.RawConfigParser(allow_no_value=True) config.read(args.config) conn = sqlite3.connect(args.db) c = conn.cursor() if args.extend_db: query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db) c.execute(query) organisms = get_names_from_config(config, 'Organisms') uces = get_names_from_config(config, 'Loci') #pdb.set_trace() uce_fasta_out = fasta.FastaWriter(args.output) regex = re.compile("[N,n]{1,21}") for organism in organisms: print "Getting {0} reads...".format(organism) written = [] # going to need to do something more generic w/ suffixes #pdb.set_trace() name = organism.replace('_', '-') if args.notstrict: if not organism.endswith('*'): reads = find_file(args.contigs, name) node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True) elif args.extend_dir: # remove the asterisk name = name.rstrip('*') reads = find_file(args.extend_dir, name) node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True, notstrict=True) else: if not name.endswith('*'): reads = find_file(args.contigs, name) node_dict, missing = get_nodes_for_uces(c, organism, uces) elif name.endswith('*') and args.extend_dir: # remove the asterisk name = name.rstrip('*') reads = find_file(args.extend_dir, name) node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True) for read in fasta.FastaReader(reads): name = get_name(read.identifier).lower() coverage = get_coverage(read.identifier) if name in node_dict.keys(): uce_seq = fasta.FastaSequence() uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(node_dict[name][0], organism, coverage) # deal with strandedness because aligners dont, which # is annoying if node_dict[name][1] == '-': uce_seq.sequence = transform.DNA_reverse_complement(read.sequence) else: uce_seq.sequence = read.sequence # replace any occurrences of <21 Ns if regex.search(uce_seq.sequence): uce_seq.sequence = re.sub(regex, "", uce_seq.sequence) print 
"\tReplaced < 20 ambiguous bases in {0}".format(uce_seq.identifier.split(' ')[0]) uce_fasta_out.write(uce_seq) written.append(str(node_dict[name][0])) else: pass #pdb.set_trace() if args.notstrict and missing: args.notstrict.write("[{0}]\n".format(organism)) for name in missing: args.notstrict.write("{0}\n".format(name)) written.append(name) assert set(written) == set(uces), "UCE names do not match" #assert set(written) == set(uces), pdb.set_trace() uce_fasta_out.close()