def main():
    args = get_args()
    names = {}
    temp1 = "{}.temp1".format(args.infile)
    temp2 = "{}.temp2".format(args.infile)
    outf = fasta.FastaWriter(temp1)
    mask_file = os.path.splitext(args.infile)[0] + ".fa.out"
    f = fasta.FastaReader(args.infile)
    # rename sequences to their GenBank accessions, keeping a map back to
    # the original names
    for seq in f:
        print seq.identifier
        gb = seq.identifier.split('|')[3]
        newname = seq.identifier.split(',')[0].split(' ')[-1]
        names[gb] = newname
        seq.identifier = ">{}".format(gb)
        outf.write(seq)
    outf.close()
    # soft-mask the renamed fasta against the RepeatMasker output
    cmd = ["maskOutFa", "-softAdd", temp1, mask_file, temp2]
    subprocess.Popen(cmd).wait()
    final = "{}.masked".format(args.infile)
    outf = fasta.FastaWriter(final)
    # restore the original names on the masked sequences
    for seq in fasta.FastaReader(temp2):
        iden = seq.identifier.strip('>')
        seq.identifier = "{}".format(names[iden])
        print seq.identifier
        outf.write(seq)
    outf.close()

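# A minimal sketch of the header format the renaming above assumes: an
# old-style NCBI GenBank identifier, so field 3 of a pipe-split is the
# accession. The header below is hypothetical and for illustration only.
def _demo_header_parse():
    identifier = ">gi|158333234|gb|AC012345.1| Gallus gallus clone CH261-75A4, complete sequence"
    gb = identifier.split('|')[3]                       # -> "AC012345.1"
    newname = identifier.split(',')[0].split(' ')[-1]   # -> "CH261-75A4"
    return gb, newname
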
def setUp(self):
    # switch to this directory, so we have access to the test data; if the
    # chdir fails, fall back to the relative path as-is
    try:
        os.chdir(os.path.dirname(os.path.abspath(__file__)))
    except OSError:
        pass
    seq = 'test-data/sequence.fasta'
    self.fasta = fasta.FastaReader(seq)

def main():
    args = get_args()
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    organisms = get_names_from_config(config, args.group)
    excludes = get_names_from_config(config, 'Excludes')
    if excludes:
        organisms = [org for org in organisms if org not in excludes]
    args.output.write("org\tcontigs\tavg len\n")
    for org in organisms:
        # skip extended data, which are typically from genome-enabled orgs,
        # not capture data
        if not org.endswith('*'):
            # get the uce-matching node names from the db
            matching_nodes = get_matching_node_names(c, org)
            # parse the contig file for the organism, and return contig
            # lengths
            f = os.path.join(args.fasta, "{0}.{1}".format(
                    org.replace('_', '-'), 'contigs.fasta'))
            records = fasta.FastaReader(f)
            contig_lens = [len(seq) for seq in records
                    if '_'.join(seq.identifier.strip('>').split('_')[0:2])
                    in matching_nodes]
            # write the average contig length of contigs matching UCEs
            args.output.write("{0}\t{1}\t{2}\n".format(org, len(contig_lens),
                    float(sum(contig_lens)) / len(contig_lens)))

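# get_names_from_config is assumed to return the option names of a config
# section; given allow_no_value=True above, a config file for this run might
# look like the following (section and organism names are hypothetical):
#
# [Group1]
# gallus_gallus
# taeniopygia_guttata
#
# [Excludes]
# taeniopygia_guttata
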
def get_fasta_dict(args):
    print 'Building the locus dictionary...'
    if args.ambiguous:
        print 'NOT removing sequences with ambiguous bases...'
    else:
        print 'Removing ALL sequences with ambiguous bases...'
    loci = defaultdict(list)
    for record in fasta.FastaReader(args.infile):
        if not args.faircloth:
            locus = record.identifier.split('|')[1]
        else:
            locus = '_'.join([record.identifier.split('|')[0],
                    record.identifier.split('|')[1].split('_')[0]])
        loci = build_locus_dict(loci, locus, record, args.ambiguous)
    # work on a copy so we can iterate and delete
    snapshot = copy.deepcopy(loci)
    # iterate over loci to check for all species at a locus
    for locus, data in snapshot.iteritems():
        if args.notstrict:
            if len(data) < 3:
                t = "\tDropping locus {0} because it has fewer than the " + \
                    "minimum number of taxa for alignment (N < 3)"
                print t.format(locus)
                del loci[locus]
        else:
            if len(data) < args.species:
                del loci[locus]
                t = "\tDropping locus {0} because it has fewer than the " + \
                    "minimum number of taxa for alignment (N < {1})"
                print t.format(locus, args.species)
    return loci

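# The two parsing branches above assume pipe-delimited identifiers; the
# strings below are hypothetical and illustrate only the slicing:
#
#   default:     "probes|uce-101|extra".split('|')[1]              -> "uce-101"
#   --faircloth: for "chrZ|100_p1|...", '_'.join(["chrZ", "100"])  -> "chrZ_100"
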
def main():
    args = get_args()
    avg_read_length = get_average_read_length(args.input)
    kmer = int(raw_input("What was the kmer length? "))
    avg_c = []
    for read in fasta.FastaReader(args.input):
        # velvet-style headers end in the kmer coverage (Ck); convert that
        # to approximate base coverage
        s_read = read.identifier.split('_')
        ck = float(s_read[-1])
        c = ck * avg_read_length / (avg_read_length - kmer + 1)
        avg_c.append(c)
    avg_c = numpy.array(avg_c)
    if not args.csv:
        print "mean:\t", numpy.mean(avg_c)
        print "95ci:\t", 1.96 * (numpy.std(avg_c, ddof=1) / math.sqrt(len(avg_c)))
        print "min:\t", min(avg_c)
        print "max:\t", max(avg_c)
        print "median:\t", numpy.median(avg_c)
        print "<10x:\t", sum(avg_c < 10)
        print "<25x:\t", sum(avg_c < 25)
        print "<50x:\t", sum(avg_c < 50)
        print "<100x:\t", sum(avg_c < 100)
    else:
        print "{0},{1},{2},{3},{4},{5},{6},{7},{8}".format(
            numpy.mean(avg_c),
            1.96 * (numpy.std(avg_c, ddof=1) / math.sqrt(len(avg_c))),
            min(avg_c),
            max(avg_c),
            numpy.median(avg_c),
            sum(avg_c < 10),
            sum(avg_c < 25),
            sum(avg_c < 50),
            sum(avg_c < 100)
        )

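# Worked example of the kmer -> base coverage conversion above (values are
# hypothetical): with 100 bp reads, k = 31, and kmer coverage Ck = 20,
#
#   C = Ck * L / (L - k + 1) = 20 * 100 / (100 - 31 + 1) = 2000 / 70 ~ 28.6x
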
def main():
    args = get_args()
    uce_loci = []
    # get lengths of loci
    seq_lengths = {}
    for seq in fasta.FastaReader(args.fasta):
        name = seq.identifier.split('|')[1]
        uce_loci.append(name)
        seq_lengths[name] = len(seq.sequence)
    overlappers = defaultdict(dict)
    names = defaultdict(list)
    coords = {}
    for match in lastz.Reader(args.lastz, long_format=True):
        locus = match.name2.split('|')[1]
        chromo = match.name1
        coords[locus] = (match.zstart1, match.end1)
        for pmatch, span in overlappers[chromo].iteritems():
            overlap = span.find(match.zstart1, match.end1)
            if overlap:
                overlappers[chromo][pmatch].insert(match.zstart1,
                        match.end1, locus)
                names[pmatch].append(locus)
                break
        else:
            overlappers = add_new_locus(match, overlappers, chromo)
    overlapping_loci = []
    all_groups = []
    for k, v in names.iteritems():
        # group loci into overlapping clusters
        base = [k]
        base.extend(v)
        all_groups.append(base)
        # get list of "bad loci" so we can determine non-overlappers
        overlapping_loci.append(k)
        overlapping_loci.extend(v)
    non_overlapping_loci = set(uce_loci).difference(set(overlapping_loci))
    # generate output in config-file format:
    config = ConfigParser.RawConfigParser()
    config.add_section('Non-overlapping loci')
    for locus in list(non_overlapping_loci):
        config.set('Non-overlapping loci', locus, seq_lengths[locus])
    longest_of_overlapping = get_longest_of_overlapping_loci(
            all_groups, seq_lengths)
    config.add_section('Longest loci of group')
    for locus in longest_of_overlapping:
        config.set('Longest loci of group', locus, seq_lengths[locus])
    config.add_section('Superlocus groups')
    for c, group in enumerate(all_groups):
        # order loci by start position
        starts = [(name, coords[name][0], coords[name][1]) for name in group]
        starts = sorted(starts, key=itemgetter(1))
        sorted_names = [n[0] for n in starts]
        config.set('Superlocus groups', "Group{0}".format(c),
                ','.join(sorted_names))
    config.write(args.output)

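# The span objects above are assumed to behave like interval trees with
# find()/insert(); the overlap test they implement is equivalent to the
# usual half-open interval check (a sketch, not the tree implementation):
def _intervals_overlap(a_start, a_end, b_start, b_end):
    # lastz zstart/end coordinates are 0-based, half-open
    return a_start < b_end and b_start < a_end
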
def get_fasta_dict(args):
    if args.verbose:
        sys.stdout.write('Building the locus dictionary...\n')
    if args.ambiguous:
        sys.stdout.write('NOT removing sequences with ambiguous bases...\n')
    else:
        sys.stdout.write('Removing ALL sequences with ambiguous bases...\n')
    sys.stdout.flush()
    loci = defaultdict(list)
    if os.path.isfile(args.infile):
        for record in fasta.FastaReader(args.infile):
            if not args.faircloth:
                locus = record.identifier.split('|')[1]
            else:
                locus = '_'.join([record.identifier.split('|')[0],
                        record.identifier.split('|')[1].split('_')[0]])
            loci = build_locus_dict(loci, locus, record, args.ambiguous)
    # work with a directory of fastas if we have those - get locus name from
    # filename
    elif os.path.isdir(args.infile):
        for ff in glob.glob(os.path.join(args.infile, '*.fa*')):
            locus = os.path.splitext(os.path.basename(ff))[0]
            for record in fasta.FastaReader(ff):
                loci = build_locus_dict(loci, locus, record, args.ambiguous)
    # work on a copy so we can iterate and delete
    snapshot = copy.deepcopy(loci)
    # iterate over loci to check for all species at a locus
    for locus, data in snapshot.iteritems():
        if args.notstrict:
            if len(data) < 3:
                t = "\tDropping locus {0} because it has fewer than the " + \
                    "minimum number of taxa for alignment (N < 3)\n"
                sys.stdout.write(t.format(locus))
                sys.stdout.flush()
                del loci[locus]
        else:
            if len(data) < args.species:
                del loci[locus]
                t = "\tDropping locus {0} because it has fewer than the " + \
                    "minimum number of taxa for alignment (N < {1})\n"
                sys.stdout.write(t.format(locus, args.species))
                sys.stdout.flush()
    return loci

def main():
    args = get_args()
    records = fasta.FastaReader(args.fasta)
    lengths = defaultdict(list)
    for sequence in records:
        # BEWARE: this may cause name clash, which will error out
        org = sequence.identifier.split(' ')[0].split('_')[-2]
        lengths[org].append(len(sequence))
    for org, l in lengths.iteritems():
        args.output.write("{0}\t{1}\n".format(org, float(sum(l)) / len(l)))

def main():
    args = get_args()
    uces = set([get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace('-', "_")
            for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k, v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose:
                    print "{0} is in dupefile".format(k)
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()

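# Worked example of the span check above (values hypothetical): a locus hit
# by 3 probes gets a cutoff of 3 * 120 = 360 bp. Because probes are ~2X
# tiled, three 120 bp probes actually span only ~240 bp, so the cutoff
# leaves wiggle room; matches spread wider than that suggest hits to
# multiple places, and the locus is skipped.
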
def add_additional_columns(args, conn, c):
    assert args.name is not None, "You need to include --add-name to add a table"
    query = """ALTER TABLE probeset ADD COLUMN {0} int DEFAULT 0""".format(args.name)
    c.execute(query)
    for seq in fasta.FastaReader(args.add):
        locus = seq.identifier.lstrip('>').split('|')[0]
        query = """SELECT id, locus, probe, source, sequence, oldprobe
            FROM probes WHERE oldprobe LIKE '%{0}%'""".format(locus)
        c.execute(query)
        rows = c.fetchall()
        hit = False
        for row in rows:
            idx, locus, probe, source, sequence, oldlocus = row
            if seq.sequence in sequence:
                hit = True
                query = """UPDATE probeset SET {0} = 1 WHERE id = {1}""".format(
                        args.name, idx)
                c.execute(query)
        if not hit:
            print "Miss: {0}".format(seq.identifier)

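# Note: the queries above interpolate values directly into the SQL string;
# sqlite3 supports parameter substitution for values (though not for column
# names), so a somewhat safer form of the UPDATE would be, e.g.:
#
#   c.execute("UPDATE probeset SET {0} = 1 WHERE id = ?".format(args.name), (idx,))
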
def main():
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db)
        c.execute(query)
    organisms = get_names_from_config(config, 'Organisms')
    uces = get_names_from_config(config, 'Loci')
    uce_fasta_out = fasta.FastaWriter(args.output)
    # matches runs of ambiguous bases ([Nn], not [N,n], which would also
    # match literal commas)
    regex = re.compile("[Nn]{1,21}")
    for organism in organisms:
        print "Getting {0} reads...".format(organism)
        written = []
        # going to need to do something more generic w/ suffixes
        name = organism.replace('_', '-')
        if args.notstrict:
            if not organism.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces,
                        extend=False, notstrict=True)
            elif args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c,
                        organism.rstrip('*'), uces, extend=True,
                        notstrict=True)
        else:
            if not name.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces)
            elif name.endswith('*') and args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c,
                        organism.rstrip('*'), uces, extend=True)
        for read in fasta.FastaReader(reads):
            name = get_name(read.identifier).lower()
            coverage = get_coverage(read.identifier)
            if name in node_dict.keys():
                uce_seq = fasta.FastaSequence()
                uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(
                        node_dict[name][0], organism, coverage)
                # deal with strandedness because aligners dont, which
                # is annoying
                if node_dict[name][1] == '-':
                    uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
                else:
                    uce_seq.sequence = read.sequence
                # remove runs of ambiguous bases
                if regex.search(uce_seq.sequence):
                    uce_seq.sequence = re.sub(regex, "", uce_seq.sequence)
                    print "\tReplaced ambiguous bases in {0}".format(
                            uce_seq.identifier.split(' ')[0])
                uce_fasta_out.write(uce_seq)
                written.append(str(node_dict[name][0]))
        if args.notstrict and missing:
            args.notstrict.write("[{0}]\n".format(organism))
            for name in missing:
                args.notstrict.write("{0}\n".format(name))
                written.append(name)
        assert set(written) == set(uces), "UCE names do not match"
    uce_fasta_out.close()

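# A quick illustration of the N-stripping above (a sketch; note that re.sub
# removes every run the pattern matches, and a longer run is consumed in
# chunks of <= 21 bases, so runs of any length are ultimately removed):
#
#   re.sub(re.compile("[Nn]{1,21}"), "", "ACGTnnNNACGT")  # -> "ACGTACGT"
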
def main():
    uces = []
    # get all ids of probes in 2560 set
    for seq in fasta.FastaReader('../archive/probe-subset-2560-synthesized.fasta'):
        name_split = seq.identifier.split('_')
        if name_split[0] not in ['>chrE22C19W28', '>chrUn']:
            iden = '_'.join(name_split[:2]).strip('>')
        else:
            iden = '_'.join(name_split[:3]).strip('>')
        uces.append(iden)
    # get names, lengths, and GC content of loci in dbase
    conn = sqlite3.connect('/Users/bcf/Git/brant/seqcap/Non-repo/probe.sqlite')
    cur = conn.cursor()
    metadata = defaultdict(dict)
    for uce in uces:
        cur.execute("SELECT cons, cons_len FROM cons WHERE seq = ?", (uce,))
        data = cur.fetchall()
        # ensure we only get one record back
        assert len(data) == 1, "More than one record"
        read, length = data[0]
        gc = round((read.count('C') + read.count('G')) / float(len(read)), 3)
        cur.execute('''SELECT count(*) FROM sureselect WHERE seq = ?
            AND selected = 1''', (uce,))
        data = cur.fetchall()
        assert len(data) == 1, "More than one record"
        count = data[0][0]
        if count > 1:
            cur.execute('''SELECT avg(tm), avg(masked_bases), avg(added_bases)
                FROM sureselect WHERE seq = ? GROUP BY seq''', (uce,))
        else:
            cur.execute('''SELECT tm, masked_bases, added_bases
                FROM sureselect WHERE seq = ? GROUP BY seq''', (uce,))
        tm, masked, added = cur.fetchall()[0]
        metadata[uce] = {
            'gc': gc,
            'length': length,
            'count': count,
            'tm': tm,
            'masked': masked,
            'added': added
        }
    cur.close()
    conn.close()
    conn = sqlite3.connect('../archive/birds-probe-matches.sqlite')
    cur = conn.cursor()
    taxa = [
        'anser_erythropus',
        'gallus_gallus',
        'pitta_guajana',
        'dromaius_novaehollandiae',
        'megalaima_virens',
        'struthio_camelus',
        'eudromia_elegans',
        'phalacrocorax_carbo',
        'urocolius_indicus',
    ]
    query = "SELECT {} FROM matches WHERE uce = ?".format(', '.join(taxa))
    for uce in metadata.keys():
        cur.execute(query, (uce.lower(),))
        data = cur.fetchall()
        for k, v in enumerate(data[0]):
            metadata[uce][taxa[k]] = v
    outfile = open('gc-length-species-matches.csv', 'w')
    outfile.write('uce,gc,length,count,tm,masked,added,present,taxon\n')
    for uce in sorted(metadata.keys()):
        for taxon in taxa:
            outfile.write('{},{},{},{},{},{},{},{},{}\n'.format(
                uce,
                metadata[uce]['gc'],
                metadata[uce]['length'],
                metadata[uce]['count'],
                metadata[uce]['tm'],
                metadata[uce]['masked'],
                metadata[uce]['added'],
                metadata[uce][taxon],
                taxon.replace('_', ' ').capitalize()
            ))
    outfile.close()

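# Worked example of the GC fraction above: for read = "ACGCGT",
# (read.count('C') + read.count('G')) / float(len(read)) = (2 + 2) / 6.0,
# which rounds to 0.667.
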
def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # e.g. "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([get_name(read.identifier, "|", 1, regex=regex,
                repl=args.repl) for read in fasta.FastaReader(args.query)])
    else:
        uces = set([get_name(read.identifier, "|", 1)
                for read in fasta.FastaReader(args.query)])
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    # this variant runs against a single contig file rather than a directory
    # of fastas
    contig = args.contigs
    organisms = ["contigs"]
    conn, c = create_probe_database(uces)
    print "Processing:"
    critter = os.path.basename(contig).split('.')[0].replace('-', "_")
    contigs = contig_count(contig)
    # align the probes to the contigs
    alignment = lastz.Align(contig, args.query, args.coverage,
            args.identity, args.align)
    lzstdout, lztstderr = alignment.run()
    # parse the lastz results of the alignment
    matches, orientation, revmatches = \
        defaultdict(set), defaultdict(set), defaultdict(set)
    probe_dupes = set()
    if not lztstderr:
        for lz in lastz.Reader(args.align):
            # get strandedness of match
            contig_name = get_name(lz.name1)
            uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
            if args.dupefile and uce_name in dupes:
                probe_dupes.add(uce_name)
            else:
                matches[contig_name].add(uce_name)
                orientation[uce_name].add(lz.strand2)
                revmatches[uce_name].add(contig_name)
    else:
        print "Error in lastz:"
        print "STDerr:"
        print lztstderr
        print "STDout:"
        print lzstdout
    # we need to check nodes for dupe matches to the same probes
    contigs_matching_mult_uces = check_contigs_for_dupes(matches)
    uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
    nodes_to_drop = contigs_matching_mult_uces
    nodes_to_drop_one_of = uces_matching_mult_contigs
    # remove dupe and/or dubious nodes/contigs
    match_copy = copy.deepcopy(matches)
    already_observed = list()
    for k in match_copy.keys():
        if k in nodes_to_drop:
            del matches[k]
        elif k in nodes_to_drop_one_of:
            if matches[k] in already_observed:
                del matches[k]
            else:
                already_observed.append(matches[k])
    store_lastz_results_in_db(c, matches, orientation, critter)
    conn.commit()
    pretty_print_output(critter, matches, contigs, probe_dupes,
            contigs_matching_mult_uces, uces_matching_mult_contigs)
    # get all the UCE records from the db
    query = "SELECT uce, {0} FROM match_map WHERE {0} IS NOT NULL".format("contigs")
    c.execute(query)
    data = {row[1].split("(")[0]: row[0] for row in c.fetchall()}
    nodenames = set(data.keys())
    # make sure we don't lose any dupes
    assert len(data) == len(nodenames), "There were duplicate contigs."
    outp = open(args.output, 'w')
    print "Building UCE fasta:"
    for record in SeqIO.parse(open(contig), 'fasta'):
        name = '_'.join(record.id.split('_')[:2])
        if name.lower() in nodenames:
            record.id = "{0}|{1}".format(data[name.lower()], record.id)
            outp.write(record.format('fasta'))
    outp.close()

def main():
    args = get_args()
    # compile some regular expressions we'll use later
    stripnum = re.compile("s_[0-9]+$")
    manyn = re.compile("[Nn]{20,}")
    # get names of loci and taxa
    uces = get_uce_names_from_probes(args.probes)
    taxa = get_taxa_names_from_fastas(args.fasta)
    print "\n"
    if not args.extend:
        if args.db is None:
            db = os.path.join(args.output, 'probe.matches.sqlite')
        else:
            db = args.db
        # create db to hold results
        conn, c = create_probe_database(db, taxa, uces, True)
    else:
        conn, c = extend_probe_database(args.db, taxa)
    # get duplicate probe sequences for filtering
    if args.dupefile:
        print "Determining duplicate probes..."
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    # iterate over LASTZ files for each taxon
    for lz in glob.glob(os.path.join(args.lastz, '*')):
        # get fasta name from lastz file
        ff = get_fasta_name_from_lastz_pth(lz, args.fasta, args.pattern)
        # get taxon name from lastz file
        taxon = get_taxon_from_filename(ff)
        print "\n{0}\n{1}\n{0}".format('=' * 30, taxon)
        # get lastz matches
        print "\tGetting LASTZ matches from GENOME alignments..."
        matches, probes = get_matches(lz)
        # remove bad loci (dupes)
        print "\tGetting bad (potentially duplicate) GENOME matches..."
        loci_to_skip = []
        for k, v in matches.iteritems():
            # check matches to make sure all is well - keep names lc
            loci_to_skip.extend(quality_control_matches(matches, probes,
                    dupes, k, v, False))
        # convert to set, to keep only uniques
        loci_to_skip = set(loci_to_skip)
        print "\tSkipping {} bad (duplicate hit) loci...".format(len(loci_to_skip))
        # get (and possibly assemble) non-skipped
        seqdict = defaultdict(list)
        # determine those contigs to skip and group those to assemble
        for contig in fasta.FastaReader(ff):
            # make sure all names are lowercase
            contig.identifier = contig.identifier.lower()
            name = contig.identifier.split('|')[-4].strip()
            locus = name.split('_')[0]
            # skip what we identified as bad loci
            if locus not in loci_to_skip:
                seqdict[locus].append(contig)
        output_name = "{}.fasta".format(taxon.replace('_', '-'))
        fout_name = os.path.join(args.output, output_name)
        print "\tOutput filename is {}".format(output_name)
        fout = fasta.FastaWriter(fout_name)
        # this tracks "fake" contig number
        count = 0
        # this tracks loci kept
        kept = 0
        # when > 1 contig, assemble contigs across matches
        sys.stdout.write("\tWriting and Aligning/Assembling UCE loci with "
                "multiple probes (dot/1000 loci)")
        for k, v in seqdict.iteritems():
            bad = False
            contig_names = []
            if count % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if len(v) == 1:
                # trim ambiguous bases on flanks
                record = v[0]
                orient = [matches[k][0][1]]
                if args.flank:
                    record = trim_uce_reads(record, args.flank)
                contig_names.append(record.identifier)
                record.sequence = record.sequence.strip('N')
                # trim many ambiguous bases within contig
                result = manyn.search(record.sequence)
                if result:
                    uce_start, uce_end = get_probe_positions(record)
                    uce = record.sequence[uce_start:uce_end]
                    record.sequence = snip_if_many_N_bases(manyn, k,
                            record.sequence, uce, verbose=False)
                # change header
                record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                        count, len(record.sequence))
                fout.write(v[0])
            else:
                orient = list(set([m[1] for m in matches[k]]))
                # skip any loci having matches of mixed orientation
                # ['+', '-']
                if len(orient) == 1:
                    # create tempfile for the reads
                    fd, temp = tempfile.mkstemp(suffix='.fasta')
                    os.close(fd)
                    temp_out = fasta.FastaWriter(temp)
                    # write all slices to outfile, trimming if we want
                    for record in v:
                        if args.flank:
                            record = trim_uce_reads(record, args.flank)
                        # keep names of contigs we assembled to store in db
                        # assoc w/ resulting assembled contig name
                        contig_names.append(record.identifier)
                        record.sequence = record.sequence.strip('N')
                        # trim many ambiguous bases within contig
                        result = manyn.search(record.sequence)
                        if result:
                            uce_start, uce_end = get_probe_positions(record)
                            uce = record.sequence[uce_start:uce_end]
                            record.sequence = snip_if_many_N_bases(manyn, k,
                                    record.sequence, uce, verbose=False)
                        temp_out.write(record)
                    # make sure to close the file
                    temp_out.close()
                    # assemble
                    aln = Align(temp)
                    aln.run_alignment()
                    record = fasta.FastaSequence()
                    record.sequence = aln.alignment_consensus.tostring()
                    record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                            count, len(record.sequence))
                    fout.write(record)
                else:
                    bad = True
            if not bad:
                # track contig assembly and renaming data in db
                q = "UPDATE matches SET {0} = 1 WHERE uce = '{1}'".format(taxon, k)
                c.execute(q)
                # generate db match and match map tables for data
                orient_key = "node_{0}({1})".format(count, orient[0])
                q = "UPDATE match_map SET {0} = '{1}' WHERE uce = '{2}'".format(
                        taxon, orient_key, k)
                c.execute(q)
                # keep track of new name :: old name mapping
                for old_name in contig_names:
                    q = "INSERT INTO contig_map VALUES ('{0}', '{1}', '{2}', '{3}')".format(
                            taxon, k, old_name, record.identifier)
                    c.execute(q)
                kept += 1
            # tracking "fake" contig number
            count += 1
        fout.close()
        conn.commit()
        print "\n\t{0} loci of {1} matched ({2:.0f}%), {3} dupes dropped ({4:.0f}%), {5} ({6:.0f}%) kept".format(
            count,
            len(uces),
            float(count) / len(uces) * 100,
            len(loci_to_skip),
            float(len(loci_to_skip)) / len(uces) * 100,
            kept,
            float(kept) / len(uces) * 100
        )
    c.close()
    conn.close()

def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # e.g. "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([get_name(read.identifier, "|", 1, regex=regex,
                repl=args.repl) for read in fasta.FastaReader(args.query)])
    else:
        uces = set([get_name(read.identifier, "|", 1)
                for read in fasta.FastaReader(args.query)])
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
            os.path.join(args.output, 'probe.matches.sqlite'),
            organisms, uces)
    print "Processing:"
    for contig in fasta_files:
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(args.output,
                os.path.splitext(os.path.basename(contig))[0] + '.lastz')
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.query, args.coverage,
                args.identity, output)
        lzstdout, lztstderr = alignment.run()
        # parse the lastz results of the alignment
        matches, orientation, revmatches = \
            defaultdict(set), defaultdict(set), defaultdict(set)
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_name(lz.name1)
                uce_name = get_name(lz.name2, "|", 1, regex=regex,
                        repl=args.repl)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(uces_matching_mult_contigs)
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_print_output(critter, matches, contigs, probe_dupes,
                contigs_matching_mult_uces, uces_matching_mult_contigs)

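# A minimal sketch of what the two dupe-check helpers above are assumed to
# compute: the keys of a one-to-many match map, i.e. contigs matching more
# than one UCE (or probes matching more than one contig). Assuming that
# semantics, each helper would look roughly like:
def _keys_matching_multiple_values(mapping):
    # mapping is a dict of name -> set of matched names
    return set([k for k, v in mapping.iteritems() if len(v) > 1])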