def main():
    args = get_args()
    # runs of >= 20 ambiguous bases (N/n) to trim from extracted slices
    regex = re.compile("[Nn]{20,}")
    if args.dupefile:
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    matches, probes = get_matches(args.lastz, args.splitchar, args.components, args.fish)
    if args.fasta:
        tb = bx.seq.twobit.TwoBitFile(file(args.genome))
    count = 0
    for k, v in matches.iteritems():
        chromo, strand, start, end, skip = quality_control_matches(
            matches, probes, dupes, k, v, args.verbose)
        if not skip and args.fasta:
            prep_and_write_fasta(tb, regex, args.fasta, chromo, strand,
                                 start, end, count, args.flank)
        if not skip and args.bed:
            # BED-like record: chrom, flanked start/end, locus name, score, strand
            args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                chromo, start - args.flank, end + args.flank, k, strand))
        count += 1
    if args.fasta:
        args.fasta.close()
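# prep_and_write_fasta() is defined elsewhere; the genome slicing it needs
# is a one-liner with bx-python's 2bit reader, which supports dict-style
# access by chromosome name.  A minimal sketch, assuming a hypothetical
# genome.2bit file and illustrative coordinates (not from this script):
import bx.seq.twobit
tb = bx.seq.twobit.TwoBitFile(file("genome.2bit"))  # hypothetical path
flanked_slice = tb["chr1"][1000 - 500:1500 + 500]   # start - flank : end + flank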
def main():
    args = get_args()
    uces = set([get_name(read.identifier, "|", 1)
                for read in fasta.FastaReader(args.query)])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this probably needs to be more robust
    organisms = [os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace('-', "_")
                 for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k, v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose:
                    print "{0} is in dupefile".format(k)
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()
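# create_match_database() and store_lastz_results_in_db() are defined
# elsewhere; a plausible minimal sketch of the schema they imply -- one row
# per UCE locus and one integer column per organism (the signature and
# layout here are assumptions, not confirmed by this file):
def create_match_database_sketch(db, organisms, uces):
    import sqlite3
    conn = sqlite3.connect(db)
    c = conn.cursor()
    columns = ", ".join(["{0} integer DEFAULT 0".format(o) for o in organisms])
    c.execute("CREATE TABLE matches (uce text PRIMARY KEY, {0})".format(columns))
    c.executemany("INSERT INTO matches (uce) VALUES (?)", [(u,) for u in uces])
    conn.commit()
    return conn, c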
def check_for_dupes(probe_set):
    """Create some temp files and search a newly-designed probe set for dupes."""
    # write the probes to a tempfile
    f = tempfile.NamedTemporaryFile(mode='w', delete=False)
    for ps in probe_set:
        SeqIO.write(ps, f, 'fasta')
    f.close()
    # align f to itself
    lz = lastz.Align(f.name, f.name, 70, 80)
    lz.run()
    dupes = get_dupes(lz.output, pos=2)
    # clean up the tempfiles
    os.remove(f.name)
    os.remove(lz.output)
    return dupes
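# get_dupes() is also defined elsewhere; a plausible minimal sketch of the
# idea (the parsing details and the meaning of pos are assumptions): read a
# self-to-self lastz report, count hits per name, and return the names hit
# more than once -- probes that align somewhere besides themselves.
def get_dupes_sketch(lastz_file, pos=1):
    from collections import Counter
    hits = Counter()
    for line in open(lastz_file, 'rU'):
        hits[line.strip().split('\t')[pos]] += 1
    return set([name for name, count in hits.iteritems() if count > 1])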
def main():
    args = get_args()
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        # use an empty set so the membership test below works without a dupefile
        dupes = set()
    # get dbSNP data
    all_snps = get_xml_data(args.xml)
    used = set()
    # iterate over intersections
    args.output.write('rsid,pos,maf,1000g\n')
    for row in args.dbsnp:
        if not row.startswith('UCE'):
            uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(',')
            start, end, snps, snpe = map(int, [start, end, snps, snpe])
            # keep only single-base SNPs we have not yet seen, in non-dupe loci
            if not snpe - snps > 1 and snp not in used and uce not in dupes:
                # get SNP position relative to the middle of the UCE
                middle = int(round((start + end) / 2, 0))
                rel_snp_pos = snps - middle
                # look up the dbSNP metadata for this rsid
                record = all_snps[snp.strip('rs')]
                thousandg = bool(record.val_1000G and record.val_1000G.lower() == 'true')
                freq = float(record.freq_freq) if record.freq_freq else 0.0
                args.output.write("{0},{1},{2},{3}\n".format(snp, rel_snp_pos, freq, thousandg))
                # make sure we skip any duplicates
                used.add(snp)
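# Worked example of the relative-position arithmetic above (values are
# illustrative, not from any real UCE): a locus spanning 1000-1160 has
# middle 1080, so a SNP starting at 1100 sits 20 bp 3' of the midpoint.
start, end, snps = 1000, 1160, 1100
middle = int(round((start + end) / 2, 0))  # 1080
rel_snp_pos = snps - middle                # 20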
def main():
    args = get_args()
    print get_dupes(args.lastz)
def main():
    args = get_args()
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        # use an empty set so the membership tests below work without a dupefile
        dupes = set()
    used = set()
    # length of the longest UCE interval in the intersection file
    mx = max([int(row.strip('\n').split(',')[3]) - int(row.strip('\n').split(',')[2])
              for row in open(args.dbsnp, 'rU') if not row.startswith('UCE')])
    # get the SNP metadata
    all_snps = get_xml_data(args.xml)
    # find the middle of the longest interval
    overall_middle = int(round(mx / 2, 0))
    # dict to hold, per UCE, an array of SNP frequencies indexed by position
    # relative to the middle of the longest interval
    d = {}
    if args.output2:
        args.output2.write('UCE,chromo,uce-start,uce-end,snp-name,snp-start,snp-end,1000gvalidated,freq\n')
    # iterate over intersections
    for row in open(args.dbsnp, 'rU'):
        if row.startswith('UCE'):
            continue
        uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(',')
        start, end, snps, snpe = map(int, [start, end, snps, snpe])
        # get middle of this UCE
        middle = int(round((start + end) / 2, 0))
        if snp not in used:
            # keep single-base, 1000 Genomes-validated SNPs that have a
            # frequency, in non-duplicate loci
            if not snpe - snps > 1 \
                    and uce not in dupes \
                    and all_snps[snp.strip('rs')].val_1000G == 'true' \
                    and all_snps[snp.strip('rs')].freq_freq is not None:
                if uce not in d:
                    d[uce] = numpy.zeros(mx + 1)
                rel_snp_pos = snps - middle
                d[uce][overall_middle + rel_snp_pos] = float(all_snps[snp.strip('rs')].freq_freq)
            if args.output2 and not snpe - snps > 1 and uce not in dupes:
                args.output2.write("{},{},{},{},{},{},{},{},{}\n".format(
                    uce, chromo, start, end, snp, snps, snpe,
                    all_snps[snp.strip('rs')].val_1000G,
                    all_snps[snp.strip('rs')].freq_freq))
            used.add(snp)
    # rows = UCE loci, columns = positions relative to the overall middle
    stack = numpy.array([d[uce] for uce in d])
    # compute the running average of SNP counts per position
    win = 25
    data = (stack > 0).sum(axis=0)
    weightings = numpy.repeat(1.0, win) / win
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    args.output.write("pos,avg,ci,datatype\n")
    for base in range(len(running)):
        pos = base - overall_middle
        args.output.write("{},{},,running\n".format(pos, running[base]))
    # also output the average heterozygosity of 1000 Genomes-validated, hetero SNPs
    for base in range(len(stack[0])):
        pos = base - overall_middle
        values = numpy.where(stack[:, base] != 0)[0]
        avg = numpy.mean(stack[:, base][values])
        ci = 1.96 * (numpy.std(stack[:, base][values], ddof=1) /
                     numpy.sqrt(len(stack[:, base][values])))
        args.output.write("{},{},{},mean_hetero\n".format(pos, avg, ci))
    # running average of per-position mean heterozygosity: mean across loci
    # at each position, smoothed with the same window
    data = numpy.mean(stack, axis=0)
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    for base in range(len(running)):
        pos = base - overall_middle
        args.output.write("{},{},,running_hetero\n".format(pos, running[base]))
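# The running averages above use the standard convolution trick: convolving
# with a length-win uniform kernel gives a moving average, and slicing
# [win - 1:-(win - 1)] trims the partial windows at both edges.  A minimal,
# self-contained illustration with toy data:
import numpy
data = numpy.array([0., 0., 1., 1., 1., 0., 0.])
win = 3
weightings = numpy.repeat(1.0, win) / win
running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
# running is the centered 3-point mean: [0.33, 0.67, 1.0, 0.67, 0.33]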
def main():
    args = get_args()
    # compile some regular expressions we'll use later
    stripnum = re.compile("s_[0-9]+$")
    manyn = re.compile("[Nn]{20,}")
    # get names of loci and taxa
    uces = get_uce_names_from_probes(args.probes, regex=stripnum, repl='s', lower=True)
    taxa = get_taxa_names_from_fastas(args.fasta)
    print "\n"
    if not args.extend:
        if args.db is None:
            db = os.path.join(args.output, 'probe.matches.sqlite')
        else:
            db = args.db
        # create db to hold results
        conn, c = create_probe_database(db, taxa, uces, True)
    else:
        conn, c = extend_probe_database(args.db, taxa)
    # get duplicate probe sequences for filtering
    if args.dupefile:
        print "Determining duplicate probes..."
        dupes = get_dupes(args.dupefile, longfile=False)
        # because of structure, strip the probe designation from dupes,
        # leaving only the locus name; lowercase all
        dupes = set([re.sub(stripnum, 's', d).lower() for d in dupes])
    else:
        dupes = set()
    # iterate over LASTZ files for each taxon
    for lz in glob.glob(os.path.join(args.lastz, '*')):
        # get taxon name from lastz file
        taxon = get_taxon_from_filename(lz)
        print "\n{0}\n{1}\n{0}".format('=' * 30, taxon)
        # get fasta name from lastz file
        ff = get_fasta_name_from_lastz_pth(lz, args.fasta)
        # get lastz matches
        print "\tGetting LASTZ matches from GENOME alignments..."
        if not args.oldprobe:
            matches, probes = get_bgi_matches(lz, stripnum)
        else:
            matches, probes = get_old_probe_matches(lz)
        # remove bad loci (dupes)
        print "\tGetting bad (potentially duplicate) GENOME matches..."
        loci_to_skip = []
        for k, v in matches.iteritems():
            # check matches to make sure all is well - keep names lowercase
            loci_to_skip.extend(quality_control_matches(matches, probes, dupes, k, v, False))
        # convert to set, to keep only uniques
        loci_to_skip = set(loci_to_skip)
        print "\tSkipping {} bad (duplicate hit) loci...".format(len(loci_to_skip))
        # get (and possibly assemble) non-skipped contigs
        seqdict = defaultdict(list)
        # determine those contigs to skip and group those to assemble
        for contig in fasta.FastaReader(ff):
            # make sure all names are lowercase
            contig.identifier = contig.identifier.lower()
            if not args.oldprobe:
                name = contig.identifier.split('|')[-3]
                locus = re.sub(stripnum, 's', name)
            else:
                locus = contig.identifier.split('|')[-5]
            # skip what we identified as bad loci
            if locus not in loci_to_skip:
                seqdict[locus].append(contig)
        output_name = "{}.fasta".format(taxon.replace('_', '-'))
        fout_name = os.path.join(args.output, output_name)
        print "\tOutput filename is {}".format(output_name)
        fout = fasta.FastaWriter(fout_name)
        # this tracks "fake" contig number
        count = 0
        # this tracks loci kept
        kept = 0
        # when > 1 contig, assemble contigs across matches
        sys.stdout.write("\tWriting and Aligning/Assembling UCE loci with multiple probes (dot/1000 loci)")
        for k, v in seqdict.iteritems():
            bad = False
            contig_names = []
            if count % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if len(v) == 1:
                # trim ambiguous bases on flanks
                record = v[0]
                orient = [matches[k][0][1]]
                if args.flank:
                    record = trim_uce_reads(record, args.flank)
                contig_names.append(record.identifier)
                record.sequence = record.sequence.strip('N')
                # trim runs of ambiguous bases within the contig
                result = manyn.search(record.sequence)
                if result:
                    uce_start, uce_end = get_probe_positions(record)
                    uce = record.sequence[uce_start:uce_end]
                    record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                # change header
                record.identifier = ">Node_{0}_length_{1}_cov_1000".format(count, len(record.sequence))
                fout.write(record)
            else:
                orient = list(set([m[1] for m in matches[k]]))
                # skip any loci having matches of mixed orientation ['+', '-']
                if len(orient) == 1:
                    # create tempfile for the reads
                    fd, temp = tempfile.mkstemp(suffix='.fasta')
                    os.close(fd)
                    temp_out = fasta.FastaWriter(temp)
                    # write all slices to outfile, trimming if we want
                    for record in v:
                        if args.flank:
                            record = trim_uce_reads(record, args.flank)
                        # keep names of contigs we assembled to store in db assoc.
                        # w/ resulting assembled contig name
                        contig_names.append(record.identifier)
                        record.sequence = record.sequence.strip('N')
                        # trim runs of ambiguous bases within the contig
                        result = manyn.search(record.sequence)
                        if result:
                            uce_start, uce_end = get_probe_positions(record)
                            uce = record.sequence[uce_start:uce_end]
                            record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                        temp_out.write(record)
                    # make sure to close the file
                    temp_out.close()
                    # assemble
                    aln = Align(temp)
                    aln.run_alignment()
                    record = fasta.FastaSequence()
                    record.sequence = aln.alignment_consensus.tostring()
                    record.identifier = ">Node_{0}_length_{1}_cov_1000".format(count, len(record.sequence))
                    fout.write(record)
                else:
                    bad = True
            if not bad:
                # track contig assembly and renaming data in db
                q = "UPDATE matches SET {0} = 1 WHERE uce = '{1}'".format(taxon, k)
                c.execute(q)
                # generate db match and match map tables for data
                orient_key = "node_{0}({1})".format(count, orient[0])
                q = "UPDATE match_map SET {0} = '{1}' WHERE uce = '{2}'".format(taxon, orient_key, k)
                c.execute(q)
                # keep track of new name :: old name mapping
                for old_name in contig_names:
                    q = "INSERT INTO contig_map VALUES ('{0}', '{1}', '{2}', '{3}')".format(
                        taxon, k, old_name, record.identifier)
                    c.execute(q)
                kept += 1
            # tracking "fake" contig number
            count += 1
        conn.commit()
        print "\n\t{0} loci of {1} matched ({2:.0f}%), {3} dupes dropped ({4:.0f}%), {5} ({6:.0f}%) kept".format(
            count, len(uces), float(count) / len(uces) * 100,
            len(loci_to_skip), float(len(loci_to_skip)) / len(uces) * 100,
            kept, float(kept) / len(uces) * 100)
    c.close()
    conn.close()
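# The manyn regex drives the internal trimming above: it matches runs of 20
# or more ambiguous bases.  A quick, self-contained check of what it does
# and does not match (snip_if_many_N_bases itself is defined elsewhere):
import re
manyn = re.compile("[Nn]{20,}")
assert manyn.search("ACGT" + "N" * 25 + "ACGT") is not None
assert manyn.search("ACGT" + "N" * 10 + "ACGT") is None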