def get_dupes(lastz_file, regex, format):
    """Return the lowercased names of loci flagged as duplicates.

    Reads a lastz alignment of probes against themselves, reduces every
    target and query name with ``new_get_probe_name(name, regex)`` and flags
    a locus whenever it aligns to a name other than its own.

    Args:
        lastz_file: path handed to ``lastz.Reader``.
        regex: pattern forwarded to ``new_get_probe_name``.
        format: forwarded to ``lastz.Reader`` as ``long_format``.

    Returns:
        A set of lowercased locus names.
    """
    hits_by_target = defaultdict(list)
    flagged = set()
    # Probe designations are stripped because probes of a locus share a name.
    print("Parsing lastz file...")
    for record in lastz.Reader(lastz_file, long_format=format):
        target = new_get_probe_name(record.name1, regex)
        query = new_get_probe_name(record.name2, regex)
        hits_by_target[target].append(query)
    print("Screening results...")
    for target, queries in hits_by_target.items():
        if len(queries) <= 1:
            # A lone hit is expected to be the locus aligning to itself.
            if target != queries[0]:
                flagged.add(target)
            continue
        # Several hits: every hit that is not the locus itself marks both
        # names as duplicates.
        for query in queries:
            if query != target:
                flagged.add(target)
                flagged.add(query)
    return set(name.lower() for name in flagged)
def main():
    """Write one BED track of probe matches per lastz file in ``args.input``.

    Every file matching ``*lastz*`` is parsed with ``lastz.Reader``; hits are
    grouped by probe name (the text before the first ``|`` of ``name2``) and
    written through ``write_bed_file`` to ``<name>.probe.bed`` in
    ``args.output``, where ``<name>`` is captured from ``_v_<name>.lastz`` in
    the file name.  A warning is logged for each additional hit of a probe
    that was already written.

    Files whose name does not contain the ``_v_<name>.lastz`` pattern are
    skipped with a warning instead of failing on a ``None`` match.
    """
    args = get_args()
    log = setup_logger()
    for lastz_path in glob.glob(os.path.join(args.input, "*lastz*")):
        search_result = re.search(
            '_v_([A-Za-z0-9]+).lastz', os.path.basename(lastz_path))
        if search_result is None:
            log.warning(
                "Skipping {}: cannot derive an output name".format(lastz_path))
            continue
        outname = search_result.groups()[0]
        log.info("Working on {}".format(outname))
        # Collect every hit first so the output is ordered by probe name.
        probes = defaultdict(list)
        for match in lastz.Reader(lastz_path, long_format=True):
            probe = match.name2.split('|')[0].strip()
            probes[probe].append([match.name1, match.zstart1, match.end1])
        bed_path = os.path.join(args.output, "{}.probe.bed".format(outname))
        # The context manager closes the track even if writing fails.
        with open(bed_path, 'w') as outf:
            outf.write('''track name="uce-v-{0}" description="UCE probe matches to {0}" visibility=2 itemRgb="On"\n'''.format(outname))
            written = set()
            for probe in sorted(probes):
                for chromo, start, end in probes[probe]:
                    if probe in written:
                        log.warning("{0} may have >1 hit".format(probe))
                    else:
                        written.add(probe)
                    # NOTE(review): every hit is written, including the
                    # ones that triggered the warning - confirm intended.
                    write_bed_file(outf, chromo, start, end, probe)
def get_dupe_matches(lastz_file, splitchar="|", pos=1, longfile=False):
    """Group query names by target name for every record in a lastz file.

    Both names are reduced with ``get_name(name, splitchar, pos)``.
    ``longfile`` is passed positionally as the second argument of
    ``lastz.Reader``.

    Returns:
        A ``defaultdict(list)`` mapping each target name to the list of
        query names aligned to it, in file order.
    """
    grouped = defaultdict(list)
    for record in lastz.Reader(lastz_file, longfile):
        target = get_name(record.name1, splitchar, pos)
        query = get_name(record.name2, splitchar, pos)
        grouped[target].append(query)
    return grouped
def get_dupe_matches(lastz_file, longfile=False):
    """Group query UCE names by target UCE name for a lastz file.

    Each record's ``name1`` and ``name2`` are reduced with ``get_uce_name``.
    ``longfile`` is passed positionally as the second argument of
    ``lastz.Reader``.

    Returns:
        A ``defaultdict(list)`` mapping each target name to the list of
        query names aligned to it, in file order.
    """
    grouped = defaultdict(list)
    for record in lastz.Reader(lastz_file, longfile):
        target = get_uce_name(record.name1)
        query = get_uce_name(record.name2)
        grouped[target].append(query)
    return grouped
def main():
    """Cluster loci whose lastz hits overlap and write a config-style report.

    Locus names and lengths come from ``args.fasta`` (second ``|`` field of
    each identifier).  Each lastz record in ``args.lastz`` is tested against
    the spans already registered for its target sequence: an overlapping
    record is added to that span's group, otherwise ``add_new_locus``
    registers it as a new span.

    The report written to ``args.output`` has three sections:
    ``Non-overlapping loci`` (locus -> length), ``Longest loci of group``
    (as chosen by ``get_longest_of_overlapping_loci``) and
    ``Superlocus groups`` (members of each group ordered by start).

    The debugger breakpoints left in the original have been removed so the
    script runs unattended.
    """
    args = get_args()
    # get lengths of loci
    uce_loci = []
    seq_lengths = {}
    for seq in fasta.FastaReader(args.fasta):
        name = seq.identifier.split('|')[1]
        uce_loci.append(name)
        seq_lengths[name] = len(seq.sequence)
    overlappers = defaultdict(dict)
    names = defaultdict(list)
    coords = {}
    for match in lastz.Reader(args.lastz, long_format=True):
        locus = match.name2.split('|')[1]
        chromo = match.name1
        coords[locus] = (match.zstart1, match.end1)
        # assumes each span offers find()/insert() like an interval tree -
        # TODO confirm against add_new_locus
        for pmatch, span in overlappers[chromo].items():
            if span.find(match.zstart1, match.end1):
                span.insert(match.zstart1, match.end1, locus)
                names[pmatch].append(locus)
                break
        else:
            # No registered span overlaps this hit.
            overlappers = add_new_locus(match, overlappers, chromo)
    overlapping_loci = []
    all_groups = []
    for first, others in names.items():
        # group loci into overlapping clusters
        all_groups.append([first] + list(others))
        # get list of "bad loci" so we can determine non-overlappers
        overlapping_loci.append(first)
        overlapping_loci.extend(others)
    non_overlapping_loci = set(uce_loci).difference(overlapping_loci)
    # generate output in config-file format:
    config = ConfigParser.RawConfigParser()
    config.add_section('Non-overlapping loci')
    for locus in non_overlapping_loci:
        config.set('Non-overlapping loci', locus, seq_lengths[locus])
    longest_of_overlapping = get_longest_of_overlapping_loci(
        all_groups, seq_lengths)
    config.add_section('Longest loci of group')
    for locus in longest_of_overlapping:
        config.set('Longest loci of group', locus, seq_lengths[locus])
    config.add_section('Superlocus groups')
    for c, group in enumerate(all_groups):
        # order loci by start position
        starts = sorted(
            [(member, coords[member][0], coords[member][1])
             for member in group],
            key=itemgetter(1))
        sorted_names = [entry[0] for entry in starts]
        print(starts)
        config.set('Superlocus groups', "Group{0}".format(c),
                   ','.join(sorted_names))
    config.write(args.output)
def get_dupes(lastz_file, format):
    """Return lowercased names of probes flagged as duplicates.

    Reads a lastz alignment, keys every hit by the query name (``name2``)
    and stores the hit location as ``"<name1>:<zstart1>-<end1>"``.  Query
    names are expected to be ``"<name>|<position>"``.  A probe is flagged
    when it has more than one hit, or when its single hit is not at the
    position recorded in its own name.

    Args:
        lastz_file: path handed to ``lastz.Reader``.
        format: forwarded to ``lastz.Reader`` as ``long_format``.

    Returns:
        A set of lowercased probe names (the part before ``|``, stripped).

    Raises:
        ValueError: if a query name does not contain exactly one ``|``.
    """
    matches = defaultdict(list)
    dupes = set()
    print("Parsing lastz file...")
    for lz in lastz.Reader(lastz_file, long_format=format):
        target_name = "{}:{}-{}".format(lz.name1, lz.zstart1, lz.end1)
        matches[lz.name2].append(target_name)
    print("Screening results...")
    for query_name, hits in matches.items():
        name, pos = query_name.split('|')
        name = name.strip()
        # The original tested ``v > 1`` (list vs. int), which is only
        # accidentally true on Python 2 and a TypeError on Python 3; the
        # hit count is what is meant.
        if len(hits) > 1 or pos != hits[0]:
            dupes.add(name)
    # make sure all names are lowercase
    return set(d.lower() for d in dupes)
def main(lformat=True):
    """Write lastz matches that pass the identity/continuity thresholds.

    When ``args.conf`` is given, only matches whose name is an option in one
    of ``args.sections`` are written.  If no sections are requested, every
    section of the config file is used; the original guard
    ``args.conf and args.sections`` made that fallback unreachable.
    Without a config file every passing match is written.

    The match name is the second ``|`` field of ``name2``, or the first
    space-separated field when ``name2`` has no ``|``.

    Args:
        lformat: unused; kept for backward compatibility.  The reader
            format comes from ``args.long_format``.
    """
    args = get_args()
    if args.conf:
        conf = ConfigParser.ConfigParser()
        conf.read(args.conf)
        if not args.sections:
            args.sections = conf.sections()
        items = set()
        for section in args.sections:
            items.update(pair[0] for pair in conf.items(section))
    else:
        items = None
    for match in lastz.Reader(args.lastz, long_format=args.long_format):
        try:
            name = match.name2.split('|')[1]
        except IndexError:
            # No "|" in the name: fall back to the first word.
            name = match.name2.split(' ')[0]
        passes = (match.percent_identity >= args.identity
                  and match.percent_continuity >= args.continuity)
        if not passes:
            continue
        if args.conf is None or (args.conf and items and name in items):
            write_to_outfile(args, match, name)
        else:
            # NOTE(review): the source formatting was lost; this branch is
            # read as reporting passing matches filtered out by the config.
            # Confirm it was not meant for matches failing the thresholds.
            print(name)
    args.output.close()
def get_bgi_matches(lastz_file, stripnum):
    """Collect lastz hits per UCE and the highest probe number seen.

    The UCE name is ``name2`` with ``stripnum`` replaced by ``'s'`` and
    lowercased; the probe number is the integer after the last ``_`` of
    ``name2``.

    Returns:
        ``(matches, probes)``: ``matches`` maps each UCE name to a list of
        ``[target, strand2, zstart1, end1]`` entries (target is the
        lowercased ``get_name(name1)``), and ``probes`` maps each UCE name
        to its largest probe number.
    """
    hits = defaultdict(list)
    highest_probe = defaultdict(int)
    for record in lastz.Reader(lastz_file, long_format=True):
        query = record.name2
        locus = re.sub(stripnum, 's', query).lower()
        number = int(query.rsplit('_', 1)[-1])
        highest_probe[locus] = max(highest_probe[locus], number)
        target = get_name(record.name1).lower()
        hits[locus].append(
            [target, record.strand2, record.zstart1, record.end1])
    return hits, highest_probe
def get_matches(lastz_file):
    """Collect lastz hits per UCE and the highest probe number seen.

    UCE name and probe number are derived from ``name2`` with
    ``get_uce_name`` and ``get_uce_num``.

    Returns:
        ``(matches, probes)``: ``matches`` maps each UCE name to a list of
        ``[target, strand2, zstart1, end1]`` entries (target is the
        lowercased ``get_name(name1)``), and ``probes`` maps each UCE name
        to the largest probe number observed.
    """
    hits = defaultdict(list)
    highest_probe = defaultdict(int)
    for record in lastz.Reader(lastz_file, long_format=True):
        locus = get_uce_name(record.name2)
        number = get_uce_num(record.name2)
        highest_probe[locus] = max(highest_probe[locus], number)
        target = get_name(record.name1).lower()
        hits[locus].append(
            [target, record.strand2, record.zstart1, record.end1])
    return hits, highest_probe
def main():
    """Slice lastz hits out of 2bit genomes and write them as FASTA files.

    For every ``(name, twobit_name)`` pair returned by
    ``get_all_files_from_conf``, the alignments in
    ``<args.lastz>/<name>.lastz`` are passed with the opened 2bit file and
    ``args.flank`` to ``slice_and_return_fasta``; the results are written to
    ``<args.output>/<name>.fasta``.

    The 2bit handle is opened in binary mode inside a context manager and
    the FASTA writer is closed in a ``finally`` block, so neither leaks when
    a genome fails part-way.
    """
    args = get_args()
    conf = ConfigParser.ConfigParser()
    conf.read(args.conf)
    for name, twobit_name in get_all_files_from_conf(conf):
        out_file = os.path.join(args.output, name) + ".fasta"
        lz = os.path.join(args.lastz, name) + ".lastz"
        out = fasta.FastaWriter(out_file)
        try:
            # ``file()`` exists only on Python 2; ``open`` works everywhere.
            with open(twobit_name, 'rb') as twobit_handle:
                tb = twobit.TwoBitFile(twobit_handle)
                count = 0
                for row in lastz.Reader(lz, long_format=True):
                    out.write(slice_and_return_fasta(tb, row, args.flank))
                    count += 1
            print("\t{} sequences written to {}".format(count, out_file))
        finally:
            out.close()
def get_matches(lastz_file, splitchar, components, fish=False):
    """Collect lastz hits per UCE and the highest probe number seen.

    Records whose target name contains ``"hap"`` are reported on stdout and
    skipped.

    Args:
        lastz_file: path handed to ``lastz.Reader`` (long format).
        splitchar: forwarded to ``get_name`` for the target name.
        components: forwarded to ``get_name`` as ``items`` for the target.
        fish: when true, the UCE name is ``get_name(name2, "_", 1)`` and the
            probe number is read from the second ``|`` field of ``name2``;
            otherwise the name is ``get_name(name2, "|", 1)`` and the number
            is the text after the last ``:``.

    Returns:
        ``(matches, probes)``: hits per UCE name as
        ``[target, strand2, zstart1, end1]`` lists, and the largest probe
        number per UCE name.
    """
    hits = defaultdict(list)
    highest_probe = defaultdict(int)
    for record in lastz.Reader(lastz_file, long_format=True):
        # skip hg19 mhc haplotypes
        if "hap" in record.name1:
            # Joined with a space to reproduce the separator the Python 2
            # print statement inserted between its two items.
            print(" ".join(["Skipping: ", record.name1]))
            continue
        query = record.name2
        if fish:
            locus = get_name(query, "_", 1)
            # add 1 because fish probe indexing starts @ 0
            number = int(query.split('|')[1].split('_')[1]) + 1
        else:
            locus = get_name(query, "|", 1)
            number = int(query.split(':')[-1])
        highest_probe[locus] = max(highest_probe[locus], number)
        target = get_name(record.name1, splitchar=splitchar, items=components)
        hits[locus].append(
            [target, record.strand2, record.zstart1, record.end1])
    return hits, highest_probe
def main():
    """Slice lastz hits out of 2bit genomes and write them as FASTA files.

    ``get_all_files_from_conf(conf, args.pattern)`` yields
    ``(short_name, long_name, twobit_name)`` triples.  Genomes listed in
    ``args.exclude`` are skipped.  For the others, alignments in
    ``<args.lastz>/<long_name>`` are passed with the opened 2bit file and
    ``args.flank`` to ``slice_and_return_fasta``; the results are written to
    ``<args.output>/<short_name>.fasta``.

    The 2bit handle is opened in binary mode inside a context manager and
    the FASTA writer is closed in a ``finally`` block, so neither leaks when
    a genome fails part-way.
    """
    args = get_args()
    conf = ConfigParser.ConfigParser()
    # Keep option names case-sensitive.
    conf.optionxform = str
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf, args.pattern)
    for short_name, long_name, twobit_name in all_files:
        if args.exclude and short_name in args.exclude:
            continue
        out_file = os.path.join(args.output, short_name) + ".fasta"
        lz = os.path.join(args.lastz, long_name)
        out = fasta.FastaWriter(out_file)
        try:
            # ``file()`` exists only on Python 2; ``open`` works everywhere.
            with open(twobit_name, 'rb') as twobit_handle:
                tb = twobit.TwoBitFile(twobit_handle)
                count = 0
                for row in lastz.Reader(lz, long_format=True):
                    out.write(slice_and_return_fasta(tb, row, args.flank))
                    count += 1
            print("\t{} sequences written to {}".format(count, out_file))
        finally:
            out.close()
def get_dupes(lastz_file, regex=None, repl=None):
    """Return names of loci that align to something other than themselves.

    Target and query names are reduced with ``get_name(name, "|", 1)``.

    Args:
        lastz_file: path handed to ``lastz.Reader``.
        regex: optional pattern; when given, each flagged name is rewritten
            with ``re.sub(regex, repl, name)`` and lowercased.
        repl: replacement used together with ``regex``.

    Returns:
        The set of flagged names, unmodified when ``regex`` is falsy.
    """
    hits_by_target = defaultdict(list)
    flagged = set()
    for record in lastz.Reader(lastz_file):
        target = get_name(record.name1, "|", 1)
        query = get_name(record.name2, "|", 1)
        hits_by_target[target].append(query)
    for target, queries in hits_by_target.items():
        if len(queries) <= 1:
            # A lone hit is expected to be the locus aligning to itself.
            if target != queries[0]:
                flagged.add(target)
            continue
        # Several hits: anything other than the locus itself flags both.
        for query in queries:
            if query != target:
                flagged.add(target)
                flagged.add(query)
    if regex:
        return set(re.sub(regex, repl, name).lower() for name in flagged)
    return flagged
def main():
    """Match probes to a single contig file and write the matching contigs.

    Steps, in order: build the set of UCE names from the probe FASTA
    (``args.query``); optionally load duplicate probes from
    ``args.dupefile``; align the probes to ``args.contigs`` with lastz,
    writing to ``args.align``; record contig/UCE matches; drop ambiguous
    contigs; store the result in the probe database; finally write every
    contig recorded in the ``match_map`` table to ``args.output`` with the
    UCE name prefixed to its id.
    """
    args = get_args()
    if args.regex and args.repl is not None:
        # Example pattern: "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
                    for read in fasta.FastaReader(args.query)])
    else:
        uces = set([get_name(read.identifier, "|", 1)
                    for read in fasta.FastaReader(args.query)])
        regex = None
    # ``dupes`` only exists when a dupefile was given; every later use is
    # guarded by ``args.dupefile``.
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    # Single-file variant: ``args.contigs`` is used directly (the glob over
    # a directory of FASTA files is disabled).
    contig = args.contigs
    # NOTE(review): ``organisms`` is not used again in this function.
    organisms = ["contigs"]
    conn, c = create_probe_database( uces )
    print "Processing:"
    # Name derived from the file name up to the first ".", with "-"
    # replaced by "_".
    critter = os.path.basename(contig).split('.')[0].replace('-', "_")
    contigs = contig_count(contig)
    # align the probes to the contigs
    alignment = lastz.Align( contig, args.query, args.coverage, args.identity, args.align )
    lzstdout, lztstderr = alignment.run()
    # parse the lastz results of the alignment
    matches, orientation, revmatches = \
        defaultdict(set), defaultdict(set), defaultdict(set)
    probe_dupes = set()
    if not lztstderr:
        for lz in lastz.Reader(args.align ):
            # get strandedness of match
            contig_name = get_name(lz.name1)
            uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
            if args.dupefile and uce_name in dupes:
                probe_dupes.add(uce_name)
            else:
                matches[contig_name].add(uce_name)
                orientation[uce_name].add(lz.strand2)
                revmatches[uce_name].add(contig_name)
    else:
        # lastz wrote to stderr: report it and carry on with empty matches.
        print "Error in lastz:"
        print "STDerr:"
        print lztstderr
        print "STDout:"
        print lzstdout
    # we need to check nodes for dupe matches to the same probes
    contigs_matching_mult_uces = check_contigs_for_dupes(matches)
    uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
    nodes_to_drop = contigs_matching_mult_uces
    nodes_to_drop_one_of = uces_matching_mult_contigs
    # remove dupe and/or dubious nodes/contigs; iterate over a copy because
    # ``matches`` is modified inside the loop
    match_copy = copy.deepcopy(matches)
    already_observed = list()
    for k in match_copy.keys():
        if k in nodes_to_drop:
            del matches[k]
        elif k in nodes_to_drop_one_of:
            # Keep the first contig seen with a given UCE set and drop
            # later contigs carrying the same set.
            # NOTE(review): ``k`` is a key of ``matches`` (a contig name);
            # confirm check_probes_for_dupes returns contig names.
            if matches[k] in already_observed:
                del matches[k]
            else:
                already_observed.append(matches[k])
    store_lastz_results_in_db(c, matches, orientation, critter)
    conn.commit()
    pretty_print_output( critter, matches, contigs, probe_dupes, contigs_matching_mult_uces, uces_matching_mult_contigs )
    # get all the UCE records from the db
    query = "SELECT uce, {0} FROM match_map WHERE {0} IS NOT NULL".format("contigs")
    c.execute(query)
    # Map the stored contig value, cut at the first "(", to its UCE name.
    data = {row[1].split("(")[0]:row[0] for row in c.fetchall()}
    nodenames = set(data.keys())
    # make sure we don't lose any dupes
    # NOTE(review): dict keys are already unique, so this comparison always
    # holds; duplicates were collapsed when ``data`` was built.
    assert len(data) == len(nodenames), "There were duplicate contigs."
    outp = open(args.output, 'w')
    print "Building UCE fasta:"
    for record in SeqIO.parse(open(contig), 'fasta'):
        # The lookup key is the first two "_"-separated fields of the id.
        name = '_'.join(record.id.split('_')[:2])
        if name.lower() in nodenames:
            record.id = "{0}|{1}".format(data[name.lower()], record.id)
            outp.write(record.format('fasta'))
    outp.close()
def _write_duplicate_report(dupefile, critter, exon_dupe_exons,
                            contigs_matching_mult_exons, matches, revmatches,
                            orientation):
    """Append one sample's duplicate-hit sections to the open ``dupefile``."""
    if len(exon_dupe_exons) != 0:
        dupefile.write(
            "[{} - probes hitting multiple contigs]\n".format(critter))
        for exon in exon_dupe_exons:
            dupefile.write("{}:{}\n".format(
                exon, ', '.join(revmatches[exon])))
        dupefile.write("\n")
    if len(contigs_matching_mult_exons) != 0:
        dupefile.write(
            "[{} - contigs hitting multiple probes]\n".format(critter))
        for dupe in contigs_matching_mult_exons:
            dupefile.write("{}:{}\n".format(dupe, ', '.join(matches[dupe])))
        dupefile.write("\n")
        dupefile.write("[{} - contig orientation]\n".format(critter))
        for dupe in contigs_matching_mult_exons:
            for exon in list(matches[dupe]):
                dupefile.write("{}:{}\n".format(
                    exon, list(orientation[exon])[0]))
        dupefile.write("\n")


def _write_extraction_config(output_folder):
    """Write ``config``: alignment names, then loci from ``match_table.txt``."""
    with open(os.path.join(output_folder, 'config'), 'w') as f:
        print('[Organisms]', file=f)
        for aln in glob.glob(os.path.join(output_folder, '*.lastz')):
            print(os.path.basename(aln).replace('.lastz', ''), file=f)
        print('\n[Loci]', file=f)
        with open(os.path.join(output_folder, 'match_table.txt')) as match_table:
            lines = match_table.readlines()
        # The first line of the exported table is the header.
        for line in lines[1:]:
            print(line.split('\t')[0], file=f)


def main(args):
    """Align reference exons to every contig file and record the matches.

    Reads these attributes of ``args``: ``regex``, ``output``,
    ``reference``, ``dupefile``, ``contigs``, ``keep_duplicates``,
    ``min_coverage``, ``min_identity`` and ``sqlite3``.

    Outputs, all inside ``args.output``: one ``.lastz`` file per contig
    file, ``probe.matches.sqlite``, ``kmer_count.txt``, ``match_table.txt``
    (a tab-separated export of the ``matches`` table) and ``config``.  The
    contig files in ``args.contigs`` are modified in place (renamed to a
    ``sample_`` prefix and cleaned with ``sed``).

    Changes from the original: ``sed`` now targets the file at its path
    after the rename; ``kmer_count.txt``, the duplicates file and the
    reference handle are always closed; the log message names the database
    file that is actually created; the contig-id regex is evaluated once
    per lastz record.

    Raises:
        IOError: if ``args.output`` already exists.
        EnvironmentError: if lastz writes anything to stderr.
    """
    regex = re.compile("^(%s)(?:.*)" % args.regex)
    if os.path.isdir(args.output):
        raise IOError(
            "The directory {} already exists. Please check and remove by hand."
            .format(args.output))
    os.makedirs(args.output)
    with open(args.reference, 'rU') as reference:
        exons = set(
            new_get_probe_name(seq.id, regex)
            for seq in SeqIO.parse(reference, 'fasta'))
    if args.dupefile:
        dupes = get_dupes(log, args.dupefile, regex)
    else:
        dupes = set()

    # NOTE(review): these shell commands are built from unquoted paths;
    # names containing spaces or shell metacharacters will break them.
    for f in glob.glob(os.path.join(args.contigs, '*.fa*')):
        fasta_name = f.split('/')[-1]
        if not fasta_name.startswith('sample'):
            renamed = "%s/sample_%s" % (args.contigs, fasta_name)
            os.system("mv %s %s" % (f, renamed))
            # Clean the file where it now lives; the original ran sed on
            # the old path, which no longer exists after the rename.
            f = renamed
        os.system(
            "sed -i -e '/>/! s=[K,Y,R,S,M,W,B,D,H,V,k,y,r,s,m,w,b,d,h,v]=N=g' %s" % f)
        # Remove the "-e" backup copies some sed implementations leave.
        os.system("rm %s/*-e " % args.contigs)

    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    db_path = os.path.join(args.output, 'probe.matches.sqlite')
    conn, c = create_probe_database(log, db_path, organisms, exons)
    log.info("Processing contig data")
    # open a file for duplicate writing, if we're interested
    if args.keep_duplicates is not None:
        dupefile = open(args.keep_duplicates, 'w')
    else:
        dupefile = None
    log.info("{}".format("-" * 65))
    contig_id_regex = re.compile(r"^(\d*)\s\d*\s\d*.*")
    kmers = {}
    try:
        for contig in sorted(fasta_files):
            critter = os.path.basename(contig).split('.')[0].replace('-', "_")
            output = os.path.join(
                args.output,
                os.path.splitext(os.path.basename(contig))[0] + '.lastz')
            contigs = contig_count(contig)
            # align the probes to the contigs
            alignment = lastz.Align(contig, args.reference, args.min_coverage,
                                    args.min_identity, output)
            lzstdout, lztstderr = alignment.run()
            if lztstderr:
                raise EnvironmentError("lastz: {}".format(lztstderr))
            # parse the lastz results of the alignment
            matches = defaultdict(set)
            orientation = defaultdict(set)
            revmatches = defaultdict(set)
            probe_dupes = set()
            for lz in lastz.Reader(output):
                contig_name = get_contig_name(lz.name1, args)
                exon_name = new_get_probe_name(lz.name2, regex)
                if args.dupefile and exon_name in dupes:
                    probe_dupes.add(exon_name)
                else:
                    matches[contig_name].add(exon_name)
                    orientation[exon_name].add(lz.strand2)
                    revmatches[exon_name].add(contig_name)
            # we need to check nodes for dupe matches to the same probes
            contigs_matching_mult_exons = check_contigs_for_dupes(matches)
            exon_dupe_contigs, exon_dupe_exons = check_loci_for_dupes(
                revmatches)
            nodes_to_drop = contigs_matching_mult_exons.union(
                exon_dupe_contigs)
            # write out duplicates if requested
            if dupefile is not None:
                log.info("Writing duplicates file for {}".format(critter))
                _write_duplicate_report(
                    dupefile, critter, exon_dupe_exons,
                    contigs_matching_mult_exons, matches, revmatches,
                    orientation)
            # remove dupe and/or dubious nodes/contigs
            for k in list(matches.keys()):
                if k in nodes_to_drop:
                    del matches[k]
            # Collect the kmer value of every lastz hit whose contig id
            # (leading digits of the target name) is a retained contig.
            if matches:
                for lz in lastz.Reader(output):
                    contig_id = contig_id_regex.search(lz[1]).groups()[0]
                    if contig_id in matches:
                        kmers.setdefault(contig, []).append(
                            get_kmer_value(lz.name1))
            store_lastz_results_in_db(c, matches, orientation, critter)
            conn.commit()
            pretty_log_output(log, critter, matches, contigs, probe_dupes,
                              contigs_matching_mult_exons, exon_dupe_exons)
    finally:
        if dupefile is not None:
            dupefile.close()

    # One line per contig file: base name and summed kmer values.
    with open(os.path.join(args.output, 'kmer_count.txt'), 'w') as kmerfile:
        for key in kmers:
            count = sum(int(element) for element in kmers[key])
            kmerfile.write(
                "%s : %d\n" % (os.path.basename(key).split('.')[0], count))

    log.info("{}".format("-" * 65))
    log.info("The LASTZ alignments are in {}".format(args.output))
    log.info("The exon match database is in {}".format(db_path))
    text = "Completed"
    log.info(text.center(65, "="))
    # Access the SQL file and export tab-separated text-file
    tsf_out = os.path.join(args.output, 'match_table.txt')
    sql_cmd = "%s -header -nullvalue '.' -separator '\t' %s \"select * from matches;\" > %s" % (
        args.sqlite3, db_path, tsf_out)
    os.system(sql_cmd)
    # Create the config file for the extraction of the desired loci
    _write_extraction_config(args.output)
def main():
    """Match probes to every contig file in a directory and store the hits.

    Builds the set of UCE names from the probe FASTA (``args.query``),
    optionally loads duplicate probes from ``args.dupefile``, then for each
    ``*.fa*`` file in ``args.contigs``: aligns the probes with lastz into
    ``<args.output>/<basename>.lastz``, records contig/UCE matches, drops
    contigs reported by the two duplicate checks, stores the remaining
    matches in ``<args.output>/probe.matches.sqlite`` and prints a summary.
    """
    args = get_args()
    if args.regex and args.repl is not None:
        # Example pattern: "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([
            get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)
        ])
    else:
        uces = set([
            get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)
        ])
        regex = None
    # ``dupes`` only exists when a dupefile was given; every later use is
    # guarded by ``args.dupefile``.
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        os.path.join(args.output, 'probe.matches.sqlite'), organisms, uces)
    print "Processing:"
    for contig in fasta_files:
        # Name derived from the file name up to the first ".", with "-"
        # replaced by "_".
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output, \
            os.path.splitext(os.path.basename(contig))[0] + '.lastz'
        )
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.query, args.coverage,
                                args.identity, output)
        lzstdout, lztstderr = alignment.run()
        # parse the lastz results of the alignment
        matches, orientation, revmatches = \
            defaultdict(set), defaultdict(set), defaultdict(set)
        probe_dupes = set()
        # NOTE(review): if lastz wrote to stderr the results are not parsed
        # and empty matches are stored for this file without any message.
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_name(lz.name1)
                uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(
            uces_matching_mult_contigs)
        # remove dupe and/or dubious nodes/contigs; iterate over a copy
        # because ``matches`` is modified inside the loop
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_print_output(critter, matches, contigs, probe_dupes,
                            contigs_matching_mult_uces,
                            uces_matching_mult_contigs)