def main():
    # Scatter-plot per-cluster read counts against barcode (UMI) counts.
    # argv[1]: FASTA whose record ids encode (cluster, read multiplicity);
    # argv[2]: FASTA whose record ids encode (cluster, UMI multiplicity).
    # The figure is saved next to argv[1] as read_cnt_to_umi_cnt.png.
    with smart_open(sys.argv[1], "r") as handle:
        cluster_to_read_cnt = {}
        for record in SeqIO.parse(handle, "fasta"):
            cluster, mult = parse_cluster_mult(record.id)
            cluster_to_read_cnt[cluster] = mult
    with smart_open(sys.argv[2], "r") as handle:
        cluster_to_umi_cnt = {}
        for record in SeqIO.parse(handle, "fasta"):
            cluster, mult = parse_cluster_mult(record.id)
            cluster_to_umi_cnt[cluster] = mult
    # Align both arrays on the clusters of the read-count file.
    # NOTE(review): assumes every such cluster also appears in argv[2],
    # otherwise the second comprehension raises KeyError — confirm upstream.
    read_cnt = np.array([
        cluster_to_read_cnt[cluster] for cluster in cluster_to_read_cnt.keys()
    ])
    umi_cnt = np.array([
        cluster_to_umi_cnt[cluster] for cluster in cluster_to_read_cnt.keys()
    ])
    # fit_reg=False: scatter only, no regression line.
    plot = sns.regplot(umi_cnt, read_cnt, fit_reg=False)
    plot.set_ylabel("Read count")
    plot.set_xlabel("Barcode count")
    margin_coef = 0.01
    # ysize = max(read_cnt) - min(read_cnt)
    # ymargin = margin_coef * ysize
    xsize = max(umi_cnt) - min(umi_cnt)
    xmargin = margin_coef * xsize
    ymargin = 0.1
    # y axis is log-scaled below, so the y margin is applied multiplicatively
    # (lower bound just under 1, upper bound just above the maximum).
    plt.ylim(1.0 / (1 + ymargin), max(read_cnt) * (1 + ymargin))
    plt.xlim(min(umi_cnt) - xmargin, max(umi_cnt) + xmargin)
    plt.yscale("log", nonposy="clip")
    plt.savefig(
        os.path.join(os.path.dirname(sys.argv[1]), "read_cnt_to_umi_cnt.png"))
    plt.close()
def run_presto(input_file, output_dir, log=None, remove_tmp=True):
    """Run pRESTO's CollapseSeq on `input_file` and convert its collapsed
    output into a final repertoire FASTA with cluster___<i>___size___<n> ids.

    input_file  -- reads in FASTA/FASTQ (possibly compressed); normalized
                   to plain FASTA before the pRESTO call
    output_dir  -- working directory; receives input_reads.fasta, the
                   pRESTO output, time.txt and final_repertoire.fa
    log         -- logger object; a FakeLog() stub is used when None
    remove_tmp  -- when True, delete the intermediate FASTA files
    """
    if log is None:
        log = FakeLog()
    mkdir_p(output_dir)
    # Decompress / convert the input into a plain FASTA pRESTO can read.
    input_file_new = "%s/input_reads.fasta" % output_dir
    fastx2fastx(input_file, input_file_new)
    args = {"input_file": input_file_new, "output_dir": output_dir}
    timer = Timer()
    support.sys_call(
        "CollapseSeq.py -s %(input_file)s --outdir %(output_dir)s --outname presto"
        % args,
        log=log)
    # Record the elapsed wall-clock time of the pRESTO run.
    timer.stamp(output_dir + "/time.txt")
    presto_output = output_dir + "/presto_collapse-unique.fasta"
    repertoire_fa = output_dir + "/final_repertoire.fa"
    with smart_open(presto_output) as fin, smart_open(repertoire_fa,
                                                      "w") as fout:
        for i, record in enumerate(
                SeqIO.parse(fin, idFormatByFileName(presto_output))):
            # Fix: use `desc` instead of shadowing the builtin `id`.
            desc = record.description
            size = parse_presto_id(desc)
            record.id = record.description = "cluster___%d___size___%d" % (
                i, size)
            SeqIO.write(record, fout, "fasta")
    if remove_tmp:
        os.remove(input_file_new)
        os.remove(presto_output)
def convert_mixcr_output_to_igrec(input_file, output_file):
    """Convert a two-column MiXCR table (sequence, abundance) into an
    IgReC-style FASTA with ">cluster___<i>___size___<abundance>" headers."""
    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        # Drop the header row before enumerating data lines.
        fin.next()
        cluster_index = 0
        for line in fin:
            sequence, abundance = line.strip().split()
            fout.write(">cluster___%d___size___%d\n" %
                       (cluster_index, int(abundance)))
            fout.write(sequence + "\n")
            cluster_index += 1
def multiplex_repertoire(input_file, output_file):
    """Expand every repertoire sequence into `mult` identical records,
    one per copy, with antibody_<cluster>_multiplicity_<m>_copy_<i> ids."""
    in_fmt = idFormatByFileName(input_file)
    out_fmt = idFormatByFileName(output_file)
    # Quality values are needed unless we emit plain FASTA.
    assert out_fmt == "fasta" or in_fmt == "fastq"
    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for rec in SeqIO.parse(fin, in_fmt):
            cluster, mult = parse_cluster_mult(str(rec.description))
            for copy_no in xrange(1, mult + 1):
                rec.id = rec.description = (
                    "antibody_%s_multiplicity_%d_copy_%d" %
                    (cluster, mult, copy_no))
                SeqIO.write(rec, fout, out_fmt)
def convert_abvitro_to_repertoire(input_file, output_file):
    """Rewrite AbVitro assembled-consensus headers into
    cluster___<cluster>___size___<mult> repertoire ids."""
    in_fmt = idFormatByFileName(input_file)
    out_fmt = idFormatByFileName(output_file)
    # Quality values are needed unless we emit plain FASTA.
    assert out_fmt == "fasta" or in_fmt == "fastq"
    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for rec in SeqIO.parse(fin, in_fmt):
            cluster, mult = parse_abvitro_assembled_header(
                str(rec.description))
            rec.id = rec.description = "cluster___%s___size___%d" % (cluster,
                                                                     mult)
            SeqIO.write(rec, fout, out_fmt)
def jit_fx_file(input_file,
                output_file,
                error_rate=2,
                random_errors=True,
                min_error=0,
                erroneous_site_len=10005000,
                seed=None):
    # Inject artificial substitution errors ("jitter") into each read.
    #
    # error_rate         -- Poisson mean (random_errors=True) or the exact
    #                       number of errors per read (random_errors=False)
    # min_error          -- lower bound on the number of errors per read
    # erroneous_site_len -- only the first this-many bases may be mutated
    #                       (NOTE(review): 10005000 looks like a typo for
    #                       1000/5000 — effectively "whole read"; confirm)
    # seed               -- seeds both `random` and numpy RNGs for
    #                       reproducibility; None keeps them unseeded
    import numpy as np
    from Bio import Seq
    import random
    output_format = idFormatByFileName(output_file)
    input_format = idFormatByFileName(input_file)
    # Debug output: echo the seed and the first numpy random draw.
    print seed
    random.seed(seed)
    np.random.seed(seed)
    print np.random.ranf(1)
    with smart_open(input_file) as fh, smart_open(output_file, "w") as fout:
        for record in SeqIO.parse(fh, input_format):
            n_errors = np.random.poisson(error_rate,
                                         1)[0] if random_errors else error_rate
            if n_errors < min_error:
                n_errors = min_error
            # Distinct error positions within the mutable prefix of the read.
            positions = random.sample(
                range(min(len(record.seq), erroneous_site_len)), n_errors)
            s = list(str(record.seq))
            for pos in positions:
                # Substitute the base with its complement via RC().
                s[pos] = RC(s[pos])
            if input_format == "fastq":
                phred_quality = record.letter_annotations["phred_quality"]
            # letter_annotations must be cleared before replacing the seq.
            record.letter_annotations = {}
            record.seq = Seq.Seq("".join(s))
            if output_format == "fastq":
                if input_format == "fastq":
                    # Keep the original qualities of the (now mutated) read.
                    record.letter_annotations["phred_quality"] = phred_quality
                else:
                    # FASTA input has no qualities: fabricate plausible ones.
                    record.letter_annotations["phred_quality"] = [
                        random.randint(30, 50) for _ in xrange(len(record))
                    ]  # TODO Check it out
            SeqIO.write(record, fout, output_format)
def convert_mixcr2_output_to_igrec(input_file, output_file, initial_reads,
                                   output_rcm):
    # Convert a MiXCR v2 table (seq <TAB> size <TAB> comma-separated read
    # indices) into an IgReC repertoire FASTA plus a read-to-cluster map.
    # Reads that appear in no MiXCR cluster are emitted as singletons.
    with smart_open(initial_reads) as fh:
        record_ids = [
            str(record.description)
            for record in SeqIO.parse(fh, idFormatByFileName(initial_reads))
        ]
    # targets[j] = cluster index assigned to the j-th initial read.
    targets = [None] * len(record_ids)
    with smart_open(input_file) as fh, smart_open(output_file, "w") as fout:
        # Skip header
        fh.next()
        for i, line in enumerate(fh):
            seq, size, ids = line.strip().split("\t")
            ids = ids.strip().split(",")
            ids = map(int, ids)
            for id in ids:
                targets[id] = i
            size = int(size)
            assert size <= len(ids)
            # WHY?????????????
            # if size != len(ids):
            #     print size
            #     print ids
            # Trust the id list over the reported size column.
            size = len(ids)
            fout.write(">cluster___%d___size___%d\n" % (i, size))
            fout.write(seq + "\n")
        # First free cluster index for the unassigned (singleton) reads.
        # NOTE(review): raises ValueError if no read was assigned at all.
        empty_num = max(target for target in targets
                        if target is not None) + 1
        # print empty_num
        with smart_open(initial_reads) as fh:
            for j, record in enumerate(
                    SeqIO.parse(fh, idFormatByFileName(initial_reads))):
                if targets[j] is None:
                    # Emit each orphan read as its own size-1 cluster.
                    targets[j] = empty_num
                    empty_num += 1
                    fout.write(">cluster___%d___size___%d\n" % (targets[j], 1))
                    fout.write(str(record.seq) + "\n")
    with smart_open(output_rcm, "w") as rcm:
        # RCM format: "<read id>\t<cluster index>" per line.
        for id, target_cluster in zip(record_ids, targets):
            assert target_cluster is not None
            rcm.write("%s\t%d\n" % (id, target_cluster))
def generate_rcm(reads_file_name, compressed_file_name, cliques_ids_file_name,
                 out_file):
    """Write a read-to-cluster map: one "<read id>\t<clique id>" line per
    input read, resolved through the compressed-read index map."""
    # Read ids, in input order.
    with smart_open(reads_file_name, "r") as fh:
        read_ids = [
            str(rec.id)
            for rec in SeqIO.parse(fh, idFormatByFileName(reads_file_name))
        ]
    # Clique id of each compressed read.
    with smart_open(cliques_ids_file_name, "r") as fh:
        compread2clique = map(int, fh)
    # Original read index -> compressed read index.
    with smart_open(compressed_file_name, "r") as fh:
        idmap = map(int, fh)
    with smart_open(out_file, "w") as fh:
        for i, read_id in enumerate(read_ids):
            fh.write("%s\t%d\n" % (read_id, compread2clique[idmap[i]]))
def simulated_repertoire_to_final_repertoire(input_file, output_file):
    # Deduplicate a multiplexed simulated repertoire: keep only copy #1 of
    # each antibody and rewrite its id as cluster___<cluster>___size___<n>.
    import random
    output_format = idFormatByFileName(output_file)
    with smart_open(input_file) as fh, smart_open(output_file, "w") as fout:
        for record in SeqIO.parse(fh, "fasta"):
            id = record.description
            cluster, size, copy = parse_final_repertoire_id(id)
            if copy == 1:
                record.id = record.description = "cluster___%s___size___%d" % (
                    cluster, size)
                # FASTA input carries no qualities; fabricate them when the
                # output format requires quality values.
                record.letter_annotations = {}
                if output_format == "fastq":
                    record.letter_annotations["phred_quality"] = [
                        random.randint(30, 50) for _ in xrange(len(record))
                    ]  # TODO Check it out
                    # record.letter_annotations["phred_quality"] = [50] * len(record)
                SeqIO.write(record, fout, output_format)
def parse_vjf_output(filename, readfile):
    # Parse a VJFinder tab-separated output table into per-read V/J hits.
    # Returns a defaultdict keyed BOTH by read description (spaces replaced
    # with "_") and by the read's 0-based index in `readfile`; each value is
    # a dict {"V": HitTableRowVJF, "J": HitTableRowVJF}.
    from collections import defaultdict
    with smart_open(readfile, "rU") as fh:
        parser = SeqIO.parse(fh, idFormatByFileName(readfile))
        # Description -> position of the read in the read file.
        descr_to_ind = {
            str(record.description).replace(" ", "_"): i
            for i, record in enumerate(parser)
        }
    result = defaultdict(dict)
    with open(filename) as csv_file:
        reader = csv.reader(csv_file, delimiter="\t")
        headers = reader.next()
        # Expected columns:
        # Read_name Chain_type V_hit V_start_pos V_end_pos V_score J_hit J_start_pos J_end_pos J_score
        # Columns are located by name, so their order may vary.
        id_col = linear_search(headers, "Read_name")
        Vstart_col = linear_search(headers, "V_start_pos")
        Vend_col = linear_search(headers, "V_end_pos")
        Vgene_col = linear_search(headers, "V_hit")
        Jgene_col = linear_search(headers, "J_hit")
        Jstart_col = linear_search(headers, "J_start_pos")
        Jend_col = linear_search(headers, "J_end_pos")
        for line in reader:
            desc = line[id_col]
            Vstart = int(line[Vstart_col])
            Vend = int(line[Vend_col])
            Jstart = int(line[Jstart_col])
            Jend = int(line[Jend_col])
            Vgene = line[Vgene_col]
            # Vgene = Vgene[:Vgene.find(" ")]
            Jgene = line[Jgene_col]
            # Jgene = Jgene[:Jgene.find(" ")]
            ind = descr_to_ind[desc]
            result[desc]["V"] = HitTableRowVJF("V", desc, Vgene, Vstart, Vend)
            result[desc]["J"] = HitTableRowVJF("J", desc, Jgene, Jstart, Jend)
            # Alias the same hit dict under the numeric read index.
            result[ind] = result[desc]
    return result
type=str, help="input FASTA/FASTQ file with abundances in ids") parser.add_argument("output", type=str, help="output FASTA/FASTQ file") parser.add_argument("--limit", "-l", type=int, default=5, help="size limit (default: %(default)s)") args = parser.parse_args() print "Supernode reporter started..." print "Command line: %s" % " ".join(sys.argv) input_size = output_size = 0 with smart_open(args.input, "r") as fin, smart_open(args.output, "w") as fout: for record in SeqIO.parse(fin, idFormatByFileName(args.input)): input_size += 1 id = str(record.description) size = parse_size(id) assert id is not None if size >= args.limit: SeqIO.write(record, fout, idFormatByFileName(args.output)) output_size += 1 print "%d antibody clusters have abundance >= %d" % (output_size, args.limit) print "%d lowly abundant antibody clusters will be discarded" % ( input_size - output_size, ) print "Highly abundant clusters were written to " + args.output
def stamp(self, filename):
    """Write the elapsed time (seconds, "%f") into `filename` and return it."""
    elapsed = self.delta()
    with smart_open(filename, "w") as out:
        out.write("%f\n" % elapsed)
    return elapsed
parser.add_argument("--output-repertoire", "-r", type=str, help="output file with repertoire sequences") parser.add_argument("--output-rcm", "-R", type=str, help="output file with repertoire RCM") args = parser.parse_args() print "Construct repertoire from TrieCompressor output..." print "Command line: %s" % " ".join(sys.argv) # Fix ids with smart_open(args.input_compressed) as fin, smart_open( args.output_repertoire, "w") as fout: for i, record in enumerate( SeqIO.parse(fin, idFormatByFileName(args.input_compressed))): id = record.description size = parse_size(id) record.id = record.description = "cluster___%d___size___%d" % ( i, size) SeqIO.write(record, fout, idFormatByFileName(args.output_repertoire)) with smart_open(args.input_reads) as fin_reads, smart_open( args.input_map) as fin_map, smart_open(args.output_rcm, "w") as fout_rcm: for record, cluster in izip( SeqIO.parse(fin_reads, idFormatByFileName(args.input_reads)),
type=int, default=10, help="distance threshold [default %(default)d]") parser.add_argument("--lengths", type=str, help="file for read length stats") # parser.add_argument("--subs-map", "-M", # type=str, # help="file for subs table") args = parser.parse_args() barcodes_count = defaultdict(int) print("Reading library...") with smart_open(args.input, "r") as fh: data = list(SeqIO.parse(fh, idFormatByFileName(args.input))) if not args.no_fix_spaces: for record in data: record.description = str(record.description).replace(" ", "_") record.id = record.name = record.description # Omit reads with Ns data = [record for record in data if record.seq.count("N") == 0] data = [ record for record in data if extract_barcode(record.id) is not None ] clusters = defaultdict(list)
if __name__ == "__main__":
    # Benchmark VJFinder alignments against IgBLAST on the same read set.
    args = parse_command_line()
    log = CreateLogger("VJF benchmark")
    if args.log:
        AttachFileLogger(log, args.log)
    # Germline J gene id -> sequence map.
    with open(args.germline_J_file, "rU") as fh:
        germline_J_parser = SeqIO.parse(fh, "fasta")
        germline_J_map = {
            str(record.id): str(record.seq)
            for record in germline_J_parser
        }
    igblast_hits = get_igblast_output(args)
    vjf_hits = get_vjf_output(args)
    with smart_open(args.input, "rU") as fh:
        parser = SeqIO.parse(fh, idFormatByFileName(args.input))
        reads = list(parser)
    # Sanity checks: one IgBLAST hit per read, read ids are unique.
    assert len(reads) == len(igblast_hits)
    ids = [str(record.description) for record in reads]
    assert len(set(ids)) == len(reads)
    stats = benchmark_stats(igblast_hits, vjf_hits, germline_J_map)
    if args.bad_reads:
        # Dump the reads the benchmark classified as bad.
        with smart_open(args.bad_reads, "w") as f:
            SeqIO.write([reads[_] for _ in stats.bad_reads], f,
                        idFormatByFileName(args.bad_reads))
        log.info("Bad reads were written to " + args.bad_reads)
    log.info("Overall reads %d" % stats.all)
parser.add_argument("--input", "-i",
                    required=True,
                    type=str,
                    help="input FASTA/FASTQ file with abundances in ids")
parser.add_argument("--output", "-o",
                    required=True,
                    type=str,
                    help="output FASTA/FASTQ file")
parser.add_argument("--id-map", "-m",
                    type=str,
                    default="",
                    help="map file name; empty (default) for non-producing")
args = parser.parse_args()

print "Fake trie_compressor started"
# Pass reads through unchanged, marking each one as its own size-1 cluster.
read_count = 0
with smart_open(args.input, "r") as fin, smart_open(args.output, "w") as fout:
    for record in SeqIO.parse(fin, idFormatByFileName(args.input)):
        id = str(record.description)
        record.id = record.description = id + "__size__1"
        SeqIO.write(record, fout, idFormatByFileName(args.output))
        read_count += 1
# Optional identity id map: read i maps to compressed read i.
if args.id_map:
    with smart_open(args.id_map, "w") as f_id_map:
        for i in xrange(read_count):
            f_id_map.write("%d\n" % i)
print "Fake trie_compressor finished"