def readfiles(indir):
    """Read every 'fa' file in *indir* and index the results by sample name.

    The file list comes from ``iseqlib.getfiles(indir, 'fa')``.  Each file is
    parsed with ``readfile`` and stored under the file name with its trailing
    ``.fa`` removed.

    Args:
        indir: directory holding the input files.

    Returns:
        dict mapping sample name -> whatever ``readfile`` returns for that
        sample's file.
    """
    ext = 'fa'
    suffix = '.' + ext
    sample2freqs = {}
    for filename in iseqlib.getfiles(indir, ext):
        # str.rstrip(ext) removes a *set of characters*, not a suffix, so it
        # can eat into the sample name; cut the exact ".fa" suffix instead.
        if filename.endswith(suffix):
            sample = filename[:-len(suffix)]
        else:
            sample = filename
        sample2freqs[sample] = readfile(os.path.join(indir, filename))
    return sample2freqs
def main():
    """Command-line entry point for clustering clones from NCBI XML files.

    Steps, in order:
      1. Parse options (``iseqlib.initOptions`` + ``addOptions``).  When
         ``minExpFreq`` > 0, ``sample2total`` is mandatory and is replaced by
         the result of ``readSample2total``.
      2. For every 'xml' file in ``options.indir``, read it with
         ``readNcbiXml`` and hand the hits to ``separateClusters`` together
         with the four open output files ``<outbasename>-1.txt`` ..
         ``<outbasename>-4.txt`` and the ``type2count`` / ``type2clones``
         accumulators that ``separateClusters`` fills in.
      3. Write ``<outbasename>-summary.txt`` with the thresholds used and one
         "type<TAB>count" line per entry of ``type2count``.
      4. For every type in ``type2clones``, write a FASTA-like file (name
         chosen by ``getFaFilename``) with one record per (clone, vj) pair.
         If ``options.vfile`` is given, the V sequence returned by
         ``readVfile`` is prepended to the sequence line.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    if options.minExpFreq > 0 and not options.sample2total:
        parser.error("--sample2total is required as --minExpandedFreq > 0\n")
    if options.sample2total:
        options.sample2total = readSample2total(options.sample2total)

    ext = 'xml'
    suffix = '.' + ext
    infiles = iseqlib.getfiles(options.indir, ext)
    outbasename = os.path.join(options.outdir, options.basename)

    type2count = {}
    type2clones = {}

    # Open the four per-type text files; the finally clause guarantees they
    # are closed even if parsing or clustering raises half-way through.
    handles = []
    try:
        for index in range(1, 5):
            handles.append(open("%s-%d.txt" % (outbasename, index), 'w'))
        fh1, fh2, fh3, fh4 = handles

        for filename in infiles:
            infile = os.path.join(options.indir, filename)
            # str.rstrip(ext) strips a character set, not a suffix; remove the
            # exact ".xml" suffix so the vj name is never truncated.
            if filename.endswith(suffix):
                vjname = filename[:-len(suffix)]
            else:
                vjname = filename
            clone2hits = readNcbiXml(infile, options)
            separateClusters(clone2hits, options.sample2total,
                             options.minExpFreq, options.minExpSize,
                             options.minExpClones, options.minMotifClones,
                             vjname, fh1, fh2, fh3, fh4,
                             type2count, type2clones)
    finally:
        for handle in handles:
            handle.close()

    # Summary stats
    summaryFile = "%s-summary.txt" % outbasename
    with open(summaryFile, 'w') as fh:
        fh.write("#min similarity: %f\n"
                 "#min length: %d\n"
                 "#min size to be called expanded clones: %d\n"
                 "#min number of expanded clones required for type 1: %d\n"
                 "#min number of clones supported a single motif for type 2: %d\n"
                 % (options.minPos, options.minLen, options.minExpSize,
                    options.minExpClones, options.minMotifClones))
        for cloneType, count in type2count.items():
            fh.write("%s\t%d\n" % (cloneType, count))

    # Print type2clones
    v2seq = {}
    if options.vfile:
        v2seq = readVfile(options.vfile)

    for cloneType, header2vj2seq in type2clones.items():
        outfile = getFaFilename(cloneType, outbasename)
        with open(outfile, 'w') as f:
            f.write("#%s\n" % cloneType)
            for header, vj2seq in header2vj2seq.items():
                # Only the first three ';'-separated fields are passed to
                # getCloneInfo.  Keep the original header untouched so that
                # every vj of this clone is parsed from the same text (the
                # reformatted header has the sequence in the second field).
                cloneKey = ";".join(header.split(";")[:3])
                for vj, seq in vj2seq.items():
                    sample, _cloneId, size = getCloneInfo(cloneKey)
                    # NOTE(review): lstrip('as') removes any leading run of
                    # the characters 'a'/'s', not the literal prefix "as" -
                    # confirm this matches the sample naming scheme.
                    faHeader = "%s;%s;%d;%s" % (sample.lstrip('as'), seq,
                                                size, vj)
                    f.write(">%s\n" % faHeader)
                    # The V gene name is the part of vj before the first '.'.
                    v = vj.split(".")[0]
                    if v in v2seq:
                        seq = v2seq[v] + seq
                    f.write("%s\n" % seq)
def main():
    """Command-line entry point for classifying blastclust clusters.

    Steps, in order:
      1. Parse options (``iseqlib.initOptions`` + ``addOptions``).  When
         ``minExpFreq`` > 0, ``sample2total`` is mandatory and is replaced by
         the result of ``readSample2total``.
      2. For every 'txt' file in ``options.indir``, read the clusters with
         ``readBlastclustOutfiles`` and pass them to ``separateClusters``
         together with the five open output files ``<outbasename>-1.txt`` ..
         ``<outbasename>-5.txt`` and the ``type2count`` / ``type2clusters``
         accumulators.  The (total, expanded) counts it returns are summed
         over all files.
      3. Write ``<outbasename>-summary.txt`` with the thresholds, the two
         totals, and one "type<TAB>description<TAB>count" line per type
         (descriptions come from ``clusterId2info``).
      4. Load the optional V sequences (``options.vfile``) and every 'fa'
         file in ``options.fadir`` with ``readFaFile``, then for each type
         write ``<outbasename>-<type>.fa`` listing its clusters by
         decreasing ``totalReads``, one FASTA record per clone.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    if options.minExpFreq > 0 and not options.sample2total:
        parser.error("--sample2total is required as --minExpandedFreq > 0\n")
    if options.sample2total:
        options.sample2total = readSample2total(options.sample2total)

    ext = 'txt'
    suffix = '.' + ext
    infiles = iseqlib.getfiles(options.indir, ext)
    outbasename = os.path.join(options.outdir, options.basename)

    type2count = {}
    type2clusters = {}
    totalseq = 0
    expseq = 0

    # Open the five per-type text files; the finally clause guarantees they
    # are closed even if reading or classification raises half-way through.
    handles = []
    try:
        for index in range(1, 6):
            handles.append(open("%s-%d.txt" % (outbasename, index), 'w'))
        fh1, fh2, fh3, fh4, fh5 = handles

        for filename in infiles:
            infile = os.path.join(options.indir, filename)
            # str.rstrip(ext) strips a character set, not a suffix; remove the
            # exact ".txt" suffix so the vj name is never truncated.
            if filename.endswith(suffix):
                vjname = filename[:-len(suffix)]
            else:
                vjname = filename
            clusters = readBlastclustOutfiles(infile, vjname, options)
            total, exp = separateClusters(clusters, options.minExpFreq,
                                          options.minExpSize,
                                          options.minExpClones,
                                          fh1, fh2, fh3, fh4, fh5,
                                          type2count, type2clusters)
            totalseq += total
            expseq += exp
    finally:
        for handle in handles:
            handle.close()

    # Summary stats
    summaryFile = "%s-summary.txt" % outbasename
    with open(summaryFile, 'w') as fh:
        fh.write("#min size to be called expanded clones: %d\n"
                 "#min number of expanded clones required for type 1: %d\n"
                 % (options.minExpSize, options.minExpClones))
        id2info = clusterId2info()
        fh.write("Total number of clones: %d\n" % totalseq)
        fh.write("Total number of expanded clones: %d\n" % expseq)
        for clusterType in sorted(type2count):
            fh.write("%d\t%s\t%d\n" % (clusterType, id2info[clusterType],
                                       type2count[clusterType]))

    # Read the V sequences (optional)
    v2seq = {}
    if options.vfile:
        v2seq = readFaFile(options.vfile)

    # Read the per-vj fasta files: vj name -> result of readFaFile
    ext = 'fa'
    suffix = '.' + ext
    vj2id2seq = {}
    for filename in iseqlib.getfiles(options.fadir, ext):
        infile = os.path.join(options.fadir, filename)
        if filename.endswith(suffix):
            vjname = filename[:-len(suffix)]
        else:
            vjname = filename
        vj2id2seq[vjname] = readFaFile(infile)

    # Print the clusters of each type
    for clusterType, clusters in type2clusters.items():
        outfile = "%s-%d.fa" % (outbasename, clusterType)
        with open(outfile, 'w') as f:
            f.write("#Type: %s\n" % clusterType)
            clusters = sorted(clusters, key=lambda c: c.totalReads,
                              reverse=True)
            for cluster in clusters:
                f.write("#Cluster %s\n" % cluster.getDesc())
                id2seq = vj2id2seq[cluster.vj]
                # The V gene name is the part of vj before the first '.'.
                v = cluster.vj.split(".")[0]
                vseq = v2seq[v] if v in v2seq else ""
                for clone in cluster.clones:
                    seq = id2seq[clone.desc]
                    # Drop the first len(vseq) characters of the stored
                    # sequence to get the part used in the header.
                    cdr3seq = seq[len(vseq):]
                    header = "%s;%s;%d;%s" % (clone.sample, cdr3seq,
                                              clone.size, cluster.vj)
                    f.write(">%s\n" % header)
                    f.write("%s\n" % seq)