import os
import sys
import tarfile

# Imports from the bein/bbcflib ecosystem this module belongs to. The
# remaining helpers (fastqToFasta, exonerate, split_exonerate, count_lines,
# split_file, load_paramsFile, getSeqToFilter, filterSeq, prepareReport,
# createReport, get_primersList, _get_minscore) and the bcDelimiter constant
# are defined elsewhere in this module.
from bein import unique_filename_in
from bein.util import add_pickle
from bbcflib.common import set_file_descr, gzipfile, cat


def demultiplex_workflow(ex, job, gl, file_path="../", via='lsf',
                         logfile=sys.stdout, debugfile=sys.stderr):
    script_path = gl['script_path']
    file_names = {}
    job_groups = job.groups
    resFiles = {}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}
        # Register the barcode FASTA and parameter files for this group.
        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = group.get("primersfile",
                                os.path.join(file_path, primersFilename))
        ex.add(primersFile, description=set_file_descr(primersFilename,
                                                       groupId=gid, step="init", type="fa"))
        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = group.get("paramsfile",
                               os.path.join(file_path, paramsFilename))
        ex.add(paramsFile, description=set_file_descr(paramsFilename,
                                                      groupId=gid, step="init", type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid, run in group['runs'].iteritems():
            infiles.append(run)
            n = count_lines(ex, run)
            tot_counts += n / 4
            # Split large fastq files into chunks of bounded size so the
            # exonerate jobs can run in parallel.
            if n > 10000000:
                allSubFiles.extend(split_file(ex, run, n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous, tot_discarded) = parallel_exonerate(
            ex, allSubFiles, primersFile, (gid, group['name']), via=via, **params)

        # Archive the concatenation of all runs of the group for debugging
        # (register the concatenated file, not the last run of the loop).
        fullFastq = cat(infiles)
        gzipfile(ex, fullFastq)
        ex.add(fullFastq + ".gz",
               description=set_file_descr(group['name'] + "_full_fastq.gz",
                                          groupId=gid, step='exonerate',
                                          view='debug', type="fastq"))

        logfile.write("Will get sequences to filter\n")
        logfile.flush()
        seqToFilter = getSeqToFilter(ex, primersFile)
        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex, resExonerate, seqToFilter,
                                  gid, group['name'], via=via)
        logfile.write("After filterSeq, filteredFastq=%s\n" % filteredFastq)
        logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k, f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex, f) / 4
            # bcDelimiter (module-level constant) separates the parts of the
            # demultiplexing key; replace it for use in file names.
            k2 = k.replace(bcDelimiter, "_")
            if k in filteredFastq:
                file_names[gid][k2] = group['name'] + "_" + k2 + "_filtered"
                ex.add(filteredFastq[k],
                       description=set_file_descr(file_names[gid][k2] + ".fastq",
                                                  groupId=gid, step="final", type="fastq"))
                counts_primers_filtered[k] = count_lines(ex, filteredFastq[k]) / 4
                # Keep the unfiltered per-primer fastq in the debug archive.
                tgz.add(f, arcname=group['name'] + "_" + k2 + ".fastq")
            else:
                file_names[gid][k2] = group['name'] + "_" + k2
                ex.add(f, description=set_file_descr(file_names[gid][k2] + ".fastq",
                                                     groupId=gid, step="final", type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,
                   description=set_file_descr(group['name'] + "_unfiltered_fastq.tgz",
                                              groupId=gid, step="exonerate", type="tar"))

        # Prepare report per group of runs.
        report_ok, reportFile = prepareReport(ex, group['name'], tot_counts,
                                              counts_primers, counts_primers_filtered,
                                              tot_ambiguous, tot_discarded)
        ex.add(reportFile,
               description=set_file_descr(group['name'] + "_report_demultiplexing.txt",
                                          groupId=gid, step="final",
                                          type="txt", view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex, reportFile, reportFile_pdf, script_path)
            ex.add(reportFile_pdf,
                   description=set_file_descr(group['name'] + "_report_demultiplexing.pdf",
                                              groupId=gid, step="final", type="pdf"))
        else:
            logfile.write("*** Probable ambiguous classification: "
                          "total_reads < sum(reads_by_primers) ***\n")
            logfile.flush()

    add_pickle(ex, file_names,
               set_file_descr('file_names', step="final", type='py', view='admin'))
    return resFiles
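# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the pipeline): how the workflow
# above would typically be driven from a bein execution. The MiniLIMS path,
# the 'gl' dict and the 'job' object are hypothetical placeholders; 'job' is
# assumed to come from the web frontend and to expose job.groups as
# {gid: {'name': ..., 'runs': {rid: fastq_path}}}.
#
#   from bein import MiniLIMS, execution
#
#   M = MiniLIMS("/path/to/demultiplex_minilims")    # assumed LIMS location
#   gl = {'script_path': "/path/to/report_scripts"}  # used by createReport
#   with execution(M, description="demultiplexing") as ex:
#       demultiplex_workflow(ex, job, gl, file_path="../", via='local')
# ---------------------------------------------------------------------------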
def parallel_exonerate(ex, subfiles, dbFile, grp_descr,
                       minScore=77, n=1, x=22, l=30, trim=True, via="local"):
    # Convert each fastq chunk to fasta without blocking, then align every
    # chunk against the primer database with exonerate in parallel.
    futures = [fastqToFasta.nonblocking(ex, sf, n=n, x=x, via=via)
               for sf in subfiles]
    futures2 = []
    res = []
    resExonerate = []
    faSubFiles = []
    all_ambiguous = []
    all_ambiguous_fastq = []
    all_unaligned = []
    all_discarded = []
    gid, grp_name = grp_descr
    # Exonerate runs with a relaxed threshold (my_minscore); the strict
    # minScore is only applied in split_exonerate, so reads scoring between
    # the two thresholds end up in the "unaligned" diagnostic file.
    my_minscore = _get_minscore(dbFile)
    primersDict = get_primersList(dbFile)
    for sf in futures:
        subResFile = unique_filename_in()
        faSubFiles.append(sf.wait())
        futures2.append(exonerate.nonblocking(ex, faSubFiles[-1], dbFile,
                                              minScore=my_minscore, via=via,
                                              stdout=subResFile, memory=6))
        resExonerate.append(subResFile)
    # Split each chunk's alignments per primer and collect the rejects.
    for nf, f in enumerate(resExonerate):
        futures2[nf].wait()
        (resSplitExonerate, alignments) = split_exonerate(f, primersDict, minScore,
                                                          l=l, n=n, trim=trim)
        all_unaligned.append(alignments["unaligned"])
        all_ambiguous.append(alignments["ambiguous"])
        all_ambiguous_fastq.append(alignments["ambiguous_fastq"])
        all_discarded.append(alignments["discarded"])
        res.append(resSplitExonerate)

    # Counts below are taken on the concatenation of all chunks; counting
    # only the first chunk would under-report on split inputs.
    # Add the unaligned file only if it is not empty.
    catfile = cat(all_unaligned)
    if count_lines(ex, catfile) > 1:
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_unaligned.txt.gz",
                                          groupId=gid, step="exonerate",
                                          type="txt", view="admin",
                                          comment="scores between %i and %i" % (my_minscore, minScore)))
    # Add the ambiguous file only if it is not empty.
    catfile = cat(all_ambiguous)
    if count_lines(ex, catfile) > 1:
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_ambiguous.txt.gz",
                                          groupId=gid, step="exonerate",
                                          type="txt", view="admin",
                                          comment="multiple equally good classifications"))
    # Add the ambiguous fastq file only if it is not empty.
    catfile = cat(all_ambiguous_fastq)
    tot_ambiguous = count_lines(ex, catfile) / 4
    if tot_ambiguous > 0:
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_ambiguous.fastq.gz",
                                          groupId=gid, step="exonerate", type="fastq",
                                          comment="multiple equally good classifications"))
    # Add the discarded fastq file only if it is not empty.
    catfile = cat(all_discarded)
    tot_discarded = count_lines(ex, catfile) / 4
    if tot_discarded > 0:
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_discarded.fastq.gz",
                                          groupId=gid, step="exonerate",
                                          type="fastq", view="admin",
                                          comment="remaining seq too short"))
    # Keep the first input fasta chunk as a sample, if it is not empty.
    if count_lines(ex, faSubFiles[0]) > 1:
        gzipfile(ex, faSubFiles[0])
        ex.add(faSubFiles[0] + ".gz",
               description=set_file_descr(grp_name + "_input_part.fa.gz",
                                          groupId=gid, step="init",
                                          type="fa", view="admin", comment="part"))
    # Keep the first exonerate output chunk as a sample, if it is not empty.
    if count_lines(ex, resExonerate[0]) > 1:
        gzipfile(ex, resExonerate[0])
        ex.add(resExonerate[0] + ".gz",
               description=set_file_descr(grp_name + "_exonerate_part.txt.gz",
                                          groupId=gid, step="exonerate",
                                          type="txt", view="admin", comment="part"))
    # Merge the per-primer fastq chunks into one file per primer.
    resFiles = dict((k, '') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:], out=v[0])
    return (resFiles, tot_ambiguous, tot_discarded)
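# ---------------------------------------------------------------------------
# 'get_primersList' and '_get_minscore' are referenced above but defined
# elsewhere in this module. A minimal sketch of the contract that
# parallel_exonerate relies on, assuming primer FASTA headers of the form
# '>name|field|field|...' (the exact header layout is an assumption, not
# taken from this file):
#
#   def get_primersList(dbFile):
#       """Map each primer name to its '|'-separated header fields."""
#       primers = {}
#       with open(dbFile) as fasta:
#           for line in fasta:
#               if line.startswith(">"):
#                   fields = line[1:].strip().split("|")
#                   primers[fields[0]] = fields[1:]
#       return primers
#
# Whatever its exact form, _get_minscore(dbFile) returns a relaxed threshold
# no greater than the strict 'minScore': exonerate is run at the relaxed
# score, and reads falling between the two thresholds are routed to the
# '_unaligned' file (see the comment= string above).
# ---------------------------------------------------------------------------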