Example #1
0
def demultiplex_workflow(ex, job, gl, file_path="../", via='lsf',
                         logfile=sys.stdout, debugfile=sys.stderr):
    script_path=gl['script_path']
    file_names = {}
    job_groups=job.groups
    resFiles={}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}

        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = os.path.join(file_path,primersFilename)
        ex.add(primersFile,description=set_file_descr(primersFilename,groupId=gid,step="init",type="fa"))

        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = os.path.join(file_path,paramsFilename)
        ex.add(paramsFile,description=set_file_descr(paramsFilename,groupId=gid,step="init",type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid,run in group['runs'].iteritems():
            infiles.append(run)
            n=count_lines(ex,run)
            tot_counts += n/4
            if n>10000000:
                allSubFiles.extend(split_file(ex,run,n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous, tot_discarded) = parallel_exonerate( ex, allSubFiles, primersFile,
                                                                           (gid, group['name']), via=via, **params )
        # gzip the concatenation of all runs in the group, not just the last one
        catfile = cat(infiles)
        gzipfile(ex,catfile)
        ex.add(catfile+".gz",description=set_file_descr(group['name']+"_full_fastq.gz",
                                                        groupId=gid,step='exonerate',view='debug',type="fastq"))
        logfile.write("Will get sequences to filter\n");logfile.flush()
        seqToFilter = getSeqToFilter(ex,primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex,resExonerate,seqToFilter,gid,group['name'],via=via)

        logfile.write("After filterSeq, filteredFastq=%s\n" %filteredFastq);logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k,f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex,f)/4
            if k in filteredFastq:
                file_names[gid][k] = group['name']+"_"+k+"_filtered"
                ex.add(filteredFastq[k],description=set_file_descr(file_names[gid][k]+".fastq",
                                                                   groupId=gid,step="final",
                                                                   type="fastq"))
                counts_primers_filtered[k] = count_lines(ex,filteredFastq[k])/4
                tgz.add( f, arcname=group['name']+"_"+k+".fastq" )
            else:
                file_names[gid][k] = group['name']+"_"+k
                ex.add(f,description=set_file_descr(file_names[gid][k]+".fastq",
                                                    groupId=gid,step="final",
                                                    type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,description=set_file_descr(group['name']+"_unfiltered_fastq.tgz",
                                                      groupId=gid,step="exonerate",
                                                      type="tar"))

        # Prepare report per group of runs
        report_ok,reportFile = prepareReport(ex,group['name'],
                                             tot_counts, counts_primers,counts_primers_filtered,
                                             tot_ambiguous, tot_discarded)
        ex.add(reportFile,description = set_file_descr(
                group['name']+"_report_demultiplexing.txt",
                groupId=gid,step="final",type="txt",view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex,reportFile,reportFile_pdf,script_path)
            ex.add(reportFile_pdf,description=set_file_descr(
                    group['name']+"_report_demultiplexing.pdf",
                    groupId=gid,step="final",type="pdf"))
        else:
            logfile.write("*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n");logfile.flush()
    add_pickle( ex, file_names,
                set_file_descr('file_names',step="final",type='py',view='admin') )
    return resFiles
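Note: the per-run splitting above treats a FASTQ record as four lines, so n/4 is the read count, and any file longer than 10M lines is aligned in 8M-line pieces. A minimal, self-contained sketch of that policy (the helper name plan_chunks is hypothetical, not part of the workflow):

def plan_chunks(line_count, max_lines=10000000, chunk_lines=8000000):
    """Return the line counts of the pieces a FASTQ file would be split into."""
    if line_count <= max_lines:
        return [line_count]          # small enough: align the file whole
    full, rest = divmod(line_count, chunk_lines)
    return [chunk_lines] * full + ([rest] if rest else [])

assert plan_chunks(4000000) == [4000000]
assert plan_chunks(20000000) == [8000000, 8000000, 4000000]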
Example #2
0
def demultiplex_workflow(ex,
                         job,
                         gl,
                         file_path="../",
                         via='lsf',
                         logfile=sys.stdout,
                         debugfile=sys.stderr):
    script_path = gl['script_path']
    file_names = {}
    job_groups = job.groups
    resFiles = {}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}

        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = group.get("primersfile",
                                os.path.join(file_path, primersFilename))
        ex.add(primersFile,
               description=set_file_descr(primersFilename,
                                          groupId=gid,
                                          step="init",
                                          type="fa"))

        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = group.get("paramsfile",
                               os.path.join(file_path, paramsFilename))
        ex.add(paramsFile,
               description=set_file_descr(paramsFilename,
                                          groupId=gid,
                                          step="init",
                                          type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid, run in group['runs'].iteritems():
            infiles.append(run)
            n = count_lines(ex, run)
            tot_counts += n / 4
            if n > 10000000:
                allSubFiles.extend(split_file(ex, run, n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous,
         tot_discarded) = parallel_exonerate(ex,
                                             allSubFiles,
                                             primersFile, (gid, group['name']),
                                             via=via,
                                             **params)

        # gzip the concatenated runs and register that file (not the stale
        # loop variable `run`, which only names the last run)
        catfile = cat(infiles)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(group['name'] + "_full_fastq.gz",
                                          groupId=gid,
                                          step='exonerate',
                                          view='debug',
                                          type="fastq"))
        logfile.write("Will get sequences to filter\n")
        logfile.flush()
        seqToFilter = getSeqToFilter(ex, primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex,
                                  resExonerate,
                                  seqToFilter,
                                  gid,
                                  group['name'],
                                  via=via)

        logfile.write("After filterSeq, filteredFastq=%s\n" % filteredFastq)
        logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        global bcDelimiter
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k, f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex, f) / 4
            if k in filteredFastq:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2 + "_filtered"
                ex.add(filteredFastq[k],
                       description=set_file_descr(file_names[gid][k2] +
                                                  ".fastq",
                                                  groupId=gid,
                                                  step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = count_lines(ex,
                                                         filteredFastq[k]) / 4
                tgz.add(f,
                        arcname=group['name'] + "_" +
                        k.replace(bcDelimiter, "_") + ".fastq")
            else:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2
                ex.add(f,
                       description=set_file_descr(file_names[gid][k2] +
                                                  ".fastq",
                                                  groupId=gid,
                                                  step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,
                   description=set_file_descr(group['name'] +
                                              "_unfiltered_fastq.tgz",
                                              groupId=gid,
                                              step="exonerate",
                                              type="tar"))

        # Prepare report per group of runs
        report_ok, reportFile = prepareReport(ex, group['name'], tot_counts,
                                              counts_primers,
                                              counts_primers_filtered,
                                              tot_ambiguous, tot_discarded)
        ex.add(reportFile,
               description=set_file_descr(group['name'] +
                                          "_report_demultiplexing.txt",
                                          groupId=gid,
                                          step="final",
                                          type="txt",
                                          view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex, reportFile, reportFile_pdf, script_path)
            ex.add(reportFile_pdf,
                   description=set_file_descr(group['name'] +
                                              "_report_demultiplexing.pdf",
                                              groupId=gid,
                                              step="final",
                                              type="pdf"))
        else:
            logfile.write(
                "*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n"
            )
            logfile.flush()
    add_pickle(
        ex, file_names,
        set_file_descr('file_names', step="final", type='py', view='admin'))
    return resFiles
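Note: this second variant normalizes barcode keys before they reach file names: any occurrence of the module-level bcDelimiter inside a primer/barcode key is rewritten to "_". A short sketch of that convention, assuming a "|" delimiter (the real value comes from the enclosing module, and output_basename is a hypothetical name):

bcDelimiter = "|"   # assumption; the workflow reads this from its module

def output_basename(group_name, key, filtered):
    k2 = key.replace(bcDelimiter, "_")   # same rewrite as in the loop above
    return group_name + "_" + k2 + ("_filtered" if filtered else "")

assert output_basename("grpA", "primer1|AACCGG", True) == "grpA_primer1_AACCGG_filtered"
assert output_basename("grpA", "primer2", False) == "grpA_primer2"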
Example #3
0
def parallel_exonerate(ex, subfiles, dbFile, grp_descr,
                       minScore=77, n=1, x=22, l=30, q=True, via="local"):
    futures = [fastqToFasta.nonblocking(ex,sf,n=n,x=x,via=via) for sf in subfiles]

    futures2 = []
    res = []
    resExonerate = []
    faSubFiles = []
    all_ambiguous = []
    all_ambiguous_fastq = []
    all_unaligned = []
    all_discarded = []
    gid, grp_name = grp_descr
    my_minscore = _get_minscore(dbFile)
    for sf in futures:
        subResFile = unique_filename_in()
        faSubFiles.append(sf.wait())
        futures2.append(exonerate.nonblocking(ex, faSubFiles[-1], dbFile, minScore=my_minscore,
                                              via=via, stdout=subResFile, memory=6))
        resExonerate.append(subResFile)
    for nf,f in enumerate(resExonerate):
        futures2[nf].wait()
        (resSplitExonerate,alignments) = split_exonerate(f,minScore,l=l,n=n)
        all_unaligned.append(alignments["unaligned"])
        all_ambiguous.append(alignments["ambiguous"])
        all_ambiguous_fastq.append(alignments["ambiguous_fastq"])
        all_discarded.append(alignments["discarded"])
        res.append(resSplitExonerate)

    gzipfile(ex,cat(all_unaligned[1:],out=all_unaligned[0]))
    ex.add(all_unaligned[0]+".gz",
           description=set_file_descr(grp_name+"_unaligned.txt.gz",
                                      groupId=gid,step="exonerate",type="txt",
                                      view="admin", comment="scores between %i and %i"%(my_minscore,minScore)) )

    # add ambiguous file only if it is not empty
    n = count_lines(ex,all_ambiguous[0])
    if n > 1:
        gzipfile(ex,cat(all_ambiguous[1:],out=all_ambiguous[0]))
        ex.add(all_ambiguous[0]+".gz",
               description=set_file_descr(grp_name+"_ambiguous.txt.gz",
                                      groupId=gid,step="exonerate",type="txt",
                                      view="admin", comment="multiple equally good classifications") )

    # add ambiguous fastq file only if it is not empty
    tot_ambiguous = count_lines(ex,all_ambiguous_fastq[0])/4
    if tot_ambiguous > 1:
        gzipfile(ex,cat(all_ambiguous_fastq[1:],out=all_ambiguous_fastq[0]))
        ex.add(all_ambiguous_fastq[0]+".gz",
               description=set_file_descr(grp_name+"_ambiguous.fastq.gz",
                                      groupId=gid,step="exonerate",type="fastq", comment="multiple equally good classifications") )

    # add discarded file only if it is not empty
    tot_discarded = count_lines(ex,all_discarded[0])/4
    if tot_discarded > 1:
        gzipfile(ex,cat(all_discarded[1:],out=all_discarded[0]))
        ex.add(all_discarded[0]+".gz",
               description=set_file_descr(grp_name+"_discarded.fastq.gz",
                                          groupId=gid, step="exonerate", type="fastq",
                                          view="admin", comment="< %i bps" %l ) )
    gzipfile(ex,faSubFiles[0])
    ex.add(faSubFiles[0]+".gz",
           description=set_file_descr(grp_name+"_input_part.fa.gz",
                                      groupId=gid, step="init", type="fa",
                                      view="admin", comment="part") )
    gzipfile(ex,resExonerate[0])
    ex.add(resExonerate[0]+".gz",
           description=set_file_descr(grp_name+"_exonerate_part.txt.gz",
                                      groupId=gid, step="exonerate", type="txt",
                                      view="admin", comment="part") )

    resFiles = dict((k,'') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:],out=v[0])

    return (resFiles, tot_ambiguous, tot_discarded)
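Note: parallel_exonerate runs exonerate with a permissive threshold derived from the primer database (my_minscore) and only afterwards applies the caller's stricter minScore inside split_exonerate; hits scoring between the two thresholds are what lands in the "_unaligned" admin report ("scores between %i and %i"). A sketch of that two-tier classification as I read it (classify_hit is a hypothetical helper, not part of the module):

def classify_hit(score, my_minscore, min_score):
    if score < my_minscore:
        return "dropped"      # below exonerate's own cutoff: never reported
    elif score < min_score:
        return "unaligned"    # kept only in the admin report
    return "accepted"         # demultiplexed into its primer's output

assert classify_hit(80, my_minscore=50, min_score=77) == "accepted"
assert classify_hit(60, my_minscore=50, min_score=77) == "unaligned"
assert classify_hit(40, my_minscore=50, min_score=77) == "dropped"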
Example #4
0
def parallel_exonerate(ex,
                       subfiles,
                       dbFile,
                       grp_descr,
                       minScore=77,
                       n=1,
                       x=22,
                       l=30,
                       trim=True,
                       via="local"):
    futures = [
        fastqToFasta.nonblocking(ex, sf, n=n, x=x, via=via) for sf in subfiles
    ]

    futures2 = []
    res = []
    resExonerate = []
    faSubFiles = []
    all_ambiguous = []
    all_ambiguous_fastq = []
    all_unaligned = []
    all_discarded = []
    gid, grp_name = grp_descr
    my_minscore = _get_minscore(dbFile)
    primersDict = get_primersList(dbFile)

    for sf in futures:
        subResFile = unique_filename_in()
        faSubFiles.append(sf.wait())
        futures2.append(
            exonerate.nonblocking(ex,
                                  faSubFiles[-1],
                                  dbFile,
                                  minScore=my_minscore,
                                  via=via,
                                  stdout=subResFile,
                                  memory=6))
        resExonerate.append(subResFile)
    for nf, f in enumerate(resExonerate):
        futures2[nf].wait()
        (resSplitExonerate, alignments) = split_exonerate(f,
                                                          primersDict,
                                                          minScore,
                                                          l=l,
                                                          n=n,
                                                          trim=trim)
        all_unaligned.append(alignments["unaligned"])
        all_ambiguous.append(alignments["ambiguous"])
        all_ambiguous_fastq.append(alignments["ambiguous_fastq"])
        all_discarded.append(alignments["discarded"])
        res.append(resSplitExonerate)

    # add unaligned file only if it is not empty
    n = count_lines(ex, all_unaligned[0])
    if n > 1:
        catfile = cat(all_unaligned)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_unaligned.txt.gz",
                                          groupId=gid,
                                          step="exonerate",
                                          type="txt",
                                          view="admin",
                                          comment="scores between %i and %i" %
                                          (my_minscore, minScore)))

    # add ambiguous file only if it is not empty
    n = count_lines(ex, all_ambiguous[0])
    if n > 1:
        catfile = cat(all_ambiguous)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(
                   grp_name + "_ambiguous.txt.gz",
                   groupId=gid,
                   step="exonerate",
                   type="txt",
                   view="admin",
                   comment="multiple equally good classifications"))

    # add ambiguous fastq file only if it is not empty
    tot_ambiguous = count_lines(ex, all_ambiguous_fastq[0]) / 4
    if tot_ambiguous > 1:
        catfile = cat(all_ambiguous_fastq)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(
                   grp_name + "_ambiguous.fastq.gz",
                   groupId=gid,
                   step="exonerate",
                   type="fastq",
                   comment="multiple equally good classifications"))

    # add discarded file only if it is not empty
    tot_discarded = count_lines(ex, all_discarded[0]) / 4
    if tot_discarded > 1:
        catfile = cat(all_discarded)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_discarded.fastq.gz",
                                          groupId=gid,
                                          step="exonerate",
                                          type="fastq",
                                          view="admin",
                                          comment="remaining seq too short"))

    ## add part input fasta file only if it is not empty
    n = count_lines(ex, faSubFiles[0])
    if n > 1:
        gzipfile(ex, faSubFiles[0])
        ex.add(faSubFiles[0] + ".gz",
               description=set_file_descr(grp_name + "_input_part.fa.gz",
                                          groupId=gid,
                                          step="init",
                                          type="fa",
                                          view="admin",
                                          comment="part"))

    ## add part res exonerate only if it is not empty
    n = count_lines(ex, resExonerate[0])
    if n > 1:
        gzipfile(ex, resExonerate[0])
        ex.add(resExonerate[0] + ".gz",
               description=set_file_descr(grp_name + "_exonerate_part.txt.gz",
                                          groupId=gid,
                                          step="exonerate",
                                          type="txt",
                                          view="admin",
                                          comment="part"))

    resFiles = dict((k, '') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:], out=v[0])

    return (resFiles, tot_ambiguous, tot_discarded)
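Note: the final loop in both variants merges the per-chunk result dicts key by key, concatenating every chunk's partial output into the first one (cat(v[1:], out=v[0])). A self-contained sketch of the same merge with lists standing in for files (merge_parts is a hypothetical name):

def merge_parts(res):
    merged = {}
    for key in set(k for d in res for k in d):
        parts = [d[key] for d in res if key in d]
        out = parts[0]
        for extra in parts[1:]:
            out.extend(extra)        # analogous to cat(v[1:], out=v[0])
        merged[key] = out
    return merged

res = [{"p1": ["a"], "p2": ["b"]}, {"p1": ["c"]}]
assert merge_parts(res) == {"p1": ["a", "c"], "p2": ["b"]}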