Exemplo n.º 1
0
def CountReads(infile, params):
    """Build one tab-separated row of read counts for a sequencing file.

    The reads of *infile* are always counted.  The rRNA-filtered and
    host-filtered counterparts are counted only when the matching filter is
    switched on in *params*; otherwise their column holds the text ``NA``.

    Args:
        infile: path handed to ``PipelineMetaAssemblyKit.SequencingData``.
        params: mapping providing ``"General_rrna_filter"`` and
            ``"General_host_filter"``.  A filter is treated as enabled only
            when its value equals the string ``"true"``.

    Returns:
        A newline-terminated string with four tab-separated fields: the
        clean name of the input, the original read count, the rRNA-filtered
        read count and the host-filtered read count.
    """
    original = PipelineMetaAssemblyKit.SequencingData(infile)
    original.readCount()
    workdir = os.getcwd()

    # Default to "NA" rather than using False as a sentinel: comparing a
    # SequencingData object with ``== False`` relies on that class's equality
    # behaviour and mixes two unrelated types in one variable.
    rcount = "NA"
    gcount = "NA"

    if params["General_rrna_filter"] == "true":
        rrna = PipelineMetaAssemblyKit.SequencingData(
            "{}/rrna_filter_out.dir/{}/other_{}".format(
                workdir, original.cleanname, original.filename))
        rrna.readCount()
        rcount = rrna.readcount

    if params["General_host_filter"] == "true":
        genome = PipelineMetaAssemblyKit.SequencingData(
            "{}/genome_filter_out.dir/{}/hostfiltered_{}".format(
                workdir, original.cleanname, original.filename))
        genome.readCount()
        gcount = genome.readcount

    return "{}\t{}\t{}\t{}\n".format(original.cleanname, original.readcount,
                                     rcount, gcount)
Exemplo n.º 2
0
def pooledName(infiles, PARAMS):
    """Return a SequencingData object for the pooled output file.

    The name is ``pooled.dir/<General_output_prefix>.<fileformat>``, where
    the file format (and the pairing/compression flags) are taken from the
    first entry of *infiles*.  ``.1`` is appended for paired, non-interleaved
    input and ``.gz`` for compressed input.
    """
    first = PipelineMetaAssemblyKit.SequencingData(infiles[0])
    pieces = [
        "pooled.dir/",
        PARAMS["General_output_prefix"],
        ".",
        first.fileformat,
    ]
    # separate mate files: the returned object points at the first mate
    if first.paired == True and first.interleaved == False:
        pieces.append(".1")
    if first.compressed == True:
        pieces.append(".gz")
    return PipelineMetaAssemblyKit.SequencingData("".join(pieces))
Exemplo n.º 3
0
def runIdbaud(infile,outfile):
    """Build the Idbaud assembler command for *infile* and run it.

    Output goes to ``idbaud_out.dir``; *outfile* is not referenced directly
    in this function.

    NOTE(review): ``job_memory``, ``job_threads`` and ``statement`` are never
    passed on explicitly, so ``P.run()`` presumably reads them from this
    function's local scope. Keep those names unchanged; confirm against the
    pipeline library.
    """
    # resources requested for the job
    job_memory = "{}G".format(PARAMS["IDBAUD_clus_memory"])
    job_threads = PARAMS["IDBAUD_clus_threads"]
    # describe the input and let the assembler wrapper build the command
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    assembler = PipelineMetaAssemblyKit.Idbaud(
        seqdat,
        "idbaud_out.dir",
        PARAMS,
    )
    statement = assembler.build()
    P.run()
Exemplo n.º 4
0
def idbaudInterleave(infile,outfile):
    """Create the per-sample Idbaud directory and run the interleave step.

    A directory ``idbaud_out.dir/<cleanname>`` is created under the current
    working directory if it is not there yet.  The command is then obtained
    from ``PipelineMetaAssemblyKit.IdbaudInterleave`` and run only when that
    call returns something other than ``None``.

    NOTE(review): ``statement`` is never passed on explicitly, so ``P.run()``
    presumably reads it from this function's local scope. Keep the name
    unchanged; confirm against the pipeline library.
    """
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    sample_dir = "{}/idbaud_out.dir/{}".format(os.getcwd(), seqdat.cleanname)
    # Try to create and tolerate "already there" instead of checking first:
    # a check-then-create sequence can fail when another task creates the
    # path in between.
    try:
        os.mkdir(sample_dir)
    except FileExistsError:
        pass
    statement = PipelineMetaAssemblyKit.IdbaudInterleave(seqdat,os.getcwd(),outfile)
    if statement is not None:
        P.run()
Exemplo n.º 5
0
def detectOrfs(infile, outfile):
    """Run Prodigal ORF detection on a single/merged read file.

    Paired input is rejected with a printed message and nothing is run.
    FASTQ input is first converted with ``reformat.sh`` to a temporary FASTA
    in ``orfs.dir``, which is removed again after the Prodigal call.

    NOTE(review): ``job_memory``, ``job_threads`` and ``statement`` are never
    passed on explicitly, so ``P.run()`` presumably reads them from this
    function's local scope. Keep those names unchanged; confirm against the
    pipeline library.
    """
    # resources requested for the job
    job_memory = "{}G".format(PARAMS["Prodigal_memory"])
    job_threads = PARAMS["Prodigal_threads"]

    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    statementlist = []

    # guard: paired data cannot be processed
    if seqdat.paired == True:
        print(
            "Prodigal requires single/merged (i.e. not paired-end) reads for ORF detection."
        )
        return

    needs_conversion = seqdat.fileformat == "fastq"
    if needs_conversion:
        # convert to a temporary FASTA and feed that to Prodigal instead
        temp_fasta = "orfs.dir/" + seqdat.cleanname + ".fa"
        statementlist.append(
            "reformat.sh in={} out={}".format(infile, temp_fasta))
        infile = temp_fasta

    # the Prodigal call itself
    statementlist.append(
        PipelineMetaAnnotate.runProdigal(infile, outfile, PARAMS))

    if needs_conversion:
        # drop the temporary FASTA once Prodigal is done
        statementlist.append("rm {}".format(temp_fasta))

    statement = " && ".join(statementlist)
    P.run()
Exemplo n.º 6
0
def runMegahit(infile, outfile):
    """Build the Megahit assembler command for *infile* and run it.

    Output goes to ``megahit_out.dir``; *outfile* is not referenced directly
    in this function.

    NOTE(review): ``job_memory``, ``job_threads``, ``statement`` and
    ``to_cluster`` are never passed on explicitly, so ``P.run()`` presumably
    reads them from this function's local scope. Keep those names unchanged;
    confirm against the pipeline library.
    """
    # resources requested for the job
    job_memory = "{}G".format(PARAMS["Megahit_clus_memory"])
    job_threads = PARAMS["Megahit_clus_threads"]
    # describe the input and let the assembler wrapper build the command
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    assembler = PipelineMetaAssemblyKit.Megahit(
        seqdat,
        "megahit_out.dir",
        PARAMS,
    )
    statement = assembler.build()
    to_cluster = True
    P.run()
Exemplo n.º 7
0
def checkFile(infile, outfile):
    """Write a short tab-separated description of *infile* to *outfile*.

    The report lists the file name, format, compression flag, paired flag
    and interleaved flag of the input, followed by its read count.

    Args:
        infile: path handed to ``PipelineMetaAssemblyKit.SequencingData``.
        outfile: path of the report; it is opened for writing, so existing
            content is replaced.
    """
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    # ``with`` closes the report even if counting the reads or a write fails;
    # previously the handle stayed open on any exception.
    with open(outfile, 'w') as outf:
        outf.write("name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".format(
            seqdat.filename, seqdat.fileformat, seqdat.compressed,
            seqdat.paired, seqdat.interleaved))
        seqdat.readCount()
        outf.write("read_count\t{}\n".format(seqdat.readcount))
Exemplo n.º 8
0
def runMetaspades(infile,outfile):
    """Run the Metaspades assembler on paired-end data.

    The requested job memory is the configured ``Metaspades_memory`` divided
    by ``Metaspades_threads``, rounded up, in gigabytes.  Input that is not
    paired is not assembled; a message is printed instead.

    NOTE(review): ``job_memory``, ``job_threads`` and ``statement`` are never
    passed on explicitly, so ``P.run()`` presumably reads them from this
    function's local scope. Keep those names unchanged; confirm against the
    pipeline library.
    """
    total_memory = int(PARAMS["Metaspades_memory"])
    thread_count = int(PARAMS["Metaspades_threads"])
    job_memory = "{}G".format(int(math.ceil(total_memory / thread_count)))
    job_threads = PARAMS["Metaspades_threads"]

    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    if seqdat.paired == True:
        assembler = PipelineMetaAssemblyKit.Metaspades(
            seqdat,
            "metaspades_out.dir",
            PARAMS,
        )
        statement = assembler.build()
        P.run()
        return

    # anything that is not paired-end is only reported, never run
    print("cannot run metaspades on file {} as it requires paired-end data".format(seqdat.filename))
Exemplo n.º 9
0
def filterMapping(infile, outfile):
    """Build and run the command that filters reads based on a BAM file.

    The name of the original sequencing file is recovered from *infile* by
    removing the ``.mapped.bam`` suffix and any directory part; that name is
    used to pull pairedness, file format and compression.

    Args:
        infile: path of the mapped BAM file.
        outfile: output path passed on to ``PipelineMetaFilter.FilterFromBam``.

    NOTE(review): ``statement`` is never passed on explicitly, so ``P.run()``
    presumably reads it from this function's local scope. Keep the name
    unchanged; confirm against the pipeline library.
    """
    # Remove the suffix explicitly.  ``str.strip(".mapped.bam")`` treats its
    # argument as a set of characters and removes them from both ends, so a
    # name such as "x.fasta.mapped.bam" was turned into "x.fast".
    bam_suffix = ".mapped.bam"
    if infile.endswith(bam_suffix):
        source_name = infile[:-len(bam_suffix)]
    else:
        source_name = infile
    seqdat = PipelineMetaAssemblyKit.SequencingData(
        os.path.basename(source_name))
    filterer = PipelineMetaFilter.FilterFromBam(infile, outfile, seqdat,
                                                PARAMS)
    statementlist = [filterer.build()]
    statement = " && ".join(statementlist)
    P.run()
Exemplo n.º 10
0
def poolReads(infiles,outfile):
    """Pool all input read files into one output and record it in a log.

    The file type is taken from the first entry of *infiles*; the pooled
    output is named ``pooled.dir/<General_output_prefix>.<fileformat>``.
    A line is appended to ``pooled.dir/pool.log`` after pooling.  *outfile*
    is not referenced directly in this function.

    NOTE(review): ``statement`` is never passed on explicitly, so ``P.run()``
    presumably reads it from this function's local scope. Keep the name
    unchanged; confirm against the pipeline library.
    """
    # the first file decides the type of the pooled output
    ftype = PipelineMetaAssemblyKit.SequencingData(infiles[0])
    outname = "".join([
        "pooled.dir/",
        PARAMS["General_output_prefix"],
        ".",
        ftype.fileformat,
    ])
    statementlist = [
        # pool the reads
        PipelinePoolReads.poolReads(ftype,infiles,outname),
        # write the log entry so the job is not rerun
        'echo "Pooled {} files to {}" >> pooled.dir/pool.log'.format(len(infiles),outname),
    ]
    statement = " && ".join(statementlist)
    P.run()
Exemplo n.º 11
0
def mapBowtie2(infile, outfile):
    """Map reads with Bowtie2 and leave the result as a BAM file.

    The chained command cleans the read names, creates the output directory,
    runs Bowtie2, converts the SAM file (the *outfile* name with ``.bam``
    replaced by ``.sam``) to BAM with samtools and removes the SAM file.

    NOTE(review): ``job_memory``, ``job_threads`` and ``statement`` are never
    passed on explicitly, so ``P.run()`` presumably reads them from this
    function's local scope. Keep those names unchanged; confirm against the
    pipeline library.
    """
    job_threads = PARAMS["Bowtie_threads"]
    job_memory = "{}G".format(PARAMS["Bowtie_memory"])
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    bowtie = PipelineMetaFilter.Bowtie2(seqdat, outfile, PARAMS)
    sam_path = outfile.replace(".bam", ".sam")
    statementlist = [
        # strip comments from read names (trimming can add comments, which
        # leaves mates with non-matching names)
        bowtie.cleanNames(),
        # directory for output
        "mkdir -p {}".format(os.path.dirname(outfile)),
        # call to bowtie
        bowtie.build(),
        # convert sam to bam
        "samtools view -bS {} > {}".format(sam_path, outfile),
        # remove the sam file
        "rm {}".format(sam_path),
    ]
    statement = " && ".join(statementlist)
    P.run()
Exemplo n.º 12
0
def poolReads(ftype,infiles,outname):
    """Return the shell command that concatenates *infiles* into *outname*.

    For paired, non-interleaved data two outputs are produced,
    ``<outname>.1`` and ``<outname>.2``; the second receives the
    ``pairedname`` of each input.  Otherwise everything goes to *outname*.
    The outputs are created with ``touch``, filled with ``zcat -f`` and, when
    ``ftype.compressed`` is set, gzipped at the end.  All steps are joined
    with ``&&``.
    """
    split_pairs = ftype.paired == True and ftype.interleaved == False

    # create the (empty) targets first
    if split_pairs:
        mate1 = outname + ".1"
        mate2 = outname + ".2"
        commands = ["touch {} && touch {}".format(mate1, mate2)]
    else:
        commands = ["touch {}".format(outname)]

    # append every input to the target(s)
    for path in infiles:
        curfile = PipelineMetaAssemblyKit.SequencingData(path)
        if split_pairs:
            commands.append("zcat -f {} >> {} && zcat -f {} >> {}".format(
                path, mate1, curfile.pairedname, mate2))
        else:
            commands.append("zcat -f {} >> {}".format(path, outname))

    # compressed in means compressed out
    if ftype.compressed == True:
        if split_pairs:
            commands.append("gzip {} && gzip {}".format(mate1, mate2))
        else:
            commands.append("gzip {}".format(outname))

    return " && ".join(commands)
Exemplo n.º 13
0
def runSortMeRNA(infile, outfile):
    """Run SortMeRNA on *infile*, or link the input through when disabled.

    When ``General_rrna_filter`` equals ``"true"`` the SortMeRNA command is
    built and the job memory/threads are taken from the ``SortMeRNA_*``
    settings (memory falls back to ``1G`` when set to ``"false"``).
    Otherwise ``ref_index.dir`` is removed, the per-sample output directory
    is created and the input (plus its mate for paired, non-interleaved
    data) is symlinked into place; no job memory or threads are set in that
    branch.

    NOTE(review): ``job_memory``, ``job_threads`` and ``statement`` are never
    passed on explicitly, so ``P.run()`` presumably reads them from this
    function's local scope. Keep those names unchanged; confirm against the
    pipeline library.
    """
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    if PARAMS["General_rrna_filter"] == "true":
        sortmerna = PipelineMetaFilter.SortMeRNA(seqdat, outfile, PARAMS)
        job_memory = ("{}G".format(PARAMS["SortMeRNA_memory"])
                      if PARAMS["SortMeRNA_memory"] != "false" else "1G")
        job_threads = PARAMS["SortMeRNA_threads"]
        statement = sortmerna.build()
    else:
        # filtering skipped: make the directory and symlink the files instead
        statementlist = [
            "rm -r ref_index.dir",
            'mkdir -p rrna_filter_out.dir/{}'.format(seqdat.cleanname),
            'ln -s {} {}'.format(os.getcwd() + "/" + infile, outfile),
        ]
        if seqdat.paired == True and seqdat.interleaved == False:
            # separate mate file gets its own link
            statementlist.append(
                'ln -s {} rrna_filter_out.dir/{}/other_{}'.format(
                    os.getcwd() + "/" + seqdat.pairedname,
                    seqdat.cleanname,
                    seqdat.pairedname))
        statement = " && ".join(statementlist)
    P.run()
Exemplo n.º 14
0
def cleanUp(infile, outfile):
    """Obtain the clean-up command for *infile* and run it.

    The command comes from ``PipelineMetaFilter.CleanUp``, which receives the
    ``SequencingData`` description of *infile*, *outfile* and ``PARAMS``.
    """
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    # NOTE(review): ``statement`` is not passed on explicitly; P.run()
    # presumably reads it from this function's local scope -- confirm
    # against the pipeline library before renaming it.
    statement = PipelineMetaFilter.CleanUp(seqdat, outfile, PARAMS)
    P.run()