Exemplo n.º 1
0
def runMegahit(infile, outfile):
    """Build the Megahit assembly command for one input and run it.

    Args:
        infile: Path of the sequencing file, wrapped in
            ``PipelineAssembly.SequencingData``.
        outfile: Task target; not referenced in this function body.
    """
    # job_memory / job_threads are not referenced again here; presumably
    # P.run reads them from the caller's locals -- confirm before renaming.
    job_memory = "{}G".format(PARAMS["Megahit_clus_memory"])
    job_threads = int(PARAMS["Megahit_clus_threads"])
    statement = PipelineAssembly.Megahit(
        PipelineAssembly.SequencingData(infile), "megahit_out.dir",
        PARAMS).build()
    P.run(statement)
Exemplo n.º 2
0
def runIdbaud(infile, outfile):
    """Build the IDBA-UD assembly command for one input and run it.

    Args:
        infile: Path of the sequencing file, wrapped in
            ``PipelineAssembly.SequencingData``.
        outfile: Task target; not referenced in this function body.
    """
    # job_memory / job_threads are not referenced again here; presumably
    # P.run reads them from the caller's locals -- confirm before renaming.
    job_memory = "{}G".format(PARAMS["IDBAUD_clus_memory"])
    job_threads = int(PARAMS["IDBAUD_clus_threads"])
    statement = PipelineAssembly.Idbaud(
        PipelineAssembly.SequencingData(infile), "idbaud_out.dir",
        PARAMS).build()
    P.run(statement)
Exemplo n.º 3
0
def idbaudInterleave(infile, outfile):
    """Prepare the per-sample IDBA-UD directory and run the interleave step.

    Ensures ``idbaud_out.dir/<cleanname>`` exists under the current working
    directory, asks ``PipelineAssembly.IdbaudInterleave`` for a statement and
    runs it through ``P.run`` unless that call returned ``None``.

    Args:
        infile: Path of the sequencing file, wrapped in
            ``PipelineAssembly.SequencingData``.
        outfile: Target path handed on to ``PipelineAssembly.IdbaudInterleave``.
    """
    seqdat = PipelineAssembly.SequencingData(infile)
    sample_dir = os.getcwd() + "/idbaud_out.dir/{}".format(seqdat.cleanname)
    # makedirs(exist_ok=True) replaces the racy exists()/mkdir() pair and
    # also creates idbaud_out.dir itself when it is missing.
    os.makedirs(sample_dir, exist_ok=True)
    statement = PipelineAssembly.IdbaudInterleave(seqdat, os.getcwd(), outfile)
    # presumably None means there is nothing to interleave -- confirm
    if statement is not None:
        P.run(statement)
Exemplo n.º 4
0
def runIdbaud(infile, outfile):
    """Run IDBA-UD for one input and write a completion log to ``outfile``.

    The assembly command is followed by an ``echo`` that writes
    ``Made file <outfile without "_complete.log">.`` into ``outfile``.

    Args:
        infile: Path of the sequencing file, wrapped in
            ``PipelineAssembly.SequencingData``.
        outfile: Path of the completion log written by the statement.
    """
    # job_memory / job_threads / downstream are not referenced again here;
    # presumably P.run reads them from the caller's locals -- confirm.
    job_memory = "{}G".format(PARAMS["IDBAUD_clus_memory"])
    job_threads = int(PARAMS["IDBAUD_clus_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    assembly_cmd = PipelineAssembly.Idbaud(seqdat, "idbaud_out.dir",
                                           PARAMS).build()
    log_cmd = ' && echo "Made file {}." > {}'.format(
        outfile.replace("_complete.log", ""), outfile)
    statement = assembly_cmd + log_cmd
    downstream = True
    P.run(statement)
Exemplo n.º 5
0
def runMetaspades(infile, outfile):
    """Run metaSPAdes on one input, skipping inputs that are not paired-end.

    For unpaired input nothing is executed; a message is printed instead.

    Args:
        infile: Path of the sequencing file, wrapped in
            ``PipelineAssembly.SequencingData``.
        outfile: Task target; not referenced in this function body.
    """
    # job_memory / job_threads are not referenced again here; presumably
    # P.run reads them from the caller's locals -- confirm before renaming.
    job_memory = str(PARAMS["Metaspades_memory"]) + "G"
    job_threads = int(PARAMS["Metaspades_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    # assumes seqdat.paired is a plain bool -- confirm
    if not seqdat.paired:
        print(
            "cannot run metaspades on file {} as it requires paired-end data".
            format(seqdat.filename))
        return
    assembler = PipelineAssembly.Metaspades(seqdat, "metaspades_out.dir",
                                            PARAMS)
    statement = assembler.build()
    P.run(statement)
Exemplo n.º 6
0
def CountReads(infile, outfile, params):
    """Return a shell statement that writes a read-count table to ``outfile``.

    The table has a header row and one data row with the sample's clean name
    followed by three tab-separated columns: input reads, reads after rRNA
    filtering and reads after host-genome filtering. A filtering column is
    ``NA`` unless the matching ``params`` switch equals ``"true"``.

    Args:
        infile: Path of the original sequencing file.
        outfile: Path of the table; created by the first command and
            appended to by the rest.
        params: Mapping providing ``General_rrna_filter`` and
            ``General_host_filter``.

    Returns:
        The commands joined with ``" && "``.
    """
    def counter(seqfile, outfile):
        # Line count divided by 4, or by 2 when the file format is "fasta".
        # NOTE(review): always uses zcat, so this presumably expects
        # gzip-compressed files -- confirm.
        sdat = PipelineAssembly.SequencingData(seqfile)
        div = 2 if sdat.fileformat == "fasta" else 4
        template = (
            "zcat {} | wc -l | awk 'BEGIN {{ORS=\"\"}}; END {{x=$1/{}; print \"\\t\"x}}' >> {}"
        )
        return template.format(seqfile, div, outfile)

    original = PipelineAssembly.SequencingData(infile)
    header = (
        'printf "File\\tInput\\tPost_rRNA_Filtering\\tPost_Genome_Filtering\\n{}" > {}'
        .format(original.cleanname, outfile))
    input_count = counter(infile, outfile)
    not_available = 'printf "\\tNA" >> {}'.format(outfile)
    cwd = os.getcwd()

    if params["General_rrna_filter"] == "true":
        rrna_count = counter(
            cwd + "/rrna_filter_out.dir/" + original.cleanname + "/other_" +
            original.filename, outfile)
    else:
        rrna_count = not_available

    if params["General_host_filter"] == "true":
        host_count = counter(
            cwd + "/genome_filter_out.dir/" + original.cleanname +
            "/hostfiltered_" + original.filename, outfile)
    else:
        host_count = not_available

    end_of_row = 'printf "\\n" >> {}'.format(outfile)
    return " && ".join(
        [header, input_count, rrna_count, host_count, end_of_row])
Exemplo n.º 7
0
def fileForm():
    """Return the format extension of the first sequencing file found in ``./``.

    Entries of the current directory are filtered with ``seqpat``; the first
    match is wrapped in ``PipelineAssembly.SequencingData`` and its
    ``fileformat`` is returned, with ``".gz"`` appended when it is compressed.

    Returns:
        The extension string, e.g. the file format followed by ``.gz``.

    Raises:
        IndexError: If no entry in the current directory matches ``seqpat``.
    """
    matching = [x for x in glob.glob("./*") if re.search(seqpat, x)]
    if not matching:
        # Same exception type as the bare matching[0], but with a reason.
        raise IndexError(
            "no files in the current directory match the sequence file pattern"
        )
    seqdat = PipelineAssembly.SequencingData(matching[0])
    ext = seqdat.fileformat
    # assumes seqdat.compressed is a plain bool -- confirm
    if seqdat.compressed:
        ext += ".gz"
    return ext
Exemplo n.º 8
0
def mapSamples(infile, outfile):
    """Map one sample's reads against its contig database with Bowtie2.

    The statement cleans read names (unless skipped), creates the output
    directory, runs Bowtie2, converts the SAM output to BAM and removes the
    SAM file.

    Args:
        infile: Path of the sequencing file; its name minus the recognised
            sequence extension is passed to ``PipelineEnumerate.enumerateMapper``.
        outfile: Path of the BAM file to produce.

    Raises:
        ValueError: If ``infile`` has no recognised sequence file extension.
    """
    name_match = re.match(
        r"(\S+).(fasta$|fasta.gz|fasta.1.gz|fasta.1|fna$|fna.gz|fna.1.gz|fna.1|fa$|fa.gz|fa.1.gz|fa.1|fastq$|fastq.gz|fastq.1.gz|fastq.1)",
        infile)
    if name_match is None:
        # Previously surfaced as an AttributeError on None.group().
        raise ValueError(
            "cannot derive a sample name from {}: unrecognised sequence file extension"
            .format(infile))
    filename = name_match.group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    #get the mapping DB
    bowtiedb = "contig_databases.dir/{}.contigs.bowtie".format(
        filemap.samplename)
    # job_threads / job_memory are not referenced again here; presumably
    # P.run reads them from the caller's locals -- confirm before renaming.
    job_threads = int(PARAMS["Bowtie_threads"])
    job_memory = str(PARAMS["Bowtie_memory"]) + "G"
    seqdat = PipelineAssembly.SequencingData(infile)
    bowtie = PipelineFilter.Bowtie2(seqdat, outfile, PARAMS, bowtiedb)
    #need to reset the working directory in the bowtie function as it is running on files in one directory
    bowtie.indir = ""
    samfile = outfile.replace(".bam", ".sam")
    statementlist = []
    #remove all comments from read names in files (trimming can add comments making non-matching pairs)
    #only skip if there was a failure in a previous run at the bowtie step
    if PARAMS["Bowtie_skip_file_prep"] != "true":
        statementlist.append(bowtie.cleanNames())
    #directory for output
    statementlist.append("mkdir -p {}".format(os.path.dirname(outfile)))
    #call to bowtie
    statementlist.append(bowtie.build())
    #convert sam to bam
    statementlist.append("samtools view -bS {} > {}".format(samfile, outfile))
    #remove the sam file
    statementlist.append("rm {}".format(samfile))
    statement = " && ".join(statementlist)
    P.run(statement)
Exemplo n.º 9
0
def detectOrfs(infile, outfile):
    """Detect ORFs in one input with Prodigal and gzip the results.

    FASTQ input is first converted to a temporary FASTA in ``orfs.dir`` with
    ``reformat.sh`` and that file is removed after Prodigal has run. Paired
    input is not processed; a message is printed instead.

    Args:
        infile: Path of the sequencing file.
        outfile: Path of the gzipped output; the ``.gz`` is dropped for the
            Prodigal call and restored by the final ``gzip``. A second file,
            named by replacing ``peptides`` with ``positions``, is gzipped too.
    """
    #set job memory and threads
    # Not referenced again here; presumably P.run reads them from the
    # caller's locals -- confirm before renaming.
    job_memory = str(PARAMS["Prodigal_memory"]) + "G"
    job_threads = int(PARAMS["Prodigal_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    #generate outfile without gz
    outfile = outfile.replace(".gz", "")
    # assumes seqdat.paired is a plain bool -- confirm
    if seqdat.paired:
        print("Cannot detect ORFs from paired-end reads.")
        return

    statementlist = []
    #ensure input is FASTA
    from_fastq = seqdat.fileformat == "fastq"
    if from_fastq:
        temp_fasta = "orfs.dir/" + seqdat.cleanname + ".fa"
        statementlist.append("reformat.sh in={} out={}".format(
            infile, temp_fasta))
        infile = temp_fasta
    #generate the call to prodigal
    statementlist.append(PipelineAnnotate.runProdigal(infile, outfile, PARAMS))
    #remove the temp FASTA if created
    if from_fastq:
        statementlist.append("rm {}".format(temp_fasta))
    #compress the outputs
    statementlist.append("gzip {}".format(outfile))
    statementlist.append("gzip {}".format(
        outfile.replace("peptides", "positions")))
    statement = " && ".join(statementlist)
    P.run(statement)
Exemplo n.º 10
0
def runMetaspades(infile, outfile):
    """Run metaSPAdes on paired-end input and write a completion log.

    The assembly command is followed by an ``echo`` that writes
    ``Made file <outfile without "_complete.log">.`` into ``outfile``. For
    unpaired input nothing is executed; a message is printed instead.

    Args:
        infile: Path of the sequencing file, wrapped in
            ``PipelineAssembly.SequencingData``.
        outfile: Path of the completion log written by the statement.
    """
    # job_memory / job_threads / downstream are not referenced again here;
    # presumably P.run reads them from the caller's locals -- confirm.
    job_memory = str(PARAMS["Metaspades_memory"]) + "G"
    job_threads = int(PARAMS["Metaspades_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    # assumes seqdat.paired is a plain bool -- confirm
    if not seqdat.paired:
        print(
            "cannot run metaspades on file {} as it requires paired-end data".
            format(seqdat.filename))
        return
    assembler = PipelineAssembly.Metaspades(seqdat, "metaspades_out.dir",
                                            PARAMS)
    statement = assembler.build()
    statement += ' && echo "Made file {}." > {}'.format(
        outfile.replace("_complete.log", ""), outfile)
    downstream = True
    P.run(statement)
Exemplo n.º 11
0
def detectOrfs(infile, outfile):
    """Detect ORFs in one input with Prodigal.

    FASTQ input is first converted to a temporary FASTA in ``orfs.dir`` with
    ``reformat.sh`` and that file is removed after Prodigal has run. Paired
    input is not processed; a message is printed instead.

    Args:
        infile: Path of the sequencing file.
        outfile: Output path passed to ``PipelineAnnotate.runProdigal``.
    """
    #set job memory and threads
    # Not referenced again here; presumably P.run reads them from the
    # caller's locals -- confirm before renaming.
    job_memory = str(PARAMS["Prodigal_memory"]) + "G"
    job_threads = int(PARAMS["Prodigal_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    # assumes seqdat.paired is a plain bool -- confirm
    if seqdat.paired:
        print(
            "Prodigal requires single/merged (i.e. not paired-end) reads for ORF detection."
        )
        return

    statementlist = []
    #ensure input is FASTA
    from_fastq = seqdat.fileformat == "fastq"
    if from_fastq:
        temp_fasta = "orfs.dir/" + seqdat.cleanname + ".fa"
        statementlist.append("reformat.sh in={} out={}".format(
            infile, temp_fasta))
        infile = temp_fasta
    #generate the call to prodigal
    statementlist.append(PipelineAnnotate.runProdigal(infile, outfile, PARAMS))
    #remove the temp FASTA if created
    if from_fastq:
        statementlist.append("rm {}".format(temp_fasta))
    statement = " && ".join(statementlist)
    P.run(statement)
Exemplo n.º 12
0
def checkFile(infile, outfile):
    """Write a short tab-separated description of ``infile`` to ``outfile``.

    The report has one ``key<TAB>value`` line each for name, format,
    compressed, paired and interleaved, taken from
    ``PipelineAssembly.SequencingData(infile)``.

    Args:
        infile: Path of the sequencing file to describe.
        outfile: Path of the report; overwritten if it exists.
    """
    seqdat = PipelineAssembly.SequencingData(infile)
    # Build the text before opening so a failing attribute lookup does not
    # leave an empty outfile behind.
    report = (
        "name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".
        format(seqdat.filename, seqdat.fileformat, seqdat.compressed,
               seqdat.paired, seqdat.interleaved))
    # The context manager closes the handle even if the write raises.
    with open(outfile, 'w') as outf:
        outf.write(report)
Exemplo n.º 13
0
 def counter(seqfile, outfile):
     """Return a shell command appending a tab plus the record count of
     ``seqfile`` to ``outfile``.

     The count is the line count divided by 4, or by 2 when the file format
     is "fasta".
     """
     # NOTE(review): always uses zcat, so this presumably expects
     # gzip-compressed files -- confirm.
     sdat = PipelineAssembly.SequencingData(seqfile)
     div = 2 if sdat.fileformat == "fasta" else 4
     template = (
         "zcat {} | wc -l | awk 'BEGIN {{ORS=\"\"}}; END {{x=$1/{}; print \"\\t\"x}}' >> {}"
     )
     return template.format(seqfile, div, outfile)
Exemplo n.º 14
0
def collateContigfiles(infile, outfile):
    """Move a contig file to its collated location and gzip it.

    The statement creates the directory of ``outfile``, moves the file
    located by ``PipelineAssembly.contigLoc(infile)`` to ``outfile`` without
    its ``.gz``, then gzips it. All paths are prefixed with the current
    working directory.

    Args:
        infile: Input passed to ``PipelineAssembly.contigLoc``.
        outfile: Relative path of the gzipped contig file to produce.
    """
    print(outfile)
    cwd_prefix = os.getcwd() + "/"
    # Everything before the last "/" (empty when there is none).
    target_dir = cwd_prefix + outfile.rpartition("/")[0]
    infile = PipelineAssembly.contigLoc(infile)
    outfile = outfile.replace(".gz", "")
    statement = "mkdir -p {0} && mv {1} {2} && gzip {2}".format(
        target_dir, cwd_prefix + infile, cwd_prefix + outfile)
    P.run(statement)
Exemplo n.º 15
0
def filterMapping(infile, outfile):
    """Filter reads based on a BAM file produced by the mapping step.

    Args:
        infile: Path of the ``.mapped.bam`` file; the name of the original
            sequencing file is recovered by removing that suffix.
        outfile: Output path passed to ``PipelineFilter.FilterFromBam``.
    """
    #use the original sequencing file to pull pairedness, file format and compression
    suffix = ".mapped.bam"
    # str.strip(chars) removes any of those characters from BOTH ends, so it
    # mangled names such as "data.fa.mapped.bam"; drop the exact suffix instead.
    if infile.endswith(suffix):
        original_name = infile[:-len(suffix)]
    else:
        original_name = infile
    seqdat = PipelineAssembly.SequencingData(os.path.basename(original_name))
    filterer = PipelineFilter.FilterFromBam(infile, outfile, seqdat, PARAMS)
    statementlist = [filterer.build()]
    statement = " && ".join(statementlist)
    P.run(statement)
Exemplo n.º 16
0
def mapBowtie2(infile, outfile):
    """Map one input against ``PARAMS["Bowtie_genome_db"]`` with Bowtie2.

    The statement cleans read names (unless skipped), creates the output
    directory, runs Bowtie2, converts the SAM output to BAM and removes the
    SAM file.

    Args:
        infile: Path of the sequencing file.
        outfile: Path of the BAM file to produce.
    """
    # job_threads / job_memory are not referenced again here; presumably
    # P.run reads them from the caller's locals -- confirm before renaming.
    job_threads = int(PARAMS["Bowtie_threads"])
    job_memory = str(PARAMS["Bowtie_memory"]) + "G"
    seqdat = PipelineAssembly.SequencingData(infile)
    bowtie = PipelineFilter.Bowtie2(seqdat, outfile, PARAMS,
                                    PARAMS["Bowtie_genome_db"])
    sam_path = outfile.replace(".bam", ".sam")
    # Trimming can add comments to read names, making pairs non-matching;
    # the clean-up can be skipped via skip_file_prep after a crashed run.
    if PARAMS["Bowtie_skip_file_prep"] == "true":
        steps = []
    else:
        steps = [bowtie.cleanNames()]
    steps += [
        # directory for output
        "mkdir -p {}".format(os.path.dirname(outfile)),
        # call to bowtie
        bowtie.build(),
        # convert sam to bam
        "samtools view -bS {} > {}".format(sam_path, outfile),
        # remove the sam file
        "rm {}".format(sam_path),
    ]
    statement = " && ".join(steps)
    P.run(statement)
Exemplo n.º 17
0
def runSortMeRNA(infile, outfile):
    """Run SortMeRNA rRNA filtering, or symlink the input when it is disabled.

    When ``PARAMS["General_rrna_filter"]`` is ``"true"`` the SortMeRNA
    statement is built and run. Otherwise the statement creates
    ``rrna_filter_out.dir/<cleanname>`` and symlinks ``infile`` to
    ``outfile``, plus the mate file for paired, non-interleaved input.

    Args:
        infile: Path of the sequencing file, relative to the working directory.
        outfile: Path of the filtered (or symlinked) output file.
    """
    seqdat = PipelineAssembly.SequencingData(infile)
    if PARAMS["General_rrna_filter"] == "true":
        sortmerna = PipelineFilter.SortMeRNA(seqdat, outfile, PARAMS)
        # job_memory / job_threads are not referenced again here; presumably
        # P.run reads them from the caller's locals -- confirm.
        if PARAMS["SortMeRNA_memory"] != "false":
            job_memory = str(PARAMS["SortMeRNA_memory"]) + "G"
        else:
            job_memory = "1G"
        job_threads = int(PARAMS["SortMeRNA_threads"])
        statement = sortmerna.build()
    else:
        #if skipping rRNA filtering symlink files and make appropriate directory
        statementlist = []
        statementlist.append('mkdir -p rrna_filter_out.dir/{}'.format(
            seqdat.cleanname))
        statementlist.append('ln -s {} {}'.format(os.getcwd() + "/" + infile,
                                                  outfile))
        # assumes paired / interleaved are plain bools -- confirm
        if seqdat.paired and not seqdat.interleaved:
            statementlist.append(
                'ln -s {} rrna_filter_out.dir/{}/other_{}'.format(
                    os.getcwd() + "/" + seqdat.pairedname, seqdat.cleanname,
                    seqdat.pairedname))
        statement = " && ".join(statementlist)
    P.run(statement)
Exemplo n.º 18
0
def callMetaphlan2(infile, outfile, params):
    """Build the ``metaphlan2.py`` command line for one input.

    Args:
        infile: Path of the sequencing file; for paired input the mate file
            name is appended after a comma and the input type is prefixed
            with ``multi``.
        outfile: Path passed to ``-o``; also the stem of the
            ``.bowtie.out`` file when bowtie output is kept.
        params: Mapping of ``Metaphlan2_*`` settings. The string ``"false"``
            switches an option off.

    Returns:
        The result of ``checkVenv(params)`` followed by the command and its
        options, joined with single spaces.
    """
    #check format and pairedness of input
    indat = PipelineAssembly.SequencingData(infile)
    inputf = infile
    inputt = indat.fileformat
    # assumes indat.paired is a plain bool -- confirm
    if indat.paired:
        inputf += ",{}".format(indat.pairedname)
        inputt = "multi" + inputt
    mcall = [checkVenv(params)]
    mcall.append(
        "metaphlan2.py {} --input_type {} -t {} --tax_lev {} -o {}".format(
            inputf, inputt, params["Metaphlan2_t"],
            params["Metaphlan2_tax_lev"], outfile))

    def add_valued(option, key):
        # Append "<option> <value>" unless the setting is "false".
        value = params[key]
        if value != "false":
            mcall.append("{} {}".format(option, value))

    def add_switch(option, key):
        # Append the bare flag unless the setting is "false".
        if params[key] != "false":
            mcall.append(option)

    add_valued("--bowtie2db", "Metaphlan2_bowtie2db")
    add_valued("--index", "Metaphlan2_index")
    add_valued("--bt2_ps", "Metaphlan2_bt2_ps")
    add_valued("--bowtie2_exe", "Metaphlan2_bowtie2_exe")
    add_valued("--bowtie2_build", "Metaphlan2_bowtie2_build")
    if params["Metaphlan2_save_bowtie"] == "false":
        mcall.append("--no_map")
    else:
        mcall.append("--bowtie2out {}".format(outfile + ".bowtie.out"))
    add_valued("--min_cu_len", "Metaphlan2_min_cu_len")
    add_valued("--min_alignment_len", "Metaphlan2_min_alignment_len")
    # --ignore_viruses used to be appended twice; it is now emitted once.
    add_switch("--ignore_viruses", "Metaphlan2_ignore_viruses")
    add_switch("--ignore_eukaryotes", "Metaphlan2_ignore_eukaryotes")
    add_switch("--ignore_bacteria", "Metaphlan2_ignore_bacteria")
    add_switch("--ignore_archaea", "Metaphlan2_ignore_archaea")
    add_valued("--stat_q", "Metaphlan2_stat_q")
    add_valued("--ignore_markers", "Metaphlan2_ignore_markers")
    add_switch("--avoid_disqm", "Metaphlan2_avoid_disqm")
    add_valued("--stat", "Metaphlan2_stat")
    # Free-form extra arguments are passed through verbatim.
    mcall.append(params["Metaphlan2_other"])
    add_valued("--tmp_dir", "Metaphlan2_tmp_dir")
    mcall.append("--sample_id {}".format(indat.cleanname))
    add_valued("--nproc", "Metaphlan2_threads")
    return " ".join(mcall)
Exemplo n.º 19
0
def summariseContigs(infile, outfile):
    """Run the statement returned by ``PipelineAssembly.SummariseContigs``.

    Args:
        infile: Input passed to ``PipelineAssembly.SummariseContigs``.
        outfile: Output passed to ``PipelineAssembly.SummariseContigs``.
    """
    #summarise each contigs file
    statement = PipelineAssembly.SummariseContigs(infile, outfile)
    P.run(statement)
Exemplo n.º 20
0
def cleanUp(infile, outfile):
    """Run the statement returned by ``PipelineFilter.CleanUp`` for one input.

    Args:
        infile: Path of the sequencing file, wrapped in
            ``PipelineAssembly.SequencingData``.
        outfile: Output passed to ``PipelineFilter.CleanUp``.
    """
    statement = PipelineFilter.CleanUp(
        PipelineAssembly.SequencingData(infile), outfile, PARAMS)
    P.run(statement)