Пример #1
0
    def __init__(self, args, confs, funcs, output_dir):
        """Register the pipeline steps and make sure the output directory exists.

        args and confs are stored unchanged on the instance.
        funcs is a sized iterable of indexable pairs: item 0 is the step
        name (it becomes part of the step directory name) and item 1 is
        stored as that step's function.
        output_dir is converted to an absolute path; if it does not exist
        it is created with ``mkdir`` through runCommand.

        After construction ``self.ready`` is True when the output directory
        already existed or was created with exit code 0, False otherwise.
        """
        self.nSteps = len(funcs)
        self.outputdir = os.path.abspath(output_dir)

        self.args = args
        self.confs = confs

        self.stepFuncs = {}
        self.order = []
        self.stepDir = {}
        self.tempDir = {}
        # Step directories are numbered from 1, in the order given by funcs.
        for position, pair in enumerate(funcs, start=1):
            step_name = pair[0]
            self.stepFuncs[step_name] = pair[1]
            self.order.append(step_name)
            step_path = (self.outputdir + "/step_" + str(position) + "-" +
                         step_name)
            print(step_path)
            self.stepDir[step_name] = step_path
            # Each step has a sibling scratch directory next to its final one.
            self.tempDir[step_name] = step_path + "-tmp"

        if os.path.exists(self.outputdir):
            self.ready = True
            return

        print("Creating " + self.outputdir)
        created = runCommand("mkdir " + self.outputdir) == 0
        if not created:
            print("No permission to create " + self.outputdir +
                  ", cannot procede.")
        self.ready = created
Пример #2
0
def get_infernal_output(output_dir, output_file):
    """Concatenate the per-part Infernal results found in output_dir.

    Files returned by getFilesWith(output_dir, ".tsv") are concatenated
    into ``output_file + ".tsv"``; files returned for ".out" (if any) are
    concatenated into ``<output_file without a trailing ".tsv">.out`` and
    have their '#' comment lines removed with erase_comments.

    Returns True when at least one ".tsv" file was found, False otherwise
    (in which case nothing is written).
    """
    tsv_paths = getFilesWith(output_dir, ".tsv")
    out_paths = getFilesWith(output_dir, ".out")
    if not tsv_paths:
        print("No results ready yet")
        return False

    # Remove the ".tsv" suffix explicitly: str.rstrip(".tsv") strips any run
    # of the characters '.', 't', 's', 'v' (e.g. "results.tsv" -> "resul").
    suffix = ".tsv"
    if output_file.endswith(suffix):
        base_name = output_file[:-len(suffix)]
    else:
        base_name = output_file

    if out_paths:
        merged_out = base_name + ".out"
        runCommand("cat " + " ".join(out_paths) + " > " + merged_out)
        erase_comments(merged_out)

    runCommand("cat " + " ".join(tsv_paths) + " > " + output_file + ".tsv")
    # NOTE(review): the merged table is written to output_file + ".tsv" but
    # comments are erased from output_file itself. One of the two paths looks
    # wrong; left unchanged until the caller's expectation is confirmed.
    erase_comments(output_file)
    return True
Пример #3
0
def run_trnascan(args, confs, tmpDir, stepDir):
    """Run tRNAscan-SE on the linked genome, writing results into tmpDir.

    The executable comes from confs["tRNAscan-SE"] and the genome path from
    args["genome_link"]. Output goes to tmpDir/trna_raw.txt (-o) and
    tmpDir/trna_stats.txt (-m). stepDir is not used.

    Returns True when runCommand reports exit code 0.
    """
    genome = args["genome_link"]
    executable = confs["tRNAscan-SE"]
    raw_results = tmpDir + "/trna_raw.txt"
    run_stats = tmpDir + "/trna_stats.txt"
    command = (executable + " -o " + raw_results + " -m " + run_stats + " " +
               genome)
    exit_code = runCommand(command)
    return exit_code == 0
Пример #4
0
def _partition_seqs_by_length(seqs, total_length, n_parts):
    """Group consecutive sequences into chunks of similar total length.

    Each item of seqs is indexable and item[1] is the sequence string.
    The target chunk length is recomputed before every sequence from the
    length not yet consumed and the number of chunks still to be produced,
    so it adapts to very long sequences. No empty chunk is ever returned.
    """
    target_length = total_length // n_parts
    groups = []
    current = []
    current_length = 0
    consumed = 0
    for seq in seqs:
        if n_parts > len(groups):
            target_length = ((total_length - consumed) //
                             (n_parts - len(groups)))
        # Only flush a chunk that holds something: with a target of 0 (total
        # length smaller than n_parts) an empty chunk would otherwise be
        # emitted before the first sequence.
        if current and current_length >= target_length:
            groups.append(current)
            current = []
            current_length = 0
        current.append(seq)
        consumed += len(seq[1])
        current_length += len(seq[1])
    if current:
        groups.append(current)
    return groups


def split_genome(args, confs, tmpDir, stepDir):
    """Split the input genome FASTA into parts under data_dir/genome_parts.

    Reads args["data_dir"], args["genome"] and args["genome_link"]; confs,
    tmpDir and stepDir are not used. A symlink to the genome is created at
    args["genome_link"], the sequences are read from it, grouped into
    chunks of similar total length (aiming at 100 chunks) and each chunk is
    written to ``<data_dir>/genome_parts/<index>.fasta`` after passing every
    sequence through shortFastaHeader.

    Returns False when args has no "genome" entry, True otherwise. The exit
    codes of the ``ln`` and ``mkdir`` commands are not checked.
    """
    output_dir = args["data_dir"] + "/genome_parts"
    if "genome" not in args:
        print("Cannot run annotation without a path to a genome.")
        return False
    fasta_path = os.path.abspath(args["genome"])
    n = 100
    #creating shortcut to genome fasta
    runCommand("ln -s " + fasta_path + " " + args["genome_link"])
    fasta_path = args["genome_link"]
    print("Reading input fasta")
    seqs = readSeqsFromFasta(fasta_path)
    total_length = sum(len(entry[1]) for entry in seqs)
    print("Total length:" + str(total_length))

    print("Spliting parts of fasta")
    parts = [[shortFastaHeader(seq) for seq in group]
             for group in _partition_seqs_by_length(seqs, total_length, n)]

    file_names = [
        output_dir + "/" + str(i) + ".fasta" for i in range(len(parts))
    ]
    runCommand("mkdir " + output_dir)
    print("Writing fasta files")
    for part, file_name in zip(parts, file_names):
        writeFastaSeqs(part, file_name)
    return True
Пример #5
0
def blast(query, db, max_evalue = 0.001, threads=8, 
    blast_type="blastn", output = "results.tsv"):
    """Run a BLAST search of query against db and write tabular results.

    The command is ``blast_type`` with -db, -query, -evalue, -num_threads,
    a 13-column "-outfmt 6" layout (ending in qcovs) and -out output.
    threads is capped at 8.

    Returns True when runCommand reports exit code 0.
    """
    threads = min(threads, 8)
    print("Blasting query to DB")
    out_format = ("'6 qaccver saccver pident length mismatch" +
                  " gapopen qstart qend sstart send evalue bitscore qcovs'")
    pieces = [blast_type]
    pieces += ["-db", db]
    pieces += ["-query", query]
    pieces += ["-evalue", str(max_evalue)]
    pieces += ["-num_threads", str(threads)]
    pieces += ["-outfmt", out_format]
    pieces += ["-out", output]
    exit_code = runCommand(" ".join(pieces))
    return exit_code == 0
Пример #6
0
    ap.add_argument(
        "-edb",
        "--extra-db",
        required=False,
        default=None,
        help=
        ("Add extra ncRNA databases for this run. Sintax: -edb db_name:db_path;db_name2:db_path2"
         ))
    return vars(ap.parse_args())


#parsing arguments
cmdArgs = getArgs()
outputdir = os.path.abspath(cmdArgs["output"])
if not os.path.exists(outputdir):
    runCommand("mkdir " + outputdir)
argsfile = outputdir + "/args.json"
args = {}
if os.path.exists(argsfile):
    with open(argsfile, "r") as input_stream:
        # Read the text exactly as stored; joining readlines() with "\n"
        # doubled every line break because each line keeps its terminator.
        content = input_stream.read()
    # SECURITY NOTE(review): eval() executes whatever is in args.json, so
    # anyone able to write to the output directory can run arbitrary code.
    # If the file only ever holds a literal dict, ast.literal_eval (or a
    # real JSON format with json.load) would be a safe replacement; confirm
    # how the file is written before switching.
    args = eval(content)

# Values given on the command line override the stored ones; None means the
# option was not supplied.
for arg, value in cmdArgs.items():
    if value is not None:
        args[arg] = value

if "best_hits" not in args:
    args["best_hits"] = "False"

#if not os.path.isfile(inputFasta):
Пример #7
0
def erase_comments(path):
    """Remove every line starting with '#' from the file at path, in place.

    The filtered text is written by grep to ``path + ".tmp"``, which is
    then moved over the original file. Both commands go through runCommand
    and their exit codes are not checked.
    """
    scratch = path + ".tmp"
    filter_cmd = "grep -v '^#' " + path + " > " + scratch
    replace_cmd = "mv " + scratch + " " + path
    for command in (filter_cmd, replace_cmd):
        runCommand(command)
Пример #8
0
def infernal(fasta, cmscan, rfam, threads):
    """Run cmscan on one FASTA part, tracking progress through file renames.

    The input is renamed to ``fasta + "_running"`` while cmscan runs. The
    table output (--tblout) goes to ``<base>.tsv`` and the standard output
    (-o) to ``<base>.out``, where <base> is fasta without a trailing
    ".fasta". Both are removed before the run. On exit code 0 the input is
    renamed to ``fasta + "_done"``; otherwise it is renamed back to fasta.

    Nothing is returned.
    """
    # Remove the ".fasta" suffix explicitly: str.rstrip(".fasta") strips any
    # run of the characters '.', 'f', 'a', 's', 't' (e.g. "data.fasta" -> "d").
    suffix = ".fasta"
    base_name = fasta[:-len(suffix)] if fasta.endswith(suffix) else fasta
    table_path = base_name + ".tsv"
    stdout_path = base_name + ".out"
    running_fasta = fasta + "_running"

    runCommand("mv " + fasta + " " + running_fasta)
    cmd = (cmscan + " -o " + stdout_path + " --tblout " + table_path +
           " -E 0.01 --acc --cpu " + str(threads) + " " + rfam + " " +
           running_fasta)
    # Drop results left over from a previous (possibly interrupted) run.
    runCommand("rm " + table_path)
    runCommand("rm " + stdout_path)
    code = runCommand(cmd)
    if code != 0:
        # Failed: restore the original name.
        runCommand("mv " + running_fasta + " " + fasta)
    else:
        runCommand("mv " + running_fasta + " " + fasta + "_done")
Пример #9
0
    def run(self, start_from="-1", stop_at="-1"):
        """Run the pipeline steps in order, stopping at the first failure.

        start_from: 1-based step number (int or numeric string) or a step
            name, resolved with ``self.get_step_order(name) + 1``. A value
            <= 0 resumes automatically after the leading run of steps whose
            result directory already exists.
        stop_at: step number (int or numeric string) of the last step to
            run, or a step name resolved with ``self.get_step_order(name)``.
            A value <= 0 runs through the last step.
            NOTE(review): a name is resolved without the ``+ 1`` used for
            start_from; if get_step_order is 0-based the named step itself
            is excluded. Confirm against get_step_order.

        Each step function is called with (self.args, self.confs, the
        step's temporary directory, self.stepDir). When it returns a truthy
        value the temporary directory replaces the step's result directory;
        otherwise the loop stops.

        Calls sys.exit when an explicit start step does not exist or its
        previous step has no result directory. Returns None.
        """
        print("Running pipeline")
        try:
            start_from = int(start_from)
        except ValueError:
            start_from = self.get_step_order(start_from) + 1

        try:
            stop_at = int(stop_at)
        except ValueError:
            stop_at = self.get_step_order(stop_at)

        if not self.ready:
            print("Not ready to start pipeline.")
            return

        startingStep = 1
        if start_from > 0:
            startingStep = start_from
            if startingStep > self.nSteps:
                sys.exit("This step does not exist")
            if (startingStep > 1
                    and not os.path.exists(self.get_dir(startingStep - 2))):
                sys.exit("The previous step to Step " + str(startingStep) +
                         " has not been done yet.")
            # Previously unreachable (it sat right after sys.exit).
            print("Starting from " + str(startingStep))
        else:
            # Resume after the leading run of finished steps. Stop at the
            # first step without results so that a gap is never skipped.
            for step_name in self.order:
                if not os.path.exists(self.stepDir[step_name]):
                    break
                print("Skipping step " + str(startingStep))
                startingStep += 1

        #running necessary steps
        limit = len(self.stepFuncs)
        if stop_at > 0:
            # Never run past the last registered step.
            limit = min(stop_at, limit)
            print("Stoping at " + str(limit))

        for i in range(startingStep - 1, limit):
            print(str(i))
            step = self.get_step_name(i)
            print("--- STEP " + str(i + 1) + ": " + step + " ---")

            #create temporary dir to store files from next step
            temp_dir = self.tempDir[step]
            if os.path.exists(temp_dir):
                runCommand("rm -Rf " + temp_dir)
            runCommand("mkdir " + temp_dir)
            print(temp_dir)
            #run step
            success = self.stepFuncs[step](self.args, self.confs, temp_dir,
                                           self.stepDir)

            if not success:
                print("Step " + str(i + 1) + " was not successful.")
                break

            #move results from temporary dir to permanent one
            final_dir = self.stepDir[step]
            if os.path.exists(final_dir):
                runCommand("rm -Rf " + final_dir)
            runCommand("mv " + temp_dir + " " + final_dir)
Пример #10
0
def blast_annotate(query, db, output_dir, max_evalue = 0.0000000001, threads=8, blast_type="blastn", 
        db_id_sep_char=" ", source="db_name", remaining_fasta="auto_name", run_blast=True, alternative_outputdir=None):
    """BLAST query against db and write the best high-identity hits as GFF.

    Steps:
      1. If ``db + ".nhr"`` does not exist, build a nucleotide BLAST
         database from db with makeblastdb.
      2. When run_blast is true, run blast_type and write tabular results
         to ``output_dir/<query>.to.<db>_results.tsv``. When it is false,
         no search is run and the table is read from alternative_outputdir
         (if given) or output_dir instead.
      3. Keep alignments with pident > 99 and coverage > 0.99, where
         coverage is alignment length / query sequence length.
      4. For every query, get_best_mapping picks the hits to keep; each is
         written as a "transcript" row to
         ``output_dir/<query>.to.<db>_found.gff`` (tab separated, no header)
         with ID ``<query id>.<hit index>``.

    source is the GFF source column; the default "db_name" means "use the
    base name of db". db_id_sep_char and remaining_fasta are accepted for
    backward compatibility and are not used.

    Returns (True, gff_path) on success, or (False, "") when the database
    could not be created or the BLAST command failed.
    """
    import os
    import pandas as pd

    print("Blasting query to DB")

    if not os.path.exists(db + ".nhr"):
        code = runCommand("makeblastdb -in " + db + " -dbtype nucl")
        if code != 0:
            print("Could not create database for given genome.")
            return False, ""

    db_name = os.path.basename(db).split(".")[0]
    if source == "db_name":
        source = db_name
    query_name = os.path.basename(query).split(".")[0]
    search_name = query_name + ".to." + db_name
    results_file = search_name + "_results.tsv"
    output = output_dir + "/" + results_file
    cmd = " ".join([blast_type, "-db", db, "-query", query,
        "-out", output, "-outfmt",
        "'6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'",
        "-num_threads", str(threads), "-evalue", str(max_evalue)])
    if run_blast:
        code = runCommand(cmd)
        if code != 0:
            return False, ""
    elif alternative_outputdir is not None:
        output = alternative_outputdir + "/" + results_file

    print("Parsing blast output")
    seqs = readSeqsFromFasta(query)
    seq_lens = {}
    for seq in seqs:
        seq_lens[seq[0].rstrip("\n").lstrip(">")] = len(seq[1])
    example_sequence_names = list(seq_lens.keys())[:5]
    print("Some sequence keys: " + str(example_sequence_names))

    columns = ["qseqid", "sseqid", "pident", "length", "mismatch",
               "gapopen", "qstart", "qend", "sstart", "send", "evalue",
               "bitscore"]
    try:
        # The table has a 13th column (qcovs) that is not used here; select
        # the first 12 explicitly instead of relying on implicit truncation.
        # Ids are forced to str so numeric-looking names still match the
        # keys of seq_lens and can be concatenated below.
        blast_df = pd.read_csv(output, sep='\t', header=None,
                               index_col=False, names=columns,
                               usecols=list(range(len(columns))),
                               dtype={"qseqid": str, "sseqid": str})
    except pd.errors.EmptyDataError:
        # BLAST found nothing and left an empty file.
        blast_df = pd.DataFrame(columns=columns)
    blast_df = blast_df.astype({"pident": 'float32', "length": 'int32', "mismatch": 'int32',
                "gapopen": 'int32', "qstart": 'int32', "qend": 'int32',
                "sstart": 'int32', "send": 'int32', "evalue": 'float64', "bitscore": 'float64'})
    print(str(blast_df.head()))

    print("Calculating coverage")
    # A plain loop over two columns avoids the per-row Series built by
    # DataFrame.apply(axis=1) and also works on an empty table. A query id
    # missing from seq_lens still raises KeyError.
    blast_df["coverage"] = [
        length / seq_lens[qseqid]
        for qseqid, length in zip(blast_df["qseqid"], blast_df["length"])
    ]
    print(str(len(blast_df)) + " alignments")

    min_coverage = 0.99
    min_pid = 99
    print("Filtering blast results")
    blast_df = blast_df[blast_df["pident"] > min_pid]
    print(str(len(blast_df)) + " alignments filtered by pident")
    blast_df = blast_df[blast_df["coverage"] > min_coverage]
    print(str(len(blast_df)) + " alignments filtered by coverage")

    print("Choosing best hits")
    best_hits = dict()
    unique = 0
    # Group by the column name, not a one-element list: with a list, newer
    # pandas yields tuple keys and name + "." would raise TypeError.
    for name, hits in blast_df.groupby("qseqid"):
        hit = get_best_mapping(hits)
        if hit is not None:
            # Indexed access is kept because the container type returned by
            # get_best_mapping is not visible from here.
            for i in range(len(hit)):
                best_hits[name + "." + str(i)] = hit[i]
            unique += 1
    print(str(unique) + " transcripts with genome mapping.")
    print(str(len(best_hits)) + " total mappings.")

    print("Writing gff file about seqs identified")
    rows = []
    for name, hit in best_hits.items():
        int_sstart = int(hit["sstart"])
        int_send = int(hit["send"])
        # GFF wants start <= end; orientation is carried by the strand.
        start = min(int_sstart, int_send)
        end = max(int_sstart, int_send)
        rows.append({
            "seqname": hit["sseqid"],
            "source": source,
            "feature": "transcript",
            "start": str(start),
            "end": str(end),
            "score": ".",
            "strand": get_strand(int(hit["qstart"]), int(hit["qend"]),
                                 int_sstart, int_send),
            "frame": ".",
            "attribute": "ID=" + name + ";evalue=" + str(hit["evalue"])
                         + ";coverage=" + str(hit["coverage"])
                         + ";pident=" + str(hit["pident"]),
        })
    gff = pd.DataFrame(rows, columns = ["seqname", "source",
        "feature", "start", "end", "score", "strand",
        "frame", "attribute"])
    gff_name = output_dir + "/" + search_name + "_found.gff"
    gff.to_csv(gff_name, sep="\t", index=False, header = False)
    print(str(len(seqs)) + " transcripts analyzed.")
    print(str(len(gff)) + " mappings detected and annotated on " + gff_name)

    return True, gff_name
Пример #11
0
def writeFastaWithUniqueHeaders(input_fasta, base_name="Contig"):
    """Rewrite input_fasta through writeSeqsWithUniqueHeaders, keeping a backup.

    The sequences are read first, then the original file is copied to
    ``input_fasta + ".old_names"``, and finally writeSeqsWithUniqueHeaders
    is called with the same path, the sequences and base_name.
    """
    sequences = readSeqsFromFasta(input_fasta)
    backup_path = input_fasta + ".old_names"
    runCommand("cp " + input_fasta + " " + backup_path)
    writeSeqsWithUniqueHeaders(input_fasta, sequences, base_name=base_name)