Example #1
def main():

    bam = snakemake.input.bam
    twobit = snakemake.input.twobit
    consensus = snakemake.input.consensus
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log
    with open(log, "a") as l:
        l.write("BAM: " + bam + "\n")
        l.write("2bit: " + twobit + "\n")
        l.write("consensus fasta: " + consensus + "\n")
        l.write("reference TE BED: " + ref_te_bed + "\n")
        l.write("Taxonomy TSV: " + taxonomy + "\n")

    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    scripts_dir = snakemake.params.scripts_dir
    sample_name = snakemake.params.sample_name

    mccutils.log("temp", "running TEMP Module")
    median_insert_size = get_median_insert_size(median_insert_size_file)

    run_temp_insertion(bam, scripts_dir, consensus, ref_te_bed, taxonomy,
                       median_insert_size, threads, out_dir, log)

    run_temp_absence(bam, scripts_dir, consensus, ref_te_bed, twobit, taxonomy,
                     median_insert_size, threads, out_dir, log)

    for f in os.listdir(out_dir):
        if ".sorted.bam" in f or ".fastq" in f:
            # os.listdir returns bare file names, so join with out_dir before removing
            mccutils.remove(out_dir + "/" + f)
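
Example #1 calls get_median_insert_size(), which is not shown in these snippets. A minimal sketch of such a helper, assuming the median insert size file contains a single line of the form median_insert_size=<value> (the file format and function body are assumptions, not taken from the original code):

def get_median_insert_size(infile):
    # assumed format: one line such as "median_insert_size=350"
    median_insert_size = 0
    with open(infile, "r") as inf:
        for line in inf:
            line = line.strip()
            if "median_insert_size" in line:
                median_insert_size = int(float(line.split("=")[1]))
    return median_insert_size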
Example #2
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    insertions = []
    tmp_gff = out+"/tmp.ref_nonabs.gff"
    command = ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed]
    mccutils.run_command_stdout(command, tmp_gff, log=log)

    with open(tmp_gff,"r") as gff:
        for line in gff:
            if "#" not in line:
                line = line.replace(";","\t")
                split_line = line.split("\t")
                insert = mccutils.Insertion()
                insert.chromosome = split_line[0]
                insert.start = int(split_line[3])
                insert.end = int(split_line[4])
                insert.temp.support = "!"
                insert.name = split_line[9].split("=")[1]+"_reference_"+sample+"_temp_nonab_"
                insert.strand = split_line[6]
                insert.temp.classification = "!"
                insert.temp.junction1Support = "!"
                insert.temp.junction2Support = "!"
                insert.temp.junction1 = '!'
                insert.temp.junction2 = "!"
                insert.temp.frequency = "!"
                insert.type = "reference"
                
                insertions.append(insert)
    
    mccutils.remove(tmp_gff)

    return insertions
Example #3
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    insertions = []
    tmp_gff = out + "/tmp.ref_nonabs.gff"
    command = ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed]
    mccutils.run_command_stdout(command, tmp_gff, log=log)

    with open(tmp_gff, "r") as gff:
        for line in gff:
            if "#" not in line:
                line = line.replace(";", "\t")
                split_line = line.split("\t")
                insert = output.Insertion(output.Temp())
                insert.chromosome = split_line[0]
                insert.start = int(split_line[3])
                insert.end = int(split_line[4])
                insert.name = split_line[9].split(
                    "=")[1] + "|reference|NA|" + sample + "|temp|nonab|"
                insert.strand = split_line[6]
                insert.type = "reference"

                insertions.append(insert)

    mccutils.remove(tmp_gff)

    return insertions
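
Examples #2 and #3 fill in Insertion objects (mccutils.Insertion and output.Insertion(output.Temp())) whose class definitions are not included here. A hypothetical, minimal stand-in that would satisfy the attribute accesses used above; the real classes in the project almost certainly carry more fields:

class Temp:
    def __init__(self):
        # TEMP-specific support fields, defaulting to the "!" placeholder used in Example #2
        self.support = "!"
        self.classification = "!"
        self.junction1Support = "!"
        self.junction2Support = "!"
        self.junction1 = "!"
        self.junction2 = "!"
        self.frequency = "!"


class Insertion:
    def __init__(self, support_info=None):
        self.chromosome = ""
        self.start = -1
        self.end = -1
        self.name = ""
        self.strand = "."
        self.type = ""
        self.temp = support_info if support_info is not None else Temp()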
Example #4
def split_consensus_fasta(fasta, ref_name, out):
    elements = []
    out_dir = out+"/split_fasta/"
    mccutils.mkdir(out_dir)
    fasta_records = SeqIO.parse(fasta,"fasta")
    for record in fasta_records:
        fasta_name = str(record.id)
        elements.append(fasta_name)
        special_chars = [";","&","(",")","|","*","?","[","]","~","{","}","<","!","^",'"',"'","\\","$","/"]
        for char in special_chars:
            fasta_name = fasta_name.replace(char,"_")

        tmp_fasta = out_dir+ref_name+"_"+fasta_name+".fasta.tmp"
        with open(tmp_fasta,"w") as outfa:
            outfa.write(">"+str(record.id)+"\n")
            outfa.write(str(record.seq)+"\n")
        
        fasta_lines = fix_fasta.fix_fasta_lines(tmp_fasta, 80)
        out_fasta = out_dir+ref_name+"_"+fasta_name+".fasta"
        with open(out_fasta,"w") as outfa:
            for line in fasta_lines:
                outfa.write(line+"\n")
        
        mccutils.remove(tmp_fasta)
    
    return elements
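
Example #4 depends on fix_fasta.fix_fasta_lines() to re-wrap each sequence to a fixed line width. A rough, self-contained sketch of what that re-wrapper could look like (the actual module may behave differently):

def fix_fasta_lines(fasta, line_length=80):
    # return header lines unchanged and sequence lines re-wrapped to line_length characters
    lines = []
    seq = ""
    with open(fasta, "r") as infa:
        for line in infa:
            line = line.strip()
            if line.startswith(">"):
                for i in range(0, len(seq), line_length):
                    lines.append(seq[i:i + line_length])
                seq = ""
                lines.append(line)
            else:
                seq += line
    for i in range(0, len(seq), line_length):
        lines.append(seq[i:i + line_length])
    return lines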
Example #5
def main():
    full_command = " ".join(["python3"] + sys.argv)
    current_directory = os.getcwd()
    args = parse_args()
    mccutils.mkdir(args.out + "/logs")
    mccutils.mkdir(args.out + "/tmp")
    check_input_files(args.reference,
                      args.consensus,
                      args.first,
                      fq2=args.second,
                      locations=args.locations,
                      taxonomy=args.taxonomy,
                      coverage_fasta=args.coverage_fasta,
                      augment_fasta=args.augment,
                      annotations_only=args.make_annotations)
    ref_name = mccutils.get_base_name(args.reference)
    run_id = make_run_config(args,
                             args.sample_name,
                             ref_name,
                             full_command,
                             current_directory,
                             debug=args.debug)
    run_workflow(args,
                 args.sample_name,
                 ref_name,
                 run_id,
                 debug=args.debug,
                 annotations_only=args.make_annotations)
    mccutils.remove(args.out + "/tmp")
Example #6
def make_nonte_bed(reference, masked_gff, run_id, out, log):
    mccutils.log("coverage", "creating BED file of non-TE regions", log=log)
    masked_bed = out + "/input/" + run_id + "_ref_tes.bed"
    repeatmasker_gff_to_bed(masked_gff, masked_bed)

    sorted_bed = out + "/input/" + run_id + "_ref_tes_sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", masked_bed],
                                sorted_bed,
                                log=log)

    chromosome_names = []
    with open(reference, "r") as fa:
        for line in fa:
            if ">" in line:
                chromosome_names.append(
                    line.replace(">", "").replace("\n", ""))

    chrom_idx = out + "/input/" + run_id + "_ref.genome"
    with open(reference + ".fai", "r") as faidx:
        with open(chrom_idx, "w") as genome:
            for line in faidx:
                split_line = line.split("\t")
                out_line = "\t".join([split_line[0], split_line[1]])
                genome.write(out_line + "\n")

    non_te_bed = out + "/input/" + run_id + "_ref_nonte.bed"
    command = ["bedtools", "complement", "-i", sorted_bed, "-g", chrom_idx]
    mccutils.run_command_stdout(command, non_te_bed, log=log)

    for f in [masked_bed, sorted_bed, chrom_idx]:
        mccutils.remove(f)

    return non_te_bed
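
Example #6 converts a RepeatMasker GFF to BED via repeatmasker_gff_to_bed(), which is not shown. A simplified sketch, assuming a standard 9-column GFF (1-based, inclusive coordinates) and using the attribute column as the feature name; the real converter likely parses the Target/ID attribute more carefully:

def repeatmasker_gff_to_bed(gff, bed):
    # GFF is 1-based and inclusive; BED is 0-based and half-open
    with open(gff, "r") as ingff, open(bed, "w") as outbed:
        for line in ingff:
            if line.startswith("#"):
                continue
            cols = line.strip().split("\t")
            if len(cols) < 9:
                continue
            name = cols[8].replace(" ", "_")
            out_line = "\t".join([cols[0], str(int(cols[3]) - 1), cols[4], name, "0", cols[6]])
            outbed.write(out_line + "\n")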
Example #7
def make_nonredundant_bed(insertions, sample_name, out_dir):
    uniq_inserts = {}

    for insert in insertions:
        key = "_".join([insert.chromosome, str(insert.end)])
        if key not in uniq_inserts.keys():
            uniq_inserts[key] = insert
        else:
            # keep the insertion with the higher read pair support
            if insert.read_pair_support > uniq_inserts[key].read_pair_support:
                uniq_inserts[key] = insert
    
    tmp_bed = out_dir+"/tmp_telocate_nonredundant.bed"
    with open(tmp_bed, "w") as outbed:
        for key in uniq_inserts.keys():
            insert = uniq_inserts[key]
            out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand])
            outbed.write(out_line+"\n")
    
    sorted_bed = out_dir+"/sorted.bed"
    command = ["bedtools", "sort", "-i", tmp_bed]
    mccutils.run_command_stdout(command, sorted_bed)

    nonredundant_bed = out_dir+"/"+sample_name+"_telocate_nonredundant.bed"
    with open(sorted_bed, "r") as inbed:
        with open(nonredundant_bed, "w") as outbed:
            header = 'track name="'+sample_name+'_TE-locate" description="'+sample_name+'_TE-locate"\n'
            outbed.write(header)
            for line in inbed:
                outbed.write(line)
    

    mccutils.remove(tmp_bed)
    mccutils.remove(sorted_bed)
Example #8
def process_bed(bed, chromosomes, sample_name, log, out_dir, min_read_cutoff=0):
    unsorted_bed = out_dir+"/unsorted.bed"
    with open(unsorted_bed, "w") as outbed:
        with open(bed,"r") as inbed:
            insertion_count = 0
            for x,line in enumerate(inbed):
                line = line.replace(";","\t")
                split_line = line.split("\t")
                if int(split_line[7]) > min_read_cutoff and split_line[0] in chromosomes:
                    insertion_count += 1
                    name = split_line[5] + "_" + split_line[8].replace("\n", "") + "_" + sample_name + "_ngs_te_mapper_sr_" + str(x + 1)
                    outline = "\t".join([split_line[0], split_line[1], split_line[2], name, "0", split_line[4]])
                    outbed.write(outline+"\n")
    
    if insertion_count >= 1:
        sorted_bed = out_dir+"/sorted.bed"
        command = ["bedtools", "sort", "-i", unsorted_bed]
        mccutils.run_command_stdout(command, sorted_bed, log=log)

        final_bed = out_dir+"/"+sample_name+"_ngs_te_mapper_nonredundant.bed"
        with open(final_bed,"w") as outbed:
            header = 'track name="'+sample_name+'_ngs_te_mapper" description="'+sample_name+'_ngs_te_mapper"\n'
            outbed.write(header)
            with open(sorted_bed, "r") as inbed:
                for line in inbed:
                    # line = line.replace("NA",".")
                    outbed.write(line)
        mccutils.remove(sorted_bed)
    else:
        mccutils.run_command(["touch",out_dir+"/"+sample_name+"_ngs_te_mapper_nonredundant.bed"])
        
    mccutils.remove(unsorted_bed)
Example #9
def make_data_copies(methods, results_dir, out_dir):
    mccutils.mkdir(out_dir + "/data/")
    if "trimgalore" in methods:
        if os.path.exists(out_dir + "/data/trimgalore/"):
            mccutils.remove(out_dir + "/data/trimgalore/")

        mccutils.mkdir(out_dir + "/data/trimgalore/")
        for f in os.listdir(results_dir + "/trimgalore"):
            if ".zip" not in f:
                mccutils.run_command([
                    "cp", "-r", results_dir + "/trimgalore/" + f,
                    out_dir + "/data/trimgalore/"
                ])

    if "coverage" in methods:
        if os.path.exists(out_dir + "/data/coverage/"):
            mccutils.remove(out_dir + "/data/coverage/")

        mccutils.mkdir(out_dir + "/data/coverage/")
        for f in os.listdir(results_dir + "/coverage/"):
            if not os.path.isdir(results_dir + "/coverage/" + f):
                mccutils.run_command([
                    "cp", results_dir + "/coverage/" + f,
                    out_dir + "/data/coverage/"
                ])
        for f in os.listdir(results_dir + "/coverage/te-depth-files/"):
            mccutils.run_command([
                "cp", results_dir + "/coverage/te-depth-files/" + f,
                out_dir + "/data/coverage/"
            ])
        for f in os.listdir(out_dir + "/data/coverage/"):
            tmp = out_dir + "/data/coverage/" + f
            o = f.replace(".csv", ".txt")
            o = o.replace(".cov", ".txt")
            mccutils.run_command(["mv", tmp, out_dir + "/data/coverage/" + o])
Example #10
def map_reads(out_dir, fq1, fq2, threads=1, log=None):
    reference_genome = out_dir+"/teflon.prep_MP/teflon.mappingRef.fa"
    command = ["bwa", "index", reference_genome]
    mccutils.run_command(command, log=log)

    out_sam = out_dir+"/teflon.sam"

    command = [
        "bwa", "mem",
        "-t", str(threads),
        "-Y", reference_genome,
        fq1, 
        fq2
    ]

    mccutils.run_command_stdout(command, out_sam, log=log)

    out_bam = out_dir+"/teflon.bam"
    command = ["samtools", "view", "-Sb", out_sam]
    mccutils.run_command_stdout(command, out_bam, log=log)

    sorted_bam = out_dir+"/teflon.sorted.bam"
    command = ["samtools", "sort", "-@", str(threads), "-o", sorted_bam, out_bam]
    mccutils.run_command(command, log=log)

    command = ["samtools", "index", sorted_bam ]
    mccutils.run_command(command, log=log)

    mccutils.remove(out_sam)
    mccutils.remove(out_bam)

    return sorted_bam
Example #11
def run_repeatmasker(reference, ref_name, te_seqs, threads, log, outfile,
                     outdir):
    tmp_dir = outdir + "/tmp/repeatmasker"
    mccutils.remove(tmp_dir)
    mccutils.mkdir(tmp_dir)
    os.chdir(tmp_dir)

    command = [
        "RepeatMasker", "-pa",
        str(threads), "-lib", te_seqs, "-dir", tmp_dir, "-s", "-nolow",
        "-no_is", reference
    ]
    mccutils.run_command(command, log=log)

    os.chdir(outdir)

    rm_out = ""
    for f in os.listdir(tmp_dir):
        if f.endswith("fasta.out"):
            rm_out = tmp_dir + "/" + f

    if rm_out == "":
        sys.exit("can't find RepeatMasker output in: " + tmp_dir + "\n")

    mccutils.run_command(["mv", rm_out, outfile])
Example #12
def remove_intermediate_files(options, run_config_file, methods, ref_name, sample_name, outdir):
    if "all" in options:
        return

    with open(run_config_file) as f:
        run_config_data = json.load(f)

    keep_paths = []
    if "methods" not in options:
        for method in methods:
            method_out = "/".join(run_config_data['out'][method].split("/")[:-1])+"/"
            if method not in options:
                essential_paths = run_config_data['essential'][method]
                if os.path.exists(method_out):
                    # delete all files not marked as essential
                    for root, subdirs, files in os.walk(method_out, topdown=False):
                        for f in files:
                            file_path = os.path.join(root, f)
                            is_essential = False
                            for essential_path in essential_paths:
                                if (os.path.isdir(essential_path) and essential_path in file_path) or (essential_path == file_path):
                                    is_essential = True
                            
                            if not is_essential:
                                mccutils.remove(file_path)
                    
                    # remove empty directories
                    for root, subdirs, files in os.walk(method_out, topdown=False):
                        for d in subdirs:
                            dir_path = os.path.join(root, d)
                            if len(os.listdir(dir_path)) < 1:
                                mccutils.remove(dir_path)
            
            else:
                keep_paths.append(method_out)
    
    if "general" not in options:
        intermediate_dir = outdir+"/"+sample_name+"/intermediate/"
        for root, subdirs, files in os.walk(intermediate_dir, topdown=False):
            for f in files:
                file_path = os.path.join(root, f)
                keep = False
                for keep_path in keep_paths:
                    if keep_path in file_path:
                        keep = True
                
                if not keep:
                    mccutils.remove(file_path)
        
        # remove empty directories
        for root, subdirs, files in os.walk(intermediate_dir, topdown=False):
            for d in subdirs:
                dir_path = os.path.join(root, d)
                if len(os.listdir(dir_path)) < 1:
                    mccutils.remove(dir_path)

        if len(os.listdir(intermediate_dir)) < 1:
            mccutils.remove(intermediate_dir)
Example #13
def main():
    mccutils.log("popoolationte", "running PopoolationTE preprocessing steps")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2

    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    script_dir = snakemake.params.script_dir
    status_log = snakemake.params.status_log
    threads = snakemake.threads

    try:
        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + f)

        mccutils.log("popoolationte", "formatting read names")
        fq1, fq2 = format_read_names(fq1, fq2, sample_name, out_dir)
        mccutils.log("popoolationte",
                     "indexing popoolationTE reference fasta",
                     log=log)
        index_fasta(ref_fasta, log=log)
        mccutils.log("popoolationte", "mapping fastq1 reads", log=log)
        sam1 = map_reads(fq1, ref_fasta, threads=threads, log=log)
        mccutils.log("popoolationte", "mapping fastq2 reads", log=log)
        sam2 = map_reads(fq2, ref_fasta, threads=threads, log=log)
        mccutils.log("popoolationte", "combining alignments", log=log)
        combined_sam = combine_alignments(sam1,
                                          sam2,
                                          fq1,
                                          fq2,
                                          script_dir,
                                          out_dir,
                                          log=log)
        mccutils.log("popoolationte", "sorting sam file", log=log)
        bam = sam_to_bam(combined_sam, threads=threads, log=log)
        sorted_bam = sort_bam(bam, threads=threads, log=log)
        sorted_sam = bam_to_sam(sorted_bam, threads=threads, log=log)

        with open(status_log, "w") as l:
            l.write("COMPLETED\n")

        mccutils.log("popoolationte", "PopoolationTE preprocessing complete")

    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as l:
            print(track, file=l)
        with open(status_log, "w") as l:
            l.write("FAILED\n")

        mccutils.run_command(["touch", snakemake.output[0]])
        mccutils.run_command(["touch", snakemake.output[1]])
        mccutils.run_command(["touch", snakemake.output[2]])
Example #14
def main():
    mccutils.log("popoolationte2", "setting up for PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    jar = snakemake.params.jar
    log = snakemake.params.log
    out_dir = snakemake.params.out_dir
    threads = snakemake.threads
    status_log = snakemake.params.status_log

    try:
        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + f)

        mccutils.mkdir(out_dir + "/tmp")
        index_fasta(ref_fasta, log=log)
        fq1 = format_fastq(fq1, out_dir + "/reads_1.fastq", log=log)
        fq2 = format_fastq(fq2, out_dir + "/reads_2.fastq", log=log)
        sam1 = map_reads(ref_fasta,
                         fq1,
                         out_dir + "/mapped_1.sam",
                         threads=threads,
                         log=log)
        sam2 = map_reads(ref_fasta,
                         fq2,
                         out_dir + "/mapped_2.sam",
                         threads=threads,
                         log=log)
        bam = sam_to_bam(jar,
                         fq1,
                         fq2,
                         sam1,
                         sam2,
                         snakemake.output.bam,
                         out_dir,
                         threads=threads,
                         log=log)
        mccutils.remove(out_dir + "/tmp")

        mccutils.check_file_exists(snakemake.output.bam)
        with open(status_log, "w") as l:
            l.write("COMPLETED\n")

        mccutils.log("popoolationte2", "PopoolationTE2 preprocessing complete")

    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as l:
            print(track, file=l)
        mccutils.log("popoolationte2", "popoolationte2 preprocessing failed")
        with open(status_log, "w") as l:
            l.write("FAILED\n")

        mccutils.run_command(["touch", snakemake.output.bam])
Example #15
def main():
    install_path = snakemake.config['paths']['install'] + "/tools/"

    mccutils.remove(snakemake.params.zipfile)
    download_success = mccutils.download(snakemake.params.url,
                                         snakemake.params.zipfile,
                                         md5=snakemake.params.md5,
                                         max_attempts=3)

    if not download_success:
        print("popoolationte download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    mccutils.remove(snakemake.config['paths']['install'] + "popoolationte")
    command = ["unzip", snakemake.params.zipfile]
    mccutils.run_command(command, log=snakemake.params.log)

    mccutils.remove(install_path + "popoolationte")
    mccutils.mkdir(install_path + "popoolationte")
    for f in os.listdir(snakemake.config['paths']['install'] +
                        "popoolationte"):
        command = [
            "mv", snakemake.config['paths']['install'] + "popoolationte/" + f,
            install_path + "popoolationte"
        ]
        mccutils.run_command(command, log=snakemake.params.log)

    command = [
        "patch", "-i", snakemake.params.patch1,
        install_path + "popoolationte/Modules/TEInsertUtility.pm"
    ]
    mccutils.run_command(command, log=snakemake.params.log)
    command = [
        "patch", "-i", snakemake.params.patch2,
        install_path + "popoolationte/Modules/TEInsert.pm"
    ]
    mccutils.run_command(command, log=snakemake.params.log)
    command = [
        "patch", "-i", snakemake.params.patch3,
        install_path + "popoolationte/samro.pl"
    ]
    mccutils.run_command(command, log=snakemake.params.log)
    command = [
        "patch", "-i", snakemake.params.patch4,
        install_path + "popoolationte/identify-te-insertsites.pl"
    ]
    mccutils.run_command(command, log=snakemake.params.log)

    mccutils.remove(snakemake.params.zipfile)
    mccutils.remove(snakemake.config['paths']['install'] + "popoolationte")

    # write version to file
    with open(
            snakemake.config['paths']['install'] +
            "/tools/popoolationte/version.log", "w") as version:
        version.write(snakemake.params.md5)
Example #16
def run_workflow(args, sample_name, run_id, debug=False):
    log = args.out + "/mcclintock." + str(run_id) + ".log"

    results_dir = args.out + "/results/"
    input_dir = args.out + "/method_input/"
    out_files = config.OUT_PATHS
    for key in out_files.keys():
        out_files[key] = out_files[key].replace(config.INPUT_DIR, input_dir)
        out_files[key] = out_files[key].replace(config.RESULTS_DIR,
                                                results_dir)
        out_files[key] = out_files[key].replace(config.SAMPLE_NAME,
                                                sample_name)

    path = os.path.dirname(os.path.abspath(__file__))
    mccutils.mkdir(args.out + "/snakemake")
    snakemake_path = args.out + "/snakemake/" + str(run_id)
    mccutils.mkdir(snakemake_path)
    mccutils.run_command(["cp", path + "/Snakefile", snakemake_path])
    os.chdir(snakemake_path)
    command = [
        "snakemake", "--use-conda", "--conda-prefix",
        path + "/install/envs/conda"
    ]
    if not debug:
        command.append("--quiet")
    else:
        command.append("--reason")

    command += [
        "--configfile",
        args.out + "/snakemake/config/config_" + str(run_id) + ".json"
    ]
    command += ["--cores", str(args.proc)]

    if args.clean:
        clean_command = command + ["--delete-all-output"]
        mccutils.run_command(clean_command)
        mccutils.remove(args.out + "/input")

    for method in args.methods:
        command.append(out_files[method])

    command.append(args.out + "/results/summary/summary_report.txt")

    # print(" ".join(command))
    try:
        mccutils.run_command(command)
        mccutils.check_file_exists(args.out +
                                   "/results/summary/summary_report.txt")
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print(
            "McClintock Pipeline Failed... please open an issue at https://github.com/bergmanlab/mcclintock/issues if you are having trouble using McClintock",
            file=sys.stderr)
        sys.exit(1)
    mccutils.remove(args.out + "/tmp")
Example #17
def main():
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    bam = snakemake.input.bam
    reference = snakemake.input.reference
    twobit = snakemake.input.twobit
    consensus = snakemake.input.consensus
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log

    with open(log,"a") as l:
        l.write("BAM: "+bam+"\n")
        l.write("2bit: "+twobit+"\n")
        l.write("consensus fasta: "+consensus+"\n")
        l.write("reference TE BED: "+ref_te_bed+"\n")
        l.write("Taxonomy TSV: "+taxonomy+"\n")

    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    script_dir = snakemake.params.script_dir
    sample_name = snakemake.params.sample_name
    status_log = snakemake.params.status_log

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir+"/"+f)
    
    mccutils.log("temp2","running TEMP2 Module")

    try:
        median_insert_size = get_median_insert_size(median_insert_size_file)
        run_temp2_insertion(fq1, fq2, bam, median_insert_size, reference, script_dir, consensus, ref_te_bed, threads, out_dir, config, log)
        run_temp2_absence(script_dir, bam, twobit, ref_te_bed, median_insert_size, threads, out_dir+"/absence", config, log)
        mccutils.run_command(["cp", out_dir+'/absence/'+sample_name+".absence.refined.bp.summary", out_dir], log=log)

        mccutils.check_file_exists(snakemake.output[0])
        mccutils.check_file_exists(snakemake.output[1])
        with open(status_log,"w") as l:
            l.write("COMPLETED\n")
        mccutils.log("temp2","TEMP2 run complete")
    
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log,"a") as l:
            print(track, file=l)
        mccutils.log("temp2","TEMP2 run failed")
        with open(status_log,"w") as l:
            l.write("FAILED\n")

        mccutils.run_command(["touch", snakemake.output[0]])
        mccutils.run_command(["touch", snakemake.output[1]])
Example #18
def get_genome_depth(non_te_bed, bam, run_id, out, log):
    mccutils.log("coverage","determining the coverage depth of the genome", log=log)
    depth_file = out+"/input/"+run_id+"genome.depth"
    command = ["samtools", "depth", "-aa", "-b", non_te_bed, bam, "-d", "0"]
    mccutils.run_command_stdout(command, depth_file, log=log)

    genome_depth = get_avg_depth(depth_file)

    mccutils.remove(depth_file)

    return genome_depth
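
Example #18 averages the per-base depth file with get_avg_depth(). Since samtools depth writes tab-separated chromosome, position, and depth columns, a minimal version could look like this (the helper body is an assumption):

def get_avg_depth(depth_file):
    # samtools depth output: <chrom>\t<pos>\t<depth>
    total_depth = 0
    positions = 0
    with open(depth_file, "r") as depth:
        for line in depth:
            split_line = line.split("\t")
            total_depth += int(split_line[2])
            positions += 1
    if positions == 0:
        return 0
    return total_depth / positions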
Example #19
def main():
    consensus_fasta = snakemake.input.consensus_fasta
    reference_fasta = snakemake.input.reference_fasta
    fastq1 = snakemake.input.fastq1
    fastq2 = snakemake.input.fastq2

    log = snakemake.params.log
    with open(log, "a") as l:
        l.write("consensus fasta: " + consensus_fasta + "\n")
        l.write("reference fasta: " + reference_fasta + "\n")
        l.write("fastq1: " + fastq1 + "\n")
        l.write("fastq2: " + fastq2 + "\n")

    threads = snakemake.threads
    sample_name = snakemake.params.sample_name
    script_dir = snakemake.params.script_dir
    out_dir = snakemake.params.out_dir
    out_bed = snakemake.output[0]

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + f)

    is_paired = True
    if snakemake.params.raw_fq2 == "None":
        is_paired = False

    command = [
        'Rscript', "--vanilla", script_dir + "/ngs_te_mapper.R",
        "genome=" + reference_fasta, "teFile=" + consensus_fasta,
        "tsd=" + str(config.MAX_TSD), "thread=" + str(threads),
        "output=" + out_dir, "sourceCodeFolder=" + script_dir
    ]

    if is_paired:
        command.append("sample=" + fastq1 + ";" + fastq2)
    else:
        command.append("sample=" + fastq1)

    mccutils.log("ngs_te_mapper", "running ngs_te_mapper", log=log)
    mccutils.run_command(command, log=log)
    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete", log=log)

    raw_bed = ""
    for f in os.listdir(out_dir + "/bed_tsd/"):
        if "insertions.bed" in f:
            raw_bed = out_dir + "/bed_tsd/" + f

    mccutils.run_command(["cp", raw_bed, out_bed])

    mccutils.remove(out_dir + "/aligned_te/")

    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete")
Example #20
def main():
    te_gff = snakemake.input.te_gff
    sam = snakemake.input.sam
    ref_fasta = snakemake.input.ref
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log

    mccutils.log("te-locate", "running TE-Locate", log=log)
    with open(log, "a") as l:
        l.write("TE GFF: " + te_gff + "\n")
        l.write("SAM: " + sam + "\n")
        l.write("reference fasta: " + ref_fasta + "\n")

    telocate = snakemake.params.run_script
    out_dir = snakemake.params.out_dir

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + f)

    sam_dir = out_dir + "/sam/"
    mccutils.mkdir(sam_dir)
    te_locate_sam = sam_dir + "te-locate.sam"
    if os.path.exists(te_locate_sam):
        os.remove(te_locate_sam)
    os.symlink(sam, te_locate_sam)

    os.chdir(os.path.dirname(telocate))

    median_insert_size = mccutils.get_median_insert_size(
        median_insert_size_file)

    distance = (median_insert_size * config.MIN_DISTANCE)

    command = [
        "perl", telocate,
        str(config.MAX_MEM), sam_dir, te_gff, ref_fasta, out_dir,
        str(distance),
        str(config.MIN_SUPPORT_READS),
        str(config.MIN_SUPPORT_INDIVIDUALS)
    ]

    mccutils.run_command(command, log=log)

    mccutils.run_command([
        "cp", out_dir + "_" + str(distance) + "_reads3_acc1.info",
        out_dir + "te-locate-raw.info"
    ])
    mccutils.log("te-locate", "TE-Locate complete")
Example #21
def main():
    consensus_fasta = snakemake.input.consensus_fasta
    bam = snakemake.input.bam
    ref_fasta = snakemake.input.ref_fasta
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    log = snakemake.params.log
    status_log = snakemake.params.status_log

    try:
        with open(log,"a") as l:
            l.write("consensus fasta: "+consensus_fasta+"\n")
            l.write("BAM: "+bam+"\n")
            l.write("reference fasta: "+ref_fasta+"\n")
            l.write("taxonomy TSV: "+ taxonomy+"\n")
            

        script_dir = snakemake.params.script_dir
        out_dir = snakemake.params.out_dir
        ref_name = snakemake.params.ref_name
        sample_name = snakemake.params.sample_name

        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir+"/"+f)

        mccutils.log("retroseq","running RetroSeq", log=log)    

        elements = split_consensus_fasta(consensus_fasta, ref_name, out_dir)

        bed_location_file = make_consensus_beds(elements, ref_name, ref_te_bed, taxonomy, out_dir)

        run_retroseq(bam, bed_location_file, ref_fasta, script_dir, sample_name, out_dir, config.PARAMS, log=log)

        with open(status_log,"w") as l:
            l.write("COMPLETED\n")

        mccutils.log("retroseq","RetroSeq complete")
    
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log,"a") as l:
            print(track, file=l)
        mccutils.log("retroseq","RetroSeq run failed")
        with open(status_log,"w") as l:
            l.write("FAILED\n")
        
        mccutils.run_command(["touch", snakemake.output[0]])
Example #22
def make_redundant_bed(insertions, sample_name, out_dir):
    tmp_bed = out_dir + "/tmp.bed"

    insertion_dict = {}
    out_inserts = []
    for insert in insertions:
        insertion_dict["_".join([
            insert.chromosome,
            str(insert.start - 1),
            str(insert.end), insert.name, "0", insert.strand
        ])] = insert

    with open(tmp_bed, "w") as out:
        for insert in insertions:
            out_line = "\t".join([
                insert.chromosome,
                str(insert.start - 1),
                str(insert.end), insert.name, "0", insert.strand
            ])
            out.write(out_line + "\n")

    sorted_bed = out_dir + "/sorted.bed"
    command = ["bedtools", "sort", "-i", tmp_bed]
    mccutils.run_command_stdout(command, sorted_bed)

    redundant_bed = out_dir + "/" + sample_name + "_relocate2_redundant.bed"
    with open(redundant_bed, "w") as outbed:
        header = 'track name="' + sample_name + '_RelocaTE2" description="' + sample_name + '_RelocaTE2"\n'
        outbed.write(header)
        with open(sorted_bed, "r") as inbed:
            for x, line in enumerate(inbed):

                # outputs inserts in sorted order with unique number added to name
                key = line.replace("\t", "_")
                key = key.replace("\n", "")
                insert = insertion_dict[key]
                insert.name += str(x + 1)
                out_inserts.append(insert)

                # write to bed with unique number added to name
                split_line = line.split("\t")
                split_line[3] += str(x + 1)
                line = "\t".join(split_line)
                outbed.write(line)

    mccutils.remove(tmp_bed)
    mccutils.remove(sorted_bed)

    return out_inserts
Example #23
def main():
    mccutils.log("popoolationte2", "running PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    bam = snakemake.input.bam
    taxonomy = snakemake.input.taxonomy
    jar = snakemake.params.jar
    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log

    taxonomy = format_taxonomy(taxonomy, out_dir)
    ppileup = popoolationte2_ppileup(jar,
                                     config.ppileup,
                                     bam,
                                     taxonomy,
                                     out_dir,
                                     log=log)
    ppileup = popoolationte2_subsample(jar,
                                       config.subsampleppileup,
                                       ppileup,
                                       out_dir,
                                       log=log)
    signatures = popoolationte2_signatures(jar,
                                           config.identifySignatures,
                                           ppileup,
                                           out_dir,
                                           log=log)
    signatures = popoolationte2_strand(jar,
                                       config.updateStrand,
                                       signatures,
                                       bam,
                                       taxonomy,
                                       out_dir,
                                       log=log)
    signatures = popoolationte2_frequency(jar,
                                          ppileup,
                                          signatures,
                                          out_dir,
                                          log=log)
    te_insertions = popoolationte2_pairup(jar,
                                          config.pairupSignatures,
                                          signatures,
                                          ref_fasta,
                                          taxonomy,
                                          out_dir,
                                          log=log)

    mccutils.remove(bam)
Example #24
def main():
    mcc_out = snakemake.config["args"]['out']
    mccutils.mkdir(mcc_out+"/results/")
    coverage_out = mcc_out+"/results/coverage/"
    mccutils.mkdir(coverage_out)

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(coverage_out):
        mccutils.remove(coverage_out+"/"+f)

    run_id = snakemake.config['args']['run_id']
    te_seqs = snakemake.input.consensus
    log = snakemake.params.log
    
    # always use consensus fasta for masking the genome
    mccutils.mkdir(coverage_out+"/input")
    mccutils.mkdir(coverage_out+"/te-depth-files")
    masked_reference, masked_gff = repeatmask_genome(snakemake.input.ref, te_seqs, snakemake.threads, run_id, coverage_out, log)

    # uses coverage fasta (if exists) for augmenting and coverage analysis
    if snakemake.config['in']['coverage_fasta'] != "None":
        te_seqs = snakemake.input.coverage_fa


    augmented_reference = augment_genome(masked_reference, te_seqs, coverage_out)
    index_genome(snakemake.input.ref, log)
    index_genome(augmented_reference, log)
    
    if snakemake.config['in']['fq2'] == "None":
        sam = map_reads(augmented_reference, snakemake.input.fq1, snakemake.threads, snakemake.params.sample, run_id, coverage_out, log)
    else:
        sam = map_reads(augmented_reference, snakemake.input.fq1, snakemake.threads, snakemake.params.sample, run_id, coverage_out, log, fq2=snakemake.input.fq2)

    bam = sam_to_bam(sam, augmented_reference, snakemake.params.sample, snakemake.threads, run_id, coverage_out, log)
    nonte_bed = make_nonte_bed(snakemake.input.ref, masked_gff, run_id, coverage_out, log)
    genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log)

    edge_trim = 0
    if config.OMIT_EDGES:
        if config.OMIT_EDGES_READ_LENGTH:
            edge_trim = mccutils.estimate_read_length(snakemake.input.fq1)
        else:
            edge_trim = config.OMIT_EDGES_LENGTH
            
    te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths = make_depth_table(te_seqs, bam, genome_depth, run_id, coverage_out, snakemake.output[0], log, trim_edges=edge_trim)
    make_plots(te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths, genome_depth, snakemake.params.sample, coverage_out, trim_edges=edge_trim)

    mccutils.remove(sam)
Example #25
def sam_to_bam(sam, reference, sample_name, threads, run_id, out, log):
    mccutils.log("coverage","converting SAM to BAM, and indexing", log=log)
    threads = str(threads)
    tmp_bam = out+"/input/"+run_id+"_tmp.bam"
    command = ["samtools", "view", "-Sb", "-@", threads, "-t", reference+".fai", sam]
    mccutils.run_command_stdout(command, tmp_bam, log=log)

    sorted_bam = out+"/input/"+run_id+"_"+sample_name+".bam"
    command = ["samtools", "sort", "-@", threads, tmp_bam]
    mccutils.run_command_stdout(command, sorted_bam, log=log)

    mccutils.run_command(["samtools", "index", sorted_bam], log=log)

    mccutils.remove(tmp_bam)

    return sorted_bam
Example #26
def main():
    full_command = " ".join(["python3"] + sys.argv)
    current_directory = os.getcwd()

    expected_configs = sysconfig.CONFIGS
    args = parse_args(expected_configs)
    sys.path = [args.config] + sys.path

    mccutils.mkdir(args.out+"/logs")
    mccutils.mkdir(args.out+"/tmp")
    check_installed_modules(args.methods, sysconfig.NO_INSTALL_METHODS, config_install.MD5, os.path.dirname(os.path.abspath(__file__))+"/install/")
    check_input_files(args.reference, args.consensus, args.first, fq2=args.second, locations=args.locations, taxonomy=args.taxonomy, coverage_fasta=args.coverage_fasta, augment_fasta=args.augment, annotations_only=args.make_annotations)
    ref_name = mccutils.get_base_name(args.reference)
    run_id, out_files = make_run_config(args, args.sample_name, ref_name, full_command, current_directory, debug=args.debug)
    run_workflow(args, args.sample_name, ref_name, run_id, out_files, debug=args.debug, annotations_only=args.make_annotations)
    mccutils.remove(args.out+"/tmp")
Example #27
def main():
    mccutils.remove(snakemake.params.tar)
    download_success = mccutils.download(snakemake.params.url, snakemake.params.tar, md5=snakemake.params.md5, max_attempts=3)

    if not download_success:
        print("TE-Locate download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)


    command = ["tar", "-xvf", snakemake.params.tar, "-C", snakemake.config['paths']['install']+"/tools/te-locate/"]
    mccutils.run_command(command, log=snakemake.params.log)

    # write version to file
    with open(snakemake.config['paths']['install']+"/tools/te-locate/version.log","w") as version:
        version.write(snakemake.params.md5)
Example #28
def main():
    mccutils.log("popoolationte2", "setting up for PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    log = snakemake.params.log
    out_dir = snakemake.params.out_dir
    threads = snakemake.threads

    index_fasta(ref_fasta, log=log)
    sam = map_reads(ref_fasta, fq1, fq2, out_dir, threads=threads, log=log)
    bam = sam_to_bam(sam, out_dir + "/tmp.bam", threads=threads, log=log)
    sorted_bam = sort_bam(bam, snakemake.output.bam, threads=threads, log=log)

    mccutils.remove(sam)
    mccutils.remove(bam)
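
Example #28 calls several alignment helpers (index_fasta, map_reads, sam_to_bam, sort_bam) whose bodies are not included. Hedged sketches built from standard bwa and samtools commands, matching the call signatures above and reusing the mccutils wrappers seen throughout these examples; the real helpers may add extra flags:

def index_fasta(fasta, log=None):
    mccutils.run_command(["bwa", "index", fasta], log=log)
    mccutils.run_command(["samtools", "faidx", fasta], log=log)


def map_reads(fasta, fq1, fq2, out_dir, threads=1, log=None):
    sam = out_dir + "/mapped.sam"
    command = ["bwa", "mem", "-t", str(threads), fasta, fq1, fq2]
    mccutils.run_command_stdout(command, sam, log=log)
    return sam


def sam_to_bam(sam, bam, threads=1, log=None):
    command = ["samtools", "view", "-Sb", "-@", str(threads), sam]
    mccutils.run_command_stdout(command, bam, log=log)
    return bam


def sort_bam(bam, sorted_bam, threads=1, log=None):
    command = ["samtools", "sort", "-@", str(threads), "-o", sorted_bam, bam]
    mccutils.run_command(command, log=log)
    mccutils.run_command(["samtools", "index", sorted_bam], log=log)
    return sorted_bam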
Example #29
def main():
    mccutils.log("popoolationte", "running PopoolationTE preprocessing steps")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2

    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    script_dir = snakemake.params.script_dir

    threads = snakemake.threads

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + f)

    mccutils.log("popoolationte", "formatting read names")
    fq1, fq2 = format_read_names(fq1, fq2, sample_name, out_dir)
    mccutils.log("popoolationte",
                 "indexing popoolationTE reference fasta",
                 log=log)
    index_fasta(ref_fasta, log=log)
    mccutils.log("popoolationte", "mapping fastq1 reads", log=log)
    sam1 = map_reads(fq1, ref_fasta, threads=threads, log=log)
    mccutils.log("popoolationte", "mapping fastq2 reads", log=log)
    sam2 = map_reads(fq2, ref_fasta, threads=threads, log=log)
    mccutils.log("popoolationte", "combining alignments", log=log)
    combined_sam = combine_alignments(sam1,
                                      sam2,
                                      fq1,
                                      fq2,
                                      script_dir,
                                      out_dir,
                                      log=log)
    mccutils.log("popoolationte", "sorting sam file", log=log)
    bam = sam_to_bam(combined_sam, threads=threads, log=log)
    sorted_bam = sort_bam(bam, threads=threads, log=log)
    sorted_sam = bam_to_sam(sorted_bam, threads=threads, log=log)

    files_to_remove = [sam1, sam2, combined_sam, bam, sorted_bam]

    for f in files_to_remove:
        mccutils.remove(f)

    mccutils.log("popoolationte", "PopoolationTE preprocessing complete")
Example #30
def main():
    fq1 = snakemake.input.fq1
    fq2 = snakemake.params.fq2
    methods = snakemake.params.methods.split(",")
    processors = snakemake.threads
    mcc_out = snakemake.params.out
    run_id = snakemake.params.run_id
    log = snakemake.params.log

    # now = datetime.now()
    # start = now.strftime("%Y-%m-%d %H:%M:%S")
    mccutils.log("processing", "prepping reads for McClintock")
    # trims adaptors of input fastq(s)
    trimmedfq = fq1
    trimmedfq2 = fq2

    try:
        check_fastqs(fq1, fq2, mcc_out, min_length=30, log=log)

        if "trimgalore" in methods:
            mccutils.log("processing", "running trim_galore", log=log)
            if fq2 == "None":
                flags = trimgalore.SINGLE_END_FLAGS
                trimmedfq = run_trim_galore(fq1, run_id, log, mcc_out, cores=processors, flags=flags)
            else:
                flags = trimgalore.PAIRED_END_FLAGS
                trimmedfq, trimmedfq2 = run_trim_galore(fq1, run_id, log, mcc_out, fq2=fq2, cores=processors, flags=flags)
            
            run_multiqc(mcc_out+"/results/trimgalore/")
            
        
        # make unzipped copies in mcc input dir        
        make_copies(trimmedfq, trimmedfq2, snakemake.output[0], snakemake.output[1])
    
        # removes trimmed read files from trimgalore directory
        if trimmedfq != fq1:
            mccutils.remove(trimmedfq)
        if trimmedfq2 != fq2:
            mccutils.remove(trimmedfq2)

    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("ERROR: processing of FastQ files failed... check that your FastQ files are formatted correctly... exiting...", file=sys.stderr)
        mccutils.remove(snakemake.output[0])
        mccutils.remove(snakemake.output[1])
        sys.exit(1)


    # now = datetime.now()
    # end = now.strftime("%Y-%m-%d %H:%M:%S")
    # mccutils.log("setup_reads", "start: "+start)
    # mccutils.log("setup_reads", "end: "+end)

    mccutils.log("processing", "read setup complete")
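
Example #30 also leans on helpers that are not shown, notably make_copies() and run_multiqc(). Hypothetical minimal versions, assuming make_copies only needs to place (possibly gzipped) reads at the expected McClintock input paths and run_multiqc simply aggregates the trim_galore/FastQC reports:

def make_copies(fq1, fq2, out_fq1, out_fq2):
    # hypothetical helper: write unzipped copies of the reads to the expected output paths
    for fq, out_fq in [(fq1, out_fq1), (fq2, out_fq2)]:
        if fq == "None":
            mccutils.run_command(["touch", out_fq])
        elif fq.endswith(".gz"):
            mccutils.run_command_stdout(["zcat", fq], out_fq)
        else:
            mccutils.run_command(["cp", fq, out_fq])


def run_multiqc(report_dir):
    # aggregate FastQC/trim_galore reports with MultiQC
    mccutils.run_command(["multiqc", "-o", report_dir, report_dir])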