Пример #1
0
def make_local_css_js_copies(css_dir, js_dir, out_dir):
    """Copy the entries of ``css_dir`` and ``js_dir`` below ``out_dir``.

    Creates ``out_dir/html/``, ``out_dir/css/`` and ``out_dir/js/`` and copies
    every directory entry of the two source directories into the matching
    sub-directory with ``cp``.
    """
    mccutils.mkdir(out_dir + "/html/")

    # (source directory, destination sub-directory); css is handled before js
    for source_dir, subdir in ((css_dir, "/css/"), (js_dir, "/js/")):
        destination = out_dir + subdir
        mccutils.mkdir(destination)
        for entry in os.listdir(source_dir):
            mccutils.run_command(["cp", source_dir + "/" + entry, destination])
Пример #2
0
def make_consensus_beds(elements, ref_name, te_bed, taxon, out):
    """Split a TE BED file into one BED file per TE family.

    Args:
        elements: Container of family names; a BED file is only written for
            families found in it (used for membership tests only).
        ref_name: Prefix used when naming the generated files.
        te_bed: Path to a tab-delimited BED file whose 4th column is matched
            against the element IDs read from ``taxon``.
        taxon: Path to a tab-delimited file with the element ID in column 1
            and its family in column 2.
        out: Directory under which ``split_bed/`` is created.

    Returns:
        Path of the ``.locationlist`` file, which holds one
        ``family<TAB>bed path`` line per BED file written.
    """
    out_dir = out + "/split_bed/"
    mccutils.mkdir(out_dir)
    location_file = out_dir + ref_name + ".locationlist"

    # family -> set of element IDs; a set keeps the per-BED-line membership
    # test O(1), and the dict preserves first-seen family order.
    taxon_map = {}
    with open(taxon, "r") as t:
        for line in t:
            if not line.strip():
                # tolerate blank lines instead of failing on a missing column
                continue
            split_line = line.split("\t")
            element = split_line[0]
            element_fam = split_line[1].replace("\n", "")
            taxon_map.setdefault(element_fam, set()).add(element)

    # characters replaced by "_" when a family name is used in a file name
    special_chars = ';&()|*?[]~{}<!^"\'\\$/'
    sanitize = str.maketrans(special_chars, "_" * len(special_chars))

    with open(location_file, "w") as locations:
        for fam, members in taxon_map.items():
            if fam not in elements:
                continue

            bed_name = out_dir + ref_name + "_" + fam.translate(sanitize) + ".bed"
            locations.write(fam + "\t" + bed_name + "\n")
            with open(bed_name, "w") as outbed, open(te_bed, "r") as inbed:
                for line in inbed:
                    if not line.strip():
                        continue
                    if line.split("\t")[3] in members:
                        outbed.write(line)

    return location_file
Пример #3
0
def main():
    """Download the pinned ngs_te_mapper archive and install it under tools/.

    The archive is downloaded (up to 3 attempts, checked against the md5 from
    the snakemake params), unzipped, moved into ``<install>/tools/`` and its
    contents are then moved into ``<install>/tools/ngs_te_mapper``. The
    process exits with status 1 when the download fails.
    """
    params = snakemake.params
    install_root = snakemake.config['paths']['install']
    tools_dir = install_root + "/tools/"

    raw_name = "ngs_te_mapper-f9f48996ac346ac86d57edbd00534aa1227b753e"
    unpacked_dir = install_root + raw_name  # where the unzipped tree is expected
    staged_dir = tools_dir + raw_name
    final_dir = tools_dir + "ngs_te_mapper"

    mccutils.remove(params.zipfile)
    downloaded = mccutils.download(params.url,
                                   params.zipfile,
                                   md5=params.md5,
                                   max_attempts=3)
    if not downloaded:
        print("ngs_te_mapper download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    # unpack a fresh copy of the archive
    mccutils.remove(unpacked_dir)
    mccutils.run_command(["unzip", params.zipfile], log=params.log)

    # relocate the unpacked tree into the tools directory
    mccutils.remove(staged_dir)
    mccutils.run_command(["mv", unpacked_dir, tools_dir], log=params.log)

    # move the tree's contents into a directory with a stable name
    mccutils.remove(final_dir)
    mccutils.mkdir(final_dir)
    for entry in os.listdir(staged_dir):
        mccutils.run_command(["mv", staged_dir + "/" + entry, final_dir],
                             log=params.log)

    # clean up the emptied tree and the downloaded archive
    mccutils.remove(staged_dir)
    mccutils.remove(params.zipfile)
Пример #4
0
def main():
    """Run the TE depth-of-coverage analysis for one sample.

    Masks the reference with the consensus TEs, augments it with the TE
    sequences used for coverage, maps the reads, computes the genome depth
    over non-TE regions, writes the depth table and plots, and finally removes
    the SAM and BAM files.
    """
    results_dir = snakemake.config["args"]['out'] + "/results/"
    mccutils.mkdir(results_dir)
    coverage_out = results_dir + "coverage/"
    mccutils.mkdir(coverage_out)

    # start from an empty directory: drop leftovers of previous runs
    for leftover in os.listdir(coverage_out):
        mccutils.remove(coverage_out + "/" + leftover)

    run_id = snakemake.config['args']['run_id']
    te_seqs = snakemake.input.consensus
    log = snakemake.params.log

    # the consensus fasta is always the one used for masking the genome
    mccutils.mkdir(coverage_out + "/input")
    masked_reference, masked_gff = repeatmask_genome(
        snakemake.input.ref, te_seqs, snakemake.threads, run_id, coverage_out,
        log)

    # a dedicated coverage fasta, when given, replaces the consensus for
    # augmenting the genome and for the coverage analysis
    if snakemake.config['in']['coverage_fasta'] != "None":
        te_seqs = snakemake.input.coverage_fa

    augmented_reference = augment_genome(masked_reference, te_seqs, run_id,
                                         coverage_out)
    for fasta in (snakemake.input.ref, augmented_reference):
        index_genome(fasta, log)

    # the second fastq is only passed along for paired-end data
    paired_kwargs = {}
    if snakemake.config['in']['fq2'] != "None":
        paired_kwargs["fq2"] = snakemake.input.fq2
    sam = map_reads(augmented_reference, snakemake.input.fq1,
                    snakemake.threads, snakemake.params.sample, run_id,
                    coverage_out, log, **paired_kwargs)

    bam = sam_to_bam(sam, augmented_reference, snakemake.params.sample,
                     snakemake.threads, run_id, coverage_out, log)
    nonte_bed = make_nonte_bed(snakemake.input.ref, masked_gff, run_id,
                               coverage_out, log)
    genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log)

    # value handed to make_depth_table/make_plots as trim_edges
    if not config.OMIT_EDGES:
        edge_trim = 0
    elif config.OMIT_EDGES_READ_LENGTH:
        edge_trim = mccutils.estimate_read_length(snakemake.input.fq1)
    else:
        edge_trim = config.OMIT_EDGES_LENGTH

    depth_results = make_depth_table(te_seqs, bam, genome_depth, run_id,
                                     coverage_out, snakemake.output[0], log,
                                     trim_edges=edge_trim)
    te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths = depth_results
    make_plots(te_names, all_coverage_files, uniq_coverage_files,
               avg_norm_te_depths, genome_depth, snakemake.params.sample,
               coverage_out, trim_edges=edge_trim)

    for alignment in (sam, bam):
        mccutils.remove(alignment)
Пример #5
0
def parse_args():
    """Parse and normalize the McClintock command line.

    With ``--install`` the dependencies are installed through ``install()``
    and the process exits with status 0. Otherwise the path arguments are
    passed through ``mccutils.get_abs_path``, the output directory is created
    when ``-o`` is given, and the method list is lower-cased and validated.

    The process exits with status 1 when the output directory cannot be
    created, when a requested method is not valid for the supplied data, or
    when ``-g`` is given without ``-t``.

    Returns:
        argparse.Namespace holding the normalized arguments.
    """
    parser = argparse.ArgumentParser(
        prog='McClintock',
        description="Meta-pipeline to identify transposable element insertions using next generation sequencing data"
    )

    # -r/-c/-1 are mandatory for a normal run but not for an install-only call
    run_inputs_required = '--install' not in sys.argv

    ## required ##
    parser.add_argument(
        "-r", "--reference", type=str, required=run_inputs_required,
        help="A reference genome sequence in fasta format")
    parser.add_argument(
        "-c", "--consensus", type=str, required=run_inputs_required,
        help="The consensus sequences of the TEs for the species in fasta format")
    parser.add_argument(
        "-1", "--first", type=str, required=run_inputs_required,
        help="The path of the first fastq file from paired end read sequencing or the fastq file from single read sequencing")

    ## optional ##
    parser.add_argument(
        "-2", "--second", type=str,
        help="The path of the second fastq file from a paired end read sequencing")
    parser.add_argument(
        "-p", "--proc", type=int, default=1,
        help="The number of processors to use for parallel stages of the pipeline [default = 1]")
    parser.add_argument(
        "-o", "--out", type=str,
        help="An output folder for the run. [default = '.']")
    parser.add_argument(
        "-m", "--methods", type=str,
        help="A comma-delimited list containing the software you want the pipeline to use for analysis. e.g. '-m relocate,TEMP,ngs_te_mapper' will launch only those three methods")
    parser.add_argument(
        "-g", "--locations", type=str,
        help="The locations of known TEs in the reference genome in GFF 3 format. This must include a unique ID attribute for every entry")
    parser.add_argument(
        "-t", "--taxonomy", type=str,
        help="A tab delimited file with one entry per ID in the GFF file and two columns: the first containing the ID and the second containing the TE family it belongs to. The family should correspond to the names of the sequences in the consensus fasta file")
    parser.add_argument(
        "-s", "--coverage_fasta", type=str,
        help="A fasta file that will be used for TE-based coverage analysis, if not supplied then the consensus sequences of the TEs will be used for the analysis")
    # NOTE: the -d/--coverage, -D/--coverage_only, -b/--keep_bam and
    # -i/--remove_intermediate options are currently disabled.
    parser.add_argument(
        "-T", "--comments", action="store_true",
        help="If this option is specified then fastq comments (e.g. barcode) will be incorporated to SAM output. Warning: do not use this option if the input fastq files do not have comments")
    parser.add_argument(
        "-a", "--augment", type=str,
        help="A fasta file of TE sequences that will be included as extra chromosomes in the reference file (useful if the organism is known to have TEs that are not present in the reference strain)")
    parser.add_argument(
        "--clean", action="store_true",
        help="This option will make sure mcclintock runs from scratch and doesn't reuse files already created")
    parser.add_argument(
        "--install", action="store_true",
        help="This option will install the dependencies of mcclintock")
    parser.add_argument(
        "--debug", action="store_true",
        help="This option will allow snakemake to print progress to stdout")
    parser.add_argument(
        "--slow", action="store_true",
        help="This option runs without attempting to optimize thread usage to run rules concurrently. Each multithread rule will use the max processors designated by -p/--proc")

    args = parser.parse_args()

    # store_true flags (--debug, --comments, ...) are always booleans, so no
    # None handling is needed for them.

    if args.install:
        mccutils.log("installation", "installing dependencies")
        mccutils.log("installation", "WARNING: this could take awhile")
        install(clean=args.clean, debug=args.debug)
        sys.exit(0)

    # check -r
    args.reference = mccutils.get_abs_path(args.reference)
    # check -c
    args.consensus = mccutils.get_abs_path(args.consensus)
    # check -1
    args.first = mccutils.get_abs_path(args.first)
    # check -2
    if args.second is not None:
        args.second = mccutils.get_abs_path(args.second)

    # check -o (the directory is only created when -o was given explicitly)
    if args.out is None:
        args.out = os.path.abspath(".")
    else:
        args.out = os.path.abspath(args.out)
        try:
            mccutils.mkdir(args.out)
        except Exception:
            track = traceback.format_exc()
            print(track, file=sys.stderr)
            print("cannot create output directory: ",
                  args.out,
                  "exiting...",
                  file=sys.stderr)
            sys.exit(1)

    # check -m
    # a single fastq is treated as single-ended data, which restricts the
    # set of methods that can be run
    if args.second is None:
        valid_methods = config.SINGLE_END_METHODS  # from config.py
    else:
        valid_methods = config.ALL_METHODS  # from config.py

    if args.methods is None:
        args.methods = valid_methods
    else:
        requested = args.methods.split(",")
        for method in requested:
            if method.lower() not in valid_methods:
                sys.stderr.write(" ".join([
                    "Method:", method, "not a valid method...",
                    "Valid methods:", " ".join(valid_methods), "\n"
                ]))
                sys.exit(1)
        args.methods = [method.lower() for method in requested]

    # check -g
    if args.locations is not None:
        args.locations = mccutils.get_abs_path(args.locations)

        if args.taxonomy is None:
            sys.stderr.write(
                "If a GFF file is supplied (-g/--locations) then a TE taxonomy file that links it to the fasta consensus is also needed (-t/--taxonomy)...exiting...\n"
            )
            sys.exit(1)

    # check -t
    if args.taxonomy is not None:
        args.taxonomy = mccutils.get_abs_path(args.taxonomy)

    # check -s
    if args.coverage_fasta is not None:
        args.coverage_fasta = mccutils.get_abs_path(args.coverage_fasta)

    # check -a
    if args.augment is not None:
        args.augment = mccutils.get_abs_path(args.augment)

    return args
Пример #6
0
def make_run_config(args, sample_name, ref_name, full_command,
                    current_directory):
    """Write the JSON run configuration and return the run id.

    A random 7-digit run id is drawn, the snakemake config and log
    directories are created under ``args.out``, and the run settings, input
    paths, intermediate paths and environment paths are dumped to
    ``<out>/snakemake/config/config_<run_id>.json``.

    NOTE: ``config.OUT_PATHS``, ``config.INTERMEDIATE_PATHS`` and
    ``config_install.ENV`` are updated in place while their placeholders are
    substituted.

    Returns:
        The integer run id.
    """
    run_id = random.randint(1000000, 9999999)
    config_dir = args.out + "/snakemake/config"
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(config_dir)
    run_config = config_dir + "/config_" + str(run_id) + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"

    # fill the placeholders of every output path template
    out_files = config.OUT_PATHS
    for key in out_files:
        resolved = out_files[key].replace(config.INPUT_DIR, input_dir)
        resolved = resolved.replace(config.RESULTS_DIR, results_dir)
        out_files[key] = resolved.replace(config.SAMPLE_NAME, sample_name)

    out_files_to_make = [out_files[method] for method in args.methods]

    now = datetime.now()
    timestamp = now.strftime("%Y%m%d.%H%M%S")
    log_dir = args.out + "/logs/" + timestamp + "." + str(run_id) + "/"
    mccutils.mkdir(log_dir)

    # sequence ids of the reference, with special characters replaced
    chromosomes = [
        mccutils.replace_special_chars(str(record.id))
        for record in SeqIO.parse(args.reference, "fasta")
    ]

    mcc_path = os.path.dirname(os.path.abspath(__file__))
    max_threads = max(
        1,
        calculate_max_threads(args.proc,
                              args.methods,
                              config.MULTI_THREAD_METHODS,
                              slow=args.slow))

    run_args = {}
    run_args['proc'] = str(args.proc)
    run_args['out'] = str(args.out)
    run_args['log_dir'] = log_dir
    run_args['augment_fasta'] = str(args.augment)
    run_args['mcc_path'] = mcc_path
    run_args['sample_name'] = sample_name
    run_args['ref_name'] = ref_name
    run_args['run_id'] = str(run_id)
    run_args['methods'] = ",".join(args.methods)
    run_args['out_files'] = ",".join(out_files_to_make)
    run_args['save_comments'] = str(args.comments)
    run_args['max_threads_per_rule'] = max_threads
    run_args['full_command'] = full_command
    run_args['call_directory'] = current_directory
    run_args['time'] = now.strftime("%Y-%m-%d %H:%M:%S")
    run_args["chromosomes"] = ",".join(chromosomes)

    # input paths for files
    input_paths = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    data = {'args': run_args, 'in': input_paths}

    # where mcc copies will be stored
    intermediate = config.INTERMEDIATE_PATHS
    data["mcc"] = intermediate
    for key in intermediate:
        resolved = intermediate[key].replace(config.INPUT_DIR, input_dir)
        resolved = resolved.replace(config.REF_NAME, ref_name)
        intermediate[key] = resolved.replace(config.SAMPLE_NAME, sample_name)

    env_path = mcc_path + "/install/envs/"
    envs = config_install.ENV
    data["envs"] = envs
    for key in envs:
        envs[key] = envs[key].replace(config_install.ENV_PATH, env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
Пример #7
0
def main():
    """Run the PopoolationTE2 steps for one sample.

    When the status file reports that the previous step did not succeed, the
    expected output is only touched. Otherwise the ppileup, subsample,
    signature, strand, frequency and pairup steps are run in order and
    "COMPLETED" is written to the status log. Any exception is logged,
    "FAILED" is written to the status log and the output is touched.
    """
    mccutils.log("popoolationte2", "running PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    bam = snakemake.input.bam
    taxonomy = snakemake.input.taxonomy
    jar = snakemake.params.jar
    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    status_log = snakemake.params.status_log

    if not mccutils.check_status_file(status_log):
        # the previous step failed: only create a placeholder output
        mccutils.run_command(["touch", snakemake.output[0]])
        return

    try:
        tmp_dir = out_dir + "/tmp"
        mccutils.mkdir(tmp_dir)
        taxonomy = format_taxonomy(taxonomy, out_dir)

        full_ppileup = popoolationte2_ppileup(
            jar, config.PARAMS["ppileup"], bam, taxonomy, out_dir, log=log)
        ppileup = popoolationte2_subsample(
            jar, config.PARAMS["subsampleppileup"], full_ppileup, out_dir,
            log=log)

        raw_signatures = popoolationte2_signatures(
            jar, config.PARAMS["identifySignatures"], ppileup, out_dir,
            log=log)
        stranded_signatures = popoolationte2_strand(
            jar, config.PARAMS["updateStrand"], raw_signatures, bam, taxonomy,
            out_dir, log=log)
        signatures = popoolationte2_frequency(
            jar, ppileup, stranded_signatures, out_dir, log=log)

        popoolationte2_pairup(
            jar, config.PARAMS["pairupSignatures"], signatures, ref_fasta,
            taxonomy, out_dir, log=log)

        mccutils.remove(tmp_dir)
        mccutils.check_file_exists(snakemake.output[0])

        with open(status_log, "w") as status:
            status.write("COMPLETED\n")
        mccutils.log("popoolationte2", "popoolationte2 run complete")

    except Exception:
        # report the failure on stderr and in the log, flag it in the status
        # file, then still create the expected output
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)
        mccutils.log("popoolationte2", "popoolationte2 run failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        mccutils.run_command(["touch", snakemake.output[0]])
Пример #8
0
def install(clean=False, debug=False):
    """Install the conda environments and scripts of every method.

    Writes ``install/config.json`` next to this file, then runs snakemake
    once to create the conda environment and once to install the scripts for
    every method of ``config.ALL_METHODS`` that is not listed in
    ``config.NO_INSTALL_METHODS``, and finally creates the environment for
    the processing steps.

    NOTE: ``config_install.ENV`` and ``config_install.OUTPUT`` are updated in
    place while their placeholders are substituted.

    Args:
        clean: When true, the conda environment directory and the installed
            tools are removed first.
        debug: When false, ``--quiet`` is appended to each snakemake call.
    """
    mcc_path = os.path.dirname(os.path.abspath(__file__))
    install_path = mcc_path + "/install/"
    install_config = install_path + "/config.json"
    log_dir = install_path + "/log/"
    conda_env_dir = install_path + "/envs/conda"

    def snakemake_command(target, envs_only):
        # shared snakemake invocation; --quiet goes last unless debugging
        cmd = [
            "snakemake", "--use-conda", "--conda-prefix", conda_env_dir,
            "--configfile", install_config, "--cores", "1", "--nolock"
        ]
        if envs_only:
            cmd.append("--create-envs-only")
        cmd.append(target)
        if not debug:
            cmd.append("--quiet")
        return cmd

    data = {
        'paths': {
            'mcc_path': mcc_path,
            'install': install_path,
            'log_dir': log_dir
        },
        'URLs': config_install.URL,
        'MD5s': config_install.MD5,
        'ENVs': config_install.ENV,
        'output': config_install.OUTPUT,
    }

    envs = data['ENVs']
    for method in envs:
        envs[method] = envs[method].replace(config_install.ENV_PATH,
                                            install_path + "envs/")

    outputs = data['output']
    for method in outputs:
        outputs[method] = outputs[method].replace(
            config_install.INSTALL_PATH, install_path)

    with open(install_config, "w") as c:
        json.dump(data, c, indent=4)

    previous_log = install_path + "install.log"
    if os.path.exists(previous_log):
        os.remove(previous_log)

    # removes installed tools and conda environments
    if clean:
        mccutils.log("install", "Removing conda envs from: " + conda_env_dir)
        mccutils.log(
            "install",
            "Removing installed tools from: " + install_path + "tools")
        mccutils.remove(conda_env_dir)
        mccutils.remove(install_path + "/tools")

    mccutils.mkdir(conda_env_dir)
    os.chdir(install_path)
    mccutils.mkdir(log_dir)

    for env in config.ALL_METHODS:
        if env in config.NO_INSTALL_METHODS:
            continue

        mccutils.log("install", "Installing conda environment for: " + env)
        mccutils.run_command(snakemake_command(outputs[env], True))

        mccutils.log("install", "Installing scripts for:" + env)
        mccutils.run_command(snakemake_command(outputs[env], False))

    mccutils.log("install",
                 "Installing conda environment for processing steps")
    mccutils.run_command(snakemake_command(outputs['processing'], True))
Пример #9
0
def main():
    """Download, unpack, patch and install RelocaTE under tools/relocate.

    The archive is downloaded (up to 3 attempts, checked against the md5 from
    the snakemake params), unzipped, moved into ``<install>/tools/`` and its
    contents are moved into ``<install>/tools/relocate/``. The insertion
    finder script is patched, every ``.pl`` script is rewritten (see below)
    and the md5 is written to ``version.log``. The process exits with status
    1 when the download fails.
    """
    install_root = snakemake.config['paths']['install']
    install_path = install_root + "/tools/"
    raw_name = "RelocaTE-ce3a2066e15f5c14e2887fdf8dce0485e1750e5b"
    relocate_dir = install_path + "relocate/"
    scripts_dir = relocate_dir + "scripts/"

    mccutils.remove(snakemake.params.zipfile)
    download_success = mccutils.download(snakemake.params.url,
                                         snakemake.params.zipfile,
                                         md5=snakemake.params.md5,
                                         max_attempts=3)

    if not download_success:
        print("relocaTE download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    # unpack a fresh copy of the archive
    mccutils.remove(install_root + raw_name)
    command = ["unzip", snakemake.params.zipfile]
    mccutils.run_command(command, log=snakemake.params.log)

    # relocate the unpacked tree into the tools directory
    mccutils.remove(install_path + raw_name)
    command = ["mv", install_root + raw_name, install_path]
    mccutils.run_command(command, log=snakemake.params.log)

    # move the tree's contents into a directory with a stable name
    mccutils.remove(install_path + "relocate")
    mccutils.mkdir(relocate_dir)
    for f in os.listdir(install_path + raw_name):
        command = ["mv", install_path + raw_name + "/" + f, relocate_dir]
        mccutils.run_command(command, log=snakemake.params.log)

    command = [
        "patch", "-i", snakemake.params.patch,
        scripts_dir + "relocaTE_insertionFinder.pl"
    ]
    mccutils.run_command(command, log=snakemake.params.log)

    mccutils.remove(install_path + raw_name)
    mccutils.remove(snakemake.params.zipfile)

    # Rewrite every perl script through a temporary file:
    #   * use an env-based shebang instead of a hard-coded /usr/bin/perl
    #   * drop "defined" in front of arrays (presumably because newer perl
    #     versions reject "defined @array" - TODO confirm)
    #   * in relocaTE.pl, call the helper scripts through "perl"
    tmp_path = install_path + "tmp"
    for f in os.listdir(scripts_dir):
        if "pl" != f.split(".")[-1]:
            continue

        with open(tmp_path, "w") as tmp, open(scripts_dir + f, "r") as script:
            for line in script:
                if "#!/usr/bin/perl" in line:
                    line = "#!/usr/bin/env perl\n"
                elif "defined @" in line:
                    line = line.replace("defined @", "@")
                elif "$scripts/" in line and "perl" not in line and "relocaTE.pl" in f:
                    line = line.replace("$scripts/", "perl $scripts/")

                tmp.write(line)

        mccutils.run_command(["mv", tmp_path, scripts_dir + f])

    # write version to file
    with open(relocate_dir + "version.log", "w") as version:
        version.write(snakemake.params.md5)
Пример #10
0
def main():
    """Run TE-Locate on the sample SAM file.

    The output directory is emptied, the SAM file is symlinked into
    ``<out_dir>/sam/``, the TE-Locate script is run from its own directory and
    its ``.info`` result is copied to ``te-locate-raw.info``. "COMPLETED" is
    written to the status log on success; on any exception the traceback is
    logged, "FAILED" is written and the expected output is touched.
    """
    te_gff = snakemake.input.te_gff
    sam = snakemake.input.sam
    ref_fasta = snakemake.input.ref
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log
    status_log = snakemake.params.status_log

    mccutils.log("te-locate", "running TE-Locate", log=log)
    with open(log, "a") as log_handle:
        for label, path in (("TE GFF: ", te_gff), ("SAM: ", sam),
                            ("reference fasta: ", ref_fasta)):
            log_handle.write(label + path + "\n")

    telocate = snakemake.params.run_script
    out_dir = snakemake.params.out_dir

    try:
        # start from an empty directory: drop leftovers of previous runs
        for leftover in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + leftover)

        # the script is given a directory of SAM files, so link ours into one
        sam_dir = out_dir + "/sam/"
        mccutils.mkdir(sam_dir)
        te_locate_sam = sam_dir + "te-locate.sam"
        if os.path.exists(te_locate_sam):
            os.remove(te_locate_sam)
        os.symlink(sam, te_locate_sam)

        # the script is run from the directory it lives in
        os.chdir(os.path.dirname(telocate))

        median_insert_size = mccutils.get_median_insert_size(
            median_insert_size_file)
        distance = (median_insert_size * config.PARAMS["min_distance"])

        tool_params = [
            str(config.PARAMS["max_mem"]), sam_dir, te_gff, ref_fasta, out_dir,
            str(distance),
            str(config.PARAMS["min_support_reads"]),
            str(config.PARAMS["min_support_individuals"])
        ]
        mccutils.run_command(["perl", telocate] + tool_params, log=log)

        # result file expected from the run; copied to a stable name
        raw_info = out_dir + "_" + str(distance) + "_reads3_acc1.info"
        mccutils.check_file_exists(raw_info)
        mccutils.run_command(["cp", raw_info, out_dir + "te-locate-raw.info"])

        mccutils.log("te-locate", "TE-Locate complete")
        with open(status_log, "w") as status:
            status.write("COMPLETED\n")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)
        mccutils.log("telocate", "TE-locate run failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        mccutils.run_command(["touch", snakemake.output[0]])
Пример #11
0
def main():
    """Run RelocaTE for one sample and combine its GFF output.

    Inputs are symlinked into ``<out_dir>/input/``, an annotation file is
    built from the TE GFF, ``relocaTE.pl`` is run from inside ``out_dir`` and
    the resulting GFFs are combined into the snakemake output. The fastq
    links and the ``te_containing_fq`` directories are removed afterwards.
    """
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    raw_fq2 = snakemake.params.raw_fq2
    # "None" (a string) marks single-ended data
    is_paired = raw_fq2 != "None"

    script_dir = snakemake.params.script_dir
    out_dir = snakemake.params.out_dir
    out_gff = snakemake.output[0]

    mccutils.log("relocate", "running RelocaTE", log=log)

    # rebuild the input directory from scratch
    input_dir = snakemake.params.out_dir + "/input/"
    mccutils.remove(input_dir)
    mccutils.mkdir(input_dir)
    fq_dir = input_dir + "fastq/"
    mccutils.mkdir(fq_dir)

    consensus_fasta = input_dir + "consensus.fasta"
    te_gff = input_dir + "te.gff"
    reference_fasta = input_dir + "reference.fasta"

    links = [
        (snakemake.input.consensus_fasta, consensus_fasta),
        (snakemake.input.te_gff, te_gff),
        (snakemake.input.reference_fasta, reference_fasta),
    ]
    if is_paired:
        links.append((snakemake.input.fq1, fq_dir + sample_name + "_1.fq"))
        links.append((snakemake.input.fq2, fq_dir + sample_name + "_2.fq"))
    else:
        links.append(
            (snakemake.input.fq1, fq_dir + sample_name + ".unPaired.fq"))
    for source, link_name in links:
        os.symlink(source, link_name)

    annotation = make_annotation_file(te_gff, out_dir)
    os.chdir(out_dir)

    command = [
        "perl", script_dir + "/relocaTE.pl", "-t", consensus_fasta, "-d",
        fq_dir, "-g", reference_fasta, "-o", ".", "-r", annotation
    ]
    # numeric settings taken from the config, passed as "-<key> <value>"
    for key in ('l', 'm', 'bm', 'bt', 'f'):
        command += ["-" + key, str(config.RELOCATE[key])]

    # fastq suffixes that identify the mates (or the single-ended file)
    if is_paired:
        command += ["-1", "_1", "-2", "_2"]
    else:
        command += ["-u", "unPaired"]

    mccutils.run_command(command, log=log)
    combine_gffs(out_dir, out_gff)

    mccutils.remove(out_dir + "/input/fastq")
    # entry names are relative paths here; they resolve against out_dir
    # because of the chdir above
    for entry in os.listdir(out_dir):
        fq_subdir = entry + "/te_containing_fq/"
        if os.path.exists(fq_subdir):
            mccutils.remove(fq_subdir)

    mccutils.log("relocate", "RelocaTE run complete")
Пример #12
0
def _write_normalized_cov(depth_file, normalized_file, genome_depth):
    """Write a copy of a samtools depth file with depths divided by genome_depth."""
    chrom, positions, depths = read_samtools_depth_file(depth_file)
    with open(normalized_file, "w") as covfile:
        for i, pos in enumerate(positions):
            cov = str(round(depths[i] / genome_depth, 2))
            covfile.write("\t".join([chrom, str(pos), cov]) + "\n")


def make_depth_table(te_fasta,
                     bam,
                     genome_depth,
                     run_id,
                     out,
                     depth_csv,
                     log,
                     trim_edges=0):
    """Compute per-TE normalized depth and write it to a CSV table.

    For every header line of ``te_fasta`` the sequence name is passed to
    ``samtools depth`` twice (``-Q 1`` and ``-Q 0``); the raw and
    genome-depth-normalized coverage files are written under
    ``<out>/te-depth-files/`` and one row is appended to ``depth_csv``.

    Args:
        te_fasta: FASTA file whose header lines name the TE sequences.
        bam: alignment file handed to ``samtools depth``.
        genome_depth: divisor used to normalize every depth value.
        run_id: accepted for interface compatibility; not used here.
        out: directory under which ``te-depth-files`` is created.
        depth_csv: path of the CSV table to (re)create.
        log: log file passed to the mccutils helpers.
        trim_edges: forwarded to ``get_avg_depth``.

    Returns:
        A tuple ``(te_names, all_coverage_files, uniq_coverage_files,
        avg_norm_depths)`` of parallel lists, one entry per TE.
    """
    mccutils.log("coverage", "creating TE depth coverage table", log=log)
    with open(depth_csv, "w") as table:
        table.write("TE-Family,Normalized-Depth,Normalized-Unique-Depth" +
                    "\n")

    te_names = []
    uniq_coverage_files = []
    all_coverage_files = []
    avg_norm_depths = []

    depth_dir = out + "/te-depth-files"
    with open(te_fasta, "r") as fa:
        # the output directory does not depend on the TE, so create it once
        mccutils.mkdir(depth_dir)
        for line in fa:
            if ">" not in line:
                continue

            te_name = line.replace("\n", "").replace(">", "")
            cov_prefix = depth_dir + "/" + te_name

            # -Q 1 coverage (reported as the "unique" depth)
            highQ = cov_prefix + ".highQ.cov"
            command = [
                "samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0",
                "-Q", "1"
            ]
            mccutils.run_command_stdout(command, highQ, log=log)

            # -Q 0 coverage (all alignments)
            allQ = cov_prefix + ".allQ.cov"
            command = [
                "samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0",
                "-Q", "0"
            ]
            mccutils.run_command_stdout(command, allQ, log=log)

            # make normalized coverage files
            _write_normalized_cov(allQ, cov_prefix + ".allQ.normalized.cov",
                                  genome_depth)
            _write_normalized_cov(highQ, cov_prefix + ".highQ.normalized.cov",
                                  genome_depth)

            avg_depth = get_avg_depth(allQ, trim_edges=trim_edges)
            avg_norm_depth = avg_depth / genome_depth

            avg_uniq_depth = get_avg_depth(highQ, trim_edges=trim_edges)
            avg_uniq_norm_depth = avg_uniq_depth / genome_depth

            # append per TE so finished rows survive a later failure
            with open(depth_csv, "a") as table:
                table.write(te_name + "," + str(round(avg_norm_depth, 2)) +
                            "," + str(round(avg_uniq_norm_depth, 2)) + "\n")

            te_names.append(te_name)
            uniq_coverage_files.append(highQ)
            all_coverage_files.append(allQ)
            avg_norm_depths.append(avg_norm_depth)

    return te_names, all_coverage_files, uniq_coverage_files, avg_norm_depths
Пример #13
0
def install(methods, resume=False, debug=False):
    """Install the scripts and conda environments for the requested methods.

    Writes ``install/config.json`` next to this module, then runs snakemake
    once per method with that config to build its install target.

    Args:
        methods: names of the methods to install. The list is not modified;
            "te-locate" is added internally when "temp" is requested.
        resume: when False, an existing conda env and ``tools/<method>``
            directory for each requested method are removed first.
        debug: when False, ``--quiet`` is passed to snakemake.
    """
    mcc_path = os.path.dirname(os.path.abspath(__file__))
    install_path = mcc_path+"/install/"
    install_config = install_path+"/config.json"
    log_dir = install_path+"/log/"
    conda_env_dir = install_path+"/envs/conda"
    data = {}
    data['paths'] = {
        'mcc_path': mcc_path,
        'install' : install_path,
        'log_dir': log_dir
    }

    # NOTE(review): these alias the config_install dicts, so the placeholder
    # substitution below also rewrites the module-level values in place.
    data['URLs'] = config_install.URL
    data['MD5s'] = config_install.MD5
    data['ENVs'] = config_install.ENV
    data['output'] = config_install.OUTPUT

    for method in data['ENVs']:
        data['ENVs'][method] = data['ENVs'][method].replace(config_install.ENV_PATH, install_path+"envs/")

    for method in data['output']:
        data['output'][method] = data['output'][method].replace(config_install.INSTALL_PATH, install_path)

    with open(install_config,"w") as c:
        json.dump(data, c, indent=4)

    if os.path.exists(install_path+"install.log"):
        os.remove(install_path+"install.log")

    # finding existing conda yamls
    existing_envs = get_conda_envs(conda_env_dir)

    mccutils.mkdir(conda_env_dir)
    os.chdir(install_path)
    mccutils.mkdir(log_dir)

    # copy so the caller's list (possibly a shared config list) is not mutated
    methods = list(methods)

    # temp requires te-locate scripts to make taxonomy file
    if "temp" in methods and "te-locate" not in methods:
        methods.append("te-locate")

    for env in methods:
        if not resume:
            # remove existing envs
            if env in existing_envs:
                mccutils.log("install","Removing existing conda env for: "+env)
                mccutils.remove(existing_envs[env])
                mccutils.remove(existing_envs[env].replace(".yaml",""))

            # remove existing src code
            if os.path.exists(install_path+"/tools/"+env):
                mccutils.log("install","Removing existing installation of: "+env)
                mccutils.remove(install_path+"/tools/"+env)

        # reinstall src code
        mccutils.log("install","Installing scripts for:"+env)
        command = ["snakemake","--use-conda", "--conda-prefix", conda_env_dir, "--configfile", install_config, "--cores", "1", "--nolock", data['output'][env]]
        if not debug:
            command.append("--quiet")
        mccutils.run_command(command)
Пример #14
0
def main():
    """Run RelocaTE for one sample and record the outcome in the status log.

    Inputs, parameters and the output path come from the ``snakemake``
    object. The output directory is emptied, the inputs are symlinked under
    ``<out_dir>/input/``, ``relocaTE.pl`` is run from inside ``out_dir`` and
    the per-TE GFFs are combined into the output GFF.

    On success "COMPLETED" is written to the status log. On any exception
    the traceback goes to stderr and the log file, "FAILED" is written to
    the status log and the output file is touched.
    """
    params = snakemake.params
    sample_name = params.sample_name
    log = params.log
    is_paired = params.raw_fq2 != "None"
    script_dir = params.script_dir
    out_dir = params.out_dir
    status_log = params.status_log
    out_gff = snakemake.output[0]

    try:
        # start from a clean output directory
        for leftover in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + leftover)

        mccutils.log("relocate", "running RelocaTE", log=log)
        input_dir = out_dir + "/input/"
        mccutils.remove(input_dir)
        mccutils.mkdir(input_dir)
        fq_dir = input_dir + "fastq/"
        mccutils.mkdir(fq_dir)

        consensus_fasta = input_dir + "consensus.fasta"
        te_gff = input_dir + "te.gff"
        reference_fasta = input_dir + "reference.fasta"

        # draw a 5-digit tag that does not already occur in the fastq path
        while True:
            uniq_id = str(random.randint(10000, 99999))
            if uniq_id not in fq_dir:
                break
            mccutils.log("relocate",
                         "unique id: " + uniq_id +
                         " occurs in file path... selecting a new one...",
                         log=log)

        fq1_uniq_id = uniq_id + "_mcc_relocate_1"
        fq2_uniq_id = uniq_id + "_mcc_relocate_2"
        unpaired_id = uniq_id + "_unPaired"

        os.symlink(snakemake.input.consensus_fasta, consensus_fasta)
        os.symlink(snakemake.input.te_gff, te_gff)
        os.symlink(snakemake.input.reference_fasta, reference_fasta)

        fq_prefix = fq_dir + sample_name + "."
        if not is_paired:
            os.symlink(snakemake.input.fq1, fq_prefix + unpaired_id + ".fq")
        else:
            os.symlink(snakemake.input.fq1, fq_prefix + fq1_uniq_id + ".fq")
            os.symlink(snakemake.input.fq2, fq_prefix + fq2_uniq_id + ".fq")

        annotation = make_annotation_file(te_gff, out_dir)
        # relocaTE.pl is given "." as its output directory
        os.chdir(out_dir)

        command = ["perl", script_dir + "/relocaTE.pl"]
        command.extend(["-t", consensus_fasta])
        command.extend(["-d", fq_dir])
        command.extend(["-g", reference_fasta])
        command.extend(["-o", "."])
        command.extend(["-r", annotation])

        for param in config.PARAMS:
            command.extend([param, str(config.PARAMS[param])])

        if not is_paired:
            command.extend(["-u", unpaired_id])
        else:
            command.extend(["-1", fq1_uniq_id, "-2", fq2_uniq_id])

        mccutils.run_command(command, log=log)
        combine_gffs(out_dir, out_gff)

        mccutils.check_file_exists(out_gff)
        mccutils.log("relocate", "RelocaTE run complete")
        with open(status_log, "w") as status:
            status.write("COMPLETED\n")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)
        mccutils.log("relocate", "RelocaTE run failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        # leave an output file behind even though the run failed
        mccutils.run_command(["touch", out_gff])
Пример #15
0
def main():
    """Run RelocaTE2 for one sample using paths from the ``snakemake`` object.

    The output directory is emptied, the reference, consensus, RepeatMasker
    output and fastq inputs are symlinked under ``<out_dir>/input/``, the
    ``relocaTE2.py`` script is located with ``which`` and then executed with
    ``python2`` using the settings in ``config.RELOCATE2``.
    """
    sample_name = snakemake.params.sample_name
    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + f)

    is_paired = True
    if snakemake.params.raw_fq2 == "None":
        is_paired = False

    input_dir = snakemake.params.out_dir + "/input/"
    mccutils.remove(input_dir)
    mccutils.mkdir(input_dir)
    fq_dir = snakemake.params.out_dir + "/input/fastq/"
    mccutils.mkdir(fq_dir)

    reference = input_dir + "reference.fasta"
    te_seqs = input_dir + "consensus.fasta"
    rm_out = input_dir + "repeatmasker.out"

    os.symlink(snakemake.input.reference, reference)
    os.symlink(snakemake.input.te_seqs, te_seqs)
    os.symlink(snakemake.input.rm_out, rm_out)

    if is_paired:
        fq1 = fq_dir + sample_name + "_1.fq"
        fq2 = fq_dir + sample_name + "_2.fq"
        os.symlink(snakemake.input.fq1, fq1)
        os.symlink(snakemake.input.fq2, fq2)
    else:
        fq1 = fq_dir + sample_name + ".unPaired.fq"
        os.symlink(snakemake.input.fq1, fq1)

    median_insert_size = get_median_insert_size(median_insert_size_file)

    # subprocess.run waits for `which` and closes its pipes; a bare Popen
    # left the child un-reaped and the pipe descriptors open.
    which_result = subprocess.run(["which", "relocaTE2.py"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    script = which_result.stdout.decode().replace("\n", "")

    mccutils.log("relocate2", "running RelocaTE2", log=log)
    command = [
        "python2", script, "-t", te_seqs, "-g", reference, "-r", rm_out, "-o",
        out_dir, "-s",
        str(median_insert_size), "--run", "-v", "4", "-c",
        str(threads), "--aligner", config.RELOCATE2["aligner"],
        "--len_cut_match",
        str(config.RELOCATE2["len_cut_match"]), "--len_cut_trim",
        str(config.RELOCATE2["len_cut_trim"]), "--mismatch",
        str(config.RELOCATE2["mismatch"]), "--mismatch_junction",
        str(config.RELOCATE2["mismatch_junction"]), "-d", fq_dir
    ]

    # suffixes that let RelocaTE2 pick the symlinked fastq files in fq_dir
    if is_paired:
        command += ["-1", "_1", "-2", "_2"]

    else:
        command += ["-u", ".unPaired"]

    mccutils.run_command(command, log=log)

    mccutils.log("relocate2", "RelocaTE2 run complete")
Пример #16
0
def run_workflow(args,
                 sample_name,
                 ref_name,
                 run_id,
                 debug=False,
                 annotations_only=False):
    """Assemble and launch the snakemake command for one McClintock run.

    The placeholder tokens in ``config.OUT_PATHS`` are filled in for this
    run, the Snakefile is copied into ``<out>/snakemake/<run_id>`` (which
    becomes the working directory), the output directories are checked
    (fresh run) or earlier configs are checked for compatibility
    (``--resume``), and snakemake is run on the selected targets.

    Args:
        args: parsed command-line namespace (uses ``out``, ``proc``,
            ``resume``, ``methods`` and ``keep_intermediate``).
        sample_name: name of the sample sub-directory.
        ref_name: name of the reference sub-directory.
        run_id: identifier used in the config and snakemake directory names.
        debug: pass ``--reason`` to snakemake instead of ``--quiet``.
        annotations_only: only request the reference TE annotation targets.

    Exits the process on a non-empty output directory, an incompatible or
    missing previous config, or a failed snakemake run.
    """
    # kept from the original implementation; not used further in this function
    log = args.out + "/mcclintock." + str(run_id) + ".log"

    input_dir = args.out
    reference_dir = args.out + "/" + ref_name + "/"
    sample_dir = args.out + "/" + sample_name + "/"
    results_dir = args.out + "/" + sample_name + "/results/"

    # fill the path placeholders in place
    out_files = config.OUT_PATHS
    for key in out_files:
        out_files[key] = (out_files[key]
                          .replace(config.INPUT_DIR, input_dir)
                          .replace(config.REF_DIR, reference_dir)
                          .replace(config.SAM_DIR, sample_dir)
                          .replace(config.RESULTS_DIR, results_dir)
                          .replace(config.SAMPLE_NAME, sample_name))

    path = os.path.dirname(os.path.abspath(__file__))
    mccutils.mkdir(args.out + "/snakemake")
    snakemake_path = args.out + "/snakemake/" + str(run_id)
    mccutils.mkdir(snakemake_path)
    mccutils.run_command(["cp", path + "/Snakefile", snakemake_path])
    os.chdir(snakemake_path)

    config_dir = input_dir + "/snakemake/config/"
    run_config_name = "config_" + str(run_id) + ".json"
    run_config = config_dir + run_config_name

    command = ["snakemake", "--use-conda"]
    command.extend(["--conda-prefix", path + "/install/envs/conda"])
    command.append("--reason" if debug else "--quiet")
    command.extend(["--configfile", run_config])
    command.extend(["--cores", str(args.proc)])

    if args.resume:
        # check that previous runs are compatible
        mccutils.log(
            "setup",
            "Checking config files to ensure previous intermediate files are compatible with this run"
        )
        config_found = False
        for prev_config in os.listdir(config_dir):
            if prev_config == run_config_name:
                continue
            config_found = True
            if not config_compatibility(run_config, config_dir + prev_config):
                sys.exit(1)

        if not config_found:
            sys.exit(
                "ERROR: Unable to resume run. No config files from previous runs found in:"
                + input_dir +
                "/snakemake/config/ Remove --resume for clean run\n")
    else:
        # a fresh run refuses to write into populated output directories
        if os.path.exists(reference_dir) and os.listdir(reference_dir):
            sys.exit(
                "ERROR: output directory:" + reference_dir +
                " is not empty. If wanting to resume a previous run, use --resume, otherwise please delete this directory or change your -o/--output\n"
            )
        if os.path.exists(sample_dir) and os.listdir(sample_dir):
            sys.exit(
                "ERROR: output directory:" + sample_dir +
                " is not empty. If wanting to resume a previous run, use --resume, otherwise please delete this directory or change your -o/--output or --sample_name\n"
            )

    if annotations_only:
        command.append(reference_dir + "reference_te_locations/inrefTEs.gff")
        command.append(reference_dir + "te_taxonomy/taxonomy.tsv")
    else:
        for method in args.methods:
            command.append(out_files[method])

        command.append(sample_dir +
                       "results/summary/data/run/summary_report.txt")

    try:
        sys.stdout.flush()
        mccutils.mkdir(sample_dir)
        mccutils.mkdir(sample_dir + "tmp")
        mccutils.run_command(command)
    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "McClintock Pipeline Failed... please open an issue at https://github.com/bergmanlab/mcclintock/issues if you are having trouble using McClintock",
            file=sys.stderr)
        sys.exit(1)

    mccutils.remove(sample_dir + "tmp")
    remove_intermediate_files(args.keep_intermediate, run_config,
                              args.methods, ref_name, sample_name, args.out)
Пример #17
0
def _snakemake_install_command(conda_env_dir,
                               install_config,
                               target,
                               debug,
                               envs_only=False):
    """Build the snakemake command line that installs one target.

    With ``envs_only`` the mamba frontend is selected and only the conda
    environment is created; otherwise the target itself is built.
    """
    command = ["snakemake", "--use-conda"]
    if envs_only:
        command += ["--conda-frontend", "mamba"]
    command += [
        "--conda-prefix", conda_env_dir, "--configfile", install_config,
        "--cores", "1", "--nolock"
    ]
    if envs_only:
        command.append("--conda-create-envs-only")
    command.append(target)
    if not debug:
        command.append("--quiet")
    return command


def install(methods, resume=False, debug=False):
    """Install conda environments and scripts for the requested methods.

    Writes ``install/config.json`` next to this module, optionally wipes all
    previously installed envs and tools, then runs snakemake twice per
    method (environment creation, then script installation) and finally
    creates the ``setup_reads`` and ``processing`` environments.

    Args:
        methods: names of the methods to install. The list is not modified;
            "te-locate" is added internally when "temp" is requested.
        resume: when False, the conda env directory and ``install/tools``
            are removed before installing.
        debug: when False, ``--quiet`` is passed to snakemake.
    """
    mcc_path = os.path.dirname(os.path.abspath(__file__))
    install_path = mcc_path + "/install/"
    install_config = install_path + "/config.json"
    log_dir = install_path + "/log/"
    conda_env_dir = install_path + "/envs/conda"
    data = {}
    data['paths'] = {
        'mcc_path': mcc_path,
        'install': install_path,
        'log_dir': log_dir
    }

    # NOTE(review): these alias the config_install dicts, so the placeholder
    # substitution below also rewrites the module-level values in place.
    data['URLs'] = config_install.URL
    data['MD5s'] = config_install.MD5
    data['ENVs'] = config_install.ENV
    data['output'] = config_install.OUTPUT

    for method in data['ENVs']:
        data['ENVs'][method] = data['ENVs'][method].replace(
            config_install.ENV_PATH, install_path + "envs/")

    for method in data['output']:
        data['output'][method] = data['output'][method].replace(
            config_install.INSTALL_PATH, install_path)

    with open(install_config, "w") as c:
        json.dump(data, c, indent=4)

    if os.path.exists(install_path + "install.log"):
        os.remove(install_path + "install.log")

    # removes installed tools and conda environments
    if not resume:
        mccutils.log(
            "install",
            "Removing all previously installed McClintock conda envs and tools"
        )
        mccutils.log(
            "install",
            "Use the --resume option if you don't want to perform a clean installation"
        )
        mccutils.log("install", "Removing conda envs from: " + conda_env_dir)
        mccutils.log(
            "install",
            "Removing installed tools from: " + install_path + "tools")
        mccutils.remove(conda_env_dir)
        mccutils.remove(install_path + "/tools")

    mccutils.mkdir(conda_env_dir)
    os.chdir(install_path)
    mccutils.mkdir(log_dir)

    # copy so the caller's list (possibly a shared config list) is not mutated
    methods = list(methods)

    # temp requires te-locate scripts to make taxonomy file
    if "temp" in methods and "te-locate" not in methods:
        methods.append("te-locate")

    for env in methods:
        if env in config.NO_INSTALL_METHODS:
            continue

        mccutils.log("install", "Installing conda environment for: " + env)
        mccutils.run_command(
            _snakemake_install_command(conda_env_dir,
                                       install_config,
                                       data['output'][env],
                                       debug,
                                       envs_only=True))

        mccutils.log("install", "Installing scripts for:" + env)
        mccutils.run_command(
            _snakemake_install_command(conda_env_dir, install_config,
                                       data['output'][env], debug))

    mccutils.log("install",
                 "Installing conda environment for setup_reads steps")
    mccutils.run_command(
        _snakemake_install_command(conda_env_dir,
                                   install_config,
                                   data['output']['setup_reads'],
                                   debug,
                                   envs_only=True))

    mccutils.log("install",
                 "Installing conda environment for processing steps")
    mccutils.run_command(
        _snakemake_install_command(conda_env_dir,
                                   install_config,
                                   data['output']['processing'],
                                   debug,
                                   envs_only=True))
Пример #18
0
def main():
    """Run RelocaTE2 for one sample and record the outcome in the status log.

    Paths and parameters come from the ``snakemake`` object. The output
    directory is emptied, the inputs are symlinked under
    ``<out_dir>/input/``, ``relocaTE2.py`` is located with ``which`` and run
    with ``python2`` plus the flags in ``config.PARAMS``.

    On success both snakemake outputs are checked for existence and
    "COMPLETED" is written to the status log. On any exception the traceback
    goes to stderr and the log file, "FAILED" is written to the status log
    and both output files are touched.
    """
    sample_name = snakemake.params.sample_name
    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log
    status_log = snakemake.params.status_log

    try:
        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + f)

        is_paired = True
        if snakemake.params.raw_fq2 == "None":
            is_paired = False

        input_dir = snakemake.params.out_dir + "/input/"
        mccutils.remove(input_dir)
        mccutils.mkdir(input_dir)
        fq_dir = snakemake.params.out_dir + "/input/fastq/"
        mccutils.mkdir(fq_dir)

        reference = input_dir + "reference.fasta"
        te_seqs = input_dir + "consensus.fasta"
        rm_out = input_dir + "repeatmasker.out"

        os.symlink(snakemake.input.reference, reference)
        os.symlink(snakemake.input.te_seqs, te_seqs)
        os.symlink(snakemake.input.rm_out, rm_out)

        if is_paired:
            fq1 = fq_dir + sample_name + "_1.fq"
            fq2 = fq_dir + sample_name + "_2.fq"
            os.symlink(snakemake.input.fq1, fq1)
            os.symlink(snakemake.input.fq2, fq2)
        else:
            fq1 = fq_dir + sample_name + ".unPaired.fq"
            os.symlink(snakemake.input.fq1, fq1)

        median_insert_size = get_median_insert_size(median_insert_size_file)

        # subprocess.run waits for `which` and closes its pipes; a bare Popen
        # left the child un-reaped and the pipe descriptors open.
        which_result = subprocess.run(["which", "relocaTE2.py"],
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
        script = which_result.stdout.decode().replace("\n", "")

        mccutils.log("relocate2", "running RelocaTE2", log=log)
        command = [
            "python2", script, "-t", te_seqs, "-g", reference, "-r", rm_out,
            "-o", out_dir, "-s",
            str(median_insert_size), "--run", "-v", "4", "-c",
            str(threads), "-d", fq_dir
        ]

        # append every configured flag followed by its value
        for param in config.PARAMS.keys():
            command.append(param)
            command.append(str(config.PARAMS[param]))

        # suffixes that let RelocaTE2 pick the symlinked fastq files in fq_dir
        if is_paired:
            command += ["-1", "_1", "-2", "_2"]

        else:
            command += ["-u", ".unPaired"]

        mccutils.run_command(command, log=log)

        mccutils.check_file_exists(snakemake.output[0])
        mccutils.check_file_exists(snakemake.output[1])
        with open(status_log, "w") as l:
            l.write("COMPLETED\n")
        mccutils.log("relocate2", "RelocaTE2 run complete")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as l:
            print(track, file=l)
        mccutils.log("relocate2", "RelocaTE2 run failed")
        with open(status_log, "w") as l:
            l.write("FAILED\n")

        # leave both output files behind even though the run failed
        mccutils.run_command(["touch", snakemake.output[0]])
        mccutils.run_command(["touch", snakemake.output[1]])
Пример #19
0
def parse_args(expected_configs):
    """Parse and validate the McClintock command line.

    Args:
        expected_configs: mapping whose values are iterables of config file
            names that must exist inside the config directory.

    Returns:
        The argparse namespace with paths made absolute, defaults filled in,
        ``methods`` as a list of lower-case method names and
        ``keep_intermediate`` as a list of options.

    Exits the process when a config file is missing, a method or
    keep_intermediate option is invalid, the output directory cannot be
    created, a GFF is given without a taxonomy file, or the sample name is
    invalid. With ``--install`` the dependencies are installed and the
    process exits with status 0.
    """
    parser = argparse.ArgumentParser(prog='McClintock', description="Meta-pipeline to identify transposable element insertions using next generation sequencing data")

    ## required ##
    parser.add_argument("-r", "--reference", type=str, help="A reference genome sequence in fasta format", required=('--install' not in sys.argv))
    parser.add_argument("-c", "--consensus", type=str, help="The consensus sequences of the TEs for the species in fasta format", required='--install' not in sys.argv)
    parser.add_argument("-1", "--first", type=str, help="The path of the first fastq file from paired end read sequencing or the fastq file from single read sequencing", required=(('--install' not in sys.argv) and ('--make_annotations' not in sys.argv)))


    ## optional ##
    parser.add_argument("-2", "--second", type=str, help="The path of the second fastq file from a paired end read sequencing", required=False)
    parser.add_argument("-p", "--proc", type=int, help="The number of processors to use for parallel stages of the pipeline [default = 1]", required=False)
    parser.add_argument("-o", "--out", type=str, help="An output folder for the run. [default = '.']", required=False)
    parser.add_argument("-m", "--methods", type=str, help="A comma-delimited list containing the software you want the pipeline to use for analysis. e.g. '-m relocate,TEMP,ngs_te_mapper' will launch only those three methods", required=False)
    parser.add_argument("-g", "--locations", type=str, help="The locations of known TEs in the reference genome in GFF 3 format. This must include a unique ID attribute for every entry", required=False)
    parser.add_argument("-t", "--taxonomy", type=str, help="A tab delimited file with one entry per ID in the GFF file and two columns: the first containing the ID and the second containing the TE family it belongs to. The family should correspond to the names of the sequences in the consensus fasta file", required=False)
    parser.add_argument("-s", "--coverage_fasta", type=str, help="A fasta file that will be used for TE-based coverage analysis, if not supplied then the consensus sequences of the TEs will be used for the analysis", required=False)
    parser.add_argument("-T", "--comments", action="store_true", help="If this option is specified then fastq comments (e.g. barcode) will be incorporated to SAM output. Warning: do not use this option if the input fastq files do not have comments", required=False)
    # parser.add_argument("-b", "--keep_bam", action="store_true", help="Retain the sorted and indexed BAM file of the paired end data aligned to the reference genome", required=False)
    # parser.add_argument("-i", "--remove_intermediate", action="store_true", help="If this option is specified then all sample specific intermediate files will be removed, leaving only the overall results. The default is to leave sample specific intermediate files", required=False)
    parser.add_argument("-a", "--augment", type=str, help="A fasta file of TE sequences that will be included as extra chromosomes in the reference file (useful if the organism is known to have TEs that are not present in the reference strain)", required=False)
    parser.add_argument("--sample_name", type=str, help="The sample name to use for output files [default: fastq1 name]", required=False)
    parser.add_argument("--resume", action="store_true", help="This option will attempt to use existing intermediate files from a previous McClintock run", required=False)
    parser.add_argument("--install", action="store_true", help="This option will install the dependencies of mcclintock", required=False)
    parser.add_argument("--debug", action="store_true", help="This option will allow snakemake to print progress to stdout", required=False)
    parser.add_argument("--slow", action="store_true", help="This option runs without attempting to optimize thread usage to run rules concurrently. Each multithread rule will use the max processors designated by -p/--proc", required=False)
    parser.add_argument("--make_annotations", action="store_true", help="This option will only run the pipeline up to the creation of the repeat annotations", required=False)
    parser.add_argument("-k","--keep_intermediate", type=str, help="This option determines which intermediate files are preserved after McClintock completes [default: general][options: minimal, general, methods, <list,of,methods>, all]", required=False)
    parser.add_argument("--config", type=str, help="This option determines which config files to use for your mcclintock run [default: config in McClintock Repository]", required=False)

    args = parser.parse_args()

    # store_true flags (--debug, --comments, ...) default to False, so they
    # need no None handling below.

    if args.config is None:
        args.config = os.path.dirname(os.path.abspath(__file__)) + "/config/"
    else:
        args.config = os.path.abspath(args.config)+"/"

    for key in expected_configs.keys():
        for config_file in expected_configs[key]:
            if not os.path.exists(args.config+"/"+config_file):
                sys.exit("Error: can't find config file: "+args.config+"/"+config_file+"\n Check that --config is set correctly...exiting...\n")

    #check -m
    # If only one fastq has been supplied assume this is single ended data and launch only ngs_te_mapper and RelocaTE
    if args.second is None and not args.install:
        valid_methods = sysconfig.SINGLE_END_METHODS #from config.py
    else:
        valid_methods = sysconfig.ALL_METHODS #from config.py

    # used to preserve trimgalore and mapped reads output if they are explicitly called by the user
    trimgalore_called = False
    map_reads_called = False

    if args.methods is None:
        # copy so later changes to args.methods cannot alter the config list
        args.methods = list(valid_methods)

    else:
        requested_methods = args.methods.split(",")
        args.methods = []
        for method in requested_methods:
            normalized = method.lower()
            if normalized not in valid_methods:
                sys.stderr.write(" ".join(["Method:",method, "not a valid method...", "Valid methods:"," ".join(valid_methods),"\n"]))
                sys.exit(1)
            args.methods.append(normalized)

        # checked after lower-casing so that e.g. "-m TrimGalore" is detected
        trimgalore_called = "trimgalore" in args.methods
        map_reads_called = "map_reads" in args.methods

    if args.install:
        mccutils.log("install","installing dependencies")
        mccutils.log("install","WARNING: this could take awhile")
        install(args.methods, resume=args.resume, debug=args.debug)
        sys.exit(0)

    #check -r
    args.reference = mccutils.get_abs_path(args.reference)
    #check -c
    args.consensus = mccutils.get_abs_path(args.consensus)

    if not args.make_annotations:
        #check -1
        args.first = mccutils.get_abs_path(args.first)
        #check -2
        if args.second is not None:
            args.second = mccutils.get_abs_path(args.second)

    #check -p
    if args.proc is None:
        args.proc = 1

    #check -o
    if args.out is None:
        args.out = os.path.abspath(".")
    else:
        args.out = os.path.abspath(args.out)
        try:
            mccutils.mkdir(args.out)
        except Exception:
            track = traceback.format_exc()
            print(track, file=sys.stderr)
            print("cannot create output directory: ",args.out,"exiting...", file=sys.stderr)
            sys.exit(1)


    # check -g
    if args.locations is not None:
        args.locations = mccutils.get_abs_path(args.locations)

        if args.taxonomy is None:
            sys.stderr.write("If a GFF file is supplied (-g/--locations) then a TE taxonomy file that links it to the fasta consensus is also needed (-t/--taxonomy)...exiting...\n")
            sys.exit(1)

    # check -t
    if args.taxonomy is not None:
        args.taxonomy = mccutils.get_abs_path(args.taxonomy)


    # check -s
    if args.coverage_fasta is not None:
        args.coverage_fasta = mccutils.get_abs_path(args.coverage_fasta)

    # check -a
    if args.augment is not None:
        args.augment = mccutils.get_abs_path(args.augment)

    # check sample name
    if args.sample_name is not None:
        # "tmp" is the name assigned below for --make_annotations runs
        if "/" in args.sample_name or args.sample_name == "tmp":
            sys.exit(args.sample_name+" is not a valid sample name...\n")
    else:
        if not args.make_annotations:
            args.sample_name = mccutils.get_base_name(args.first)
        else:
            args.sample_name = "tmp"

    keep_intermediate_options = ["minimal","general", "methods", "all"] + args.methods
    if args.keep_intermediate is None:
        args.keep_intermediate = ["general"]
    else:
        args.keep_intermediate = args.keep_intermediate.split(",")
        for option in args.keep_intermediate:
            if option not in keep_intermediate_options:
                sys.stderr.write("keep_intermediate option: "+option+" is not valid. Valid options: "+" ".join(keep_intermediate_options)+"\nExample:(--keep_intermediate general,methods)\n")
                sys.exit(1)

    if trimgalore_called:
        args.keep_intermediate.append("trimgalore")

    if map_reads_called:
        args.keep_intermediate.append("map_reads")

    return args
Пример #20
0
def _count_prediction_types(predictions):
    """Return ``(reference, nonreference)`` tallies for ``predictions``.

    A prediction is counted as reference when its ``type`` attribute equals
    ``"Reference"``; any other value is counted as non-reference.
    """
    reference_count = 0
    nonreference_count = 0
    for prediction in predictions:
        if prediction.type == "Reference":
            reference_count += 1
        else:
            nonreference_count += 1

    return reference_count, nonreference_count


def _write_count_table(path, label, names, reference_counts,
                       nonreference_counts):
    """Write a ``<label>,Reference,Non-Reference`` table, one row per name."""
    with open(path, "w") as raw_file:
        raw_file.write(",".join([label, "Reference", "Non-Reference"]) + "\n")
        for name, reference_count, nonreference_count in zip(
                names, reference_counts, nonreference_counts):
            row = ",".join(
                [name, str(reference_count), str(nonreference_count)])
            raw_file.write(row + "\n")


def make_method_pages(jinja_env, methods, consensus, out_file_map, chromosomes,
                      out_dir):
    """Render one HTML page and three raw data tables per prediction method.

    Methods listed in ``NO_PRED_METHODS`` are skipped; if none remain the
    function returns without touching the filesystem.

    Args:
        jinja_env: object providing ``get_template('method.html')``.
        methods: names of the methods that were run.
        consensus: path to a FASTA file; every ``>`` header line (with all
            ``>`` characters and the newline removed) is taken as a family.
        out_file_map: maps a method name to the predictions file that is
            handed to ``get_predictions``.
        chromosomes: sequence of contig names to tally predictions for.
        out_dir: report directory. Tables are written under
            ``<out_dir>/data/methods/<method>/`` and pages to
            ``<out_dir>/html/<method>.html``.
    """
    prediction_methods = [
        method for method in methods if method not in NO_PRED_METHODS
    ]
    if not prediction_methods:
        return

    families = []
    with open(consensus, "r") as fa:
        for line in fa:
            if line[0] == ">":
                families.append(line.replace(">", "").replace("\n", ""))

    # every bar in a plot gets a fixed amount of room, with a floor on the
    # overall plot height
    height_per_entry = 20
    min_height = 500

    methods_dir = out_dir + "/data/methods/"
    mccutils.mkdir(methods_dir)
    for method in prediction_methods:
        template = jinja_env.get_template('method.html')
        method_dir = methods_dir + method
        mccutils.mkdir(method_dir)

        predictions_file = out_file_map[method]

        # per-family tallies
        reference_family_counts = []
        nonreference_family_counts = []
        for family in families:
            reference_count, nonreference_count = _count_prediction_types(
                get_predictions(predictions_file, family=family))
            reference_family_counts.append(reference_count)
            nonreference_family_counts.append(nonreference_count)

        _write_count_table(method_dir + "/family_predictions.txt", "Family",
                           families, reference_family_counts,
                           nonreference_family_counts)
        family_plot_height = max(min_height,
                                 len(families) * height_per_entry)

        # per-contig tallies
        reference_chromosome_counts = []
        nonreference_chromosome_counts = []
        for chromosome in chromosomes:
            reference_count, nonreference_count = _count_prediction_types(
                get_predictions(predictions_file, chromosome=chromosome))
            reference_chromosome_counts.append(reference_count)
            nonreference_chromosome_counts.append(nonreference_count)

        _write_count_table(method_dir + "/contig_predictions.txt", "Contig",
                           chromosomes, reference_chromosome_counts,
                           nonreference_chromosome_counts)
        chrom_plot_height = max(min_height,
                                len(chromosomes) * height_per_entry)

        # full, unfiltered prediction list
        predictions = get_predictions(predictions_file)
        with open(method_dir + "/all_predictions.txt", "w") as raw_file:
            header = ",".join(
                ["Contig", "Family", "Type", "Start", "End", "Strand"])
            raw_file.write(header + "\n")
            for prediction in predictions:
                row = ",".join([
                    prediction.chrom, prediction.family, prediction.type,
                    str(prediction.start),
                    str(prediction.end), prediction.strand
                ])
                raw_file.write(row + "\n")

        rendered_page = template.render(
            methods=prediction_methods,
            method=method,
            families=families,
            family_plot_height=family_plot_height,
            reference_family_counts=reference_family_counts,
            nonreference_family_counts=nonreference_family_counts,
            chromosomes=chromosomes,
            chrom_plot_height=chrom_plot_height,
            reference_chromosome_counts=reference_chromosome_counts,
            nonreference_chromosome_counts=nonreference_chromosome_counts,
            predictions=predictions)

        # The rendered page is one string: write it in a single call rather
        # than iterating over it, which would issue one write per character.
        out_file = out_dir + "/html/" + method + ".html"
        with open(out_file, "w") as out:
            out.write(rendered_page)
Пример #21
0
def make_run_config(args, sample_name, ref_name, full_command,
                    current_directory):
    """Write the JSON configuration for one run and return the run id.

    The config is written to
    ``<args.out>/snakemake/config/config_<run_id>.json`` and holds the run
    arguments (``args``), the input file paths (``in``), the intermediate
    file paths (``mcc``) and the environment paths (``envs``).

    Side effects visible in this function:
      * the working directory is changed to the directory containing this
        file and is not restored;
      * ``config.OUT_PATHS``, ``config.INTERMEDIATE_PATHS`` and
        ``config_install.ENV`` are updated in place with their placeholders
        substituted;
      * a timestamped log directory is created under ``<args.out>/logs/``.

    Args:
        args: parsed command-line arguments (reads ``out``, ``methods``,
            ``proc``, ``slow``, ``augment``, ``comments``, ``reference``,
            ``consensus``, ``first``, ``second``, ``locations``,
            ``taxonomy`` and ``coverage_fasta``).
        sample_name: sample name substituted into the path templates.
        ref_name: reference name substituted into the path templates.
        full_command: command line recorded in the config.
        current_directory: directory recorded as ``call_directory``.

    Returns:
        The randomly chosen integer run id.
    """
    run_id = random.randint(1000000, 9999999)
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(args.out + "/snakemake/config")
    run_config = args.out + "/snakemake/config/config_" + str(run_id) + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"

    # Resolved once, BEFORE the chdir below: os.path.abspath() interprets a
    # relative __file__ against the current working directory.
    mcc_path = os.path.dirname(os.path.abspath(__file__))

    # get git commit hash to provide in summary report
    git_commit = "?"
    try:
        os.chdir(mcc_path)
        git_commit_file = args.out + "/git-commit.txt"
        mccutils.run_command_stdout(["git", "rev-parse", "HEAD"],
                                    git_commit_file)
        with open(git_commit_file, "r") as inf:
            for line in inf:
                git_commit = line.replace("\n", "")

        mccutils.remove(git_commit_file)

    except Exception:
        # best effort only: the commit hash is informational
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("Could not locate git commit hash...using '?' ", file=sys.stderr)
        git_commit = "?"

    mccutils.log("SETUP", "McClintock Version: " + git_commit)

    # Substitute placeholders in the output path templates. The module-level
    # dict is updated in place (original behavior, kept on purpose).
    out_files = config.OUT_PATHS
    for key in out_files.keys():
        out_files[key] = out_files[key].replace(config.INPUT_DIR, input_dir)
        out_files[key] = out_files[key].replace(config.RESULTS_DIR,
                                                results_dir)
        out_files[key] = out_files[key].replace(config.SAMPLE_NAME,
                                                sample_name)

    out_files_to_make = [out_files[method] for method in args.methods]

    now = datetime.now()
    now_str = now.strftime("%Y%m%d.%H%M%S")
    log_dir = args.out + "/logs/" + now_str + "." + str(run_id) + "/"
    mccutils.mkdir(log_dir)

    chromosomes = []
    for record in SeqIO.parse(args.reference, "fasta"):
        chromosomes.append(mccutils.replace_special_chars(str(record.id)))

    max_threads_per_rule = max(
        1,
        calculate_max_threads(args.proc,
                              args.methods,
                              config.MULTI_THREAD_METHODS,
                              slow=args.slow))

    data = {}
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': mcc_path,
        'commit': git_commit,
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': str(run_id),
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max_threads_per_rule,
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        "chromosomes": ",".join(chromosomes),
    }

    # input paths for files
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # where mcc copies will be stored (module-level dict updated in place)
    data["mcc"] = config.INTERMEDIATE_PATHS
    for key in data["mcc"].keys():
        data["mcc"][key] = data["mcc"][key].replace(config.INPUT_DIR,
                                                    input_dir)
        data["mcc"][key] = data["mcc"][key].replace(config.REF_NAME, ref_name)
        data["mcc"][key] = data["mcc"][key].replace(config.SAMPLE_NAME,
                                                    sample_name)

    # Reuse mcc_path instead of re-resolving __file__: after the chdir above a
    # relative __file__ would be resolved against the new working directory
    # and could point at the wrong location.
    env_path = mcc_path + "/install/envs/"
    data["envs"] = config_install.ENV
    for key in data["envs"].keys():
        data['envs'][key] = data['envs'][key].replace(config_install.ENV_PATH,
                                                      env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
Пример #22
0
def main():
    """Snakemake entry point that runs RelocaTE for one sample.

    Clears the method's output directory, stages symlinked inputs under
    ``<out_dir>/input/``, runs ``relocaTE.pl`` from inside the output
    directory, merges the results into the rule's first output via
    ``combine_gffs`` and finally removes the staged fastq links and any
    ``te_containing_fq`` directories.
    """
    params = snakemake.params
    sample_name = params.sample_name
    log = params.log
    # the literal string "None" marks a single-end run
    is_paired = params.raw_fq2 != "None"

    script_dir = params.script_dir
    out_dir = params.out_dir
    out_gff = snakemake.output[0]

    # ensures intermediate files from previous runs are removed
    for leftover in os.listdir(out_dir):
        mccutils.remove(out_dir+"/"+leftover)

    mccutils.log("relocate","running RelocaTE", log=log)

    # fresh staging area for the symlinked inputs
    input_dir = snakemake.params.out_dir+"/input/"
    mccutils.remove(input_dir)
    mccutils.mkdir(input_dir)
    fq_dir = input_dir+"fastq/"
    mccutils.mkdir(fq_dir)

    consensus_fasta = input_dir+"consensus.fasta"
    te_gff = input_dir+"te.gff"
    reference_fasta = input_dir+"reference.fasta"

    # draw a numeric tag that does not already appear in the fastq dir path
    while True:
        uniq_id = str(random.randint(10000,99999))
        if uniq_id not in fq_dir:
            break
        mccutils.log("relocate","unique id: "+uniq_id+" occurs in file path... selecting a new one...", log=log)

    fq1_uniq_id = uniq_id+"_mcc_relocate_1"
    fq2_uniq_id = uniq_id+"_mcc_relocate_2"
    unpaired_id = uniq_id+"_unPaired"

    os.symlink(snakemake.input.consensus_fasta, consensus_fasta)
    os.symlink(snakemake.input.te_gff, te_gff)
    os.symlink(snakemake.input.reference_fasta, reference_fasta)

    fq_prefix = fq_dir+sample_name+"."
    if is_paired:
        os.symlink(snakemake.input.fq1, fq_prefix+fq1_uniq_id+".fq")
        os.symlink(snakemake.input.fq2, fq_prefix+fq2_uniq_id+".fq")
    else:
        os.symlink(snakemake.input.fq1, fq_prefix+unpaired_id+".fq")

    annotation = make_annotation_file(te_gff, out_dir)
    # RelocaTE is pointed at "." for its output, so run it from out_dir
    os.chdir(out_dir)

    command = ["perl", script_dir+"/relocaTE.pl"]
    command += ["-t", consensus_fasta]
    command += ["-d", fq_dir]
    command += ["-g", reference_fasta]
    command += ["-o", "."]
    command += ["-r", annotation]
    for flag in ("l", "m", "bm", "bt", "f"):
        command += ["-"+flag, str(config.RELOCATE[flag])]

    if is_paired:
        command += ["-1", fq1_uniq_id, "-2", fq2_uniq_id]
    else:
        command += ["-u", unpaired_id]

    mccutils.run_command(command, log=log)
    combine_gffs(out_dir, out_gff)

    mccutils.remove(out_dir+"/input/fastq")
    # entries are checked relative to the working directory, which was
    # switched to out_dir above
    for entry in os.listdir(out_dir):
        fq_subdir = entry+"/te_containing_fq/"
        if os.path.exists(fq_subdir):
            mccutils.remove(fq_subdir)

    mccutils.log("relocate","RelocaTE run complete")
Пример #23
0
def main():
    """Snakemake entry point that runs ngs_te_mapper for one sample.

    On success the ``insertions.bed`` file found under ``<out_dir>/bed_tsd/``
    is copied to the rule's first output and ``COMPLETED`` is written to the
    status log. On any failure the traceback is printed to stderr and
    appended to the log, ``FAILED`` is written to the status log, and an
    empty output file plus the ``bed_tsd`` directory are created.
    """
    consensus_fasta = snakemake.input.consensus_fasta
    reference_fasta = snakemake.input.reference_fasta
    fastq1 = snakemake.input.fastq1
    fastq2 = snakemake.input.fastq2
    status_log = snakemake.params.status_log

    log = snakemake.params.log

    # Bound before the try block because the failure handler below needs
    # them: if they were assigned inside the try, an early error would make
    # the handler itself raise UnboundLocalError and skip creating the
    # placeholder outputs.
    out_dir = snakemake.params.out_dir
    out_bed = snakemake.output[0]

    try:
        with open(log,"a") as log_file:
            log_file.write("consensus fasta: "+consensus_fasta+"\n")
            log_file.write("reference fasta: "+reference_fasta+"\n")
            log_file.write("fastq1: "+fastq1+"\n")
            log_file.write("fastq2: "+fastq2+"\n")

        threads = snakemake.threads
        script_dir = snakemake.params.script_dir

        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir+"/"+f)

        # the literal string "None" marks a single-end run
        is_paired = snakemake.params.raw_fq2 != "None"

        command = [
            'Rscript', "--vanilla", script_dir+"/ngs_te_mapper.R",
            "genome="+reference_fasta,
            "teFile="+consensus_fasta,
            "tsd="+str(config.PARAMS["tsd="]),
            "thread="+str(threads),
            "output="+out_dir,
            "sourceCodeFolder="+script_dir,
        ]

        if is_paired:
            command.append("sample="+fastq1+";"+fastq2)
        else:
            command.append("sample="+fastq1)

        mccutils.log("ngs_te_mapper","running ngs_te_mapper", log=log)
        mccutils.run_command(command, log=log)
        mccutils.log("ngs_te_mapper","ngs_te_mapper run complete", log=log)

        # if several files match, the last one listed is used
        raw_bed = ""
        for f in os.listdir(out_dir+"/bed_tsd/"):
            if "insertions.bed" in f:
                raw_bed = out_dir+"/bed_tsd/"+f

        mccutils.check_file_exists(raw_bed)
        mccutils.run_command(["cp", raw_bed, out_bed])

        mccutils.log("ngs_te_mapper","ngs_te_mapper run complete")
        with open(status_log,"w") as status_file:
            status_file.write("COMPLETED\n")

    except Exception:
        # deliberate catch-all: record the failure and still leave behind the
        # files this step is expected to produce
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log,"a") as log_file:
            print(track, file=log_file)
        mccutils.log("ngs_te_mapper","ngs_te_mapper run failed")
        with open(status_log,"w") as status_file:
            status_file.write("FAILED\n")

        mccutils.mkdir(out_dir+"/bed_tsd/")
        mccutils.run_command(["touch", out_bed])