Example #1
def format_taxonomy(in_taxonomy, gff_ids, consensus_ids, consensus_fasta, locations_gff):
    mccutils.log("setup","checking taxonomy TSV: "+in_taxonomy)
    with open(in_taxonomy, "r") as tsv:
        for line in tsv:
            split_line = line.split("\t")
            if len(split_line) != 2:
                sys.exit(in_taxonomy+" does not have two columns. Should be tab-separated file with feature ID and TE family as columns\n")
            else:
                te_id = split_line[0]
                masked_te_id = mccutils.replace_special_chars(te_id)
                if masked_te_id != te_id:
                    mccutils.log("setup", in_taxonomy+": ERROR problematic symbol in feature name: "+te_id+" ... reformat this feature name for compatibility with McClintock")
                    print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                    sys.exit(1)

                te_family = split_line[1].replace("\n","")
                if "#" in te_family:
                    org_te_family = te_family
                    te_family = te_family[:(te_family.find("#"))]
                    mccutils.log("setup", in_taxonomy+": replacing "+org_te_family+" with "+te_family+" for compatibility with RepeatMasker")

                masked_te_family = mccutils.replace_special_chars(te_family)
                if masked_te_family != te_family:
                    mccutils.log("setup", in_taxonomy+": ERROR problematic symbol in feature name: "+te_family+" ... reformat this feature name for compatibility with McClintock")
                    print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                    sys.exit(1)

                if masked_te_id not in gff_ids:
                    sys.exit("TE ID: "+masked_te_id+" not found in IDs from GFF: "+locations_gff+"\nplease make sure each ID in: "+in_taxonomy+" is found in:"+locations_gff+"\n")
                
                if masked_te_family not in consensus_ids:
                    sys.exit("TE Family: "+masked_te_family+" not found in sequence names from: "+consensus_fasta+"\nplease make sure each family in: "+in_taxonomy+" is found in: "+consensus_fasta+"\n")
Example #2
def format_gff(ingff):
    mccutils.log("setup","checking locations gff: "+ingff)
    gff_ids = []
    with open(ingff,"r") as gff:
        for line in gff:
            if "#" not in line[0]:
                split_line = line.split("\t")
                if len(split_line) < 9:
                    sys.exit(ingff+" appears to be a malformed GFF file..exiting...\n")
                else:
                    feats = split_line[8]
                    split_feats = feats.split(";")
                    gff_id = ""
                    for feat in split_feats:
                        if feat[:3] == "ID=":
                            gff_id = feat.split("=")[1].replace("\n","")
                            masked_gff_id = mccutils.replace_special_chars(gff_id)
                            if gff_id != masked_gff_id:
                                mccutils.log("setup", ingff+": ERROR problematic symbol in feature name: "+gff_id+" ... reformat this feature name for compatibility with McClintock")
                                print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                                sys.exit(1)

                            if masked_gff_id not in gff_ids:
                                gff_ids.append(masked_gff_id)
                            else:
                                sys.exit("ID: "+masked_gff_id+" is not unique. please ensure each feature has a unique ID\n")
                    if masked_gff_id == "":
                        sys.exit("GFF line: "+line+" is missing an ID attribute (ex. ID=chr1_TY1s1)\n")
    
    return gff_ids
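For reference, the attribute parsing above expects a standard ninth GFF column. A small illustration of how one such line is taken apart (the coordinates and source are made up; the ID reuses the example from the error message):

# Illustrative GFF data line and how column 9 is parsed (fabricated coordinates).
line = "chr1\treannotate\ttransposable_element\t1000\t5000\t.\t+\t.\tID=chr1_TY1s1;Name=TY1\n"
split_feats = line.split("\t")[8].split(";")   # ["ID=chr1_TY1s1", "Name=TY1\n"]
gff_id = split_feats[0].split("=")[1]          # "chr1_TY1s1"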
Example #3
def format_fasta(in_fasta):
    mccutils.log("setup","checking fasta: "+in_fasta)
    seq_names = []
    try:
        with open(in_fasta,"r") as infa:
            for record in SeqIO.parse(infa, "fasta"):
                seq_name = str(record.id)
                if "#" in seq_name:
                    org_seq_name = seq_name
                    seq_name = seq_name[:(seq_name.find("#"))]
                    mccutils.log("setup", in_fasta+": replacing "+org_seq_name+" with "+seq_name+" for compatibility with RepeatMasker")

                masked_seq_name = mccutils.replace_special_chars(seq_name)
                if seq_name != masked_seq_name:
                    mccutils.log("setup", in_fasta+": ERROR problematic symbol in feature name: "+seq_name+" ... reformat this feature name for compatibility with McClintock")
                    print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                    sys.exit(1)
                
                if masked_seq_name not in seq_names:
                    seq_names.append(masked_seq_name)
                else:
                    sys.exit(in_fasta+": Duplicate sequence name:"+masked_seq_name+"...exiting...\n")

    except Exception as e:
        print(e)
        sys.exit(in_fasta+" appears to be a malformed FastA file..exiting...\n")
    
    if len(seq_names) < 1:
        sys.exit(in_fasta+" contains no sequences... exiting...\n")

    return seq_names
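These three validators are meant to be chained: format_fasta and format_gff build the name lists that format_taxonomy checks against. A minimal wiring sketch, assuming the functions live in the same module; the file paths are placeholders.

# Hypothetical wiring of the validation steps (file paths are placeholders).
consensus_ids = format_fasta("consensus.fasta")
gff_ids = format_gff("reference_TEs.gff")
format_taxonomy("te_taxonomy.tsv", gff_ids, consensus_ids,
                "consensus.fasta", "reference_TEs.gff")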
Example #4
def format_ref_te_gff(ref_tes, run_id, out):
    format_ref_tes = out + "/tmp/" + str(run_id) + "tmpreferenceTEs1.gff"
    with open(ref_tes, "r") as ingff:
        with open(format_ref_tes, "w") as outgff:
            for line in ingff:
                if "#" not in line:
                    split_line = line.split("\t")
                    features = split_line[8].replace("\n", "")
                    split_feats = features.split(";")
                    te_id = "MISSING"
                    for feat in split_feats:
                        if "ID=" in feat:
                            te_id = feat.split("=")[1]

                    te_id = mccutils.replace_special_chars(te_id)
                    # use the TE ID as the feature type (column 3) and rebuild the attributes
                    split_line[2] = te_id
                    features = ";".join(
                        ["ID=" + te_id, "Name=" + te_id, "Alias=" + te_id])
                    line = "\t".join(split_line[0:8])
                    line = line + "\t" + features + "\n"
                    outgff.write(line)

    return format_ref_tes
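A quick sketch of the call and its effect on a single data line; the run_id, paths, and coordinates below are placeholders.

# Hypothetical call; run_id and paths are placeholders.
formatted_gff = format_ref_te_gff("reference_TEs.gff", 1234567, "/path/to/out")
# returns "/path/to/out/tmp/1234567tmpreferenceTEs1.gff", in which a line such as
#   chr1  reannotate  transposable_element  1000  5000  .  +  .  ID=chr1_TY1s1;Name=TY1
# is rewritten with the TE ID as the feature type and normalized attributes:
#   chr1  reannotate  chr1_TY1s1  1000  5000  .  +  .  ID=chr1_TY1s1;Name=chr1_TY1s1;Alias=chr1_TY1s1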
Example #5
def make_run_config(args, sample_name, ref_name, full_command,
                    current_directory):
    run_id = random.randint(1000000, 9999999)
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(args.out + "/snakemake/config")
    run_config = args.out + "/snakemake/config/config_" + str(run_id) + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"

    out_files_to_make = []
    out_files = config.OUT_PATHS
    for key in out_files.keys():
        out_files[key] = out_files[key].replace(config.INPUT_DIR, input_dir)
        out_files[key] = out_files[key].replace(config.RESULTS_DIR,
                                                results_dir)
        out_files[key] = out_files[key].replace(config.SAMPLE_NAME,
                                                sample_name)

    for method in args.methods:
        out_files_to_make.append(out_files[method])

    now = datetime.now()
    now_str = now.strftime("%Y%m%d.%H%M%S")
    log_dir = args.out + "/logs/" + now_str + "." + str(run_id) + "/"
    mccutils.mkdir(log_dir)

    chromosomes = []
    for record in SeqIO.parse(args.reference, "fasta"):
        chrom = str(record.id)
        chrom = mccutils.replace_special_chars(chrom)
        chromosomes.append(chrom)

    data = {}
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': os.path.dirname(os.path.abspath(__file__)),
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': str(run_id),
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max(1, calculate_max_threads(
            args.proc, args.methods, config.MULTI_THREAD_METHODS, slow=args.slow)),
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        'chromosomes': ",".join(chromosomes)
    }

    # input paths for files
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # where mcc copies will be stored

    data["mcc"] = config.INTERMEDIATE_PATHS
    for key in data["mcc"].keys():
        data["mcc"][key] = data["mcc"][key].replace(config.INPUT_DIR,
                                                    input_dir)
        data["mcc"][key] = data["mcc"][key].replace(config.REF_NAME, ref_name)
        data["mcc"][key] = data["mcc"][key].replace(config.SAMPLE_NAME,
                                                    sample_name)

    env_path = os.path.dirname(os.path.abspath(__file__)) + "/install/envs/"
    data["envs"] = config_install.ENV
    for key in data["envs"].keys():
        data['envs'][key] = data['envs'][key].replace(config_install.ENV_PATH,
                                                      env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
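The substitution loops above assume that config.OUT_PATHS, config.INTERMEDIATE_PATHS, and config_install.ENV store template paths containing placeholder tokens such as config.INPUT_DIR. A sketch of what that scheme could look like; the token strings and the example method key are invented for illustration.

# Hypothetical placeholder scheme (token strings and keys are illustrative only).
INPUT_DIR = "{{input_dir}}"
RESULTS_DIR = "{{results_dir}}"
SAMPLE_NAME = "{{sample_name}}"

OUT_PATHS = {
    "example_method": RESULTS_DIR + "example_method/" + SAMPLE_NAME + "_example_method.bed"
}
# After the replacement loop, the entry would read something like
# ".../results/example_method/<sample>_example_method.bed"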
Example #6
def make_run_config(args, sample_name, ref_name, full_command,
                    current_directory):
    run_id = random.randint(1000000, 9999999)
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(args.out + "/snakemake/config")
    run_config = args.out + "/snakemake/config/config_" + str(run_id) + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"

    mcc_path = os.path.dirname(os.path.abspath(__file__))

    # get git commit hash to provide in summary report
    git_commit = "?"
    try:
        os.chdir(mcc_path)
        git_commit_file = args.out + "/git-commit.txt"
        mccutils.run_command_stdout(["git", "rev-parse", "HEAD"],
                                    git_commit_file)
        with open(git_commit_file, "r") as inf:
            for line in inf:
                git_commit = line.replace("\n", "")

        mccutils.remove(git_commit_file)

    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("Could not locate git commit hash...using '?' ", file=sys.stderr)
        git_commit = "?"

    mccutils.log("SETUP", "McClintock Version: " + git_commit)

    out_files_to_make = []
    out_files = config.OUT_PATHS
    for key in out_files.keys():
        out_files[key] = out_files[key].replace(config.INPUT_DIR, input_dir)
        out_files[key] = out_files[key].replace(config.RESULTS_DIR,
                                                results_dir)
        out_files[key] = out_files[key].replace(config.SAMPLE_NAME,
                                                sample_name)

    for method in args.methods:
        out_files_to_make.append(out_files[method])

    now = datetime.now()
    now_str = now.strftime("%Y%m%d.%H%M%S")
    log_dir = args.out + "/logs/" + now_str + "." + str(run_id) + "/"
    mccutils.mkdir(log_dir)

    chromosomes = []
    for record in SeqIO.parse(args.reference, "fasta"):
        chrom = str(record.id)
        chrom = mccutils.replace_special_chars(chrom)
        chromosomes.append(chrom)

    data = {}
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': mcc_path,
        'commit': git_commit,
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': str(run_id),
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max(1, calculate_max_threads(
            args.proc, args.methods, config.MULTI_THREAD_METHODS, slow=args.slow)),
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        'chromosomes': ",".join(chromosomes)
    }

    # input paths for files
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # where mcc copies will be stored

    data["mcc"] = config.INTERMEDIATE_PATHS
    for key in data["mcc"].keys():
        data["mcc"][key] = data["mcc"][key].replace(config.INPUT_DIR,
                                                    input_dir)
        data["mcc"][key] = data["mcc"][key].replace(config.REF_NAME, ref_name)
        data["mcc"][key] = data["mcc"][key].replace(config.SAMPLE_NAME,
                                                    sample_name)

    env_path = os.path.dirname(os.path.abspath(__file__)) + "/install/envs/"
    data["envs"] = config_install.ENV
    for key in data["envs"].keys():
        data['envs'][key] = data['envs'][key].replace(config_install.ENV_PATH,
                                                      env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
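The function returns only run_id; the generated JSON lives under <out>/snakemake/config/. Below is a hedged sketch of how a caller might hand that file to Snakemake; the Snakefile path and core count are placeholders, and the real McClintock driver may invoke Snakemake differently.

import subprocess

run_id = make_run_config(args, sample_name, ref_name, full_command, current_directory)
run_config = args.out + "/snakemake/config/config_" + str(run_id) + ".json"

# Illustrative invocation only; adjust the Snakefile path and flags to the real pipeline.
subprocess.run(["snakemake", "--snakefile", "Snakefile",
                "--configfile", run_config, "--cores", str(args.proc)], check=True)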