Example #1
def main():

    description = "Check whether stop codons are depleted in motif sets by simulating the motif set."
    args = gen.parse_arguments(description, ["motif_file", "output_dir", "results_dir", "required_simulations", "motif_simulation", "exon_simulation"], flags = [4,5], ints = [3])
    motif_file, output_dir, results_dir, required_simulations, motif_simulation, exon_simulation = args.motif_file, args.output_dir, args.results_dir, args.required_simulations, args.motif_simulation, args.exon_simulation

    if not required_simulations:
        raise ValueError("You must specify the number of simulations you require.")

    gen.create_output_directories(output_dir)

    if motif_simulation:
        simulation_sets = []

        # create the output directory for the particular motif set
        motif_output_dir = "{0}/{1}".format(output_dir, ".".join(motif_file.split('.')[:-1]).split('/')[-1])
        gen.create_output_directories(motif_output_dir)


        simulated_motifs_output = "{0}/simulations_{1}.txt".format(motif_output_dir, required_simulations)
        output_file = "{0}/stop_counts_{1}.txt".format(motif_output_dir, required_simulations)

        # add the files to the required list
        simulation_sets.append([motif_file, simulated_motifs_output, output_file])

        # run the simulations
        run_simulations(simulation_sets, required_simulations)

    exon_hexamer_simulation = "{0}/region_hexamer_sim.csv".format(output_dir)
    if exon_simulation:
        exon_fasta = "{0}_CDS_intervals.fasta".format(results_dir)
        run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, exon_hexamer_simulation)
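
# Hypothetical invocation (the script name is an assumption; the argument
# order follows the gen.parse_arguments call above):
#   python stop_codon_simulation.py motifs.txt output_dir results_dir 1000 --motif_simulation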
Example #2
def motif_codon_density(motif_file, output_directory):

    stops = ["TAA", "TAG", "TGA"]
    gc_matched_motifs_file = "{0}/gc_matched_combinations.bed".format(
        output_directory)
    if not os.path.isfile(gc_matched_motifs_file):
        seqo.get_gc_matched_motifs(stops, gc_matched_motifs_file)

    temp_dir = "temp_motif_density"
    gen.create_output_directories(temp_dir)

    motif_sets = gen.read_many_fields(gc_matched_motifs_file, "\t")
    # include the real stop codons as the final set
    motif_sets.append(stops)

    args = [motif_file, temp_dir]
    outputs = simoc.run_simulation_function(motif_sets,
                                            args,
                                            ops.calc_codon_density_in_motifs,
                                            sim_run=False)

    new_output_dir = "{0}/motif_densities".format(output_directory)
    gen.create_output_directories(new_output_dir)

    output_file = "{0}/{1}.csv".format(new_output_dir,
                                       motif_file.split("/")[-1].split(".")[0])
    with open(output_file, "w") as outfile:
        outfile.write("id,motifs,density\n")
        for i, file in enumerate(sorted(outputs)):
            data = gen.read_many_fields(file, ",")[0]
            outfile.write("{0},{1},{2}\n".format(i + 1, data[0], data[1]))

    gen.remove_directory(temp_dir)
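
# Usage sketch with hypothetical paths (seqo, simoc and ops are helpers from
# the surrounding codebase):
# motif_codon_density("motif_sets/motifs.txt", "results/motif_tests")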
Example #3
def get_conservation(transcript_list,
                     output_file,
                     max_dS_threshold=None,
                     max_omega_threshold=None):
    """
    Get the conservation for a list of sequences and keep only those that pass the thresholds

    Args:
        transcript_list (dict): dict containing transcript id, the cds and the ortholog seqs
        output_file (str): path to output file
        max_dS_threshold (float): if set, the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, the omega threshold you wish alignments to be below
    """

    print("Getting the most conserved ortholog for each transcript...")

    temp_dir = "temp_conservation_files"
    gen.create_output_directories(temp_dir)
    # get a list of the transcript ids
    transcript_ids = list(transcript_list.keys())
    # transcript_ids = transcript_ids[:200]
    # run this linearly because it doesn't like being parallelised
    # outputs = run_conservation_check(transcript_ids, transcript_list, max_dS_threshold, max_omega_threshold, temp_dir)
    outputs = gen.run_parallel_function(
        transcript_ids,
        [transcript_list, max_dS_threshold, max_omega_threshold, temp_dir],
        run_conservation_check,
        parallel=False)
    # remove the old output file if there is one
    gen.remove_file(output_file)
    # now concatenate the output files
    args = ["cat"] + outputs
    gen.run_process(args, file_for_output=output_file)
    gen.remove_directory(temp_dir)
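
# Sketch of the expected input shape (ids and sequences are hypothetical; each
# entry maps a transcript id to its CDS and the ortholog sequences):
# transcript_list = {"ENST01": ["ATG...TGA", {"orth1": "ATG...TGA"}]}
# get_conservation(transcript_list, "best_orthologs.bed", max_dS_threshold=1)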
Example #4
def calc_ds(aligned_sequences):
    # note: IUPAC alphabets require Biopython < 1.78, where Bio.Alphabet was removed
    aligned_sequences_iupac = [
        Seq("".join(i), IUPAC.unambiguous_dna) for i in aligned_sequences
    ]
    alignment = MultipleSeqAlignment([
        SeqRecord(aligned_sequences_iupac[0], id="seq"),
        SeqRecord(aligned_sequences_iupac[1], id="orth_seq")
    ])
    gen.create_output_directories("temp_files")
    random_instance = random.random()
    temp_phylip_file = "temp_files/{0}.phy".format(random_instance)
    temp_output_file = "temp_files/{0}.out".format(random_instance)
    fo.write_to_phylip(alignment, temp_phylip_file)
    # run paml on the sequences in a throwaway working directory
    working_dir = "temp_dir.{0}".format(random.random())
    paml = sequo.PAML_Functions(input_file=temp_phylip_file,
                                output_file=temp_output_file,
                                working_dir=working_dir)
    # run codeml
    codeml_output = paml.run_codeml()
    ds = codeml_output["NSsites"][0]["parameters"]["dS"]
    # clean up files
    gen.remove_file(temp_phylip_file)
    gen.remove_file(temp_output_file)
    paml.cleanup()

    return ds
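
# Minimal usage sketch (assumes an in-frame, gapped codon alignment and that
# PAML's codeml is installed and on the PATH):
# ds = calc_ds(["ATGAAACCC---TGA", "ATGAAACCCAAATGA"])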
Example #5
def extract_second_seqs(input_bed, input_file, genome_fasta, output_dir):
    """
    Extract the second set of sequences
    """
    # get a set of ids that correspond only to lincrna entries
    id_file = "{0}/lncrna_ids.txt".format(output_dir)
    extract_lncrna_only(input_file, id_file)

    # now keep only the bed entries that are in the id list
    filtered_bed = "{0}.filtered".format(input_bed)
    # flatten the id rows so the membership check compares strings, not lists
    ids = [i[0] for i in gen.read_many_fields(id_file, "\t")]
    bed_entries = gen.read_many_fields(input_bed, "\t")
    with open(filtered_bed, "w") as outfile:
        for entry in bed_entries:
            if entry[3] in ids:
                outfile.write("{0}\n".format("\t".join(entry)))

    # now write the bed to an exon bed
    exons_bed = "{0}.exons.bed".format(input_bed)
    fo.entries_to_bed(filtered_bed, exons_bed, hg38=True)
    # now get the exon sequences
    exons_fasta = "{0}.exons.fasta".format(input_bed)
    fo.fasta_from_intervals(exons_bed,
                            exons_fasta,
                            genome_fasta,
                            force_strand=True,
                            names=True)

    # now generate the full transcript for multi exon transcripts
    transcripts_fasta = "{0}.multi_exon_transcripts.fasta".format(input_bed)
    names, seqs = gen.read_fasta(exons_fasta)
    seq_list = collections.defaultdict(lambda: collections.defaultdict())
    for i, name in enumerate(names):
        id = ".".join(name.split("(")[0].split(".")[:-1])
        exon = int(name.split("(")[0].split(".")[-1])
        seq_list[id][exon] = seqs[i]
    with open(transcripts_fasta, "w") as outfile:
        for id in sorted(seq_list):
            if len(seq_list[id]) > 1:
                exon_list = []
                for exon in sorted(seq_list[id]):
                    exon_list.append(seq_list[id][exon])
                seq = "".join(exon_list)
                if "N" not in seq and len(seq) >= 200:
                    # convert the separators to : here, as . would otherwise ruin the sorting later
                    id = ":".join(id.split("."))
                    outfile.write(">{0}\n{1}\n".format(id, seq))

    # blast to get paralogous families
    blast_db_path = "{0}/blast_db".format(output_dir)
    output_blast_file = "{0}/blast_output.csv".format(output_dir)
    families_file = "{0}/families.txt".format(output_dir)
    gen.create_output_directories(blast_db_path)
    cons.filter_families(transcripts_fasta,
                         output_blast_file,
                         families_file,
                         database_path=blast_db_path,
                         clean_run=True)
Example #6
def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and calculate purine content

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to run
        families_file (str): if set, path to families file
    """

    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)
    # get the motifs
    motifs = sequo.read_motifs(motif_file)
    # if there are not enough simulations, generate them
    if required_simulations and len(os.listdir(hexamers_dir)) < required_simulations:
        required = list(range(required_simulations - len(os.listdir(hexamers_dir))))
        names, seqs = gen.read_fasta(input_fasta)
        seqs_list = collections.defaultdict(list)
        for i, name in enumerate(names):
            seqs_list[name.split(".")[0]].append(seqs[i])

        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)

        all_seqs = []
        for i in seqs_list:
            all_seqs.extend(seqs_list[i])
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for file in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, file)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(gen.stringify([real_nt_content[i] for i in sorted(real_nt_content)]))))
        for i in range(len(test_purine_content)):
            outfile.write("{0},{1},{2}\n".format(i+1, test_purine_content[i], ",".join(gen.stringify([test_nt_content[i][j] for j in sorted(test_nt_content[i])]))))

    # remove the output directory
    gen.remove_directory(hexamers_dir)
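
# Usage sketch with hypothetical paths (sequo and simopc are helpers from the
# surrounding codebase):
# intron_hexamer_test("introns.fasta", "motifs.txt", "hexamer_out",
#                     "hexamer_out/purine_content.csv", required_simulations=1000)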
Example #7
def align_sequences(muscle_exe,
                    seq1,
                    seq2,
                    seq1_id=None,
                    seq2_id=None,
                    temp_input_file=None,
                    temp_output_file=None):
    """
    Align two protein sequences using Muscle.

    Args:
        muscle_exe (str): path to muscle tool
        seq1 (str): protein sequence 1
        seq2 (str): protein sequence 2
        seq1_id (str): if set, the id for sequence 1
        seq2_id (str): if set, the id for sequence 2
        temp_input_file (str): if set, the path to the alignment input file
        temp_output_file (str): if set, the path to the alignment output file
    """

    # create temp files for running alignment
    temp_dir = "temp_alignment"
    gen.create_output_directories(temp_dir)
    # create the random alignment files
    random_alignment = random.random()
    if not temp_input_file:
        temp_input_file = "{0}/protein_alignment_input_{1}.fasta".format(
            temp_dir, random_alignment)
    if not temp_output_file:
        temp_output_file = "{0}/protein_alignment_output_{1}.fasta".format(
            temp_dir, random_alignment)
    # in case the sequence ids are not set
    if not seq1_id:
        seq1_id = "seq_id_{0}_1".format(random.random())
    if not seq2_id:
        seq2_id = "{0}_2".format(seq1_id[:-2])
    # write the temporary alignment file
    with open(temp_input_file, "w") as temp_file:
        temp_file.write(">{0}\n{1}\n>{2}\n{3}\n".format(
            seq1_id, seq1, seq2_id, seq2))
    # build the muscle command line wrapper
    muscle_output = MuscleCommandline(muscle_exe,
                                      input=temp_input_file,
                                      out=temp_output_file)
    # calling the wrapper executes the alignment
    muscle_output()

    return temp_input_file, temp_output_file
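
# Minimal usage sketch (the muscle binary path is an assumption):
# in_file, out_file = align_sequences("/usr/local/bin/muscle", "MKVLLT", "MKILLT",
#                                     seq1_id="seq1", seq2_id="seq2")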
Example #8
def sort_bed(input_file, output_file):
    """
    Sort a bed file.

    Args:
        input_file (str): path to the input file
        output_file (str): path to the output file
    """

    # stage the sorted output in a temp file so the input and output paths can be the same
    gen.create_output_directories("temp_data")
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    gen.run_process(["sortBed", "-i", input_file],
                    file_for_output=temp_file_name)
    # the move consumes the temp file, so no separate cleanup is needed
    gen.run_process(["mv", temp_file_name, output_file])
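
# Usage sketch (assumes bedtools' sortBed is on the PATH); passing the same
# path twice sorts a file in place, since the result is staged in temp_data:
# sort_bed("exons.bed", "exons.bed")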
Example #9
def motif_stop_codon_densities(motif_file, motif_controls_directory, required_simulations, output_file):

    filelist = {"real": motif_file}
    for i, file in enumerate(os.listdir(motif_controls_directory)[:required_simulations]):
        filelist[i] = "{0}/{1}".format(motif_controls_directory, file)
    file_ids = [i for i in filelist]

    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)

    args = [filelist, temp_dir]
    outputs = simopc.run_simulation_function(file_ids, args, calculate_stop_codon_densities, sim_run = False)

    with open(output_file, "w") as outfile:
        outfile.write("sim_id,stop_density\n")
        for file in outputs:
            outfile.write("{0}\n".format(",".join(gen.read_many_fields(file, "\t")[0])))
Example #10
def cds_motif_test(cds_fasta, output_file):

    nts = ["A", "C", "G", "T"]
    stops = ["TAA", "TAG", "TGA"]

    codon_list = sorted(
        ["".join(codon) for codon in it.product(nts, nts, nts)])
    combinations = [sorted(i) for i in it.combinations(codon_list, 3)]
    # combination_not_all_stops = [i for i in combinations if len(list(set(i) & set(stops))) < 3]

    # combinations = combinations[:30]

    temp_dir = "temp_motif_densities"
    gen.create_output_directories(temp_dir)

    args = [cds_fasta, temp_dir]
    outputs = simoc.run_simulation_function(combinations,
                                            args,
                                            ops.calc_motif_densities,
                                            sim_run=False)

    temp_filelist = list(outputs)

    densities = collections.defaultdict(lambda: collections.defaultdict())

    for file in temp_filelist:
        motif = file.split("/")[-1].split(".")[0]
        data = gen.read_many_fields(file, ",")[0]
        gc = data[0]
        density = data[1]
        densities[gc][motif] = density

    iterator = 0
    with open(output_file, "w") as outfile:
        outfile.write("id,motif,gc,density\n")
        for gc in sorted(densities):
            for motif in sorted(densities[gc]):
                iterator += 1
                outfile.write("{0},{1},{2},{3}\n".format(
                    iterator, motif, gc, densities[gc][motif]))
Example #11
def blast_all_against_all(fasta_file,
                          output_file,
                          database_path=None,
                          remove_database=None,
                          clean_run=None):
    """
    Blast all sequences against all other sequences

    Args:
        fasta_file (str): path to fasta file containing sequences
        output_file (str): path to output file
        database_path (str): if not set, use temp dir
        remove_database (bool): if set, remove the database once blast has run
        clean_run (bool): if set, run new blast
    """

    print("BLASTing sequences against each other...")

    # remove the old database if we want a clean run
    if clean_run and database_path:
        gen.remove_directory(database_path)

    # create the blast database
    if not database_path:
        database_path = "temp_blast_db/{0}".format(random.random())
        print("Temp blast db: {0}".format(database_path))
    gen.create_output_directories(database_path)

    # get the list of files
    filelist = ["blast.nhr", "blast.nin", "blast.nsq"]
    files_present = [i for i in filelist if i in os.listdir(database_path)]

    database_path = "{0}/blast".format(database_path)
    if len(filelist) != len(files_present) or clean_run:
        make_blast_database(fasta_file, database_path)
    # now blast each sequence against each other
    blast_sequences(fasta_file, database_path, output_file)
    # remove the database
    if remove_database:
        gen.remove_directory(database_path)
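
# Usage sketch with hypothetical paths (assumes NCBI BLAST+ is installed and
# that make_blast_database/blast_sequences wrap its command line tools):
# blast_all_against_all("transcripts.fasta", "blast_hits.csv", remove_database=True)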
Example #12
def run_conservation_check(input_list, transcript_list, max_dS_threshold,
                           max_omega_threshold, temp_dir):
    """
    Wrapper to run the conservation check in parallel

    Args:
        input_list (list): list of transcript ids to iterate over
        transcript_list (dict): dict containing transcript id, the cds and the ortholog seqs
        max_dS_threshold (float): if set, the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, the omega threshold you wish alignments to be below
        temp_dir (str): path to the directory that holds the temporary output files
    """
    # create a list to keep temporary outputs
    temp_filelist = []
    temp_instance_dir = "temp_codeml_dir.{0}".format(random.random())
    gen.create_output_directories(temp_instance_dir)

    if input_list:
        temp_file = "{0}/best_ortholog_match.{1}.bed".format(
            temp_dir, random.random())
        temp_filelist.append(temp_file)
        with open(temp_file, "w") as outfile:
            # get best ortholog for each transcript
            for i, transcript_id in enumerate(input_list):
                print("{0}/{1}".format(i + 1, len(input_list)))
                ortholog_id = check_conservation(
                    transcript_id,
                    transcript_list[transcript_id][0],
                    transcript_list[transcript_id][1],
                    temp_instance_dir,
                    max_dS_threshold=max_dS_threshold,
                    max_omega_threshold=max_omega_threshold)
                if ortholog_id:
                    outfile.write("{0}\t{1}\n".format(transcript_id,
                                                      ortholog_id))
    gen.remove_directory(temp_instance_dir)
    return temp_filelist
Example #13
def motif_codon_densities(motif_file, codon_combinations_file, motif_controls_directory, required_simulations, output_file):

    filelist = {"real": motif_file}
    for i, file in enumerate(os.listdir(motif_controls_directory)[:required_simulations]):
        filelist[i] = "{0}/{1}".format(motif_controls_directory, file)
    file_ids = [i for i in filelist]


    codon_sets = gen.read_many_fields(codon_combinations_file, "\t")


    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)
    args = [filelist, codon_sets, temp_dir]
    outputs = simopc.run_simulation_function(file_ids, args, calculate_motif_densities, sim_run = False)

    real_density_list = {}
    sim_density_list = collections.defaultdict(list)

    for file in outputs:
        results = gen.read_many_fields(file, "\t")
        if "real" in file:
            for i in results:
                real_density_list[i[0]] = float(i[1])
        else:
            for i in results:
                sim_density_list[i[0]].append(float(i[1]))

    with open(output_file, "w") as outfile:
        outfile.write("codons,gc_content,purine_content,density,nd\n")
        for codon_set in sorted(real_density_list):
            # normalised deviation: (real - mean(simulated)) / mean(simulated)
            nd = np.divide(real_density_list[codon_set] - np.mean(sim_density_list[codon_set]), np.mean(sim_density_list[codon_set]))
            outputs = [codon_set, seqo.calc_gc_seqs_combined(codon_set.split("_")), sequo.calc_purine_content(codon_set.split("_")), real_density_list[codon_set], nd]
            outfile.write("{0}\n".format(",".join(gen.stringify(outputs))))

    gen.remove_directory(temp_dir)
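
# Each output row pairs the real density with a normalised deviation (nd):
# the real density minus the simulant mean, divided by the simulant mean, so
# nd > 0 indicates enrichment relative to the controls.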
Example #14
def main():

    arguments = ["output_directory", "motif_file", "simulations", "controls_directory", "exons_fasta", "motifs_stop_density", "motif_stop_codon_densities_sim", "motif_codon_densities", "motif_densities_exon_dinucleotides", "generate_motif_controls", "match_density", "match_subs"]

    description = ""
    args = gen.parse_arguments(description, arguments, opt_flags=[2,3,4], flags = [5,6,7,8,9,10,11])
    output_directory, motif_file, simulations, controls_directory, exons_fasta, motifs_stop_density, motif_stop_codon_densities_sim, motif_codon_densities, motif_densities_exon_dinucleotides, generate_motif_controls, match_density, match_subs = args.output_directory, args.motif_file, args.simulations, args.controls_directory, args.exons_fasta, args.motifs_stop_density, args.motif_stop_codon_densities_sim, args.motif_codon_densities, args.motif_densities_exon_dinucleotides, args.generate_motif_controls, args.match_density, args.match_subs

    # cast the simulations argument to an int
    if simulations:
        simulations = int(simulations)

    # create the global output directory
    global_output_directory = "{0}/motif_tests".format(output_directory)
    gen.create_output_directories(global_output_directory)

    # if we want to generate the controls
    if generate_motif_controls:
        simopc.generate_motif_dinucleotide_controls(motif_file, simulations, output_directory, match_density = match_density, match_subs = match_subs)

    # get the stop density in motifs and non-motifs of the same length
    if motifs_stop_density:
        mtop.calc_stop_densities(motif_file)

    # calculate stop codon densities in the motif sets
    if motif_stop_codon_densities_sim:
        # create a local output directory
        local_output_directory = "{0}/motif_stop_density_simulations".format(global_output_directory)
        gen.create_output_directories(local_output_directory)
        # output filepath
        output_file = "{0}/{1}_stop_codon_densities.csv".format(local_output_directory, motif_file.split("/")[-1].split(".")[0])
        # bail out if there are not enough pre-generated controls
        if simulations > len(os.listdir(controls_directory)):
            raise ValueError("Please create more simulants...")
        # calculate densities
        mtop.motif_stop_codon_densities(motif_file, controls_directory, simulations, output_file)

    # calculate other codon densities in motif sets
    if motif_codon_densities:
        # the canonical stop codons (otherwise undefined in this scope)
        stops = ["TAA", "TAG", "TGA"]
        local_output_directory = "{0}/codon_combination_densities".format(global_output_directory)
        gen.create_output_directories(local_output_directory)
        # get all the possible sets of 3 unique codon combinations
        codon_combinations_file = "{0}/codon_combinations.txt".format(local_output_directory)
        if not os.path.isfile(codon_combinations_file):
            seqo.generate_all_motif_combinations(stops, codon_combinations_file)

        output_file = "{0}/{1}_codon_combination_densities.csv".format(local_output_directory, motif_file.split("/")[-1].split(".")[0])
        if simulations > len(os.listdir(controls_directory)):
            gen.remove_directory(controls_directory)
            simopc.generate_motif_controls(motif_file, simulations, controls_directory, match_density = False)
        mtop.motif_codon_densities(motif_file, codon_combinations_file, controls_directory, simulations, output_file)
Example #15
names, seqs = gen.read_fasta(seqs_file)
seq_list = {
    name: seqs[i]
    for i, name in enumerate(names) if "N" not in seqs[i]
}
seq_list = sequo.pick_random_family_member(families_file, seq_list)


def randomise_seq(seq):
    nts = list(seq)
    np.random.shuffle(nts)
    return "".join(nts)


output_directory = "temp_shuffle_linc"
gen.create_output_directories(output_directory)


def run_simulations(iterations, seq_list, codon_sets, output_directory):
    outputs = []
    if len(iterations) > 0:
        np.random.seed()

        for i, iteration in enumerate(iterations):
            print("{0}/{1}".format(i + 1, len(iterations)))

            if iteration != "real":
                new_seqs = []
                for id in seq_list:
                    new_seqs.append(randomise_seq(seq_list[id]))
            else:
                # the "real" run keeps the sequences unchanged
                new_seqs = [seq_list[id] for id in seq_list]
Example #16
def main():

    arguments = ["working_directory", "output_directory", "genome_path", "input_bed", "input_fasta", "clean_run", "extract_exon_intron_bed", "extract_exons", "extract_introns", "sort_by_exon_number", "build_transcripts", "extract_families", "orf_length_sim"]
    description = "Wrapper for miscellaneous operations on lincRNA"
    args = gen.parse_arguments(description, arguments, flags = [5,6,7,8,9,10,11,12,13], opt_flags = [2,3,4])
    working_directory, output_directory, genome_path, input_bed, input_fasta, clean_run, extract_exon_intron_bed, extract_exons, extract_introns, sort_by_exon_number, build_transcripts, extract_families, orf_length_sim = args.working_directory, args.output_directory, args.genome_path, args.input_bed, args.input_fasta, args.clean_run, args.extract_exon_intron_bed, args.extract_exons, args.extract_introns, args.sort_by_exon_number, args.build_transcripts, args.extract_families, args.orf_length_sim

    # create the directories
    gen.create_output_directories(working_directory)
    gen.create_output_directories(output_directory)

    # file paths
    exons_bed = "{0}/exons.bed".format(working_directory)
    single_exons_bed = "{0}/single_exons.bed".format(working_directory)
    multi_exons_bed = "{0}/multi_exons.bed".format(working_directory)
    exons_fasta = "{0}/exons.fasta".format(working_directory)
    single_exons_fasta = "{0}/single_exons.fasta".format(working_directory)
    multi_exons_fasta = "{0}/multi_exons.fasta".format(working_directory)
    introns_bed = "{0}/introns.bed".format(working_directory)
    introns_fasta = "{0}/introns.fasta".format(working_directory)
    transcript_sequences_fasta = "{0}/transcript_sequences.fasta".format(working_directory)
    multi_exon_transcript_sequences_fasta = "{0}/multi_exon_transcript_sequences.fasta".format(working_directory)
    multi_exon_blast_file = "{0}/multi_exons_blast_all_against_all.csv".format(working_directory)
    multi_exon_blast_database = "{0}/multi_exon_blast_all_against_all".format(working_directory)
    multi_exon_families_file = "{0}/multi_exon_families.txt".format(working_directory)

    # create the exons and introns files from bed
    if extract_exon_intron_bed:
        # copy the main file to the folder
        gen.copy_file(input_bed, "{0}/{1}".format(working_directory, input_bed.split("/")[-1]))
        # extract the features
        lmo.extract_bed_coordinates_block_format(input_bed, exons_bed, introns_bed)
    # split the exons into single-exon and multi-exon files
    if sort_by_exon_number:
        gen.check_files_exists([exons_bed])
        lmo.sort_by_exon_number(exons_bed, single_exons_bed, multi_exons_bed)

    # get exons
    if extract_exons:
        gen.check_files_exists([exons_bed])
        fo.fasta_from_intervals(exons_bed, exons_fasta, genome_path, names=True)
        # if the single exons bed file exists, get just the single exon sequences
        if os.path.isfile(single_exons_bed):
            lmo.sort_fasta_by_bed(single_exons_bed, exons_fasta, single_exons_fasta)
        # if the multi exons bed file exists, get just the multi exon sequences
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, exons_fasta, multi_exons_fasta)

    # get introns
    if extract_introns:
        gen.check_files_exists([introns_bed])
        fo.fasta_from_intervals(introns_bed, introns_fasta, genome_path, names=True)

    # build transcripts
    if build_transcripts:
        gen.check_files_exists([exons_fasta])
        lmo.build_transcripts(exons_fasta, transcript_sequences_fasta)
        # if the multi exons bed file exists, get just the multi exon sequences
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, transcript_sequences_fasta, multi_exon_transcript_sequences_fasta)

    # now group into paralogous families
    if extract_families:
        gen.check_files_exists([multi_exon_transcript_sequences_fasta])
        cons.filter_families(multi_exon_transcript_sequences_fasta, multi_exon_blast_file, multi_exon_families_file, database_path = multi_exon_blast_database, clean_run = clean_run)
Example #17
def main():

    description = "Look at disease snps."
    arguments = ["disease_snps_file", "output_directory", "results_prefix", "simulations", "ese_file", "intersect_snps", "get_relative_positions", "get_snp_status", "get_info", "simulate_ptc_location", "get_possible_ptc_locations", "required_simulations", "get_overlaps", "intersect_ptcs", "compare_ptcs" ,"get_introns", "compare_distances", "clinvar_ptc_locations", "location_simulation", "exclude_cpg", "ese_hit_simulation", "only_disease", "only_kgenomes", "only_ese", "get_unique_ptcs", "get_unique_rel_pos", "excess_test", "disease_locations_chisquare"]
    args = gen.parse_arguments(description, arguments, flags = [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, 21, 22, 23,24,25,26,27], ints=[3])
    disease_snps_file, output_directory, results_prefix, simulations, ese_file, intersect_snps, get_relative_positions, get_snp_status, get_info, simulate_ptc_location, get_possible_ptc_locations, required_simulations, get_overlaps, intersect_ptcs, compare_ptcs, get_introns, compare_distances, clinvar_ptc_locations, location_simulation, exclude_cpg, ese_hit_simulation, only_disease, only_kgenomes, only_ese, get_unique_ptcs, get_unique_rel_pos, excess_test, disease_locations_chisquare = args.disease_snps_file, args.output_directory, args.results_prefix, args.simulations, args.ese_file, args.intersect_snps, args.get_relative_positions, args.get_snp_status, args.get_info, args.simulate_ptc_location, args.get_possible_ptc_locations, args.required_simulations, args.get_overlaps, args.intersect_ptcs, args.compare_ptcs, args.get_introns, args.compare_distances, args.clinvar_ptc_locations, args.location_simulation, args.exclude_cpg, args.ese_hit_simulation, args.only_disease, args.only_kgenomes, args.only_ese, args.get_unique_ptcs, args.get_unique_rel_pos, args.excess_test, args.disease_locations_chisquare

    if simulations and not isinstance(simulations, int):
        raise ValueError("Please provide the correct number for simulations.")

    # create the output directory if it doesn't already exist
    gen.create_output_directories(output_directory)

    # disease_snps_file = "./source_data/clinvar_20180429.vcf.gz"
    disease_snps_index_file = "{0}.tbi".format(disease_snps_file)

    if not os.path.isfile(disease_snps_file) or not os.path.isfile(disease_snps_index_file):
        raise FileNotFoundError("Please provide the required disease SNPs file(s).")

    # intersect the coding exons with the disease snps
    exon_bed = "{0}_coding_exons.bed".format(results_prefix)
    disease_snp_intersect_file_vcf = "{0}/disease_snp_intersect.vcf".format(output_directory)
    disease_snp_intersect_file_bed = "{0}/disease_snp_intersect.bed".format(output_directory)
    if intersect_snps:
        print("Intersecting snps with exons")
        so.intersect_snps_parallel(exon_bed, disease_snps_file, disease_snp_intersect_file_vcf)
        so.intersect_vcf_to_bed(exon_bed, disease_snp_intersect_file_vcf, disease_snp_intersect_file_bed, change_names = True)

    # get relative positions of the snps in cds and exons
    full_bed = "{0}_CDS.bed".format(results_prefix)
    disease_snps_relative_exon_positions = "{0}/disease_snp_relative_exon_positions.bed".format(output_directory)
    disease_snps_relative_cds_positions = "{0}/disease_snp_relative_cds_positions.bed".format(output_directory)
    if get_relative_positions:
        print("Getting snp relative positions...")
        so.get_snp_relative_exon_position(disease_snp_intersect_file_bed, disease_snps_relative_exon_positions)
        # read the positions back into a variable because the next function expects a list
        relative_positions = gen.read_many_fields(disease_snps_relative_exon_positions, "\t")
        so.get_snp_relative_cds_position(relative_positions, disease_snps_relative_cds_positions, full_bed)

    # get the change status of the snps to check them
    cds_fasta = "{0}_CDS.fasta".format(results_prefix)
    disease_ptcs_file = "{0}/disease_ptcs.txt".format(output_directory)
    disease_other_file = "{0}/disease_other_snps.txt".format(output_directory)
    if get_snp_status:
        print("Getting snp status...")
        so.get_snp_change_status(disease_snps_relative_cds_positions, cds_fasta, disease_ptcs_file, disease_other_file)

    # get intersect between the clinvar ptcs and 1000 genomes ptcs
    ptc_file = "{0}_ptc_file.txt".format(results_prefix)
    ptc_intersect_file = "{0}/ptc_intersect.bed".format(output_directory)
    if intersect_ptcs:
        # make sure the staging directory exists before writing temp files
        gen.create_output_directories("temp_data")
        temp_disease_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(disease_ptcs_file, temp_disease_ptc_file)
        temp_k_genomes_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(ptc_file, temp_k_genomes_ptc_file, header=True)
        bao.intersect_bed(temp_k_genomes_ptc_file, temp_disease_ptc_file, write_both = True, no_dups=False, output_file = ptc_intersect_file)
        gen.remove_file(temp_disease_ptc_file)
        gen.remove_file(temp_k_genomes_ptc_file)

    # get a list of ptcs unique to each dataset
    unique_ptcs = "{0}/disease_ptcs_no_intersect.bed".format(output_directory)
    unique_ptcs_kgenomes = "{0}/kgenomes_ptcs_no_intersect.bed".format(output_directory)
    if get_unique_ptcs:
        dso.get_unique_ptcs(disease_ptcs_file, ptc_file, ptc_intersect_file, unique_ptcs, unique_ptcs_kgenomes)

    # get the relative positions of the ptcs unique to each dataset
    unique_ptcs_rel_pos_file = "{0}/disease_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    kgenomes_relative_positions = "{0}_PTC_relative_exon_positions.bed".format(results_prefix)
    kgenomes_unique_ptcs_rel_pos_file = "{0}/kgenomes_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    if get_unique_rel_pos:
        dso.get_unique_rel_pos(unique_ptcs, disease_snps_relative_exon_positions, unique_ptcs_kgenomes, kgenomes_relative_positions, unique_ptcs_rel_pos_file, kgenomes_unique_ptcs_rel_pos_file)


    # get the ese file name
    ese_file_name = ese_file.split('/')[-1].split('.')[0]
    # get the coding exons fasta file path
    coding_exons_fasta = "{0}_coding_exons.fasta".format(results_prefix)

    # snp_relative_positions_file = "{0}_SNP_relative_exon_position.bed".format(results_prefix)

    # simulation picking random reference allele matched simulants
    clinvar_location_simulation_file = "{0}/clinvar_ptc_location_simulation.csv".format(output_directory)
    clinvar_location_simulation_ese_overlap_file = "{0}/clinvar_ptc_location_simulation_{1}_ese_overlaps.csv".format(output_directory, ese_file_name)
    kgenomes_location_simulation_file = "{0}/1000_genomes_simulations.csv".format(output_directory)
    kgenomes_location_simulation_ese_overlap_file = "{0}/1000_genomes_simulations_ese_overlaps.csv".format(output_directory)

    if location_simulation:
        if not only_kgenomes:
            print('Running ptc location simulation on disease PTCs...')
            dso.ptc_location_simulation(unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, clinvar_location_simulation_file, clinvar_location_simulation_ese_overlap_file, ese_file, only_ese, exclude_cpg)
        if not only_disease:
            print('Running ptc location simulation on 1000 genomes PTCs...')
            dso.ptc_location_simulation(kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, kgenomes_location_simulation_file, kgenomes_location_simulation_ese_overlap_file, ese_file, only_ese, exclude_cpg)


    window_start = 3
    window_end = 69
    clinvar_ese_hit_simulation_file = "{0}/clinvar_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name)
    kgenomes_ese_hit_simulation_file = "{0}/1000_genomes_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name)

    # do a simulation picking only sites from within the region
    if ese_hit_simulation:
        if not only_kgenomes:
            print("Simulating ESE hits on the {0}-{1} region for disease PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, clinvar_ese_hit_simulation_file, ese_file, window_start, window_end, exclude_cpg)
        if not only_disease:
            print("Simulating ESE hits on the {0}-{1} region for 1000 genomes PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, kgenomes_ese_hit_simulation_file, ese_file, window_start, window_end, exclude_cpg)


    excess_test_file = "{0}/clinvar_ptc_{1}_{2}_excesses.csv".format(output_directory, window_start, window_end)
    if excess_test:
        dso.excess_test(unique_ptcs_rel_pos_file, coding_exons_fasta, excess_test_file)

    location_test_file = "{0}/clinvar_locations_chisquare.csv".format(output_directory)
    if disease_locations_chisquare:
        dso.disease_ptc_location_test(unique_ptcs_rel_pos_file, coding_exons_fasta, location_test_file)
Example #18
def get_coding_exons(exons_file,
                     cds_file,
                     output_file,
                     remove_overlapping=False):
    """
    Given a bed file of exon coordinates and a bed file of CDS coordinates,
    write a new bed file that only contains those exon coordinates from the former file that are
    1) fully coding 2) internal.
    NB! Assumes that all the coordinates are from non-overlapping transcripts. If this is not the case,
    set remove_overlapping to True and it'll remove overlapping intervals.
    Modified from LA and RS.

    Args:
        exons_file (str): path to the bed file containing exon coordinates
        cds_file (str): path to bed file containing the cds coordinates
        output_file (str): path to output file
        remove_overlapping (bool): if true, remove overlapping intervals
    """

    if remove_overlapping:
        sort_bed(exons_file, exons_file)
        remove_bed_overlaps(exons_file, exons_file)
    # filter out anything that isn't fully coding
    # you have to write_both because you want to make sure that they
    # haven't been kept because of an overlap to a transcript that doesn't appear in the exons file
    gen.create_output_directories("temp_data")
    temp_file = "temp_data/temp{0}.txt".format(random.random())
    intersect_bed(exons_file,
                  cds_file,
                  overlap=1,
                  overlap_rec=True,
                  output_file=temp_file,
                  force_strand=True,
                  write_both=True,
                  no_dups=False,
                  no_name_check=False)
    # filter out terminal exons
    # in theory, there shouldn't be any left after the previous step
    # in practice, there may be unannotated UTRs, so it looks like we have a fully coding terminal exon,
    # whereas in reality, the exon is only partially coding
    temp_file2 = "temp_data/temp{0}.txt".format(random.random())
    with open(temp_file2, "w") as outfile:
        # figure out the rank of the last exon for each transcript
        filt_exons = gen.read_many_fields(exons_file, "\t")
        filt_exons = [i for i in filt_exons if len(i) > 3]
        names = [i[3].split(".") for i in filt_exons]
        names = gen.list_to_dict(names, 0, 1, as_list=True)
        names = {i: max([int(j) for j in names[i]]) for i in names}
        coding_exons = gen.read_many_fields(temp_file, "\t")
        for exon in coding_exons:
            overlap_name = exon[10].split(".")
            if overlap_name[0] in names:
                name = exon[3].split(".")
                if name[-1] != "1":
                    last_exon = names[name[0]]
                    if int(name[-1]) != last_exon:
                        exon = [str(i) for i in exon[:7]]
                        outfile.write("{0}\n".format("\t".join(exon)))
    sort_bed(temp_file2, temp_file2)
    gen.run_process([
        "mergeBed", "-i", temp_file2, "-c", "4,5,6,7", "-o",
        "distinct,distinct,distinct,distinct"
    ],
                    file_for_output=output_file)
    gen.remove_file(temp_file)
    gen.remove_file(temp_file2)
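
# Usage sketch with hypothetical paths (requires bedtools for the intersect
# and mergeBed steps):
# get_coding_exons("exons.bed", "CDS.bed", "coding_exons.bed",
#                  remove_overlapping=True)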
Example #19
def main():

    arguments = [
        "input_bed", "input_fasta", "output_directory", "input_fasta2",
        "input_file", "required_simulations", "motif_file", "families_file",
        "output_prefix", "controls_dir", "extract_sequences", "calc_gc",
        "density_sim", "get_exon_dint_controls", "get_intron_dint_controls",
        "exon_region_density", "compare_stop_density", "sim_orf_lengths",
        "sim_orf_lengths_masked", "sim_stop_density",
        "sim_stop_density_introns", "sim_stop_density_within_genes",
        "sim_stop_density_removed_motifs",
        "sim_stop_density_removed_motifs_sim_seqs", "sim_stop_density_diff",
        "exon_intron_density", "motif_nd", "excess_test", "single_exon",
        "motif_overlap", "motif_overlap_density", "clean_alignments",
        "seq_hits_linc", "upstream_atg", "excess_length_thresholds",
        "density_regions", "extract_second", "seq_no"
    ]
    description = "Container for analysis on lincRNAs"
    args = gen.parse_arguments(description,
                               arguments,
                               flags=[
                                   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                                   21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                                   32, 33, 34, 35, 36
                               ],
                               opt_flags=[3, 4, 5, 6, 7, 8, 9, 37])

    input_bed, \
    input_fasta, \
    output_directory, \
    input_fasta2, \
    input_file, \
    required_simulations, \
    motif_file, \
    families_file, \
    output_prefix, \
    controls_dir, \
    extract_sequences, \
    calc_gc, \
    density_sim,  \
    get_exon_dint_controls, \
    get_intron_dint_controls, \
    exon_region_density, \
    compare_stop_density, \
    sim_orf_lengths, \
    sim_orf_lengths_masked, \
    sim_stop_density, \
    sim_stop_density_introns, \
    sim_stop_density_within_genes, \
    sim_stop_density_removed_motifs, \
    sim_stop_density_removed_motifs_sim_seqs, \
    sim_stop_density_diff, \
    exon_intron_density, \
    motif_nd, \
    excess_test, \
    single_exon,\
    motif_overlap, \
    motif_overlap_density, \
    clean_alignments, \
    seq_hits_linc, \
    upstream_atg, \
    excess_length_thresholds, \
    density_regions, \
    extract_second, \
    seq_no = \
    args.input_bed, \
    args.input_fasta, \
    args.output_directory, \
    args.input_fasta2, \
    args.input_file, \
    args.required_simulations, \
    args.motif_file, \
    args.families_file, \
    args.output_prefix, \
    args.controls_dir, \
    args.extract_sequences, \
    args.calc_gc, \
    args.density_sim, \
    args.get_exon_dint_controls, \
    args.get_intron_dint_controls, \
    args.exon_region_density, \
    args.compare_stop_density, \
    args.sim_orf_lengths, \
    args.sim_orf_lengths_masked, \
    args.sim_stop_density, \
    args.sim_stop_density_introns, \
    args.sim_stop_density_within_genes, \
    args.sim_stop_density_removed_motifs, \
    args.sim_stop_density_removed_motifs_sim_seqs, \
    args.sim_stop_density_diff, \
    args.exon_intron_density, \
    args.motif_nd, \
    args.excess_test, \
    args.single_exon, \
    args.motif_overlap, \
    args.motif_overlap_density, \
    args.clean_alignments, \
    args.seq_hits_linc, \
    args.upstream_atg, \
    args.excess_length_thresholds, \
    args.density_regions, \
    args.extract_second, \
    args.seq_no

    # make required simulations an int
    required_simulations = int(
        required_simulations) if required_simulations else None
    # process the output prefix
    output_prefix = output_prefix + "_" if output_prefix else ""
    seq_no = int(seq_no) if seq_no else None

    # create output directories
    global_output_directory = "{0}/tests/lincrna".format(output_directory)
    gen.create_output_directories(global_output_directory)

    # set a start time
    start = time.time()


    # get the sequences
    if extract_sequences:
        lincRNA_single_exon_bed = "{0}/lincrna/lincRNA.single_exon.bed".format(
            output_directory)
        lincRNA_single_exon_fasta = "{0}/lincrna/lincRNA.single_exon.fasta".format(
            output_directory)
        lincRNA_single_exon_families = "{0}/lincrna/lincRNA.single_exon_families.bed".format(
            output_directory)
        lincRNA_multi_exon_bed = "{0}/lincrna/lincRNA.multi_exon.bed".format(
            output_directory)
        lincRNA_multi_exon_intron_bed = "{0}/lincrna/lincRNA.multi_exon.introns.bed".format(
            output_directory)
        lincRNA_multi_exon_fasta = "{0}/lincrna/lincRNA.multi_exon.fasta".format(
            output_directory)
        lincRNA_multi_exon_exons_fasta = "{0}/lincrna/lincRNA.multi_exon.exons.fasta".format(
            output_directory)
        lincRNA_multi_exon_intron_fasta = "{0}/lincrna/lincRNA.multi_exon.introns.fasta".format(
            output_directory)
        lincRNA_multi_exon_families = "{0}/lincrna/lincRNA.multi_exon_families.bed".format(
            output_directory)
        cont.extract_lincRNA_sequences(input_bed,
                                       input_fasta,
                                       lincRNA_single_exon_bed,
                                       lincRNA_multi_exon_bed,
                                       lincRNA_single_exon_fasta,
                                       lincRNA_multi_exon_fasta,
                                       lincRNA_multi_exon_intron_bed,
                                       lincRNA_multi_exon_intron_fasta,
                                       lincRNA_single_exon_families,
                                       lincRNA_multi_exon_families,
                                       clean_run=None)

    # clean the alignments to get in usable form
    # might need this
    if clean_alignments:
        # fill the path placeholders (the directory choice here is an assumption)
        output_exon_file = "{0}/clean_exon_alignments.fasta".format(global_output_directory)
        output_intron_file = "{0}/clean_intron_alignments.fasta".format(global_output_directory)
        ltests.clean_alignments(input_bed, input_fasta, output_exon_file,
                                output_intron_file)

    if calc_gc:
        output_file = "{0}/{1}_gc.csv".format(global_output_directory,
                                              output_prefix)
        ltests.calc_gc(input_fasta, output_file, families_file=families_file)

    # orf length test
    if sim_orf_lengths:
        sim_orf_length_output_file = "{0}/{1}sim_orf_lengths.csv".format(
            global_output_directory, output_prefix)
        if families_file:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs_grouped.csv".format(
                global_output_directory, output_prefix)
        else:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs.csv".format(
                global_output_directory, output_prefix)
        # run the test
        simopc.sim_orf_length(input_fasta, required_simulations,
                              sim_orf_length_output_file)
        ltests.process_length_sim(sim_orf_length_output_file,
                                  sim_orf_length_z_file,
                                  families_file=families_file)

    if sim_orf_lengths_masked:
        masked_output_file = "{0}_{1}_masked.csv".format(
            input_file.split(".")[0],
            motif_file.split("/")[-1].split(".")[0])
        # run the test
        simopc.sim_orf_length_masked(input_fasta,
                                     required_simulations,
                                     motif_file,
                                     input_file,
                                     controls_dir,
                                     masked_output_file,
                                     families_file=families_file)

    # stop density test
    if sim_stop_density:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1

        gen.create_output_directories(sim_stop_density_output_dir)

        for run in list(range(runs)):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # within genes
    if sim_stop_density_within_genes:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_within_gene_output_dir)
        for run in list(range(runs)):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_within_gene_output_dir, run + 1)
            ltests.sim_stop_density_within_genes(
                input_fasta,
                output_file,
                simulations=int(required_simulations),
                families_file=families_file)

        # process the outputs
        ltests.process_sim_stop_density_within_gene_outputs(
            sim_stop_density_within_gene_output_dir,
            sim_stop_density_within_gene_output_file)

    # stop density test in the introns
    if sim_stop_density_introns:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_output_dir)

        for run in list(range(runs)):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file,
                                    introns=True,
                                    input_fasta2=input_fasta2)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # remove motifs and test
    if sim_stop_density_removed_motifs:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in list(range(runs)):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs(
                input_fasta,
                run_output_file,
                motif_file,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    # remove motifs and test within seqs
    if sim_stop_density_removed_motifs_sim_seqs:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs_seq_sim(
                input_fasta,
                run_output_file,
                motif_file,
                controls_dir,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    if sim_stop_density_diff:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_diff_grouped_families".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_diff_grouped_families.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_diff_all_genes".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_diff_all_genes.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_diff(input_fasta,
                                         run_output_file,
                                         motif_file,
                                         controls_dir,
                                         simulations=int(required_simulations),
                                         families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_diffs(sim_output_dir,
                                              sim_output_file,
                                              greater_than=False)

    # get density in exons and introns
    if exon_intron_density:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/exon_intron_stop_density.csv".format(
            local_output_directory)
        ltests.exon_intron_stop_density(input_fasta,
                                        input_fasta2,
                                        output_file,
                                        families_file=families_file)

    # test whether there is an excess in flanks
    if excess_test:
        gen.check_files_exists([input_fasta, motif_file])
        # local output directory
        local_output_directory = "{0}/stop_excesses".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        # if the families file exists, group by family
        if families_file:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses_grouped.csv".format(
                local_output_directory,
                motif_file.split("/")[-1].split(".")[0])
        else:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses.csv".format(
                local_output_directory,
                motif_file.split("/")[-1].split(".")[0])
        # run the test
        ltests.excess_test(input_fasta,
                           motif_file,
                           excess_test_output_file,
                           simulations=required_simulations,
                           families_file=families_file)

    # stop density upstream of the ATG
    if upstream_atg:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/upstream_atg_stop_density.csv".format(
            local_output_directory)
        ltests.upstream_atg(input_fasta,
                            output_file,
                            simulations=int(required_simulations),
                            families_file=families_file)

    # calculate the density in the different regions
    if density_regions:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/stop_density_regions_chisq.csv".format(
            local_output_directory)
        output_file1 = "{0}/stop_density_regions1.csv".format(
            local_output_directory)
        output_file2 = "{0}/stop_density_regions_per_seq.csv".format(
            local_output_directory)
        ltests.density_regions(input_fasta,
                               motif_file,
                               output_file,
                               output_file1,
                               output_file2,
                               required_simulations=required_simulations,
                               families_file=families_file)

    # test hits to seqs
    if seq_hits_linc:
        local_output_dir = "{0}/ese_hits".format(global_output_directory)
        if output_prefix:
            tests_output_dir = "{0}/{1}_{2}".format(
                local_output_dir, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            final_output_file = "{0}/{1}_{2}_processed2.csv".format(
                local_output_dir, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
        else:
            tests_output_dir = "{0}/{1}".format(
                local_output_dir,
                motif_file.split("/")[-1].split(".")[0])
            final_output_file = "{0}/{1}_processed2.csv".format(
                local_output_dir,
                motif_file.split("/")[-1].split(".")[0])
        gen.create_output_directories(tests_output_dir)

        runs = 10
        for run in range(runs):
            if output_prefix:
                output_file = "{0}/{1}_{2}_hits_{3}.csv".format(
                    tests_output_dir, output_prefix,
                    motif_file.split("/")[-1].split(".")[0], run + 1)
            else:
                output_file = "{0}/{1}_hits_{2}.csv".format(
                    tests_output_dir,
                    motif_file.split("/")[-1].split(".")[0], run + 1)
            mto.calc_seq_hits_linc(input_fasta,
                                   output_file,
                                   motif_file,
                                   controls_dir,
                                   required_simulations=required_simulations,
                                   families_file=families_file)
        mto.process_seq_hits_linc(tests_output_dir, final_output_file)

    if excess_length_thresholds:
        local_output_dir = "{0}/orf_length_thresholds".format(
            global_output_directory)
        gen.create_output_directories(local_output_dir)
        ltests.orf_exceed_length_threshold(
            input_fasta,
            local_output_dir,
            required_simulations=required_simulations,
            families_file=families_file)

    # extract second set
    if extract_second:
        local_output_dir = "{0}/genome_sequences/lincrna/{1}".format(
            output_directory, output_prefix)
        lmisco.extract_second_seqs(input_bed, input_file, input_fasta,
                                   local_output_dir)
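
A rough sketch of the aggregation that calls like ltests.process_sim_stop_density_outputs presumably perform: read each per-run CSV and compare the real stop density against the simulated distribution with an empirical p-value. The file layout (one value per row, real value first) and helper names here are illustrative assumptions, not the project's actual API.

import csv
import os

def empirical_p(real, simulants, reverse=False):
    # p = (simulants at least as extreme as the real value + 1) / (simulants + 1);
    # reverse=True tests for depletion rather than excess
    if reverse:
        as_extreme = len([s for s in simulants if s <= real])
    else:
        as_extreme = len([s for s in simulants if s >= real])
    return (as_extreme + 1) / (len(simulants) + 1)

def process_outputs(sim_output_dir, output_file, reverse=False):
    # write one empirical p-value per simulation run
    with open(output_file, "w") as out:
        out.write("run,real,p\n")
        for i, name in enumerate(sorted(os.listdir(sim_output_dir))):
            with open(os.path.join(sim_output_dir, name)) as run_file:
                values = [float(row[0]) for row in csv.reader(run_file) if row]
            real, simulants = values[0], values[1:]
            out.write("{0},{1},{2}\n".format(
                i + 1, real, empirical_p(real, simulants, reverse)))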
Example #20
def process_bam_per_individual(bam_files, global_exon_junctions_file,
                               PTC_exon_junctions_file, out_folder, PTC_file,
                               syn_nonsyn_file, out_prefix,
                               exon_junctions_bam_output_folder, kw_dict):
    '''
    Do all of the processing on an individual bam, from filtering out low quality data to mapping reads to
    exon-exon junctions.
    For each exon, return information on how many reads fall at different exon-exon junctions.
    '''

    #parse keyword_dict
    #(kwargs arrive bundled in a dict to make this function easier to parallelize)
    ptc_snp_simulation = kw_dict.get("ptc_snp_simulation", False)
    simulation_instance_folder = kw_dict.get("simulation_instance_folder")
    simulation_number = kw_dict.get("simulation_number")
    overwrite_intersect = kw_dict.get("overwrite_intersect", False)
    phase = kw_dict.get("phase", False)

    bam_file_number = len(bam_files)
    for pos, bam_file in enumerate(bam_files):

        #Process:
        # 1. get the number of reads in the bam
        # 2. filter out reads that don't overlap exon-exon junctions
        # 3. filter out reads that don't overlap exon-exon junctions flanking PTC-containing exons
        # 4. filter bams by quality (this gives us a set of "good" quality reads)
        # 5. scale down the total read number proportionally to how many reads were lost in the quality filtering
        # 6. count reads either skipping or including each exon

        print("{0}/{1}: {2}".format(pos, bam_file_number, bam_file))
        sample_name = (bam_file.split("/")[-1]).split(".")[0]
        if ptc_snp_simulation:
            output_file = "{0}/{1}_simulation_{2}.txt".format(
                out_folder, sample_name, simulation_number)
        else:
            output_file = "{0}/{1}.txt".format(out_folder, sample_name)

        #folder that will contain all of the intermediate steps in the processing of the bam file
        if ptc_snp_simulation:
            proc_folder = "{0}/bam_proc_files".format(
                simulation_instance_folder)

        else:
            proc_folder = "{0}__analysis_bam_proc_files".format(out_prefix)

        gen.create_output_directories(proc_folder)

        bam_file_parts = os.path.split(bam_file)
        mapq_filtered_bam = "{0}/{1}_filtered_mapq.bam".format(
            proc_folder, bam_file_parts[1])
        mapq_flag_filtered_bam = "{0}_flag.bam".format(mapq_filtered_bam[:-4])
        mapq_flag_xt_filtered_bam = "{0}_xt.bam".format(
            mapq_flag_filtered_bam[:-4])
        mapq_flag_xt_nm_filtered_bam = "{0}_nm.bam".format(
            mapq_flag_xt_filtered_bam[:-4])

        if not os.path.isfile(output_file):

            #1: get a count of the total reads in the sample, which can be used for normalisation
            #initialise it to None for safety: that way, if the process fails,
            #it won't silently carry over the value from the end of the previous loop
            #also cache the count to disk per sample, because computing it takes a long time
            read_count_file_name = "{0}/read_count_{1}.txt".format(
                exon_junctions_bam_output_folder, sample_name)
            read_count = None
            if os.path.isfile(read_count_file_name):
                with open(read_count_file_name) as file:
                    read_count = int("".join(file))
            else:
                read_count = int(
                    gen.run_process(["samtools", "view", "-c", bam_file]))
                with open(read_count_file_name, "w") as file:
                    file.write(str(read_count))

            #2: intersect the bam with all exon-exon junctions
            #only has to be done once for each bam
            #also removing "_out_of_frame" from out_prefix if it is present
            global_out_prefix = out_prefix
            if "out_of_frame" in global_out_prefix:
                global_out_prefix = global_out_prefix.replace("_out_of_frame", "")
            global_intersect_bam = "{0}/{1}_exon_junctions.bam".format(
                exon_junctions_bam_output_folder, bam_file_parts[1][:-4])
            if not os.path.isfile(global_intersect_bam) or overwrite_intersect:
                #intersect the filtered bam and the global exon junctions file
                # print(global_intersect_bam)
                bmo.intersect_bed(bam_file,
                                  global_exon_junctions_file,
                                  output_file=global_intersect_bam,
                                  intersect_bam=True)

            #3: filter to relevant exon-exon junctions
            ##Intersect junctions and .bam, and write down the overlapping .bam alignments, without counting.
            #this uses intersect bed, with the intersect bam parameter
            intersect_bam = "{0}/{1}_exon_junction_bam_intersect.bam".format(
                proc_folder, bam_file_parts[1][:-4])

            #intersect the filtered bam and the ptc exon junctions file
            bmo.intersect_bed(global_intersect_bam,
                              PTC_exon_junctions_file,
                              output_file=intersect_bam,
                              intersect_bam=True)

            #count how many reads there are in the sample after filtering to relevant exon-exon junctions but before quality filtering
            read_count_junctions_no_filter = int(
                gen.run_process(["samtools", "view", "-c", intersect_bam]))
            #4. filter .bam alignments by quality
            #takes both lower and upper mapq thresholds
            #outputs a bam file with "_mapq_filter_{lower_threshold}_{upper_threshold}" appended
            #this needs doing twice and merging, because Geuvadis uses two mapq intervals
            #set the mapq filter parameters here
            mapq_intervals = [[251, 255], [175, 181]]
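            # the two windows presumably bracket the MAPQ values that the
            # Geuvadis aligners assign to uniquely mapping reads; reads with
            # a MAPQ outside both windows are discarded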
            mapq_filter_filelist = []

            for mapq_interval in mapq_intervals:
                lower_threshold, upper_threshold = mapq_interval
                mapq_filter_file = "{0}/{1}_mapq_filter_{2}_{3}.bam".format(
                    proc_folder, bam_file_parts[1][:-4], lower_threshold,
                    upper_threshold)
                mapq_filter_filelist.append(mapq_filter_file)
                ##run the mapq filter
                bmo.bam_quality_filter(
                    intersect_bam,
                    mapq_filter_file,
                    quality_greater_than_equal_to=lower_threshold,
                    quality_less_than_equal_to=upper_threshold)

            ##merge files in filelist
            bmo.merge_bams(mapq_filter_filelist, mapq_filtered_bam)

            ##filter by flags: get all mapped reads
            #Leaves: mapped unpaired and paired reads
            bmo.bam_flag_filter(mapq_filtered_bam,
                                mapq_flag_filtered_bam,
                                get_mapped_reads=True)

            ##filter bam by xt tag XT=U
            bmo.bam_xt_filter(mapq_flag_filtered_bam,
                              mapq_flag_xt_filtered_bam,
                              xt_filter="U")

            ##filter bam by nm tag NM<=6
            bmo.bam_nm_filter(mapq_flag_xt_filtered_bam,
                              mapq_flag_xt_nm_filtered_bam,
                              nm_less_equal_to=6)

            #5. scale down the initial count of reads in the sample by the proportion lost during quality filtering
            read_count_junctions_filter = int(
                gen.run_process(
                    ["samtools", "view", "-c", mapq_flag_xt_nm_filtered_bam]))
            prop_kept = np.divide(read_count_junctions_filter,
                                  read_count_junctions_no_filter)
            read_count = prop_kept * read_count
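            # e.g. if 800,000 of 1,000,000 junction reads survive the quality
            # filters, prop_kept = 0.8 and a starting read_count of 50,000,000
            # scales down to 40,000,000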

            #convert to sam format and phase reads
            intersect_sam = "{0}_phased.sam".format(
                mapq_flag_xt_nm_filtered_bam[:-4])
            if phase:
                temp_snp_file = "temp_data/snps{0}.txt".format(random.random())
                so.merge_and_header(PTC_file, syn_nonsyn_file, temp_snp_file)
                bmo.phase_bams(temp_snp_file, mapq_flag_xt_nm_filtered_bam,
                               sample_name, intersect_sam)
                gen.remove_file(temp_snp_file)
            else:
                gen.run_process(
                    ["samtools", "view", mapq_flag_xt_nm_filtered_bam],
                    file_for_output=intersect_sam)

            #6. count the number of reads supporting either the skipping or the inclusion of each exon
            junctions = bmo.read_exon_junctions(PTC_exon_junctions_file)
            bmo.count_junction_reads(intersect_sam, junctions, output_file,
                                     read_count)
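
For orientation, the per-junction counts written above typically feed a percent-spliced-in (PSI) calculation downstream (compare_PSI in example #21). A minimal sketch of the standard junction-read PSI, assuming the usual definition rather than the project's exact formula:

def psi(inclusion_reads, skipping_reads):
    # an included exon contributes reads at two junctions, a skipped exon at
    # one, so halve the inclusion count to put both classes on the same scale
    adjusted_inclusion = inclusion_reads / 2
    total = adjusted_inclusion + skipping_reads
    return adjusted_inclusion / total if total else None

print(psi(80, 20))  # 0.67: the exon is included in roughly two thirds of transcripts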
Example #21
def main():

    description = "Check whether PTCs are associated with greater rates of exon skipping."
    args = gen.parse_arguments(
        description, [
            "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file",
            "out_prefix", "bam_analysis_folder", "number_of_simulations",
            "simulation_output_folder", "motif_file", "filter_genome_data",
            "get_SNPs", "process_bams", "simulate_ptc_snps",
            "motif_complement", "overwrite_intersect", "use_old_sims",
            "out_of_frame", "simulate_ptcs_with_monomorphic",
            "generate_monomorphic_indices", "ignore_determine_snp_type",
            "ignore_psi_calculation", "ptc_location_analysis"
        ],
        flags=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
        ints=[7])
    gtf, genome_fasta, bams_folder, vcf_folder, panel_file, out_prefix, bam_analysis_folder, number_of_simulations, simulation_output_folder, motif_file, filter_genome_data, get_SNPs, process_bams, simulate_ptc_snps, motif_complement, overwrite_intersect, use_old_sims, out_of_frame, simulate_ptcs_with_monomorphic, generate_monomorphic_indices, ignore_determine_snp_type, ignore_psi_calculation, ptc_location_analysis = args.gtf, args.genome_fasta, args.bams_folder, args.vcf_folder, args.panel_file, args.out_prefix, args.bam_analysis_folder, args.number_of_simulations, args.simulation_output_folder, args.motif_file, args.filter_genome_data, args.get_SNPs, args.process_bams, args.simulate_ptc_snps, args.motif_complement, args.overwrite_intersect, args.use_old_sims, args.out_of_frame, args.simulate_ptcs_with_monomorphic, args.generate_monomorphic_indices, args.ignore_determine_snp_type, args.ignore_psi_calculation, args.ptc_location_analysis

    start = time.time()

    # create any necessary output directories
    directory_splits = out_prefix.split('/')
    directory_paths = "/".join(directory_splits[:-1])
    gen.create_output_directories(directory_paths)
    gen.create_directory('temp_data/')

    CDS_fasta = "{0}_CDS.fasta".format(out_prefix)
    CDS_bed = "{0}_CDS.bed".format(out_prefix)
    exon_bed = "{0}_exons.bed".format(out_prefix)
    filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix)
    exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix)
    coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix)

    if filter_genome_data:
        #extract and filter CDS coordinates and sequences
        print("Extracting and filtering CDSs...")
        bo.extract_cds(gtf,
                       CDS_bed,
                       CDS_fasta,
                       genome_fasta,
                       all_checks=True,
                       uniquify=True,
                       clean_chrom_only=True,
                       full_chr_name=True)
        gen.get_time(start)

        #group the CDS sequences into families based on sequence similarity
        print("Grouping sequences into families...")
        names = gen.read_fasta(CDS_fasta)[0]
        gen.find_families_ensembl(
            "../source_data/GRCh37_ensembl_protein_families.txt", names,
            "{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        print("Extracting and filtering exons...")
        #extract exon coordinates
        bo.extract_exons(gtf, exon_bed)
        #only leave exons from transcripts that passed quality control in the extract_cds step above.
        #also only leave a single gene per family
        bo.filter_bed_from_fasta(
            exon_bed,
            CDS_fasta,
            filtered_exon_bed,
            families_file="{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        #extract exon-exon junction coordinates
        print("Extracting exon-exon junctions...")
        bo.extract_exon_junctions(exon_bed,
                                  exon_junctions_file,
                                  window_of_interest=2)
        gen.get_time(start)

        #make another exons bed that only contains fully coding exons.
        #This is because in the final analysis, we should only consider fully protein-coding exons.
        #However, for getting the exon junctions we need the full exons file because fully protein-coding exons might
        #be flanked by exons that are not. This is why we couldn't do this filtering step earlier.
        print(
            "Filtering out overlapping, non-coding and partially coding, as well as terminal exons..."
        )
        bo.check_coding(filtered_exon_bed,
                        CDS_bed,
                        coding_exon_bed,
                        remove_overlapping=True)
        gen.get_time(start)

    SNP_file = "{0}_SNP_file.txt".format(out_prefix)
    if out_of_frame:
        out_prefix = out_prefix + "_out_of_frame"
    PTC_file = "{0}_ptc_file.txt".format(out_prefix)
    syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix)
    CDS_interval_file = "{0}_intervals{1}".format(
        os.path.splitext(CDS_fasta)[0],
        os.path.splitext(CDS_fasta)[1])
    #check which individuals were included in Geuvadis
    full_sample_names = os.listdir(bams_folder)
    full_sample_names = [
        i for i in full_sample_names if i[-4:] == ".bam" and "proc" not in i
    ]
    sample_names = [(i.split("."))[0] for i in full_sample_names]
    sample_names = [i for i in sample_names if len(i) > 0]
    print('{0} samples included in Geuvadis...'.format(len(sample_names)))
    #for some reason, 17 of the Geuvadis samples don't appear in the 1000 Genomes vcf
    #this needs investigating at some point; for now, simply filter them out

    with open("../source_data/samples_in_vcf.txt") as file:
        samples_in_vcf = file.readlines()
    samples_in_vcf = [i.rstrip("\n") for i in samples_in_vcf]
    sample_names = [i for i in sample_names if i in samples_in_vcf]
    print('{0} samples also in vcf...'.format(len(sample_names)))
    sample_file = "{0}_sample_file.txt".format(out_prefix)

    # create a fasta containing the sequences of all exons with SNPs
    coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix)
    bo.fasta_from_intervals(coding_exon_bed,
                            coding_exons_fasta,
                            genome_fasta,
                            names=True)

    if get_SNPs:
        #get SNPs for the sample
        intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix)
        print("Getting SNP data...")
        so.get_snps_in_cds(coding_exon_bed, CDS_bed, vcf_folder, panel_file,
                           sample_names, sample_file, intersect_file,
                           out_prefix)
        print("Calculating SNP positions...")
        so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file,
                             out_prefix)
        gen.get_time(start)

    if not ignore_determine_snp_type:
        print("Determining SNP type...")
        so.get_snp_change_status(SNP_file,
                                 CDS_fasta,
                                 PTC_file,
                                 syn_nonsyn_file,
                                 out_of_frame=out_of_frame,
                                 ref_check=True,
                                 headers=True)
        gen.get_time(start)

    #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step.
    print(
        "Filtering exon-exon junctions to only leave those that flank exons with a PTC variant..."
    )
    PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format(
        out_prefix)
    bo.filter_exon_junctions(exon_junctions_file, PTC_file,
                             PTC_exon_junctions_file)

    #make a list of all the .bam files and modify them to have the full path rather than just the file name
    bam_files = [
        "{0}/{1}".format(bams_folder, i) for i in full_sample_names
        if (i.split("."))[0] in sample_names
    ]

    #in parallel, do the processing on individual .bam files
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
    if bam_analysis_folder == "None":
        bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix)
    gen.create_directory(bam_analysis_folder)
    if process_bams:
        print("Processing RNA-seq data...")
        if out_of_frame:
            splits = exon_junctions_bam_output_folder.split('/')
            splits[-1] = splits[-1].replace('_out_of_frame', '')
            exon_junctions_bam_output_folder = "/".join(splits)
        gen.create_directory(exon_junctions_bam_output_folder)
        #we have to do it like this because you can't pass flags into run_in_parallel
        keyword_dict = {"overwrite_intersect": overwrite_intersect}
        processes = gen.run_in_parallel(bam_files, [
            "foo", exon_junctions_file, PTC_exon_junctions_file,
            bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix,
            exon_junctions_bam_output_folder, keyword_dict
        ],
                                        nao.process_bam_per_individual,
                                        workers=36)
        for process in processes:
            process.get()
        gen.get_time(start)

    #if required, filter PTCs to only leave ones that overlap motifs from a specified set
    motif_filtering = False
    if motif_file != "None":
        print(
            "Filtering SNPs based on whether or not they overlap a motif from the specified set..."
        )
        motif_suffix = ((motif_file.split("/"))[-1]).split(".")[0]
        if motif_complement:
            out_prefix = "{0}_{1}_complement".format(out_prefix, motif_suffix)
        else:
            out_prefix = "{0}_{1}".format(out_prefix, motif_suffix)
        filtered_ptc = "{0}_ptc_file.txt".format(out_prefix)
        so.filter_motif_SNPs(CDS_fasta,
                             PTC_file,
                             motif_file,
                             filtered_ptc,
                             complement=motif_complement)
        PTC_file = filtered_ptc

    final_file = "{0}__analysis_final_output.txt".format(out_prefix)
    if not ignore_psi_calculation:
        print("Calculating PSI...")
        bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file)

    #run the simulation that swaps ptcs for nonsynonymous snps
    if simulate_ptc_snps:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        nao.ptc_snp_simulation(out_prefix,
                               simulation_output_folder,
                               PTC_file,
                               syn_nonsyn_file,
                               exon_junctions_file,
                               bam_files,
                               number_of_simulations,
                               exon_junctions_bam_output_folder,
                               use_old_sims=use_old_sims)

    # run the simulation that picks monomorphic sites
    if simulate_ptcs_with_monomorphic:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception

        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta):
            print('Coding exon fasta is required...')
            raise Exception
        nao.ptc_monomorphic_simulation(
            out_prefix,
            simulation_output_folder,
            sample_file,
            genome_fasta,
            PTC_file,
            syn_nonsyn_file,
            coding_exon_bed,
            coding_exon_fasta,
            exon_junctions_file,
            bam_files,
            number_of_simulations,
            generate_indices=generate_monomorphic_indices,
            use_old_sims=use_old_sims)

    # get the locations of the ptcs
    if ptc_location_analysis:
        print("PTC locations analysis...")
        snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format(
            out_prefix)
        ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format(
            out_prefix)
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta) or not os.path.exists(
                snp_relative_exon_position_file) or not os.path.exists(
                    PTC_file):
            print("Please run --filter_genome_data and --get_SNPs first...")
            raise Exception
        # need to work out where and what the analysis outputs need to do
        so.ptc_locations(PTC_file, snp_relative_exon_position_file,
                         ptc_location_analysis_output_file)
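
gen.run_in_parallel is project-specific, but the fan-out used for the bam processing above (a "foo" placeholder in the argument list marks where each worker's chunk of files is substituted) roughly corresponds to this standard multiprocessing pattern; the chunking and return value are illustrative assumptions:

from multiprocessing import Pool

def run_in_parallel(items, args, worker, workers=4):
    # one interleaved chunk of items per worker; each chunk replaces the
    # "foo" placeholder before the worker is applied asynchronously
    chunks = [items[i::workers] for i in range(workers) if items[i::workers]]
    with Pool(len(chunks)) as pool:
        results = [
            pool.apply_async(worker, [chunk if a == "foo" else a for a in args])
            for chunk in chunks
        ]
        return [res.get() for res in results]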
Example #22
def main():

    arguments = [
        "output_directory", "genome_gtf", "genome_fasta", "ortholog_gtf",
        "ortholog_fasta", "input_file", "genome_fasta", "mapping_file",
        "codes_file", "ensembl_links", "extract_protein_coding",
        "extract_exons", "extract_introns", "extract_coding_exons",
        "extract_non_coding_exons", "extract_non_transcribed_regions",
        "extract_lincrna_seqs", "clean_run"
    ]

    description = ""
    args = gen.parse_arguments(description,
                               arguments,
                               opt_flags=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                               flags=[10, 11, 12, 13, 14, 15, 16, 17])
    output_directory, genome_gtf, genome_fasta, ortholog_gtf, ortholog_fasta, input_file, genome_fasta, mapping_file, codes_file, ensembl_links, extract_protein_coding, extract_exons, extract_introns, extract_coding_exons, extract_non_coding_exons, extract_non_transcribed_regions, extract_lincrna_seqs, clean_run = args.output_directory, args.genome_gtf, args.genome_fasta, args.ortholog_gtf, args.ortholog_fasta, args.input_file, args.genome_fasta, args.mapping_file, args.codes_file, args.ensembl_links, args.extract_protein_coding, args.extract_exons, args.extract_introns, args.extract_coding_exons, args.extract_non_coding_exons, args.extract_non_transcribed_regions, args.extract_lincrna_seqs, args.clean_run

    # set a start time
    start = time.time()

    # create the output_directory if it doesn't already exist
    gen.create_output_directories(output_directory)

    # get the sequences
    if extract_protein_coding:
        # genome_gtf/genome_fasta = genome 1, ortholog_gtf/ortholog_fasta = genome 2, ensembl_links = orthologs file
        cont.extract_clean_sequences(genome_gtf,
                                     genome_fasta,
                                     ortholog_gtf,
                                     ortholog_fasta,
                                     ensembl_links,
                                     output_directory,
                                     clean_run=clean_run)

    full_exon_file = "{0}/genome_sequences/human/human.exons.bed".format(
        output_directory)
    if extract_exons:
        cont.extract_exons(genome_gtf,
                           genome_fasta,
                           output_directory,
                           full_exon_file,
                           clean_run=clean_run)
        sequo.clean_feature_file(full_exon_file)

    exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_filtered_exons.bed".format(
        output_directory, "human")
    coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.bed".format(
        output_directory, "human")
    coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.fasta".format(
        output_directory, "human")
    if extract_coding_exons:
        sequo.get_coding_exon_coordinates(full_exon_file, exons_bed,
                                          coding_exons_bed)
        fo.fasta_from_intervals(coding_exons_bed,
                                coding_exons_fasta,
                                genome_fasta,
                                names=True)

    if extract_non_coding_exons:
        non_coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.bed".format(
            output_directory, "human")
        non_coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.fasta".format(
            output_directory, "human")
        sequo.get_non_coding_exon_coordinates(full_exon_file, exons_bed,
                                              non_coding_exons_bed)
        fo.fasta_from_intervals(non_coding_exons_bed,
                                non_coding_exons_fasta,
                                genome_fasta,
                                names=True)

    if extract_introns:
        intron_bed = "{0}/genome_sequences/human/human.clean_introns.bed".format(
            output_directory)
        intron_fasta = "{0}/genome_sequences/human/human.clean_introns.fasta".format(
            output_directory)
        sequo.get_intron_coordinates(coding_exons_bed, intron_bed)
        fo.fasta_from_intervals(intron_bed,
                                intron_fasta,
                                genome_fasta,
                                names=True)

    if extract_non_transcribed_regions:
        all_features_bed = "{0}/genome_sequences/human/human.all_features.bed".format(
            output_directory)
        non_transcribed_bed = "{0}/genome_sequences/human/human.non_transcribed.bed".format(
            output_directory)
        non_transcribed_fasta = "{0}/genome_sequences/human/human.non_transcribed.fasta".format(
            output_directory)
        seqo.get_non_transcribed_regions(genome_gtf, genome_fasta,
                                         all_features_bed, non_transcribed_bed,
                                         non_transcribed_fasta,
                                         output_directory)

    # extract sequences from source file
    if extract_lincrna_seqs:
        # set up the output fasta to contain the exon seqs
        lincrna_exons_bed = "{0}/lincRNA_exons.bed".format(output_directory)
        lincrna_exons_fasta = "{0}/lincRNA_exons.fasta".format(
            output_directory)
        lincrna_seqs_fasta = "{0}/lincRNA_seqs.fasta".format(output_directory)
        print("Extracting lincRNA seqs...")
        # hg38 and NONCODE are not parsed from the command line above,
        # so default them to False here
        hg38 = False
        NONCODE = False
        fo.extract_seqs(input_file,
                        genome_fasta,
                        lincrna_exons_bed,
                        lincrna_exons_fasta,
                        lincrna_seqs_fasta,
                        mapping_file,
                        codes_file,
                        exclude_XY=True,
                        hg38=hg38,
                        NONCODE=NONCODE)
        print("Use lincRNA_misc.py to do further filtering...")