예제 #1
0
def get_conservation(transcript_list,
                     output_file,
                     max_dS_threshold=None,
                     max_omega_threshold=None):
    """
    Run the conservation check for every transcript and concatenate the
    resulting per-run output files into a single output file.

    Args:
        transcript_list (dict): dict containing transcript id, the cds and the ortholog seqs
        output_file (str): path to output file
        max_dS_threshold (float): if set, pass in the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, pass in the omega threshold you wish alignments to be below
    """

    print("Getting the most conserved ortholog for each transcript...")

    temp_dir = "temp_conservation_files"
    gen.create_output_directories(temp_dir)
    # get a list of the transcript ids
    transcript_ids = list(transcript_list.keys())
    # run this linearly (parallel=False) because it doesnt like being parallelised
    outputs = gen.run_parallel_function(
        transcript_ids,
        [transcript_list, max_dS_threshold, max_omega_threshold, temp_dir],
        run_conservation_check,
        parallel=False)
    # remove the old output file if there is one
    gen.remove_file(output_file)
    if outputs:
        # now concat the output files
        # (presumably each item of outputs is a file path -- verify against
        # run_conservation_check)
        args = ["cat"]
        args.extend(outputs)
        gen.run_process(args, file_for_output=output_file)
    else:
        # a bare `cat` with no file arguments would read from stdin, so
        # just create an empty output file when there is nothing to concat
        with open(output_file, "w"):
            pass
    gen.remove_directory(temp_dir)
예제 #2
0
def convert_bed(input_bed, output_bed = None, to_hg38 = True):
    """
    Switch the chromosome naming of a bed file between the "chr"-prefixed
    style and the unprefixed style. Only the first column is modified;
    coordinates are written back unchanged.

    Args:
        input_bed (str): path to bed file
        output_bed (str): if set, path to output_file. If not set, the
            converted entries are written to a temp file that is then moved
            over input_bed, i.e. the input file is overwritten.
        to_hg38 (bool): if set, remove a leading "chr" from the chromosome
            name, else prepend "chr" to the chromosome name
    """

    # create temp file if no output file is given
    if not output_bed:
        file_to_write = "temp_files/{0}.bed".format(random.random())
    else:
        file_to_write = output_bed

    prefix = "chr"
    entries = gen.read_many_fields(input_bed, "\t")
    with open(file_to_write, "w") as outfile:
        for entry in entries:
            chrom = entry[0]
            if to_hg38:
                # remove the prefix only: str.strip("chr") would strip any
                # of the characters c/h/r from BOTH ends of the name
                if chrom.startswith(prefix):
                    chrom = chrom[len(prefix):]
            else:
                chrom = "chr{0}".format(chrom)
            entry[0] = chrom
            outfile.write("{0}\n".format("\t".join(entry)))

    # replace the input with the temp file if one was created
    if not output_bed:
        gen.run_process(["mv", file_to_write, input_bed])
        gen.remove_file(file_to_write)
예제 #3
0
def fasta_from_intervals(bed_file, fasta_file, genome_fasta, force_strand = True, names = False):
    """
    Extract the sequences for the intervals in a bed file into a fasta file
    by running `bedtools getfasta`, then rewrite that fasta file with all
    sequences in upper case.
    Credit: Rosina Savisaar

    Args:
        bed_file (str): the bed file path to create fasta from
        fasta_file (str): the output fasta file path
        genome_fasta (str): the file path to the genome fasta file
        force_strand (bool): if True, "-s" is passed to bedtools getfasta
        names (bool): if True, "-name" is passed to bedtools getfasta so the
            fasta names come from the 'name' field of the bed file; if False,
            the record names are generated from the sequence coordinates.
    """

    # an existing index that lacks some of the bed file's chromosomes is
    # deleted (presumably so that bedtools regenerates it -- TODO confirm)
    index_path = genome_fasta + '.fai'
    if os.path.exists(index_path):
        bed_chroms = {row[0] for row in gen.read_many_fields(bed_file, "\t")}
        indexed_chroms = {row[0] for row in gen.read_many_fields(index_path, "\t")}
        if not bed_chroms <= indexed_chroms:
            gen.remove_file(index_path)

    command = ["bedtools", "getfasta"]
    if force_strand:
        command.append("-s")
    command.extend(["-fi", genome_fasta, "-bed", bed_file, "-fo", fasta_file])
    if names:
        command.append("-name")
    gen.run_process(command)

    # read the result back and rewrite it with upper-case sequences
    record_names, sequences = gen.read_fasta(fasta_file)
    upper_sequences = [sequence.upper() for sequence in sequences]
    gen.write_to_fasta(record_names, upper_sequences, fasta_file)
예제 #4
0
def bam_xt_filter(input_bam, output, xt_filter=None):
    '''
    Filter a bam/sam file by XT tag.

    Header lines (starting with "@") and alignment lines containing the tag
    XT:A:<xt_filter> are kept. Note that the grep pattern requires a tab
    after the tag value, so a tag that is the last field of a line is not
    matched.

    input_bam: path of the file passed to `samtools view -h`.
    output: path of the output file. If it ends in ".bam", the filtered
    reads are first written to a .sam file of the same stem, converted to
    bam and the .sam file is removed.
    xt_filter: the XT value to keep. Required.

    Raises Exception if xt_filter is not given.
    '''
    if not xt_filter:
        raise Exception("Please specify XT filter.")
    #create output file
    if output.endswith(".bam"):
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output

    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])
    #header lines OR lines with the required XT value; the backslash is
    #escaped explicitly because "\|" is not a valid Python escape sequence
    grep_pattern = "^@" + "\\|\tXT:A:{0}\t".format(xt_filter)
    gen.run_process(["grep", grep_pattern],
                    input_to_pipe=sam_output,
                    file_for_output=output_file)

    #if wanting to create bam, create bam and delete sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)
예제 #5
0
def run_simulations(simulation_sets, required_simulations):
    '''
    Run the simulations.

    simulation_sets: iterable of sequences whose first three items are the
    motif file, the simulation output file and the stop count output file.
    required_simulations: passed to se.generate_motifs_sets.

    For each set, the stop count output file receives a header line, a
    line with the count for the real motifs and one line per simulated set.
    '''

    for motif_set in simulation_sets:

        motif_file = motif_set[0]
        simulation_output_file = motif_set[1]
        stops_count_output_file = motif_set[2]

        # clean up any previous simulations
        gen.remove_file(simulation_output_file)
        gen.remove_file(stops_count_output_file)

        motif_list = gen.read_many_fields(motif_file, ",")
        # get motifs, avoid header if there is one; blank lines and empty
        # first fields are skipped rather than raising an IndexError
        motifs = [
            i[0] for i in motif_list
            if i and i[0] and not i[0].startswith("#")
        ]

        # get the number of stop codons found in the real set
        real_count = se.get_stop_codon_count(motifs)

        # generate simulated motifs using motif set
        print('Simulating {0}...'.format(motif_file))
        se.generate_motifs_sets(motifs, required_simulations, output_file = simulation_output_file)
        simulated_motif_sets = gen.read_many_fields(simulation_output_file, "|")
        with open(stops_count_output_file, "w") as output:
            output.write('id,stop_count\n')
            output.write('real,{0}\n'.format(real_count))
            # simulation ids are numbered from 1
            for i, simulated_set in enumerate(simulated_motif_sets):
                stop_count = se.get_stop_codon_count(simulated_set)
                output.write('{0},{1}\n'.format(i+1, stop_count))
예제 #6
0
def retrieve_bams_core(all_files, local_directory, host, user, password,
                       ftp_directory, expect_string):
    '''
    Core function for retrieving bam files over FTP and transferring them
    on with an expect script.

    all_files: names of the bam files to process.
    local_directory: directory the files are downloaded to.
    host, user, password, ftp_directory: FTP connection details.
    expect_string: template of the expect script; every occurrence of
    "foo" is replaced by the current bam file name.

    For each file: download it unless it already exists locally, write the
    expect script to a temp file and run it, then delete both the expect
    script and the local bam file.
    '''
    #connect to FTP server
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)
    #loop over .bam files
    for pos, bam_file in enumerate(all_files):
        expect_file = "temp_data/expect_file{0}.txt".format(random.random())
        start_time = time.time()
        print("{0}/{1}".format(pos, len(all_files)))
        local_bam_file = "{0}/{1}".format(local_directory, bam_file)
        #retrieve current file
        if not os.path.isfile(local_bam_file):
            ftp = gen.ftp_retrieve(ftp,
                                   host,
                                   user,
                                   password,
                                   ftp_directory,
                                   bam_file,
                                   destination=local_directory)
        #transfer file to Watson
        current_expect_string = expect_string.replace("foo", bam_file)
        with open(expect_file, "w") as e_file:
            e_file.write(current_expect_string)
        gen.run_process(["expect", expect_file])
        print("Transferred to Watson.")
        gen.remove_file(expect_file)
        gen.remove_file(local_bam_file)
        #round to 3 decimal places: the 3 has to be an argument of round(),
        #not of format()
        elapsed_minutes = (time.time() - start_time) / 60
        print("Time spent: {0} minutes.\n".format(round(elapsed_minutes, 3)))
    ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)
    ftp.quit()
예제 #7
0
def bam_nm_filter(input_bam, output, nm_less_equal_to=None):
    '''
    Filters bam reads by NM value.

    Header lines (starting with "@") and alignment lines with an NM:i tag
    value from 0 up to and including the threshold are kept. Note that the
    grep pattern requires a tab after the tag value, so a tag that is the
    last field of a line is not matched.

    input_bam: path of the file passed to `samtools view -h`.
    output: path of the output file. If it ends in ".bam", the filtered
    reads are first written to a .sam file of the same stem, converted to
    bam and the .sam file is removed.
    nm_less_equal_to: the NM value you wish to filter by. Required; 0 is a
    valid value.

    Raises Exception if nm_less_equal_to is not given.
    '''
    #compare with None explicitly so that a threshold of 0 is accepted
    if nm_less_equal_to is None:
        raise Exception("Please provide NM filter value.")

    #create output file
    if output.endswith(".bam"):
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output
    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])
    #create grep args and include header fields if they exist
    grep_args = ["^@"]
    #for each nm less than equal to threshold, create grep arg; the
    #backslash is escaped explicitly because "\|" is not a valid Python
    #escape sequence
    grep_args.extend(
        "\\|\tNM:i:{0}\t".format(i) for i in range(nm_less_equal_to + 1))
    grep_pattern = "".join(grep_args)
    gen.run_process(["grep", grep_pattern],
                    input_to_pipe=sam_output,
                    file_for_output=output_file)

    #if wanting to create bam, create bam and delete sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)
예제 #8
0
def calc_ds(aligned_sequences):
    """
    Calculate dS for a pair of aligned sequences by running codeml.

    Args:
        aligned_sequences: the aligned sequences; the first two items are
            used. Each item is passed through "".join, so it may be a
            string or a list of characters.

    Returns:
        the "dS" entry of the first "NSsites" parameters block in the
        codeml output
    """
    aligned_sequences_iupac = [
        Seq("".join(i), IUPAC.unambiguous_dna) for i in aligned_sequences
    ]
    alignment = MultipleSeqAlignment([
        SeqRecord(aligned_sequences_iupac[0], id="seq"),
        SeqRecord(aligned_sequences_iupac[1], id="orth_seq")
    ])
    gen.create_output_directories("temp_files")
    random_instance = random.random()
    temp_phylip_file = "temp_files/{0}.phy".format(random_instance)
    temp_output_file = "temp_files/{0}.out".format(random_instance)
    fo.write_to_phylip(alignment, temp_phylip_file)
    paml = None
    try:
        # run paml on sequences
        working_dir = "temp_dir.{0}".format(random.random())
        paml = sequo.PAML_Functions(input_file=temp_phylip_file,
                                    output_file=temp_output_file,
                                    working_dir=working_dir)
        # run codeml
        codeml_output = paml.run_codeml()
        ds = codeml_output["NSsites"][0]["parameters"]["dS"]
    finally:
        # clean up files, also when codeml fails or its output is missing
        # the expected keys
        gen.remove_file(temp_phylip_file)
        gen.remove_file(temp_output_file)
        if paml is not None:
            paml.cleanup()

    return ds
예제 #9
0
 def test_sort_bed(self):
     # Run sort_bed on the unsorted fixture and compare the file it writes
     # with the stored expected file.
     unsorted_path = "test_data/bam_ops/test_sort_bed/test_intersect_bed_A_file_unsorted.bed"
     expected_path = "test_data/bam_ops/test_sort_bed/expected_test_intersect_bed_A_file.bed"
     observed_path = "test_data/bam_ops/test_sort_bed/observed_test_sort_bed.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     sort_bed(unsorted_path, observed_path)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #10
0
 def test_remove_overlaps2(self):
     # Run remove_overlaps on the input bed file and compare the file it
     # writes with the stored expected file.
     input_path = "test_data/bed_ops/test_remove_overlaps2/in.bed"
     expected_path = "test_data/bed_ops/test_remove_overlaps2/expected.bed"
     observed_path = "test_data/bed_ops/test_remove_overlaps2/observed.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     remove_overlaps(input_path, observed_path)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #11
0
 def test_compare_PSI_haplotypes(self):
     # Run compare_PSI_haplotypes on the SNP file and bam folder and compare
     # the file it writes with the stored expected rows.
     snp_path = "test_data/bam_ops/test_compare_PSI_haplotypes/SNPs.bed"
     bam_dir = "test_data/bam_ops/test_compare_PSI_haplotypes/bam_folder"
     expected_rows = gen.read_many_fields("test_data/bam_ops/test_compare_PSI_haplotypes/expected.txt", "\t")
     observed_path = "test_data/bam_ops/test_compare_PSI_haplotypes/observed.txt"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     compare_PSI_haplotypes(snp_path, bam_dir, observed_path, 3)
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(expected_rows, observed_rows)
예제 #12
0
 def test_filter_by_snp_type(self):
     # Run filter_by_snp_type with the type "non" and compare the file it
     # writes with the stored expected file.
     snp_path = "test_data/snp_ops/test_filter_by_snp_type/input_snps.bed"
     expected_path = "test_data/snp_ops/test_filter_by_snp_type/expected_snps.bed"
     observed_path = "test_data/snp_ops/test_filter_by_snp_type/observed_snps.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     filter_by_snp_type(snp_path, observed_path, "non")
     expected_rows = gen.read_many_fields(expected_path, "\t")
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(observed_rows, expected_rows)
예제 #13
0
 def test_get_snp_relative_cds_position_plus_strand_split(self):
     # Run get_snp_relative_cds_position on the already-parsed relative exon
     # positions and compare the file it writes with the stored expected rows.
     relative_exon_positions = gen.read_many_fields("test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/test_snp_relative_exon_position.bed", "\t")
     full_bed_path = "test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/full_bed.bed"
     expected_rows = gen.read_many_fields("test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/expected_test_snp_relative_cds_position.bed", "\t")
     observed_path = "test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/observed_test_snp_relative_cds_position.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     get_snp_relative_cds_position(relative_exon_positions, observed_path, full_bed_path)
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(observed_rows, expected_rows)
예제 #14
0
 def test_group_flags(self):
     # Run group_flags with a flag start of 3 and compare the file it writes
     # with the stored expected file.
     bed_path = "test_data/bam_ops/test_group_flags/test_tabix.bed"
     observed_path = "test_data/bam_ops/test_group_flags/observed_test_group_flags.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     group_flags(bed_path, observed_path, 3)
     expected_rows = gen.read_many_fields("test_data/bam_ops/test_group_flags/expected_test_group_flags.bed", "\t")
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(expected_rows, observed_rows)
예제 #15
0
 def test_tabix(self):
     # Run tabix for the bed file against the vcf and compare the rows it
     # writes with the stored expected rows, ignoring row order.
     bed_path = "test_data/snp_ops/test_tabix/test_tabix_bed.txt"
     expected_rows = gen.read_many_fields("test_data/snp_ops/test_tabix/expected_test_tabix.txt", "\t")
     observed_path = "test_data/snp_ops/observed_test_tabix.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     vcf_path = "../source_data/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.gz"
     tabix(bed_path, observed_path, vcf_path)
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(sorted(observed_rows), sorted(expected_rows))
예제 #16
0
 def test_intersect_bed_overlap(self):
     # Run intersect_bed with overlap = 0.5 and compare the file it writes
     # with the stored expected file.
     bed_a = "test_data/bam_ops/test_intersect_bed_overlap/test_intersect_bed_A_file.bed"
     bed_b = "test_data/bam_ops/test_intersect_bed_overlap/test_intersect_bed_B_file.bed"
     expected_path = "test_data/bam_ops/test_intersect_bed_overlap/expected_test_intersect_bed_overlap.bed"
     observed_path = "test_data/bam_ops/test_intersect_bed_overlap/observed_test_intersect_bed_overlap.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     intersect_bed(bed_a, bed_b, output_file=observed_path, no_dups=False, overlap=0.5)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #17
0
 def test_intersect_bed_force_strand_hit_count(self):
     # Run intersect_bed with force_strand and hit_count switched on and
     # compare the file it writes with the stored expected file.
     bed_a = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/test_intersect_bed_A_file.bed"
     bed_b = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/test_intersect_bed_B_file.bed"
     expected_path = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/expected_test_intersect_bed_force_strand_hit_count.bed"
     observed_path = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/observed_test_intersect_bed_force_strand_hit_count.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     intersect_bed(bed_a, bed_b, output_file=observed_path, no_dups=False, force_strand=True, hit_count=True)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #18
0
 def test_intersect_bed_intersect_bedops(self):
     # Run intersect_bed with use_bedops and intersect switched on and
     # compare the file it writes with the stored expected file.
     bed_a = "test_data/bam_ops/test_intersect_bed_intersect_bedops/test_intersect_bed_A_file.bed"
     bed_b = "test_data/bam_ops/test_intersect_bed_intersect_bedops/test_intersect_bed_B_file.bed"
     expected_path = "test_data/bam_ops/test_intersect_bed_intersect_bedops/expected_test_intersect_bed_intersect_bedops.bed"
     observed_path = "test_data/bam_ops/test_intersect_bed_intersect_bedops/observed_test_intersect_bed_intersect_bedops.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     intersect_bed(bed_a, bed_b, output_file=observed_path, no_dups=False, use_bedops=True, intersect=True)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #19
0
 def test_filter_exon_junctions(self):
     # Run filter_exon_junctions on the junctions and exons files and compare
     # the file it writes with the stored expected file.
     junctions_path = "test_data/bed_ops/test_filter_exon_junctions/exon_junctions.bed"
     exons_path = "test_data/bed_ops/test_filter_exon_junctions/exons.bed"
     expected_path = "test_data/bed_ops/test_filter_exon_junctions/expected_filter_exon_junctions.bed"
     observed_path = "test_data/bed_ops/test_filter_exon_junctions/observed_filter_exon_junctions.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     filter_exon_junctions(junctions_path, exons_path, observed_path)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #20
0
 def test_extract_exon_junctions_window(self):
     # Run extract_exon_junctions with a third argument of 30 and compare the
     # file it writes with the stored expected file.
     exons_path = "test_data/bed_ops/test_extract_exon_junctions_window/test_extract_exon_junctions.bed"
     observed_path = "test_data/bed_ops/test_extract_exon_junctions_window/observed_test_extract_exon_window_junctions.bed"
     expected_path = "test_data/bed_ops/test_extract_exon_junctions_window/expected_test_extract_exon_window_junctions.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     extract_exon_junctions(exons_path, observed_path, 30)
     expected_rows = gen.read_many_fields(expected_path, "\t")
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(expected_rows, observed_rows)
예제 #21
0
 def test_extract_features_cdss_stops(self):
     # Run extract_features for the 'CDS' and 'stop_codon' features and
     # compare the file it writes with the stored expected file.
     gtf_path = "test_data/bed_ops/test_extract_features_cdss_stops/test_extract_features.gtf"
     observed_path = "test_data/bed_ops/test_extract_features_cdss_stops/observed_test_extract_features_cdss_stops.bed"
     expected_path = "test_data/bed_ops/test_extract_features_cdss_stops/expected_test_extract_features_cdss_stops.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     wanted_features = ['CDS', 'stop_codon']
     extract_features(gtf_path, observed_path, wanted_features)
     expected_rows = gen.read_many_fields(expected_path, "\t")
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(observed_rows, expected_rows)
예제 #22
0
 def test_filter_bed_from_fasta(self):
     # Run filter_bed_from_fasta on the bed and fasta fixtures and compare
     # the file it writes with the stored expected file.
     bed_path = "test_data/bed_ops/test_filter_bed_from_fasta/test_filter_bed_from_fasta.bed"
     fasta_path = "test_data/bed_ops/test_filter_bed_from_fasta/test_filter_bed_from_fasta.fasta"
     expected_path = "test_data/bed_ops/test_filter_bed_from_fasta/expected_test_filter_bed_from_fasta.bed"
     observed_path = "test_data/bed_ops/test_filter_bed_from_fasta/observed_test_filter_bed_from_fasta.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     filter_bed_from_fasta(bed_path, fasta_path, observed_path)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #23
0
 def test_get_descriptions(self):
     # Run get_descriptions for a list of transcript names and compare the
     # file it writes with the stored expected file.
     gtf_path = "test_data/bed_ops/test_get_descriptions/descriptions.gtf"
     transcript_names = ["ENST100", "ENST7", "ENST0003", "ENST5"]
     expected_path = "test_data/bed_ops/test_get_descriptions/expected_get_descriptions.txt"
     observed_path = "test_data/bed_ops/test_get_descriptions/observed_get_descriptions.txt"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     get_descriptions(transcript_names, gtf_path, observed_path)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #24
0
 def test_check_coding(self):
     # Run check_coding on the exon and CDS files and compare the file it
     # writes with the stored expected file.
     exons_path = "test_data/bed_ops/test_check_coding/exons.bed"
     cds_path = "test_data/bed_ops/test_check_coding/CDSs.bed"
     expected_path = "test_data/bed_ops/test_check_coding/expected_check_coding.bed"
     observed_path = "test_data/bed_ops/test_check_coding/observed_check_coding.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     check_coding(exons_path, cds_path, observed_path)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #25
0
 def test_merge_and_header(self):
     # Run merge_and_header on the two input files and compare the file it
     # writes with the stored expected file.
     first_path = "test_data/snp_ops/test_merge_and_header/file1.txt"
     second_path = "test_data/snp_ops/test_merge_and_header/file2.txt"
     expected_path = "test_data/snp_ops/test_merge_and_header/expected.txt"
     observed_path = "test_data/snp_ops/test_merge_and_header/observed.txt"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     merge_and_header(first_path, second_path, observed_path)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #26
0
 def test_extract_exons(self):
     # Run extract_exons on the gtf fixture and compare the file it writes
     # with the stored expected file.
     gtf_path = "test_data/bed_ops/test_extract_exons/test_extract_exons.gtf"
     observed_path = "test_data/bed_ops/test_extract_exons/observed_test_extract_exons.bed"
     expected_path = "test_data/bed_ops/test_extract_exons/expected_test_extract_exons.bed"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     extract_exons(gtf_path, observed_path)
     expected_rows = gen.read_many_fields(expected_path, "\t")
     observed_rows = gen.read_many_fields(observed_path, "\t")
     self.assertEqual(expected_rows, observed_rows)
예제 #27
0
 def test_filter_fasta_intervals_from_fasta(self):
     # Run filter_fasta_intervals_from_fasta on the intervals and fasta
     # fixtures and compare the fasta file it writes with the expected one.
     fasta_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_fasta.fasta"
     intervals_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_intervals.fasta"
     expected_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/expected_filtered_intervals.fasta"
     observed_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/observed_filtered_intervals.fasta"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     filter_fasta_intervals_from_fasta(intervals_path, fasta_path, observed_path)
     self.assertEqual(gen.read_fasta(expected_path),
                      gen.read_fasta(observed_path))
예제 #28
0
 def test_ptc_locations(self):
     # Run ptc_locations on the PTC, SNP and bam analysis files and compare
     # the file it writes with the stored expected file.
     ptc_path = "test_data/snp_ops/test_ptc_locations/test_PTC_file.txt"
     snp_path = "test_data/snp_ops/test_ptc_locations/test_SNP_relative_exon_position.bed"
     bam_analysis_path = "test_data/snp_ops/test_ptc_locations/bam_analysis_output.txt"
     observed_path = "test_data/snp_ops/test_ptc_locations/observed_ptc_location.txt"
     expected_path = "test_data/snp_ops/test_ptc_locations/expected_ptc_location.txt"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     ptc_locations(ptc_path, snp_path, bam_analysis_path, observed_path)
     self.assertEqual(gen.read_many_fields(observed_path, "\t"),
                      gen.read_many_fields(expected_path, "\t"))
예제 #29
0
 def test_filter_motif_SNPs_complement(self):
     # Run filter_motif_SNPs with complement switched on and compare the
     # file it writes with the stored expected file.
     motif_path = "test_data/snp_ops/test_filter_motif_snps_complement/ESEs.txt"
     fasta_path = "test_data/snp_ops/test_filter_motif_snps_complement/CDS.fasta"
     snp_path = "test_data/snp_ops/test_filter_motif_snps_complement/snps.bed"
     expected_path = "test_data/snp_ops/test_filter_motif_snps_complement/expected.txt"
     observed_path = "test_data/snp_ops/test_filter_motif_snps_complement/observed.txt"
     # clear any output left over from an earlier run
     gen.remove_file(observed_path)
     filter_motif_SNPs(fasta_path, snp_path, motif_path, observed_path, complement=True)
     self.assertEqual(gen.read_many_fields(expected_path, "\t"),
                      gen.read_many_fields(observed_path, "\t"))
예제 #30
0
def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file by running the sort-bed command line tool.

    input_file_name: path of the bed file to sort.
    output_file_name: path the sorted file is moved to. It may be the same
    path as input_file_name, in which case the unsorted file is overwritten.
    '''
    #The sorted output goes to a scratch file first, so that the input file
    #is not overwritten while it is still being read.
    scratch_path = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    sort_command = ["sort-bed", input_file_name]
    gen.run_process(sort_command, file_for_output=scratch_path)
    move_command = ["mv", scratch_path, output_file_name]
    gen.run_process(move_command)
    gen.remove_file(scratch_path)