def test_get_ptcs_in_window(self):
    """get_ptcs_in_window should return PTCs within 4-69 nt of an exon end,
    keyed first by which end (5 or 3) and then by relative position."""
    # Load the PTC list and the parallel relative-exon-position list;
    # entry i of one corresponds to entry i of the other.
    ptc_file = "test_data/disease_snps_ops/test_get_ptcs_in_window/ptc_list.txt"
    relative_positions_file = "test_data/disease_snps_ops/test_get_ptcs_in_window/relative_positions.txt"
    ptcs = gen.read_many_fields(ptc_file, "\t")
    relative_positions = gen.read_many_fields(relative_positions_file, "\t")
    ptc_list = {}
    relative_positions_list = {}
    for i, ptc in enumerate(ptcs):
        # coerce the numeric coordinate columns to int
        ptc[1], ptc[2], ptc[3] = int(ptc[1]), int(ptc[2]), int(ptc[3])
        ptc_list[i] = ptc
        rel_pos = relative_positions[i]
        rel_pos[1] = int(rel_pos[1])
        relative_positions_list[i] = rel_pos
    expected_file = "test_data/disease_snps_ops/test_get_ptcs_in_window/expected.txt"
    expected_list = gen.read_many_fields(expected_file, "\t")
    expected = {}
    # The expected file carries the exon end in column 9 ('.' = not in a
    # window) and the relative position in column 10; build the nested
    # {end: {relative_position: entry}} structure the function returns.
    ends = [5,3]
    for i in ends:
        expected[i] = {}
    for entry in expected_list:
        if entry[9] != '.' and int(entry[9]) in ends:
            required_entry = entry[:9]
            required_entry[1], required_entry[2], required_entry[3], required_entry[8] = int(required_entry[1]), int(required_entry[2]), int(required_entry[3]), int(required_entry[8])
            expected[int(entry[9])][int(entry[10])] = required_entry
    observed = get_ptcs_in_window(ptc_list, relative_positions_list, 4, 69)
    self.assertEqual(observed, expected)
def _write_ptc_relative_positions(positions_file, ptcs_file, output_file, skip_header=False):
    """
    Annotate each PTC with its relative exon position and write it out.

    Args:
        positions_file (str): file mapping SNP position (col 7) to relative
            exon position (col 11)
        ptcs_file (str): file of PTCs to annotate (col 7 = position)
        output_file (str): path to write the annotated PTCs to
        skip_header (bool): if True, skip the first line of positions_file
    """
    entries = gen.read_many_fields(positions_file, "\t")
    if skip_header:
        entries = entries[1:]
    # Plain dict (not the previous nested defaultdict, which was only ever
    # assigned scalars): a PTC position missing from the positions file now
    # raises KeyError instead of silently writing a stringified defaultdict.
    rel_positions = {}
    for entry in entries:
        rel_positions[int(entry[7])] = int(entry[11])
    ptcs = gen.read_many_fields(ptcs_file, "\t")
    with open(output_file, "w") as outfile:
        for ptc in ptcs:
            ptc[11] = rel_positions[int(ptc[7])]
            outfile.write("{0}\n".format("\t".join(gen.stringify(ptc))))


def get_unique_rel_pos(unique_ptcs, disease_snps_relative_exon_positions, kgenomes_ptcs_file, kgenomes_ptcs_exon_positions, unique_ptcs_rel_pos_file, kgenomes_ptcs_rel_pos_file):
    '''
    Get the relative positions of the unique ptcs.

    Args:
        unique_ptcs (str): file of unique disease PTCs
        disease_snps_relative_exon_positions (str): relative exon positions for the disease SNPs
        kgenomes_ptcs_file (str): file of 1000 genomes PTCs
        kgenomes_ptcs_exon_positions (str): relative exon positions for the 1000 genomes PTCs (has a header line)
        unique_ptcs_rel_pos_file (str): output path for annotated disease PTCs
        kgenomes_ptcs_rel_pos_file (str): output path for annotated 1000 genomes PTCs
    '''
    # same annotation step for both data sets; only the 1000 genomes
    # positions file carries a header line
    _write_ptc_relative_positions(disease_snps_relative_exon_positions, unique_ptcs, unique_ptcs_rel_pos_file)
    _write_ptc_relative_positions(kgenomes_ptcs_exon_positions, kgenomes_ptcs_file, kgenomes_ptcs_rel_pos_file, skip_header=True)
def motif_codon_density(motif_file, output_directory):
    """
    Calculate stop codon density for the motifs in motif_file against each
    GC-matched motif combination (plus the real stop codons) and write a CSV.

    Args:
        motif_file (str): path to the file containing the motif set
        output_directory (str): directory to write results to
    """
    stops = ["TAA", "TAG", "TGA"]
    gc_matchd_motifs_file = "{0}/gc_matched_combinations.bed".format(output_directory)
    # only generate the GC-matched combinations once; reuse on later runs
    if not os.path.isfile(gc_matchd_motifs_file):
        seqo.get_gc_matched_motifs(stops, gc_matchd_motifs_file)
    temp_dir = "temp_motif_density"
    gen.create_output_directories(temp_dir)
    motif_sets = gen.read_many_fields(gc_matchd_motifs_file, "\t")
    # include the real stop codons as the final set
    motif_sets.append(["TAA", "TAG", "TGA"])
    args = [motif_file, temp_dir]
    outputs = simoc.run_simulation_function(motif_sets, args, ops.calc_codon_density_in_motifs, sim_run=False)
    new_output_dir = "{0}/motif_densities".format(output_directory)
    gen.create_output_directories(new_output_dir)
    output_file = "{0}/{1}.csv".format(new_output_dir, motif_file.split("/")[-1].split(".")[0])
    with open(output_file, "w") as outfile:
        outfile.write("id,motifs,density\n")
        # 'output_path' instead of 'file' to avoid shadowing the builtin
        for i, output_path in enumerate(sorted(outputs)):
            data = gen.read_many_fields(output_path, ",")[0]
            outfile.write("{0},{1},{2}\n".format(i + 1, data[0], data[1]))
    gen.remove_directory(temp_dir)
def get_passed_NONCODE_codes(input_fasta, codes_file, mapping_file, output_fasta, code):
    """
    Only keep sequences that have particular NONCODE code

    Args:
        input_fasta (str): path to input fasta file
        codes_file (str): path to file containing code
        mapping_file (str): path to transcript-gene mapping file
        output_fasta (str): path to output fasta
        code (str): code to look for. As string because cant pass 0001 through
    """
    # loop variables renamed so they no longer shadow the 'code' parameter
    codes = {entry[0]: entry[1] for entry in gen.read_many_fields(codes_file, "\t")}
    # strip the version suffix from transcript ids to match the fasta names
    mappings = {entry[0].split(".")[0]: entry[1] for entry in gen.read_many_fields(mapping_file, "\t")}
    names, seqs = gen.read_fasta(input_fasta)
    with open(output_fasta, "w") as outfile:
        for i, name in enumerate(names):
            gene = mappings[name]
            # keep only sequences whose gene carries the requested code
            if codes[gene] == code:
                outfile.write(">{0}\n{1}\n".format(name, seqs[i]))
def run_simulations(simulation_sets, required_simulations):
    '''
    Run the simulations: for each motif set, count stop codons in the real
    motifs, generate simulated motif sets, and write the stop counts to csv.

    Args:
        simulation_sets (list): entries of [motif_file, simulation_output_file,
            stops_count_output_file]
        required_simulations (int): number of simulated motif sets to generate
    '''
    for motif_set in simulation_sets:
        motif_file = motif_set[0]
        simulation_output_file = motif_set[1]
        stops_count_output_file = motif_set[2]
        # clean up any previous simulations
        gen.remove_file(simulation_output_file)
        gen.remove_file(stops_count_output_file)
        motif_list = gen.read_many_fields(motif_file, ",")
        # get motifs, skipping a header if there is one; the emptiness guard
        # means a blank line no longer raises IndexError (i[0][0] did)
        motifs = [i[0] for i in motif_list if i[0] and not i[0].startswith("#")]
        # get the number of stop codons found in the real set
        real_count = se.get_stop_codon_count(motifs)
        # generate simulated motifs using motif set
        print('Simulating {0}...'.format(motif_file))
        se.generate_motifs_sets(motifs, required_simulations, output_file = simulation_output_file)
        simulated_motif_sets = gen.read_many_fields(simulation_output_file, "|")
        with open(stops_count_output_file, "w") as output:
            output.write('id,stop_count\n')
            output.write('real,{0}\n'.format(real_count))
            for i, simulated_set in enumerate(simulated_motif_sets):
                stop_count = se.get_stop_codon_count(simulated_set)
                output.write('{0},{1}\n'.format(i+1, stop_count))
def check_exon_files(input_bed1, input_bed2):
    """
    Do a sanity check to make sure there are no coding exons in the non coding
    exons file and vice versa.

    Args:
        input_bed1 (str): path to the first bed file
        input_bed2 (str): path to the second bed file

    Returns:
        bool: True if the two files share no transcript ids

    Raises:
        Exception: if any transcript id appears in both files
    """
    bed_lines1 = gen.read_many_fields(input_bed1, "\t")
    bed_lines2 = gen.read_many_fields(input_bed2, "\t")
    # transcript ids live in the bed 'name' field (column 4)
    transcripts1 = [line[3] for line in bed_lines1]
    transcripts2 = [line[3] for line in bed_lines2]
    # any transcript present in both files indicates a bad split
    overlap = set(transcripts1) & set(transcripts2)
    if overlap:
        # raise with the message attached rather than print + bare Exception,
        # so callers and logs see why the check failed
        raise Exception("Something's gone wrong. Coding exons and non coding exons are present in both files...")
    return True
def fasta_from_intervals(bed_file, fasta_file, genome_fasta, force_strand = True, names = False):
    """
    Takes a bed file and creates a fasta file with the corresponding sequences.
    Credit: Rosina Savisaar

    Args:
        bed_file (str): the bed file path to create fasta from
        fasta_file (str): the output fasta file path
        genome_fasta (str): the file path to the genome fasta file
        force_strand (bool): if True, pass -s to bedtools so sequences are
            extracted strand-aware
        names (bool): if False, the fasta record names will be generated from
            the sequence coordinates; if True, the fasta name will correspond
            to whatever is in the 'name' field of the bed file
    """
    # if the index file exists, check whether the expected features are
    # present; if not, remove it so bedtools rebuilds it
    genome_fasta_index = genome_fasta + '.fai'
    if os.path.exists(genome_fasta_index):
        bed_chrs = sorted(set(entry[0] for entry in gen.read_many_fields(bed_file, "\t")))
        index_chrs = sorted(set(entry[0] for entry in gen.read_many_fields(genome_fasta_index, "\t")))
        if not set(bed_chrs).issubset(set(index_chrs)):
            gen.remove_file(genome_fasta_index)

    # build the argument list conditionally instead of deleting "-s" by index,
    # which silently breaks if the argument order ever changes
    bedtools_args = ["bedtools", "getfasta", "-fi", genome_fasta, "-bed", bed_file, "-fo", fasta_file]
    if force_strand:
        bedtools_args.insert(2, "-s")
    if names:
        bedtools_args.append("-name")
    gen.run_process(bedtools_args)

    # uppercase the sequences and rewrite the fasta in place
    # ('record_names' avoids reassigning the 'names' parameter)
    record_names, seqs = gen.read_fasta(fasta_file)
    seqs = [i.upper() for i in seqs]
    gen.write_to_fasta(record_names, seqs, fasta_file)
def test_clean_alleles(self):
    """clean_alleles should produce output matching the stored vcf."""
    out_file = "test_data/snp_ops/test_clean_alleles/observed.vcf"
    clean_alleles("test_data/snp_ops/test_clean_alleles/input.vcf", out_file)
    # compare the generated file against the stored expectation
    observed = gen.read_many_fields(out_file, "\t")
    expected = gen.read_many_fields("test_data/snp_ops/test_clean_alleles/expected.vcf", "\t")
    self.assertEqual(observed, expected)
def extract_second_seqs(input_bed, input_file, genome_fasta, output_dir):
    """
    Extract the second set of sequences: filter the bed file to lincRNA
    entries, extract exon sequences, assemble multi-exon transcripts, and
    blast them to group paralogous families.

    Args:
        input_bed (str): path to the input bed file
        input_file (str): path to the file used to pick out lincRNA ids
        genome_fasta (str): path to the genome fasta file
        output_dir (str): directory to write output files to
    """
    # get a set of ids that correspond only to lincrna entries
    id_file = "{0}/lncrna_ids.txt".format(output_dir)
    extract_lncrna_only(input_file, id_file)
    # now keep only the bed entries that are in the id list
    # (flatten the rows returned by read_many_fields — a raw row list would
    # never match the string in entry[3])
    filtered_bed = "{0}.filtered".format(input_bed)
    ids = set(row[0] for row in gen.read_many_fields(id_file, "\t"))
    bed_entries = gen.read_many_fields(input_bed, "\t")
    with open(filtered_bed, "w") as outfile:
        for entry in bed_entries:
            if entry[3] in ids:
                outfile.write("{0}\n".format("\t".join(entry)))
    # now write the bed to an exon bed
    exons_bed = "{0}.exons.bed".format(input_bed)
    fo.entries_to_bed(filtered_bed, exons_bed, hg38=True)
    # now get the exon sequences
    exons_fasta = "{0}.exons.fasta".format(input_bed)
    fo.fasta_from_intervals(exons_bed, exons_fasta, genome_fasta, force_strand=True, names=True)
    # now generate the full transcript for multi exon transcripts
    transcripts_fasta = "{0}.multi_exon_transcripts.fasta".format(input_bed)
    names, seqs = gen.read_fasta(exons_fasta)
    seq_list = collections.defaultdict(lambda: collections.defaultdict())
    for i, name in enumerate(names):
        # fasta names look like '<transcript_id>.<exon_number>(<strand>)';
        # 'seq_id' avoids shadowing the builtin 'id'
        seq_id = ".".join(name.split("(")[0].split(".")[:-1])
        exon = int(name.split("(")[0].split(".")[-1])
        seq_list[seq_id][exon] = seqs[i]
    with open(transcripts_fasta, "w") as outfile:
        for seq_id in sorted(seq_list):
            if len(seq_list[seq_id]) > 1:
                exon_list = []
                for exon in sorted(seq_list[seq_id]):
                    exon_list.append(seq_list[seq_id][exon])
                seq = "".join(exon_list)
                if "N" not in seq and len(seq) >= 200:
                    # convert names to : here as otherwise it will run sorting later
                    out_id = ":".join(seq_id.split("."))
                    outfile.write(">{0}\n{1}\n".format(out_id, seq))
    # blast to get paralogous families
    # fixed: these paths previously referenced the undefined name
    # 'output_directory' (the parameter is 'output_dir'), and the families
    # path was missing the closing brace in its '{0}' placeholder
    blast_db_path = "{0}/bast_db".format(output_dir)
    output_blast_file = "{0}/blast_output.csv".format(output_dir)
    families_file = "{0}/families.txt".format(output_dir)
    gen.create_output_directories(blast_db_path)
    cons.filter_families(transcripts_fasta, output_blast_file, families_file, database_path=blast_db_path, clean_run=True)
def test_sort_bed(self):
    """sort_bed should order the unsorted bed file as expected."""
    out_file = "test_data/bam_ops/test_sort_bed/observed_test_sort_bed.bed"
    # start from a clean slate so stale output can't mask a failure
    gen.remove_file(out_file)
    sort_bed("test_data/bam_ops/test_sort_bed/test_intersect_bed_A_file_unsorted.bed", out_file)
    expected = gen.read_many_fields("test_data/bam_ops/test_sort_bed/expected_test_intersect_bed_A_file.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_compare_PSI_haplotypes(self):
    """compare_PSI_haplotypes should reproduce the stored PSI comparison."""
    out_file = "test_data/bam_ops/test_compare_PSI_haplotypes/observed.txt"
    gen.remove_file(out_file)  # drop stale output from previous runs
    compare_PSI_haplotypes(
        "test_data/bam_ops/test_compare_PSI_haplotypes/SNPs.bed",
        "test_data/bam_ops/test_compare_PSI_haplotypes/bam_folder",
        out_file,
        3,
    )
    expected = gen.read_many_fields("test_data/bam_ops/test_compare_PSI_haplotypes/expected.txt", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_get_snp_relative_cds_position_plus_strand_split(self):
    """Relative CDS positions should match for plus-strand split input."""
    out_file = "test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/observed_test_snp_relative_cds_position.bed"
    gen.remove_file(out_file)  # clear any previous run
    rel_exon_positions = gen.read_many_fields("test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/test_snp_relative_exon_position.bed", "\t")
    get_snp_relative_cds_position(rel_exon_positions, out_file, "test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/full_bed.bed")
    expected = gen.read_many_fields("test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/expected_test_snp_relative_cds_position.bed", "\t")
    self.assertEqual(gen.read_many_fields(out_file, "\t"), expected)
def test_tabix(self):
    """tabix should extract the expected records from the 1000 genomes vcf."""
    out_file = "test_data/snp_ops/observed_test_tabix.bed"
    gen.remove_file(out_file)
    vcf_file = "../source_data/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.gz"
    tabix("test_data/snp_ops/test_tabix/test_tabix_bed.txt", out_file, vcf_file)
    observed = gen.read_many_fields(out_file, "\t")
    expected = gen.read_many_fields("test_data/snp_ops/test_tabix/expected_test_tabix.txt", "\t")
    # record order is not guaranteed, hence the sorting before comparison
    self.assertEqual(sorted(observed), sorted(expected))
def test_get_snp_type(self):
    """get_snp_type should classify each snp against its cds sequence."""
    cds_list = gen.read_many_fields("test_data/snp_ops/test_get_snp_type/test_cdss.bed", "\t")
    snp_info = gen.read_many_fields("test_data/snp_ops/test_get_snp_type/test_snp_cds_info.bed", "\t")
    observed = []
    # snp i pairs with cds i in the two fixture files
    for idx, snp in enumerate(snp_info):
        cds_codon, snp_codon, mutation_type = get_snp_type(cds_list[idx][0], snp)
        observed.append([cds_codon, snp_codon, mutation_type])
    expected = gen.read_many_fields("test_data/snp_ops/test_get_snp_type/expected_snp_types.bed", "\t")
    self.assertEqual(observed, expected)
def test_group_flags(self):
    """group_flags should aggregate flag columns from index 3 onwards."""
    out_file = "test_data/bam_ops/test_group_flags/observed_test_group_flags.bed"
    gen.remove_file(out_file)  # start fresh
    group_flags("test_data/bam_ops/test_group_flags/test_tabix.bed", out_file, 3)
    expected = gen.read_many_fields("test_data/bam_ops/test_group_flags/expected_test_group_flags.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_filter_by_snp_type(self):
    """Filtering for 'non' should keep only the expected snps."""
    out_file = "test_data/snp_ops/test_filter_by_snp_type/observed_snps.bed"
    gen.remove_file(out_file)
    filter_by_snp_type("test_data/snp_ops/test_filter_by_snp_type/input_snps.bed", out_file, "non")
    observed = gen.read_many_fields(out_file, "\t")
    expected = gen.read_many_fields("test_data/snp_ops/test_filter_by_snp_type/expected_snps.bed", "\t")
    self.assertEqual(observed, expected)
def test_remove_overlaps2(self):
    """remove_overlaps should drop overlapping intervals (second fixture)."""
    out_file = "test_data/bed_ops/test_remove_overlaps2/observed.bed"
    gen.remove_file(out_file)
    remove_overlaps("test_data/bed_ops/test_remove_overlaps2/in.bed", out_file)
    expected = gen.read_many_fields("test_data/bed_ops/test_remove_overlaps2/expected.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_merge_and_header(self):
    """merge_and_header should concatenate the files under one header."""
    out_file = "test_data/snp_ops/test_merge_and_header/observed.txt"
    gen.remove_file(out_file)
    merge_and_header(
        "test_data/snp_ops/test_merge_and_header/file1.txt",
        "test_data/snp_ops/test_merge_and_header/file2.txt",
        out_file,
    )
    expected = gen.read_many_fields("test_data/snp_ops/test_merge_and_header/expected.txt", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_check_coding(self):
    """check_coding should flag exons by their coding status."""
    out_file = "test_data/bed_ops/test_check_coding/observed_check_coding.bed"
    gen.remove_file(out_file)
    check_coding(
        "test_data/bed_ops/test_check_coding/exons.bed",
        "test_data/bed_ops/test_check_coding/CDSs.bed",
        out_file,
    )
    expected = gen.read_many_fields("test_data/bed_ops/test_check_coding/expected_check_coding.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_intersect_bed_intersect_bedops(self):
    """intersect_bed via bedops with intersect=True should match expected."""
    out_file = "test_data/bam_ops/test_intersect_bed_intersect_bedops/observed_test_intersect_bed_intersect_bedops.bed"
    gen.remove_file(out_file)
    intersect_bed(
        "test_data/bam_ops/test_intersect_bed_intersect_bedops/test_intersect_bed_A_file.bed",
        "test_data/bam_ops/test_intersect_bed_intersect_bedops/test_intersect_bed_B_file.bed",
        output_file=out_file,
        no_dups=False,
        use_bedops=True,
        intersect=True,
    )
    expected = gen.read_many_fields("test_data/bam_ops/test_intersect_bed_intersect_bedops/expected_test_intersect_bed_intersect_bedops.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_get_dinucleotides_contact(self):
    """get_dinucleotides with concat_motifs should match the stored list."""
    # fixtures are single-column csv files; flatten the rows to strings
    motif_rows = gen.read_many_fields("test_data/test_get_dinucleotides_concat/motif_set.txt", ",")
    motif_set = [row[0] for row in motif_rows]
    observed = se.get_dinucleotides(motif_set, concat_motifs=True)
    expected_rows = gen.read_many_fields("test_data/test_get_dinucleotides_concat/expected_dinucleotides.txt", ",")
    expected = [row[0] for row in expected_rows]
    self.assertEqual(expected, observed)
def test_get_dinucleotides_reg(self):
    """get_dinucleotides with default arguments should match the stored list."""
    # fixtures are single-column csv files; flatten the rows to strings
    motif_rows = gen.read_many_fields("test_data/test_get_dinucleotides_reg/motif_set.txt", ",")
    motif_set = [row[0] for row in motif_rows]
    observed = se.get_dinucleotides(motif_set)
    expected_rows = gen.read_many_fields("test_data/test_get_dinucleotides_reg/expected_dinucleotides.txt", ",")
    expected = [row[0] for row in expected_rows]
    self.assertEqual(expected, observed)
def test_intersect_bed_force_strand_hit_count(self):
    """intersect_bed with force_strand and hit_count should match expected."""
    out_file = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/observed_test_intersect_bed_force_strand_hit_count.bed"
    gen.remove_file(out_file)
    intersect_bed(
        "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/test_intersect_bed_A_file.bed",
        "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/test_intersect_bed_B_file.bed",
        output_file=out_file,
        no_dups=False,
        force_strand=True,
        hit_count=True,
    )
    expected = gen.read_many_fields("test_data/bam_ops/test_intersect_bed_force_strand_hit_count/expected_test_intersect_bed_force_strand_hit_count.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_get_descriptions(self):
    """get_descriptions should pull descriptions for the named transcripts."""
    out_file = "test_data/bed_ops/test_get_descriptions/observed_get_descriptions.txt"
    gen.remove_file(out_file)
    transcript_names = ["ENST100", "ENST7", "ENST0003", "ENST5"]
    get_descriptions(transcript_names, "test_data/bed_ops/test_get_descriptions/descriptions.gtf", out_file)
    expected = gen.read_many_fields("test_data/bed_ops/test_get_descriptions/expected_get_descriptions.txt", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_extract_exon_junctions_window(self):
    """Junction extraction with a 30 nt window should match the fixture."""
    out_file = "test_data/bed_ops/test_extract_exon_junctions_window/observed_test_extract_exon_window_junctions.bed"
    gen.remove_file(out_file)
    extract_exon_junctions("test_data/bed_ops/test_extract_exon_junctions_window/test_extract_exon_junctions.bed", out_file, 30)
    expected = gen.read_many_fields("test_data/bed_ops/test_extract_exon_junctions_window/expected_test_extract_exon_window_junctions.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_filter_bed_from_fasta(self):
    """Bed entries without a matching fasta record should be filtered out."""
    out_file = "test_data/bed_ops/test_filter_bed_from_fasta/observed_test_filter_bed_from_fasta.bed"
    gen.remove_file(out_file)
    filter_bed_from_fasta(
        "test_data/bed_ops/test_filter_bed_from_fasta/test_filter_bed_from_fasta.bed",
        "test_data/bed_ops/test_filter_bed_from_fasta/test_filter_bed_from_fasta.fasta",
        out_file,
    )
    expected = gen.read_many_fields("test_data/bed_ops/test_filter_bed_from_fasta/expected_test_filter_bed_from_fasta.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_extract_exons(self):
    """extract_exons should convert the gtf exons to the expected bed."""
    out_file = "test_data/bed_ops/test_extract_exons/observed_test_extract_exons.bed"
    gen.remove_file(out_file)
    extract_exons("test_data/bed_ops/test_extract_exons/test_extract_exons.gtf", out_file)
    expected = gen.read_many_fields("test_data/bed_ops/test_extract_exons/expected_test_extract_exons.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_extract_features_cdss_stops(self):
    """extract_features should pull CDS and stop_codon records from the gtf."""
    out_file = "test_data/bed_ops/test_extract_features_cdss_stops/observed_test_extract_features_cdss_stops.bed"
    gen.remove_file(out_file)
    extract_features("test_data/bed_ops/test_extract_features_cdss_stops/test_extract_features.gtf", out_file, ['CDS', 'stop_codon'])
    observed = gen.read_many_fields(out_file, "\t")
    expected = gen.read_many_fields("test_data/bed_ops/test_extract_features_cdss_stops/expected_test_extract_features_cdss_stops.bed", "\t")
    self.assertEqual(observed, expected)
def test_intersect_bed_overlap(self):
    """intersect_bed with a 0.5 overlap threshold should match expected."""
    out_file = "test_data/bam_ops/test_intersect_bed_overlap/observed_test_intersect_bed_overlap.bed"
    gen.remove_file(out_file)
    intersect_bed(
        "test_data/bam_ops/test_intersect_bed_overlap/test_intersect_bed_A_file.bed",
        "test_data/bam_ops/test_intersect_bed_overlap/test_intersect_bed_B_file.bed",
        output_file=out_file,
        no_dups=False,
        overlap=0.5,
    )
    expected = gen.read_many_fields("test_data/bam_ops/test_intersect_bed_overlap/expected_test_intersect_bed_overlap.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))
def test_filter_exon_junctions(self):
    """Only junctions whose exons survive filtering should be kept."""
    out_file = "test_data/bed_ops/test_filter_exon_junctions/observed_filter_exon_junctions.bed"
    gen.remove_file(out_file)
    filter_exon_junctions(
        "test_data/bed_ops/test_filter_exon_junctions/exon_junctions.bed",
        "test_data/bed_ops/test_filter_exon_junctions/exons.bed",
        out_file,
    )
    expected = gen.read_many_fields("test_data/bed_ops/test_filter_exon_junctions/expected_filter_exon_junctions.bed", "\t")
    self.assertEqual(expected, gen.read_many_fields(out_file, "\t"))