def test_filtered_profiles(args):
    """Print the full and the filtered profile arrays for manual comparison.

    The full array is loaded from ``args.profiles_full_file`` with
    ``IO.unpack_profiles_file``; the filtered one is read as raw bytes from
    ``args.profiles_filtered_file`` and decoded with
    ``IO.decompress_profiles_indices``. For each array the shape, the
    contents and a sum are printed. Nothing is asserted or returned.
    """
    full_profiles = IO.unpack_profiles_file(args.profiles_full_file,
                                            do_print=True)

    with open(args.profiles_filtered_file, 'rb') as filtered_handle:
        packed_filtered = filtered_handle.read()
    filtered_profiles = IO.decompress_profiles_indices(packed_filtered)

    print(full_profiles.shape)
    print(full_profiles)
    # Only rows 6..15 of the full array are summed here.
    print(full_profiles[6:16, ].sum())

    print(filtered_profiles.shape)
    print(filtered_profiles)
    print(filtered_profiles.sum())
def run_test_bins_fasta(args):
    """Assert that the binary RNA file reproduces the reference FASTA text.

    The bytes of ``args.rna_bin_file`` are decoded with
    ``IO.decompress_named_sequences`` and rendered with
    ``IO.write_named_seq_to_fasta``. The result must equal the text of
    ``args.rna_fastafile`` after every ``T`` has been replaced by ``U``.

    Raises:
        AssertionError: if the two strings differ in length or content.
    """
    with open(args.rna_bin_file, 'rb') as bin_handle:
        packed_sequences = bin_handle.read()
    seqs_by_name, names_in_order = IO.decompress_named_sequences(
        packed_sequences)
    rebuilt_fasta = IO.write_named_seq_to_fasta(seqs_by_name, names_in_order)

    with open(args.rna_fastafile, 'r') as fasta_handle:
        reference_fasta = fasta_handle.read()

    # The blanket T -> U replacement also turns "ENST" into "ENSU";
    # the second replace undoes that.
    expected_fasta = reference_fasta.replace('T', 'U')
    expected_fasta = expected_fasta.replace('ENSU', 'ENST')

    # Length is checked first so a size mismatch is reported on its own.
    assert len(rebuilt_fasta) == len(expected_fasta)
    assert rebuilt_fasta == expected_fasta
def time_calculate_MI_profiles(
        calculate_with_numba,
        test_batch_folder='/Users/student/Documents/hani/programs/pyteiser/data/test_1_batch_snrnpa1',
        exp_mask_filename="/Users/student/Documents/hani/programs/pyteiser/data/mask_files/SNRNPA1_PSI_mask.bin",
        nbins=15,
        min_occurences=5):
    """Run the MI calculation over one batch of seed profiles.

    The profiles file and the expression mask are unpacked together, the
    expression profile is discretized, the number of entries per distinct
    discretized value is printed, and
    ``calculate_MI_profiles.calculate_MI_for_seeds`` is called. Its return
    value is discarded; the function exists to exercise/time that call.

    Args:
        calculate_with_numba: forwarded unchanged to
            ``calculate_MI_for_seeds``.
        test_batch_folder: directory that contains
            ``snrnpa_profiles_4-7_4-9_4-6_14-20_30k_1.bin``. The default is
            the location that used to be hard-coded.
        exp_mask_filename: path of the expression mask file. The default is
            the location that used to be hard-coded.
        nbins: bin count handed to ``MI.discretize_exp_profile``.
        min_occurences: threshold forwarded to ``calculate_MI_for_seeds``.
    """
    profiles_filename = os.path.join(
        test_batch_folder, 'snrnpa_profiles_4-7_4-9_4-6_14-20_30k_1.bin')

    decompressed_profiles_array, index_array, values_array = \
        IO.unpack_profiles_and_mask(profiles_filename, exp_mask_filename,
                                    do_print=True)

    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins)

    # Only the per-value counts are of interest; the unique values are not.
    _, counts = np.unique(discr_exp_profile, return_counts=True, axis=0)
    print(counts)

    calculate_MI_profiles.calculate_MI_for_seeds(
        decompressed_profiles_array, index_array, discr_exp_profile,
        min_occurences, calculate_with_numba, do_print=True)
def profiles_wrapper(args):
    """Unpack the full profiles file and run the index round-trip check.

    ``args.profiles_full_file`` is unpacked with ``IO.unpack_profiles_file``
    (passing ``args.indices_mode``) and the resulting array is handed, with
    ``args``, to ``run_test_compressing_decompressing_indices``.
    """
    unpacked_profiles = IO.unpack_profiles_file(
        args.profiles_full_file, args.indices_mode, do_print=True)
    run_test_compressing_decompressing_indices(args, unpacked_profiles)
def main():
    """Read the motif file and run the representation and plotting checks.

    Arguments come from ``handler()``. The motif at index 2 goes to
    ``test_representation`` together with ``args.temp_folder``; the first
    ten motifs go to ``test_RNAstructure_plotting``.
    """
    args = handler()
    motifs = IO.read_motif_file(args.seeds_bin_file)

    single_motif = motifs[2]
    test_representation(single_motif, args.temp_folder)

    first_ten_motifs = motifs[0:10]
    test_RNAstructure_plotting(first_ten_motifs)
def main():
    """Load sequences and the expression mask, then run the elongation test.

    Arguments come from ``handler()``. Sequences are read with
    ``read_sequences``; the mask file yields an index array and a values
    array, which are discretized into ``args.nbins`` bins. Only sequences
    whose position in the index array is truthy are passed on to
    ``test_elongated_seed``.
    """
    args = handler()

    all_sequences = read_sequences(args.rna_bin_file)
    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins = args.nbins)

    # Keep the sequences selected by the mask, in their original order.
    seqs_of_interest = []
    for position in range(index_array.shape[0]):
        if index_array[position]:
            seqs_of_interest.append(all_sequences[position])

    test_elongated_seed(seqs_of_interest, discr_exp_profile, args.nbins,
                        args.number_example_matches_to_print)
def test_bins_fasta(args):
    """Assert that the binary RNA file reproduces the reference FASTA text.

    The bytes of ``args.rna_bin_file`` are decoded with
    ``IO.decompress_named_sequences`` and rendered with
    ``IO.write_named_seq_to_fasta``. The rendered text is also written to a
    fixed dump file, then compared with the text of ``args.rna_fastafile``
    after every ``T`` has been replaced by ``U``.

    Raises:
        AssertionError: if the two strings differ in length or content.
    """
    with open(args.rna_bin_file, 'rb') as bin_handle:
        packed_sequences = bin_handle.read()
    seqs_by_name, names_in_order = IO.decompress_named_sequences(
        packed_sequences)
    rebuilt_fasta = IO.write_named_seq_to_fasta(seqs_by_name, names_in_order)

    with open(args.rna_fastafile, 'r') as fasta_handle:
        reference_fasta = fasta_handle.read()

    # NOTE(review): the dump goes to an absolute, machine-specific path;
    # open() will fail wherever that directory does not exist.
    with open("/Users/student/Documents/hani/temp/temp_fasta/1.txt", 'w') as dump_handle:
        dump_handle.write(rebuilt_fasta)

    # The blanket T -> U replacement also turns "ENST" into "ENSU";
    # the second replace undoes that.
    expected_fasta = reference_fasta.replace('T', 'U')
    expected_fasta = expected_fasta.replace('ENSU', 'ENST')

    # Length is checked first so a size mismatch is reported on its own.
    assert len(rebuilt_fasta) == len(expected_fasta)
    assert rebuilt_fasta == expected_fasta
def run_test_profiles_compression_decompression(args, do_shorten_test=True):
    """Check that written profiles decompress to the calculated array.

    Profiles are calculated and written to ``args.profiles_bin_file`` by
    ``calculate_seed_profiles.calculate_write_profiles``; the file is then
    read back and decoded with ``IO.decompress_profiles``.

    Args:
        args: argument namespace; ``args.seedfile`` is overwritten with
            ``args.seeds_bin_file`` before the lists are prepared.
        do_shorten_test: when true, only the first three motifs are used.

    Raises:
        AssertionError: if the decoded array differs from the calculated one.
    """
    # Set before args is handed to prepare_lists_for_calculations.
    args.seedfile = args.seeds_bin_file
    motifs, sequences = \
        calculate_seed_profiles.prepare_lists_for_calculations(args)

    if do_shorten_test:
        motifs = motifs[0:3]

    expected_profiles = calculate_seed_profiles.calculate_write_profiles(
        motifs, sequences, args.profiles_bin_file,
        do_print=True, do_return=True)

    with open(args.profiles_bin_file, 'rb') as profiles_handle:
        packed_profiles = profiles_handle.read()
    restored_profiles = IO.decompress_profiles(packed_profiles)

    assert np.array_equal(expected_profiles, restored_profiles)
def prepare_known_seeds(args):
    """Build the motif and sequence lists used by the known-seed tests.

    ``define_constants(args)`` supplies the motif shape, the motif strings
    and the RNA bin file to read; its fourth value is not used here. Each
    string is loaded into a fresh ``structures.w_motif``; the sequences are
    read from the bin file in their stored order. Both lists are converted
    with ``type_conversions`` before being returned.

    Returns:
        tuple: ``(n_motifs_list, n_seqs_list)``.
    """
    seqs_shape, seqs_to_test, bin_file_to_test, _ = define_constants(args)

    w_motifs_list = []
    for motif_string in seqs_to_test:
        motif = structures.w_motif(seqs_shape[0], seqs_shape[1])
        motif.from_string(motif_string)
        w_motifs_list.append(motif)

    seqs_dict, seqs_order = IO.read_rna_bin_file(bin_file_to_test)
    w_seqs_list = [seqs_dict[seq_name] for seq_name in seqs_order]

    return (type_conversions.w_to_n_motifs_list(w_motifs_list),
            type_conversions.w_to_n_sequences_list(w_seqs_list))
def test_compressing_decompressing_indices(args, decompressed_profiles_array):
    """Round-trip a profiles array through the index compression on disk.

    Every row of ``decompressed_profiles_array`` is wrapped in a
    ``structures.w_profile``, compressed with ``compress_indices`` and
    appended to ``args.compressed_profiles_file``. The file is then read
    back and decoded with ``IO.decompress_profiles_indices``.

    Raises:
        AssertionError: if the decoded array differs from the input.
    """
    with open(args.compressed_profiles_file, 'wb') as out_handle:
        n_profiles, transcriptome_length = decompressed_profiles_array.shape[:2]
        for row in range(n_profiles):
            profile = structures.w_profile(transcriptome_length)
            profile.values = decompressed_profiles_array[row]
            profile.compress_indices()
            out_handle.write(profile.bytestring_indices)

    with open(args.compressed_profiles_file, 'rb') as in_handle:
        packed_profiles = in_handle.read()
    restored_profiles = IO.decompress_profiles_indices(packed_profiles)

    unchanged = (restored_profiles == decompressed_profiles_array).all()
    assert unchanged, "decompression has changed the data!"
def time_reading_fasta(fasta_file):
    """Time the sequence-object operations for every record of a FASTA file.

    The file is split on ``>``. For each non-empty record the text before
    the first newline is the annotation and the rest, with newlines
    removed, is the sequence. Four operations are each timed over 100 runs
    with ``timeit`` and the timings are printed: creating a
    ``structures.w_sequence``, filling it, compressing it, and compressing
    it as a named sequence.

    Returns:
        tuple: ``(tr_dict_loc, seqs_order)``.

    NOTE(review): the statements that would fill the two returned
    containers are disabled, so both come back empty.
    """
    tr_dict_loc = {}
    seqs_order = []
    n_runs = 100

    with open(fasta_file, 'r') as fasta_handle:
        records = fasta_handle.read().split('>')

    for record in records:
        if not record:
            continue

        header_end = record.find('\n')
        annotation = record[:header_end]
        sequence_string = record[header_end + 1:].replace('\n', '')

        current_sequence = structures.w_sequence(len(sequence_string))
        current_sequence.from_sequence(sequence_string)

        # (report template, operation) pairs, timed in this order.
        benchmarks = (
            ("Create object: %.5f",
             lambda: structures.w_sequence(len(sequence_string))),
            ("Fill object: %.5f",
             lambda: current_sequence.from_sequence(sequence_string)),
            ("Compress object: %.5f",
             lambda: current_sequence.compress()),
            ("Compress named object: %.5f",
             lambda: IO.compress_named_sequences(
                 {annotation: current_sequence}, [annotation])),
        )

        # All four timings are taken before anything is printed.
        timings = [timeit.timeit(operation, number=n_runs)
                   for _, operation in benchmarks]
        for (template, _), elapsed in zip(benchmarks, timings):
            print(template % elapsed)
        print()

    return tr_dict_loc, seqs_order
def read_sequences(rna_bin_filename):
    """Read an RNA bin file and return its sequences in converted form.

    ``IO.read_rna_bin_file`` yields a name-to-sequence mapping plus the
    order of the names; the sequences are collected in that order and
    passed through ``type_conversions.w_to_n_sequences_list``.
    """
    seqs_by_name, names_in_order = IO.read_rna_bin_file(rna_bin_filename)

    ordered_w_seqs = []
    for seq_name in names_in_order:
        ordered_w_seqs.append(seqs_by_name[seq_name])

    return type_conversions.w_to_n_sequences_list(ordered_w_seqs)
def time_compressing_sequences(fasta_file):
    """Read a FASTA file and print the sequence names in stored order.

    Only the order list returned by ``IO.read_fasta`` is used; the
    sequences themselves are ignored.
    """
    _, seqs_order = IO.read_fasta(fasta_file)
    for seq_name in seqs_order:
        print(seq_name)