def write_array_of_profiles(profiles_passed_array, combined_profiles_filename):
    """Serialize every row of a profiles array into one binary file.

    Each row is wrapped in a ``structures.w_profile`` of matching length,
    compressed with its ``compress()`` method, and the resulting
    ``bytestring`` is appended to the output file in row order.

    Args:
        profiles_passed_array: array whose first axis enumerates profiles
            (its ``shape[0]`` gives the number of records written).
        combined_profiles_filename: path of the file to create; it is
            opened in binary write mode and therefore overwritten.
    """
    with open(combined_profiles_filename, 'wb') as out_handle:
        n_rows = profiles_passed_array.shape[0]
        for row_index in range(n_rows):
            row = profiles_passed_array[row_index]
            packed = structures.w_profile(row.shape[0])
            packed.values = row
            packed.compress()
            out_handle.write(packed.bytestring)
def calculate_profile_one_motif(motif, n_seqs_list, is_degenerate=False):
    """Build the occurrence profile of one motif over a list of sequences.

    Args:
        motif: motif object handed to ``is_there_motif_instance``.
        n_seqs_list: sequences to scan; one profile slot per sequence.
        is_degenerate: forwarded unchanged to ``is_there_motif_instance``.

    Returns:
        tuple: ``(profile, seconds)`` where ``profile`` is a
        ``structures.w_profile`` whose ``values[i]`` is set to ``True`` for
        every sequence in which a match was reported, and ``seconds`` is the
        wall-clock time the scan took (measured with ``time.time()``).
    """
    started_at = time.time()

    profile = structures.w_profile(len(n_seqs_list))
    for seq_index, sequence in enumerate(n_seqs_list):
        # Only matching positions are touched; the rest keep the value
        # w_profile initialised them with.
        if is_there_motif_instance(motif, sequence, is_degenerate):
            profile.values[seq_index] = True

    elapsed = time.time() - started_at
    return profile, elapsed
def filter_profiles_by_folding(w_motifs_list, w_seqs_list,
                               n_motifs_list, n_seqs_list,
                               profiles_array, output_filename,
                               window_size, MFE_ratio_thresh, is_degenerate,
                               do_print=False,
                               do_print_subs_matches=False,
                               do_print_progress=True,
                               how_often_print=100,
                               first_seed_index=6,
                               stop_seed_index=16):
    """Filter seed profiles with ``process_one_profile_one_seed`` and save them.

    For every selected seed, the matching row of ``profiles_array`` is passed
    to ``process_one_profile_one_seed``; the returned profile is wrapped in a
    ``structures.w_profile``, compressed with ``compress_indices()`` and its
    ``bytestring_indices`` is appended to ``output_filename``.

    NOTE(review): the original implementation hard-coded the seed window
    (skip ``i <= 5``, stop at ``i >= 16``), which looks like a debugging
    leftover. The window is now configurable; the defaults reproduce the
    historic behaviour so existing callers are unaffected. Pass
    ``first_seed_index=0, stop_seed_index=None`` to process every seed.

    Args:
        w_motifs_list: seeds, iterated in order; index ``i`` selects the
            matching entries of ``n_motifs_list`` and ``profiles_array``.
        w_seqs_list: forwarded to ``process_one_profile_one_seed``.
        n_motifs_list: indexed with the same ``i`` as ``w_motifs_list``.
        n_seqs_list: forwarded; its length sizes the output profiles.
        profiles_array: 2-D array indexed as ``profiles_array[i, :]``.
        output_filename: path of the binary file to (over)write.
        window_size: forwarded to ``process_one_profile_one_seed``.
        MFE_ratio_thresh: forwarded to ``process_one_profile_one_seed``.
        is_degenerate: forwarded to ``process_one_profile_one_seed``.
        do_print: forwarded, and also enables the per-seed summary line.
        do_print_subs_matches: forwarded.
        do_print_progress: forwarded.
        how_often_print: forwarded.
        first_seed_index: seeds with a smaller index are skipped
            (default ``6``, matching the previous hard-coded value).
        stop_seed_index: iteration stops at the first index ``>=`` this
            value (default ``16``); ``None`` disables the upper limit.
    """
    N_seq = len(n_seqs_list)

    with open(output_filename, 'wb') as wf:
        # iterate over seeds
        for i, w_motif in enumerate(w_motifs_list):
            if i < first_seed_index:
                continue
            if stop_seed_index is not None and i >= stop_seed_index:
                break

            n_motif = n_motifs_list[i]
            current_profile = profiles_array[i, :]

            filtered_profile = process_one_profile_one_seed(
                w_motif, n_motif,
                w_seqs_list, n_seqs_list,
                current_profile, window_size,
                is_degenerate, MFE_ratio_thresh,
                do_print=do_print,
                do_print_subs_matches=do_print_subs_matches,
                do_print_progress=do_print_progress,
                how_often_print=how_often_print)

            filtered_profile_w = structures.w_profile(N_seq)
            filtered_profile_w.values = filtered_profile
            filtered_profile_w.compress_indices()
            wf.write(filtered_profile_w.bytestring_indices)

            if do_print:
                # Entries set in the unfiltered profile but cleared by the
                # filter, i.e. what this step removed.
                difference_u_f = np.logical_and(
                    current_profile, np.invert(filtered_profile))
                print("%d out of %d transcripts were filtered out: "
                      % (difference_u_f.sum(), current_profile.sum()))
def decompress_profiles(bitstring, do_print=False, how_often_print=10000):
    """Decode a concatenation of bit-packed profile records.

    Records are read back to back until ``bitstring`` is exhausted. Each
    record is laid out as:

    * 4 bytes: profile length, read as ``np.uint32``;
    * ``ceil(length / 8)`` bytes: profile values packed one bit each;
    * 16 bytes: checksum, compared with the ``md5`` attribute of a
      ``structures.w_profile`` rebuilt from the decoded values.

    Args:
        bitstring: bytes-like object holding zero or more records.
        do_print: if true, report progress every ``how_often_print`` records.
        how_often_print: progress reporting interval (in records).

    Returns:
        numpy.ndarray: boolean array built from the list of decoded profiles
        (one row per record when all profiles have equal length); an empty
        array of shape ``(0,)`` when ``bitstring`` is empty.

    Raises:
        AssertionError: if a stored checksum does not match the recomputed
            one.
    """
    profiles_list = []
    total_length = len(bitstring)
    current_spot = 0
    counter = 0

    while current_spot < total_length:
        # Header: profile length. Converted to a Python int so that the
        # offset arithmetic below cannot wrap around in uint32.
        length_bitstring = bitstring[current_spot: current_spot + 4]
        profile_length = int(
            np.frombuffer(length_bitstring, dtype=np.uint32)[0])

        # One bit per value; a trailing partial byte (length % 8 != 0)
        # still occupies a whole byte.
        length_packed = (profile_length + 7) // 8

        values_start = current_spot + 4
        md5_start = values_start + length_packed
        record_end = md5_start + 16

        values_bitstring = bitstring[values_start: md5_start]
        md5_bitstring = bitstring[md5_start: record_end]
        current_spot = record_end

        values_packed_bits = np.frombuffer(values_bitstring, dtype=np.uint8)
        # Drop the padding bits of the last byte.
        values = np.unpackbits(values_packed_bits)[0: profile_length]

        # Rebuild the profile and recompute its checksum to validate the
        # record that was just read.
        current_profile = structures.w_profile(profile_length)
        current_profile.values = values
        current_profile.compress()
        assert md5_bitstring == current_profile.md5, \
            "md5 mismatch in profile record %d" % counter

        profiles_list.append(current_profile.values)

        counter += 1
        if do_print and counter % how_often_print == 0:
            print("Decompressed profile number ", counter)

    # The ``np.bool`` alias was removed in NumPy 1.24; builtin ``bool``
    # is equivalent on every NumPy version.
    profiles_array = np.array(profiles_list, dtype=bool)
    return profiles_array
def write_profiles_passed(last_positive_seed, MI_values_array,
                          profiles_array, passed_profiles_filename):
    """Write the top-ranked profiles to a binary file, prefixed by a count.

    Seeds are ranked by descending ``MI_values_array``; the first
    ``last_positive_seed + 1`` of them are selected. The output file holds a
    ``np.uint32`` record count followed by the ``bytestring`` of each
    selected profile (produced by ``structures.w_profile.compress()``), in
    ranking order. A negative ``last_positive_seed`` yields a file holding
    only a zero count.

    Args:
        last_positive_seed: rank (0-based) of the last seed to keep;
            negative means that no seed passed.
        MI_values_array: per-seed scores used for the ranking.
        profiles_array: array of profiles, indexed by seed along axis 0.
        passed_profiles_filename: path of the file to (over)write.
    """
    serialized = []

    if last_positive_seed >= 0:
        # Highest score first, then keep the leading block of the ranking.
        ranking = np.argsort(MI_values_array)[::-1]
        kept_indices = ranking[0:last_positive_seed + 1]

        for row in profiles_array[kept_indices]:
            packed = structures.w_profile(row.shape[0])
            packed.values = row
            packed.compress()
            serialized.append(packed.bytestring)

    header = np.uint32(len(serialized)).tobytes()
    payload = header + b''.join(serialized)

    with open(passed_profiles_filename, 'wb') as out_handle:
        out_handle.write(payload)
def decompress_profiles_indices(bitstring, do_print=False,
                                how_often_print=10000):
    """Decode a concatenation of index-compressed profile records.

    Records are read back to back until ``bitstring`` is exhausted. Each
    record is laid out as:

    * 4 bytes: profile length (``np.uint32``);
    * 4 bytes: number of stored indices, i.e. of ``True`` entries
      (``np.uint32``);
    * 4 bytes: number of bits used per stored index (``np.uint32``);
    * ``ceil(N_indices * width / 8)`` bytes: the indices, bit-packed;
    * 16 bytes: checksum, compared with the ``md5_indices`` attribute of a
      ``structures.w_profile`` rebuilt from the decoded indices.

    Args:
        bitstring: bytes-like object holding zero or more records.
        do_print: if true, report progress every ``how_often_print`` records.
        how_often_print: progress reporting interval (in records).

    Returns:
        numpy.ndarray: boolean array built from the list of decoded profiles
        (one row per record when all profiles have equal length); an empty
        array of shape ``(0,)`` when ``bitstring`` is empty.

    Raises:
        AssertionError: if a stored checksum does not match the recomputed
            one.
    """
    profiles_list = []
    total_length = len(bitstring)
    current_spot = 0
    counter = 0

    while current_spot < total_length:
        # Three uint32 header fields. They are converted to Python ints so
        # that ``N_indices * width`` and the offsets cannot overflow uint32.
        header = np.frombuffer(
            bitstring[current_spot: current_spot + 12], dtype=np.uint32)
        length = int(header[0])      # length of the profile
        N_indices = int(header[1])   # number of indices (of True)
        width = int(header[2])       # bits used per index

        # Number of bytes holding the packed indices (rounded up).
        n_payload_bits = N_indices * width
        length_packed = (n_payload_bits + 7) // 8

        # read out bitstring of the proper size
        values_start = current_spot + 12
        md5_start = values_start + length_packed
        record_end = md5_start + 16

        values_bitstring = bitstring[values_start: md5_start]
        md5_bitstring = bitstring[md5_start: record_end]
        current_spot = record_end

        # Unpack to single bits, drop the padding of the last byte and put
        # one index per row.
        indices_packed_uint8 = np.frombuffer(values_bitstring, dtype=np.uint8)
        binary_bytes_array = np.unpackbits(indices_packed_uint8)
        binary_bytes_array = binary_bytes_array[0: n_payload_bits]
        reshaped_binary_array = binary_bytes_array.reshape(N_indices, width)

        # Widen every row to 32 bits (stored bits first, zero padding after)
        # so that each row can be reinterpreted as one uint32.
        # The ``np.bool`` alias was removed in NumPy 1.24; use builtin bool.
        full_binary_array = np.zeros((N_indices, 32), dtype=bool)
        full_binary_array[:, 0:width] = reshaped_binary_array

        # convert 32-bit arrays into uint32 indices
        reshaped_full_binary_array = full_binary_array.flatten()
        reshaped_full_binary_string = np.packbits(reshaped_full_binary_array)
        true_indices = np.frombuffer(reshaped_full_binary_string,
                                     dtype=np.uint32)

        # Rebuild the profile and recompute its checksum to validate the
        # record that was just read.
        curr_profile = structures.w_profile(length)
        curr_profile.values[true_indices] = True
        curr_profile.compress_indices()
        assert md5_bitstring == curr_profile.md5_indices, \
            "md5 mismatch in profile record %d" % counter

        profiles_list.append(curr_profile.values)

        counter += 1
        if do_print and counter % how_often_print == 0:
            print("Decompressed profile number ", counter)

    profiles_array = np.array(profiles_list, dtype=bool)
    return profiles_array