def get_sum_signs(rna, mcff_args='-t 1', mcff_timeout=300, win_size=79,
                  win_skip=1):
    '''Sum the signatures of every full-length window of an RNA.

    A window of win_size nucleotides is moved along rna.seq in steps of
    win_skip. Each full window is passed to mv.dotbs_and_shapes and the
    resulting signature is added to a running total. Scanning stops at
    the first window that is shorter than win_size.

    rna = RNA to scan, as a SeqRecord object (only its .seq is read)
    mcff_args = mcff parameters, defaulted to -t 1
    mcff_timeout = time to take before mcff exits
    win_size = length of windows to use
    win_skip = how many nucleotides to move to the right each time

    Returns a tuple (sum of signatures, number of windows used). The sum
    is not averaged and starts out as a vector of 940 zeros.
    '''
    sequence = rna.seq
    total = np.zeros(940)
    n_windows = 0

    for start in range(0, len(sequence), win_skip):
        window = sequence[start:start + win_size]

        # a short window means the end of the sequence has been reached
        if len(window) < win_size:
            break

        dotbs, shapes = mv.dotbs_and_shapes(
            window, parameters=mcff_args, timeout_in_seconds=mcff_timeout)

        # NOTE(review): elsewhere in this file the result of this same
        # helper is unpacked into three values; confirm that it can be
        # added to the vector directly here.
        total += mv.shape60_ncm40_ncmexp500_expseq340(window, dotbs, shapes)
        n_windows += 1

    return (total, n_windows)
def get_sum_signs_int(rna, mcff_args='-t 1', mcff_timeout=300, win_size=79,
                      win_skip=WIN_SKIP):
    '''Collect one integer motif-count vector per full-length window.

    A window of win_size nucleotides is moved along rna.seq in steps of
    win_skip, stopping at the first window shorter than win_size. The
    vectors are cast to int so that the resulting files stay lightweight
    (all numeric entries are ints).

    rna = RNA to scan, as a SeqRecord object (.seq and .id are read)
    mcff_args = mcff parameters, defaulted to -t 1
    mcff_timeout = time to take before mcff exits
    win_size = length of windows to use
    win_skip = how many nucleotides to move to the right each time

    Returns a tuple of, in order:
      - array of the per-window vectors, cast to int
      - number of windows used
      - list with the number of dot-brackets reported for each window
      - rna.id of the processed transcript
      - list with the start position of each window in the sequence
    '''
    sequence = rna.seq
    signatures = []
    dotb_counts = []
    starts = []

    for start in range(0, len(sequence), win_skip):
        window = sequence[start:start + win_size]

        # a short window means the end of the sequence has been reached
        if len(window) < win_size:
            break

        dotbs, shapes = mv.dotbs_and_shapes(
            window, parameters=mcff_args, timeout_in_seconds=mcff_timeout)
        # only the vector and the dot-bracket count are kept; the last
        # two returned values are not used here
        signature, nb_dotbs, _, _ = (
            mv.shape60_ncm40_ncmexp500_expseq340_nodiv(window, dotbs, shapes))

        signatures.append(signature)
        dotb_counts.append(nb_dotbs)
        starts.append(start)

    # the cast to int is done once all windows have been processed
    int_vectors = np.array([signature.astype(int) for signature in signatures])

    # one start position is recorded per window, so their count is the
    # number of windows used
    return (int_vectors, len(starts), dotb_counts, rna.id, starts)
# load the DataFrame of motifs for the selected subset file
print('Loading file...')
df = pd.read_parquet(
    f'/u/floresj/mRNA_norm/mRNA_vectors/mrna_folded_int_subset{FILE_NUMBER}'
)
print('File has loaded.')

# distinct transcript ids, sorted; every index entry is a pair whose
# first element is the rna_id
df_indices = sorted({rna_id for rna_id, _ in df.index.values})

# compute the signature of aptamer-21 (first record of the FASTA file)
seq_list = list(
    parse('/u/floresj/Transcriptome_scanning_apta21/aptamer_21.fa',
          format='fasta')
)
apt_21 = seq_list[0]
dotbs, shapes = mv.dotbs_and_shapes(apt_21.seq)
# only the first of the three returned values is kept
apt21_sign, _, _ = mv.shape60_ncm40_ncmexp500_expseq340(
    apt_21.seq, dotbs, shapes)

## banner and start time, to keep track of execution time
print('-------------------------------')
print('Version: Thursday July 26, 2018')
print('Time\t\tProcessed')
start_time = dt.now()

# multiprocessing setup
# NOTE(review): the 'spawn' start method re-imports the main module in
# every worker; no `if __name__ == '__main__':` guard is visible around
# this code from here -- confirm one exists.
multiprocessing.set_start_method('spawn')
# counter built from the number of distinct transcripts
compteur = TimeCounter(len(df_indices))
pool = multiprocessing.Pool(20)

# process all transcripts