def get_sum_signs(rna,
                  mcff_args='-t 1',
                  mcff_timeout=300,
                  win_size=79,
                  win_skip=1):
    '''Sums the signature vectors of every full-length window of an RNA.

       rna must expose its sequence as rna.seq (documented by the original
       author as a SeqRecord object).
       Returns a tuple (summed signature vector, number of windows used);
       the sum is NOT averaged over the windows.

       mcff_args     = parameters forwarded to mcff, defaulted to -t 1
       mcff_timeout  = timeout forwarded to mcff, in seconds
       win_size      = length of each window
       win_skip      = offset between the starts of consecutive windows'''

    sequence = rna.seq
    total = np.zeros(940)
    n_windows = 0

    for start in range(0, len(sequence), win_skip):
        window = sequence[start:start + win_size]

        # a truncated window means the end of the sequence was reached
        if len(window) < win_size:
            break

        # dot-brackets and shapes of the window feed its signature
        dotbs, shapes = mv.dotbs_and_shapes(
            window,
            parameters=mcff_args,
            timeout_in_seconds=mcff_timeout)
        total += mv.shape60_ncm40_ncmexp500_expseq340(window, dotbs, shapes)
        n_windows += 1

    return (total, n_windows)
def get_sum_signs_int(rna, mcff_args='-t 1', mcff_timeout=300, win_size=79, win_skip=WIN_SKIP):
    '''Computes one integer signature vector per full-length window of an RNA.

       rna must expose rna.seq and rna.id (documented by the original author
       as a SeqRecord object).
       Returns a tuple of:
         - an array built from the per-window vectors, each cast to int
         - the number of windows processed
         - the list of dot-bracket counts reported for each window
         - rna.id
         - the list of start positions of the processed windows
       Used to make more lightweight .csv files in which all numeric entries are ints

       mcff_args     = parameters forwarded to mcff, defaulted to -t 1
       mcff_timeout  = timeout forwarded to mcff, in seconds
       win_size      = length of each window
       win_skip      = offset between the starts of consecutive windows'''

    sequence = rna.seq
    window_vectors = []
    dotbs_counts = []
    window_starts = []

    for start in range(0, len(sequence), win_skip):
        window = sequence[start:start+win_size]

        # a truncated window means the end of the sequence was reached
        if len(window) < win_size:
            break

        # dot-brackets and shapes of the window feed its (non-divided) signature
        dotbs, shapes = mv.dotbs_and_shapes(window, parameters=mcff_args, timeout_in_seconds=mcff_timeout)
        vector, nb_dotbs, _, _ = mv.shape60_ncm40_ncmexp500_expseq340_nodiv(window, dotbs, shapes)

        # the three lists stay aligned: one entry per processed window
        window_vectors.append(vector)
        dotbs_counts.append(nb_dotbs)
        window_starts.append(start)

    int_vectors = np.array([vector.astype(int) for vector in window_vectors])

    return (int_vectors, len(window_vectors), dotbs_counts, rna.id, window_starts)
Example #3
0
    # retrieve the DF of motifs from file
    print('Loading file...')
    df = pd.read_parquet(
        f'/u/floresj/mRNA_norm/mRNA_vectors/mrna_folded_int_subset{FILE_NUMBER}'
    )
    print('File has loaded.')
    df_indices = sorted(list({rna_id for rna_id, _ in df.index.values}))

    # calculate the frequency for aptamer-21
    seq_list = [
        seq for seq in parse(
            '/u/floresj/Transcriptome_scanning_apta21/aptamer_21.fa',
            format='fasta')
    ]
    apt_21 = seq_list[0]
    dotbs, shapes = mv.dotbs_and_shapes(apt_21.seq)
    apt21_sign, _, _ = mv.shape60_ncm40_ncmexp500_expseq340(
        apt_21.seq, dotbs, shapes)

    ## keep track of execution time
    print('-------------------------------')
    print('Version: Thursday July 26, 2018')
    print('Time\t\tProcessed')
    start_time = dt.now()

    # multiprocessing setup
    multiprocessing.set_start_method('spawn')
    compteur = TimeCounter(len(df_indices))
    pool = multiprocessing.Pool(20)

    # process all transcripts