Пример #1
0
"""Created:  Monday August 13, 2018
   Modified: Monday August 20, 2018
   Jorge Luis Flores
   Generates a normalizing vector of 940 structural motifs from the collection of all mRNAs
   This vector is the average of all windows folded and will be used to normalize"""
import sys
sys.path.append('/u/floresj/Pyth_modules/')

import motifs_vector_3 as mv
from scipy.stats.stats import pearsonr
import numpy as np
import pandas as pd

from datetime import datetime as dt

# Full motif list as reported by the project's motifs_vector_3 module.
ALL_MOTIFS = mv.list_of_all_motifs()

## keep track of execution time
start_time = dt.now()

# Load every subset first and concatenate ONCE at the end.
# DataFrame.append() re-copied the whole accumulated frame on each iteration
# (quadratic in the number of rows) and was removed in pandas 2.0, so
# pd.concat over a list of frames is both the faster and the supported form.
_subset_frames = []
for i in ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13']:
    filename = '/u/floresj/mRNA_norm/mRNA_vectors/mrna_folded_int_subset' + i
    tmp_df = pd.read_parquet(filename)
    # keep only the first 101 columns of each subset, stored as uint16
    _subset_frames.append(tmp_df.iloc[:, :101].astype('uint16'))

    print(f'{dt.now() - start_time}\tLoaded dataframe {i}')

# Row-wise concatenation; the original row index of every subset is preserved,
# exactly as repeated DataFrame.append() calls did.
freq_df = pd.concat(_subset_frames)
del _subset_frames  # drop the per-subset references so only freq_df holds the data

# One column is not counted as a motif, hence the "- 1".
# NOTE(review): the original comment said 941 columns when all motifs are used,
# but the loop above keeps at most 101 columns per subset, so nb_motifs is at
# most 100 here -- confirm which column is the non-motif one.
nb_motifs = len(freq_df.columns.values) - 1
        all_mrna.append(rna)

    # initialize logging file
    # Mode 'w' truncates any previous log; only a version line and a
    # tab-separated column header are written here.
    with open('/u/floresj/mRNA_norm/normalization_fold_all_log.txt',
              'w') as fp:
        fp.write('Version: Monday August 20, 2018\n')
        fp.write('Time\t\tProcessed\n')

    ## keep track of execution time
    start_time = dt.now()

    # multiprocessing setup
    # TimeCounter is defined outside this excerpt; it is built with the number
    # of mRNAs and is read below through vector_sum, nb_windows and count.
    compteur = TimeCounter(len(all_mrna))
    # 'spawn' starts each worker in a fresh interpreter. set_start_method()
    # may only be called once per program.
    multiprocessing.set_start_method('spawn')
    # Pool of 36 worker processes.
    pool = multiprocessing.Pool(36)

    # Submit one asynchronous task per mRNA; the return value of get_sum_signs
    # is handed to compteur.print_progress as the callback argument.
    # NOTE(review): no error_callback is given and the AsyncResult objects are
    # discarded, so an exception raised inside get_sum_signs would not surface
    # here -- confirm that this is acceptable.
    for rna in all_mrna:
        pool.apply_async(get_sum_signs, (rna, ),
                         callback=compteur.print_progress)
        # Serial equivalent of the call above, presumably kept for debugging:
        #compteur.print_progress(get_sum_signs(rna))
    # No further tasks will be submitted; block until every worker is done.
    pool.close()
    pool.join()

    # average the sum of frequencies by number of windows used
    # (vector_sum and nb_windows are presumably accumulated by print_progress
    # -- verify in TimeCounter)
    vector_avg = compteur.vector_sum / compteur.nb_windows
    print(f'{compteur.count}\t Sequences folded')

    # Motif labels, used as the index of the averaged vector.
    motif_index = mv.list_of_all_motifs()

    # Build the frame with motifs as the index, then transpose so the motifs
    # become the columns, and write the result to CSV.
    vector_df = pd.DataFrame(vector_avg, index=motif_index).transpose()
    vector_df.to_csv('/u/floresj/mRNA_norm/mrna_avg_vector.csv')