def t_test_and_pickle(fnames_dic, chromo, output_dir, group_1_samples,
                      group_2_samples, main_dir, queue_obj, min_counts):
    '''
    T-test every AS event of one chromosome and pickle each result.

    Bundles the per-chromosome steps into one callable so it can be used
    as the target of a worker process.  Nothing is returned: the result is
    handed back through queue_obj as a one-entry dictionary
    {chromo: [value returned by save_dic_as_pickle for each event]}.

    Arguments:
        fnames_dic: unused; accepted only so that existing call sites keep
            working.  The result dictionary is always built from scratch.
        chromo: chromosome name.  Used as the sub-directory of output_dir
            that receives the pickles and as the key of the result.
        output_dir: directory under which the <chromo> folder is created.
        group_1_samples: sample names of group 1, forwarded to
            get_all_fnames and get_psi_dic_across_samples.
        group_2_samples: sample names of group 2, forwarded likewise.
        main_dir: forwarded to get_all_fnames and
            get_psi_dic_across_samples.
        queue_obj: any object with a put() method; receives the result.
        min_counts: forwarded to get_psi_dic_across_samples.
    '''
    # Keys added to every per-event dictionary before it is pickled.
    pval_str = 'pval'
    event_str = 'event'

    # Create directory to store pickled dictionaries.
    make_dir(os.path.join(output_dir, chromo))

    # Get list of AS events that need to be t-tested.  The helper is run
    # on each group separately and the results are merged (original
    # author's note: this is meant to ensure each list contains at least
    # one element).
    group_1_fnames_list = get_all_fnames(group_1_samples, main_dir, chromo)
    group_2_fnames_list = get_all_fnames(group_2_samples, main_dir, chromo)

    # Remove repeats.  Going through a set means the processing order
    # below is not the order in which the filenames were listed.
    master_fnames_list = list(set(group_1_fnames_list + group_2_fnames_list))

    # Do t-test between the two groups, one event file at a time.
    fnames_pickled_list = []
    for fname in master_fnames_list:
        # Get dictionary containing psi information for all samples.
        psi_info_dic, _ = get_psi_dic_across_samples(
            fname, group_1_samples, group_2_samples, main_dir, chromo,
            output_dir, min_counts)

        # Add pval and event to dic; both are stored as one-element lists.
        psi_info_dic[pval_str] = [t_test_psi_info(psi_info_dic)]
        # Event name is the part of fname before the first '.'.
        # NOTE(review): an event name that itself contains '.' would be
        # truncated here -- confirm that cannot happen.
        psi_info_dic[event_str] = [fname.split('.')[0]]

        # Save dictionary as <output_dir>/<chromo>/<fname>.pickle.
        output_fullpath = os.path.join(output_dir, chromo, fname + '.pickle')
        fnames_pickled_list.append(
            save_dic_as_pickle(psi_info_dic, output_fullpath))

    print('T-tested %s events in %s' % (len(master_fnames_list), chromo))

    # Hand the result back to the parent (this function may be running in
    # a separate process, so a return value would be lost).
    queue_obj.put({chromo: fnames_pickled_list})
def t_test_and_pickle(fnames_dic, chromo, output_dir, group_1_samples,
                      group_2_samples, main_dir, queue_obj, min_counts):
    '''
    T-test every AS event of one chromosome and pickle each result.

    Bundles the per-chromosome steps into one callable so it can be used
    as the target of a worker process.  Nothing is returned: the result is
    handed back through queue_obj as a one-entry dictionary
    {chromo: [value returned by save_dic_as_pickle for each event]}.

    Arguments:
        fnames_dic: unused; accepted only so that existing call sites keep
            working.  The result dictionary is always built from scratch.
        chromo: chromosome name.  Used as the sub-directory of output_dir
            that receives the pickles and as the key of the result.
        output_dir: directory under which the <chromo> folder is created.
        group_1_samples: sample names of group 1, forwarded to
            get_all_fnames and get_psi_dic_across_samples.
        group_2_samples: sample names of group 2, forwarded likewise.
        main_dir: forwarded to get_all_fnames and
            get_psi_dic_across_samples.
        queue_obj: any object with a put() method; receives the result.
        min_counts: forwarded to get_psi_dic_across_samples.
    '''
    # Keys added to every per-event dictionary before it is pickled.
    pval_str = 'pval'
    event_str = 'event'

    # Create directory to store pickled dictionaries.
    make_dir(os.path.join(output_dir, chromo))

    # Get list of AS events that need to be t-tested.  The helper is run
    # on each group separately and the results are merged (original
    # author's note: this is meant to ensure each list contains at least
    # one element).
    group_1_fnames_list = get_all_fnames(group_1_samples, main_dir, chromo)
    group_2_fnames_list = get_all_fnames(group_2_samples, main_dir, chromo)

    # Remove repeats.  Going through a set means the processing order
    # below is not the order in which the filenames were listed.
    master_fnames_list = list(set(group_1_fnames_list + group_2_fnames_list))

    # Do t-test between the two groups, one event file at a time.
    fnames_pickled_list = []
    for fname in master_fnames_list:
        # Get dictionary containing psi information for all samples.
        psi_info_dic, _ = get_psi_dic_across_samples(
            fname, group_1_samples, group_2_samples, main_dir, chromo,
            output_dir, min_counts)

        # Add pval and event to dic; both are stored as one-element lists.
        psi_info_dic[pval_str] = [t_test_psi_info(psi_info_dic)]
        # Event name is the part of fname before the first '.'.
        # NOTE(review): an event name that itself contains '.' would be
        # truncated here -- confirm that cannot happen.
        psi_info_dic[event_str] = [fname.split('.')[0]]

        # Save dictionary as <output_dir>/<chromo>/<fname>.pickle.
        output_fullpath = os.path.join(output_dir, chromo, fname + '.pickle')
        fnames_pickled_list.append(
            save_dic_as_pickle(psi_info_dic, output_fullpath))

    print('T-tested %s events in %s' % (len(master_fnames_list), chromo))

    # Hand the result back to the parent (this function may be running in
    # a separate process, so a return value would be lost).
    queue_obj.put({chromo: fnames_pickled_list})
def main():
    '''
    Command-line entry point: t-test all chromosomes and write a summary.

    Steps:
      1. Parse the command-line options (see the help strings below).
      2. Read the two groups of sample names and filter them with
         check_if_empty_dir.
      3. Start one Process per chromosome, each running t_test_and_pickle,
         and merge the dictionaries they put on the shared queue.
      4. Save the merged dictionary to
         <output_dir>/<output_fname>_filenames_dic.pickle, read it back,
         and pass it to read_pickle_write_to_file to write the summary to
         <output_dir>/<output_fname>.

    Exits through parser.error (usage message, exit status 2) when one of
    the path options is missing.
    '''
    parser = OptionParser()
    parser.add_option('-1', '--group1_file', dest='group_1_samplenames_file',
                      help='Filename containing group 1 sample names (PCa)')
    parser.add_option('-2', '--group2_file', dest='group_2_samplenames_file',
                      help='Filename containing group 2 sample names (NEPC)')
    parser.add_option('-d', '--main_directory', dest='main_dir',
                      help='Main directory containing miso output results.')
    parser.add_option('-o', '--output_directory', dest='output_dir',
                      help='Output directory of t-test results.')
    parser.add_option('-O', '--output_filename', dest='output_fname',
                      help='Output filename of the t-test results.')
    parser.add_option('-m', '--min_counts', type='int', dest='min_counts',
                      help=('Minimum junction read counts to be considered '
                            'into the t-test. Best practices says 10.'))

    # Parse options; positional arguments are ignored.
    (options, _) = parser.parse_args()

    # Fail fast with a usage message instead of letting a None path reach
    # os.path.join() (TypeError) or the file-reading helpers further down.
    required_options = (
        ('-1/--group1_file', options.group_1_samplenames_file),
        ('-2/--group2_file', options.group_2_samplenames_file),
        ('-d/--main_directory', options.main_dir),
        ('-o/--output_directory', options.output_dir),
        ('-O/--output_filename', options.output_fname),
    )
    missing = [flag for flag, value in required_options if value is None]
    if missing:
        parser.error('Missing required option(s): %s' % ', '.join(missing))

    # Define constants from options.
    group_1_samplenames_file = options.group_1_samplenames_file
    group_2_samplenames_file = options.group_2_samplenames_file
    main_dir = options.main_dir
    output_dir = options.output_dir
    output_fname = options.output_fname
    # None when -m is not given; passed through to the workers unchanged.
    min_counts = options.min_counts

    # Path of the final text summary.
    summary_fullpath = os.path.join(output_dir, output_fname)

    # Get sample names from textfile.
    group_1_samples = get_sample_names_from_file(group_1_samplenames_file)
    group_2_samples = get_sample_names_from_file(group_2_samplenames_file)

    # Create list of chromosomes.
    chr_list = create_chromo_list(prefix='chr')

    # Subset list for only those that contain miso outputs.
    group_1_samples = check_if_empty_dir(main_dir, group_1_samples, chr_list)
    group_2_samples = check_if_empty_dir(main_dir, group_2_samples, chr_list)

    # Merged result: each worker contributes one {chromo: [...]} entry.
    fnames_dic = {}

    # Run one worker process per chromosome.
    q = Queue()
    process_list = []
    for chromo in chr_list:
        print('Sending %s job to core...' % chromo)
        p = Process(target=t_test_and_pickle,
                    args=(fnames_dic, chromo, output_dir,
                          group_1_samples, group_2_samples,
                          main_dir, q, min_counts))
        process_list.append(p)
        p.start()

    # Collect exactly one result per worker BEFORE joining.  Assuming
    # Queue/Process are the multiprocessing ones (imported outside this
    # block), joining a child whose queued data has not been consumed can
    # deadlock.  q.get() blocks, so a worker that dies without putting a
    # result will stall here.
    for _ in chr_list:
        fnames_dic.update(q.get())

    # Wait for all workers to be done before continuing.
    for p in process_list:
        p.join()
    print('Completed %s jobs.' % len(chr_list))

    # Write fnames_dic as pickle file.
    pickle_filename = ''.join([output_fname, '_filenames_dic.pickle'])
    fnames_savepath = os.path.join(output_dir, pickle_filename)
    print('Saving filenames_dic.pickle to %s' % fnames_savepath)
    pickle_path = save_dic_as_pickle(fnames_dic, fnames_savepath)

    # Write information from pickle to textfile.
    print('Writing information from pickle to textfile.')
    # Read the pickle back so the summary is built from what was saved.
    fnames_dic = read_pickle(pickle_path)
    # Read and write to file.
    read_pickle_write_to_file(summary_fullpath, chr_list, fnames_dic,
                              filter_events=True)
    print('Summary file saved in: %s' % summary_fullpath)
def main():
    '''
    Command-line entry point: t-test all chromosomes and write a summary.

    Steps:
      1. Parse the command-line options (see the help strings below).
      2. Read the two groups of sample names and filter them with
         check_if_empty_dir.
      3. Start one Process per chromosome, each running t_test_and_pickle,
         and merge the dictionaries they put on the shared queue.
      4. Save the merged dictionary to
         <output_dir>/<output_fname>_filenames_dic.pickle, read it back,
         and pass it to read_pickle_write_to_file to write the summary to
         <output_dir>/<output_fname>.

    Exits through parser.error (usage message, exit status 2) when one of
    the path options is missing.
    '''
    parser = OptionParser()
    parser.add_option('-1', '--group1_file', dest='group_1_samplenames_file',
                      help='Filename containing group 1 sample names (PCa)')
    parser.add_option('-2', '--group2_file', dest='group_2_samplenames_file',
                      help='Filename containing group 2 sample names (NEPC)')
    parser.add_option('-d', '--main_directory', dest='main_dir',
                      help='Main directory containing miso output results.')
    parser.add_option('-o', '--output_directory', dest='output_dir',
                      help='Output directory of t-test results.')
    parser.add_option('-O', '--output_filename', dest='output_fname',
                      help='Output filename of the t-test results.')
    parser.add_option('-m', '--min_counts', type='int', dest='min_counts',
                      help=('Minimum junction read counts to be considered '
                            'into the t-test. Best practices says 10.'))

    # Parse options; positional arguments are ignored.
    (options, _) = parser.parse_args()

    # Fail fast with a usage message instead of letting a None path reach
    # os.path.join() (TypeError) or the file-reading helpers further down.
    required_options = (
        ('-1/--group1_file', options.group_1_samplenames_file),
        ('-2/--group2_file', options.group_2_samplenames_file),
        ('-d/--main_directory', options.main_dir),
        ('-o/--output_directory', options.output_dir),
        ('-O/--output_filename', options.output_fname),
    )
    missing = [flag for flag, value in required_options if value is None]
    if missing:
        parser.error('Missing required option(s): %s' % ', '.join(missing))

    # Define constants from options.
    group_1_samplenames_file = options.group_1_samplenames_file
    group_2_samplenames_file = options.group_2_samplenames_file
    main_dir = options.main_dir
    output_dir = options.output_dir
    output_fname = options.output_fname
    # None when -m is not given; passed through to the workers unchanged.
    min_counts = options.min_counts

    # Path of the final text summary.
    summary_fullpath = os.path.join(output_dir, output_fname)

    # Get sample names from textfile.
    group_1_samples = get_sample_names_from_file(group_1_samplenames_file)
    group_2_samples = get_sample_names_from_file(group_2_samplenames_file)

    # Create list of chromosomes.
    chr_list = create_chromo_list(prefix='chr')

    # Subset list for only those that contain miso outputs.
    group_1_samples = check_if_empty_dir(main_dir, group_1_samples, chr_list)
    group_2_samples = check_if_empty_dir(main_dir, group_2_samples, chr_list)

    # Merged result: each worker contributes one {chromo: [...]} entry.
    fnames_dic = {}

    # Run one worker process per chromosome.
    q = Queue()
    process_list = []
    for chromo in chr_list:
        print('Sending %s job to core...' % chromo)
        p = Process(target=t_test_and_pickle,
                    args=(fnames_dic, chromo, output_dir,
                          group_1_samples, group_2_samples,
                          main_dir, q, min_counts))
        process_list.append(p)
        p.start()

    # Collect exactly one result per worker BEFORE joining.  Assuming
    # Queue/Process are the multiprocessing ones (imported outside this
    # block), joining a child whose queued data has not been consumed can
    # deadlock.  q.get() blocks, so a worker that dies without putting a
    # result will stall here.
    for _ in chr_list:
        fnames_dic.update(q.get())

    # Wait for all workers to be done before continuing.
    for p in process_list:
        p.join()
    print('Completed %s jobs.' % len(chr_list))

    # Write fnames_dic as pickle file.
    pickle_filename = ''.join([output_fname, '_filenames_dic.pickle'])
    fnames_savepath = os.path.join(output_dir, pickle_filename)
    print('Saving filenames_dic.pickle to %s' % fnames_savepath)
    pickle_path = save_dic_as_pickle(fnames_dic, fnames_savepath)

    # Write information from pickle to textfile.
    print('Writing information from pickle to textfile.')
    # Read the pickle back so the summary is built from what was saved.
    fnames_dic = read_pickle(pickle_path)
    # Read and write to file.
    read_pickle_write_to_file(summary_fullpath, chr_list, fnames_dic,
                              filter_events=True)
    print('Summary file saved in: %s' % summary_fullpath)