def cluster_count_matrix(config_file, lane_id, strain_fmt_string, cond_fmt_string):
    """Cluster the log2-transformed count matrix of one lane and save it.

    Counts below the sample detection limit are raised to that limit before
    the log2 transform is applied.

    Args:
        config_file: Handed to ``cfp.parse`` to obtain the config params.
        lane_id: Identifier of the lane whose dumped count matrix is loaded.
        strain_fmt_string: Forwarded to ``customize_strains``.
        cond_fmt_string: Forwarded to ``customize_conditions``.

    Returns:
        The filename that was passed to ``record.save``, or ``None`` when the
        dumped count matrix could not be read (IOError).
    """
    config_params = cfp.parse(config_file)

    # Only the sample limit is needed here; the control limit is unpacked
    # just to honour the two-value return of get_detection_limits.
    sample_detection_limit, _control_detection_limit = get_detection_limits(config_params)

    # If the file does not exist, then do not attempt to cluster it!
    try:
        genes, conditions, matrix = load_dumped_count_matrix(config_params, lane_id)
    except IOError:
        print("could not find '{}' count matrix".format(lane_id))
        return None

    # Floor the counts in place (the loaded matrix itself is modified, there
    # is no copy), then take log2.
    matrix[matrix < sample_detection_limit] = sample_detection_limit
    logged_matrix = np.log2(matrix)

    # Customize the strain and condition names for interpretable visualization!
    # NOTE(review): config_params is passed to the customize_* helpers here,
    # while cluster_zscore_matrix passes the barcode/sample tables - confirm
    # which argument the helpers expect.
    custom_genes = customize_strains(genes, config_params, strain_fmt_string)
    custom_conditions = customize_conditions(conditions, config_params, cond_fmt_string)

    dataset = [custom_genes, custom_conditions, logged_matrix]
    record, rows_tree, cols_tree = clus.cluster(dataset)

    f = get_clustered_count_matrix_filename(config_params, lane_id)
    record.save(f, rows_tree, cols_tree)

    # Return the filename, as cluster_zscore_matrix does, so that a successful
    # run can be told apart from the "matrix not found" case (None).
    return f
def cluster_zscore_matrix(config_file, lane_id, strain_fmt_string, cond_fmt_string):
    """Cluster the dumped z-score matrix of one lane and save the result.

    Args:
        config_file: Handed to ``cfp.parse`` to obtain the config params.
        lane_id: Identifier of the lane whose dumped z-score matrix is loaded.
        strain_fmt_string: Forwarded to ``customize_strains``.
        cond_fmt_string: Forwarded to ``customize_conditions``.

    Returns:
        The filename that was passed to ``save``, or ``None`` when the dumped
        z-score matrix could not be read (IOError).
    """
    params = cfp.parse(config_file)

    # A lane without a dumped matrix has nothing to cluster.
    try:
        strain_ids, condition_ids, zscores = load_dumped_zscore_matrix(params, lane_id)
    except IOError:
        print("could not find '{}' zscore matrix".format(lane_id))
        return None

    # Relabel the strains and conditions so the clustergram is readable.
    barcode_table = get_barcode_table(params)
    samples = get_sample_table(params)
    labelled_strains = customize_strains(strain_ids, barcode_table, strain_fmt_string)
    labelled_conditions = customize_conditions(condition_ids, samples, cond_fmt_string)

    clustered, row_tree, col_tree = clus.cluster(
        [labelled_strains, labelled_conditions, zscores])

    out_name = get_clustered_zscore_matrix_filename(params, lane_id)
    clustered.save(out_name, row_tree, col_tree)

    # The caller gets the filename back so the cdt/atr/gtr files can be copied
    # next to the other clustergrams and later tarred/gzipped for distribution.
    return out_name
def get_configuration():
    """ Assemble the configuration from the command line and config file

    Returns a dict with the keys "global", "logging" and "tables".
    """
    cli_options = command_line_parser.parse()

    # A config file is only parsed when one was named on the command line
    file_options = None
    if "config" in cli_options:
        file_options = config_file_parser.parse(cli_options["config"])

    global_section = __get_global_options(cli_options, file_options)
    logging_section = __get_logging_options(cli_options, file_options)

    # When a table name came in on the command line, the table options are
    # taken from the command line only; otherwise from the config file
    if "table_name" in cli_options:
        tables_section = __get_cmd_table_options(cli_options)
    else:
        tables_section = __get_config_table_options(file_options)

    configuration = {
        "global": global_section,
        "logging": logging_section,
        "tables": tables_section,
    }

    # Ensure some basic rules
    __check_table_rules(configuration)

    return configuration
def main(config_file):
    """Run the filtering steps on the 'all_lanes' count matrix and dump it.

    Each step narrows the dataset and writes what it removed through the
    matching ``write_*`` helper. The index-tag and barcode-specific steps run
    only when enabled in the config. Progress goes to stdout at verbosity >= 1
    and the matrix shape after each step at verbosity >= 2.

    Args:
        config_file: Handed to ``cfp.parse`` to obtain the config params.
    """
    config_params = cfp.parse(config_file)
    sample_table = get_sample_table(config_params)

    # Folder that receives all of the filtered condition/strain info
    create_filtered_output_folder(config_params)

    # Optional steps are switched by these exact spellings in the config;
    # any other value raises KeyError.
    flag_lookup = {'True': True, 'TRUE': True, 'False': False, 'FALSE': False}
    drop_correlated_tags = flag_lookup[config_params['remove_correlated_index_tags']]
    drop_barcode_specific = flag_lookup[config_params['remove_barcode_specific_conditions']]

    def announce(*messages):
        # Progress messages, shown from verbosity 1 upwards
        if get_verbosity(config_params) >= 1:
            for message in messages:
                print(message)

    def show_shape(current):
        # Matrix shape, shown from verbosity 2 upwards
        if get_verbosity(config_params) >= 2:
            print(current[2].shape)

    dataset = load_dumped_count_matrix(config_params, 'all_lanes')
    show_shape(dataset)

    announce('Filtering out...', '\tPrespecified conditions to exclude')
    dataset, excluded_include_tab = filter_dataset_for_include_2(dataset, sample_table)
    show_shape(dataset)
    write_filtered_include_table(excluded_include_tab, config_params)

    announce('\tPrespecified barcodes to exclude')
    dataset, excluded_barcodes = filter_dataset_for_barcodes(dataset, config_params)
    show_shape(dataset)
    write_filtered_strain_file(excluded_barcodes, config_params)

    if drop_correlated_tags:
        announce('\tHighly-correlated index tags')
        dataset, excluded_tag_conditions = filter_dataset_for_index_tags(dataset, config_params)
        show_shape(dataset)
        write_correlated_index_tags_excluded_conditions(excluded_tag_conditions, config_params)

    if drop_barcode_specific:
        announce('\tConditions with barcode-specific signatures')
        dataset, excluded_specific_conditions = filter_dataset_for_barcode_specific_patterns(
            dataset, config_params)
        show_shape(dataset)
        write_barcode_specific_excluded_conditions(excluded_specific_conditions, config_params)

    announce('\tConditions and strains with low counts')
    dataset, low_count_conditions, low_count_barcodes = filter_dataset_for_count_degree(
        dataset, config_params, sample_table)
    show_shape(dataset)
    write_count_degree_excluded_conditions(low_count_conditions, config_params)
    write_count_degree_excluded_strains(low_count_barcodes, config_params)

    # Dump the dataset out to file
    dump_filtered_count_matrix(config_params, dataset)
def get_configuration():
    """ Build the configuration dict from command line and config file

    The result has the keys 'global', 'logging' and 'tables'.
    """
    options = command_line_parser.parse()

    # The config file is read only if the command line names one
    file_options = (
        config_file_parser.parse(options['config'])
        if 'config' in options
        else None)

    configuration = {}
    configuration['global'] = __get_global_options(options, file_options)
    configuration['logging'] = __get_logging_options(options, file_options)

    # A table name on the command line means that only the command line
    # table options are used; otherwise they come from the config file
    configuration['tables'] = (
        __get_cmd_table_options(options)
        if 'table_name' in options
        else __get_config_table_options(file_options))

    # Ensure some basic rules
    __check_table_rules(configuration)

    return configuration
def main(config_file, lane_id):
    """Turn the raw fastq data of one lane into a dumped count matrix.

    Steps, in order: parse the fastq file(s), build the barcode and index tag
    mappings, correct the barcodes, assemble the count matrix, generate the
    reports, dump the matrix and remove the temporary barseq file. Progress
    messages are printed when the configured verbosity is at least 1.

    Args:
        config_file: Handed to ``cfp.parse`` to obtain the config params.
        lane_id: Identifier of the lane to process.
    """
    config_params = cfp.parse(config_file)
    species_config_params = get_species_config_params(config_params)

    def verbose():
        # Re-read on every check, so each message is guarded individually
        return get_verbosity(config_params) >= 1

    # Loop over the raw fastq files, write out the "index_tag\tbarcode" file,
    # and collect all encountered index tags and barcodes
    if verbose():
        print('parsing fastq file(s)...')
    parsed = fastq_to_barseq(config_params, species_config_params, lane_id)
    total_counts, common_primer_counts, barcodes_in_data, index_tags_in_data = parsed

    # Maps of barcode to barcode_gene (keeps the strains unique/traceable)
    # and of index tag to condition
    if verbose():
        print('creating mappings from barcodes and index tags...')
    barcode_to_gene = get_barcode_to_gene(species_config_params)
    index_tag_to_condition = get_index_tag_to_condition(config_params, lane_id)
    if verbose():
        print('barcode to gene map: {}'.format(barcode_to_gene.items()[0:5]))
        print('index tag to condition map: {}'.format(index_tag_to_condition.items()[0:5]))

    # Correct the barcodes within the specified error tolerance.
    # (Index tags are not corrected - this could be added later, although
    # there has been no need for it.)
    if verbose():
        print('correcting barcodes...')
    barcode_correcting_map = correct_barcode_map(
        config_params, barcodes_in_data, barcode_to_gene)
    if verbose():
        print('number of barcodes that will be counted: {}'.format(len(barcode_correcting_map)))

    # Loop over the barseq file (index_tag\tbarcode\n) and assemble the
    # matrix of read counts
    if verbose():
        print('generating barseq matrix...')
    corrected_barcodes, index_tags, matrix = get_barseq_matrix(
        config_params, lane_id, barcode_to_gene,
        barcode_correcting_map, index_tag_to_condition)
    if verbose():
        print('number of barcodes: {}'.format(len(corrected_barcodes)))
        print('number of index tags: {}'.format(len(index_tags)))
        print('matrix shape: {0} rows x {1} columns'.format(*matrix.shape))

    # Generate reports for index tags and barcodes
    if verbose():
        print('generating reports...')
    generate_reports(config_params, lane_id, corrected_barcodes, index_tags,
                     matrix, total_counts, common_primer_counts)

    # Convert the barcodes and index tags to their unique gene/barcode names
    # and condition names
    gene_labels = [barcode_to_gene[bc] for bc in corrected_barcodes]
    condition_labels = [index_tag_to_condition[tag] for tag in index_tags]
    barcode_gene_ids = np.array(gene_labels)
    condition_ids = np.array(condition_labels)

    # Dump out the final count matrix - other scripts read it back and turn
    # it into a readable matrix/CDT
    if verbose():
        print('dumping count matrix...')
    dump_count_matrix(config_params, lane_id, barcode_gene_ids, condition_ids, matrix)

    # Remove the temporary barseq file
    remove_barseq_file(config_params, lane_id)
def main(config_file, lane_id):
    """Compute scaled chemical-genetic interaction deviations for one lane.

    The dumped count matrix is filtered for samples that should not be
    included, normalized using the control conditions, converted to
    deviations and then scaled. Scatterplots are generated only when
    'generate_scatterplots' is 'Y' in the config and the lane is
    'all_lanes_filtered'.

    Args:
        config_file: Handed to ``cfp.parse`` to obtain the config params.
        lane_id: Identifier of the lane to process.
    """
    config_params = cfp.parse(config_file)
    sample_table = get_sample_table(config_params)

    # Make sure the interactions output folder exists
    outfolder = get_lane_interactions_path(config_params, lane_id)
    if not os.path.isdir(outfolder):
        os.makedirs(outfolder)

    # Count matrix as dumped by the earlier step
    dataset = load_dumped_count_matrix(config_params, lane_id)

    # Drop the samples flagged as "do not include"
    filtered_dataset = filter_dataset_for_include(dataset, sample_table, config_params)

    # This is probably the best spot to split the dataset so that different
    # controls can be used for different samples.

    # Control sample ids; note these are taken from the unfiltered dataset
    control_condition_ids = get_control_condition_ids(dataset, sample_table)

    def verbose():
        return get_verbosity(config_params) >= 1

    def column_means(current):
        return np.nanmean(current[2], axis=0)

    # Algorithm to obtain chemical genetic interaction zscores (scaled deviations)
    if verbose():
        print("Normalizing ... ")
    normalized_dataset, mean_control_profile = normalizeUsingAllControlsAndSave(
        config_params, outfolder, filtered_dataset, control_condition_ids, lane_id)

    if verbose():
        print("Column means: ")
        print(column_means(normalized_dataset))
        print("Done")
        print("Calculating deviations ... ")
    deviation_dataset = deviations_globalmean(
        config_params, outfolder, normalized_dataset, mean_control_profile, lane_id)

    if verbose():
        print("Column means: ")
        print(column_means(deviation_dataset))
        print("Done")
        print("Scaling interactions ... ")
    scaled_dev_dataset = scaleInteractions(
        config_params, outfolder, deviation_dataset, filtered_dataset,
        control_condition_ids, lane_id)

    if verbose():
        print("Column means: ")
        print(column_means(scaled_dev_dataset))
        print("Done")

    wants_scatterplots = (
        'generate_scatterplots' in config_params
        and config_params['generate_scatterplots'] == 'Y'
        and lane_id == 'all_lanes_filtered')
    if wants_scatterplots:
        if verbose():
            print("Generating scatterplots")
        generate_scatterplots(config_params, outfolder, mean_control_profile,
                              filtered_dataset, normalized_dataset,
                              deviation_dataset, scaled_dev_dataset)
def main(config_file):
    """Combine the per-lane count matrices and dump them as 'all_lanes'.

    Args:
        config_file: Handed to ``cfp.parse`` to obtain the config params.
    """
    params = cfp.parse(config_file)

    # The sample table is loaded but its contents are not used below; the
    # call is kept so any error it raises still surfaces here.
    get_sample_table(params)

    # Read in all of the count matrices and combine them into one matrix
    combined = combine_count_matrices(params)

    # Make sure the folder that houses the combined count matrix exists
    target_folder = get_lane_data_path(params, 'all_lanes')
    if not os.path.isdir(target_folder):
        os.makedirs(target_folder)

    # Dump out the combined count matrix!
    dump_dataset(combined, get_dumped_count_matrix_filename(params, 'all_lanes'))
def main(config_file):
    """Combine the per-lane z-score matrices and export correlation analyses.

    Written into the index tag correlation folder: the combined z-score
    dataset, its control-only subset, the sorted index tag correlations of
    the controls, the correlations of each profile to the barcode-specific
    template profiles, and a histogram of the index tag correlations.

    Args:
        config_file: Handed to ``cfp.parse`` to obtain the config params.
    """
    config_params = cfp.parse(config_file)
    sample_table = get_sample_table(config_params)

    # Read in all of the z-score matrices and combine them into one matrix
    dataset = combine_zscore_matrices(config_params)

    # Make sure the directory for the index tag correlation analysis exists
    out_dir = get_index_tag_correlation_path(config_params)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Export the initial combined z-score matrix
    combined_path = os.path.join(out_dir, 'combined_per_lane_zscore_dataset.dump.gz')
    dump_dataset(dataset, combined_path)

    # Export just the control conditions as their own dataset
    control_ids = get_control_condition_ids(dataset, sample_table)
    control_dataset = get_control_dataset(dataset, control_ids)
    control_path = os.path.join(out_dir, 'combined_per_lane_control_zscore_dataset.dump.gz')
    dump_dataset(control_dataset, control_path)

    # Sorted index tag correlations for the control conditions, exported to
    # dump and text files
    tags_sorted, tag_corrs_sorted = get_control_index_tag_correlations(
        control_dataset, sample_table, config_params)
    write_index_tag_corrs(tags_sorted, tag_corrs_sorted, out_dir)

    # Correlations of each profile to the barcode-specific template profiles
    template_ids, template_mat = generate_barcode_specific_template_profiles(dataset[0])
    offenders = compute_max_correlation_barcode_specific_offenders(
        template_ids, template_mat, dataset[1], dataset[2], config_params)
    conditions_sorted, template_corrs_sorted, template_ids_sorted = offenders

    # Export the sorted correlations to the barcode-specific template profiles
    write_barcode_specific_template_corrs(
        conditions_sorted, template_corrs_sorted, template_ids_sorted, out_dir)

    # Plot a histogram of the index tag correlations
    plot_control_index_tag_correlations(tag_corrs_sorted, out_dir)
filename = config_params['sample_table_file'] # Read everything in as a string, to prevent vexing # number interpretation problems! Methods further down # can coerce to different types. tab = pd.read_table(filename, dtype = 'S') return tab ########################################### ####### Here is the main script ######### ########################################### # Get the config file, which is the only argument needed for the pipeline config_file = args.config_file config_params = cfp.parse(config_file) # Read in the sample table sample_table = get_sample_table(config_params) # Grab all of the ids of the lanes to process lane_ids = get_all_lane_ids(sample_table) ## Or, if you were silly and ran all lanes but the newest 4, add this in #lane_ids = ['lane51', 'lane52', 'lane53', 'lane54'] # First, get one strain X condition count matrix per lane # This only needs to be run once, unless the barcodes # or index tags change for some reason. if start <= 1: for lane_id in lane_ids: