def parse_shapemapper_output_files(self):
    """
    Load one rRNA_mutations object per rRNA from the shapemapper CSV output.

    The CSV files are looked up in
    <dir of shapemapper config file>/output/counted_mutations_columns/
    under the name <sample_name>_<rRNA_name>.csv, and the resulting objects
    are stored in self.rRNA_mutation_data keyed by rRNA name.

    Raises:
        AssertionError: if mod_utils.file_exists() reports that an expected
            CSV file is missing.
    """
    config_dir = os.path.dirname(self.experiment_settings.get_shapemapper_config_file())
    shapemapper_output_dir = os.path.join(config_dir, 'output', 'counted_mutations_columns')
    sample_name = self.lib_settings.sample_name
    for rRNA_name in self.experiment_settings.rRNA_seqs:
        shapemapper_output_file = os.path.join(shapemapper_output_dir,
                                               sample_name + '_' + rRNA_name + '.csv')
        # Explicit raise instead of a bare `assert`: asserts are removed under
        # `python -O`, and this way the failure names the missing file. The
        # exception type is kept as AssertionError for backward compatibility.
        if not mod_utils.file_exists(shapemapper_output_file):
            raise AssertionError('shapemapper output file not found: %s'
                                 % shapemapper_output_file)
        self.rRNA_mutation_data[rRNA_name] = rRNA_mutations(self, self.lib_settings,
                                                            self.experiment_settings,
                                                            shapemapper_output_file)
def need_to_run_shapemapper(self):
    """
    Return True if any expected shapemapper mutation-count file is missing.

    For every library setting and every rRNA name, the file
    'Pipeline_Modified_<rRNA_name>_mutation_counts.txt' is expected inside
    that library's shapemapper output directory. Returns False when
    mod_utils.file_exists() accepts all of them.
    """
    return any(
        not mod_utils.file_exists(
            os.path.join(lib.get_shapemapper_out_dir(),
                         'Pipeline_Modified_' + seq_name + '_mutation_counts.txt'))
        for lib in self.settings.iter_lib_settings()
        for seq_name in self.settings.rRNA_seqs
    )
def need_to_run_shapemapper(self):
    """
    Decide whether shapemapper has to be (re)run.

    Returns True immediately when the 'force_shapemapper' property is set.
    Otherwise returns True if, for any sample listed under 'experimentals',
    'no_mod_controls' or 'with_mod_controls' and any rRNA name, the file
    <sample>_<rRNA>.csv is missing from
    <dir of shapemapper config file>/output/counted_mutations_columns/.
    Returns False when every expected file is present.
    """
    if self.settings.get_property('force_shapemapper'):
        return True

    output_dir = os.path.join(
        os.path.dirname(self.settings.get_shapemapper_config_file()),
        'output', 'counted_mutations_columns')
    all_samples = (self.settings.get_property('experimentals')
                   + self.settings.get_property('no_mod_controls')
                   + self.settings.get_property('with_mod_controls'))
    return any(
        not mod_utils.file_exists(os.path.join(output_dir, sample + '_' + seq_name + '.csv'))
        for sample in all_samples
        for seq_name in self.settings.rRNA_seqs
    )
def star_index_exists(self):
    """Return mod_utils.file_exists() for the path given by get_star_index()."""
    return mod_utils.file_exists(self.get_star_index())
def process_settings(self, settings_file):
    """
    Read the settings file and store the typed result in self.settings.

    - every option of every section is flattened into a single dict; each
      section name is also recorded as a key with value True
    - options are converted from str to int, float, bool or list (lists are
      JSON-decoded) according to the key tables below
    - sample lists and fastq paths are stored on self, the results directory
      is created and the settings file is copied into it

    Inconsistent sample lists only produce a printed warning; processing
    continues.

    Raises:
        KeyError: if a required option is absent from the settings file.
        ValueError: if an int/float option cannot be parsed, or a boolean
            option is not "true"/"false" (case-insensitive).
        AssertionError: if mod_utils.file_exists() rejects a fastq path.
    """
    int_keys = ['first_base_to_keep', 'last_base_to_keep', 'min_post_adaptor_length',
                'min_base_quality', 'min_mapping_quality']
    float_keys = ['confidence_interval_cutoff', 'fold_change_cutoff']
    str_keys = ['adaptor_sequence', 'rrna_fasta', 'experiment_name', 'shapemapper_ref_file',
                'affected_nucleotides', 'pymol_base_script', 'pymol_base_script_colorchange',
                'tptn_file_18s', 'tptn_file_25s', 'functional_groupings']
    boolean_keys = ['collapse_identical_reads', 'force_read_resplit', 'force_remapping',
                    'force_recollapse', 'force_recount', 'force_index_rebuild', 'force_retrim',
                    'trim_adaptor', 'discard_untrimmed', 'force_shapemapper',
                    'make_interactive_plots']
    list_str_keys = ['fastq_gz_files', 'sample_names', 'experimentals', 'no_mod_controls',
                     'with_mod_controls', 'exclude_constitutive']

    config = ConfigParser.ConfigParser()
    config.read(settings_file)
    settings = {}
    for section in config.sections():
        for option in config.options(section):
            settings[option] = config.get(section, option)
        settings[section] = True

    for k in int_keys:
        settings[k] = int(settings[k])
    for k in str_keys:
        # No conversion needed; the lookup still fails fast with KeyError
        # when a required string option is missing.
        settings[k] = settings[k]
    for k in float_keys:
        settings[k] = float(settings[k])
    for k in boolean_keys:
        if settings[k].lower() not in ['true', 'false']:
            raise ValueError('Boolean value %s must be "true" or "false"' % k)
        settings[k] = settings[k].lower() == 'true'
    for k in list_str_keys:
        settings[k] = simplejson.loads(settings[k])

    self.fqdir = settings['fastq_dir']
    self.sample_names = settings['sample_names']
    self.experimentals = settings['experimentals']
    self.no_mod_controls = settings['no_mod_controls']
    self.with_mod_controls = settings['with_mod_controls']
    self.exclude_constitutive = settings['exclude_constitutive']

    # Explicit checks instead of `assert` inside a bare `except:` -- asserts
    # vanish under `python -O`, and the bare except hid unrelated errors.
    # A mismatch is reported but deliberately not fatal.
    if not (len(self.experimentals) == len(self.no_mod_controls)
            and len(self.experimentals) == len(self.with_mod_controls)):
        print('error: experimentals, no_mod_controls, and with_mod_controls should all be the same length')
        print('for mutation rate purposes, its ok to reuse a dataset here, it really doesnt matter')

    # Only the first unknown sample name is reported.
    for sample_name in self.experimentals + self.no_mod_controls + self.with_mod_controls:
        if sample_name not in self.sample_names:
            print('%s  not in sample names, make sure you are using regular quotation marks'
                  % sample_name)
            break

    self.fastq_gz_file_handles = [os.path.join(self.fqdir, fastq_gz_file)
                                  for fastq_gz_file in settings['fastq_gz_files']]
    for file_handle in self.fastq_gz_file_handles:
        # AssertionError is kept as the exception type for backward
        # compatibility, but raised explicitly so it survives `-O` and
        # names the offending path.
        if not mod_utils.file_exists(file_handle):
            raise AssertionError('fastq file not found: %s' % file_handle)

    self.settings = settings
    self.rdir = settings['results_dir']
    mod_utils.make_dir(self.rdir)
    shutil.copy(settings_file, self.rdir)
def mapped_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_mapped_reads_sam_gz()."""
    return mod_utils.file_exists(self.get_mapped_reads_sam_gz())
def mapped_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_mapped_reads()."""
    return mod_utils.file_exists(self.get_mapped_reads())
def adaptorless_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_adaptor_trimmed_reads()."""
    return mod_utils.file_exists(self.get_adaptor_trimmed_reads())
def collapsed_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_collapsed_reads()."""
    return mod_utils.file_exists(self.get_collapsed_reads())
def split_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_split_reads()."""
    return mod_utils.file_exists(self.get_split_reads())
def mutation_counts_exists(self):
    """Return mod_utils.file_exists() for the path given by get_mutation_counts()."""
    mutation_counts = self.get_mutation_counts()
    return mod_utils.file_exists(mutation_counts)
def positional_coverage_exists(self):
    """Return mod_utils.file_exists() for the path given by get_positional_coverage()."""
    positional_coverage = self.get_positional_coverage()
    return mod_utils.file_exists(positional_coverage)
def read_5p_counts_exists(self):
    """Return mod_utils.file_exists() for the path given by get_read_5p_counts()."""
    read_5p_counts = self.get_read_5p_counts()
    return mod_utils.file_exists(read_5p_counts)
def rRNA_bowtie_index_exists(self):
    """
    Return mod_utils.file_exists() for the '.1.bt2' file of the rRNA index.

    The checked path is get_rRNA_bowtie_index() with '.1.bt2' appended.
    """
    first_index_file = self.get_rRNA_bowtie_index() + '.1.bt2'
    return mod_utils.file_exists(first_index_file)
def primerless_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_primer_trimmed_reads()."""
    return mod_utils.file_exists(self.get_primer_trimmed_reads())
def trimmed_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_trimmed_reads()."""
    return mod_utils.file_exists(self.get_trimmed_reads())
def process_settings(self, settings_file):
    """
    Read the settings file and store the typed result in self.settings.

    - every option of every section is flattened into a single dict; each
      section name is also recorded as a key with value True
    - options are converted from str to int, float, bool or list (lists are
      JSON-decoded) according to the key tables below
    - sample lists and fastq paths are stored on self, the results directory
      is created and the settings file is copied into it

    Inconsistent sample lists only produce a printed warning; processing
    continues.

    Raises:
        KeyError: if a required option is absent from the settings file.
        ValueError: if an int/float option cannot be parsed, or a boolean
            option is not "true"/"false" (case-insensitive).
        AssertionError: if mod_utils.file_exists() rejects a fastq path.
    """
    int_keys = [
        'first_base_to_keep', 'last_base_to_keep', 'min_post_adaptor_length',
        'min_base_quality', 'min_mapping_quality'
    ]
    float_keys = [
        'confidence_interval_cutoff', 'fold_change_cutoff',
        'winsorization_upper_limit'
    ]
    str_keys = [
        'adaptor_sequence', 'rrna_fasta', 'experiment_name',
        'affected_nucleotides', 'pymol_base_script',
        'pymol_base_script_colorchange', 'tptn_file_18s', 'tptn_file_25s'
    ]
    boolean_keys = ['make_interactive_plots']
    list_str_keys = [
        'fastq_gz_files', 'sample_names', 'experimentals', 'no_mod_controls',
        'with_mod_controls', 'exclude_constitutive'
    ]

    config = ConfigParser.ConfigParser()
    config.read(settings_file)
    settings = {}
    for section in config.sections():
        for option in config.options(section):
            settings[option] = config.get(section, option)
        settings[section] = True

    for k in int_keys:
        settings[k] = int(settings[k])
    for k in str_keys:
        # No conversion needed; the lookup still fails fast with KeyError
        # when a required string option is missing.
        settings[k] = settings[k]
    for k in float_keys:
        settings[k] = float(settings[k])
    for k in boolean_keys:
        if settings[k].lower() not in ['true', 'false']:
            raise ValueError('Boolean value %s must be "true" or "false"' % k)
        settings[k] = settings[k].lower() == 'true'
    for k in list_str_keys:
        settings[k] = simplejson.loads(settings[k])

    self.fqdir = settings['fastq_dir']
    self.sample_names = settings['sample_names']
    self.experimentals = settings['experimentals']
    self.no_mod_controls = settings['no_mod_controls']
    self.with_mod_controls = settings['with_mod_controls']
    self.exclude_constitutive = settings['exclude_constitutive']

    # Explicit checks instead of `assert` inside a bare `except:` -- asserts
    # vanish under `python -O`, and the bare except hid unrelated errors.
    # A mismatch is reported but deliberately not fatal.
    if not (len(self.experimentals) == len(self.no_mod_controls)
            and len(self.experimentals) == len(self.with_mod_controls)):
        print('error: experimentals, no_mod_controls, and with_mod_controls should all be the same length')
        print('for mutation rate purposes, its ok to reuse a dataset here, it really doesnt matter')

    # Only the first unknown sample name is reported.
    for sample_name in self.experimentals + self.no_mod_controls + self.with_mod_controls:
        if sample_name not in self.sample_names:
            print('%s  not in sample names, make sure you are using regular quotation marks'
                  % sample_name)
            break

    self.fastq_gz_file_handles = [
        os.path.join(self.fqdir, fastq_gz_file)
        for fastq_gz_file in settings['fastq_gz_files']
    ]
    for file_handle in self.fastq_gz_file_handles:
        # AssertionError is kept as the exception type for backward
        # compatibility, but raised explicitly so it survives `-O` and
        # names the offending path.
        if not mod_utils.file_exists(file_handle):
            raise AssertionError('fastq file not found: %s' % file_handle)

    self.settings = settings
    self.rdir = settings['results_dir']
    mod_utils.make_dir(self.rdir)
    shutil.copy(settings_file, self.rdir)
def filtered_reads_exist(self):
    """Return mod_utils.file_exists() for the path given by get_filtered_reads()."""
    return mod_utils.file_exists(self.get_filtered_reads())