def __init__(self, exp_seqs_fname, control_seqs_fname, kmer_lens,
             output_dir,
             exp_coords_fname=None,
             control_coords_fname=None,
             genome="mm9"):
    """
    Record the inputs of an exp-vs-control kmer comparison and create
    the logger.

    - exp_seqs_fname, control_seqs_fname: FASTA files holding the
      sequences for the exp and control conditions
    - kmer_lens: kmer lengths to consider
    - output_dir: output directory; the logger is handed its "logs"
      subdirectory
    - exp_coords_fname, control_coords_fname: optional coordinates
      files for the exp and control conditions
    - genome: optional genome name
    """
    # Sequence (FASTA) inputs, exp first, then control
    self.exp_seqs_fname, self.control_seqs_fname = \
        exp_seqs_fname, control_seqs_fname
    # Matching coordinates files (may be None)
    self.exp_coords_fname, self.control_coords_fname = \
        exp_coords_fname, control_coords_fname
    # Analysis parameters
    self.kmer_lens = kmer_lens
    self.genome = genome
    self.output_dir = output_dir
    # Kmer tables for each kmer length; both start out empty
    self.exp_kmer_tables = []
    self.control_kmer_tables = []
    logs_dir = os.path.join(self.output_dir, "logs")
    self.logger = utils.get_logger("MotifSet", logs_dir)
def __init__(self, sample, pipeline):
    """
    Set up quality control (QC) bookkeeping for a single sample.

    - sample: the sample to run QC on; its label is used to name the
      logger, the per-sample output directory and the QC file
    - pipeline: pipeline instance that the sample is attached to

    Calls utils.make_dir() on the per-sample QC directory and on its
    "regions" subdirectory, and finishes by calling
    self.load_qc_from_file() so QC information is loaded if a file for
    this sample already exists.
    """
    self.pipeline = pipeline
    self.sample = sample
    self.settings_info = pipeline.settings_info
    label = self.sample.label
    outdirs = self.pipeline.pipeline_outdirs
    self.logger = utils.get_logger("QualityControl.%s" % (label),
                                   outdirs["logs"])
    # Field order used when QC results are outputted: raw read counts
    # first, then derived statistics, then per-region counts.
    read_counts = [
        "num_reads",
        "num_mapped",
        "num_ribosub_mapped",
        "num_unique_mapped",
    ]
    self.regions_header = ["num_ribo", "num_exons", "num_cds",
                           "num_introns", "num_3p_utr", "num_5p_utr",
                           "num_tRNAs", "num_junctions"]
    self.qc_stats_header = ["percent_mapped", "percent_unique",
                            "percent_ribo", "percent_exons",
                            "percent_cds", "percent_introns",
                            "percent_3p_utr", "percent_5p_utr",
                            "percent_tRNAs", "3p_to_cds", "5p_to_cds",
                            "3p_to_5p", "exon_intron_ratio"]
    self.qc_header = read_counts + self.qc_stats_header + self.regions_header
    # QC results; any field that has not been filled in reads back as
    # the current value of self.na_val
    self.na_val = "NA"
    self.qc_results = defaultdict(lambda: self.na_val)
    # Output locations: <qc dir>/<label>/ and <qc dir>/<label>/regions/
    self.qc_outdir = outdirs["qc"]
    self.sample_outdir = os.path.join(self.qc_outdir, label)
    utils.make_dir(self.sample_outdir)
    self.regions_outdir = os.path.join(self.sample_outdir, "regions")
    utils.make_dir(self.regions_outdir)
    self.qc_filename = os.path.join(self.sample_outdir,
                                    "%s.qc.txt" % (label))
    self.qc_loaded = False
    # QC computations use the ensGene gene table
    self.gene_table = self.pipeline.rna_base.gene_tables["ensGene"]
    self.load_qc_from_file()
def __init__(self, settings_filename, log_output_dir, curr_sample=None):
    """
    Initialize pipeline.

    - settings_filename: settings file for the pipeline
    - log_output_dir: output directory for logging pipeline activity
    - curr_sample: set when the pipeline is invoked to run on one
      particular sample; it is then appended to the logger name

    Attributes are first reset to empty values and then the loading
    steps run in a fixed order: settings, output directories, logger,
    cluster, RNA base, samples and finally QC.
    """
    # Constructor arguments
    self.settings_filename = settings_filename
    self.log_output_dir = log_output_dir
    self.curr_sample = curr_sample
    # Settings-derived state; presumably filled in by
    # load_pipeline_settings() (not visible here)
    self.genome = self.data_type = None
    self.sequence_filenames = None
    self.parsed_settings = self.settings_info = None
    # Output directory for actual pipeline output
    self.output_dir = None
    # Directory where pipeline init files are
    self.init_dir = None
    # Paired-end or not
    self.is_paired_end = None
    # Sample/group bookkeeping
    self.sample_to_group = self.group_to_samples = None
    self.samples = []
    # Cluster object to use
    self.my_cluster = None
    # Load settings and check they are correct
    self.load_pipeline_settings()

    # Output directory layout
    self.pipeline_outdirs = {}
    # RPKM directory for the pipeline
    self.rpkm_dir = None
    # QC objects for each sample in pipeline
    self.qc_objects = {}
    self.toplevel_dirs = ["rawdata", "mapping", "qc", "analysis", "logs"]
    self.init_outdirs()

    # The logger name carries the sample when running on a single one
    if self.curr_sample is None:
        pipeline_log_name = "Pipeline"
    else:
        pipeline_log_name = "Pipeline.%s" % (self.curr_sample)
    self.logger = utils.get_logger(pipeline_log_name,
                                   self.pipeline_outdirs["logs"])
    self.load_cluster()

    # RNA Base: object storing all the relevant initialization
    # information
    self.rna_base = None
    self.load_rna_base()
    self.load_pipeline_samples()

    # QC header (order of QC fields to be outputted) starts empty
    self.qc_header = []
    self.init_qc()
    self.na_val = "NA"
def __init__(self, sample, pipeline):
    """
    Prepare the quality control (QC) state for one sample.

    - sample: the sample QC is computed for; its label names the
      logger, the per-sample output directory and the QC file
    - pipeline: pipeline instance that the sample is attached to

    utils.make_dir() is called for the per-sample QC directory and its
    "regions" subdirectory. The last step is self.load_qc_from_file(),
    which loads QC information if a file for this sample already
    exists.
    """
    self.pipeline = pipeline
    self.sample = sample
    self.settings_info = pipeline.settings_info
    sample_label = self.sample.label
    pipeline_outdirs = self.pipeline.pipeline_outdirs
    self.logger = utils.get_logger("QualityControl.%s" % (sample_label),
                                   pipeline_outdirs["logs"])
    # Order of QC fields to be outputted: read counts, then statistics,
    # then region counts.
    self.regions_header = [
        "num_ribo", "num_exons", "num_cds", "num_introns",
        "num_3p_utr", "num_5p_utr", "num_tRNAs", "num_junctions",
    ]
    self.qc_stats_header = [
        "percent_mapped", "percent_unique", "percent_ribo",
        "percent_exons", "percent_cds", "percent_introns",
        "percent_3p_utr", "percent_5p_utr", "percent_tRNAs",
        "3p_to_cds", "5p_to_cds", "3p_to_5p", "exon_intron_ratio",
    ]
    count_fields = ["num_reads", "num_mapped", "num_ribosub_mapped",
                    "num_unique_mapped"]
    self.qc_header = count_fields + self.qc_stats_header + self.regions_header
    # Missing QC fields read back as the current self.na_val
    self.na_val = "NA"
    self.qc_results = defaultdict(lambda: self.na_val)
    # QC output dir and the per-sample directory beneath it
    self.qc_outdir = pipeline_outdirs["qc"]
    self.sample_outdir = os.path.join(self.qc_outdir, sample_label)
    utils.make_dir(self.sample_outdir)
    # Regions output dir
    self.regions_outdir = os.path.join(self.sample_outdir, "regions")
    utils.make_dir(self.regions_outdir)
    # QC filename for this sample
    self.qc_filename = os.path.join(self.sample_outdir,
                                    "%s.qc.txt" % (sample_label))
    self.qc_loaded = False
    # The ensGene gene table is the one used for QC computations
    self.gene_table = self.pipeline.rna_base.gene_tables["ensGene"]
    self.load_qc_from_file()
def __init__(self, results_dir, output_dir, label=None):
    """
    Set up an object describing the results in results_dir.

    - results_dir: directory holding the results
    - output_dir: output directory, also passed to the logger
    - label: optional label; when omitted the logger is named
      "BindnSeq"

    The constructor itself only records its arguments, creates the
    logger and initializes empty tables.
    """
    self.results_dir = results_dir
    self.output_dir = output_dir
    self.label = label
    # Fall back to a generic logger name when no label is given
    self.logger_label = "BindnSeq" if label is None else label
    self.logger = utils.get_logger(self.logger_label, self.output_dir)
    # All kmer lengths to load: 4 through 9
    self.kmer_lens = list(range(4, 10))
    # Odds ratios and counts (DataFrames indexed by kmer length)
    self.odds_ratios = {}
    self.counts = {}
def load_settings(self):
    """
    Load settings for misowrap.

    Parses self.settings_filename with
    misowrap_settings.load_misowrap_settings() and copies the values
    into attributes of this object:

    - from the "settings" section: read_len, overhang_len, miso_bin_dir,
      miso_settings_filename, miso_events_dir, miso_outdir,
      prefilter_miso (defaults to False) and, when "cluster_type" is
      present, use_cluster, cluster_type and chunk_jobs
    - from the "data" section: bam_files, sample_labels,
      comparison_groups and, when present, insert_lens_dir
    - derived paths: comparisons_dir, logs_outdir, the MISO script
      paths, tables_dir and const_exons_gff

    It also calls utils.make_dir() on the MISO output, comparisons and
    logs directories, creates self.logger, and calls
    self.load_event_types().

    Exits the process with status 1 if the "ucsc" tables directory or
    the constitutive exons GFF does not exist.
    """
    settings_info, _parsed_settings = \
        misowrap_settings.load_misowrap_settings(self.settings_filename)
    self.settings_info = settings_info
    settings = self.settings_info["settings"]
    data = self.settings_info["data"]
    # Load basic settings about data
    self.read_len = settings["readlen"]
    self.overhang_len = settings["overhanglen"]
    self.miso_bin_dir = utils.pathify(settings["miso_bin_dir"])
    self.miso_settings_filename = \
        utils.pathify(settings["miso_settings_filename"])
    self.miso_events_dir = utils.pathify(settings["miso_events_dir"])
    self.miso_outdir = utils.pathify(settings["miso_output_dir"])
    # Load data-related parameters
    self.bam_files = data["bam_files"]
    if "insert_lens_dir" in data:
        self.insert_lens_dir = utils.pathify(data["insert_lens_dir"])
    # Sample labels
    self.sample_labels = data["sample_labels"]
    # Set output directories
    self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
    self.comparison_groups = data["comparison_groups"]
    self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
    # Create necessary directories
    utils.make_dir(self.miso_outdir)
    utils.make_dir(self.comparisons_dir)
    utils.make_dir(self.logs_outdir)
    if "cluster_type" in settings:
        self.use_cluster = True
        self.cluster_type = settings["cluster_type"]
        self.chunk_jobs = settings["chunk_jobs"]
    # Within this method use_cluster is only assigned when a
    # cluster_type is configured, so a missing attribute is treated as
    # False instead of raising AttributeError.
    # NOTE(review): confirm use_cluster is initialized by the
    # constructor and that this check is meant to sit at this level.
    if getattr(self, "use_cluster", False):
        print("Loading cluster information.")
        # Load cluster object if given a cluster type
        self.load_cluster()
    # Create a logger object
    if self.logger_label is None:
        self.logger_label = "misowrap"
    else:
        # Must read the attribute: there is no local or parameter
        # called logger_label in this method.
        self.logger_label = "misowrap_%s" % (self.logger_label)
    self.logger = utils.get_logger(self.logger_label, self.logs_outdir)
    # Whether to prefilter MISO events. By default, set it so that
    # MISO events are not prefiltered; the default is written back
    # into the settings.
    if "prefilter_miso" not in settings:
        settings["prefilter_miso"] = False
    self.prefilter_miso = settings["prefilter_miso"]
    # Load event types
    self.load_event_types()
    # Set path to MISO scripts
    self.run_miso_cmd = os.path.join(self.miso_bin_dir, "run_miso.py")
    self.run_events_cmd = os.path.join(self.miso_bin_dir,
                                       "run_events_analysis.py")
    self.pe_utils_cmd = os.path.join(self.miso_bin_dir, "pe_utils.py")
    # Files related to gene tables
    init_dir = self.settings_info["pipeline-files"]["init_dir"]
    self.tables_dir = os.path.join(init_dir, "ucsc")
    if not os.path.isdir(self.tables_dir):
        print("Error: %s directory does not exist." % (self.tables_dir))
        sys.exit(1)
    self.const_exons_gff = os.path.join(self.tables_dir,
                                        "exons",
                                        "const_exons",
                                        "ensGene.const_exons.gff")
    if not os.path.isfile(self.const_exons_gff):
        print("Error: Const. exons GFF %s does not exist."
              % (self.const_exons_gff))
        sys.exit(1)
def load_settings(self):
    """
    Load settings for misowrap.

    Parses self.settings_filename with
    misowrap_settings.load_misowrap_settings() and copies the values
    into attributes of this object:

    - from the "settings" section: read_len, overhang_len, miso_bin_dir,
      miso_settings_filename, miso_events_dir, miso_outdir,
      prefilter_miso (defaults to False) and, when "cluster_type" is
      present, use_cluster, cluster_type and chunk_jobs
    - from the "data" section: bam_files, sample_labels,
      comparison_groups and, when present, insert_lens_dir
    - derived paths: comparisons_dir, logs_outdir, the MISO command
      paths, tables_dir and const_exons_gff

    It also calls utils.make_dir() on the logs directory only, creates
    self.logger, and calls self.load_event_types().

    Exits the process with status 1 if the "ucsc" tables directory or
    the constitutive exons GFF does not exist.
    """
    settings_info, _parsed_settings = \
        misowrap_settings.load_misowrap_settings(self.settings_filename)
    self.settings_info = settings_info
    settings = self.settings_info["settings"]
    data = self.settings_info["data"]
    # Load basic settings about data
    self.read_len = settings["readlen"]
    self.overhang_len = settings["overhanglen"]
    self.miso_bin_dir = utils.pathify(settings["miso_bin_dir"])
    self.miso_settings_filename = \
        utils.pathify(settings["miso_settings_filename"])
    self.miso_events_dir = utils.pathify(settings["miso_events_dir"])
    self.miso_outdir = utils.pathify(settings["miso_output_dir"])
    # Load data-related parameters
    self.bam_files = data["bam_files"]
    if "insert_lens_dir" in data:
        self.insert_lens_dir = utils.pathify(data["insert_lens_dir"])
    # Sample labels
    self.sample_labels = data["sample_labels"]
    # Set output directories
    self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
    self.comparison_groups = data["comparison_groups"]
    self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
    # Create necessary directories
    utils.make_dir(self.logs_outdir)
    if "cluster_type" in settings:
        self.use_cluster = True
        self.cluster_type = settings["cluster_type"]
        self.chunk_jobs = settings["chunk_jobs"]
    # Within this method use_cluster is only assigned when a
    # cluster_type is configured, so a missing attribute is treated as
    # False instead of raising AttributeError.
    # NOTE(review): confirm use_cluster is initialized by the
    # constructor and that this check is meant to sit at this level.
    if getattr(self, "use_cluster", False):
        print("Loading cluster information.")
        # Load cluster object if given a cluster type
        self.load_cluster()
    # Create a logger object
    if self.logger_label is None:
        self.logger_label = "misowrap"
    else:
        # Must read the attribute: there is no local or parameter
        # called logger_label in this method.
        self.logger_label = "misowrap_%s" % (self.logger_label)
    self.logger = utils.get_logger(self.logger_label, self.logs_outdir)
    # Whether to prefilter MISO events. By default, set it so that
    # MISO events are not prefiltered; the default is written back
    # into the settings.
    if "prefilter_miso" not in settings:
        settings["prefilter_miso"] = False
    self.prefilter_miso = settings["prefilter_miso"]
    # Load event types
    self.load_event_types()
    # Set path to MISO scripts
    self.compare_miso_cmd = os.path.join(self.miso_bin_dir,
                                         "compare_miso")
    self.summarize_miso_cmd = os.path.join(self.miso_bin_dir,
                                           "summarize_miso")
    self.run_events_cmd = os.path.join(self.miso_bin_dir, "miso")
    self.pe_utils_cmd = os.path.join(self.miso_bin_dir, "pe_utils")
    # Files related to gene tables
    init_dir = self.settings_info["pipeline-files"]["init_dir"]
    self.tables_dir = os.path.join(init_dir, "ucsc")
    if not os.path.isdir(self.tables_dir):
        print("Error: %s directory does not exist." % (self.tables_dir))
        sys.exit(1)
    self.const_exons_gff = os.path.join(self.tables_dir,
                                        "exons",
                                        "const_exons",
                                        "ensGene.const_exons.gff")
    if not os.path.isfile(self.const_exons_gff):
        print("Error: Const. exons GFF %s does not exist."
              % (self.const_exons_gff))
        sys.exit(1)