def oncodrivefm(project):
    """Compute and save the OncodriveFM quality-control data of a project.

    The logger and the configuration are taken from the module level ``task``
    object. The expression filter is only loaded, and only handed over to
    ``quality_control()``, when it is enabled for the project.

    :param project: project description; ``project["id"]`` is used for logging
    """
    logger = task.logger
    config = task.conf

    logger.info("--- [{0}] --------------------------------------------".format(project["id"]))

    # Resolve the filter settings of the project
    fallback_path = get_data_gene_filter_path(config)
    use_filter = get_project_conf(
        config, project, "oncodrivefm.genes.filter_enabled", ONCODRIVEFM_GENES_FILTER_ENABLED)
    filter_path = get_project_conf(
        config, project, "oncodrivefm.genes.filter", fallback_path)

    # An explicit null in the project configuration switches the filter off
    if filter_path is None:
        use_filter, filter_path = False, fallback_path

    label_filter = LabelFilter()
    if use_filter:
        logger.info("Loading expression filter ...")
        logger.debug("> {0}".format(filter_path))
        label_filter.load(filter_path)

    logger.info("Calculating quality indicators for OncodriveFM ...")

    # A disabled filter is signalled to quality_control() with None
    qc_filter = label_filter if use_filter else None
    qc_data = quality_control(logger, config, project, qc_filter)

    ProjectResults(project).save_quality_control("oncodrivefm", qc_data)
def load_expression_filter(log, conf, project):
    """Resolve the OncodriveFM genes filter of a project and load it if enabled.

    :param log: logger used for reporting
    :param conf: configuration handed to the configuration helpers
    :param project: project whose settings are looked up
    :return: tuple ``(genes_filter_enabled, genes_filter, filt)`` where ``filt``
             is a ``LabelFilter`` that has only been loaded when the filter is
             enabled
    """
    fallback_path = get_data_gene_filter_path(conf)

    enabled = get_project_conf(
        conf, project, "oncodrivefm.genes.filter_enabled", ONCODRIVEFM_GENES_FILTER_ENABLED)
    path = get_project_conf(
        conf, project, "oncodrivefm.genes.filter", fallback_path)

    # An explicit null in the project configuration switches the filter off
    if path is None:
        enabled, path = False, fallback_path

    label_filter = LabelFilter()
    if not enabled:
        return enabled, path, label_filter

    log.info("Loading expression filter ...")
    log.debug("> {0}".format(path))
    label_filter.load(path)

    return enabled, path, label_filter
def get_oncodriveclust_configuration(log, conf, project):
    """Read, log and return the OncodriveCLUST settings of a project.

    :param log: logger used for reporting
    :param conf: configuration handed to the configuration helpers
    :param project: project whose settings are looked up
    :return: tuple ``(mutations_threshold, genes_filter_enabled, genes_filter,
             filt)`` where ``filt`` is a ``LabelFilter`` that has only been
             loaded when the filter is enabled
    """
    log.info("OncodriveCLUST configuration:")

    threshold = get_project_conf(
        conf, project, "oncodriveclust.mutations_threshold", ONCODRIVECLUST_MUTATIONS_THRESHOLD)

    fallback_path = get_data_gene_filter_path(conf)
    enabled = get_project_conf(
        conf, project, "oncodriveclust.genes_filter_enabled", ONCODRIVECLUST_GENES_FILTER_ENABLED)
    path = get_project_conf(
        conf, project, "oncodriveclust.genes_filter", fallback_path)

    # An explicit null in the project configuration switches the filter off
    if path is None:
        enabled, path = False, fallback_path

    for message, value in (
            (" mutations_threshold = {0}", threshold),
            (" genes_filter_enabled = {0}", enabled),
            (" genes_filter = {0}", os.path.basename(path))):
        log.info(message.format(value))

    label_filter = LabelFilter()
    if enabled:
        log.info("Loading expression filter ...")
        log.debug("> {0}".format(path))
        label_filter.load(path)

    return (threshold, enabled, path, label_filter)
class OncodriveFm(object):
    """OncodriveFM helper.

    Holds the OncodriveFM configuration, resolves the expression filter and
    the samples thresholds, and collects the per (sample, gene) scores from a
    project database.
    """

    # Functions used, position by position, to merge two score tuples that
    # belong to the same (sample, gene) pair. The positions follow the order
    # produced by _csq_scores().
    _AGGREGATE = (max, max, min, max, max, min, max, max, min)

    def __init__(self, config, paths, logger=None):
        """
        :param config: intogensm.config.OncodriveFmConfig
        :param paths: PathsConfig
        :param logger: Optional logger
        """
        self.config = config
        self.paths = paths
        self.logger = logger or get_logger("oncodrivefm")

        # Filled in by load_samples_thresholds()
        self.genes_samples_threshold = self.pathways_samples_threshold = None

        # Filled in by load_expression_filter()
        self.filter_enabled = self.filter_path = self.filter = None

    def load_expression_filter(self):
        """Resolve the filter settings and load the filter file if enabled."""
        default_filter = self.paths.data_gene_filter_path()

        self.filter_enabled = self.config.filter_enabled
        self.filter_path = self.config.filter_path or default_filter

        # Neither the configuration nor the defaults provide a path
        if self.filter_path is None:
            self.filter_enabled = False
            self.filter_path = default_filter

        self.filter = LabelFilter()

        if self.filter_enabled:
            self.logger.info("Loading expression filter ...")
            self.logger.debug("> {0}".format(self.filter_path))
            self.filter.load(self.filter_path)

    def load_samples_thresholds(self, num_samples):
        """Resolve the genes and pathways thresholds through get_threshold().

        :param num_samples: number of samples handed to get_threshold()
        """
        self.genes_samples_threshold = get_threshold(
            self.logger, self.config.genes.threshold, num_samples)
        self.pathways_samples_threshold = get_threshold(
            self.logger, self.config.pathways.threshold, num_samples)

    def load_config(self, num_samples):
        """Load the filter and the thresholds, then log the configuration.

        :param num_samples: number of samples used to resolve the thresholds
        """
        self.load_expression_filter()
        self.load_samples_thresholds(num_samples)
        self.log_config()

    def log_config(self):
        """Log the current configuration at info level.

        :return: self
        """
        info = self.logger.info

        info("OncodriveFM configuration:")
        info(" num_cores = {0}".format(self.config.num_cores))
        info(" estimator = {0}".format(self.config.estimator))
        info(" filter_enabled = {0}".format(self.filter_enabled))
        # The path is None until load_expression_filter() resolves one
        info(" filter_path = {0}".format(
            os.path.basename(self.filter_path) if self.filter_path is not None else None))
        info(" Genes:")
        info(" num_samplings = {0}".format(self.config.genes.num_samplings))
        info(" samples_threshold = {0}".format(self.genes_samples_threshold))
        info(" Pathways:")
        info(" num_samplings = {0}".format(self.config.pathways.num_samplings))
        info(" samples_threshold = {0}".format(self.pathways_samples_threshold))

        return self

    @staticmethod
    def _csq_scores(csq):
        """Return the nine score fields of a consequence as a tuple."""
        return (
            csq.sift_score, csq.sift_tfic, csq.sift_tfic_class,
            csq.pph2_score, csq.pph2_tfic, csq.pph2_tfic_class,
            csq.ma_score, csq.ma_tfic, csq.ma_tfic_class)

    def _merge_scores(self, current, new):
        """Merge two score tuples position by position using _AGGREGATE."""
        return tuple(
            aggregate(a, b) for aggregate, a, b in zip(self._AGGREGATE, current, new))

    def retrieve_data(self, projdb):
        """Collect the scores of every (sample, gene) pair from the database.

        Consequences that hit the same pair are merged with _AGGREGATE.

        :param projdb: project database providing consequences()
        :return: dict mapping (sample id, gene) to a tuple of nine scores
        """
        data = {}

        self.logger.info("Retrieving gene alterations for OncodriveFM ...")

        consequences = projdb.consequences(
            join_samples=True, join_ctypes=False,
            filters={ProjectDb.CSQ_CTYPES: so.ONCODRIVEFM})

        for csq in consequences:
            # The scores only depend on the consequence, not on the sample
            scores = self._csq_scores(csq)

            for sample in csq.var.samples:
                key = (sample.id, csq.gene)
                if key in data:
                    data[key] = self._merge_scores(data[key], scores)
                else:
                    data[key] = scores

        return data
class OncodriveClust(object):
    """OncodriveCLUST helper.

    Resolves and logs the OncodriveCLUST configuration on construction and
    collects the per (class, gene, sample) alterations from a project database.
    """

    def __init__(self, config, paths, logger=None):
        """
        :param config: intogensm.config.OncodriveClustConfig
        :param paths: PathsConfig
        :param logger: Optional logger
        """
        self.config = config
        self.paths = paths
        self.logger = logger or get_logger("oncodriveclust")

        self._load_config()

    @staticmethod
    def _path_label(path):
        """Return the base name of a path for logging, or None without a path.

        os.path.basename() does not accept None, and the filter path stays
        None when neither the configuration nor the defaults provide one.
        """
        return os.path.basename(path) if path is not None else None

    def _load_config(self):
        """Resolve, log and load the configuration.

        :return: self
        """
        self.logger.info("OncodriveCLUST configuration:")

        self.samples_threshold = self.config.samples_threshold

        default_filter = self.paths.data_gene_filter_path()
        self.filter_enabled = self.config.filter_enabled
        self.filter_path = self.config.filter_path or default_filter

        # Neither the configuration nor the defaults provide a path
        if self.filter_path is None:
            self.filter_enabled = False
            self.filter_path = default_filter

        self.logger.info(" samples_threshold = {0}".format(self.samples_threshold))
        self.logger.info(" genes_filter_enabled = {0}".format(self.filter_enabled))
        self.logger.info(" genes_filter_path = {0}".format(self._path_label(self.filter_path)))

        self.filter = LabelFilter()

        if self.filter_enabled:
            self.logger.info("Loading expression filter ...")
            self.logger.debug("> {0}".format(self.filter_path))
            self.filter.load(self.filter_path)

        return self

    def load_cds_len(self, path):
        """Load the CDS length of every transcript from a tsv file.

        The file is read with a header and three columns parsed as
        (str, str, int): gene, transcript and transcript length.

        :param path: path of the file to read
        :return: dict mapping transcript to its length
        """
        self.logger.info("Loading transcripts CDS length ...")
        self.logger.debug("> {}".format(path))

        cds_len = {}
        with tsv.open(path, "r") as f:
            for gene, transcript, transcript_len in tsv.lines(f, (str, str, int), header=True):
                cds_len[transcript] = transcript_len
        return cds_len

    def retrieve_data(self, projdb):
        """Collect the alterations of every (class, gene, sample) from the database.

        Consequences whose transcript has no known CDS length are skipped.
        When several consequences share a key, the one whose transcript is
        the longest is kept; on ties the first one seen wins.

        :param projdb: project database providing consequences()
        :return: dict mapping (NON_SYN or SYN, gene, sample name) to a tuple
                 (transcript, transcript length, protein position)
        """
        cds_len = self.load_cds_len(self.paths.data_ensembl_gene_transcripts_path())

        data = {}

        self.logger.info("Retrieving gene alterations for OncodriveCLUST ...")

        consequences = projdb.consequences(
            join_samples=True,
            filters={ProjectDb.CSQ_CTYPES: so.ONCODRIVECLUST | so.SYNONYMOUS})

        for csq in consequences:
            if csq.transcript not in cds_len:
                continue

            transcript_len = cds_len[csq.transcript]

            if so.match(csq.ctypes, so.ONCODRIVECLUST):
                cls = NON_SYN
            elif so.match(csq.ctypes, so.SYNONYMOUS):
                cls = SYN
            else:
                continue

            for sample in csq.var.samples:
                key = (cls, csq.gene, sample.name)
                previous = data.get(key)
                # Index 1 of a stored entry is its transcript length
                if previous is None or transcript_len > previous[1]:
                    data[key] = (csq.transcript, transcript_len, csq.protein_pos)

        return data
class FullCommand(Command):
    """Command computing the FM bias for genes and, optionally, for pathways.

    The pathways analysis only runs when a mapping file is given.
    """

    def __init__(self):
        Command.__init__(self, prog="oncodrivefm", desc="Compute the FM bias for genes and pathways")

    def _add_arguments(self, parser):
        """Register the command line arguments on top of the common ones."""
        Command._add_arguments(self, parser)

        parser.add_argument("data_path", metavar="DATA",
            help="File containing the data matrix in TDM format")

        parser.add_argument("-N", "--samplings", dest="num_samplings", type=int, default=10000, metavar="NUMBER",
            help="Number of samplings to compute the FM bias pvalue")

        parser.add_argument("-e", "--estimator", dest="estimator", metavar="ESTIMATOR",
            choices=["mean", "median"], default="mean",
            help="Test estimator for computation.")

        parser.add_argument("--gt", "--gene-threshold", dest="mut_gene_threshold", type=int, default=2, metavar="THRESHOLD",
            help="Minimum number of mutations per gene to compute the FM bias")

        parser.add_argument("--pt", "--pathway-threshold", dest="mut_pathway_threshold", type=int, default=10, metavar="THRESHOLD",
            help="Minimum number of mutations per pathway to compute the FM bias")

        parser.add_argument("-s", "--slices", dest="slices", metavar="SLICES",
            help="Slices to process separated by commas")

        parser.add_argument("-m", "--mapping", dest="mapping", metavar="PATH",
            help="File with mappings between genes and pathways to be analysed")

        parser.add_argument("-f", "--filter", dest="filter", metavar="PATH",
            help="File containing the features to be filtered. By default labels are includes,"
                 " labels preceded with - are excludes.")

        parser.add_argument("--save-data", dest="save_data", default=False, action="store_true",
            help="The input data matrix will be saved")

        parser.add_argument("--save-analysis", dest="save_analysis", default=False, action="store_true",
            help="The analysis results will be saved")

    def _check_args(self):
        """Validate the parsed arguments and derive the default analysis name."""
        Command._check_args(self)

        # Default analysis name: data file name without its extension
        if self.args.analysis_name is None:
            self.args.analysis_name = os.path.splitext(os.path.basename(self.args.data_path))[0]

        # The message states the range that is actually enforced
        if self.args.num_samplings < 1:
            self._error("Number of samplings out of range [1, ..)")

        if self.args.mut_gene_threshold < 1:
            self._error("Minimum number of mutations per gene out of range [1, ..)")

        if self.args.mut_pathway_threshold < 1:
            self._error("Minimum number of mutations per pathway out of range [1, ..)")

        if self.args.mapping is not None and not os.path.isfile(self.args.mapping):
            self._error("Pathways mapping file not found: {0}".format(self.args.mapping))

        # Same check as ComputeCommand: report a missing filter before any work starts
        if self.args.filter is not None and not os.path.exists(self.args.filter):
            self._error("Filter file not found: {0}".format(self.args.filter))

    def _analyse_and_save(self, kind, mapping, method_name, mut_threshold, slices, col_names):
        """Run one analysis, combine its results and save them.

        :param kind: "genes" or "pathways"; used for the analysis name, the
                     log messages and the suffix of the saved files
        :param mapping: mapping of matrix rows into the groups to analyse
        :param method_name: name of the test method
        :param mut_threshold: minimum number of mutations for this analysis
        :param slices: indices of the slices to process
        :param col_names: names of the slices, saved as a parameter
        """
        self.log.info("Analysing {0} with '{1}' ...".format(kind, method_name))

        analysis = OncodriveFmAnalysis(
            "oncodrivefm.{0}".format(kind),
            num_samplings=self.args.num_samplings,
            mut_threshold=mut_threshold,
            num_cores=self.args.num_cores)

        results = analysis.compute(self.matrix, mapping, method_name, slices)

        method = create_method(method_name)

        if self.args.save_analysis:
            self.log.info("Saving {0} analysis results ...".format(kind))
            self.save_splited_results(
                self.args.output_path, self.args.analysis_name, self.args.output_format,
                self.matrix, mapping, method, results, slices, suffix=kind)

        self.log.info("Combining analysis results ...")

        combined_results = method.combine(np.ma.masked_invalid(results.T))

        self.log.info("Saving {0} combined results ...".format(kind))
        self.save_matrix(
            self.args.output_path, self.args.analysis_name, self.args.output_format,
            mapping.group_names, method.combination_columns, combined_results.T,
            params=[("slices", ",".join(col_names)), ("method", method.name)],
            suffix=kind,
            # Only rows without any NaN are valid
            valid_row=lambda row: not any(np.isnan(v) for v in row))

    def run(self):
        """Load the inputs, analyse the genes and, with a mapping, the pathways.

        :raises Exception: when a requested slice is not in the matrix
        """
        Command.run(self)

        # Load filter

        self.filter = LabelFilter()

        if self.args.filter is not None:
            self.log.info("Loading filter ...")
            self.log.debug(" > {0}".format(self.args.filter))

            self.filter.load(self.args.filter)

            self.log.debug(" {0} includes, {1} excludes".format(
                self.filter.include_count, self.filter.exclude_count))

        # Load data

        self.log.info("Loading data ...")
        self.log.debug(" > {0}".format(self.args.data_path))

        #TODO: Support loading plain matrices: /file.tsv#slice=SIFT

        self.matrix = tdm.load_matrix(self.args.data_path)

        self.log.debug(" {0} rows, {1} columns and {2} slices".format(
            self.matrix.num_rows, self.matrix.num_cols, self.matrix.num_slices))

        # Get selected slice indices

        if self.args.slices is not None:
            slices = []
            for name in self.args.slices.split(","):
                name = name.strip()
                if name not in self.matrix.slice_name_index:
                    raise Exception("Slice not found: {0}".format(name))
                slices.append(self.matrix.slice_name_index[name])
        else:
            slices = range(self.matrix.num_slices)

        col_names = [self.matrix.slice_names[i] for i in slices]

        if self.args.save_data:
            for i in slices:
                slice_name = self.matrix.slice_names[i]
                self.log.info("Saving {0} data matrix ...".format(slice_name))
                self.save_matrix(
                    self.args.output_path, self.args.analysis_name, self.args.output_format,
                    self.matrix.row_names, self.matrix.col_names, self.matrix.data[i],
                    suffix="data-{0}".format(slice_name))

        # GENES ---------------------------------------

        # One to one mapping for genes: every row accepted by the filter is its own group
        row_groups = {
            row_name: (row_name,)
            for row_name in self.matrix.row_names
            if self.filter.valid(row_name)}

        genes_mapping = MatrixMapping(self.matrix, row_groups)
        genes_method_name = "{0}-{1}".format(self.args.estimator, EmpiricalTest.NAME)

        self._analyse_and_save(
            "genes", genes_mapping, genes_method_name,
            self.args.mut_gene_threshold, slices, col_names)

        if self.args.mapping is None:
            return

        # PATHWAYS ---------------------------------------

        # Load pathways mapping

        self.log.info("Loading pathways mapping ...")
        self.log.debug(" > {0}".format(self.args.mapping))

        pathways_mapping = self.load_mapping(self.matrix, self.args.mapping)

        self.log.debug(" {0} pathways".format(pathways_mapping.num_groups))

        pathways_method_name = "{0}-{1}".format(self.args.estimator, ZscoreTest.NAME)

        self._analyse_and_save(
            "pathways", pathways_mapping, pathways_method_name,
            self.args.mut_pathway_threshold, slices, col_names)
class ComputeCommand(Command):
    """Command computing the FM bias for one set of features.

    With a mapping file the rows are grouped into features and the Z-score
    test is used; without it every row accepted by the filter is analysed on
    its own with the empirical test.
    """

    def __init__(self):
        Command.__init__(self, prog="oncodrivefm-compute", desc="Compute the FM bias")

    def _add_arguments(self, parser):
        """Register the command line arguments on top of the common ones."""
        Command._add_arguments(self, parser)

        parser.add_argument("data_path", metavar="DATA",
            help="File containing the data matrix in TDM format")

        parser.add_argument("-N", "--samplings", dest="num_samplings", type=int, default=10000, metavar="NUMBER",
            help="Number of samplings to compute the FM bias pvalue")

        parser.add_argument("-t", "--threshold", dest="mut_threshold", type=int, default=2, metavar="THRESHOLD",
            help="Minimum number of mutations to compute the FM bias")

        parser.add_argument("-s", "--slices", dest="slices", metavar="SLICES",
            help="Slices to process separated by commas")

        parser.add_argument("-e", "--estimator", dest="estimator", metavar="ESTIMATOR",
            choices=["mean", "median"], default="mean",
            help="Test estimator for computation.")

        parser.add_argument("-m", "--mapping", dest="mapping", metavar="PATH",
            help="File with mappings between rows and features to be analysed")

        parser.add_argument("-f", "--filter", dest="filter", metavar="PATH",
            help="File containing the features to be filtered. By default labels are includes,"
                 " labels preceded with - are excludes.")

        parser.add_argument("--save-data", dest="save_data", default=False, action="store_true",
            help="The input data matrix will be saved")

    def _check_args(self):
        """Validate the parsed arguments and derive the default analysis name."""
        Command._check_args(self)

        # Default analysis name: data file name without its extension
        if self.args.analysis_name is None:
            self.args.analysis_name = os.path.splitext(os.path.basename(self.args.data_path))[0]

        # The message states the range that is actually enforced
        if self.args.num_samplings < 1:
            self._error("Number of samplings out of range [1, ..)")

        if self.args.mut_threshold < 1:
            self._error("Minimum number of mutations out of range [1, ..)")

        if self.args.filter is not None and not os.path.exists(self.args.filter):
            self._error("Filter file not found: {0}".format(self.args.filter))

        # Same check as FullCommand: report a missing mapping before any work starts
        if self.args.mapping is not None and not os.path.isfile(self.args.mapping):
            self._error("Mapping file not found: {0}".format(self.args.mapping))

    def run(self):
        """Load the inputs, run the analysis and save the results per slice.

        Requested slices that are not in the matrix are skipped with a warning.
        """
        Command.run(self)

        # Load data

        self.log.info("Loading data ...")
        self.log.debug(" > {0}".format(self.args.data_path))

        #TODO: Support loading plain matrices: /file.tsv#name=SIFT

        self.matrix = tdm.load_matrix(self.args.data_path)

        self.log.debug(" {0} rows, {1} columns and {2} slices".format(
            self.matrix.num_rows, self.matrix.num_cols, self.matrix.num_slices))

        # Load filter

        self.filter = LabelFilter()

        if self.args.filter is not None:
            self.log.info("Loading filter ...")
            self.log.debug(" > {0}".format(self.args.filter))

            self.filter.load(self.args.filter)

            self.log.debug(" {0} includes, {1} excludes".format(
                self.filter.include_count, self.filter.exclude_count))

        # Load mapping

        if self.args.mapping is not None:
            self.log.info("Loading mapping ...")
            self.log.debug(" > {0}".format(self.args.mapping))

            self.mapping = self.load_mapping(self.matrix, self.args.mapping, self.filter)

            self.log.debug(" {0} features".format(self.mapping.num_groups))

            method_name = "{0}-{1}".format(self.args.estimator, ZscoreTest.NAME)
        else:
            # One to one mapping: every row accepted by the filter is its own group
            row_groups = {
                row_name: (row_name,)
                for row_name in self.matrix.row_names
                if self.filter.valid(row_name)}

            self.mapping = MatrixMapping(self.matrix, row_groups)
            method_name = "{0}-{1}".format(self.args.estimator, EmpiricalTest.NAME)

        # Get selected slice indices

        if self.args.slices is not None:
            slices = []
            for name in self.args.slices.split(","):
                name = name.strip()
                if name not in self.matrix.slice_name_index:
                    self.log.warn("Skipping slice not found: {0}".format(name))
                    continue
                slices.append(self.matrix.slice_name_index[name])
        else:
            slices = range(self.matrix.num_slices)

        if self.args.save_data:
            for i in slices:
                slice_name = self.matrix.slice_names[i]
                self.log.info("Saving {0} data matrix ...".format(slice_name))
                self.save_matrix(
                    self.args.output_path, self.args.analysis_name, self.args.output_format,
                    self.matrix.row_names, self.matrix.col_names, self.matrix.data[i],
                    suffix="data-{0}".format(slice_name))

        # Run the analysis

        self.log.info("Running the analysing using '{0}' ...".format(method_name))

        analysis = OncodriveFmAnalysis(
            "oncodrivefm.compute",
            num_samplings=self.args.num_samplings,
            mut_threshold=self.args.mut_threshold,
            num_cores=self.args.num_cores)

        results = analysis.compute(self.matrix, self.mapping, method_name, slices)

        method = create_method(method_name)

        self.log.info("Saving results ...")

        #TODO: Have an option to save in TDM instead of splited

        self.save_splited_results(
            self.args.output_path, self.args.analysis_name, self.args.output_format,
            self.matrix, self.mapping, method, results, slices)