Python LabelFilter.load示例，bgcore.labelfilter.LabelFilter.load Python示例

示例#1

0

显示文件

文件： oncodrivefm.py 项目： chris-zen/phd-thesis

def oncodrivefm(project):
	"""
	Compute the OncodriveFM quality-control data of a project and save it.

	The gene filter settings are read from the task configuration (with
	per-project overrides resolved by ``get_project_conf``), the filter is
	loaded only when it is enabled, and the output of ``quality_control`` is
	saved through ``ProjectResults`` under the ``"oncodrivefm"`` key.

	:param project: Subscriptable project description; its ``"id"`` entry is logged
	"""
	log = task.logger
	conf = task.conf

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	# Resolve the filter settings

	fallback_path = get_data_gene_filter_path(conf)

	use_filter = get_project_conf(
		conf, project, "oncodrivefm.genes.filter_enabled", ONCODRIVEFM_GENES_FILTER_ENABLED)
	filter_path = get_project_conf(conf, project, "oncodrivefm.genes.filter", fallback_path)

	# A null filter path switches the filter off and restores the default path
	if filter_path is None:
		use_filter = False
		filter_path = fallback_path

	label_filter = LabelFilter()

	if use_filter:
		log.info("Loading expression filter ...")
		log.debug("> {0}".format(filter_path))
		label_filter.load(filter_path)

	log.info("Calculating quality indicators for OncodriveFM ...")

	# quality_control receives None instead of an unloaded filter when filtering is disabled
	qc_filter = label_filter if use_filter else None
	qc_data = quality_control(log, conf, project, qc_filter)

	ProjectResults(project).save_quality_control("oncodrivefm", qc_data)

示例#2

0

显示文件

文件： oncodrivefm.py 项目： chris-zen/phd-thesis

def load_expression_filter(log, conf, project):
	"""
	Resolve the OncodriveFM gene filter settings and load the filter if enabled.

	:param log: Logger receiving the progress messages
	:param conf: Configuration passed to ``get_project_conf``
	:param project: Project passed to ``get_project_conf``
	:return: Tuple ``(genes_filter_enabled, genes_filter, filt)`` where ``filt``
		is a ``LabelFilter`` that has only been loaded when the filter is enabled
	"""
	fallback_path = get_data_gene_filter_path(conf)

	use_filter = get_project_conf(
		conf, project, "oncodrivefm.genes.filter_enabled", ONCODRIVEFM_GENES_FILTER_ENABLED)
	filter_path = get_project_conf(conf, project, "oncodrivefm.genes.filter", fallback_path)

	# A null filter path switches the filter off and restores the default path
	if filter_path is None:
		use_filter = False
		filter_path = fallback_path

	label_filter = LabelFilter()

	if use_filter:
		log.info("Loading expression filter ...")
		log.debug("> {0}".format(filter_path))
		label_filter.load(filter_path)

	return (use_filter, filter_path, label_filter)

示例#3

0

显示文件

文件： oncodriveclust.py 项目： chris-zen/phd-thesis

def get_oncodriveclust_configuration(log, conf, project):
	"""
	Resolve and log the OncodriveCLUST settings and load the gene filter if enabled.

	:param log: Logger receiving the configuration summary and progress messages
	:param conf: Configuration passed to ``get_project_conf``
	:param project: Project passed to ``get_project_conf``
	:return: Tuple ``(mutations_threshold, genes_filter_enabled, genes_filter, filt)``
		where ``filt`` is a ``LabelFilter`` that has only been loaded when the
		filter is enabled
	"""
	log.info("OncodriveCLUST configuration:")

	mutations_threshold = get_project_conf(conf, project, "oncodriveclust.mutations_threshold", ONCODRIVECLUST_MUTATIONS_THRESHOLD)

	default_filter = get_data_gene_filter_path(conf)
	genes_filter_enabled = get_project_conf(conf, project, "oncodriveclust.genes_filter_enabled", ONCODRIVECLUST_GENES_FILTER_ENABLED)
	genes_filter = get_project_conf(conf, project, "oncodriveclust.genes_filter", default_filter)
	if genes_filter is None: # user can assign a null
		genes_filter_enabled = False
		genes_filter = default_filter

	log.info("  mutations_threshold = {0}".format(mutations_threshold))
	log.info("  genes_filter_enabled = {0}".format(genes_filter_enabled))
	# genes_filter is still None when the default path is None too;
	# os.path.basename(None) would raise a TypeError, so log None instead
	log.info("  genes_filter = {0}".format(
		os.path.basename(genes_filter) if genes_filter is not None else None))

	filt = LabelFilter()

	if genes_filter_enabled:
		log.info("Loading expression filter ...")
		log.debug("> {0}".format(genes_filter))
		filt.load(genes_filter)

	return (mutations_threshold, genes_filter_enabled, genes_filter, filt)

示例#4

0

显示文件

文件： oncodrivefm.py 项目： chris-zen/phd-thesis

class OncodriveFm(object):
	"""Helper bundling the OncodriveFM settings and the retrieval of its input data."""

	def __init__(self, config, paths, logger=None):
		"""
		Keep the collaborators and reset every setting that is loaded later on.

		:param config: intogensm.config.OncodriveFmConfig
		:param paths: PathsConfig
		:param logger: Optional logger; ``get_logger("oncodrivefm")`` is used when omitted
		"""

		if not logger:
			logger = get_logger("oncodrivefm")

		self.config = config
		self.paths = paths
		self.logger = logger

		# Set by load_samples_thresholds()
		self.genes_samples_threshold = None
		self.pathways_samples_threshold = None

		# Set by load_expression_filter()
		self.filter_enabled = None
		self.filter_path = None
		self.filter = None

	def load_expression_filter(self):
		"""
		Resolve the expression filter settings and load the filter when enabled.

		Sets ``filter_enabled``, ``filter_path`` and ``filter``.
		"""
		fallback_path = self.paths.data_gene_filter_path()
		enabled = self.config.filter_enabled

		# An empty configured path falls back to the default one, so the path
		# is only None when the default is None as well
		path = self.config.filter_path or fallback_path
		if path is None:
			enabled = False
			path = fallback_path

		self.filter_enabled = enabled
		self.filter_path = path
		self.filter = LabelFilter()

		if not self.filter_enabled:
			return

		self.logger.info("Loading expression filter ...")
		self.logger.debug("> {0}".format(self.filter_path))
		self.filter.load(self.filter_path)

	def load_samples_thresholds(self, num_samples):
		"""
		Compute the genes and pathways samples thresholds with ``get_threshold``.

		:param num_samples: Number of samples forwarded to ``get_threshold``
		"""
		genes_conf = self.config.genes
		self.genes_samples_threshold = get_threshold(self.logger, genes_conf.threshold, num_samples)

		pathways_conf = self.config.pathways
		self.pathways_samples_threshold = get_threshold(self.logger, pathways_conf.threshold, num_samples)

	def load_config(self, num_samples):
		"""
		Load the expression filter and the thresholds, then log the configuration.

		:param num_samples: Number of samples forwarded to load_samples_thresholds()
		"""
		self.load_expression_filter()
		self.load_samples_thresholds(num_samples)
		self.log_config()

	def log_config(self):
		"""
		Log the effective configuration at info level.

		:return: self
		"""
		info = self.logger.info

		info("OncodriveFM configuration:")

		info("  num_cores = {0}".format(self.config.num_cores))
		info("  estimator = {0}".format(self.config.estimator))

		info("  filter_enabled = {0}".format(self.filter_enabled))

		# Only the file name is logged; the path may not be set at all
		if self.filter_path is None:
			filter_name = None
		else:
			filter_name = os.path.basename(self.filter_path)
		info("  filter_path = {0}".format(filter_name))

		info("  Genes:")

		info("    num_samplings = {0}".format(self.config.genes.num_samplings))
		info("    samples_threshold = {0}".format(self.genes_samples_threshold))

		info("  Pathways:")

		info("    num_samplings = {0}".format(self.config.pathways.num_samplings))
		info("    samples_threshold = {0}".format(self.pathways_samples_threshold))

		return self

	# Function merging each position of two score tuples built in retrieve_data():
	# (score, tfic, tfic_class) for SIFT, then PPH2, then MA
	_AGGREGATE = (max, max, min) * 3

	def retrieve_data(self, projdb):
		"""
		Collect the functional impact scores per sample and gene.

		:param projdb: Project database exposing ``consequences()``
		:return: Dictionary mapping ``(sample.id, gene)`` to a 9-tuple of SIFT, PPH2
			and MA values; several consequences for the same key are merged
			position by position with ``_AGGREGATE``
		"""
		scores_by_key = {}

		self.logger.info("Retrieving gene alterations for OncodriveFM ...")

		consequences = projdb.consequences(
			join_samples=True, join_ctypes=False,
			filters={ProjectDb.CSQ_CTYPES : so.ONCODRIVEFM})

		for csq in consequences:
			for sample in csq.var.samples:
				key = (sample.id, csq.gene)

				scores = (csq.sift_score, csq.sift_tfic, csq.sift_tfic_class,
						  csq.pph2_score, csq.pph2_tfic, csq.pph2_tfic_class,
						  csq.ma_score, csq.ma_tfic, csq.ma_tfic_class)

				if key in scores_by_key:
					# Merge with what was already stored for this sample and gene
					scores = tuple(
						merge(old, new)
						for merge, old, new in zip(self._AGGREGATE, scores_by_key[key], scores))

				scores_by_key[key] = scores

		return scores_by_key

示例#5

0

显示文件

文件： oncodriveclust.py 项目： chris-zen/phd-thesis

class OncodriveClust(object):
	"""OncodriveCLUST helper: holds the settings and retrieves the input data."""

	def __init__(self, config, paths, logger=None):
		"""
		Keep the collaborators and load the configuration right away.

		:param config: intogensm.config.OncodriveClustConfig
		:param paths: PathsConfig
		:param logger: Optional logger; ``get_logger("oncodriveclust")`` is used when omitted
		"""

		self.config = config
		self.paths = paths
		self.logger = logger or get_logger("oncodriveclust")

		self._load_config()

	def _load_config(self):
		"""
		Resolve and log the settings, and load the expression filter when enabled.

		Sets ``samples_threshold``, ``filter_enabled``, ``filter_path`` and ``filter``.

		:return: self
		"""
		self.logger.info("OncodriveCLUST configuration:")

		self.samples_threshold = self.config.samples_threshold

		default_filter = self.paths.data_gene_filter_path()
		self.filter_enabled = self.config.filter_enabled
		# An empty configured path falls back to the default one, so the path
		# is only None when the default is None as well
		self.filter_path = self.config.filter_path or default_filter
		if self.filter_path is None:
			self.filter_enabled = False
			self.filter_path = default_filter

		self.logger.info("  samples_threshold = {0}".format(self.samples_threshold))
		self.logger.info("  genes_filter_enabled = {0}".format(self.filter_enabled))
		# filter_path can still be None at this point; os.path.basename(None)
		# would raise a TypeError, so log None instead
		self.logger.info("  genes_filter_path = {0}".format(
			os.path.basename(self.filter_path) if self.filter_path is not None else None))

		self.filter = LabelFilter()

		if self.filter_enabled:
			self.logger.info("Loading expression filter ...")
			self.logger.debug("> {0}".format(self.filter_path))
			self.filter.load(self.filter_path)

		return self

	def load_cds_len(self, path):
		"""
		Load the CDS length of every transcript.

		:param path: File with a header and (gene, transcript, length) columns, read with ``tsv``
		:return: Dictionary mapping each transcript to its length
		"""

		self.logger.info("Loading transcripts CDS length ...")
		self.logger.debug("> {}".format(path))

		cds_len = {}
		with tsv.open(path, "r") as f:
			# The gene column is read but not needed
			for _gene, transcript, transcript_len in tsv.lines(f, (str, str, int), header=True):
				cds_len[transcript] = transcript_len
		return cds_len

	def retrieve_data(self, projdb):
		"""
		Collect the mutated protein positions per class, gene and sample.

		:param projdb: Project database exposing ``consequences()``
		:return: Dictionary mapping ``(cls, gene, sample.name)`` to
			``(transcript, transcript_len, protein_pos)``, where ``cls`` is
			``NON_SYN`` or ``SYN``; when several consequences share a key the
			one with the longest transcript is kept
		"""

		cds_len = self.load_cds_len(self.paths.data_ensembl_gene_transcripts_path())
		data = {}

		self.logger.info("Retrieving gene alterations for OncodriveCLUST ...")

		for csq in projdb.consequences(join_samples=True,
									   filters={ProjectDb.CSQ_CTYPES : so.ONCODRIVECLUST | so.SYNONYMOUS}):

			# Consequences on transcripts of unknown length are skipped
			if csq.transcript not in cds_len:
				continue

			transcript_len = cds_len[csq.transcript]

			# The non-synonymous check comes first, so it wins when both match
			if so.match(csq.ctypes, so.ONCODRIVECLUST):
				cls = NON_SYN
			elif so.match(csq.ctypes, so.SYNONYMOUS):
				cls = SYN
			else:
				continue

			for sample in csq.var.samples:
				key = (cls, csq.gene, sample.name)
				# Replace the stored entry only by one on a strictly longer transcript
				if key not in data or transcript_len > data[key][1]:
					data[key] = (csq.transcript, transcript_len, csq.protein_pos)

		return data

示例#6

0

显示文件

文件： full.py 项目： chris-zen/phd-thesis

class FullCommand(Command):
	"""Command computing the FM bias for genes and, given a mapping, for pathways."""

	def __init__(self):
		Command.__init__(self, prog="oncodrivefm", desc="Compute the FM bias for genes and pathways")

	def _add_arguments(self, parser):
		"""
		Register the command line arguments on top of the ones of ``Command``.

		:param parser: Argument parser to extend
		"""
		Command._add_arguments(self, parser)

		parser.add_argument("data_path", metavar="DATA",
							help="File containing the data matrix in TDM format")

		parser.add_argument("-N", "--samplings", dest="num_samplings", type=int, default=10000, metavar="NUMBER",
							help="Number of samplings to compute the FM bias pvalue")

		parser.add_argument("-e", "--estimator", dest="estimator", metavar="ESTIMATOR",
							choices=["mean", "median"], default="mean",
							help="Test estimator for computation.")

		parser.add_argument("--gt", "--gene-threshold", dest="mut_gene_threshold", type=int, default=2, metavar="THRESHOLD",
							help="Minimum number of mutations per gene to compute the FM bias")

		parser.add_argument("--pt", "--pathway-threshold", dest="mut_pathway_threshold", type=int, default=10, metavar="THRESHOLD",
							help="Minimum number of mutations per pathway to compute the FM bias")

		parser.add_argument("-s", "--slices", dest="slices", metavar="SLICES",
							help="Slices to process separated by commas")

		parser.add_argument("-m", "--mapping", dest="mapping", metavar="PATH",
							help="File with mappings between genes and pathways to be analysed")

		parser.add_argument("-f", "--filter", dest="filter", metavar="PATH",
							help="File containing the features to be filtered. By default labels are includes,"
								 " labels preceded with - are excludes.")

		parser.add_argument("--save-data", dest="save_data", default=False, action="store_true",
							help="The input data matrix will be saved")

		parser.add_argument("--save-analysis", dest="save_analysis", default=False, action="store_true",
							help="The analysis results will be saved")

	def _check_args(self):
		"""
		Validate the parsed arguments, reporting problems through ``self._error``.

		The analysis name defaults to the data file name without its extension.
		"""
		Command._check_args(self)

		if self.args.analysis_name is None:
			self.args.analysis_name = os.path.splitext(os.path.basename(self.args.data_path))[0]

		# The message states the range that is actually enforced
		if self.args.num_samplings < 1:
			self._error("Number of samplings out of range [1, ..)")

		if self.args.mut_gene_threshold < 1:
			self._error("Minimum number of mutations per gene out of range [1, ..)")

		if self.args.mut_pathway_threshold < 1:
			self._error("Minimum number of mutations per pathway out of range [1, ..)")

		if self.args.mapping is not None and not os.path.isfile(self.args.mapping):
			self._error("Pathways mapping file not found: {0}".format(self.args.mapping))

		# Report a missing filter file here rather than failing later while loading it
		if self.args.filter is not None and not os.path.exists(self.args.filter):
			self._error("Filter file not found: {0}".format(self.args.filter))

	def _analyse_and_save(self, label, mapping, method_name, mut_threshold, slices, col_names):
		"""
		Run one analysis, save its per-slice results if requested and save the combination.

		:param label: "genes" or "pathways"; used in the messages, the analysis name and the file suffix
		:param mapping: Mapping of the matrix rows into the groups to analyse
		:param method_name: Name of the method given to the analysis and to ``create_method``
		:param mut_threshold: Minimum number of mutations required to compute the FM bias
		:param slices: Indices of the slices to process
		:param col_names: Names of the processed slices, saved as a parameter of the results
		"""
		self.log.info("Analysing {0} with '{1}' ...".format(label, method_name))

		analysis = OncodriveFmAnalysis(
			"oncodrivefm.{0}".format(label),
			num_samplings = self.args.num_samplings,
			mut_threshold = mut_threshold,
			num_cores=self.args.num_cores)

		results = analysis.compute(self.matrix, mapping, method_name, slices)

		method = create_method(method_name)

		if self.args.save_analysis:
			self.log.info("Saving {0} analysis results ...".format(label))
			self.save_splited_results(
				self.args.output_path, self.args.analysis_name, self.args.output_format,
				self.matrix, mapping,
				method, results, slices, suffix=label)

		self.log.info("Combining analysis results ...")

		combined_results = method.combine(np.ma.masked_invalid(results.T))

		self.log.info("Saving {0} combined results ...".format(label))
		# Rows containing any NaN are not valid
		self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
						 mapping.group_names, method.combination_columns, combined_results.T,
						 params=[("slices", ",".join(col_names)), ("method", method.name)], suffix=label,
						 valid_row=lambda row: not any(np.isnan(v) for v in row))

	def run(self):
		"""
		Load the filter and the data, analyse the genes and then, if a mapping
		was given, the pathways.

		:raises Exception: When a requested slice does not exist in the matrix
		"""
		Command.run(self)

		# Load filter

		self.filter = LabelFilter()
		if self.args.filter is not None:
			self.log.info("Loading filter ...")
			self.log.debug("  > {0}".format(self.args.filter))

			self.filter.load(self.args.filter)

			self.log.debug("  {0} includes, {1} excludes".format(
				self.filter.include_count, self.filter.exclude_count))

		# Load data

		self.log.info("Loading data ...")
		self.log.debug("  > {0}".format(self.args.data_path))

		#TODO: Support loading plain matrices: /file.tsv#slice=SIFT

		self.matrix = tdm.load_matrix(self.args.data_path)

		self.log.debug("  {0} rows, {1} columns and {2} slices".format(
			self.matrix.num_rows, self.matrix.num_cols, self.matrix.num_slices))

		# Get selected slice indices

		if self.args.slices is not None:
			slices = []
			for name in self.args.slices.split(","):
				name = name.strip()
				if name not in self.matrix.slice_name_index:
					raise Exception("Slice not found: {0}".format(name))
				slices.append(self.matrix.slice_name_index[name])
		else:
			slices = range(self.matrix.num_slices)

		col_names = [self.matrix.slice_names[i] for i in slices]

		if self.args.save_data:
			for i in slices:
				slice_name = self.matrix.slice_names[i]
				self.log.info("Saving {0} data matrix ...".format(slice_name))
				self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
								 self.matrix.row_names, self.matrix.col_names, self.matrix.data[i],
								 suffix="data-{0}".format(slice_name))

		# GENES ---------------------------------------

		# One to one mapping for the genes accepted by the filter

		gene_groups = {}
		for row_name in self.matrix.row_names:
			if self.filter.valid(row_name):
				gene_groups[row_name] = (row_name,)
		genes_mapping = MatrixMapping(self.matrix, gene_groups)
		genes_method_name = "{0}-{1}".format(self.args.estimator, EmpiricalTest.NAME)

		self._analyse_and_save(
			"genes", genes_mapping, genes_method_name,
			self.args.mut_gene_threshold, slices, col_names)

		if self.args.mapping is None:
			return

		# PATHWAYS ---------------------------------------

		# Load pathways mapping

		self.log.info("Loading pathways mapping ...")
		self.log.debug("  > {0}".format(self.args.mapping))

		pathways_mapping = self.load_mapping(self.matrix, self.args.mapping)

		self.log.debug("  {0} pathways".format(pathways_mapping.num_groups))

		pathways_method_name = "{0}-{1}".format(self.args.estimator, ZscoreTest.NAME)

		self._analyse_and_save(
			"pathways", pathways_mapping, pathways_method_name,
			self.args.mut_pathway_threshold, slices, col_names)

示例#7

0

显示文件

文件： compute.py 项目： chris-zen/phd-thesis

class ComputeCommand(Command):
	"""Command computing the FM bias for the rows of a matrix or for mapped features."""

	def __init__(self):
		Command.__init__(self, prog="oncodrivefm-compute", desc="Compute the FM bias")

	def _add_arguments(self, parser):
		"""
		Register the command line arguments on top of the ones of ``Command``.

		:param parser: Argument parser to extend
		"""
		Command._add_arguments(self, parser)

		parser.add_argument("data_path", metavar="DATA",
							help="File containing the data matrix in TDM format")

		parser.add_argument("-N", "--samplings", dest="num_samplings", type=int, default=10000, metavar="NUMBER",
							help="Number of samplings to compute the FM bias pvalue")

		parser.add_argument("-t", "--threshold", dest="mut_threshold", type=int, default=2, metavar="THRESHOLD",
							help="Minimum number of mutations to compute the FM bias")

		parser.add_argument("-s", "--slices", dest="slices", metavar="SLICES",
							help="Slices to process separated by commas")

		parser.add_argument("-e", "--estimator", dest="estimator", metavar="ESTIMATOR",
							choices=["mean", "median"], default="mean",
							help="Test estimator for computation.")

		parser.add_argument("-m", "--mapping", dest="mapping", metavar="PATH",
							help="File with mappings between rows and features to be analysed")

		parser.add_argument("-f", "--filter", dest="filter", metavar="PATH",
							help="File containing the features to be filtered. By default labels are includes,"
								 " labels preceded with - are excludes.")

		parser.add_argument("--save-data", dest="save_data", default=False, action="store_true",
							help="The input data matrix will be saved")

	def _check_args(self):
		"""
		Validate the parsed arguments, reporting problems through ``self._error``.

		The analysis name defaults to the data file name without its extension.
		"""
		Command._check_args(self)

		if self.args.analysis_name is None:
			self.args.analysis_name = os.path.splitext(os.path.basename(self.args.data_path))[0]

		# The message states the range that is actually enforced
		if self.args.num_samplings < 1:
			self._error("Number of samplings out of range [1, ..)")

		if self.args.mut_threshold < 1:
			self._error("Minimum number of mutations out of range [1, ..)")

		if self.args.filter is not None:
			if not os.path.exists(self.args.filter):
				self._error("Filter file not found: {0}".format(self.args.filter))

		# Report a missing mapping file here rather than failing later while loading it
		if self.args.mapping is not None and not os.path.isfile(self.args.mapping):
			self._error("Mapping file not found: {0}".format(self.args.mapping))

	def run(self):
		"""
		Load the data, the filter and the mapping, run the analysis and save
		the results split by slice.

		Requested slices that do not exist in the matrix are skipped with a warning.
		"""
		Command.run(self)

		# Load data

		self.log.info("Loading data ...")
		self.log.debug("  > {0}".format(self.args.data_path))

		#TODO: Support loading plain matrices: /file.tsv#name=SIFT

		self.matrix = tdm.load_matrix(self.args.data_path)

		self.log.debug("  {0} rows, {1} columns and {2} slices".format(
			self.matrix.num_rows, self.matrix.num_cols, self.matrix.num_slices))

		# Load filter

		self.filter = LabelFilter()
		if self.args.filter is not None:
			self.log.info("Loading filter ...")
			self.log.debug("  > {0}".format(self.args.filter))

			self.filter.load(self.args.filter)

			self.log.debug("  {0} includes, {1} excludes".format(
				self.filter.include_count, self.filter.exclude_count))

		# Load mapping

		if self.args.mapping is not None:
			self.log.info("Loading mapping ...")
			self.log.debug("  > {0}".format(self.args.mapping))

			self.mapping = self.load_mapping(self.matrix, self.args.mapping, self.filter)

			self.log.debug("  {0} features".format(self.mapping.num_groups))

			method_name = "{0}-{1}".format(self.args.estimator, ZscoreTest.NAME)
		else: # One to one mapping for the rows accepted by the filter
			row_groups = {}
			for row_name in self.matrix.row_names:
				if self.filter.valid(row_name):
					row_groups[row_name] = (row_name,)
			self.mapping = MatrixMapping(self.matrix, row_groups)
			method_name = "{0}-{1}".format(self.args.estimator, EmpiricalTest.NAME)

		# Get selected slice indices

		if self.args.slices is not None:
			slices = []
			for name in self.args.slices.split(","):
				name = name.strip()
				if name not in self.matrix.slice_name_index:
					self.log.warn("Skipping slice not found: {0}".format(name))
					continue
				slices.append(self.matrix.slice_name_index[name])
		else:
			slices = range(self.matrix.num_slices)

		# NOTE(review): col_names is not used in the rest of this method — confirm whether it can be dropped
		col_names = [self.matrix.slice_names[i] for i in slices]

		if self.args.save_data:
			for i in slices:
				slice_name = self.matrix.slice_names[i]
				self.log.info("Saving {0} data matrix ...".format(slice_name))
				self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
								 self.matrix.row_names, self.matrix.col_names, self.matrix.data[i],
								 suffix="data-{0}".format(slice_name))

		# Run the analysis

		self.log.info("Running the analysing using '{0}' ...".format(method_name))

		analysis = OncodriveFmAnalysis(
			"oncodrivefm.compute",
			num_samplings = self.args.num_samplings,
			mut_threshold = self.args.mut_threshold,
			num_cores=self.args.num_cores)

		results = analysis.compute(self.matrix, self.mapping, method_name, slices)

		method = create_method(method_name)

		self.log.info("Saving results ...")

		#TODO: Have an option to save in TDM instead of splited
		self.save_splited_results(
			self.args.output_path, self.args.analysis_name, self.args.output_format,
			self.matrix, self.mapping, method, results, slices)