Exemplo n.º 1
0
	def load_data(self, data_paths, method=None):
		columns = []
		col_names = []
		row_name_index = {}
		for col_index, data_file in enumerate(data_paths):
			self.log.debug("  > {0}".format(data_file))
			names = []
			values = []
			with tsv.open(data_file, "r") as f:
				col_name, ext = os.path.splitext(os.path.basename(data_file))
				params = tsv.params(f)
				if "slice" in params:
					col_name = params["slice"]
				if "method" in params:
					if method is None:
						method = params["method"]
					elif method != params["method"]:
						self.log.warn("Different method of computation used for file {0}".format(data_file))

				for name, value in tsv.lines(f, (str, float), header=True, null_value="-"):
					if len(name) == 0:
						self.log.warn("Empty identifier detected")
						continue

					if name not in row_name_index:
						row_name_index[name] = len(row_name_index)

					names += [name]
					values += [value]
			col_names += [col_name]
			columns += [(names, values)]

		num_cols = len(columns)
		num_rows = len(row_name_index)
		row_names = [None] * num_rows
		for name, index in row_name_index.items():
			row_names[index] = name

		data = np.empty((num_rows, num_cols))
		data[:] = np.nan

		for col_index, (names, values) in enumerate(columns):
			for i, name in enumerate(names):
				data[row_name_index[name], col_index] = values[i]

		return row_names, col_names, data, method
Exemplo n.º 2
0
def drivers():
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	db_path = paths.results_path("drivers.db")
	db = SigDb(db_path)
	db.open()

	log.info("Variants ...")

	path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
	with tsv.open(path, "r") as f:
		types = (str, str, int, str)
		for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
			chr, strand, start, allele = fields[:4]
			db.add_variant(chr, start)

	log.info("Genes ...")

	gene_sites = {}

	gene_fm = set()
	gene_clust = set()

	#SPECIAL_THRESHOLD = ["C18", "C34"]
	SPECIAL_THRESHOLD = []

	log.info("  OncodriveFM ...")

	filename_re = re.compile(r"gene-cancer_site-(.+)\.tsv.gz")
	base_path = paths.combination_path("oncodrivefm")
	for path in os.listdir(base_path):
		m = filename_re.match(path)
		if not m:
			continue

		cancer_site_code = m.group(1)

		if cancer_site_code in SPECIAL_THRESHOLD:
			threshold = 1e-6
		else:
			threshold = 0.01

		with tsv.open(os.path.join(base_path, path), "r") as f:
			params = tsv.params(f)
			cancer_site_name = params["group_long_name"]
			for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
				gene, qvalue = fields
				if qvalue < threshold:
					add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

					gene_fm.add(gene)

	log.info("  OncodriveCLUST ...")

	filename_re = re.compile(r"cancer_site-(.+)\.tsv.gz")
	base_path = paths.combination_path("oncodriveclust")
	for path in os.listdir(base_path):
		m = filename_re.match(path)
		if not m:
			continue

		cancer_site_code = m.group(1)

		with tsv.open(os.path.join(base_path, path), "r") as f:
			params = tsv.params(f)
			cancer_site_name = params["group_long_name"]
			for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
				gene, qvalue = fields
				if qvalue < 0.05:
					add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

					gene_clust.add(gene)

	log.info("  Updating db ...")
	sig_genes = gene_fm | gene_clust
	for gene in sig_genes:
		db.add_gene(gene, gene in gene_fm, gene in gene_clust)

	log.info("Saving driver genes cancer sites dataset ...")
	path = paths.results_path("gene-driver_cancer_sites.tsv")
	log.debug("> {}".format(path))
	with open(path, "w") as f:
		tsv.write_param(f, "date", datetime.now())
		tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT", "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
		for gene, sites in gene_sites.items():
			tsv.write_line(f, gene,
						   1 if gene in gene_fm else 0,
						   1 if gene in gene_clust else 0,
						   len(sites),
						   ", ".join(sorted([code for code, name in sites])),
						   ", ".join(sorted([name for code, name in sites])))

	db.commit()
	db.close()