Example No. 1
def run(task):

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "repositories.source",
						"mrna.enrichment", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["oncodrive_ids"])
	task.check_out_ports(["enrichment_ids"])

	oncodrive_port = task.ports["oncodrive_ids"]
	enrichment_port = task.ports["enrichment_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	data_repo = rs.repository("data")
	
	overwrite = conf.get("overwrite", False, dtype=bool)

	# retrieve enrichment configurations
	ec = conf["mrna.enrichment"]
	if "default" in ec:
		default = ec["default"]
	else:
		default = conf.create_element()

	if "modules" not in ec:
		log.error("There is no enrichment modules section available in mrna.enrichment")
		return -1

	log.info("Reading modules configuration ...")

	econfs = list()
	for mod in ec["modules"]:
		m = ec.create_element()
		m.merge(default)
		m.merge(mod)
		mf = m.missing_fields(["id_type", "test", "modules_file"])
		if len(mf) > 0:
			log.error("Enrichment configuration missing required fields: {}".format(", ".join(mf)))
			log.error("Module configuration: {}".format(m))
		else:
			econfs.append(m)
			log.debug("{} -> {}".format(m["id_type"], m["modules_file"]))

	if len(econfs) == 0:
		log.error("There are no enrichment configurations available in mrna.enrichment")
		return 0

	results_base_path = types.MRNA_ENRICHMENT.replace(".", "/")
	
	log.info("Indexing available enrichment results ...")
	enrichment_results_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology", "id_type"],
		types.MRNA_ENRICHMENT, unique = True)

	for oid in oncodrive_port:
		o = em.find(oid, types.MRNA_ONCODRIVE_GENES)
		if o is None:
			log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, oid))
			continue

		okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

		log.info("Enrichment for oncodrive results ({0}) [{1}] ...".format(", ".join(okey), oid))

		for ec in econfs:
			log.info("Module {} [{}] ...".format(ec["id_type"], ec["modules_file"]))

			key = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"], ec["id_type"])

			if key in enrichment_results_index:
				eid = enrichment_results_index[key][0]
				e = em.find(eid, types.MRNA_ENRICHMENT)
				if e is None:
					log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
					continue
			else:
				e = o.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
				e["id"] = eid = str(uuid.uuid4())

			e["id_type"] = ec["id_type"]

			# enrichment results

			results_path = rpath.join(results_base_path, eid + ".tsv.gz")

			if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
				log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
				enrichment_port.write(eid)
				continue

			valid = enrichment(log, conf, rs, data_repo, results_path, o["results_file"], e, ec,
						["id", "upreg_corrected_right_p_value", "downreg_corrected_right_p_value"],
						["id", "upreg", "downreg"])

			# save mapped results
			if valid:
				em.persist(e, types.MRNA_ENRICHMENT)
				enrichment_port.write(eid)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
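
Note (not part of the original example): Example No. 1 reads the mrna.enrichment section as an optional default element that is merged into every entry of modules, and each merged entry must end up with id_type, test and modules_file. A minimal sketch of that configuration shape, with purely hypothetical values, could look like this:

# Hypothetical sketch of the "mrna.enrichment" configuration consumed above.
# Only the structure (default + modules with id_type, test and modules_file)
# is implied by the code; the concrete values are illustrative.
mrna_enrichment = {
	"default": {
		"test": "zscore"                          # assumed default test name
	},
	"modules": [
		{
			"id_type": "go:bp",                   # hypothetical id space
			"modules_file": "modules/go_bp.tcm"   # hypothetical path
		},
		{
			"id_type": "kegg",
			"modules_file": "modules/kegg.tcm",
			"test": "binomial"                    # overrides the default test
		}
	]
}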
Example No. 2
def run(task):

    # Initialization

    task.check_conf(
        [
            "entities",
            "repositories",
            "repositories.data",
            "repositories.source",
            "bin_paths.python",
            "bin_paths.matrix_map",
        ]
    )
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["mapped_oncodrive_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    mapped_oncodrive_port = task.ports["mapped_oncodrive_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")
    source_repo = rs.repository("source")

    overwrite = conf.get("overwrite", False, dtype=bool)

    platform_base_path = "platform"
    vplatform_base_path = "vplatform"

    results_base_path = types.MRNA_ONCODRIVE_GENES.replace(".", "/")

    log.info("Indexing available oncodrive results for genes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"], types.MRNA_ONCODRIVE_GENES, unique=True
    )

    for oid in oncodrive_port:
        o = em.find(oid, types.MRNA_ONCODRIVE_PROBES)
        if o is None:
            log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_PROBES, oid))
            continue

        study_id = o["study_id"]
        platform_id = o["platform_id"]
        key = (study_id, platform_id, o["icdo_topography"], o["icdo_morphology"])

        if key in oncodrive_results_index:
            mid = oncodrive_results_index[key][0]
            m = em.find(mid, types.MRNA_ONCODRIVE_GENES)
            if m is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, mid))
                continue
        else:
            m = o.transform(
                [
                    "study_id",
                    "platform_id",
                    "icdo_topography",
                    "icdo_morphology",
                    "log2r_tumour_unit_id",
                    ("oncodrive_probes_id", "id"),
                ]
            )
            m["id"] = mid = str(uuid.uuid4())

        # mapped oncodrive results

        results_path = rpath.join(results_base_path, mid + ".tsv.gz")
        gitools_results_path = rpath.join(results_base_path, mid + ".tdm.gz")

        if skip_file(overwrite, data_repo, results_path, m.get("results_file")):
            log.warn("Skipping ({0}) [{1}] as it already exists".format(", ".join(key), mid))
            mapped_oncodrive_port.write(mid)
            continue

        log.info("Mapping oncodriver results ({0}) [{1}] ...".format(", ".join(key), oid))

        # determine the mapping file
        map_file = None
        p = em.find(platform_id, types.SOURCE_PLATFORM)
        if p is None:
            log.error("{0} not found: {1}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        platform_id_type = p.get("SO/platform_id_type")
        if platform_id_type is None:
            log.error("Undefined annotation 'SO/platform_id_type' for platform '{0}'.".format(platform_id))
            continue
        elif platform_id_type != "genbank_accession":  # affy_accession, custom, ...
            missing = p.missing_fields(["ensg_map", "ensg_map/file"])
            if len(missing) > 0:
                log.error("Missing required fields for platform '{0}': {1}".format(platform_id, ", ".join(missing)))
                continue
            map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if not source_repo.exists(map_file):
                log.error("Mapping file not found for platform '{0}': {1}".format(platform_id, map_file))
                continue
        elif platform_id_type == "genbank_accession":
            if len(p.missing_fields(["ensg_map", "ensg_map/file"])) > 0:
                map_file = None
            else:
                map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if map_file is None or not source_repo.exists(map_file):
                vpid = "-".join([platform_id, study_id])
                vp = em.find(vpid, types.SOURCE_VPLATFORM)
                if vp is None:
                    log.error("{0} not found: {1}".format(types.SOURCE_VPLATFORM, vpid))
                    continue
                missing = vp.missing_fields(["ensg_map", "ensg_map/path", "ensg_map/file"])
                if len(missing) > 0:
                    log.error("Missing required fields for vplatform '{0}': {1}".format(vpid, ", ".join(missing)))
                    continue
                map_file = rpath.join(vplatform_base_path, vp["ensg_map/path"], vp["ensg_map/file"])
                if not source_repo.exists(map_file):
                    log.error(
                        "Mapping file not found for vplatform ({0}, {1}): {2}".format(platform_id, study_id, map_file)
                    )
                    continue
        else:
            log.error("Unknown SO/platform_id_type '{0}' for platform '{1}'.".format(platform_id_type, platform_id))
            continue

        log.debug("Mapping file: {0}".format(map_file))

        m["platform_map_file"] = source_repo.url(map_file)

        # oncodrive results file
        repo, repo_path = rs.from_url(o["results_file"])
        local_path = repo.get_local(repo_path)

        # mapped oncodrive results
        m["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)
        gitools_results_local_path = data_repo.create_local(gitools_results_path)

        mapping_path = rpath.join(results_base_path, mid + ".mapping.tsv.gz")
        m["mapping_file"] = data_repo.url(mapping_path)
        mapping_local_path = data_repo.create_local(mapping_path)

        map_results_file = tempfile.mkstemp(prefix="mrna_oncodrive_map_", suffix=".tsv")[1]

        try:
            # run the mapping tool
            local_map_file = source_repo.get_local(map_file)

            log.debug("Mapping {0} to {1} ...".format(repo_path, map_results_file))

            cmd = " ".join(
                [
                    conf["bin_paths.python"],
                    conf["bin_paths.matrix_map"],
                    "-o",
                    map_results_file,
                    "-i",
                    mapping_local_path,
                    local_path,
                    local_map_file,
                ]
            )

            log.debug(cmd)

            retcode = subprocess.call(args=cmd, shell=True)

            if retcode != 0:
                raise Exception("There was an error mapping the results")

            # merge repeated ids

            log.debug("Merging {0} to {1} ...".format(map_results_file, results_path))
            log.debug("Gitools file: {0}".format(gitools_results_path))

            upreg_count, downreg_count = merge(log, map_results_file, results_local_path, gitools_results_local_path)
            if upreg_count == 0 and downreg_count == 0:
                log.error(
                    "The results of the mapping for ({0}) are empty. This could be because the annotated platform or the mapping file is wrong.".format(
                        ", ".join(key)
                    )
                )

            # close local paths
            data_repo.put_local(results_local_path)
            data_repo.put_local(mapping_local_path)

        except Exception as e:
            log.exception(e)

            data_repo.close_local(results_local_path)
            data_repo.close_local(mapping_local_path)
            continue

        finally:
            os.remove(map_results_file)
            repo.close_local(local_path)
            source_repo.close_local(local_map_file)

        # save mapped results
        em.persist(m, types.MRNA_ONCODRIVE_GENES)
        mapped_oncodrive_port.write(mid)

    em.close()
    data_repo.close()
    source_repo.close()
    rs.close()
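
Note (not part of the original examples): every example gates its work on a skip_file helper whose implementation is not shown. The call sites imply it returns a truthy value when overwriting is disabled and the result file already exists, so previous results are reused. A minimal sketch under those assumptions (the exists and url repository methods are taken from their use elsewhere in these examples):

def skip_file(overwrite, data_repo, results_path, results_file_url):
    """Sketch only: decide whether an existing result can be reused.

    Skip when overwriting is disabled, the file is already in the data
    repository and the entity already points at that URL.
    """
    if overwrite:
        return False
    # exists() is used on the source repository in Example No. 2; it is
    # assumed to be available on the data repository as well.
    if not data_repo.exists(results_path):
        return False
    return results_file_url == data_repo.url(results_path)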
Example No. 3
def main():

	# Initialization

	task.check_conf(["entities", "repositories", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	evt_tumour_unit_port, oncodrive_results_port = \
		task.ports("evt_tumour_unit_ids", "oncodrive_results_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	log.info("Indexing available {} ...".format(types.CNV_ONCODRIVE_GENES))
	oncodrive_results_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_ONCODRIVE_GENES, unique = True)

	results_base_path = types.CNV_ONCODRIVE_GENES.replace(".", "/")

	for uid in evt_tumour_unit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
		if key in oncodrive_results_index:
			eid = oncodrive_results_index[key][0]
			e = em.find(eid, types.CNV_ONCODRIVE_GENES)
			if e is None:
				log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, eid))
				continue
		else:
			e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
			eid = e["id"] = str(uuid.uuid4())

		# create oncodrive results entity
		e["evt_tumour_unit_id"] = uid

		results_path = rpath.join(results_base_path, eid + ".tsv.gz")

		if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
			log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
			oncodrive_results_port.write(eid)
			continue

		e["results_file"] = data_repo.url(results_path)
		
		# data matrix for oncodrive calculation
		matrix_repo, matrix_path = rs.from_url(u["data_file"])

		# Gain & Loss

		log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))
		log.debug("{} id is {}".format(types.CNV_ONCODRIVE_GENES, eid))

		tmp_path = mkdtemp(prefix = "cnv_oncodrive_calc_")
		log.debug("Temporary directory: {}".format(tmp_path))
		tmp_file = os.path.join(tmp_path, "filtered_data.tsv")

		matrix_local_path = matrix_repo.get_local(matrix_path)
		log.debug("Matrix path: {}".format(matrix_path))

		try:
			try:
				log.info("Calculating Gain ...")
				log.debug("Bit mask filtering (01) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 1)
				gain_results = run_oncodrive(
					conf, log, e, "gain", tmp_file, tmp_path)
			except:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for gain failed".format(",".join(key), uid))
				matrix_repo.close_local(matrix_local_path)
				raise

			try:
				log.info("Calculating Loss ...")
				log.debug("Bit mask filtering (10) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 2)
				loss_results = run_oncodrive(
					conf, log, e, "loss", tmp_file, tmp_path)
			except:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for downreg failed".format(",".join(key), uid))
				matrix_repo.close_local(matrix_local_path)
				raise

			# Join gain & loss results

			log.info("Joining upreg & downreg results into memory ...")

			# the join is done in memory with a map
			dmap = read_data_map(log, gain_results, loss_results)

			log.info("Writting joined data to {} ...".format(results_path))

			results_local_path = data_repo.create_local(results_path)

			write_data_map(dmap, results_local_path)

		finally:
			matrix_repo.close_local(matrix_local_path)
			matrix_repo.close()

			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		data_repo.put_local(results_local_path)

		em.persist(e, types.CNV_ONCODRIVE_GENES)
		oncodrive_results_port.write(eid)
	
	em.close()
	data_repo.close()
	rs.close()
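
Note (not part of the original example): mask_filtering is not shown here. The log messages ("Bit mask filtering (01)/(10)") and the gain = 1 / loss = 2 encoding built in Example No. 6 suggest it extracts a single condition from the combined gain/loss codes. A sketch under those assumptions (a plain TSV layout with a header row and row names in the first column is guessed):

def mask_filtering(in_path, out_path, mask):
	"""Sketch: write a 0/1 matrix for one condition (mask 1 = gain, 2 = loss).

	Cells are assumed to hold 0, 1 (gain), 2 (loss) or 3 (both); the output
	cell is 1 when the requested bit is set, 0 otherwise.
	"""
	with open(in_path) as reader, open(out_path, "w") as writer:
		writer.write(reader.readline())  # assumed: first line is the header
		for line in reader:
			fields = line.rstrip("\n").split("\t")
			row = [fields[0]]  # assumed: first column holds the row name
			row += ["1" if int(cell) & mask else "0" for cell in fields[1:]]
			writer.write("\t".join(row) + "\n")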
Example No. 4
def run(task):

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["log2r_tumour_unit_ids"])
    task.check_out_ports(["oncodrive_results_ids"])

    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
    oncodrive_results_port = task.ports["oncodrive_results_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Indexing available oncodrive results for probes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ONCODRIVE_PROBES,
        unique=True)

    log.info("Indexing available mrna log2r cutoffs ...")
    log2r_cutoff_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_CUTOFF,
        unique=True)

    results_base_path = types.MRNA_ONCODRIVE_PROBES.replace(".", "/")

    for log2r_unit_id in log2r_tumour_unit_port:
        u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT,
                                                log2r_unit_id))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"],
               u["icdo_morphology"])
        if key in oncodrive_results_index:
            eid = oncodrive_results_index[key][0]
            e = em.find(eid, types.MRNA_ONCODRIVE_PROBES)
            if e is None:
                log.error("{} not found: {}".format(
                    types.MRNA_ONCODRIVE_PROBES, eid))
                continue
        else:
            e = u.transform([
                "study_id", "platform_id", "icdo_topography", "icdo_morphology"
            ])
            eid = e["id"] = str(uuid.uuid4())

        log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(
            types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))
        log.debug("{} id is {}".format(types.MRNA_ONCODRIVE_PROBES, eid))

        # create oncodrive results entity
        e["log2r_tumour_unit_id"] = log2r_unit_id

        results_path = rpath.join(results_base_path, eid + ".tsv.gz")

        if skip_file(overwrite, data_repo, results_path,
                     e.get("results_file")):
            log.warn("Skipping ({}) [{}] as it already exists".format(
                ", ".join(key), eid))
            oncodrive_results_port.write(eid)
            continue

        e["results_file"] = data_repo.url(results_path)

        # data matrix for oncodrive calculation
        file_repo = u["data_file/repo"]
        matrix_repo = rs.repository(file_repo)

        file_path = u["data_file/path"]
        file_name = u["data_file/file"]
        matrix_path = os.path.join(file_path, file_name)

        # Load calculated cutoff

        log.info("Loading mrna cutoff for key ({}) ...".format(", ".join(key)))

        if key not in log2r_cutoff_index:
            log.error("mrna log2r cuttof not found for key ({})".format(
                ", ".join(key)))
            matrix_repo.close()
            continue

        cutoff_id = log2r_cutoff_index[key][0]
        cutoff = em.find(cutoff_id, types.MRNA_LOG2R_CUTOFF)
        if cutoff is None:
            log.error("mrna log2r cuttof for key ({}) [{}] couldn't be loaded".
                      format(", ".join(key), cutoff_id))
            matrix_repo.close()
            continue

        log.debug("{} id is {}".format(types.MRNA_LOG2R_CUTOFF, cutoff_id))

        # Upregulation & downregulation

        try:
            from tempfile import mkdtemp
            tmp_path = mkdtemp(prefix="mrna_oncodrive_calc_")
            log.debug("Temporary directory: {}".format(tmp_path))

            matrix_local_path = matrix_repo.get_local(matrix_path)
            log.debug("Matrix path: {}".format(matrix_path))

            try:
                log.info("Calculating Upregulation with cutoff {} ...".format(
                    cutoff["upreg/cutoff"]))
                upreg_results = run_oncodrive(conf, log, e, "upreg",
                                              matrix_local_path, "gt",
                                              cutoff["upreg/cutoff"], tmp_path)
            except:
                log.error("Oncodrive calculation for upreg failed")
                matrix_repo.close_local(matrix_local_path)
                raise

            try:
                log.info(
                    "Calculating Downregulation with cutoff {} ...".format(
                        cutoff["downreg/cutoff"]))
                downreg_results = run_oncodrive(
                    conf, log, e, "downreg", matrix_local_path, "lt",
                    cutoff["downreg/cutoff"], tmp_path)
            except:
                log.error("Oncodrive calculation for downreg failed")
                matrix_repo.close_local(matrix_local_path)
                raise

            # Join upreg & downreg results

            log.info("Joining upreg & downreg results into memory ...")

            # the join is done in memory with a map
            dmap = read_data_map(log, upreg_results, downreg_results)

            log.info("Writting joined results to {} ...".format(results_path))

            results_local_path = data_repo.create_local(results_path)

            write_data_map(dmap, results_local_path)

        finally:
            matrix_repo.close_local(matrix_local_path)
            matrix_repo.close()

            if os.path.exists(tmp_path):
                log.debug(
                    "Removing temporary directory {} ...".format(tmp_path))
                import shutil
                shutil.rmtree(tmp_path)

        data_repo.put_local(results_local_path)

        em.persist(e, types.MRNA_ONCODRIVE_PROBES)
        oncodrive_results_port.write(eid)

    em.close()
    data_repo.close()
    rs.close()
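
Note (not part of the original examples): Examples No. 3 and No. 4 join two per-condition result files "in memory with a map" through read_data_map and write_data_map, whose implementations are not shown. A minimal sketch of that pattern, keyed on the first column and assuming tab-separated inputs with a header row (the column layout and the plain-text output are assumptions):

def read_data_map(log, first_path, second_path):
    """Sketch: map id -> [row from first file, row from second file]."""
    dmap = {}
    for index, path in enumerate((first_path, second_path)):
        with open(path) as f:
            f.readline()  # assumed header row
            for line in f:
                fields = line.rstrip("\n").split("\t")
                entry = dmap.setdefault(fields[0], [None, None])
                entry[index] = fields[1:]
    return dmap


def write_data_map(dmap, out_path):
    """Sketch: write one joined row per id, first-file columns then second."""
    with open(out_path, "w") as f:
        for key in sorted(dmap):
            first, second = dmap[key]
            f.write("\t".join([key] + (first or []) + (second or [])) + "\n")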
Example No. 5
def main():

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	combinations_port, combination_ids_port = \
		task.ports("combinations", "combination_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	results_base_path = types.CNV_COMBINATION.replace(".", "/")

	conditions = ("gain", "loss")
	
	for c_dict in combinations_port:
		c = DataFactory.from_native(c_dict, key_sep = "/")
		
		"""
		o = em.find(c, types.CNV_ONCODRIVE_GENES)
		if o is None:
			log.error("{0} not found: {1}".format(types.CNV_ONCODRIVE_GENES, c))
			continue

		okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])
		"""

		cid = c["id"]

		key = (c["icdo_topography"], c["icdo_morphology"], c["id_type"])
		
		log.info("Processing combination for ({}) [{}] ...".format(", ".join(key), cid))

		#files = c["files"]
		#if len(files) == 1:
		#	log.info("No combination required, copied from {0}".format(files[0]))
		#	c["results_file"] = files[0]
		#else:
		results_path = rpath.join(results_base_path, cid + ".tsv.gz")
		results_url = data_repo.url(results_path)

		if skip_file(overwrite, data_repo, results_path, c.get("results_file")):
			log.warn("Skipping {} ({}) [{}] as it already exists".format(types.CNV_COMBINATION, ", ".join(key), cid))
			combination_ids_port.write(cid)
			continue

		c["results_file"] = results_url

		combination(log, conf, rs, c, data_repo, results_path, conditions)

		# save combination results
		em.persist(c, types.CNV_COMBINATION)
		combination_ids_port.write(cid)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
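
Note (not part of the original example): Example No. 5 rebuilds the configuration element from a plain dict with DataFactory.from_native(c_dict, key_sep = "/"), and the same "/"-separated key convention appears throughout these examples (ensg_map/file, upreg/cutoff, SO/platform_id_type). DataFactory itself belongs to the pipeline framework and is not reproduced here; the toy function below only illustrates the flattening convention:

def flatten_keys(node, prefix="", sep="/"):
	"""Toy illustration of the nested-dict to "a/b/c" key convention."""
	flat = {}
	for key, value in node.items():
		path = key if not prefix else prefix + sep + key
		if isinstance(value, dict):
			flat.update(flatten_keys(value, path, sep))
		else:
			flat[path] = value
	return flat

# flatten_keys({"ensg_map": {"path": "maps", "file": "hg18.tsv"}})
# -> {"ensg_map/path": "maps", "ensg_map/file": "hg18.tsv"}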
Example No. 6
def main():

	# Initialization

	task.check_conf(["entities", "repositories",
		"cnv.background.ensg", "cnv.mapping.ensg",
		"bin_paths.bed_tools"])

	conf = task.conf

	log = task.logger()

	evt_tunit_port, joined_evt_tunit_port = \
		task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")
	
	es = EntityServer(conf["entities"])
	em = es.manager()
	
	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	mapping_file = conf["cnv.mapping.ensg"]
	log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
	mapping_repo, mapping_path = rs.from_url(mapping_file)
	mapping_local_path = mapping_repo.get_local(mapping_path)

	background_file = conf["cnv.background.ensg"]
	log.info("Loading background from {} ...".format(background_file))

	background = set()
	repo, path = rs.from_url(background_file)
	reader = repo.open_reader(path)
	for line in reader:
		line = line.rstrip()
		if len(line) == 0:
			continue
		background.add(line)
	reader.close()
	repo.close()

	for uid in evt_tunit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

		tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
		tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

		if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
			log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
			joined_evt_tunit_port.write(uid)
			continue

		log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

		cnv_evt_ids = u["cnv_evt_ids"]
		log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

		data = {}
		
		tmp_path = mkdtemp(prefix = "evt_map_and_join_")
		log.debug("Temporary directory: {}".format(tmp_path))
		
		try:
			for eid in cnv_evt_ids:
				e = em.find(eid, types.CNV_EVENTS)
				if e is None:
					log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
					continue

				data_file = e["data_file"]

				log.debug("{} ...".format(data_file))

				repo, path = rs.from_url(data_file)

				local_path = repo.get_local(path)

				# Fix wrong bed files generated by gunes (end should be 1 indexed instead of 0 indexed)

#				tmp_file = os.path.join(tmp_path, "".join([eid, "-fixed-bed.tsv"]))

#				writer = FileWriter(tmp_file)
#				reader = repo.open_reader(path)
#				for line in reader:
#					if line.lstrip().startswith("#"):
#						continue
#					fields = line.rstrip().split("\t")
#					end = int(fields[2]) + 0 # FIXME fix not necessary already
#					fields[2] = str(end)
#					writer.write("\t".join(fields))
#					writer.write("\n")
#				writer.close()
#				reader.close()

				# Run BED tools to intersect event regions with gene names

				tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

				cmd = " ".join([
					os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
					"-a", mapping_local_path,
					#"-b", tmp_file,
					"-b", local_path,
					"-s -wb",
					">{}".format(tmp_file2)])

				log.debug(cmd)

				retcode = subprocess.call(args = cmd, shell = True)

				if retcode != 0:
					raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

				repo.close_local(local_path)

				# Read BED tools results and load event data into memory

				reader = FileReader(tmp_file2)

				name_index = 3
				value_index = 12

				for line_num, line in enumerate(reader, start=1):
					try:
						fields = line.rstrip().split("\t")
						name = fields[name_index]
						value = int(fields[value_index])
						if value not in [1, 2]:
							log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
							continue
					except:
						log.error("Error parsing line {} of data file {}".format(line_num, data_file))
						continue

					k = (eid, name)
					if k in data:
						prev_value = data[k]
					else:
						prev_value = 0

					data[k] = prev_value | value

				reader.close()
				repo.close()

		finally:
			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		# Write events data to data file and merge with background labels

		log.info("Writing data to {} ...".format(tunit_path))

		u["data_file"] = data_repo.url(tunit_path)
		#TODO u["data_timestamp"] = ...

		writer = data_repo.open_writer(tunit_path)

		# header
		for name in cnv_evt_ids:
			writer.write("\t")
			writer.write(name)
		writer.write("\n")

		# data
		for row_name in sorted(background):
			writer.write(row_name)
			for col_name in cnv_evt_ids:
				k = (col_name, row_name)
				if k in data:
					value = data[k]
				else:
					value = 0
				writer.write("\t")
				writer.write(str(value))
			writer.write("\n")

		writer.close()
		
		log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		joined_evt_tunit_port.write(uid)

	em.close()
	es.close()

	mapping_repo.close_local(mapping_local_path)
	mapping_repo.close()
	data_repo.close()
	rs.close()
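
Note (not part of the original example): the matrix written at the end of Example No. 6 encodes each (event, gene) cell by OR-ing the intersected values, 1 for gain and 2 for loss, so 3 means both and 0 means no event; Example No. 3 later splits these codes apart again with bit masks 1 and 2. A tiny self-contained illustration of the accumulation step (identifiers are hypothetical):

# Toy illustration of the bitwise gain/loss accumulation used above.
GAIN, LOSS = 1, 2

observations = [
	("evt1", "ENSG000001", GAIN),
	("evt1", "ENSG000001", LOSS),
	("evt1", "ENSG000002", GAIN),
]

data = {}
for eid, gene, value in observations:
	data[(eid, gene)] = data.get((eid, gene), 0) | value

assert data[("evt1", "ENSG000001")] == 3  # 1 | 2: both gain and loss
assert data[("evt1", "ENSG000002")] == 1  # gain only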