# Exemplo n.º 1
# 0
def main():
	"""Group oncodrive gene results into CNV combination entities.

	Reads oncodrive result ids from the 'oncodrive_ids' port, groups them
	by (topography, morphology, id type) and writes one CNV_COMBINATION
	per group to the 'combinations' port, reusing an existing combination
	entity when one is already indexed for the group key.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	oncodrive_port, combination_port = \
		task.ports("oncodrive_ids", "combinations")

	es = EntityServer(conf["entities"])
	em = es.manager()

	log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
	comb_results_index = em.group_ids(
		["icdo_topography", "icdo_morphology", "id_type"],
		types.CNV_COMBINATION, unique = True)

	ENSEMBL_GENE = "ensembl:gene"

	classif = {}

	log.info("Classifying oncodrive results ...")

	for oid in oncodrive_port:
		result = em.find(oid, types.CNV_ONCODRIVE_GENES)
		if result is None:
			log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, oid))
			continue

		# Full identity of the source result, used only for logging
		source_key = (result["study_id"], result["platform_id"],
			result["icdo_topography"], result["icdo_morphology"])

		group_key = (result["icdo_topography"], result["icdo_morphology"], ENSEMBL_GENE)

		log.debug("Oncodrive results ({}) [{}] classified into ({}) ...".format(", ".join(source_key), oid, ", ".join(group_key)))

		classif.setdefault(group_key, []).append(result)

	log.info("Preparing combinations ...")

	for group_key in sorted(classif):
		if group_key in comb_results_index:
			# Reuse the already-persisted combination for this key
			cid = comb_results_index[group_key][0]
			comb = em.find(cid, types.CNV_COMBINATION)
			if comb is None:
				log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
				return
		else:
			# First combination for this key: create a fresh entity
			comb = DataElement(key_sep = "/")
			cid = str(uuid.uuid4())
			comb["id"] = cid
			comb["icdo_topography"] = group_key[0]
			comb["icdo_morphology"] = group_key[1]

		comb["id_type"] = ENSEMBL_GENE

		results = classif[group_key]

		log.info("({}) [{}] --> {} results".format(", ".join(group_key), cid, len(results)))

		id_list = comb.create_list()
		file_list = comb.create_list()

		for result in results:
			id_list += [result["id"]]
			file_list += [result["results_file"]]

		src = comb.create_element()
		comb["source"] = src
		src["type"] = types.CNV_ONCODRIVE_GENES
		src["ids"] = id_list

		comb["files"] = file_list

		combination_port.write(comb.to_native())

	em.close()
	es.close()
def main():
	"""Group enrichment results into CNV combination entities.

	Reads enrichment result ids from the 'enrichment_ids' port, groups
	them by (topography, morphology, id type) and writes one
	CNV_COMBINATION per group to the 'combinations' port, reusing an
	existing combination entity when one is already indexed for the key.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	enrichment_port, combination_port = \
		task.ports("enrichment_ids", "combinations")

	es = EntityServer(conf["entities"])
	em = es.manager()

	log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
	comb_results_index = em.group_ids(
		["icdo_topography", "icdo_morphology", "id_type"],
		types.CNV_COMBINATION, unique = True)

	classif = {}

	log.info("Classifying enrichment results ...")

	for eid in enrichment_port:
		enr = em.find(eid, types.CNV_ENRICHMENT)
		if enr is None:
			log.error("{} not found: {}".format(types.CNV_ENRICHMENT, eid))
			continue

		# Full identity of the source result, used only for logging
		source_key = (enr["study_id"], enr["platform_id"],
			enr["icdo_topography"], enr["icdo_morphology"], enr["id_type"])

		group_key = (enr["icdo_topography"], enr["icdo_morphology"], enr["id_type"])

		log.debug("Enrichment results ({}) [{}] classified into ({}) ...".format(", ".join(source_key), eid, ", ".join(group_key)))

		classif.setdefault(group_key, []).append(enr)

	log.info("Preparing combinations ...")

	for group_key in sorted(classif):
		if group_key in comb_results_index:
			# Reuse the already-persisted combination for this key
			cid = comb_results_index[group_key][0]
			comb = em.find(cid, types.CNV_COMBINATION)
			if comb is None:
				log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
				return
		else:
			# First combination for this key: create a fresh entity
			comb = DataElement(key_sep = "/")
			cid = str(uuid.uuid4())
			comb["id"] = cid
			comb["icdo_topography"] = group_key[0]
			comb["icdo_morphology"] = group_key[1]
			comb["id_type"] = group_key[2]

		enr_list = classif[group_key]

		log.info("({}) [{}] --> {} results".format(", ".join(group_key), cid, len(enr_list)))

		id_list = comb.create_list()
		file_list = comb.create_list()

		for enr in enr_list:
			id_list += [enr["id"]]
			file_list += [enr["results_file"]]

		src = comb.create_element()
		comb["source"] = src
		src["type"] = types.CNV_ENRICHMENT
		src["ids"] = id_list

		comb["files"] = file_list

		combination_port.write(comb.to_native())

	em.close()
	es.close()
# Exemplo n.º 3
# 0
def main():
	"""Build CNV event entities from source assays and group them into tumour units.

	Every SOURCE_ASSAY for the requested studies is validated against its
	SOURCE_SAMPLE; valid tumour assays become CNV_EVENTS entities (written
	to 'evt_ids') and are grouped into CNV_EVENTS_TUMOUR_UNIT entities
	(written to 'evt_tumour_unit_ids'), discarding units smaller than
	'cnv.min_tumour_unit_size'.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay",
				"cnv.min_tumour_unit_size"])

	conf = task.conf

	log = task.logger()

	study_ids_port, evt_port, evt_tunit_port = \
		task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")
	
	es = EntityServer(conf["entities"])
	em = es.manager()
	
	rs = RepositoryServer(conf["repositories"])
	source_repo = rs.repository("source")

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()
		
	# Run

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
	evt_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS, unique = True)
	
	log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	evt_tunit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS_TUMOUR_UNIT, unique = True)
	
	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}		# (study_id, platform_id) -> number of valid assays
	skipped_assay_count = {}	# (study, design, data type, study type) -> count
	wrong_assays = {}			# study_id -> failed assay ids
	wrong_samples = {}			# study_id -> failed sample ids
	tumour_units = {}			# unit key -> list of event ids
	evt_dup = {}				# event key -> event id, used to detect duplicated keys
	
	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])	
		
		assay_source_path = assay.get("source_path", "")
		
		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")

			log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay {} not included in 'study_ids'".format(assay_id))
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]
		
		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]

		source_path = assay["source_path"]
		source_file = assay["assay_property/filename"]

		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path"])

		e["data_file"] = source_repo.url("assay", source_path, source_file)

		# Only genomic cancer-vs-normal binary assays are of interest here
		included = study_id in study_ids and study_type == "genomic"
		included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

		if not included:
			# Transcriptomic assays are handled by another task, so they
			# are skipped silently; anything else is reported.
			if study_type != "transcriptomic" and study_id in study_ids:
				s = ", ".join([" = ".join(v) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue
		
		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_source_path = sample.get("source_path", "")
			log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			"source_path",
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])
		
		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		orig_disease_state = disease_state
		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour"]:
			# FIX: this message previously used 'sample_source_path', which is
			# only bound in the missing-fields branch above (and that branch
			# continues), so reaching this line raised NameError. Read the
			# path from the sample instead.
			log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample.get("source_path", ""), orig_disease_state))
			continue

		e["disease_state"] = disease_state
		
		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo, rel_path = rs.from_url(e["data_file"])

		if not repo.exists(rel_path):
			log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

		# Reuse the persisted event id when one exists; reject keys already
		# seen in this run as duplicates.
		eid = None
		duplicated = False
		exists = False
		if e_key in evt_dup:
			duplicated = True
		elif e_key in evt_index:
			eid = evt_index[e_key][0]
			exists = True
		
		if duplicated:
			log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		if eid is None:
			eid = str(uuid.uuid4())
		
		e["id"] = eid

		# Units are keyed on the normal counterpart topography when known,
		# expanded to every applicable (experiment, ICD-O) classification.
		u_key = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]), e.get("icdo_morphology", ""))
		keys = classify_by_experiment_and_icdo(
					u_key[0], u_key[1], u_key[2], u_key[3])
		for key in keys:
			icdo_topography = key[2]
			if icdo_topography in excluded_topographies:
				continue
			map_list_add(tumour_units, key, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))
		em.persist(e, types.CNV_EVENTS)
		evt_port.write(eid)
		evt_dup[e_key] = eid

	min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

	log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	log.debug("Minimum size = {}".format(min_tumour_unit_size))

	for key in sorted(tumour_units):
		v = tumour_units[key]
		size = len(v)
		if size < min_tumour_unit_size:
			discard = True
			discard_text = "[skipped]"
		else:
			discard = False
			discard_text = ""

		if key in evt_tunit_index:
			uid = evt_tunit_index[key][0]
			u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
			if u is None:
				log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
				continue

			arrow_text = "==>"
		else:
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")
			u["id"] = uid
			u["study_id"] = key[0]
			u["platform_id"] = key[1]
			u["icdo_topography"] = key[2]
			u["icdo_morphology"] = key[3]

			arrow_text = "-->"

		log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

		if discard:
			continue

		u["size"] = len(v)
		u["cnv_evt_ids"] = u.create_list(v)

		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		evt_tunit_port.write(uid)

	sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(processed_assays, len(processed_studies), len(study_ids))]
	log.info("".join(sb))

	log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))
	
	log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))
	
	log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

	em.close()
	es.close()
# Exemplo n.º 4
# 0
def run(task):
	"""Classify mrna log2r results into tumour units and persist them.

	Each MRNA_LOG2R entity is assigned to every (study, platform,
	topography level, morphology) combination derived from its ICD-O
	codes; units smaller than 'mrna.min_tumour_unit_size' are skipped.
	Unit ids are written to the 'log2r_tumour_unit_ids' port.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	min_tumour_unit_size = conf.get("mrna.min_tumour_unit_size", 20, dtype=int)

	log = task.logger()

	task.check_in_ports(["log2r_ids"])
	task.check_out_ports(["log2r_tumour_unit_ids"])

	log2r_port = task.ports["log2r_ids"]
	log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	overwrite = conf.get("overwrite", False, dtype=bool)

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()
		
	# Run

	log.info("Indexing available mrna log2r tumour units ...")
	log2r_tumour_unit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_LOG2R_TUMOUR_UNIT, unique = True)

	units = {}	# unit key -> list of log2r entity ids
	for log2r_id in log2r_port:
		e = em.find(log2r_id, types.MRNA_LOG2R)
		if e is None:
			log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
			continue

		eid = e["id"]
		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e.get("icdo_morphology", "")
		
		log.info("Classifying mrna log2r (%s, %s, %s, %s) [%s] ..." % (study_id, platform_id, icdo_topography, icdo_morphology, eid))
		
		keys = []
	
		# Validate the topography code and split it into its two levels
		m = _ICDO_TOPOGRAPHY_PAT.match(icdo_topography)
		if m is None:
			log.error("Wrong ICD-O Topography code: {0}".format(icdo_topography))
			continue
		else:
			level1 = m.group(1)
			level2 = m.group(2)

		if len(icdo_morphology) > 0:
			m = _ICDO_MORPHOLOGY_PAT.match(icdo_morphology)
			if m is None:
				log.error("Wrong ICD-O Morphology code: {0}".format(icdo_morphology))
				continue

		# A result always feeds the level-1 unit; more specific units are
		# added when a second topography level or a morphology is present.
		keys += [(study_id, platform_id, level1, "")]
		if len(icdo_morphology) > 0:
			keys += [(study_id, platform_id, level1, icdo_morphology)]
			#keys += [(study_id, platform_id, "", icdo_morphology)]
	
		if level2 is not None:
			keys += [(study_id, platform_id, icdo_topography, "")]
			if len(icdo_morphology) > 0:
				keys += [(study_id, platform_id, icdo_topography, icdo_morphology)]

		for key in keys:
			icdo_topography = key[2]
			if icdo_topography in excluded_topographies:
				log.debug("\t(%s) [excluded]" % ", ".join(key))
				continue

			log.debug("\t(%s)" % ", ".join(key))
			
			if key not in units:
				units[key] = [eid]
			else:
				units[key] += [eid]

	log.info("Persisting %i mrna log2r tumour units ..." % len(units))
	log.debug("Minimum size = %i" % min_tumour_unit_size)

	# FIX: dict.iteritems() is Python-2-only; items() behaves identically
	# here and matches the rest of the codebase.
	for key, ids in sorted(units.items()):
		
		size = len(ids)
		
		if size < min_tumour_unit_size:
			log.debug("\t(%s)\t%i assays [Skipped]" % (", ".join(key), size))
			continue
		else:
			log.debug("\t(%s)\t%i assays" % (", ".join(key), size))

		if key in log2r_tumour_unit_index:
			uid = log2r_tumour_unit_index[key][0]
			if not overwrite:
				u = em.find(uid, types.MRNA_LOG2R_TUMOUR_UNIT)
				if u is None:
					# FIX: a dangling index entry previously caused a crash
					# when assigning into None; report it and skip, like the
					# other tasks do.
					log.error("%s not found: %s" % (types.MRNA_LOG2R_TUMOUR_UNIT, uid))
					continue
			else:
				u = DataElement(key_sep = "/")
		else:
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")

		u["id"] = uid
		u["study_id"] = key[0]
		u["platform_id"] = key[1]
		u["icdo_topography"] = key[2]
		u["icdo_morphology"] = key[3]

		u["size"] = size
		u["mrna_log2r_ids"] = u.create_list(ids)
		
		em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
		log2r_tumour_unit_port.write(uid)
	
	em.close()
	es.close()
# Exemplo n.º 5
# 0
def run(task):
	"""Import mrna source assays and build derived grouping entities.

	Valid transcriptomic assays become MRNA_ABS_INTENSITY or
	MRNA_LOG2R_SOURCE entities depending on their data type; tumour
	assays are grouped into MRNA_ABSI_TUMOUR_UNIT entities and normal
	assays into MRNA_NORMAL_POOL entities. Ids are emitted through the
	'absi_ids', 'absi_tumour_unit_ids', 'normal_pool_ids' and
	'log2r_source_ids' ports. Returns 0 on completion.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	log = task.logger()
	
	task.check_in_ports(["study_ids"])
	task.check_out_ports(["absi_ids", "absi_tumour_unit_ids", "normal_pool_ids", "log2r_source_ids"])

	study_ids_port = task.ports["study_ids"]
	absi_port = task.ports["absi_ids"]
	absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
	normal_pool_port = task.ports["normal_pool_ids"]
	log2r_source_port = task.ports["log2r_source_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()
	
	rs = RepositoryServer(conf["repositories"])

	#overwrite = conf.get("overwrite", False, dtype=bool)

	# Run
	
	log.info("Creating indices for {} ...".format(types.MRNA_ABS_INTENSITY))
	absi_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_ABS_INTENSITY, unique = True)
	
	log.info("Creating indices for {} ...".format(types.MRNA_LOG2R_SOURCE))
	log2r_src_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_LOG2R_SOURCE, unique = True)

	log.info("Creating indices for {} ...".format(types.MRNA_ABSI_TUMOUR_UNIT))
	absi_tumour_unit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_ABSI_TUMOUR_UNIT, unique = True)

	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}		# (study_id, platform_id) -> number of valid assays
	skipped_assay_count = {}	# (study, design, data type, study type) -> count
	wrong_assays = {}			# study_id -> failed assay ids
	wrong_samples = {}			# study_id -> failed sample ids
	log2r_src_units = {}		# unit key -> log2r source entity ids
	tumour_units = {}			# unit key -> absolute intensity entity ids
	normal_pools = {}			# pool key -> absolute intensity entity ids
	absi_dup = {}				# entity key -> id, to detect duplicates
	log2r_source_dup = {}		# entity key -> id, to detect duplicates

	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])	
		
		assay_source_path = assay.get("source_path", "")
		
		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")

			log.error("Assay %s in study %s missing required fields: %s {%s}" % (assay_id, study_id, mf, assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay %s not included in 'study_ids'" % assay_id)
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]
		
		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]
		
		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path",
			("data_file/path", "source_path"),
			("data_file/name", "assay_property/filename") ])

		e["data_file/repo"] = assay.get("data_file/repo", "assay")

		# Only transcriptomic assays with one of the two supported
		# (design, data type) combinations are imported
		included = study_id in study_ids and study_type == "transcriptomic"
		included &= (assay_design == "cancer_and_normal" and data_type == "log_abs_readings") \
						or (assay_design == "cancer_vs_normal" and data_type == "log2ratios")

		if not included:
			# Genomic assays are handled by another task, so they are
			# skipped silently; anything else is reported.
			if study_type != "genomic" and study_id in study_ids:
				s = ", ".join(["%s = %s" % (v[0], v[1]) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.warn("Skipping assay %s {%s}: %s." % (assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay %s references a non-existent sample: %s" % (assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue
		
		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_id = sample.get("id", "WITHOUT ID")
			sample_source_path = sample.get("source_path", "")
			
			log.error("Sample %s associated with assay %s in study %s missing required fields: %s {%s}" % (sample_id, assay_id, study_id, mf, sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			("source_path", "source_path"),
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])
		
		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '%s' for sample %s {%s}" % (disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour", "normal"]:
			continue

		e["disease_state"] = disease_state
		
		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo = rs.repository(e["data_file/repo"])
		rel_path = os.path.join(e["data_file/path"], e["data_file/name"])

		if not repo.exists(rel_path):
			log.error("Assay %s in study %s missing data file: [%s]" % (assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])
		
		# Reuse the persisted entity id when one exists in the matching
		# index; reject keys already seen in this run as duplicates.
		eid = None
		duplicated = False
		exists = False
		if data_type == "log_abs_readings":
			if key in absi_dup:
				duplicated = True
			elif key in absi_index:
				eid = absi_index[key][0]
				exists = True
		elif data_type == "log2ratios":
			if key in log2r_source_dup:
				duplicated = True
			elif key in log2r_src_index:
				eid = log2r_src_index[key][0]
				exists = True

		if duplicated:
			log.error("Duplicated key (%s) for assay %s" % (", ".join(key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		if eid is None:
			eid = str(uuid.uuid4())
		
		e["id"] = eid
		
		if disease_state == "normal":
			if data_type == "log2ratios":
				k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
				map_list_add(log2r_src_units, k, eid)
			elif data_type == "log_abs_readings":
				map_list_add(normal_pools, (study_id, platform_id, e["icdo_topography"]), eid)
			else:
				# FIX: the format string had three %s placeholders for four
				# arguments, raising TypeError whenever this path fired.
				log.error("Assay %s has an unexpected combination of (disease_state, assay_design, data_type): (%s, %s, %s)" % (assay_id, disease_state, assay_design, data_type))
				map_list_add(wrong_assays, study_id, assay_id)
				continue
		elif disease_state == "tumour":
			k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
			if data_type == "log_abs_readings":
				map_list_add(tumour_units, k, eid)
			elif data_type == "log2ratios":
				map_list_add(log2r_src_units, k, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		if data_type == "log_abs_readings":
			log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_ABS_INTENSITY, ", ".join(key), eid))
			em.persist(e, types.MRNA_ABS_INTENSITY)
			absi_port.write(eid)
			absi_dup[key] = eid
		elif data_type == "log2ratios":
			log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_LOG2R_SOURCE, ", ".join(key), eid))
			em.persist(e, types.MRNA_LOG2R_SOURCE)
			log2r_source_port.write(eid)
			log2r_source_dup[key] = eid

	log.info("Persisting mrna absi tumour units ...")

	for k, v in sorted(tumour_units.items()):
		key = (k[0], k[1], k[2])
		exists = key in absi_tumour_unit_index
		if exists:
			uid = absi_tumour_unit_index[key][0]
		else:
			uid = str(uuid.uuid4())

		u = DataElement(key_sep = "/")
		u["id"] = uid
		u["study_id"] = k[0]
		u["platform_id"] = k[1]
		u["icdo_topography"] = k[2]
		u["size"] = len(v)
		u["mrna_absi_ids"] = u.create_list(v)

		if exists:
			log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
		else:
			log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

		em.persist(u, types.MRNA_ABSI_TUMOUR_UNIT)
		absi_tumour_unit_port.write(uid)

	log.info("Creating indices for mrna normal pools ...")
	normal_pool_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_NORMAL_POOL, unique = True)

	log.info("Persisting mrna normal pools ...")

	for k, v in sorted(normal_pools.items()):
		key = (k[0], k[1], k[2])
		exists = key in normal_pool_index
		if exists:
			uid = normal_pool_index[key][0]
		else:
			uid = str(uuid.uuid4())

		u = DataElement(key_sep = "/")
		u["id"] = uid
		u["study_id"] = k[0]
		u["platform_id"] = k[1]
		u["icdo_topography"] = k[2]
		u["size"] = len(v)
		u["mrna_absi_ids"] = u.create_list(v)

		if exists:
			log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
		else:
			log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

		em.persist(u, types.MRNA_NORMAL_POOL)
		normal_pool_port.write(uid)

	# Final human-readable summary of everything processed
	sb = ["\n\nProcessed %i assays for %i studies (out of %i):\n\n" % (processed_assays, len(processed_studies), len(study_ids))]
	
	sb += ["%i mrna tumour units:\n\n" % (len(tumour_units))]
	
	for k, v in sorted(tumour_units.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\n%i mrna normal pools:\n\n" % (len(normal_pools))]
	
	for k, v in sorted(normal_pools.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]
	
	sb += ["\n%i mrna source log2r units:\n\n" % (len(log2r_src_units))]
	
	for k, v in sorted(log2r_src_units.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\nAssay counts by study and platform:\n\n"]
	
	for k, v in sorted(valid_assay_count.items()):
		sb += ["\t%s\t%i assays" % (k, v)]
		if k in wrong_assays:
			sb += ["\t%i failed assays" % len(wrong_assays[k])]
		if k in wrong_samples:
			sb += ["\t%i failed samples" % len(wrong_samples[k])]
		sb += ["\n"]

	log.info("".join(sb))

	if len(skipped_assay_count) > 0:
		log.info("Skipped assays:\n\n%s" % map_count_tostring(skipped_assay_count, indent = 1))

	if len(wrong_assays) > 0:
		log.info("Summary of failed assays:\n\n%s" % map_list_tostring(wrong_assays))

	if len(wrong_samples) > 0:
		log.info("Summary of failed samples:\n\n%s" % map_list_tostring(wrong_samples))

	em.close()

	return 0