示例#1
0
文件: native.py 项目: chris-zen/wok
	def _merge_env(self, env1, env2):
		"""Return a new DataElement with env1 and then env2 merged into it.

		Either argument may be None, in which case it is skipped. The merge
		target is always a freshly created element, never one of the inputs.
		"""
		merged = DataElement()
		for source in (env1, env2):
			if source is not None:
				merged.merge(source)
		return merged
示例#2
0
文件: nodes.py 项目: dalloliogm/wok
	def resources(self):
		"""Return the resources configuration that applies to this node.

		Starts from the parent's resources (or an empty DataElement when the
		node has no parent) and merges the model's own resources into it
		when the model defines any.
		"""
		parent = self.parent
		conf = DataElement() if parent is None else parent.resources

		own = self.model.resources
		if own is not None:
			conf.merge(own)

		return conf
示例#3
0
文件: config.py 项目: dalloliogm/wok
	def __init__(self, initial_conf = None, required = [], args_usage = "", add_options = None, expand_vars = False):
		"""Build the configuration from defaults, command line options and files.

		Sources are handed to a ConfigBuilder in this order: initial_conf,
		the --log-level option, every --conf file, and every -D value; the
		builder result is then merged into this element.

		Parameters:
			initial_conf: optional starting configuration. A plain dict is
				converted with DataFactory.from_native first.
			required: keys passed to check_required() when non-empty. The
				default list is only read here, never modified.
			args_usage: text appended to the usage line of the option parser.
			add_options: optional callable that receives the OptionParser so
				the caller can register additional options.
			expand_vars: when true, expand_vars() is called at the end.

		Raises:
			Exception: when a -D argument is not of the form PARAM=VALUE.
		"""
		DataElement.__init__(self)

		from optparse import OptionParser

		parser = OptionParser(usage = "usage: %prog [options] " + args_usage, version = VERSION)

		parser.add_option("-L", "--log-level", dest="log_level",
			default=None, choices=["debug", "info", "warn", "error", "critical", "notset"],
			help="Which log level: debug, info, warn, error, critical, notset")

		parser.add_option("-c", "--conf", action="append", dest="conf_files", default=[], metavar="FILE",
			help="Load configuration from a file. Multiple files can be specified")

		parser.add_option("-D", action="append", dest="data", default=[], metavar="PARAM=VALUE",
			help="External data value. example -D param1=value")

		if add_options is not None:
			add_options(parser)

		(self.options, self.args) = parser.parse_args()

		self.builder = ConfigBuilder()

		if initial_conf is not None:
			if isinstance(initial_conf, dict):
				initial_conf = DataFactory.from_native(initial_conf)
			self.builder.add_element(initial_conf)

		if self.options.log_level is not None:
			self.builder.add_value("wok.log.level", self.options.log_level)

		if self.options.conf_files:
			files = []
			for conf_file in self.options.conf_files:
				self.builder.add_file(conf_file)
				files.append(os.path.abspath(conf_file))

			# Keep track of which files contributed to the configuration
			self.builder.add_value("__files", DataFactory.from_native(files))

		for data in self.options.data:
			# Split on the first "=" only, so the value itself may contain "="
			# (for example -D url=http://host/?a=b)
			d = data.split("=", 1)
			if len(d) != 2:
				raise Exception("Data argument wrong: " + data)

			self.builder.add_value(d[0], d[1])

		self.builder.merge_into(self)

		if required:
			self.check_required(required)

		if expand_vars:
			self.expand_vars()
示例#4
0
	def to_element(self, e = None):
		"""Fill an element with this object's name, conf and root node data.

		When no element is given a new DataElement is created. The task and
		module counts of the root node are refreshed before the root node
		writes itself into the "root" child element. Returns the element.
		"""
		target = DataElement() if e is None else e

		target["name"] = self.name
		target["conf"] = self.conf

		root = self.root_node
		root.update_tasks_count_by_state()
		root.update_modules_count_by_state()
		root.to_element(target.create_element("root"))

		return target
示例#5
0
文件: storage.py 项目: dalloliogm/wok
	def _task_config_to_element(task):
		"""Describe a task as a DataElement that uses "/" as key separator.

		The element carries the task identity (id, name, index, module,
		instance), its conf, a fixed "iteration" section and a "ports"
		section with one entry per input and output port of the parent
		module, each filled from the task's port data at the same position.
		"""
		root = DataElement(key_sep = "/")
		root["id"] = task.id
		root["name"] = task.name
		root["index"] = task.index
		root["module"] = task.parent.id
		root["instance"] = task.instance.name
		root["conf"] = task.conf

		# TODO: the iteration settings should depend on the module definition
		iteration = root.create_element("iteration")
		iteration["strategy"] = "dot"
		iteration["size"] = 0

		ports = root.create_element("ports")

		inputs = ports.create_list("in")
		for position, _node in enumerate(task.parent.in_ports):
			port = DataElement(key_sep = "/")
			task.in_port_data[position].fill_element(port)
			inputs.append(port)

		outputs = ports.create_list("out")
		for position, _node in enumerate(task.parent.out_ports):
			port = DataElement(key_sep = "/")
			task.out_port_data[position].fill_element(port)
			outputs.append(port)

		return root
示例#6
0
文件: native.py 项目: chris-zen/wok
	def prepare(self, task):
		"""Build the command, arguments and environment to run a task natively.

		Returns a (cmd, args, env) tuple where env is the native form of the
		merged environment. Raises MissingRequiredOption when "script_path"
		is not configured and UnknownNativeCmdBuilderLanguage for any
		language other than "python".
		"""
		wok_conf = task.instance.conf.get("wok")
		if wok_conf is None:
			wok_conf = DataElement()

		lang = self.conf.get("language", "python")

		# Language specific settings live under execution.mode.native.<lang>
		lang_key = "execution.mode.native.{}".format(lang)
		lang_conf = wok_conf[lang_key] if lang_key in wok_conf else DataElement()

		if "script_path" not in self.conf:
			raise MissingRequiredOption("script_path")

		script_path = self.conf["script_path"]

		if lang != "python":
			raise UnknownNativeCmdBuilderLanguage(lang)

		cmd = lang_conf.get("bin", "python")
		args = [self._task_absolute_path(task, script_path)]
		env = self._merge_env(lang_conf.get("env"), self.conf.get("env"))

		if "lib_path" in lang_conf:
			# Configured library paths go in front of any existing PYTHONPATH
			if "PYTHONPATH" in env:
				python_path = ":".join(lang_conf["lib_path"]) + ":" + env["PYTHONPATH"]
			else:
				python_path = ":".join(lang_conf["lib_path"])
			env["PYTHONPATH"] = python_path

		module_path = ".".join([task.parent.namespace, task.parent.name])
		args.extend([
			"-D", "instance_name=" + task.instance.name,
			"-D", "module_path=" + module_path,
			"-D", "task_index=" + str(task.index)])

		for key, value in self._storage_conf(task.instance.engine.storage.basic_conf):
			args.extend(["-D", "storage.{}={}".format(key, value)])

		return cmd, args, env.to_native()
def main():
	"""Combine oncodrive gene results that share topography and morphology.

	Reads result ids from the "oncodrive_ids" port, groups the matching
	CNV_ONCODRIVE_GENES entities by (icdo_topography, icdo_morphology,
	"ensembl:gene") and writes one combination per group to the
	"combinations" port. A CNV_COMBINATION entity already present in the
	index for that key is reused; otherwise a new element with a fresh UUID
	is created.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	oncodrive_port, combination_port = \
		task.ports("oncodrive_ids", "combinations")

	es = EntityServer(conf["entities"])
	em = es.manager()

	# Everything below runs under try/finally so the manager and the server
	# are closed on the early return and on unexpected exceptions as well.
	try:
		log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
		comb_results_index = em.group_ids(
			["icdo_topography", "icdo_morphology", "id_type"],
			types.CNV_COMBINATION, unique = True)

		ENSEMBL_GENE = "ensembl:gene"

		classif = {}

		log.info("Classifying oncodrive results ...")

		for oid in oncodrive_port:
			o = em.find(oid, types.CNV_ONCODRIVE_GENES)
			if o is None:
				log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, oid))
				continue

			okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

			key = (o["icdo_topography"], o["icdo_morphology"], ENSEMBL_GENE)

			log.debug("Oncodrive results ({}) [{}] classified into ({}) ...".format(", ".join(okey), oid, ", ".join(key)))

			classif.setdefault(key, []).append(o)

		log.info("Preparing combinations ...")

		for key in sorted(classif):
			if key in comb_results_index:
				cid = comb_results_index[key][0]
				c = em.find(cid, types.CNV_COMBINATION)
				if c is None:
					log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
					# Abort the task as before; cleanup happens in finally
					return
			else:
				c = DataElement(key_sep = "/")
				c["id"] = cid = str(uuid.uuid4())
				c["icdo_topography"] = key[0]
				c["icdo_morphology"] = key[1]

			c["id_type"] = ENSEMBL_GENE

			olist = classif[key]

			log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(olist)))

			ids = c.create_list()
			flist = c.create_list()

			for o in olist:
				ids += [o["id"]]
				flist += [o["results_file"]]

			c["source"] = src = c.create_element()
			src["type"] = types.CNV_ONCODRIVE_GENES
			src["ids"] = ids

			c["files"] = flist

			combination_port.write(c.to_native())
	finally:
		em.close()
		es.close()
def main():
	"""Combine enrichment results that share topography, morphology and id type.

	Reads result ids from the "enrichment_ids" port, groups the matching
	CNV_ENRICHMENT entities by (icdo_topography, icdo_morphology, id_type)
	and writes one combination per group to the "combinations" port. A
	CNV_COMBINATION entity already present in the index for that key is
	reused; otherwise a new element with a fresh UUID is created.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	enrichment_port, combination_port = \
		task.ports("enrichment_ids", "combinations")

	es = EntityServer(conf["entities"])
	em = es.manager()

	# Everything below runs under try/finally so the manager and the server
	# are closed on the early return and on unexpected exceptions as well.
	try:
		log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
		comb_results_index = em.group_ids(
			["icdo_topography", "icdo_morphology", "id_type"],
			types.CNV_COMBINATION, unique = True)

		classif = {}

		log.info("Classifying enrichment results ...")

		for eid in enrichment_port:
			e = em.find(eid, types.CNV_ENRICHMENT)
			if e is None:
				log.error("{} not found: {}".format(types.CNV_ENRICHMENT, eid))
				continue

			ekey = (e["study_id"], e["platform_id"], e["icdo_topography"], e["icdo_morphology"], e["id_type"])

			key = (e["icdo_topography"], e["icdo_morphology"], e["id_type"])

			log.debug("Enrichment results ({}) [{}] classified into ({}) ...".format(", ".join(ekey), eid, ", ".join(key)))

			classif.setdefault(key, []).append(e)

		log.info("Preparing combinations ...")

		for key in sorted(classif):
			if key in comb_results_index:
				cid = comb_results_index[key][0]
				c = em.find(cid, types.CNV_COMBINATION)
				if c is None:
					log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
					# Abort the task as before; cleanup happens in finally
					return
			else:
				c = DataElement(key_sep = "/")
				c["id"] = cid = str(uuid.uuid4())
				c["icdo_topography"] = key[0]
				c["icdo_morphology"] = key[1]
				c["id_type"] = key[2]

			elist = classif[key]

			log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(elist)))

			ids = c.create_list()
			flist = c.create_list()

			for e in elist:
				ids += [e["id"]]
				flist += [e["results_file"]]

			c["source"] = src = c.create_element()
			src["type"] = types.CNV_ENRICHMENT
			src["ids"] = ids

			c["files"] = flist

			combination_port.write(c.to_native())
	finally:
		em.close()
		es.close()
def run(task):
	"""Group mrna log2r entities into tumour units and persist them.

	Each id read from the "log2r_ids" port is looked up as MRNA_LOG2R and
	classified under up to four keys of the form (study_id, platform_id,
	topography, morphology): the first-level topography with and without
	morphology and, when the topography pattern captures a second level,
	the full topography with and without morphology. Keys whose topography
	is in the "excluded_topographies" setting are dropped. Units with at
	least "mrna.min_tumour_unit_size" assays (default 20) are persisted as
	MRNA_LOG2R_TUMOUR_UNIT and their ids written to the
	"log2r_tumour_unit_ids" port.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	min_tumour_unit_size = conf.get("mrna.min_tumour_unit_size", 20, dtype=int)

	log = task.logger()

	task.check_in_ports(["log2r_ids"])
	task.check_out_ports(["log2r_tumour_unit_ids"])

	log2r_port = task.ports["log2r_ids"]
	log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	overwrite = conf.get("overwrite", False, dtype=bool)

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()

	# Run

	log.info("Indexing available mrna log2r tumour units ...")
	log2r_tumour_unit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_LOG2R_TUMOUR_UNIT, unique = True)

	units = {}
	for log2r_id in log2r_port:
		e = em.find(log2r_id, types.MRNA_LOG2R)
		if e is None:
			log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
			continue

		eid = e["id"]
		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e.get("icdo_morphology", "")

		log.info("Classifying mrna log2r (%s, %s, %s, %s) [%s] ..." % (study_id, platform_id, icdo_topography, icdo_morphology, eid))

		m = _ICDO_TOPOGRAPHY_PAT.match(icdo_topography)
		if m is None:
			log.error("Wrong ICD-O Topography code: {0}".format(icdo_topography))
			continue

		level1 = m.group(1)
		level2 = m.group(2)

		has_morphology = len(icdo_morphology) > 0

		if has_morphology and _ICDO_MORPHOLOGY_PAT.match(icdo_morphology) is None:
			log.error("Wrong ICD-O Morphology code: {0}".format(icdo_morphology))
			continue

		keys = [(study_id, platform_id, level1, "")]
		if has_morphology:
			keys += [(study_id, platform_id, level1, icdo_morphology)]

		if level2 is not None:
			keys += [(study_id, platform_id, icdo_topography, "")]
			if has_morphology:
				keys += [(study_id, platform_id, icdo_topography, icdo_morphology)]

		for key in keys:
			if key[2] in excluded_topographies:
				log.debug("\t(%s) [excluded]" % ", ".join(key))
				continue

			log.debug("\t(%s)" % ", ".join(key))

			units.setdefault(key, []).append(eid)

	log.info("Persisting %i mrna log2r tumour units ..." % len(units))
	log.debug("Minimum size = %i" % min_tumour_unit_size)

	# items() instead of iteritems() so this also runs on Python 3
	for key, ids in sorted(units.items()):

		size = len(ids)

		if size < min_tumour_unit_size:
			log.debug("\t(%s)\t%i assays [Skipped]" % (", ".join(key), size))
			continue
		else:
			log.debug("\t(%s)\t%i assays" % (", ".join(key), size))

		if key in log2r_tumour_unit_index:
			uid = log2r_tumour_unit_index[key][0]
			if not overwrite:
				u = em.find(uid, types.MRNA_LOG2R_TUMOUR_UNIT)
				if u is None:
					# The index references an entity that cannot be loaded;
					# report it and skip instead of failing on u["id"] below
					log.error("%s not found: %s" % (types.MRNA_LOG2R_TUMOUR_UNIT, uid))
					continue
			else:
				u = DataElement(key_sep = "/")
		else:
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")

		u["id"] = uid
		u["study_id"] = key[0]
		u["platform_id"] = key[1]
		u["icdo_topography"] = key[2]
		u["icdo_morphology"] = key[3]

		u["size"] = size
		u["mrna_log2r_ids"] = u.create_list(ids)

		em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
		log2r_tumour_unit_port.write(uid)

	em.close()
	es.close()
示例#10
0
def main():
	"""Create CNV event entities and their tumour units from source assays.

	Iterates every SOURCE_ASSAY entity, keeps the genomic
	"cancer_vs_normal" / "binary" assays of the studies read from the
	"study_ids" port whose sample maps to a tumour disease state, persists
	each as CNV_EVENTS (id written to the "evt_ids" port) and classifies it
	into tumour units. Units with at least "cnv.min_tumour_unit_size" assays
	are persisted as CNV_EVENTS_TUMOUR_UNIT and written to the
	"evt_tumour_unit_ids" port. A summary of processed, skipped and failed
	assays and samples is logged at the end.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay",
				"cnv.min_tumour_unit_size"])

	conf = task.conf

	log = task.logger()

	study_ids_port, evt_port, evt_tunit_port = \
		task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	source_repo = rs.repository("source")

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()

	# Run

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
	evt_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS, unique = True)

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	evt_tunit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}
	skipped_assay_count = {}
	wrong_assays = {}
	wrong_samples = {}
	tumour_units = {}
	evt_dup = {}

	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])

		assay_source_path = assay.get("source_path", "")

		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")

			log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay {} not included in 'study_ids'".format(assay_id))
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]

		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]

		source_path = assay["source_path"]
		source_file = assay["assay_property/filename"]

		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path"])

		e["data_file"] = source_repo.url("assay", source_path, source_file)

		included = study_id in study_ids and study_type == "genomic"
		included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

		if not included:
			# Transcriptomic assays are silently ignored; anything else is
			# reported and counted as skipped
			if study_type != "transcriptomic" and study_id in study_ids:
				s = ", ".join([" = ".join(v) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_source_path = sample.get("source_path", "")
			log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			"source_path",
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])

		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		orig_disease_state = disease_state
		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour"]:
			# Read the path from the current sample: the variable assigned in
			# the missing-fields branch above is never set on this path
			sample_source_path = sample.get("source_path", "")
			log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample_source_path, orig_disease_state))
			continue

		e["disease_state"] = disease_state

		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo, rel_path = rs.from_url(e["data_file"])

		if not repo.exists(rel_path):
			log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

		# A key already written during this run is a duplicate; a key found
		# only in the stored index means an existing entity is overwritten
		eid = None
		duplicated = False
		exists = False
		if e_key in evt_dup:
			duplicated = True
		elif e_key in evt_index:
			eid = evt_index[e_key][0]
			exists = True

		if duplicated:
			log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		if eid is None:
			eid = str(uuid.uuid4())

		e["id"] = eid

		u_key = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]), e.get("icdo_morphology", ""))
		keys = classify_by_experiment_and_icdo(
					u_key[0], u_key[1], u_key[2], u_key[3])
		for key in keys:
			icdo_topography = key[2]
			if icdo_topography in excluded_topographies:
				continue
			map_list_add(tumour_units, key, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))
		em.persist(e, types.CNV_EVENTS)
		evt_port.write(eid)
		evt_dup[e_key] = eid

	min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

	log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	log.debug("Minimum size = {}".format(min_tumour_unit_size))

	for key in sorted(tumour_units):
		v = tumour_units[key]
		size = len(v)
		if size < min_tumour_unit_size:
			discard = True
			discard_text = "[skipped]"
		else:
			discard = False
			discard_text = ""

		if key in evt_tunit_index:
			uid = evt_tunit_index[key][0]
			u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
			if u is None:
				log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
				continue

			arrow_text = "==>"
		else:
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")
			u["id"] = uid
			u["study_id"] = key[0]
			u["platform_id"] = key[1]
			u["icdo_topography"] = key[2]
			u["icdo_morphology"] = key[3]

			arrow_text = "-->"

		log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

		# Units below the minimum size are logged above but not persisted
		if discard:
			continue

		u["size"] = len(v)
		u["cnv_evt_ids"] = u.create_list(v)

		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		evt_tunit_port.write(uid)

	sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(processed_assays, len(processed_studies), len(study_ids))]
	log.info("".join(sb))

	log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))

	log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))

	log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

	em.close()
	es.close()
示例#11
0
def run(task):
	"""Create mrna absolute intensity / log2r source entities and their groups.

	Iterates every SOURCE_ASSAY entity and keeps the transcriptomic assays
	of the studies read from the "study_ids" port that are either
	"cancer_and_normal" / "log_abs_readings" or "cancer_vs_normal" /
	"log2ratios", and whose sample maps to a tumour or normal disease state.
	"log_abs_readings" assays are persisted as MRNA_ABS_INTENSITY and
	"log2ratios" assays as MRNA_LOG2R_SOURCE, with ids written to the
	"absi_ids" and "log2r_source_ids" ports. Tumour absolute intensity
	assays are then grouped into MRNA_ABSI_TUMOUR_UNIT entities and normal
	ones into MRNA_NORMAL_POOL entities, and a summary is logged.

	Returns 0.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["study_ids"])
	task.check_out_ports(["absi_ids", "absi_tumour_unit_ids", "normal_pool_ids", "log2r_source_ids"])

	study_ids_port = task.ports["study_ids"]
	absi_port = task.ports["absi_ids"]
	absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
	normal_pool_port = task.ports["normal_pool_ids"]
	log2r_source_port = task.ports["log2r_source_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	# Run

	log.info("Creating indices for {} ...".format(types.MRNA_ABS_INTENSITY))
	absi_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_ABS_INTENSITY, unique = True)

	log.info("Creating indices for {} ...".format(types.MRNA_LOG2R_SOURCE))
	log2r_src_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_LOG2R_SOURCE, unique = True)

	log.info("Creating indices for {} ...".format(types.MRNA_ABSI_TUMOUR_UNIT))
	absi_tumour_unit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_ABSI_TUMOUR_UNIT, unique = True)

	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}
	skipped_assay_count = {}
	wrong_assays = {}
	wrong_samples = {}
	log2r_src_units = {}
	tumour_units = {}
	normal_pools = {}
	absi_dup = {}
	log2r_source_dup = {}

	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])

		assay_source_path = assay.get("source_path", "")

		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")

			log.error("Assay %s in study %s missing required fields: %s {%s}" % (assay_id, study_id, mf, assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay %s not included in 'study_ids'" % assay_id)
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]

		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]

		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path",
			("data_file/path", "source_path"),
			("data_file/name", "assay_property/filename") ])

		e["data_file/repo"] = assay.get("data_file/repo", "assay")

		included = study_id in study_ids and study_type == "transcriptomic"
		included &= (assay_design == "cancer_and_normal" and data_type == "log_abs_readings") \
						or (assay_design == "cancer_vs_normal" and data_type == "log2ratios")

		if not included:
			# Genomic assays are silently ignored; anything else is reported
			# and counted as skipped
			if study_type != "genomic" and study_id in study_ids:
				s = ", ".join(["%s = %s" % (v[0], v[1]) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.warn("Skipping assay %s {%s}: %s." % (assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay %s references a non-existent sample: %s" % (assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_id = sample.get("id", "WITHOUT ID")
			sample_source_path = sample.get("source_path", "")

			log.error("Sample %s associated with assay %s in study %s missing required fields: %s {%s}" % (sample_id, assay_id, study_id, mf, sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			("source_path", "source_path"),
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])

		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '%s' for sample %s {%s}" % (disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour", "normal"]:
			continue

		e["disease_state"] = disease_state

		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo = rs.repository(e["data_file/repo"])
		rel_path = os.path.join(e["data_file/path"], e["data_file/name"])

		if not repo.exists(rel_path):
			log.error("Assay %s in study %s missing data file: [%s]" % (assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

		# A key already written during this run is a duplicate; a key found
		# only in the stored index means an existing entity is overwritten
		eid = None
		duplicated = False
		exists = False
		if data_type == "log_abs_readings":
			if key in absi_dup:
				duplicated = True
			elif key in absi_index:
				eid = absi_index[key][0]
				exists = True
		elif data_type == "log2ratios":
			if key in log2r_source_dup:
				duplicated = True
			elif key in log2r_src_index:
				eid = log2r_src_index[key][0]
				exists = True

		if duplicated:
			log.error("Duplicated key (%s) for assay %s" % (", ".join(key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		if eid is None:
			eid = str(uuid.uuid4())

		e["id"] = eid

		if disease_state == "normal":
			if data_type == "log2ratios":
				k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
				map_list_add(log2r_src_units, k, eid)
			elif data_type == "log_abs_readings":
				map_list_add(normal_pools, (study_id, platform_id, e["icdo_topography"]), eid)
			else:
				# Three values are reported, so the message needs three
				# placeholders after the assay id
				log.error("Assay %s has an unexpected combination of (disease_state, assay_design, data_type): (%s, %s, %s)" % (assay_id, disease_state, assay_design, data_type))
				map_list_add(wrong_assays, study_id, assay_id)
				continue
		elif disease_state == "tumour":
			k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
			if data_type == "log_abs_readings":
				map_list_add(tumour_units, k, eid)
			elif data_type == "log2ratios":
				map_list_add(log2r_src_units, k, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		if data_type == "log_abs_readings":
			log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_ABS_INTENSITY, ", ".join(key), eid))
			em.persist(e, types.MRNA_ABS_INTENSITY)
			absi_port.write(eid)
			absi_dup[key] = eid
		elif data_type == "log2ratios":
			log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_LOG2R_SOURCE, ", ".join(key), eid))
			em.persist(e, types.MRNA_LOG2R_SOURCE)
			log2r_source_port.write(eid)
			log2r_source_dup[key] = eid

	log.info("Persisting mrna absi tumour units ...")

	for k, v in sorted(tumour_units.items()):
		key = (k[0], k[1], k[2])
		exists = key in absi_tumour_unit_index
		if exists:
			uid = absi_tumour_unit_index[key][0]
		else:
			uid = str(uuid.uuid4())

		u = DataElement(key_sep = "/")
		u["id"] = uid
		u["study_id"] = k[0]
		u["platform_id"] = k[1]
		u["icdo_topography"] = k[2]
		u["size"] = len(v)
		u["mrna_absi_ids"] = u.create_list(v)

		if exists:
			log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
		else:
			log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

		em.persist(u, types.MRNA_ABSI_TUMOUR_UNIT)
		absi_tumour_unit_port.write(uid)

	log.info("Creating indices for mrna normal pools ...")
	normal_pool_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_NORMAL_POOL, unique = True)

	log.info("Persisting mrna normal pools ...")

	for k, v in sorted(normal_pools.items()):
		key = (k[0], k[1], k[2])
		exists = key in normal_pool_index
		if exists:
			uid = normal_pool_index[key][0]
		else:
			uid = str(uuid.uuid4())

		u = DataElement(key_sep = "/")
		u["id"] = uid
		u["study_id"] = k[0]
		u["platform_id"] = k[1]
		u["icdo_topography"] = k[2]
		u["size"] = len(v)
		u["mrna_absi_ids"] = u.create_list(v)

		if exists:
			log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
		else:
			log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

		em.persist(u, types.MRNA_NORMAL_POOL)
		normal_pool_port.write(uid)

	sb = ["\n\nProcessed %i assays for %i studies (out of %i):\n\n" % (processed_assays, len(processed_studies), len(study_ids))]

	sb += ["%i mrna tumour units:\n\n" % (len(tumour_units))]

	for k, v in sorted(tumour_units.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\n%i mrna normal pools:\n\n" % (len(normal_pools))]

	for k, v in sorted(normal_pools.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\n%i mrna source log2r units:\n\n" % (len(log2r_src_units))]

	for k, v in sorted(log2r_src_units.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\nAssay counts by study and platform:\n\n"]

	for k, v in sorted(valid_assay_count.items()):
		sb += ["\t%s\t%i assays" % (k, v)]
		# Counts are keyed by (study_id, platform_id) while failures are
		# recorded per study_id, so look failures up by the study id
		failed_study_id = k[0]
		if failed_study_id in wrong_assays:
			sb += ["\t%i failed assays" % len(wrong_assays[failed_study_id])]
		if failed_study_id in wrong_samples:
			sb += ["\t%i failed samples" % len(wrong_samples[failed_study_id])]
		sb += ["\n"]

	log.info("".join(sb))

	if len(skipped_assay_count) > 0:
		log.info("Skipped assays:\n\n%s" % map_count_tostring(skipped_assay_count, indent = 1))

	if len(wrong_assays) > 0:
		log.info("Summary of failed assays:\n\n%s" % map_list_tostring(wrong_assays))

	if len(wrong_samples) > 0:
		log.info("Summary of failed samples:\n\n%s" % map_list_tostring(wrong_samples))

	em.close()
	# Release the entity server too, as the other tasks using EntityServer do
	es.close()

	return 0