예제 #1
0
def run(task):
	"""Prepare the storage needed by every CNV entity type.

	For each attribute of the ``types`` module whose name starts with
	``CNV_``, the entity collection named by the attribute value is created
	if missing, and a matching directory (dots replaced by slashes) is
	created in the "data" repository if missing.

	Returns 0.
	"""

	# Initialization

	settings = task.conf
	logger = task.logger()

	entity_server = EntityServer(settings["entities"])
	entity_manager = entity_server.manager()

	repo_server = RepositoryServer(settings["repositories"])
	repo = repo_server.repository("data")

	# Run

	cnv_type_names = [
		value for attr, value in vars(types).items() if attr.startswith("CNV_")]

	for type_name in cnv_type_names:
		logger.info("Preparing '{0}' ...".format(type_name))
		entity_manager.ensure_collection_exists(type_name)

		# the repository layout mirrors the dotted type name
		type_dir = rpath.absolute(type_name.replace(".", "/"))
		logger.debug("\tData: {0}".format(type_dir))
		repo.mkdir_if_not_exists(type_dir)

	# release handles in the same order as they are released everywhere else
	for handle in (entity_manager, entity_server, repo, repo_server):
		handle.close()

	return 0
예제 #2
0
def run(task):
	"""Run enrichment for every mRNA oncodrive (genes) result received.

	Entity ids are read from the "oncodrive_ids" input port. For each of them
	and for each complete module configuration found under "mrna.enrichment",
	an enrichment entity is looked up (or created), enrichment() is called
	and, when it reports a valid result, the entity is persisted and its id
	is written to the "enrichment_ids" output port.

	Returns:
		-1 when "mrna.enrichment" has no "modules" section, 0 when none of
		the module configurations is complete, None otherwise.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "repositories.source",
						"mrna.enrichment", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["oncodrive_ids"])
	task.check_out_ports(["enrichment_ids"])

	oncodrive_port = task.ports["oncodrive_ids"]
	enrichment_port = task.ports["enrichment_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	data_repo = rs.repository("data")

	# Everything below runs inside try/finally so that the early returns
	# (and any unexpected exception) still release the servers opened above.
	try:
		overwrite = conf.get("overwrite", False, dtype=bool)

		# retrieve enrichment configurations
		ec = conf["mrna.enrichment"]
		default = ec["default"] if "default" in ec else conf.create_element()

		if "modules" not in ec:
			log.error("There is no enrichment modules section available in mrna.enrichment")
			return -1

		log.info("Reading modules configuration ...")

		# each module configuration is the defaults overridden by the module's own values
		econfs = []
		for mod in ec["modules"]:
			m = ec.create_element()
			m.merge(default)
			m.merge(mod)
			mf = m.missing_fields(["id_type", "test", "modules_file"])
			if len(mf) > 0:
				log.error("Enrichment configuration missing required fields: {}".format(", ".join(mf)))
				log.error("Module configuration: {}".format(m))
			else:
				econfs.append(m)
				log.debug("{} -> {}".format(m["id_type"], m["modules_file"]))

		if len(econfs) == 0:
			log.error("There are no enrichment configurations available in mrna.enrichment")
			return 0

		results_base_path = types.MRNA_ENRICHMENT.replace(".", "/")

		log.info("Indexing available enrichment results ...")
		enrichment_results_index = em.group_ids(
			["study_id", "platform_id", "icdo_topography", "icdo_morphology", "id_type"],
			types.MRNA_ENRICHMENT, unique = True)

		for oid in oncodrive_port:
			o = em.find(oid, types.MRNA_ONCODRIVE_GENES)
			if o is None:
				log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, oid))
				continue

			okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

			log.info("Enrichment for oncodrive results ({0}) [{1}] ...".format(", ".join(okey), oid))

			# named mconf so it does not shadow the "mrna.enrichment" element (ec)
			for mconf in econfs:
				log.info("Module {} [{}] ...".format(mconf["id_type"], mconf["modules_file"]))

				key = okey + (mconf["id_type"],)

				if key in enrichment_results_index:
					# reuse the already registered enrichment entity
					eid = enrichment_results_index[key][0]
					e = em.find(eid, types.MRNA_ENRICHMENT)
					if e is None:
						log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
						continue
				else:
					e = o.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
					e["id"] = eid = str(uuid.uuid4())

				e["id_type"] = mconf["id_type"]

				# enrichment results

				results_path = rpath.join(results_base_path, eid + ".tsv.gz")

				if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
					log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
					enrichment_port.write(eid)
					continue

				valid = enrichment(log, conf, rs, data_repo, results_path, o["results_file"], e, mconf,
							["id", "upreg_corrected_right_p_value", "downreg_corrected_right_p_value"],
							["id", "upreg", "downreg"])

				# save mapped results
				if valid:
					em.persist(e, types.MRNA_ENRICHMENT)
					enrichment_port.write(eid)
	finally:
		em.close()
		es.close()
		data_repo.close()
		rs.close()
예제 #3
0
def main():
	"""Export mRNA enrichment results into the biomart database.

	Creates (if missing) one ``exp_<infix>_trs`` table per table infix found
	in ID_TYPE_TO_TABLE_INFIX, except "gene". Then, for every
	``(id_type, enrichment id)`` pair read from the "id" port, reads the
	entity's results file and batch-inserts one row per feature that is
	known to the matching ``ent_<infix>`` table.

	Entities, ICDO terms, experiments or columns that cannot be found are
	logged and skipped; they do not abort the task.
	"""
	task.check_conf(["entities", "repositories", "biomart.db"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	log = task.logger()

	id_port = task.ports("id")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	cursor = conn.cursor()

	table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

	# feature name -> database id maps, one per table infix
	feat_ids = {}

	for name in table_infixs:
		if name == "gene":
			continue

		cursor.execute("""
			CREATE TABLE IF NOT EXISTS exp_{0}_trs (
			  {0}_id int(11) NOT NULL,
			  icdo_id int(11) NOT NULL,
			  exp_id int(11) NOT NULL,
			  upreg_total int(11) DEFAULT NULL,
			  upreg_observed double DEFAULT NULL,
			  upreg_expected double DEFAULT NULL,
			  upreg_stdev double DEFAULT NULL,
			  upreg_pvalue double DEFAULT NULL,
			  upreg_cpvalue double DEFAULT NULL,
			  downreg_total int(11) DEFAULT NULL,
			  downreg_observed double DEFAULT NULL,
			  downreg_expected double DEFAULT NULL,
			  downreg_stdev double DEFAULT NULL,
			  downreg_pvalue double DEFAULT NULL,
			  downreg_cpvalue double DEFAULT NULL,
			  PRIMARY KEY ({0}_id,icdo_id,exp_id),
			  KEY icdo (icdo_id,exp_id),
			  KEY exp (exp_id),
			  CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
			  CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
			  CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
			) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

		feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

	icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
	exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

	for id_type, eid in id_port:
		e = em.find(eid, types.MRNA_ENRICHMENT)
		if e is None:
			# "{}" and "{1}" cannot be mixed in one format string (ValueError)
			log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
			continue

		if "results_file" not in e:
			log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
			continue

		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e["icdo_morphology"]

		okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)

		log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

		table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

		icdo_key = (icdo_topography, icdo_morphology)
		if icdo_key not in icdo:
			log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
			continue
		icdo_id = icdo[icdo_key]

		exp_key = (study_id, platform_id)
		if exp_key not in exp:
			log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
			continue
		exp_id = exp[exp_key]

		ib = BatchInsert(cursor, "exp_{}_trs".format(table_infix),
				["{}_id".format(table_infix), "icdo_id", "exp_id",
						"upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
						"downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"], insert_size)

		results_repo, results_path = rs.from_url(e["results_file"])

		try:
			reader = results_repo.open_reader(results_path)
		except Exception as ex:
			log.exception(ex)
			ib.close()
			results_repo.close()
			continue

		# read header: column name -> column index
		hdr = reader.readline().rstrip().split("\t")
		hdr_map = {name: i for i, name in enumerate(hdr)}

		try:
			col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
		except KeyError as ex:
			# bound to "ex" so the entity "e" is not shadowed
			log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
			reader.close()
			ib.close()
			results_repo.close()
			continue

		skipped_ids = set()

		fids = feat_ids[table_infix]

		# read data
		for line in reader:
			line = line.rstrip()
			data = line.split("\t")
			feat_name = data[0]
			data = [data[i] for i in col_indices]
			if feat_name not in fids:
				skipped_ids.add(feat_name)
				continue

			feat_id = fids[feat_name]

			ib.insert(feat_id, icdo_id, exp_id, *data)

		if len(skipped_ids) > 0:
			log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

		log.debug("{} results inserted".format(ib.count))

		ib.close()
		reader.close()
		# the error paths above close the repository; do the same on success
		results_repo.close()

	# release the database handles opened at the top of the task
	cursor.close()
	conn.close()

	em.close()
	es.close()
	rs.close()
예제 #4
0
def run(task):
    """Map mRNA oncodrive results from probes to genes.

    For every id read from the "oncodrive_ids" port the probe-level entity is
    loaded, the platform (or study-specific vplatform) mapping file is
    resolved, the external matrix_map tool is run, repeated ids are merged
    with merge() and the resulting gene-level entity is persisted. Its id is
    written to the "mapped_oncodrive_ids" port.

    Any problem with a single result is logged and that result is skipped;
    the task then continues with the next id.
    """

    # Initialization

    task.check_conf(
        [
            "entities",
            "repositories",
            "repositories.data",
            "repositories.source",
            "bin_paths.python",
            "bin_paths.matrix_map",
        ]
    )
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["mapped_oncodrive_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    mapped_oncodrive_port = task.ports["mapped_oncodrive_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")
    source_repo = rs.repository("source")

    overwrite = conf.get("overwrite", False, dtype=bool)

    platform_base_path = "platform"
    vplatform_base_path = "vplatform"

    results_base_path = types.MRNA_ONCODRIVE_GENES.replace(".", "/")

    log.info("Indexing available oncodrive results for genes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"], types.MRNA_ONCODRIVE_GENES, unique=True
    )

    for oid in oncodrive_port:
        o = em.find(oid, types.MRNA_ONCODRIVE_PROBES)
        if o is None:
            log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_PROBES, oid))
            continue

        study_id = o["study_id"]
        platform_id = o["platform_id"]
        key = (study_id, platform_id, o["icdo_topography"], o["icdo_morphology"])

        if key in oncodrive_results_index:
            # reuse the gene-level entity already registered for this key
            mid = oncodrive_results_index[key][0]
            m = em.find(mid, types.MRNA_ONCODRIVE_GENES)
            if m is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, mid))
                continue
        else:
            m = o.transform(
                [
                    "study_id",
                    "platform_id",
                    "icdo_topography",
                    "icdo_morphology",
                    "log2r_tumour_unit_id",
                    ("oncodrive_probes_id", "id"),
                ]
            )
            m["id"] = mid = str(uuid.uuid4())

        # mapped oncodrive results

        results_path = rpath.join(results_base_path, mid + ".tsv.gz")
        gitools_results_path = rpath.join(results_base_path, mid + ".tdm.gz")

        if skip_file(overwrite, data_repo, results_path, m.get("results_file")):
            log.warn("Skipping ({0}) [{1}] as it already exists".format(", ".join(key), mid))
            mapped_oncodrive_port.write(mid)
            continue

        log.info("Mapping oncodriver results ({0}) [{1}] ...".format(", ".join(key), oid))

        # determine the mapping file
        map_file = None
        p = em.find(platform_id, types.SOURCE_PLATFORM)
        if p is None:
            log.error("{0} not found: {1}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        platform_id_type = p.get("SO/platform_id_type")
        if platform_id_type is None:
            log.error("Undefined annotation 'SO/platform_id_type' for platform '{0}'.".format(platform_id))
            continue

        if platform_id_type != "genbank_accession":  # affy_accession, custom, ...
            # the platform itself must provide the mapping file
            missing = p.missing_fields(["ensg_map", "ensg_map/file"])
            if len(missing) > 0:
                log.error("Missing required fields for platform '{0}': {1}".format(platform_id, ", ".join(missing)))
                continue
            map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if not source_repo.exists(map_file):
                log.error("Mapping file not found for platform '{0}': {1}".format(platform_id, map_file))
                continue
        else:
            # genbank_accession: use the platform mapping when available,
            # otherwise fall back to the study-specific vplatform mapping
            if len(p.missing_fields(["ensg_map", "ensg_map/file"])) > 0:
                map_file = None
            else:
                map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if map_file is None or not source_repo.exists(map_file):
                vpid = "-".join([platform_id, study_id])
                vp = em.find(vpid, types.SOURCE_VPLATFORM)
                if vp is None:
                    log.error("{0} not found: {1}".format(types.SOURCE_VPLATFORM, vpid))
                    continue
                missing = vp.missing_fields(["ensg_map", "ensg_map/path", "ensg_map/file"])
                if len(missing) > 0:
                    log.error("Missing required fields for vplatform '{0}': {1}".format(vpid, ", ".join(missing)))
                    continue
                map_file = rpath.join(vplatform_base_path, vp["ensg_map/path"], vp["ensg_map/file"])
                if not source_repo.exists(map_file):
                    log.error(
                        "Mapping file not found for vplatform ({0}, {1}): {2}".format(platform_id, study_id, map_file)
                    )
                    continue

        log.debug("Mapping file: {0}".format(map_file))

        m["platform_map_file"] = source_repo.url(map_file)

        # oncodrive results file
        repo, repo_path = rs.from_url(o["results_file"])
        local_path = repo.get_local(repo_path)

        # mapped oncodrive results
        m["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)
        # NOTE(review): this local path is handed to merge() but is never
        # passed to put_local()/close_local() -- confirm whether the gitools
        # file is meant to be stored in the repository.
        gitools_results_local_path = data_repo.create_local(gitools_results_path)

        mapping_path = rpath.join(results_base_path, mid + ".mapping.tsv.gz")
        m["mapping_file"] = data_repo.url(mapping_path)
        mapping_local_path = data_repo.create_local(mapping_path)

        # mkstemp() returns an already open descriptor; only the path is
        # needed here, so close it right away instead of leaking it.
        map_results_fd, map_results_file = tempfile.mkstemp(prefix="mrna_oncodrive_map_", suffix=".tsv")
        os.close(map_results_fd)

        # bound before the try so the finally clause can tell whether the
        # local copy of the mapping file was actually obtained
        local_map_file = None

        try:
            # run the mapping tool
            local_map_file = source_repo.get_local(map_file)

            log.debug("Mapping {0} to {1} ...".format(repo_path, map_results_file))

            cmd = " ".join(
                [
                    conf["bin_paths.python"],
                    conf["bin_paths.matrix_map"],
                    "-o",
                    map_results_file,
                    "-i",
                    mapping_local_path,
                    local_path,
                    local_map_file,
                ]
            )

            log.debug(cmd)

            retcode = subprocess.call(args=cmd, shell=True)

            if retcode != 0:
                raise Exception("There was an error mapping the results")

            # merge repeated ids

            log.debug("Merging {0} to {1} ...".format(map_results_file, results_path))
            log.debug("Gitools file: {0}".format(gitools_results_path))

            upreg_count, downreg_count = merge(log, map_results_file, results_local_path, gitools_results_local_path)
            if upreg_count == 0 and downreg_count == 0:
                log.error(
                    "The results of the mapping for ({0}) are empty. This could be because the annotated platform or the mapping file is wrong.".format(
                        ", ".join(key)
                    )
                )

            # close local paths
            data_repo.put_local(results_local_path)
            data_repo.put_local(mapping_local_path)

        except Exception as ex:
            log.exception(ex)

            data_repo.close_local(results_local_path)
            data_repo.close_local(mapping_local_path)
            continue

        finally:
            os.remove(map_results_file)
            repo.close_local(local_path)
            if local_map_file is not None:
                source_repo.close_local(local_map_file)

        # save mapped results
        em.persist(m, types.MRNA_ONCODRIVE_GENES)
        mapped_oncodrive_port.write(mid)

    em.close()
    es.close()
    data_repo.close()
    source_repo.close()
    rs.close()
예제 #5
0
def main():
	"""Calculate CNV oncodrive results (gain and loss) for tumour units.

	For every id read from the "evt_tumour_unit_ids" port the events matrix
	is filtered by bit mask (1 for gain, 2 for loss), run_oncodrive() is
	called for each condition, both results are joined in memory and written
	to the "data" repository. The resulting entity is persisted and its id
	written to the "oncodrive_results_ids" port.

	A failure while calculating a unit is logged and re-raised, which aborts
	the task; the matrix and the temporary directory are released first.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	evt_tumour_unit_port, oncodrive_results_port = \
		task.ports("evt_tumour_unit_ids", "oncodrive_results_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	log.info("Indexing available {} ...".format(types.CNV_ONCODRIVE_GENES))
	oncodrive_results_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_ONCODRIVE_GENES, unique = True)

	results_base_path = types.CNV_ONCODRIVE_GENES.replace(".", "/")

	for uid in evt_tumour_unit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
		if key in oncodrive_results_index:
			# reuse the results entity already registered for this key
			eid = oncodrive_results_index[key][0]
			e = em.find(eid, types.CNV_ONCODRIVE_GENES)
			if e is None:
				log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, eid))
				continue
		else:
			e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
			eid = e["id"] = str(uuid.uuid4())

		# create oncodrive results entity
		e["evt_tumour_unit_id"] = uid

		results_path = rpath.join(results_base_path, eid + ".tsv.gz")

		if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
			log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
			oncodrive_results_port.write(eid)
			continue

		e["results_file"] = data_repo.url(results_path)

		# data matrix for oncodrive calculation
		matrix_repo, matrix_path = rs.from_url(u["data_file"])

		# Gain & Loss

		log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))
		log.debug("{} id is {}".format(types.CNV_ONCODRIVE_GENES, eid))

		tmp_path = mkdtemp(prefix = "cnv_oncodrive_calc_")
		log.debug("Temporary directory: {}".format(tmp_path))
		tmp_file = os.path.join(tmp_path, "filtered_data.tsv")

		# bound before the try so the finally clause knows whether the local
		# copy of the matrix was obtained and must be released
		matrix_local_path = None

		try:
			matrix_local_path = matrix_repo.get_local(matrix_path)
			log.debug("Matrix path: {}".format(matrix_path))

			# The handlers below only log and re-raise: the matrix is released
			# exactly once, by the finally clause.
			try:
				log.info("Calculating Gain ...")
				log.debug("Bit mask filtering (01) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 1)
				gain_results = run_oncodrive(
					conf, log, e, "gain", tmp_file, tmp_path)
			except Exception:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for gain failed".format(",".join(key), uid))
				raise

			try:
				log.info("Calculating Loss ...")
				log.debug("Bit mask filtering (10) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 2)
				loss_results = run_oncodrive(
					conf, log, e, "loss", tmp_file, tmp_path)
			except Exception:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for loss failed".format(",".join(key), uid))
				raise

			# Join gain & loss results

			log.info("Joining gain & loss results into memory ...")

			# the join is done in memory with a map
			dmap = read_data_map(log, gain_results, loss_results)

			log.info("Writting joined data to {} ...".format(results_path))

			results_local_path = data_repo.create_local(results_path)

			write_data_map(dmap, results_local_path)

		finally:
			if matrix_local_path is not None:
				matrix_repo.close_local(matrix_local_path)
			matrix_repo.close()

			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		data_repo.put_local(results_local_path)

		em.persist(e, types.CNV_ONCODRIVE_GENES)
		oncodrive_results_port.write(eid)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
예제 #6
0
def run(task):
	"""Calculate log2 ratios of tumour assays against their normal pool.

	All ids are read from the "absi_tumour_unit_ids" port. For each absolute
	intensity assay yielded by iter_tumour_absi(), the normal pool matching
	(study, platform, normal counterpart topography) is loaded, the ratio
	``value - pool_value`` is computed per row, the result is written to the
	"data" repository and a log2r entity is persisted. Its id is written to
	the "log2r_ids" port.

	Assays whose file or normal pool cannot be found, whose header does not
	have two columns, or that contain a duplicated row are logged and
	skipped.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["absi_tumour_unit_ids"])
	task.check_out_ports(["log2r_ids"])

	absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
	log2r_port = task.ports["log2r_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	# Index normal pools by study, platform, topography
	log.debug("Indexing normal pools by study, platform and topography ...")
	pools_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_NORMAL_POOL, unique = True)

	# Index log2r assays by absi_id
	log.debug("Indexing log2r assays by absi assay ...")
	log2r_index = em.group_ids(
		["absi_id"],
		types.MRNA_LOG2R, unique = True)

	absi_tumour_unit_ids = absi_tumour_unit_port.read_all()

	log.info("Processing %i mrna absi tumour units ..." % len(absi_tumour_unit_ids))

	# every log2r data file lives under the same repository directory
	data_file_path = types.MRNA_LOG2R.replace(".", "/")

	# For each abs intensity assay
	# (the last pool loaded is kept so consecutive assays can share it)
	pool = None
	pool_data = {}
	for absi in iter_tumour_absi(conf, em, absi_tumour_unit_ids, log):

		absi_id = absi["id"]

		src_path = os.path.join(absi["data_file/path"], absi["data_file/name"])

		icdo_topography = absi["icdo_topography"]
		normal_counterpart = absi.get("normal_counterpart", icdo_topography)
		if icdo_topography != normal_counterpart:
			keystr = "(%s, %s, %s --> %s)" % (absi["study_id"], absi["platform_id"], icdo_topography, normal_counterpart)
		else:
			keystr = "(%s, %s, %s)" % (absi["study_id"], absi["platform_id"], icdo_topography)

		exists = (absi_id,) in log2r_index
		if exists:
			log2r_id = log2r_index[(absi_id,)][0]
		else:
			log2r_id = str(uuid.uuid4())

		data_file_name = log2r_id + ".tsv.gz"
		dst_path = os.path.join(data_file_path, data_file_name)

		if not overwrite and exists and data_repo.exists(dst_path):
			log.debug("Skipping calculation of log2r for tumour assay %s %s as it is already calculated" % (keystr, absi_id))
			log2r_port.write(log2r_id)
			continue

		log.info("Processing tumour assay %s %s from %s ..." % (keystr, absi_id, src_path))

		repo = rs.repository(absi["data_file/repo"])
		if not repo.exists(src_path):
			log.error("File not found: %s" % src_path)
			continue

		# Get normal counterpart data
		if pool is None \
			or absi["study_id"] != pool["study_id"] \
			or absi["platform_id"] != pool["platform_id"] \
			or normal_counterpart != pool["icdo_topography"]:

			pool_key = (absi["study_id"], absi["platform_id"], normal_counterpart)
			if pool_key not in pools_index:
				log.error("Normal pool not found for tumour assay (%s) %s {%s}" % (", ".join(pool_key), absi_id, absi.get("source_path", "")))
				continue

			pool_id = pools_index[pool_key][0]
			pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
			if pool is None:
				log.error("Normal pool %s not found by the entity manager !" % pool_id)
				continue

			pool_data = read_pool_data(conf, rs, pool, log)
			if pool_data is None:
				pool = None
				continue

		# Describe the pool actually in use: when it is reused from a previous
		# assay the lookup variables above may refer to a different pool.
		log.info("Using normal pool ({}) [{}]".format(
			", ".join((pool["study_id"], pool["platform_id"], pool["icdo_topography"])), pool["id"]))

		# Calculate log2 ratios
		mr = MatrixReader(repo.open_reader(src_path))
		header = mr.read_header()
		if len(header.columns) != 2:
			log.error("Unexpected number of columns: %i" % len(header.columns))
			mr.close()
			continue

		warn_count = {
			"id_not_in_pool" : 0,
			"value_is_nan" : 0,
			"pool_value_is_nan" : 0,
			"value_is_inf" : 0,
			"pool_value_is_inf" : 0}

		data = {}
		duplicated = False
		for row in mr:
			if row.name in data:
				log.error("Skipping tumour assay, duplicated row %s at file %s" % (row.name, src_path))
				duplicated = True
				break

			value = row.values[0]

			value_is_nan = numpy.isnan(value)

			if value_is_nan:
				warn_count["value_is_nan"] += 1
			elif numpy.isinf(value):
				warn_count["value_is_inf"] += 1

			if row.name not in pool_data:
				# no reference value: the ratio below becomes NaN
				pool_value = value = numpy.nan
				warn_count["id_not_in_pool"] += 1
			else:
				pool_value = pool_data[row.name]

			pool_value_is_nan = numpy.isnan(pool_value)
			if pool_value_is_nan:
				warn_count["pool_value_is_nan"] += 1
			elif numpy.isinf(pool_value):
				warn_count["pool_value_is_inf"] += 1

			if not value_is_nan and not pool_value_is_nan:
				log2r = value - pool_value
			else:
				log2r = numpy.nan

			# infinite ratios are dropped from the output
			if not numpy.isinf(log2r):
				data[row.name] = log2r

		mr.close()

		if duplicated:
			# really skip the assay, as announced in the error message,
			# instead of saving the rows read before the duplicate
			continue

		sb = ["{0}={1}".format(k, v) for k, v in warn_count.items() if v > 0]
		if len(sb) > 0:
			log.warn(", ".join(sb))

		# Save log2 ratios data and assay
		log2r = deepcopy(absi)

		log2r["id"] = log2r_id
		log2r["absi_id"] = absi_id
		log2r["normal_pool_id"] = pool["id"]

		log2r["data_file/repo"] = data_repo.name()
		log2r["data_file/path"] = data_file_path
		log2r["data_file/name"] = data_file_name

		msg = "Overwritting" if exists else "Writting"
		log.debug("%s log2 ratio data to %s ..." % (msg, dst_path))

		mw = MatrixWriter(data_repo.open_writer(dst_path))
		mw.write_header(["id", "value"])
		for name, value in sorted(data.items()):
			mw.write(name, [value])
		mw.close()

		em.persist(log2r, types.MRNA_LOG2R)
		log2r_port.write(log2r_id)

	em.close()
	es.close()

	data_repo.close()
	rs.close()
예제 #7
0
def run(task):
    """Calculate mRNA oncodrive results (upreg and downreg) for probes.

    For every id read from the "log2r_tumour_unit_ids" port the log2r matrix
    and the matching log2r cutoff are loaded, run_oncodrive() is called for
    upregulation ("gt" the upreg cutoff) and downregulation ("lt" the downreg
    cutoff), both results are joined in memory and written to the "data"
    repository. The resulting entity is persisted and its id written to the
    "oncodrive_results_ids" port.

    Units without entity or cutoff are logged and skipped. A failure during
    the calculation is logged and re-raised, which aborts the task; the
    matrix and the temporary directory are released first.
    """
    import shutil
    from tempfile import mkdtemp

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["log2r_tumour_unit_ids"])
    task.check_out_ports(["oncodrive_results_ids"])

    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
    oncodrive_results_port = task.ports["oncodrive_results_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Indexing available oncodrive results for probes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ONCODRIVE_PROBES,
        unique=True)

    log.info("Indexing available mrna log2r cutoffs ...")
    log2r_cutoff_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_CUTOFF,
        unique=True)

    results_base_path = types.MRNA_ONCODRIVE_PROBES.replace(".", "/")

    for log2r_unit_id in log2r_tumour_unit_port:
        u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT,
                                                log2r_unit_id))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"],
               u["icdo_morphology"])
        if key in oncodrive_results_index:
            # reuse the results entity already registered for this key
            eid = oncodrive_results_index[key][0]
            e = em.find(eid, types.MRNA_ONCODRIVE_PROBES)
            if e is None:
                log.error("{} not found: {}".format(
                    types.MRNA_ONCODRIVE_PROBES, eid))
                continue
        else:
            e = u.transform([
                "study_id", "platform_id", "icdo_topography", "icdo_morphology"
            ])
            eid = e["id"] = str(uuid.uuid4())

        log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(
            types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))
        log.debug("{} id is {}".format(types.MRNA_ONCODRIVE_PROBES, eid))

        # create oncodrive results entity
        e["log2r_tumour_unit_id"] = log2r_unit_id

        results_path = rpath.join(results_base_path, eid + ".tsv.gz")

        if skip_file(overwrite, data_repo, results_path,
                     e.get("results_file")):
            log.warn("Skipping ({}) [{}] as it already exists".format(
                ", ".join(key), eid))
            oncodrive_results_port.write(eid)
            continue

        e["results_file"] = data_repo.url(results_path)

        # data matrix for oncodrive calculation
        file_repo = u["data_file/repo"]
        matrix_repo = rs.repository(file_repo)

        file_path = u["data_file/path"]
        file_name = u["data_file/file"]
        matrix_path = os.path.join(file_path, file_name)

        # Load calculated cutoff

        log.info("Loading mrna cutoff for key ({}) ...".format(", ".join(key)))

        if key not in log2r_cutoff_index:
            log.error("mrna log2r cuttof not found for key ({})".format(
                ", ".join(key)))
            matrix_repo.close()
            continue

        cutoff_id = log2r_cutoff_index[key][0]
        cutoff = em.find(cutoff_id, types.MRNA_LOG2R_CUTOFF)
        if cutoff is None:
            log.error("mrna log2r cuttof for key ({}) [{}] couldn't be loaded".
                      format(", ".join(key), cutoff_id))
            matrix_repo.close()
            continue

        log.debug("{} id is {}".format(types.MRNA_LOG2R_CUTOFF, cutoff_id))

        # Upregulation & downregulation

        # Both are bound before the try so the finally clause can tell which
        # resources were actually acquired, instead of failing with a
        # NameError that would hide the original error.
        tmp_path = None
        matrix_local_path = None

        try:
            tmp_path = mkdtemp(prefix="mrna_oncodrive_calc_")
            log.debug("Temporary directory: {}".format(tmp_path))

            matrix_local_path = matrix_repo.get_local(matrix_path)
            log.debug("Matrix path: {}".format(matrix_path))

            # The handlers below only log and re-raise: the matrix is
            # released exactly once, by the finally clause.
            try:
                log.info("Calculating Upregulation with cutoff {} ...".format(
                    cutoff["upreg/cutoff"]))
                upreg_results = run_oncodrive(conf, log, e, "upreg",
                                              matrix_local_path, "gt",
                                              cutoff["upreg/cutoff"], tmp_path)
            except Exception:
                log.error("Oncodrive calculation for upreg failed")
                raise

            try:
                log.info(
                    "Calculating Downregulation with cutoff {} ...".format(
                        cutoff["downreg/cutoff"]))
                downreg_results = run_oncodrive(
                    conf, log, e, "downreg", matrix_local_path, "lt",
                    cutoff["downreg/cutoff"], tmp_path)
            except Exception:
                log.error("Oncodrive calculation for downreg failed")
                raise

            # Join upreg & downreg results

            log.info("Joining upreg & downreg results into memory ...")

            # the join is done in memory with a map
            dmap = read_data_map(log, upreg_results, downreg_results)

            log.info("Writting joined results to {} ...".format(results_path))

            results_local_path = data_repo.create_local(results_path)

            write_data_map(dmap, results_local_path)

        finally:
            if matrix_local_path is not None:
                matrix_repo.close_local(matrix_local_path)
            matrix_repo.close()

            if tmp_path is not None and os.path.exists(tmp_path):
                log.debug(
                    "Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        data_repo.put_local(results_local_path)

        em.persist(e, types.MRNA_ONCODRIVE_PROBES)
        oncodrive_results_port.write(eid)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
예제 #8
0
def main():
	"""Compute the CNV combination results for every incoming combination.

	Each item read from the ``combinations`` port is converted with
	``DataFactory.from_native`` and handed to ``combination()``, which is
	given ``<CNV_COMBINATION path>/<id>.tsv.gz`` in the ``data`` repository
	as results path. The entity is then persisted as
	``types.CNV_COMBINATION`` and its id is written to the
	``combination_ids`` port.

	When ``skip_file`` returns a true value the combination is not
	recomputed (it is logged as already existing), but its id is still
	forwarded to the output port.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "bin_paths.gitools"])
	conf = task.conf
	log = task.logger()

	combinations_port, combination_ids_port = task.ports(
		"combinations", "combination_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Repository folder for the results, derived from the entity type name
	results_base_path = types.CNV_COMBINATION.replace(".", "/")

	conditions = ("gain", "loss")

	# NOTE: two earlier steps are disabled here: looking up the related
	# CNV_ONCODRIVE_GENES entity, and a shortcut that reused the single input
	# file when the combination had only one. Every combination is computed.
	for native in combinations_port:
		combo = DataFactory.from_native(native, key_sep = "/")

		combo_id = combo["id"]

		# Human readable identification used in the log messages
		label = ", ".join(
			(combo["icdo_topography"], combo["icdo_morphology"], combo["id_type"]))

		log.info("Processing combination for ({}) [{}] ...".format(label, combo_id))

		results_path = rpath.join(results_base_path, combo_id + ".tsv.gz")
		results_url = data_repo.url(results_path)

		if skip_file(overwrite, data_repo, results_path, combo.get("results_file")):
			log.warn("Skipping {} ({}) [{}] as it already exists".format(types.CNV_COMBINATION, label, combo_id))
		else:
			combo["results_file"] = results_url

			combination(log, conf, rs, combo, data_repo, results_path, conditions)

			# save combination results
			em.persist(combo, types.CNV_COMBINATION)

		# The id is forwarded whether the results were computed or reused
		combination_ids_port.write(combo_id)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
예제 #9
0
def _load_background_genes(rs, background_file):
	"""Return the set of non-empty, right-stripped lines of the background file.

	The reader and the repository are closed even if reading fails.
	"""
	background = set()
	repo, path = rs.from_url(background_file)
	try:
		reader = repo.open_reader(path)
		try:
			for line in reader:
				line = line.rstrip()
				if line:
					background.add(line)
		finally:
			reader.close()
	finally:
		repo.close()
	return background


def _run_intersect_bed(log, conf, mapping_local_path, events_local_path, output_path, key, eid):
	"""Run BED tools intersectBed and redirect its output to output_path.

	Raises:
		Exception: if the command exits with a non-zero code.
	"""
	cmd = " ".join([
		os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
		"-a", mapping_local_path,
		"-b", events_local_path,
		"-s -wb",
		">{}".format(output_path)])

	log.debug(cmd)

	# NOTE(review): the command is a single string run through the shell (the
	# output redirection depends on it), so paths containing spaces or shell
	# metacharacters would break it.
	retcode = subprocess.call(args = cmd, shell = True)

	if retcode != 0:
		raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))


def _merge_intersect_values(log, reader, eid, data_file, data):
	"""Merge the values read from an intersectBed output into data.

	data maps (event id, name) to the bitwise OR of all the values seen for
	that pair. Lines that cannot be parsed, or whose value is not 1 or 2, are
	logged and skipped.
	"""
	# Column indices of the name and the value in the intersectBed output
	name_index = 3
	value_index = 12

	# enumerate keeps the reported line number right even after skipped lines
	for line_num, line in enumerate(reader, 1):
		fields = line.rstrip().split("\t")
		try:
			name = fields[name_index]
			value = int(fields[value_index])
		except (IndexError, ValueError):
			log.error("Error parsing line {} of data file {}".format(line_num, data_file))
			continue

		if value not in (1, 2):
			log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
			continue

		k = (eid, name)
		data[k] = data.get(k, 0) | value


def main():
	"""Map the CNV events of each tumour unit to genes and join them in a matrix.

	For every id read from the ``evt_tumour_unit_ids`` port the data file of
	each of its CNV events is intersected with the gene regions file
	(``cnv.mapping.ensg``) using BED tools. The resulting values are written
	as a tab separated matrix with one row per background name
	(``cnv.background.ensg``, sorted) and one column per event id, using 0
	where there is no value. The tumour unit is then persisted and its id is
	written to the ``joined_evt_tumour_unit_ids`` port. Units for which
	``skip_file`` returns a true value are forwarded without being processed.

	Raises:
		Exception: if intersectBed fails for any event.
	"""

	# Initialization

	task.check_conf(["entities", "repositories",
		"cnv.background.ensg", "cnv.mapping.ensg",
		"bin_paths.bed_tools"])

	conf = task.conf

	log = task.logger()

	evt_tunit_port, joined_evt_tunit_port = \
		task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	mapping_file = conf["cnv.mapping.ensg"]
	log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
	mapping_repo, mapping_path = rs.from_url(mapping_file)
	mapping_local_path = mapping_repo.get_local(mapping_path)

	try:
		background_file = conf["cnv.background.ensg"]
		log.info("Loading background from {} ...".format(background_file))

		background = _load_background_genes(rs, background_file)

		# Both are the same for every tumour unit, so they are computed once
		background_rows = sorted(background)
		tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")

		for uid in evt_tunit_port:
			u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
			if u is None:
				log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
				continue

			key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

			tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

			if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
				log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
				joined_evt_tunit_port.write(uid)
				continue

			log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

			cnv_evt_ids = u["cnv_evt_ids"]
			log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

			# (event id, name) -> OR of the values found in the intersection
			data = {}

			tmp_path = mkdtemp(prefix = "evt_map_and_join_")
			log.debug("Temporary directory: {}".format(tmp_path))

			try:
				for eid in cnv_evt_ids:
					e = em.find(eid, types.CNV_EVENTS)
					if e is None:
						log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
						continue

					data_file = e["data_file"]

					log.debug("{} ...".format(data_file))

					repo, path = rs.from_url(data_file)
					try:
						local_path = repo.get_local(path)

						# Run BED tools to intersect event regions with gene names
						# (the event file is used as it is; an earlier fix-up of
						# its end coordinates is disabled)

						tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

						try:
							_run_intersect_bed(log, conf, mapping_local_path,
								local_path, tmp_file2, key, eid)
						finally:
							# release the local copy also when the command fails
							repo.close_local(local_path)

						# Read BED tools results and load event data into memory

						reader = FileReader(tmp_file2)
						try:
							_merge_intersect_values(log, reader, eid, data_file, data)
						finally:
							reader.close()
					finally:
						repo.close()

			finally:
				if os.path.exists(tmp_path):
					log.debug("Removing temporary directory {} ...".format(tmp_path))
					shutil.rmtree(tmp_path)

			# Write events data to data file and merge with background labels

			log.info("Writing data to {} ...".format(tunit_path))

			u["data_file"] = data_repo.url(tunit_path)
			#TODO u["data_timestamp"] = ...

			writer = data_repo.open_writer(tunit_path)

			# header
			for name in cnv_evt_ids:
				writer.write("\t")
				writer.write(name)
			writer.write("\n")

			# data
			for row_name in background_rows:
				writer.write(row_name)
				for col_name in cnv_evt_ids:
					writer.write("\t")
					writer.write(str(data.get((col_name, row_name), 0)))
				writer.write("\n")

			writer.close()

			log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
			em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
			joined_evt_tunit_port.write(uid)

	finally:
		# Release everything also when a tumour unit fails
		em.close()
		es.close()

		mapping_repo.close_local(mapping_local_path)
		mapping_repo.close()
		data_repo.close()
		rs.close()
예제 #10
0
def main():
    """Export mRNA oncodrive gene results into the biomart ``exp_gene_trs`` table.

    The table is created if it does not exist and locked for writing. For
    every id read from the ``id`` port the ``types.MRNA_ONCODRIVE_GENES``
    entity is looked up and the rows of its results file are inserted, one
    row per gene, using the columns listed in ``__COLUMN_NAMES``. Entities
    that cannot be found, have no results file, or whose ICDO or experiment
    is not in the database are logged and skipped, and so are results files
    that cannot be opened or lack one of the expected columns. Gene names
    missing from ``ent_gene`` are reported in a warning.
    """
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)

    cursor = conn.cursor()

    # Lookup tables used below to translate names and keys into database ids
    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute(
        """
		CREATE TABLE IF NOT EXISTS exp_gene_trs (
		  gene_id int(11) NOT NULL,
		  icdo_id int(11) NOT NULL,
		  exp_id int(11) NOT NULL,
		  upreg_total int(11) DEFAULT NULL,
		  upreg_observed double DEFAULT NULL,
		  upreg_expected double DEFAULT NULL,
		  upreg_stdev double DEFAULT NULL,
		  upreg_pvalue double DEFAULT NULL,
		  upreg_cpvalue double DEFAULT NULL,
		  downreg_total int(11) DEFAULT NULL,
		  downreg_observed double DEFAULT NULL,
		  downreg_expected double DEFAULT NULL,
		  downreg_stdev double DEFAULT NULL,
		  downreg_pvalue double DEFAULT NULL,
		  downreg_cpvalue double DEFAULT NULL,
		  PRIMARY KEY (gene_id,icdo_id,exp_id),
		  KEY icdo (icdo_id,exp_id),
		  KEY exp (exp_id),
		  CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
		  CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
		  CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(
            db_engine
        )
    )

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    # Rows inserted since the table was last optimized (see the end of the loop)
    lock_count = 0

    for eid in oncodrive_port:
        e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology)

        log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(
            cursor,
            "exp_gene_trs",
            [
                "gene_id",
                "icdo_id",
                "exp_id",
                "upreg_total",
                "upreg_observed",
                "upreg_expected",
                "upreg_stdev",
                "upreg_pvalue",
                "upreg_cpvalue",
                "downreg_total",
                "downreg_observed",
                "downreg_expected",
                "downreg_stdev",
                "downreg_pvalue",
                "downreg_cpvalue",
            ],
            insert_size,
        )

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header
        hdr = reader.readline().rstrip().split("\t")
        hdr_map = {name: i for i, name in enumerate(hdr)}

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            # named "ex" so that the entity "e" is neither rebound nor unbound
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            lock_count += ib.count
            ib.close()
            results_repo.close()
            continue

        skipped_genes = set()

        # read data
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            gene_name = data[0]
            data = [data[i] for i in col_indices]
            if gene_name not in gene:
                skipped_genes.add(gene_name)
                continue

            gene_id = gene[gene_name]

            ib.insert(gene_id, icdo_id, exp_id, *data)

        if len(skipped_genes) > 0:
            log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes)))

        log.debug("{} gene results inserted".format(ib.count))

        lock_count += ib.count

        ib.close()
        reader.close()
        # the error paths above already close the repository; do it here too
        results_repo.close()

        # After a million inserted rows release the lock, optimize the table
        # and lock it again
        if lock_count >= 1000000:
            cursor.execute("UNLOCK TABLES")
            cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
            cursor.execute("LOCK TABLES exp_gene_trs WRITE")
            lock_count = 0

    cursor.execute("UNLOCK TABLES")
    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
    cursor.close()
    conn.close()

    em.close()
    es.close()
    rs.close()
예제 #11
0
def _load_icdo_codes(rs, url):
	"""Return the result of map_from_file for the file at the repository url.

	The local copy and the repository are released even if loading fails.
	"""
	repo, path = rs.from_url(url)
	try:
		local_path = repo.get_local(path)
		try:
			return map_from_file(local_path)
		finally:
			# release using the local path returned by get_local
			repo.close_local(local_path)
	finally:
		repo.close()


def _describe_icdo_code(log, code, names, kind):
	"""Return the (name, description) pair for an ICDO code.

	names maps codes to names and kind ("topography" or "morphology") is only
	used to build the messages. An empty code stands for any value; a code
	missing from names is logged and gets an empty name.
	"""
	if code == "":
		any_label = "ANY {}".format(kind)
		return any_label, any_label

	if code not in names:
		log.error("Unknown {} description for code {}".format(kind, code))
		return "", "[{}]".format(code)

	name = names[code]
	return name, "{} [{}]".format(name, code)


def main():
	"""Create the biomart ``ent_icdo`` table and fill it from the ``icdo`` port.

	Every item read from the port provides a topography code (index 0) and a
	morphology code (index 1). Their names are taken from the files
	configured in ``biomart.files.icdo_topography`` and
	``biomart.files.icdo_morphology``, and one row per item is inserted with
	consecutive ids starting at 1.
	"""
	task.check_conf(["entities", "repositories", "biomart.db",
		"biomart.files.icdo_topography", "biomart.files.icdo_morphology"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	log = task.logger()

	icdo_port = task.ports("icdo")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	log.info("Loading topography codes from {} ...".format(conf["biomart.files.icdo_topography"]))
	icdo_topography = _load_icdo_codes(rs, conf["biomart.files.icdo_topography"])

	log.info("Loading morphology codes from {} ...".format(conf["biomart.files.icdo_morphology"]))
	icdo_morphology = _load_icdo_codes(rs, conf["biomart.files.icdo_morphology"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	cursor = conn.cursor()

	cursor.execute("""
		CREATE TABLE  ent_icdo (
		  id int(11) NOT NULL,
		  icdo_name varchar(512) NOT NULL DEFAULT '',
		  icdo_topography varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology varchar(255) NOT NULL DEFAULT '',
		  icdo_topography_code varchar(24) NOT NULL DEFAULT '',
		  icdo_morphology_code varchar(24) NOT NULL DEFAULT '',
		  icdo_topography_name varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology_name varchar(255) NOT NULL DEFAULT '',
		  PRIMARY KEY (id),
		  KEY icdo_name (icdo_name),
		  KEY icdo_tm (icdo_topography,icdo_morphology),
		  KEY icdo_m (icdo_morphology),
		  KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code),
		  KEY icdo_m_c (icdo_morphology_code)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

	ib = BatchInsert(cursor, "ent_icdo",
			["id", "icdo_name", "icdo_topography", "icdo_topography_code", "icdo_topography_name",
				"icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"], insert_size)

	for i, tm in enumerate(icdo_port, 1):
		t_code = tm[0]
		t_name, t_desc = _describe_icdo_code(log, t_code, icdo_topography, "topography")

		m_code = tm[1]
		m_name, m_desc = _describe_icdo_code(log, m_code, icdo_morphology, "morphology")

		name = "; ".join((t_desc, m_desc))

		log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc))

		ib.insert(i, name, t_desc, t_code, t_name, m_desc, m_code, m_name)

	log.debug("{} ICDO terms inserted".format(ib.count))

	ib.close()
	cursor.close()
	conn.close()
	em.close()
	es.close()
	rs.close()