def run(task):
    """Write the id of every source study to the "study_ids" output port.

    Args:
        task: task object providing ``conf``, ``logger()``,
            ``check_out_ports()`` and ``ports``.

    Returns:
        0 once every id has been written.
    """
    # Initialization
    conf = task.conf
    log = task.logger()

    task.check_out_ports(["study_ids"])
    study_ids_port = task.ports["study_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Run
    try:
        log.info("Reading studies ...")

        count = 0
        for study_id in em.find_ids(types.SOURCE_STUDY):
            study_ids_port.write(study_id)
            count += 1

        log.info("{0} studies found".format(count))
    finally:
        # Release the entity store even when reading fails.
        em.close()
        es.close()

    return 0
def main():
    """Write the id of every entity of the configured type to the "id" port.

    The entity type is read from the "etype" configuration key.  Uses the
    module-level ``task`` object.

    Returns:
        0 once every id has been written.
    """
    # Initialization
    conf = task.conf
    etype = conf["etype"]
    log = task.logger()

    port = task.ports["id"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Run
    try:
        log.info("Reading '{0}' ...".format(etype))

        count = 0
        for entity_id in em.find_ids(etype):
            port.write(entity_id)
            count += 1

        log.info("{0} entities found".format(count))
    finally:
        # Release the entity store even when reading fails.
        em.close()
        es.close()

    return 0
def run(task):
    """Ensure a collection and a data directory exist for every CNV type.

    Every attribute of ``types`` whose name starts with "CNV_" is treated as
    an entity type name: its collection is created if missing and a matching
    directory (dots replaced by slashes) is created in the "data" repository.

    Args:
        task: task object providing ``conf`` and ``logger()``.

    Returns:
        0 on completion.
    """
    # Initialization
    settings = task.conf
    logger = task.logger()

    entity_server = EntityServer(settings["entities"])
    entity_manager = entity_server.manager()

    repo_server = RepositoryServer(settings["repositories"])
    data_repo = repo_server.repository("data")

    # Run
    for attr_name, type_name in vars(types).items():
        if not attr_name.startswith("CNV_"):
            continue

        logger.info("Preparing '{0}' ...".format(type_name))
        entity_manager.ensure_collection_exists(type_name)

        data_dir = rpath.absolute(type_name.replace(".", "/"))
        logger.debug("\tData: {0}".format(data_dir))
        data_repo.mkdir_if_not_exists(data_dir)

    entity_manager.close()
    entity_server.close()
    data_repo.close()
    repo_server.close()

    return 0
def run(task):
    """Copy mrna log2r source assays into mrna log2r assays.

    For every id read from the "log2r_source_ids" port the source entity is
    loaded and copied into a log2r entity, reusing the id of an existing
    log2r entity with the same (study, platform, sample, topography,
    morphology) key.  Existing log2r entities that carry an "absi_id" are
    left untouched.  The id of every persisted entity is written to the
    "log2r_ids" port.

    Args:
        task: task object providing ``conf``, ``logger()``, port checks and
            ``ports``.
    """
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    task.check_in_ports(["log2r_source_ids"])
    task.check_out_ports(["log2r_ids"])

    log2r_source_port = task.ports["log2r_source_ids"]
    log2r_port = task.ports["log2r_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Run
    try:
        log.info("Creating indices for mrna log2r assays ...")
        log2r_index = em.group_ids(
            ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
            types.MRNA_LOG2R, unique=True)

        for log2r_source_id in log2r_source_port:
            s = em.find(log2r_source_id, types.MRNA_LOG2R_SOURCE)
            if s is None:
                log.error("%s not found: %s" % (types.MRNA_LOG2R_SOURCE, log2r_source_id))
                continue

            key = (s["study_id"], s["platform_id"], s["sample_id"],
                   s["icdo_topography"], s.get("icdo_morphology", ""))

            if key in log2r_index:
                log2r_id = log2r_index[key][0]
                log2r = em.find(log2r_id, types.MRNA_LOG2R)
                if log2r is None:
                    log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
                    continue

                # An entity with an absi_id was calculated from absolute
                # intensities and must not be overwritten by a source copy.
                if "absi_id" in log2r:
                    log.debug("Not copying log2r %s already calculated from absi %s"
                              % (log2r_source_id, log2r["absi_id"]))
                    continue
            else:
                log2r_id = str(uuid.uuid4())

            log2r = s.transform([
                "study_id", "platform_id", "sample_id",
                "icdo_topography", "icdo_morphology",
                "data_file/repo", "data_file/path", "data_file/name"])

            log2r["id"] = log2r_id
            log2r["log2r_source_id"] = log2r_source_id

            log.debug("Persisting log2r assay %s ..." % log2r_id)
            em.persist(log2r, types.MRNA_LOG2R)
            log2r_port.write(log2r_id)
    finally:
        # The original closed only the manager; the server is released too.
        em.close()
        es.close()
def main():
    """Run the R script on the data file of every mrna log2r tumour unit.

    Ids are read from the "mrna_normal_pool" port.  For each unit found, its
    data file is made available locally and the R interpreter configured in
    "bin_paths.R" is invoked with the results base path, the unit id and the
    local data path as arguments.

    Uses the module-level ``task`` object.
    NOTE(review): ``script`` is not defined in this function; it is presumably
    a module-level path to the R script -- confirm.

    Raises:
        Exception: when the R script exits with a non-zero status.
    """
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    id_port = task.ports("mrna_normal_pool")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    results_base_path = "reports/" + types.CNV_COMBINATION.replace(".", "/")

    # Run
    try:
        for unit_id in id_port:
            # The original looked up the undefined name `oid` here.
            e = em.find(unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
            if e is None:
                log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT, unit_id))
                continue

            repo, data_path = rs.from_url(e["data_file"])
            try:
                data_local_path = repo.get_local(data_path)
                try:
                    cmd = " ".join([
                        conf["bin_paths.R"],
                        "--vanilla --slave -f", script,
                        "--args", results_base_path, unit_id, data_local_path])

                    log.debug(cmd)

                    retcode = subprocess.call(args=cmd, shell=True)
                    if retcode != 0:
                        raise Exception("R script failed")
                finally:
                    # Release the local copy even when the script fails.
                    repo.close_local(data_local_path)
            finally:
                repo.close()
    finally:
        em.close()
        es.close()
        rs.close()
def main():
    """Task skeleton: set up the entity store and release it again.

    Checks the "entities" configuration, requests the logger and the
    "mrna_normal_pool" port, opens the entity manager and closes it without
    doing any work.  Uses the module-level ``task`` object.
    """
    # Initialization
    task.check_conf(["entities"])
    settings = task.conf

    task.logger()                    # requested but not used
    task.ports("mrna_normal_pool")   # requested but never read

    entity_server = EntityServer(settings["entities"])
    entity_manager = entity_server.manager()

    # Run: nothing to do
    entity_manager.close()
    entity_server.close()
def main():
    """Hand mrna normal pools and log2r tumour units to ``extract_and_send``.

    Normal pools go to the "mrna_normal_pool" port first, then log2r tumour
    units go to the "mrna_log2r_tunit" port.  Uses the module-level ``task``
    object.
    """
    # Initialization
    task.check_conf(["entities"])
    settings = task.conf
    logger = task.logger()

    port_names = ["mrna_log2r_tunit", "mrna_normal_pool"]
    tunit_port, pool_port = task.ports(port_names)

    entity_server = EntityServer(settings["entities"])
    entity_manager = entity_server.manager()

    # Run
    # mrna preprocessing
    extract_and_send(logger, entity_manager, types.MRNA_NORMAL_POOL, pool_port)
    extract_and_send(logger, entity_manager, types.MRNA_LOG2R_TUMOUR_UNIT, tunit_port)

    entity_manager.close()
    entity_server.close()
def _load_icdo_codes(rs, url):
    """Return the map read by ``map_from_file`` from the repository file at *url*."""
    repo, path = rs.from_url(url)
    local_path = repo.get_local(path)
    try:
        return map_from_file(local_path)
    finally:
        # NOTE(review): the original passed the repository path here, while
        # the other tasks in this file pass the path returned by get_local();
        # made consistent -- confirm against the repository API.
        repo.close_local(local_path)
        repo.close()


def _describe_icdo_code(log, code, names, kind):
    """Return (name, description) for an ICDO *code* of the given *kind*.

    An empty code stands for "ANY <kind>"; a code missing from *names* is
    logged as an error and described by the bare code in brackets.
    """
    if code == "":
        name = desc = "ANY {}".format(kind)
    elif code not in names:
        log.error("Unknown {} description for code {}".format(kind, code))
        name = ""
        desc = "[{}]".format(code)
    else:
        name = names[code]
        desc = "{} [{}]".format(name, code)
    return name, desc


def main():
    """Create the ``ent_icdo`` biomart table and fill it from the "icdo" port.

    Topography and morphology code maps are loaded from the files configured
    in "biomart.files.icdo_topography" and "biomart.files.icdo_morphology".
    Every item read from the port is used as a (topography code, morphology
    code) pair and inserted with a running id starting at 1.

    Uses the module-level ``task`` object.
    """
    task.check_conf(["entities", "repositories", "biomart.db",
                     "biomart.files.icdo_topography", "biomart.files.icdo_morphology"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    log = task.logger()

    icdo_port = task.ports("icdo")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    log.info("Loading topography codes from {} ...".format(conf["biomart.files.icdo_topography"]))
    icdo_topography = _load_icdo_codes(rs, conf["biomart.files.icdo_topography"])

    log.info("Loading morphology codes from {} ...".format(conf["biomart.files.icdo_morphology"]))
    icdo_morphology = _load_icdo_codes(rs, conf["biomart.files.icdo_morphology"])

    conn = biomart_db_connect(conf["biomart.db"], log)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    cursor = conn.cursor()

    cursor.execute("""
        CREATE TABLE ent_icdo (
          id int(11) NOT NULL,
          icdo_name varchar(512) NOT NULL DEFAULT '',
          icdo_topography varchar(255) NOT NULL DEFAULT '',
          icdo_morphology varchar(255) NOT NULL DEFAULT '',
          icdo_topography_code varchar(24) NOT NULL DEFAULT '',
          icdo_morphology_code varchar(24) NOT NULL DEFAULT '',
          icdo_topography_name varchar(255) NOT NULL DEFAULT '',
          icdo_morphology_name varchar(255) NOT NULL DEFAULT '',
          PRIMARY KEY (id),
          KEY icdo_name (icdo_name),
          KEY icdo_tm (icdo_topography,icdo_morphology),
          KEY icdo_m (icdo_morphology),
          KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code),
          KEY icdo_m_c (icdo_morphology_code)
        ) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

    ib = BatchInsert(cursor, "ent_icdo",
                     ["id", "icdo_name",
                      "icdo_topography", "icdo_topography_code", "icdo_topography_name",
                      "icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"],
                     insert_size)

    for i, tm in enumerate(icdo_port, 1):
        t_code = tm[0]
        t_name, t_desc = _describe_icdo_code(log, t_code, icdo_topography, "topography")

        m_code = tm[1]
        m_name, m_desc = _describe_icdo_code(log, m_code, icdo_morphology, "morphology")

        name = "; ".join((t_desc, m_desc))

        log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc))

        ib.insert(i, name, t_desc, t_code, t_name, m_desc, m_code, m_name)

    log.debug("{} ICDO terms inserted".format(ib.count))

    ib.close()
    cursor.close()
    conn.close()
    em.close()
    es.close()
    rs.close()
def run(task):
    """Run module enrichment on mrna oncodrive gene results.

    Module configurations are read from "mrna.enrichment": each entry of
    "modules" is merged over the optional "default" element and kept only if
    it defines "id_type", "test" and "modules_file".  For every oncodrive id
    read from the "oncodrive_ids" port and every kept configuration, the
    enrichment is calculated (unless ``skip_file`` says the result can be
    reused) and the id of the enrichment entity is written to the
    "enrichment_ids" port.

    Args:
        task: task object providing ``conf``, ``logger()``, port checks and
            ``ports``.

    Returns:
        -1 when "mrna.enrichment" has no "modules" section, 0 when no module
        configuration is usable, None otherwise.
    """
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.data", "repositories.source",
                     "mrna.enrichment", "bin_paths.gitools"])
    conf = task.conf
    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["enrichment_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    enrichment_port = task.ports["enrichment_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    # try/finally so the early returns below no longer leak the connections.
    try:
        overwrite = conf.get("overwrite", False, dtype=bool)

        # retrieve enrichment configurations
        ec = conf["mrna.enrichment"]
        if "default" in ec:
            default = ec["default"]
        else:
            default = conf.create_element()

        if "modules" not in ec:
            log.error("There is no enrichment modules section available in mrna.enrichment")
            return -1

        log.info("Reading modules configuration ...")

        econfs = list()
        for mod in ec["modules"]:
            m = ec.create_element()
            m.merge(default)
            m.merge(mod)
            mf = m.missing_fields(["id_type", "test", "modules_file"])
            if len(mf) > 0:
                log.error("Enrichment configuration missing required fields: {}".format(", ".join(mf)))
                log.error("Module configuration: {}".format(m))
            else:
                econfs.append(m)
                log.debug("{} -> {}".format(m["id_type"], m["modules_file"]))

        if len(econfs) == 0:
            log.error("There are no enrichment configurations available in mrna.enrichment")
            return 0

        results_base_path = types.MRNA_ENRICHMENT.replace(".", "/")

        log.info("Indexing available enrichment results ...")
        enrichment_results_index = em.group_ids(
            ["study_id", "platform_id", "icdo_topography", "icdo_morphology", "id_type"],
            types.MRNA_ENRICHMENT, unique=True)

        for oid in oncodrive_port:
            o = em.find(oid, types.MRNA_ONCODRIVE_GENES)
            if o is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, oid))
                continue

            okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

            log.info("Enrichment for oncodrive results ({0}) [{1}] ...".format(", ".join(okey), oid))

            # `econf` instead of re-binding `ec`, which holds the whole section.
            for econf in econfs:
                log.info("Module {} [{}] ...".format(econf["id_type"], econf["modules_file"]))

                key = (o["study_id"], o["platform_id"], o["icdo_topography"],
                       o["icdo_morphology"], econf["id_type"])

                if key in enrichment_results_index:
                    eid = enrichment_results_index[key][0]
                    e = em.find(eid, types.MRNA_ENRICHMENT)
                    if e is None:
                        log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
                        continue
                else:
                    e = o.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
                    e["id"] = eid = str(uuid.uuid4())
                    e["id_type"] = econf["id_type"]

                # enrichment results
                results_path = rpath.join(results_base_path, eid + ".tsv.gz")

                if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
                    log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
                    enrichment_port.write(eid)
                    continue

                valid = enrichment(
                    log, conf, rs, data_repo, results_path, o["results_file"], e, econf,
                    ["id", "upreg_corrected_right_p_value", "downreg_corrected_right_p_value"],
                    ["id", "upreg", "downreg"])

                # save mapped results
                if valid:
                    em.persist(e, types.MRNA_ENRICHMENT)
                    enrichment_port.write(eid)
    finally:
        em.close()
        es.close()
        data_repo.close()
        rs.close()
def _persist_units(log, em, port, units, index, etype):
    """Persist one *etype* entity per group of absi ids and write its id to *port*.

    *units* maps (study_id, platform_id, icdo_topography) to a list of absi
    ids; *index* maps the same keys to lists of existing entity ids, whose
    first id is reused when present.
    """
    for k, v in sorted(units.items()):
        key = (k[0], k[1], k[2])
        exists = key in index
        if exists:
            uid = index[key][0]
        else:
            uid = str(uuid.uuid4())

        u = DataElement(key_sep="/")
        u["id"] = uid
        u["study_id"] = k[0]
        u["platform_id"] = k[1]
        u["icdo_topography"] = k[2]
        u["size"] = len(v)
        u["mrna_absi_ids"] = u.create_list(v)

        # "==>" marks an entity that already existed, "-->" a new one.
        arrow = "==>" if exists else "-->"
        log.debug("\t(%s) %s %s ..." % (", ".join(k), arrow, uid))

        em.persist(u, etype)
        port.write(uid)


def run(task):
    """Classify the source assays of the selected studies into mrna entities.

    Study ids are read from the "study_ids" port.  Every source assay of one
    of those studies is validated (required fields, sample, disease state,
    data file) and, if it is a transcriptomic assay with a supported
    (assay_design, data_type) pair, persisted as an absolute-intensity or
    log2r-source entity whose id is written to "absi_ids" or
    "log2r_source_ids".  Absolute-intensity assays are then grouped into
    tumour units and normal pools, which are persisted and written to
    "absi_tumour_unit_ids" and "normal_pool_ids".  A summary is logged.

    Args:
        task: task object providing ``conf``, ``logger()``, port checks and
            ``ports``.

    Returns:
        0 on completion.
    """
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf
    log = task.logger()

    task.check_in_ports(["study_ids"])
    task.check_out_ports(["absi_ids", "absi_tumour_unit_ids", "normal_pool_ids", "log2r_source_ids"])

    study_ids_port = task.ports["study_ids"]
    absi_port = task.ports["absi_ids"]
    absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
    normal_pool_port = task.ports["normal_pool_ids"]
    log2r_source_port = task.ports["log2r_source_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    # Run
    log.info("Creating indices for {} ...".format(types.MRNA_ABS_INTENSITY))
    absi_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ABS_INTENSITY, unique=True)

    log.info("Creating indices for {} ...".format(types.MRNA_LOG2R_SOURCE))
    log2r_src_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_SOURCE, unique=True)

    log.info("Creating indices for {} ...".format(types.MRNA_ABSI_TUMOUR_UNIT))
    absi_tumour_unit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_ABSI_TUMOUR_UNIT, unique=True)

    processed_studies = set()
    processed_assays = 0
    valid_assay_count = {}      # (study_id, platform_id) -> count
    skipped_assay_count = {}
    wrong_assays = {}           # study_id -> [assay_id, ...]
    wrong_samples = {}          # study_id -> [sample_id, ...]
    log2r_src_units = {}
    tumour_units = {}
    normal_pools = {}
    absi_dup = {}               # keys already written in this run
    log2r_source_dup = {}

    study_ids = study_ids_port.read_all()
    log.info("Processing %i studies ..." % len(study_ids))

    for assay in em.iter_all(types.SOURCE_ASSAY):
        assay_id = assay.get("id", "WITHOUT ID")
        log.debug("Reading assay %s ..." % assay_id)

        mf = assay.missing_fields([
            "id", "study_id", "platform_id", "sample_id",
            "assay_property/assay_design", "assay_property/data_type",
            "assay_property/study_type", "assay_property/filename"])

        assay_source_path = assay.get("source_path", "")

        if len(mf) > 0:
            study_id = assay.get("study_id", "WITHOUT ID")
            log.error("Assay %s in study %s missing required fields: %s {%s}"
                      % (assay_id, study_id, mf, assay_source_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        study_id = assay["study_id"]
        if study_id not in study_ids:
            log.debug("Assay %s not included in 'study_ids'" % assay_id)
            continue

        platform_id = assay["platform_id"]
        sample_id = assay["sample_id"]

        assay_design = assay["assay_property/assay_design"]
        data_type = assay["assay_property/data_type"]
        study_type = assay["assay_property/study_type"]

        e = assay.transform([
            ("assay_id", "id"),
            "study_id",
            "platform_id",
            "sample_id",
            "source_path",
            ("data_file/path", "source_path"),
            ("data_file/name", "assay_property/filename")])

        e["data_file/repo"] = assay.get("data_file/repo", "assay")

        # study_id is known to be selected at this point, so only the study
        # type and the (assay_design, data_type) pair decide inclusion.
        included = study_type == "transcriptomic" and (
            (assay_design == "cancer_and_normal" and data_type == "log_abs_readings")
            or (assay_design == "cancer_vs_normal" and data_type == "log2ratios"))

        if not included:
            if study_type != "genomic":
                s = ", ".join(["%s = %s" % (v[0], v[1]) for v in [
                    ("study_id", study_id), ("assay_design", assay_design),
                    ("data_type", data_type), ("study_type", study_type)]])
                log.warn("Skipping assay %s {%s}: %s." % (assay_id, assay_source_path, s))
                map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
            continue

        sample = em.find(sample_id, types.SOURCE_SAMPLE)
        if sample is None:
            log.error("Assay %s references a non-existent sample: %s" % (assay_id, sample_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
        if len(mf) > 0:
            sample_id = sample.get("id", "WITHOUT ID")
            sample_source_path = sample.get("source_path", "")
            log.error("Sample %s associated with assay %s in study %s missing required fields: %s {%s}"
                      % (sample_id, assay_id, study_id, mf, sample_source_path))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        sample = sample.transform([
            "id",
            ("source_path", "source_path"),
            ("disease_state", "basic_sample_details/disease_state"),
            ("normal_counterpart", "normal_counterpart_location/topography"),
            ("icdo_topography", "icdo/topography"),
            ("icdo_morphology", "icdo/morphology")])

        disease_state = sample["disease_state"]
        if disease_state not in disease_state_map:
            log.error("Unknown disease_state '%s' for sample %s {%s}"
                      % (disease_state, sample_id, sample.get("source_path", "")))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        disease_state = disease_state_map[disease_state]
        if disease_state not in ["tumour", "normal"]:
            continue

        e["disease_state"] = disease_state

        e["icdo_topography"] = sample["icdo_topography"]
        e["icdo_morphology"] = sample.get("icdo_morphology", "")
        if "normal_counterpart" in sample:
            e["normal_counterpart"] = sample["normal_counterpart"]

        repo = rs.repository(e["data_file/repo"])
        rel_path = os.path.join(e["data_file/path"], e["data_file/name"])

        if not repo.exists(rel_path):
            log.error("Assay %s in study %s missing data file: [%s]" % (assay_id, study_id, rel_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

        eid = None
        duplicated = False
        exists = False
        if data_type == "log_abs_readings":
            if key in absi_dup:
                duplicated = True
            elif key in absi_index:
                eid = absi_index[key][0]
                exists = True
        elif data_type == "log2ratios":
            if key in log2r_source_dup:
                duplicated = True
            elif key in log2r_src_index:
                eid = log2r_src_index[key][0]
                exists = True

        if duplicated:
            log.error("Duplicated key (%s) for assay %s" % (", ".join(key), assay_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        if eid is None:
            eid = str(uuid.uuid4())

        e["id"] = eid

        if disease_state == "normal":
            if data_type == "log2ratios":
                k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
                map_list_add(log2r_src_units, k, eid)
            elif data_type == "log_abs_readings":
                map_list_add(normal_pools, (study_id, platform_id, e["icdo_topography"]), eid)
            else:
                # The original message had two placeholders in the value
                # tuple for three values, which raised TypeError.
                log.error("Assay %s has an unexpected combination of (disease_state, assay_design, data_type): (%s, %s, %s)"
                          % (assay_id, disease_state, assay_design, data_type))
                map_list_add(wrong_assays, study_id, assay_id)
                continue
        elif disease_state == "tumour":
            k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
            if data_type == "log_abs_readings":
                map_list_add(tumour_units, k, eid)
            elif data_type == "log2ratios":
                map_list_add(log2r_src_units, k, eid)

        processed_studies.add(study_id)
        processed_assays += 1
        map_inc(valid_assay_count, (study_id, platform_id))

        msg = {True: "Overwritting", False: "Writting"}[exists]
        if data_type == "log_abs_readings":
            log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_ABS_INTENSITY, ", ".join(key), eid))
            em.persist(e, types.MRNA_ABS_INTENSITY)
            absi_port.write(eid)
            absi_dup[key] = eid
        elif data_type == "log2ratios":
            log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_LOG2R_SOURCE, ", ".join(key), eid))
            em.persist(e, types.MRNA_LOG2R_SOURCE)
            log2r_source_port.write(eid)
            log2r_source_dup[key] = eid

    log.info("Persisting mrna absi tumour units ...")
    _persist_units(log, em, absi_tumour_unit_port, tumour_units,
                   absi_tumour_unit_index, types.MRNA_ABSI_TUMOUR_UNIT)

    log.info("Creating indices for mrna normal pools ...")
    normal_pool_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_NORMAL_POOL, unique=True)

    log.info("Persisting mrna normal pools ...")
    _persist_units(log, em, normal_pool_port, normal_pools,
                   normal_pool_index, types.MRNA_NORMAL_POOL)

    sb = ["\n\nProcessed %i assays for %i studies (out of %i):\n\n"
          % (processed_assays, len(processed_studies), len(study_ids))]

    sb += ["%i mrna tumour units:\n\n" % (len(tumour_units))]
    for k, v in sorted(tumour_units.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\n%i mrna normal pools:\n\n" % (len(normal_pools))]
    for k, v in sorted(normal_pools.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\n%i mrna source log2r units:\n\n" % (len(log2r_src_units))]
    for k, v in sorted(log2r_src_units.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\nAssay counts by study and platform:\n\n"]
    for k, v in sorted(valid_assay_count.items()):
        sb += ["\t%s\t%i assays" % (k, v)]
        # k is (study_id, platform_id) but the failure maps are keyed by
        # study_id alone; the original looked up the tuple and never matched.
        # The failure counts shown are therefore per study.
        study_id = k[0]
        if study_id in wrong_assays:
            sb += ["\t%i failed assays" % len(wrong_assays[study_id])]
        if study_id in wrong_samples:
            sb += ["\t%i failed samples" % len(wrong_samples[study_id])]
        sb += ["\n"]

    log.info("".join(sb))

    if len(skipped_assay_count) > 0:
        log.info("Skipped assays:\n\n%s" % map_count_tostring(skipped_assay_count, indent=1))

    if len(wrong_assays) > 0:
        log.info("Summary of failed assays:\n\n%s" % map_list_tostring(wrong_assays))

    if len(wrong_samples) > 0:
        log.info("Summary of failed samples:\n\n%s" % map_list_tostring(wrong_samples))

    em.close()
    es.close()
    rs.close()

    return 0
def main():
    """Group CNV enrichment results into combinations and emit them.

    Enrichment ids are read from the "enrichment_ids" port and grouped by
    (icdo_topography, icdo_morphology, id_type).  For each group, an existing
    combination entity with the same key is reused or a new one is created,
    filled with the source ids and result files of the group, and written in
    native form to the "combinations" port.

    Processing stops (after logging an error) when an indexed combination
    cannot be loaded.  Uses the module-level ``task`` object.
    """
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    enrichment_port, combination_port = \
        task.ports("enrichment_ids", "combinations")

    es = EntityServer(conf["entities"])
    em = es.manager()

    # try/finally so the early return below no longer leaks the connections.
    try:
        log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
        comb_results_index = em.group_ids(
            ["icdo_topography", "icdo_morphology", "id_type"],
            types.CNV_COMBINATION, unique=True)

        classif = {}

        log.info("Classifying enrichment results ...")

        for eid in enrichment_port:
            e = em.find(eid, types.CNV_ENRICHMENT)
            if e is None:
                log.error("{} not found: {}".format(types.CNV_ENRICHMENT, eid))
                continue

            ekey = (e["study_id"], e["platform_id"], e["icdo_topography"],
                    e["icdo_morphology"], e["id_type"])
            key = (e["icdo_topography"], e["icdo_morphology"], e["id_type"])

            log.debug("Enrichment results ({}) [{}] classified into ({}) ...".format(
                ", ".join(ekey), eid, ", ".join(key)))

            classif.setdefault(key, []).append(e)

        log.info("Preparing combinations ...")

        for key in sorted(classif):
            if key in comb_results_index:
                cid = comb_results_index[key][0]
                c = em.find(cid, types.CNV_COMBINATION)
                if c is None:
                    log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
                    return
            else:
                c = DataElement(key_sep="/")
                c["id"] = cid = str(uuid.uuid4())
                c["icdo_topography"] = key[0]
                c["icdo_morphology"] = key[1]
                c["id_type"] = key[2]

            elist = classif[key]

            log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(elist)))

            ids = c.create_list()
            flist = c.create_list()

            for e in elist:
                ids += [e["id"]]
                flist += [e["results_file"]]

            c["source"] = src = c.create_element()
            src["type"] = types.CNV_ENRICHMENT
            src["ids"] = ids

            c["files"] = flist

            combination_port.write(c.to_native())
    finally:
        em.close()
        es.close()
def run(task):
    """Map mrna oncodrive probe results to genes.

    For every id read from the "oncodrive_ids" port the probe-level result is
    loaded, the platform (or study-specific virtual platform) mapping file is
    located in the "source" repository, the external matrix-map tool is run,
    repeated ids are merged with ``merge`` and the gene-level entity is
    persisted.  The id of every gene-level entity that is persisted or
    skipped as already available is written to "mapped_oncodrive_ids".

    Args:
        task: task object providing ``conf``, ``logger()``, port checks and
            ``ports``.
    """
    # Initialization
    task.check_conf(
        [
            "entities",
            "repositories",
            "repositories.data",
            "repositories.source",
            "bin_paths.python",
            "bin_paths.matrix_map",
        ]
    )
    conf = task.conf
    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["mapped_oncodrive_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    mapped_oncodrive_port = task.ports["mapped_oncodrive_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")
    source_repo = rs.repository("source")

    overwrite = conf.get("overwrite", False, dtype=bool)

    platform_base_path = "platform"
    vplatform_base_path = "vplatform"

    results_base_path = types.MRNA_ONCODRIVE_GENES.replace(".", "/")

    log.info("Indexing available oncodrive results for genes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ONCODRIVE_GENES, unique=True
    )

    for oid in oncodrive_port:
        o = em.find(oid, types.MRNA_ONCODRIVE_PROBES)
        if o is None:
            log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_PROBES, oid))
            continue

        study_id = o["study_id"]
        platform_id = o["platform_id"]
        key = (study_id, platform_id, o["icdo_topography"], o["icdo_morphology"])

        if key in oncodrive_results_index:
            mid = oncodrive_results_index[key][0]
            m = em.find(mid, types.MRNA_ONCODRIVE_GENES)
            if m is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, mid))
                continue
        else:
            m = o.transform(
                [
                    "study_id",
                    "platform_id",
                    "icdo_topography",
                    "icdo_morphology",
                    "log2r_tumour_unit_id",
                    ("oncodrive_probes_id", "id"),
                ]
            )
            m["id"] = mid = str(uuid.uuid4())

        # mapped oncodrive results
        results_path = rpath.join(results_base_path, mid + ".tsv.gz")
        gitools_results_path = rpath.join(results_base_path, mid + ".tdm.gz")

        if skip_file(overwrite, data_repo, results_path, m.get("results_file")):
            log.warn("Skipping ({0}) [{1}] as it already exists".format(", ".join(key), mid))
            mapped_oncodrive_port.write(mid)
            continue

        log.info("Mapping oncodriver results ({0}) [{1}] ...".format(", ".join(key), oid))

        # determine the mapping file
        map_file = None
        p = em.find(platform_id, types.SOURCE_PLATFORM)
        if p is None:
            log.error("{0} not found: {1}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        platform_id_type = p.get("SO/platform_id_type")
        if platform_id_type is None:
            log.error("Undefined annotation 'SO/platform_id_type' for platform '{0}'.".format(platform_id))
            continue
        elif platform_id_type != "genbank_accession":  # affy_accession, custom, ...
            missing = p.missing_fields(["ensg_map", "ensg_map/file"])
            if len(missing) > 0:
                log.error("Missing required fields for platform '{0}': {1}".format(platform_id, ", ".join(missing)))
                continue
            map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if not source_repo.exists(map_file):
                log.error("Mapping file not found for platform '{0}': {1}".format(platform_id, map_file))
                continue
        else:
            # genbank_accession: use the platform map when available,
            # otherwise the virtual platform for (platform, study).
            # The original had a further "unknown type" branch after this
            # one that could never be reached; it has been dropped.
            if len(p.missing_fields(["ensg_map", "ensg_map/file"])) > 0:
                map_file = None
            else:
                map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if map_file is None or not source_repo.exists(map_file):
                vpid = "-".join([platform_id, study_id])
                vp = em.find(vpid, types.SOURCE_VPLATFORM)
                if vp is None:
                    log.error("{0} not found: {1}".format(types.SOURCE_VPLATFORM, vpid))
                    continue
                missing = vp.missing_fields(["ensg_map", "ensg_map/path", "ensg_map/file"])
                if len(missing) > 0:
                    log.error("Missing required fields for vplatform '{0}': {1}".format(vpid, ", ".join(missing)))
                    continue
                map_file = rpath.join(vplatform_base_path, vp["ensg_map/path"], vp["ensg_map/file"])
                if not source_repo.exists(map_file):
                    log.error(
                        "Mapping file not found for vplatform ({0}, {1}): {2}".format(platform_id, study_id, map_file)
                    )
                    continue

        log.debug("Mapping file: {0}".format(map_file))

        m["platform_map_file"] = source_repo.url(map_file)

        # oncodrive results file
        repo, repo_path = rs.from_url(o["results_file"])
        local_path = repo.get_local(repo_path)

        # mapped oncodrive results
        m["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)
        # NOTE(review): this local path is never passed to put_local() or
        # close_local(), unlike the other two -- confirm whether intended.
        gitools_results_local_path = data_repo.create_local(gitools_results_path)

        mapping_path = rpath.join(results_base_path, mid + ".mapping.tsv.gz")
        m["mapping_file"] = data_repo.url(mapping_path)
        mapping_local_path = data_repo.create_local(mapping_path)

        # mkstemp returns an open descriptor; only the path is needed, so
        # close it right away instead of leaking one per iteration.
        fd, map_results_file = tempfile.mkstemp(prefix="mrna_oncodrive_map_", suffix=".tsv")
        os.close(fd)

        local_map_file = None
        try:
            # run the mapping tool
            local_map_file = source_repo.get_local(map_file)

            log.debug("Mapping {0} to {1} ...".format(repo_path, map_results_file))

            cmd = " ".join(
                [
                    conf["bin_paths.python"],
                    conf["bin_paths.matrix_map"],
                    "-o",
                    map_results_file,
                    "-i",
                    mapping_local_path,
                    local_path,
                    local_map_file,
                ]
            )

            log.debug(cmd)

            retcode = subprocess.call(args=cmd, shell=True)

            if retcode != 0:
                raise Exception("There was an error mapping the results")

            # merge repeated ids
            log.debug("Merging {0} to {1} ...".format(map_results_file, results_path))
            log.debug("Gitools file: {0}".format(gitools_results_path))

            upreg_count, downreg_count = merge(log, map_results_file, results_local_path, gitools_results_local_path)
            if upreg_count == 0 and downreg_count == 0:
                log.error(
                    "The results of the mapping for ({0}) are empty. This could be because the annotated platform or the mapping file is wrong.".format(
                        ", ".join(key)
                    )
                )

            # close local paths
            data_repo.put_local(results_local_path)
            data_repo.put_local(mapping_local_path)
        except Exception as e:
            log.exception(e)
            data_repo.close_local(results_local_path)
            data_repo.close_local(mapping_local_path)
            continue
        finally:
            os.remove(map_results_file)
            repo.close_local(local_path)
            # get_local() may have failed before binding the local map file.
            if local_map_file is not None:
                source_repo.close_local(local_map_file)

        # save mapped results
        em.persist(m, types.MRNA_ONCODRIVE_GENES)
        mapped_oncodrive_port.write(mid)

    em.close()
    es.close()
    data_repo.close()
    source_repo.close()
    rs.close()
def run(task):
    """Calculate log2 ratios of tumour absolute intensities against normal pools.

    Tumour-unit ids are read from the "absi_tumour_unit_ids" port.  For each
    tumour absolute-intensity assay yielded by ``iter_tumour_absi`` the
    normal pool for (study, platform, normal counterpart topography) is
    loaded, the pool value is subtracted from the assay value row by row, the
    result is written to the "data" repository and the log2r entity is
    persisted.  Ids of persisted or already available log2r assays are
    written to the "log2r_ids" port.

    Args:
        task: task object providing ``conf``, ``logger()``, port checks and
            ``ports``.
    """
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf
    log = task.logger()

    task.check_in_ports(["absi_tumour_unit_ids"])
    task.check_out_ports(["log2r_ids"])

    absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
    log2r_port = task.ports["log2r_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    # Index normal pools by study, platform, topography
    log.debug("Indexing normal pools by study, platform and topography ...")
    pools_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_NORMAL_POOL, unique=True)

    # Index log2r assays by absi_id
    log.debug("Indexing log2r assays by absi assay ...")
    log2r_index = em.group_ids(
        ["absi_id"],
        types.MRNA_LOG2R, unique=True)

    absi_tumour_unit_ids = absi_tumour_unit_port.read_all()
    log.info("Processing %i mrna absi tumour units ..." % len(absi_tumour_unit_ids))

    # For each abs intensity assay
    pool = None
    pool_data = {}
    for absi in iter_tumour_absi(conf, em, absi_tumour_unit_ids, log):
        absi_id = absi["id"]

        # Named src_path (not rpath) so the local does not shadow the
        # `rpath` name used elsewhere in this file.
        src_path = os.path.join(absi["data_file/path"], absi["data_file/name"])

        icdo_topography = absi["icdo_topography"]
        normal_counterpart = absi.get("normal_counterpart", icdo_topography)
        if icdo_topography != normal_counterpart:
            keystr = "(%s, %s, %s --> %s)" % (absi["study_id"], absi["platform_id"], icdo_topography, normal_counterpart)
        else:
            keystr = "(%s, %s, %s)" % (absi["study_id"], absi["platform_id"], icdo_topography)

        exists = (absi_id,) in log2r_index
        if exists:
            log2r_id = log2r_index[(absi_id,)][0]
        else:
            log2r_id = str(uuid.uuid4())

        data_file_path = types.MRNA_LOG2R.replace(".", "/")
        data_file_name = log2r_id + ".tsv.gz"
        dst_path = os.path.join(data_file_path, data_file_name)

        if not overwrite and exists and data_repo.exists(dst_path):
            log.debug("Skipping calculation of log2r for tumour assay %s %s as it is already calculated" % (keystr, absi_id))
            log2r_port.write(log2r_id)
            continue

        log.info("Processing tumour assay %s %s from %s ..." % (keystr, absi_id, src_path))

        repo = rs.repository(absi["data_file/repo"])
        if not repo.exists(src_path):
            log.error("File not found: %s" % src_path)
            continue

        # Get normal counterpart data; the last pool is reused while the
        # (study, platform, topography) key does not change.
        if pool is None \
                or absi["study_id"] != pool["study_id"] \
                or absi["platform_id"] != pool["platform_id"] \
                or normal_counterpart != pool["icdo_topography"]:

            pool_key = (absi["study_id"], absi["platform_id"], normal_counterpart)
            if pool_key not in pools_index:
                log.error("Normal pool not found for tumour assay (%s) %s {%s}" % (
                    ", ".join(pool_key), absi_id, absi.get("source_path", "")))
                continue

            pool_id = pools_index[pool_key][0]
            pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
            if pool is None:
                log.error("Normal pool %s not found by the entity manager !" % pool_id)
                continue

            pool_data = read_pool_data(conf, rs, pool, log)
            if pool_data is None:
                pool = None
                continue

            log.info("Using normal pool ({}) [{}]".format(", ".join(pool_key), pool_id))

        # Calculate log2 ratios
        mr = MatrixReader(repo.open_reader(src_path))

        header = mr.read_header()
        if len(header.columns) != 2:
            log.error("Unexpected number of columns: %i" % len(header.columns))
            mr.close()
            continue

        warn_count = {
            "id_not_in_pool": 0,
            "value_is_nan": 0,
            "pool_value_is_nan": 0,
            "value_is_inf": 0,
            "pool_value_is_inf": 0}

        data = {}
        duplicated_row = False
        try:
            for row in mr:
                if row.name in data:
                    log.error("Skipping tumour assay, duplicated row %s at file %s" % (row.name, src_path))
                    duplicated_row = True
                    break

                value = row.values[0]

                value_is_nan = numpy.isnan(value)

                if value_is_nan:
                    warn_count["value_is_nan"] += 1
                elif numpy.isinf(value):
                    warn_count["value_is_inf"] += 1

                if row.name not in pool_data:
                    pool_value = value = numpy.nan
                    warn_count["id_not_in_pool"] += 1
                else:
                    pool_value = pool_data[row.name]

                pool_value_is_nan = numpy.isnan(pool_value)
                if pool_value_is_nan:
                    warn_count["pool_value_is_nan"] += 1
                elif numpy.isinf(pool_value):
                    warn_count["pool_value_is_inf"] += 1

                if not value_is_nan and not pool_value_is_nan:
                    log2r = value - pool_value
                else:
                    log2r = numpy.nan

                # Infinite ratios are dropped from the output.
                if not numpy.isinf(log2r):
                    data[row.name] = log2r
        finally:
            mr.close()

        if duplicated_row:
            # The original only left the row loop and then persisted the
            # partial data, despite logging that the assay was skipped.
            continue

        sb = ["{0}={1}".format(k, v) for k, v in warn_count.items() if v > 0]
        if len(sb) > 0:
            log.warn(", ".join(sb))

        # Save log2 ratios data and assay
        log2r = deepcopy(absi)

        log2r["id"] = log2r_id
        log2r["absi_id"] = absi_id
        log2r["normal_pool_id"] = pool["id"]
        log2r["data_file/repo"] = data_repo.name()
        log2r["data_file/path"] = data_file_path
        log2r["data_file/name"] = data_file_name

        msg = {True: "Overwritting", False: "Writting"}[exists]
        log.debug("%s log2 ratio data to %s ..." % (msg, dst_path))

        mw = MatrixWriter(data_repo.open_writer(dst_path))
        mw.write_header(["id", "value"])
        for name, value in sorted(data.items()):
            mw.write(name, [value])
        mw.close()

        em.persist(log2r, types.MRNA_LOG2R)
        log2r_port.write(log2r_id)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
def main():
    """Export mRNA enrichment results into the biomart ``exp_<infix>_trs`` tables.

    Reads ``(id_type, entity id)`` pairs from the ``id`` port, loads the
    matching ``types.MRNA_ENRICHMENT`` entity, and batch-inserts the rows of
    its results file into the table selected by
    ``ID_TYPE_TO_TABLE_INFIX[id_type]``.

    Entities that cannot be found, have no results file, or reference an
    ICD-O term / experiment missing from the database are logged and skipped.
    """
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    id_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)
    cursor = conn.cursor()

    table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

    # feature name lookups, one map per table infix
    feat_ids = {}

    for name in table_infixs:
        # the "gene" infix is skipped here: no table is created and no
        # feature map is loaded for it
        if name == "gene":
            continue

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS exp_{0}_trs (
              {0}_id int(11) NOT NULL,
              icdo_id int(11) NOT NULL,
              exp_id int(11) NOT NULL,
              upreg_total int(11) DEFAULT NULL,
              upreg_observed double DEFAULT NULL,
              upreg_expected double DEFAULT NULL,
              upreg_stdev double DEFAULT NULL,
              upreg_pvalue double DEFAULT NULL,
              upreg_cpvalue double DEFAULT NULL,
              downreg_total int(11) DEFAULT NULL,
              downreg_observed double DEFAULT NULL,
              downreg_expected double DEFAULT NULL,
              downreg_stdev double DEFAULT NULL,
              downreg_pvalue double DEFAULT NULL,
              downreg_cpvalue double DEFAULT NULL,
              PRIMARY KEY ({0}_id,icdo_id,exp_id),
              KEY icdo (icdo_id,exp_id),
              KEY exp (exp_id),
              CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
              CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
              CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
            ) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

        feat_ids[name] = map_from_select(
            cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

    icdo = map_from_select(
        cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(
        cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    for id_type, eid in id_port:
        e = em.find(eid, types.MRNA_ENRICHMENT)
        if e is None:
            # NOTE: the previous "{} ... {1}" template mixed automatic and
            # manual field numbering, which makes str.format raise ValueError
            log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)

        log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

        table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(
            cursor, "exp_{}_trs".format(table_infix),
            ["{}_id".format(table_infix), "icdo_id", "exp_id",
             "upreg_total", "upreg_observed", "upreg_expected",
             "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
             "downreg_total", "downreg_observed", "downreg_expected",
             "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"],
            insert_size)

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header: map every column name to its position
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            # use a name other than 'e' so the entity is not shadowed
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            ib.close()
            results_repo.close()
            continue

        skipped_ids = set()

        fids = feat_ids[table_infix]

        # read data
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            feat_name = data[0]
            data = [data[i] for i in col_indices]
            if feat_name not in fids:
                skipped_ids.add(feat_name)
                continue

            feat_id = fids[feat_name]

            ib.insert(feat_id, icdo_id, exp_id, *data)

        if len(skipped_ids) > 0:
            log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

        log.debug("{} results inserted".format(ib.count))

        ib.close()
        reader.close()
        # the error paths above release the repository; do the same on success
        results_repo.close()

    em.close()
    es.close()
    rs.close()
def run(task):
    """Compute the pooled data file for every mRNA normal pool received.

    For each id read from the ``normal_pool_ids`` port the pool entity is
    loaded, the absolute-intensity assays listed in ``mrna_absi_ids`` are fed
    (as ``2 ** value``) into a ``MeanPoolMethod``, and the ``log2`` of every
    pooled row is written to ``<MRNA_NORMAL_POOL path>/<pool_id>.tsv.gz`` in
    the "data" repository. The pool entity is then updated with the number of
    pooled assays and the data file location.

    A pool is skipped when ``overwrite`` is false, its data file exists, and
    its recorded ``pooled_assays`` equals the number of ``mrna_absi_ids``.
    Nothing is written when any assay has duplicated rows or when no assay
    could be pooled.

    Returns 0.
    """
    # Initialization

    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["normal_pool_ids"])

    normal_pool_port = task.ports["normal_pool_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Processing %i mrna normal pools ..." % normal_pool_port.size())

    for pool_id in normal_pool_port:
        pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
        if pool is None:
            log.error("%s not found: %s" % (types.MRNA_NORMAL_POOL, pool_id))
            continue

        mf = pool.missing_fields(["study_id", "platform_id", "icdo_topography", "size", "mrna_absi_ids"])
        if len(mf) > 0:
            log.error("Normal pool %s missing required fields: %s {%s}" % (pool_id, mf, pool.get("__doc_path", "")))
            continue

        key = (pool["study_id"], pool["platform_id"], pool["icdo_topography"])
        log.info("Normal pool (%s) [%s] with %i assays ..." % (", ".join(key), pool_id, pool["size"]))

        data_file_path = types.MRNA_NORMAL_POOL.replace(".", "/")
        data_file_name = pool_id + ".tsv.gz"
        dst_rel_path = os.path.join(data_file_path, data_file_name)

        already_pooled = (
            data_repo.exists(dst_rel_path)
            and "mrna_absi_ids" in pool and "pooled_assays" in pool
            and len(pool["mrna_absi_ids"]) == pool.get("pooled_assays", dtype=int))
        if not overwrite and already_pooled:
            log.warn("Skipping normal pool %s that already has data" % pool_id)
            continue

        method = MeanPoolMethod()

        pooled_assays = 0
        duplicated_rows = False
        for absi in em.iter_all(types.MRNA_ABS_INTENSITY, eids = pool["mrna_absi_ids"]):
            mf = absi.missing_fields(["data_file/path", "data_file/name"])
            if len(mf) > 0:
                log.error("Normal assay %s missing required fields: %s {%s}" % (absi["id"], mf, absi.get("__doc_path", "")))
                continue

            data_file = absi["data_file"]
            rel_path = os.path.join(data_file["path"], data_file["name"])
            repo = rs.repository(data_file["repo"])
            if not repo.exists(rel_path):
                log.error("File not found: %s" % rel_path)
                continue

            log.debug("Processing normal assay %s for source assay %s at %s ..." % (absi["id"], absi["assay_id"], rel_path))

            mr = MatrixReader(repo.open_reader(rel_path))
            header = mr.read_header()
            if len(header.columns) != 2:
                log.error("Unexpected number of columns: %i" % len(header.columns))
                mr.close()
                continue

            # count the assay only once its header has been validated, so
            # rejected files are not reported as pooled
            pooled_assays += 1

            row_names = set()
            for row in mr:
                if row.name in row_names:
                    log.error("Skipping normal assay, duplicated row %s at file %s" % (row.name, rel_path))
                    duplicated_rows = True
                    break
                else:
                    row_names.add(row.name)

                # values are stored as log2; pool them in linear scale
                value = numpy.exp2(row.values[0])
                method.process(row.name, value)

            mr.close()

        if not duplicated_rows and pooled_assays > 0:
            exists = data_repo.exists(dst_rel_path)
            msg = {True : "Overwritting", False : "Writting"}[exists]
            log.debug("%s pooled data to %s ..." % (msg, dst_rel_path))

            mw = MatrixWriter(data_repo.open_writer(dst_rel_path))
            mw.write_header(["id", "value"])
            for row in method.pooled_rows():
                # back to log2 scale for storage
                value = numpy.log2(row.values[0])
                mw.write(row.name, [value])
            mw.close()

            pool["pooled_assays"] = pooled_assays
            pool["data_file/repo"] = "data"
            pool["data_file/path"] = data_file_path
            pool["data_file/name"] = data_file_name
            em.persist(pool, types.MRNA_NORMAL_POOL)

    # release every server/repository opened above, as sibling tasks do
    em.close()
    es.close()
    data_repo.close()
    rs.close()

    return 0
def main():
    """Run Oncodrive (gain and loss) for every CNV events tumour unit received.

    For each id read from the ``evt_tumour_unit_ids`` port, the matching
    ``types.CNV_ONCODRIVE_GENES`` entity is reused (looked up by study,
    platform, topography and morphology) or created. The unit data matrix is
    bit-mask filtered into a temporary file once for gain (mask 1) and once
    for loss (mask 2), ``run_oncodrive`` is executed on each, and the two
    result sets are joined and stored in the "data" repository. The entity id
    is written to the ``oncodrive_results_ids`` port.

    Units whose results file is reported by ``skip_file`` are not recomputed,
    but their id is still forwarded. A failure in either calculation is logged
    and re-raised after the temporary resources have been released.
    """
    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    evt_tumour_unit_port, oncodrive_results_port = \
        task.ports("evt_tumour_unit_ids", "oncodrive_results_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Indexing available {} ...".format(types.CNV_ONCODRIVE_GENES))
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.CNV_ONCODRIVE_GENES, unique = True)

    results_base_path = types.CNV_ONCODRIVE_GENES.replace(".", "/")

    for uid in evt_tumour_unit_port:
        u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
        if key in oncodrive_results_index:
            eid = oncodrive_results_index[key][0]
            e = em.find(eid, types.CNV_ONCODRIVE_GENES)
            if e is None:
                log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, eid))
                continue
        else:
            e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
            eid = e["id"] = str(uuid.uuid4())

        # create oncodrive results entity
        e["evt_tumour_unit_id"] = uid

        results_path = rpath.join(results_base_path, eid + ".tsv.gz")

        if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
            log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
            oncodrive_results_port.write(eid)
            continue

        e["results_file"] = data_repo.url(results_path)

        # data matrix for oncodrive calculation
        matrix_repo, matrix_path = rs.from_url(u["data_file"])

        # Gain & Loss

        log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))
        log.debug("{} id is {}".format(types.CNV_ONCODRIVE_GENES, eid))

        tmp_path = mkdtemp(prefix = "cnv_oncodrive_calc_")
        log.debug("Temporary directory: {}".format(tmp_path))
        tmp_file = os.path.join(tmp_path, "filtered_data.tsv")

        matrix_local_path = matrix_repo.get_local(matrix_path)
        log.debug("Matrix path: {}".format(matrix_path))

        # The 'finally' clause is the single place where the local matrix
        # copy is released. The handlers below only log and re-raise, so the
        # local copy is not released twice when a calculation fails.
        try:
            try:
                log.info("Calculating Gain ...")
                log.debug("Bit mask filtering (01) {} to {} ...".format(matrix_local_path, tmp_file))
                mask_filtering(matrix_local_path, tmp_file, 1)
                gain_results = run_oncodrive(
                    conf, log, e, "gain", tmp_file, tmp_path)
            except Exception:
                log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for gain failed".format(",".join(key), uid))
                raise

            try:
                log.info("Calculating Loss ...")
                log.debug("Bit mask filtering (10) {} to {} ...".format(matrix_local_path, tmp_file))
                mask_filtering(matrix_local_path, tmp_file, 2)
                loss_results = run_oncodrive(
                    conf, log, e, "loss", tmp_file, tmp_path)
            except Exception:
                log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for loss failed".format(",".join(key), uid))
                raise

            # Join gain & loss results

            log.info("Joining upreg & downreg results into memory ...")

            # the join is done in memory with a map
            dmap = read_data_map(log, gain_results, loss_results)

            log.info("Writting joined data to {} ...".format(results_path))

            results_local_path = data_repo.create_local(results_path)

            write_data_map(dmap, results_local_path)

        finally:
            matrix_repo.close_local(matrix_local_path)
            matrix_repo.close()

            if os.path.exists(tmp_path):
                log.debug("Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        data_repo.put_local(results_local_path)

        em.persist(e, types.CNV_ONCODRIVE_GENES)
        oncodrive_results_port.write(eid)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
def main():
    """Group CNV oncodrive results by ICD-O term and emit one combination each.

    Reads oncodrive result ids from the ``oncodrive_ids`` port and classifies
    the entities by ``(icdo_topography, icdo_morphology, "ensembl:gene")``.
    For every group an existing ``types.CNV_COMBINATION`` entity is reused or
    a new one is created, filled with the source ids and result files, and
    written (in native form) to the ``combinations`` port.

    Processing stops at the first combination listed in the index that cannot
    be loaded. The entity manager and server are closed on every exit path.
    """
    # Initialization

    task.check_conf(["entities"])
    conf = task.conf

    log = task.logger()

    oncodrive_port, combination_port = \
        task.ports("oncodrive_ids", "combinations")

    es = EntityServer(conf["entities"])
    em = es.manager()

    # try/finally so the early 'return' below (and any error) still closes
    # the entity manager and server
    try:
        log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
        comb_results_index = em.group_ids(
            ["icdo_topography", "icdo_morphology", "id_type"],
            types.CNV_COMBINATION, unique = True)

        ENSEMBL_GENE = "ensembl:gene"

        classif = {}

        log.info("Classifying oncodrive results ...")

        for oid in oncodrive_port:
            o = em.find(oid, types.CNV_ONCODRIVE_GENES)
            if o is None:
                log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, oid))
                continue

            okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

            key = (o["icdo_topography"], o["icdo_morphology"], ENSEMBL_GENE)

            log.debug("Oncodrive results ({}) [{}] classified into ({}) ...".format(", ".join(okey), oid, ", ".join(key)))

            classif.setdefault(key, []).append(o)

        log.info("Preparing combinations ...")

        for key in sorted(classif):
            if key in comb_results_index:
                cid = comb_results_index[key][0]
                c = em.find(cid, types.CNV_COMBINATION)
                if c is None:
                    log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
                    return
            else:
                c = DataElement(key_sep = "/")
                c["id"] = cid = str(uuid.uuid4())
                c["icdo_topography"] = key[0]
                c["icdo_morphology"] = key[1]

            c["id_type"] = ENSEMBL_GENE

            olist = classif[key]

            log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(olist)))

            ids = c.create_list()
            flist = c.create_list()

            for o in olist:
                ids += [o["id"]]
                flist += [o["results_file"]]

            c["source"] = src = c.create_element()
            src["type"] = types.CNV_ONCODRIVE_GENES
            src["ids"] = ids

            c["files"] = flist

            combination_port.write(c.to_native())
    finally:
        em.close()
        es.close()
def main():
    """Create CNV events entities from source assays and group them in tumour units.

    Every ``types.SOURCE_ASSAY`` belonging to a study received on the
    ``study_ids`` port is checked. Only genomic, ``cancer_vs_normal``, binary
    assays whose sample is a tumour and whose data file exists are kept. For
    each kept assay a ``types.CNV_EVENTS`` entity is persisted and its id
    written to the ``evt_ids`` port.

    The events are then classified with ``classify_by_experiment_and_icdo``
    (topographies listed in ``excluded_topographies`` are dropped) and every
    group with at least ``cnv.min_tumour_unit_size`` events is persisted as a
    ``types.CNV_EVENTS_TUMOUR_UNIT`` and written to the
    ``evt_tumour_unit_ids`` port. A summary of processed, skipped and failed
    assays/samples is logged at the end.
    """
    # Initialization

    task.check_conf(["entities", "repositories", "repositories.assay", "cnv.min_tumour_unit_size"])
    conf = task.conf

    log = task.logger()

    study_ids_port, evt_port, evt_tunit_port = \
        task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    source_repo = rs.repository("source")

    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
    evt_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS, unique = True)

    log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    evt_tunit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

    processed_studies = set()
    processed_assays = 0
    valid_assay_count = {}
    skipped_assay_count = {}
    wrong_assays = {}
    wrong_samples = {}
    tumour_units = {}
    # keys already emitted during this run, used to detect duplicates
    evt_dup = {}

    study_ids = study_ids_port.read_all()
    log.info("Processing %i studies ..." % len(study_ids))

    for assay in em.iter_all(types.SOURCE_ASSAY):

        assay_id = assay.get("id", "WITHOUT ID")
        log.debug("Reading assay %s ..." % assay_id)

        mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
            "assay_property/assay_design", "assay_property/data_type",
            "assay_property/study_type", "assay_property/filename"])

        assay_source_path = assay.get("source_path", "")

        if len(mf) > 0:
            study_id = assay.get("study_id", "WITHOUT ID")
            log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        study_id = assay["study_id"]

        if study_id not in study_ids:
            log.debug("Assay {} not included in 'study_ids'".format(assay_id))
            continue

        platform_id = assay["platform_id"]
        sample_id = assay["sample_id"]

        assay_design = assay["assay_property/assay_design"]
        data_type = assay["assay_property/data_type"]
        study_type = assay["assay_property/study_type"]

        source_path = assay["source_path"]
        source_file = assay["assay_property/filename"]

        e = assay.transform([
            ("assay_id", "id"),
            "study_id",
            "platform_id",
            "sample_id",
            "source_path"])

        e["data_file"] = source_repo.url("assay", source_path, source_file)

        included = study_id in study_ids and study_type == "genomic"
        included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

        if not included:
            # transcriptomic assays are expected here and not reported
            if study_type != "transcriptomic" and study_id in study_ids:
                s = ", ".join([" = ".join(v) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
                log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
                map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
            continue

        sample = em.find(sample_id, types.SOURCE_SAMPLE)
        if sample is None:
            log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
        if len(mf) > 0:
            sample_source_path = sample.get("source_path", "")
            log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        sample = sample.transform([
            "id",
            "source_path",
            ("disease_state", "basic_sample_details/disease_state"),
            ("normal_counterpart", "normal_counterpart_location/topography"),
            ("icdo_topography", "icdo/topography"),
            ("icdo_morphology", "icdo/morphology") ])

        disease_state = sample["disease_state"]
        if disease_state not in disease_state_map:
            log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        orig_disease_state = disease_state
        disease_state = disease_state_map[disease_state]
        if disease_state not in ["tumour"]:
            # read the path from the current sample: the former
            # 'sample_source_path' local was only bound in the
            # missing-fields branch above, which always continues
            log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample.get("source_path", ""), orig_disease_state))
            continue

        e["disease_state"] = disease_state

        e["icdo_topography"] = sample["icdo_topography"]
        e["icdo_morphology"] = sample.get("icdo_morphology", "")
        if "normal_counterpart" in sample:
            e["normal_counterpart"] = sample["normal_counterpart"]

        repo, rel_path = rs.from_url(e["data_file"])

        if not repo.exists(rel_path):
            log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

        eid = None
        duplicated = False
        exists = False
        if e_key in evt_dup:
            duplicated = True
        elif e_key in evt_index:
            eid = evt_index[e_key][0]
            exists = True

        if duplicated:
            log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        if eid is None:
            eid = str(uuid.uuid4())

        e["id"] = eid

        # tumour units are keyed by the normal counterpart topography when
        # the sample declares one, otherwise by its own topography
        u_key = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]), e.get("icdo_morphology", ""))
        keys = classify_by_experiment_and_icdo(
            u_key[0], u_key[1], u_key[2], u_key[3])
        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                continue
            map_list_add(tumour_units, key, eid)

        processed_studies.add(study_id)
        processed_assays += 1
        map_inc(valid_assay_count, (study_id, platform_id))

        msg = {True : "Overwritting", False : "Writting"}[exists]
        log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))
        em.persist(e, types.CNV_EVENTS)
        evt_port.write(eid)
        evt_dup[e_key] = eid

    # NOTE(review): the value is compared with an int below; presumably the
    # configuration already yields a number here - confirm
    min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

    log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    log.debug("Minimum size = {}".format(min_tumour_unit_size))

    for key in sorted(tumour_units):
        v = tumour_units[key]
        size = len(v)

        if size < min_tumour_unit_size:
            discard = True
            discard_text = "[skipped]"
        else:
            discard = False
            discard_text = ""

        if key in evt_tunit_index:
            uid = evt_tunit_index[key][0]
            u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
            if u is None:
                log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
                continue

            arrow_text = "==>"
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")
            u["id"] = uid
            u["study_id"] = key[0]
            u["platform_id"] = key[1]
            u["icdo_topography"] = key[2]
            u["icdo_morphology"] = key[3]

            arrow_text = "-->"

        log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

        if discard:
            continue

        u["size"] = len(v)
        u["cnv_evt_ids"] = u.create_list(v)

        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        evt_tunit_port.write(uid)

    sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(processed_assays, len(processed_studies), len(study_ids))]
    log.info("".join(sb))

    log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))

    log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))

    log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

    em.close()
    es.close()
    # the repository server opened above was never released
    rs.close()
def main():
    """Export mRNA oncodrive gene results into the biomart ``exp_gene_trs`` table.

    Reads entity ids from the ``id`` port, loads the matching
    ``types.MRNA_ONCODRIVE_GENES`` entity and batch-inserts the rows of its
    results file, translating gene names, ICD-O terms and experiments to the
    ids stored in the database.

    The table is kept write-locked while inserting. After roughly one million
    inserted rows it is unlocked, optimized and locked again; a final
    unlock/optimize is issued at the end. Entities that cannot be exported are
    logged and skipped.
    """
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)
    cursor = conn.cursor()

    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(
        cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(
        cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS exp_gene_trs (
          gene_id int(11) NOT NULL,
          icdo_id int(11) NOT NULL,
          exp_id int(11) NOT NULL,
          upreg_total int(11) DEFAULT NULL,
          upreg_observed double DEFAULT NULL,
          upreg_expected double DEFAULT NULL,
          upreg_stdev double DEFAULT NULL,
          upreg_pvalue double DEFAULT NULL,
          upreg_cpvalue double DEFAULT NULL,
          downreg_total int(11) DEFAULT NULL,
          downreg_observed double DEFAULT NULL,
          downreg_expected double DEFAULT NULL,
          downreg_stdev double DEFAULT NULL,
          downreg_pvalue double DEFAULT NULL,
          downreg_cpvalue double DEFAULT NULL,
          PRIMARY KEY (gene_id,icdo_id,exp_id),
          KEY icdo (icdo_id,exp_id),
          KEY exp (exp_id),
          CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
          CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
          CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
        ) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    # rows inserted since the table was last optimized
    lock_count = 0

    for eid in oncodrive_port:
        e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology)

        log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(
            cursor, "exp_gene_trs",
            ["gene_id", "icdo_id", "exp_id",
             "upreg_total", "upreg_observed", "upreg_expected",
             "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
             "downreg_total", "downreg_observed", "downreg_expected",
             "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"],
            insert_size)

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header: map every column name to its position
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            # use a name other than 'e' so the entity is not shadowed
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            lock_count += ib.count
            ib.close()
            results_repo.close()
            continue

        skipped_genes = set()

        # read data
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            gene_name = data[0]
            data = [data[i] for i in col_indices]
            if gene_name not in gene:
                skipped_genes.add(gene_name)
                continue

            gene_id = gene[gene_name]

            ib.insert(gene_id, icdo_id, exp_id, *data)

        if len(skipped_genes) > 0:
            log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes)))

        log.debug("{} gene results inserted".format(ib.count))

        lock_count += ib.count

        ib.close()
        reader.close()
        # the error paths above release the repository; do the same on success
        results_repo.close()

        # periodically release the lock and optimize the table
        if lock_count >= 1000000:
            cursor.execute("UNLOCK TABLES")
            cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
            cursor.execute("LOCK TABLES exp_gene_trs WRITE")
            lock_count = 0

    cursor.execute("UNLOCK TABLES")
    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
    cursor.close()

    em.close()
    es.close()
    rs.close()
def main():
    """Collect the ids to export to biomart and send them through the ports.

    ``extract`` is called once per result type (mRNA and CNV oncodrive genes,
    enrichment and combination), filling a per-type result set plus the
    shared ``icdo`` set, and for the oncodrive types also the shared ``exp``
    set. The per-type results are written to their port right after each
    extraction; the accumulated ICD-O terms and experiments are sent last.
    """
    # Initialization

    task.check_conf(["entities"])
    conf = task.conf

    log = task.logger()

    icdo_port, exp_port = task.ports(["icdo", "experiment"])

    mrna_oncodrive_gene_port, mrna_enrichment_port, mrna_combination_port = \
        task.ports(["mrna_oncodrive_gene", "mrna_enrichment", "mrna_combination"])

    cnv_oncodrive_gene_port, cnv_enrichment_port, cnv_combination_port = \
        task.ports(["cnv_oncodrive_gene", "cnv_enrichment", "cnv_combination"])

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Run

    exp = set()
    icdo = set()

    excludes = conf["biomart.excludes"] if "biomart.excludes" in conf else None

    exp_fields = ("study_id", "platform_id")
    icdo_fields = ("icdo_topography", "icdo_morphology")

    # (entity type, output port, is an oncodrive genes type), in sending order
    plan = [
        (types.MRNA_ONCODRIVE_GENES, mrna_oncodrive_gene_port, True),
        (types.MRNA_ENRICHMENT, mrna_enrichment_port, False),
        (types.MRNA_COMBINATION, mrna_combination_port, False),
        (types.CNV_ONCODRIVE_GENES, cnv_oncodrive_gene_port, True),
        (types.CNV_ENRICHMENT, cnv_enrichment_port, False),
        (types.CNV_COMBINATION, cnv_combination_port, False),
    ]

    for etype, out_port, is_oncodrive in plan:
        found = set()

        if is_oncodrive:
            # oncodrive results also contribute experiments
            extract(log, em, etype,
                (found, "id"),
                (exp, exp_fields),
                (icdo, icdo_fields),
                excludes = excludes)
        else:
            extract(log, em, etype,
                (found, ("id_type", "id")),
                (icdo, icdo_fields),
                excludes = excludes)

        log.info("Sending {} ids ...".format(etype))

        if is_oncodrive:
            # each element is unpacked as a one-item sequence
            for rid, in found:
                out_port.write(rid)
        else:
            for item in sorted(found):
                out_port.write(item)

    # icdo
    log.info("Sending icdo's ...")
    for term in icdo:
        icdo_port.write(term)

    # exp
    log.info("Sending experiments ...")
    for experiment in exp:
        exp_port.write(experiment)

    em.close()
    es.close()
def run(task):
    """Group mRNA log2r assays into tumour units and persist them.

    Every id read from the ``log2r_ids`` port is loaded as a
    ``types.MRNA_LOG2R`` entity and classified under up to four keys
    ``(study_id, platform_id, topography, morphology)``: the first-level
    topography with and without morphology and, when the topography code has
    a second level, the full topography with and without morphology. Keys
    whose topography is listed in ``excluded_topographies`` are dropped.

    Units with at least ``mrna.min_tumour_unit_size`` assays (default 20) are
    persisted as ``types.MRNA_LOG2R_TUMOUR_UNIT`` and their ids written to the
    ``log2r_tumour_unit_ids`` port. An already indexed unit is updated in
    place unless ``overwrite`` is set, in which case it is rebuilt from
    scratch under the same id.
    """
    # Initialization

    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    min_tumour_unit_size = conf.get("mrna.min_tumour_unit_size", 20, dtype=int)

    log = task.logger()

    task.check_in_ports(["log2r_ids"])
    task.check_out_ports(["log2r_tumour_unit_ids"])

    log2r_port = task.ports["log2r_ids"]
    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    overwrite = conf.get("overwrite", False, dtype=bool)

    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    log.info("Indexing available mrna log2r tumour units ...")
    log2r_tumour_unit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_TUMOUR_UNIT, unique = True)

    units = {}
    for log2r_id in log2r_port:
        e = em.find(log2r_id, types.MRNA_LOG2R)
        if e is None:
            log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
            continue

        eid = e["id"]
        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e.get("icdo_morphology", "")

        log.info("Classifying mrna log2r (%s, %s, %s, %s) [%s] ..." % (study_id, platform_id, icdo_topography, icdo_morphology, eid))

        keys = []

        m = _ICDO_TOPOGRAPHY_PAT.match(icdo_topography)
        if m is None:
            log.error("Wrong ICD-O Topography code: {0}".format(icdo_topography))
            continue

        level1 = m.group(1)
        level2 = m.group(2)

        has_morphology = len(icdo_morphology) > 0

        if has_morphology:
            m = _ICDO_MORPHOLOGY_PAT.match(icdo_morphology)
            if m is None:
                log.error("Wrong ICD-O Morphology code: {0}".format(icdo_morphology))
                continue

        keys += [(study_id, platform_id, level1, "")]
        if has_morphology:
            keys += [(study_id, platform_id, level1, icdo_morphology)]

        if level2 is not None:
            keys += [(study_id, platform_id, icdo_topography, "")]
            if has_morphology:
                keys += [(study_id, platform_id, icdo_topography, icdo_morphology)]

        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                log.debug("\t(%s) [excluded]" % ", ".join(key))
                continue

            log.debug("\t(%s)" % ", ".join(key))

            units.setdefault(key, []).append(eid)

    log.info("Persisting %i mrna log2r tumour units ..." % len(units))
    log.debug("Minimum size = %i" % min_tumour_unit_size)

    # items() instead of iteritems(): same result, works on Python 2 and 3
    for key, ids in sorted(units.items()):

        size = len(ids)

        if size < min_tumour_unit_size:
            log.debug("\t(%s)\t%i assays [Skipped]" % (", ".join(key), size))
            continue
        else:
            log.debug("\t(%s)\t%i assays" % (", ".join(key), size))

        if key in log2r_tumour_unit_index:
            uid = log2r_tumour_unit_index[key][0]
            if not overwrite:
                u = em.find(uid, types.MRNA_LOG2R_TUMOUR_UNIT)
                if u is None:
                    # indexed but not retrievable: report it instead of
                    # failing on the assignments below
                    log.error("%s not found: %s" % (types.MRNA_LOG2R_TUMOUR_UNIT, uid))
                    continue
            else:
                u = DataElement(key_sep = "/")
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")

        u["id"] = uid
        u["study_id"] = key[0]
        u["platform_id"] = key[1]
        u["icdo_topography"] = key[2]
        u["icdo_morphology"] = key[3]

        u["size"] = size
        u["mrna_log2r_ids"] = u.create_list(ids)

        em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
        log2r_tumour_unit_port.write(uid)

    em.close()
    es.close()
def main():
    """Create and fill the ``ent_experiment`` biomart table.

    Each item read from the ``experiment`` port is a ``(study_id,
    platform_id)`` pair. The study and platform entities are loaded, the
    publication fields are taken from Pubmed when the study has a ``pubmed``
    annotation (or from the study annotations otherwise), and one row per
    experiment is inserted through a ``BatchInsert``.

    Uses the module level ``task`` object for configuration, logging and ports.
    """

    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    if "biomart.study_source" in conf:
        study_source_map = conf["biomart.study_source"]
    else:
        study_source_map = conf.create_element()

    log = task.logger()

    exp_port = task.ports("experiment")

    es = EntityServer(conf["entities"])
    em = es.manager()

    conn = biomart_db_connect(conf["biomart.db"], log)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    cursor = conn.cursor()

    cursor.execute("""
        CREATE TABLE ent_experiment (
          id int(11) NOT NULL,
          exp_name varchar(64) NOT NULL,
          study_id varchar(32) NOT NULL,
          study_source varchar(32) DEFAULT NULL,
          study_source_url varchar(512) DEFAULT NULL,
          study_link varchar(512) DEFAULT NULL,
          pub_pubmed varchar(32) DEFAULT NULL,
          pub_title varchar(300) DEFAULT NULL,
          pub_authors varchar(300) DEFAULT NULL,
          pub_year varchar(16) DEFAULT NULL,
          pub_journal varchar(200) DEFAULT NULL,
          platf_id varchar(32) NOT NULL,
          platf_title varchar(250) DEFAULT NULL,
          platf_technology varchar(96) DEFAULT NULL,
          PRIMARY KEY (id),
          KEY exp_name (exp_name),
          KEY pub_pubmed (pub_pubmed),
          KEY pub_title (pub_title),
          KEY pub_authors (pub_authors),
          KEY pub_year (pub_year),
          KEY pub_journal (pub_journal),
          KEY platf_title (platf_title),
          KEY platf_technology (platf_technology)
        ) ENGINE={} CHARACTER SET utf8 COLLATE utf8_general_ci""".format(db_engine))

    ib = BatchInsert(cursor, "ent_experiment",
            ["id", "exp_name", "study_id", "study_source", "study_source_url", "study_link",
                "pub_title", "pub_authors", "pub_year", "pub_pubmed", "pub_journal",
                "platf_id", "platf_title", "platf_technology"], insert_size)

    pubmed = Pubmed()

    for i, exp in enumerate(exp_port, 1):
        study_id = exp[0]
        platform_id = exp[1]

        study = em.find(study_id, types.SOURCE_STUDY)
        if study is None:
            log.error("{} not found: {}".format(types.SOURCE_STUDY, study_id))
            continue

        platf = em.find(platform_id, types.SOURCE_PLATFORM)
        if platf is None:
            log.error("{} not found: {}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        log.info("Experiment for study {} and platform {} ...".format(study_id, platform_id))

        # Publication fields default to None so the insert below always finds
        # them, whatever the lookup returns.
        pub = dict.fromkeys(["title", "short_authors", "date", "journal"])

        if "pubmed" in study:
            pmid = study["pubmed"]
            if isinstance(pmid, (DataElementList, list)):
                pmid = pmid[0]
                log.warn("Study {} with many pubmed_id's, only the first {} will be considered".format(study_id, pmid))

            log.debug("Retrieving information for pubmed_id '{}' ...".format(pmid))
            try:
                found = pubmed.find(pmid)
                if len(found) == 0:
                    # Keep the defaults: rebinding `pub` to the empty result
                    # would break the field accesses further down.
                    log.error("No publication information found for pubmed_id '{}' in experiment ({}, {})".format(pmid, study_id, platform_id))
                else:
                    for k, v in found[0].items():
                        pub[k] = v
            except Exception as ex:
                log.error("Error retrieving pubmed information for experiment ({}, {}) with pubmed_id '{}'".format(study_id, platform_id, pmid))
                log.exception(ex)
        else:
            pmid = None
            log.warn("Study {} has no 'pubmed_id' annotation".format(study_id))

            if "title" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'title'".format(study_id))
            elif "SO/contact_details[0]/contact_name" not in study \
                    and "SO/contact_details/contact_name" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'SO.contact_details[0].contact_name'".format(study_id))
            else:
                try:
                    pub["title"] = study["title"]

                    if "SO/contact_details[0]/contact_name" in study:
                        pub["short_authors"] = study["SO/contact_details[0]/contact_name"]
                    else:
                        pub["short_authors"] = study["SO/contact_details/contact_name"]

                    if "SO/submission/pub_date" in study:
                        pub["date"] = study["SO/submission/pub_date"]
                    else:
                        pub["date"] = ""
                except Exception as ex:
                    log.debug(study)
                    log.exception(ex)

        # NOTE(review): quotes are escaped by hand here, which presumably means
        # BatchInsert builds SQL text from the values; parameterized inserts
        # would be safer -- confirm against BatchInsert.
        for k, v in list(pub.items()):
            if isinstance(v, basestring):
                pub[k] = v.replace("'", r"\'")

        exp_name = "{}; {}".format(study_id, platform_id)

        study_source = None
        study_source_url = None
        study_link = None

        # The text before the first "-" of the study id selects the source
        # description; the text after it is substituted into the link template.
        parts = study_id.split("-")
        if len(parts) >= 2 and parts[0] in study_source_map:
            ss = study_source_map[parts[0]]
            study_source = ss.get("name")
            study_source_url = ss.get("home_url")
            try:
                study_link = ss.get("link", "").format(parts[1])
            except Exception as ex:
                # Best effort: a malformed link template leaves the link empty.
                log.debug("Study link for {} couldn't be built: {}".format(study_id, ex))

        ib.insert(i, exp_name, study_id, study_source, study_source_url, study_link,
            pub["title"], pub["short_authors"], pub["date"], pmid, pub["journal"],
            platform_id, platf["SO/platform_title"], "")

    log.debug("{} experiments inserted".format(ib.count))

    ib.close()
    cursor.close()
    conn.close()
    em.close()
    es.close()
def _read_intersect_results(log, results_path, eid, data_file, data):
    """Fold the intersectBed output of one events file into *data*.

    For every parsable line the name (column index 3) and the integer value
    (column index 12) are read; values other than 1 and 2 are reported and
    skipped. Values are OR-ed into ``data[(eid, name)]``.
    """
    name_index = 3
    value_index = 12

    reader = FileReader(results_path)
    try:
        # enumerate keeps the reported line number right even after a line
        # has been skipped.
        for line_num, line in enumerate(reader, 1):
            try:
                fields = line.rstrip().split("\t")
                name = fields[name_index]
                value = int(fields[value_index])
            except (IndexError, ValueError):
                log.error("Error parsing line {} of data file {}".format(line_num, data_file))
                continue

            if value not in (1, 2):
                log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
                continue

            k = (eid, name)
            data[k] = data.get(k, 0) | value
    finally:
        reader.close()


def main():
    """Map CNV events to genes and join them into one matrix per tumour unit.

    For every id read from the ``evt_tumour_unit_ids`` port, each events file
    of the unit is intersected with the gene regions file
    (``cnv.mapping.ensg``) using BED tools ``intersectBed``. The results are
    written as a tab separated matrix with one row per name of the background
    file (``cnv.background.ensg``) and one column per events id, using 0 where
    there is no value. The unit is then persisted with its ``data_file`` and
    its id written to the ``joined_evt_tumour_unit_ids`` port.

    Uses the module level ``task`` object for configuration, logging and ports.

    :raises Exception: when ``intersectBed`` exits with a non-zero code.
    """

    # Initialization

    task.check_conf(["entities", "repositories",
        "cnv.background.ensg", "cnv.mapping.ensg", "bin_paths.bed_tools"])

    conf = task.conf

    log = task.logger()

    evt_tunit_port, joined_evt_tunit_port = \
        task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    mapping_file = conf["cnv.mapping.ensg"]
    log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
    mapping_repo, mapping_path = rs.from_url(mapping_file)
    mapping_local_path = mapping_repo.get_local(mapping_path)

    background_file = conf["cnv.background.ensg"]
    log.info("Loading background from {} ...".format(background_file))

    background = set()
    repo, path = rs.from_url(background_file)
    reader = repo.open_reader(path)
    try:
        for line in reader:
            line = line.rstrip()
            if len(line) == 0:
                continue
            background.add(line)
    finally:
        reader.close()
        repo.close()

    intersect_bin = os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed")

    for uid in evt_tunit_port:
        u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

        tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
        tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

        if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
            log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
            joined_evt_tunit_port.write(uid)
            continue

        log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

        cnv_evt_ids = u["cnv_evt_ids"]

        log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

        # (events id, name) -> OR-ed value
        data = {}

        tmp_path = mkdtemp(prefix = "evt_map_and_join_")
        log.debug("Temporary directory: {}".format(tmp_path))

        try:
            for eid in cnv_evt_ids:
                e = em.find(eid, types.CNV_EVENTS)
                if e is None:
                    log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
                    continue

                data_file = e["data_file"]

                log.debug("{} ...".format(data_file))

                repo, path = rs.from_url(data_file)
                try:
                    local_path = repo.get_local(path)
                    try:
                        # A former pre-processing step that rewrote the BED end
                        # coordinate was disabled as no longer necessary, so the
                        # events file is intersected as it is.

                        # Run BED tools to intersect event regions with gene names

                        tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

                        # NOTE(review): the command runs through the shell and
                        # the paths are not quoted, so they must not contain
                        # spaces or shell metacharacters.
                        cmd = " ".join([
                            intersect_bin,
                            "-a", mapping_local_path,
                            "-b", local_path,
                            "-s -wb",
                            ">{}".format(tmp_file2)])

                        log.debug(cmd)

                        retcode = subprocess.call(args = cmd, shell = True)

                        if retcode != 0:
                            raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))
                    finally:
                        # Release the local copy on failure too.
                        repo.close_local(local_path)

                    # Read BED tools results and load event data into memory

                    _read_intersect_results(log, tmp_file2, eid, data_file, data)
                finally:
                    repo.close()
        finally:
            if os.path.exists(tmp_path):
                log.debug("Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        # Write events data to data file and merge with background labels

        log.info("Writing data to {} ...".format(tunit_path))

        u["data_file"] = data_repo.url(tunit_path)
        #TODO u["data_timestamp"] = ...

        writer = data_repo.open_writer(tunit_path)
        try:
            # header
            for name in cnv_evt_ids:
                writer.write("\t")
                writer.write(name)
            writer.write("\n")

            # data
            for row_name in sorted(background):
                writer.write(row_name)
                for col_name in cnv_evt_ids:
                    writer.write("\t")
                    writer.write(str(data.get((col_name, row_name), 0)))
                writer.write("\n")
        finally:
            writer.close()

        log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        joined_evt_tunit_port.write(uid)

    em.close()
    es.close()

    mapping_repo.close_local(mapping_local_path)
    mapping_repo.close()
    data_repo.close()
    rs.close()
def main():
    """Calculate the results file of every CNV combination received.

    Each item of the ``combinations`` port is turned into a data element.
    Unless ``skip_file`` says its results file can be reused, ``combination``
    is run for the "gain" and "loss" conditions and the element is persisted
    as ``types.CNV_COMBINATION``. The combination id is written to the
    ``combination_ids`` port in both cases.

    Uses the module level ``task`` object for configuration, logging and ports.
    """

    # Initialization

    task.check_conf(["entities", "repositories", "repositories.data", "bin_paths.gitools"])

    conf = task.conf
    log = task.logger()

    in_port, out_port = task.ports("combinations", "combination_ids")

    entity_server = EntityServer(conf["entities"])
    entity_manager = entity_server.manager()

    repo_server = RepositoryServer(conf["repositories"])
    data_repo = repo_server.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    base_path = types.CNV_COMBINATION.replace(".", "/")

    conditions = ("gain", "loss")

    # Run

    for native in in_port:
        combo = DataFactory.from_native(native, key_sep = "/")

        combo_id = combo["id"]

        labels = ", ".join(
            (combo["icdo_topography"], combo["icdo_morphology"], combo["id_type"]))

        log.info("Processing combination for ({}) [{}] ...".format(labels, combo_id))

        results_path = rpath.join(base_path, combo_id + ".tsv.gz")
        results_url = data_repo.url(results_path)

        if skip_file(overwrite, data_repo, results_path, combo.get("results_file")):
            log.warn("Skipping {} ({}) [{}] as it already exists".format(
                types.CNV_COMBINATION, labels, combo_id))
        else:
            combo["results_file"] = results_url

            combination(log, conf, repo_server, combo, data_repo, results_path, conditions)

            # save combination results
            entity_manager.persist(combo, types.CNV_COMBINATION)

        # the id is forwarded whether the results were calculated or reused
        out_port.write(combo_id)

    entity_manager.close()
    entity_server.close()
    data_repo.close()
    repo_server.close()
def run(task):
    """Calculate Oncodrive results for probes of mrna log2r tumour units.

    For every id read from the ``log2r_tumour_unit_ids`` port the unit data
    matrix and the cutoffs indexed under the same
    ``(study_id, platform_id, icdo_topography, icdo_morphology)`` key are
    loaded. ``run_oncodrive`` is run for upregulation ("gt" the upreg cutoff)
    and downregulation ("lt" the downreg cutoff). Both results are joined in
    memory and written to the data repository. The
    ``types.MRNA_ONCODRIVE_PROBES`` entity is persisted and its id written to
    the ``oncodrive_results_ids`` port.

    Units whose results file can be reused (according to ``skip_file``) are
    forwarded without recalculation. Failures of ``run_oncodrive`` are logged
    and re-raised.

    :param task: task object giving access to configuration, logger and ports.
    """

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["log2r_tumour_unit_ids"])
    task.check_out_ports(["oncodrive_results_ids"])

    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
    oncodrive_results_port = task.ports["oncodrive_results_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Indexing available oncodrive results for probes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ONCODRIVE_PROBES, unique=True)

    log.info("Indexing available mrna log2r cutoffs ...")
    log2r_cutoff_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_CUTOFF, unique=True)

    results_base_path = types.MRNA_ONCODRIVE_PROBES.replace(".", "/")

    for log2r_unit_id in log2r_tumour_unit_port:
        u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT, log2r_unit_id))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
        if key in oncodrive_results_index:
            eid = oncodrive_results_index[key][0]
            e = em.find(eid, types.MRNA_ONCODRIVE_PROBES)
            if e is None:
                log.error("{} not found: {}".format(
                    types.MRNA_ONCODRIVE_PROBES, eid))
                continue
        else:
            e = u.transform([
                "study_id", "platform_id", "icdo_topography", "icdo_morphology"])
            eid = e["id"] = str(uuid.uuid4())

        log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(
            types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))
        log.debug("{} id is {}".format(types.MRNA_ONCODRIVE_PROBES, eid))

        # create oncodrive results entity
        e["log2r_tumour_unit_id"] = log2r_unit_id

        results_path = rpath.join(results_base_path, eid + ".tsv.gz")

        if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
            log.warn("Skipping ({}) [{}] as it already exists".format(
                ", ".join(key), eid))
            oncodrive_results_port.write(eid)
            continue

        e["results_file"] = data_repo.url(results_path)

        # data matrix for oncodrive calculation
        file_repo = u["data_file/repo"]
        matrix_repo = rs.repository(file_repo)

        file_path = u["data_file/path"]
        file_name = u["data_file/file"]
        matrix_path = os.path.join(file_path, file_name)

        # Load calculated cutoff

        log.info("Loading mrna cutoff for key ({}) ...".format(", ".join(key)))

        if key not in log2r_cutoff_index:
            log.error("mrna log2r cuttof not found for key ({})".format(
                ", ".join(key)))
            matrix_repo.close()
            continue

        cutoff_id = log2r_cutoff_index[key][0]
        cutoff = em.find(cutoff_id, types.MRNA_LOG2R_CUTOFF)
        if cutoff is None:
            log.error("mrna log2r cuttof for key ({}) [{}] couldn't be loaded".
                format(", ".join(key), cutoff_id))
            matrix_repo.close()
            continue

        log.debug("{} id is {}".format(types.MRNA_LOG2R_CUTOFF, cutoff_id))

        # Upregulation & downregulation

        # Both names are bound before the try so that the cleanup below never
        # hits an undefined variable (which would hide the original error).
        tmp_path = None
        matrix_local_path = None
        try:
            from tempfile import mkdtemp
            tmp_path = mkdtemp(prefix="mrna_oncodrive_calc_")
            log.debug("Temporary directory: {}".format(tmp_path))

            matrix_local_path = matrix_repo.get_local(matrix_path)
            log.debug("Matrix path: {}".format(matrix_path))

            # The local matrix is released once, by the finally clause below.
            try:
                log.info("Calculating Upregulation with cutoff {} ...".format(
                    cutoff["upreg/cutoff"]))
                upreg_results = run_oncodrive(
                    conf, log, e, "upreg", matrix_local_path,
                    "gt", cutoff["upreg/cutoff"], tmp_path)
            except Exception:
                log.error("Oncodrive calculation for upreg failed")
                raise

            try:
                log.info(
                    "Calculating Downregulation with cutoff {} ...".format(
                        cutoff["downreg/cutoff"]))
                downreg_results = run_oncodrive(
                    conf, log, e, "downreg", matrix_local_path,
                    "lt", cutoff["downreg/cutoff"], tmp_path)
            except Exception:
                log.error("Oncodrive calculation for downreg failed")
                raise

            # Join upreg & downreg results

            log.info("Joining upreg & downreg results into memory ...")

            # the join is done in memory with a map
            dmap = read_data_map(log, upreg_results, downreg_results)

            log.info("Writting joined results to {} ...".format(results_path))

            results_local_path = data_repo.create_local(results_path)

            write_data_map(dmap, results_local_path)
        finally:
            if matrix_local_path is not None:
                matrix_repo.close_local(matrix_local_path)
            matrix_repo.close()
            if tmp_path is not None and os.path.exists(tmp_path):
                log.debug(
                    "Removing temporary directory {} ...".format(tmp_path))
                import shutil
                shutil.rmtree(tmp_path)

        data_repo.put_local(results_local_path)

        em.persist(e, types.MRNA_ONCODRIVE_PROBES)
        oncodrive_results_port.write(eid)

    em.close()
    # The entity server is released too, as the sibling tasks do.
    es.close()
    data_repo.close()
    rs.close()
def run(task):
    """Calculate upregulation and downregulation cutoffs for tumour units.

    For every id read from the ``log2r_tumour_unit_ids`` port, ``calc_cutoff``
    is run for "upreg" and "downreg" on the unit data matrix. The cutoffs and
    the location of their plot files are stored in a
    ``types.MRNA_LOG2R_CUTOFF`` entity, and the unit id is written to the
    ``processed_log2r_tumour_unit_ids`` port.

    Units that already have both cutoffs are forwarded without recalculation
    unless the ``overwrite`` configuration is enabled.

    :param task: task object giving access to configuration, logger and ports.
    :return: -1 when a cutoff calculation fails, None otherwise.
    """

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.R"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["log2r_tumour_unit_ids"])
    task.check_out_ports(["processed_log2r_tumour_unit_ids"])

    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
    processed_log2r_tumour_unit_port = task.ports["processed_log2r_tumour_unit_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    # The try/finally releases the servers on the early "return -1" paths too.
    try:
        overwrite = conf.get("overwrite", False, dtype=bool)

        # Run

        log.info("Indexing available mrna log2r cutoffs ...")
        log2r_cutoff_index = em.group_ids(
            ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
            types.MRNA_LOG2R_CUTOFF, unique = True)

        cutoff_path = types.MRNA_LOG2R_CUTOFF.replace(".", "/")

        if "mrna.log2r_slope_cutoff.slope" in conf:
            slope = conf["mrna.log2r_slope_cutoff.slope"]
        else:
            slope = str(-0.05)

        # (condition key, label used in progress messages, label used in errors)
        conditions = (
            ("upreg", "Upregulation", "Upreg"),
            ("downreg", "Downregulation", "Downreg"))

        for log2r_unit_id in log2r_tumour_unit_port:
            u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
            if u is None:
                log.error("%s not found: %s" % (types.MRNA_LOG2R_TUMOUR_UNIT, log2r_unit_id))
                continue

            key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])

            if key in log2r_cutoff_index:
                eid = log2r_cutoff_index[key][0]
                e = em.find(eid, types.MRNA_LOG2R_CUTOFF)
                if e is None:
                    log.error("%s not found: %s" % (types.MRNA_LOG2R_CUTOFF, eid))
                    continue
                # Skip only when BOTH cutoffs are already there.
                if ("upreg/cutoff" in e) and ("downreg/cutoff" in e) and not overwrite:
                    log.warn("Skipping (%s) [%s] as it already exists" % (", ".join(key), eid))
                    processed_log2r_tumour_unit_port.write(log2r_unit_id)
                    continue
            else:
                e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
                eid = e["id"] = str(uuid.uuid4())

            log.info("Calculating cutoffs for {} ({}) [{}] ...".format(types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))
            log.debug("{} id is {}".format(types.MRNA_LOG2R_CUTOFF, eid))

            file_repo = u["data_file/repo"]
            matrix_repo = rs.repository(file_repo)

            file_path = u["data_file/path"]
            file_name = u["data_file/file"]
            matrix_path = os.path.join(file_path, file_name)

            log.debug("slope = {}".format(slope))

            for condition, label, short_label in conditions:
                log.info("{} ...".format(label))

                try:
                    cutoff, _cutoff_file, plot_file = calc_cutoff(
                        conf, log, log2r_unit_id, matrix_repo, matrix_path,
                        data_repo, cutoff_path, condition, slope)
                except Exception as ex:
                    # Named `ex` so the cutoff entity `e` is not shadowed.
                    log.error("{} cutoff calculation for {} ({}) [{}] failed".format(
                        short_label, types.MRNA_LOG2R_TUMOUR_UNIT, ",".join(key), log2r_unit_id))
                    log.exception(ex)
                    return -1

                log.debug("{} cutoff = {}".format(label, cutoff))

                e[condition + "/cutoff"] = cutoff

                e[condition + "/plot_file"] = pf = e.create_element()
                pf["repo"] = data_repo.name()
                pf["path"] = os.path.dirname(plot_file)
                pf["file"] = os.path.basename(plot_file)

            em.persist(e, types.MRNA_LOG2R_CUTOFF)
            processed_log2r_tumour_unit_port.write(log2r_unit_id)
    finally:
        em.close()
        es.close()
        data_repo.close()
        rs.close()
def run(task):
    """Join the log2r assay matrices of every tumour unit into one matrix.

    For every id read from the ``log2r_tumour_unit_ids`` port, the data files
    of the assays listed in ``mrna_log2r_ids`` are fetched locally and joined
    with the ``bin_paths.matrix_join`` script. The joined file is stored in
    the unit repository, the unit ``data_file`` is updated and persisted, and
    the unit id is written to the ``joined_log2r_tumour_unit_ids`` port.

    Units without assays, with missing assays/files or whose join command
    fails are reported and skipped. Units already joined are forwarded as
    they are unless the ``overwrite`` configuration is enabled.

    :param task: task object giving access to configuration, logger and ports.
    """

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.matrix_join", "bin_paths.python"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["log2r_tumour_unit_ids"])
    task.check_out_ports(["joined_log2r_tumour_unit_ids"])

    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
    joined_log2r_tumour_unit_port = task.ports["joined_log2r_tumour_unit_ids"]

    python_bin = conf["bin_paths.python"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    try:
        overwrite = conf.get("overwrite", False, dtype=bool)

        # Run

        unit_base_path = types.MRNA_LOG2R_TUMOUR_UNIT.replace(".", "/")

        for log2r_unit_id in log2r_tumour_unit_port:
            u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
            if u is None:
                log.error("%s not found: %s" % (types.MRNA_LOG2R_TUMOUR_UNIT, log2r_unit_id))
                continue

            uid = u["id"]
            study_id = u["study_id"]
            platform_id = u["platform_id"]
            icdo_topography = u["icdo_topography"]
            icdo_morphology = u["icdo_morphology"]

            key = (study_id, platform_id, icdo_topography, icdo_morphology)

            log.info("Joining columns for {} ({}) [{}] ...".format(types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))

            if "mrna_log2r_ids" not in u:
                log.warn("Discarding empty unit (%s) [%s]" % (", ".join(key), log2r_unit_id))
                continue

            unit_repo = data_repo
            if "data_file" in u:
                unit_repo = rs.repository(u["data_file/repo"])
                unit_repo_path = os.path.join(u["data_file/path"], u["data_file/file"])
                exists = unit_repo is not None and unit_repo.exists(unit_repo_path)
            else:
                unit_repo_path = os.path.join(unit_base_path, log2r_unit_id + ".tsv.gz")
                exists = False

            if exists and not overwrite:
                log.warn("Skipping log2r tumour unit data join (%s) [%s] as it already exists in %s" % (", ".join(key), log2r_unit_id, unit_repo_path))
                joined_log2r_tumour_unit_port.write(uid)
                continue

            # (repository, local path) of every assay file fetched so far;
            # all of them are released in the finally clause below, whatever
            # the outcome of the join.
            fetched = []
            try:
                valid = True
                for log2r_id in u["mrna_log2r_ids"]:
                    e = em.find(log2r_id, types.MRNA_LOG2R)
                    if e is None:
                        log.error("log2r assay '%s' not found" % log2r_id)
                        valid = False
                        break

                    repo = rs.repository(e["data_file/repo"])
                    repo_path = os.path.join(e["data_file/path"], e["data_file/name"])
                    if repo is None or not repo.exists(repo_path):
                        log.error("File not found: %s" % repo_path)
                        valid = False
                        break

                    fetched.append((repo, repo.get_local(repo_path)))

                if not valid:
                    log.info("Skipping log2r tumour unit (%s) [%s] as there were errors" % (", ".join(key), log2r_unit_id))
                    continue

                files = [local_path for _, local_path in fetched]

                if exists:
                    unit_local_path = unit_repo.get_local(unit_repo_path)
                else:
                    unit_local_path = unit_repo.create_local(unit_repo_path)

                # NOTE(review): the command runs through the shell and the
                # input paths are not quoted, so they must not contain spaces
                # or shell metacharacters.
                cmd = " ".join([
                    python_bin, conf["bin_paths.matrix_join"],
                    "-o '%s'" % unit_local_path,
                    "-C '${filename_noext}'",
                    "--skip-empty",
                    " ".join(files)])

                log.debug(cmd)

                retcode = subprocess.call(args = cmd, shell = True)

                if retcode != 0:
                    log.error("There was an error joining matrices:\n%s" % "\n".join(files))
                    continue
            finally:
                for repo, local_path in fetched:
                    repo.close_local(local_path)

            unit_repo.put_local(unit_local_path)

            df = u["data_file"] = u.create_element()
            df["repo"] = unit_repo.name()
            df["path"] = os.path.dirname(unit_repo_path)
            df["file"] = os.path.basename(unit_repo_path)

            em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
            joined_log2r_tumour_unit_port.write(uid)
    finally:
        em.close()
        es.close()
        data_repo.close()
        rs.close()