def run(task):
    """Run module enrichment for every mRNA oncodrive (genes) result received.

    Oncodrive entity ids are read from the ``oncodrive_ids`` port. For each of
    them, and for each usable module configuration found under
    ``mrna.enrichment``, ``enrichment(...)`` is called; when it reports a valid
    result the enrichment entity is persisted and its id is written to the
    ``enrichment_ids`` port. Results that ``skip_file`` reports as already
    present are not recomputed, but their id is still forwarded.

    Returns:
        -1 when ``mrna.enrichment`` has no ``modules`` section, 0 when no
        module configuration has all its required fields, ``None`` otherwise.
    """

    # Initialization

    task.check_conf(["entities", "repositories", "repositories.data", "repositories.source",
                     "mrna.enrichment", "bin_paths.gitools"])

    conf = task.conf

    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["enrichment_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    enrichment_port = task.ports["enrichment_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    # Everything below runs under try/finally so that the early returns (and
    # any unexpected error) still release the entity and repository handles.
    try:
        overwrite = conf.get("overwrite", False, dtype=bool)

        # retrieve enrichment configurations
        ec = conf["mrna.enrichment"]
        if "default" in ec:
            default = ec["default"]
        else:
            default = conf.create_element()

        if "modules" not in ec:
            log.error("There is no enrichment modules section available in mrna.enrichment")
            return -1

        log.info("Reading modules configuration ...")

        econfs = list()
        for mod in ec["modules"]:
            # each module configuration inherits the defaults and overrides them
            m = ec.create_element()
            m.merge(default)
            m.merge(mod)
            mf = m.missing_fields(["id_type", "test", "modules_file"])
            if len(mf) > 0:
                log.error("Enrichment configuration missing required fields: {}".format(", ".join(mf)))
                log.error("Module configuration: {}".format(m))
            else:
                econfs.append(m)
                log.debug("{} -> {}".format(m["id_type"], m["modules_file"]))

        if not econfs:
            log.error("There are no enrichment configurations available in mrna.enrichment")
            return 0

        results_base_path = types.MRNA_ENRICHMENT.replace(".", "/")

        log.info("Indexing available enrichment results ...")
        enrichment_results_index = em.group_ids(
            ["study_id", "platform_id", "icdo_topography", "icdo_morphology", "id_type"],
            types.MRNA_ENRICHMENT, unique=True)

        for oid in oncodrive_port:
            o = em.find(oid, types.MRNA_ONCODRIVE_GENES)
            if o is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, oid))
                continue

            okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

            log.info("Enrichment for oncodrive results ({0}) [{1}] ...".format(", ".join(okey), oid))

            # named mod_conf (not ec) so the mrna.enrichment element is not shadowed
            for mod_conf in econfs:
                log.info("Module {} [{}] ...".format(mod_conf["id_type"], mod_conf["modules_file"]))

                key = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"],
                       mod_conf["id_type"])

                if key in enrichment_results_index:
                    # reuse the entity already registered for this key
                    eid = enrichment_results_index[key][0]
                    e = em.find(eid, types.MRNA_ENRICHMENT)
                    if e is None:
                        log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
                        continue
                else:
                    e = o.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
                    e["id"] = eid = str(uuid.uuid4())
                    e["id_type"] = mod_conf["id_type"]

                # enrichment results

                results_path = rpath.join(results_base_path, eid + ".tsv.gz")

                if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
                    log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
                    enrichment_port.write(eid)
                    continue

                valid = enrichment(log, conf, rs, data_repo, results_path, o["results_file"], e, mod_conf,
                                   ["id", "upreg_corrected_right_p_value", "downreg_corrected_right_p_value"],
                                   ["id", "upreg", "downreg"])

                # save mapped results
                if valid:
                    em.persist(e, types.MRNA_ENRICHMENT)
                    enrichment_port.write(eid)
    finally:
        em.close()
        es.close()
        data_repo.close()
        rs.close()
def run(task):
    """Map probe-level mRNA oncodrive results to genes.

    Probe-level oncodrive entity ids are read from the ``oncodrive_ids`` port.
    For each one a platform mapping file is selected, the external
    ``matrix_map`` tool is run through the configured python binary, repeated
    ids are merged with ``merge(...)`` and the resulting gene-level entity is
    persisted. Its id is written to the ``mapped_oncodrive_ids`` port.

    Mapping file selection:

    * platforms whose ``SO/platform_id_type`` is not ``genbank_accession``
      must declare ``ensg_map/file`` and the file must exist in the source
      repository under ``platform/``;
    * ``genbank_accession`` platforms use their own mapping when it is
      declared and exists, otherwise the one of the vplatform entity whose id
      is ``<platform_id>-<study_id>`` (under ``vplatform/``).

    Units that cannot be mapped are logged and skipped.
    """

    # Initialization

    task.check_conf(
        [
            "entities",
            "repositories",
            "repositories.data",
            "repositories.source",
            "bin_paths.python",
            "bin_paths.matrix_map",
        ]
    )

    conf = task.conf

    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["mapped_oncodrive_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    mapped_oncodrive_port = task.ports["mapped_oncodrive_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")
    source_repo = rs.repository("source")

    overwrite = conf.get("overwrite", False, dtype=bool)

    platform_base_path = "platform"
    vplatform_base_path = "vplatform"

    results_base_path = types.MRNA_ONCODRIVE_GENES.replace(".", "/")

    log.info("Indexing available oncodrive results for genes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"], types.MRNA_ONCODRIVE_GENES, unique=True
    )

    for oid in oncodrive_port:
        o = em.find(oid, types.MRNA_ONCODRIVE_PROBES)
        if o is None:
            log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_PROBES, oid))
            continue

        study_id = o["study_id"]
        platform_id = o["platform_id"]
        key = (study_id, platform_id, o["icdo_topography"], o["icdo_morphology"])

        if key in oncodrive_results_index:
            # reuse the gene-level entity already registered for this key
            mid = oncodrive_results_index[key][0]
            m = em.find(mid, types.MRNA_ONCODRIVE_GENES)
            if m is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, mid))
                continue
        else:
            m = o.transform(
                [
                    "study_id",
                    "platform_id",
                    "icdo_topography",
                    "icdo_morphology",
                    "log2r_tumour_unit_id",
                    ("oncodrive_probes_id", "id"),
                ]
            )
            m["id"] = mid = str(uuid.uuid4())

        # mapped oncodrive results

        results_path = rpath.join(results_base_path, mid + ".tsv.gz")
        gitools_results_path = rpath.join(results_base_path, mid + ".tdm.gz")

        if skip_file(overwrite, data_repo, results_path, m.get("results_file")):
            log.warn("Skipping ({0}) [{1}] as it already exists".format(", ".join(key), mid))
            mapped_oncodrive_port.write(mid)
            continue

        log.info("Mapping oncodriver results ({0}) [{1}] ...".format(", ".join(key), oid))

        # determine the mapping file
        map_file = None

        p = em.find(platform_id, types.SOURCE_PLATFORM)
        if p is None:
            log.error("{0} not found: {1}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        platform_id_type = p.get("SO/platform_id_type")
        if platform_id_type is None:
            log.error("Undefined annotation 'SO/platform_id_type' for platform '{0}'.".format(platform_id))
            continue

        # The two branches below are exhaustive, so no "unknown type" branch
        # is needed after them.
        if platform_id_type != "genbank_accession":  # affy_accession, custom, ...
            missing = p.missing_fields(["ensg_map", "ensg_map/file"])
            if len(missing) > 0:
                log.error("Missing required fields for platform '{0}': {1}".format(platform_id, ", ".join(missing)))
                continue
            map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if not source_repo.exists(map_file):
                log.error("Mapping file not found for platform '{0}': {1}".format(platform_id, map_file))
                continue
        else:
            if len(p.missing_fields(["ensg_map", "ensg_map/file"])) > 0:
                map_file = None
            else:
                map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])

            if map_file is None or not source_repo.exists(map_file):
                # fall back to the study specific virtual platform
                vpid = "-".join([platform_id, study_id])
                vp = em.find(vpid, types.SOURCE_VPLATFORM)
                if vp is None:
                    log.error("{0} not found: {1}".format(types.SOURCE_VPLATFORM, vpid))
                    continue
                missing = vp.missing_fields(["ensg_map", "ensg_map/path", "ensg_map/file"])
                if len(missing) > 0:
                    log.error("Missing required fields for vplatform '{0}': {1}".format(vpid, ", ".join(missing)))
                    continue
                map_file = rpath.join(vplatform_base_path, vp["ensg_map/path"], vp["ensg_map/file"])
                if not source_repo.exists(map_file):
                    log.error(
                        "Mapping file not found for vplatform ({0}, {1}): {2}".format(platform_id, study_id, map_file)
                    )
                    continue

        log.debug("Mapping file: {0}".format(map_file))

        m["platform_map_file"] = source_repo.url(map_file)

        # oncodrive results file
        repo, repo_path = rs.from_url(o["results_file"])
        local_path = repo.get_local(repo_path)

        # mapped oncodrive results
        m["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)
        # NOTE(review): this local path is handed to merge() but is never
        # put_local()/close_local()'d afterwards - confirm whether the gitools
        # file is meant to be uploaded to the data repository.
        gitools_results_local_path = data_repo.create_local(gitools_results_path)

        mapping_path = rpath.join(results_base_path, mid + ".mapping.tsv.gz")
        m["mapping_file"] = data_repo.url(mapping_path)
        mapping_local_path = data_repo.create_local(mapping_path)

        # mkstemp returns an already open descriptor; only the path is needed
        # here, so close it right away instead of leaking it on every unit.
        tmp_fd, map_results_file = tempfile.mkstemp(prefix="mrna_oncodrive_map_", suffix=".tsv")
        os.close(tmp_fd)

        # Bound before the try so the cleanup below never hits an unbound name
        # when get_local() itself is what failed.
        local_map_file = None

        try:
            # run the mapping tool
            local_map_file = source_repo.get_local(map_file)

            log.debug("Mapping {0} to {1} ...".format(repo_path, map_results_file))

            cmd = " ".join(
                [
                    conf["bin_paths.python"],
                    conf["bin_paths.matrix_map"],
                    "-o",
                    map_results_file,
                    "-i",
                    mapping_local_path,
                    local_path,
                    local_map_file,
                ]
            )

            log.debug(cmd)

            retcode = subprocess.call(args=cmd, shell=True)

            if retcode != 0:
                raise Exception("There was an error mapping the results")

            # merge repeated ids

            log.debug("Merging {0} to {1} ...".format(map_results_file, results_path))
            log.debug("Gitools file: {0}".format(gitools_results_path))

            upreg_count, downreg_count = merge(log, map_results_file, results_local_path, gitools_results_local_path)
            if upreg_count == 0 and downreg_count == 0:
                log.error(
                    "The results of the mapping for ({0}) are empty. "
                    "This could be because the annotated platform or the mapping file is wrong.".format(
                        ", ".join(key)
                    )
                )

            # close local paths
            data_repo.put_local(results_local_path)
            data_repo.put_local(mapping_local_path)

        except Exception as ex:
            # a failed unit is reported and skipped; the task goes on
            log.exception(ex)

            data_repo.close_local(results_local_path)
            data_repo.close_local(mapping_local_path)
            continue

        finally:
            os.remove(map_results_file)
            repo.close_local(local_path)
            if local_map_file is not None:
                source_repo.close_local(local_map_file)

        # save mapped results
        em.persist(m, types.MRNA_ONCODRIVE_GENES)
        mapped_oncodrive_port.write(mid)

    em.close()
    es.close()  # release the entity server as well, like the sibling tasks do
    data_repo.close()
    source_repo.close()
    rs.close()
def main():
    """Calculate CNV oncodrive results (gain and loss) for tumour units.

    Uses the module-level ``task``. Tumour unit ids are read from the
    ``evt_tumour_unit_ids`` port. For each unit the data matrix is filtered
    with ``mask_filtering`` (mask 1 for gain, mask 2 for loss) into a
    temporary file, ``run_oncodrive`` is run on each filtered matrix, and both
    results are joined with ``read_data_map`` / ``write_data_map`` into the
    data repository. The resulting entity is persisted and its id written to
    the ``oncodrive_results_ids`` port.

    A failure while calculating a unit is logged and re-raised; the local
    matrix copy and the temporary directory are released first.
    """

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    evt_tumour_unit_port, oncodrive_results_port = \
        task.ports("evt_tumour_unit_ids", "oncodrive_results_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Indexing available {} ...".format(types.CNV_ONCODRIVE_GENES))
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.CNV_ONCODRIVE_GENES, unique=True)

    results_base_path = types.CNV_ONCODRIVE_GENES.replace(".", "/")

    for uid in evt_tumour_unit_port:
        u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
        if key in oncodrive_results_index:
            # reuse the results entity already registered for this key
            eid = oncodrive_results_index[key][0]
            e = em.find(eid, types.CNV_ONCODRIVE_GENES)
            if e is None:
                log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, eid))
                continue
        else:
            e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
            eid = e["id"] = str(uuid.uuid4())

        # create oncodrive results entity
        e["evt_tumour_unit_id"] = uid

        results_path = rpath.join(results_base_path, eid + ".tsv.gz")

        if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
            log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
            oncodrive_results_port.write(eid)
            continue

        e["results_file"] = data_repo.url(results_path)

        # data matrix for oncodrive calculation
        matrix_repo, matrix_path = rs.from_url(u["data_file"])

        # Gain & Loss

        log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(
            types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))
        log.debug("{} id is {}".format(types.CNV_ONCODRIVE_GENES, eid))

        tmp_path = mkdtemp(prefix="cnv_oncodrive_calc_")
        log.debug("Temporary directory: {}".format(tmp_path))
        tmp_file = os.path.join(tmp_path, "filtered_data.tsv")

        # Fetched inside the try so the temporary directory is removed even
        # when the matrix cannot be retrieved.
        matrix_local_path = None

        try:
            matrix_local_path = matrix_repo.get_local(matrix_path)
            log.debug("Matrix path: {}".format(matrix_path))

            try:
                log.info("Calculating Gain ...")
                log.debug("Bit mask filtering (01) {} to {} ...".format(matrix_local_path, tmp_file))
                mask_filtering(matrix_local_path, tmp_file, 1)
                gain_results = run_oncodrive(
                    conf, log, e, "gain", tmp_file, tmp_path)
            except Exception:
                # cleanup is left to the finally below so it happens only once
                log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for gain failed".format(
                    ",".join(key), uid))
                raise

            try:
                log.info("Calculating Loss ...")
                log.debug("Bit mask filtering (10) {} to {} ...".format(matrix_local_path, tmp_file))
                mask_filtering(matrix_local_path, tmp_file, 2)
                loss_results = run_oncodrive(
                    conf, log, e, "loss", tmp_file, tmp_path)
            except Exception:
                log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for loss failed".format(
                    ",".join(key), uid))
                raise

            # Join gain & loss results

            log.info("Joining gain & loss results into memory ...")

            # the join is done in memory with a map
            dmap = read_data_map(log, gain_results, loss_results)

            log.info("Writting joined data to {} ...".format(results_path))

            results_local_path = data_repo.create_local(results_path)

            write_data_map(dmap, results_local_path)

        finally:
            if matrix_local_path is not None:
                matrix_repo.close_local(matrix_local_path)
            matrix_repo.close()

            if os.path.exists(tmp_path):
                log.debug("Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        data_repo.put_local(results_local_path)

        em.persist(e, types.CNV_ONCODRIVE_GENES)
        oncodrive_results_port.write(eid)

    em.close()
    es.close()  # release the entity server as well, like the sibling tasks do
    data_repo.close()
    rs.close()
def run(task):
    """Calculate mRNA oncodrive results (upreg and downreg) for tumour units.

    Log2r tumour unit ids are read from the ``log2r_tumour_unit_ids`` port.
    For each unit the previously calculated cutoff entity with the same
    (study, platform, topography, morphology) key is loaded, ``run_oncodrive``
    is run once with ``"gt"`` and the upreg cutoff and once with ``"lt"`` and
    the downreg cutoff, and both results are joined with ``read_data_map`` /
    ``write_data_map`` into the data repository. The resulting entity is
    persisted and its id written to the ``oncodrive_results_ids`` port.

    Units without a cutoff are logged and skipped. A failure while
    calculating a unit is logged and re-raised; the local matrix copy and the
    temporary directory are released first.
    """

    # Kept function-local, as in the original body, but hoisted out of the
    # try/finally where they were buried.
    import shutil
    from tempfile import mkdtemp

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["log2r_tumour_unit_ids"])
    task.check_out_ports(["oncodrive_results_ids"])

    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
    oncodrive_results_port = task.ports["oncodrive_results_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Indexing available oncodrive results for probes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ONCODRIVE_PROBES, unique=True)

    log.info("Indexing available mrna log2r cutoffs ...")
    log2r_cutoff_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_CUTOFF, unique=True)

    results_base_path = types.MRNA_ONCODRIVE_PROBES.replace(".", "/")

    for log2r_unit_id in log2r_tumour_unit_port:
        u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT, log2r_unit_id))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
        if key in oncodrive_results_index:
            # reuse the results entity already registered for this key
            eid = oncodrive_results_index[key][0]
            e = em.find(eid, types.MRNA_ONCODRIVE_PROBES)
            if e is None:
                log.error("{} not found: {}".format(
                    types.MRNA_ONCODRIVE_PROBES, eid))
                continue
        else:
            e = u.transform([
                "study_id", "platform_id", "icdo_topography", "icdo_morphology"
            ])
            eid = e["id"] = str(uuid.uuid4())

        log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(
            types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))
        log.debug("{} id is {}".format(types.MRNA_ONCODRIVE_PROBES, eid))

        # create oncodrive results entity
        e["log2r_tumour_unit_id"] = log2r_unit_id

        results_path = rpath.join(results_base_path, eid + ".tsv.gz")

        if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
            log.warn("Skipping ({}) [{}] as it already exists".format(
                ", ".join(key), eid))
            oncodrive_results_port.write(eid)
            continue

        e["results_file"] = data_repo.url(results_path)

        # data matrix for oncodrive calculation
        file_repo = u["data_file/repo"]
        matrix_repo = rs.repository(file_repo)

        file_path = u["data_file/path"]
        file_name = u["data_file/file"]
        matrix_path = os.path.join(file_path, file_name)

        # Load calculated cutoff

        log.info("Loading mrna cutoff for key ({}) ...".format(", ".join(key)))

        if key not in log2r_cutoff_index:
            log.error("mrna log2r cuttof not found for key ({})".format(
                ", ".join(key)))
            matrix_repo.close()
            continue

        cutoff_id = log2r_cutoff_index[key][0]
        cutoff = em.find(cutoff_id, types.MRNA_LOG2R_CUTOFF)
        if cutoff is None:
            log.error("mrna log2r cuttof for key ({}) [{}] couldn't be loaded".format(
                ", ".join(key), cutoff_id))
            matrix_repo.close()
            continue

        log.debug("{} id is {}".format(types.MRNA_LOG2R_CUTOFF, cutoff_id))

        # Upregulation & downregulation

        # Bound before the try so the cleanup below never hits an unbound
        # name when mkdtemp() or get_local() is what failed.
        tmp_path = None
        matrix_local_path = None

        try:
            tmp_path = mkdtemp(prefix="mrna_oncodrive_calc_")
            log.debug("Temporary directory: {}".format(tmp_path))

            matrix_local_path = matrix_repo.get_local(matrix_path)
            log.debug("Matrix path: {}".format(matrix_path))

            try:
                log.info("Calculating Upregulation with cutoff {} ...".format(
                    cutoff["upreg/cutoff"]))
                upreg_results = run_oncodrive(
                    conf, log, e, "upreg", matrix_local_path,
                    "gt", cutoff["upreg/cutoff"], tmp_path)
            except Exception:
                # cleanup is left to the finally below so it happens only once
                log.error("Oncodrive calculation for upreg failed")
                raise

            try:
                log.info("Calculating Downregulation with cutoff {} ...".format(
                    cutoff["downreg/cutoff"]))
                downreg_results = run_oncodrive(
                    conf, log, e, "downreg", matrix_local_path,
                    "lt", cutoff["downreg/cutoff"], tmp_path)
            except Exception:
                log.error("Oncodrive calculation for downreg failed")
                raise

            # Join upreg & downreg results

            log.info("Joining upreg & downreg results into memory ...")

            # the join is done in memory with a map
            dmap = read_data_map(log, upreg_results, downreg_results)

            log.info("Writting joined results to {} ...".format(results_path))

            results_local_path = data_repo.create_local(results_path)

            write_data_map(dmap, results_local_path)

        finally:
            if matrix_local_path is not None:
                matrix_repo.close_local(matrix_local_path)
            matrix_repo.close()

            if tmp_path is not None and os.path.exists(tmp_path):
                log.debug(
                    "Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        data_repo.put_local(results_local_path)

        em.persist(e, types.MRNA_ONCODRIVE_PROBES)
        oncodrive_results_port.write(eid)

    em.close()
    es.close()  # release the entity server as well, like the sibling tasks do
    data_repo.close()
    rs.close()
def main():
    """Compute the CNV combination for every combination description received.

    Uses the module-level ``task``. Each item read from the ``combinations``
    port is wrapped with ``DataFactory.from_native`` and handed to
    ``combination(...)`` for the conditions ``gain`` and ``loss``; the entity
    is then persisted and its id written to the ``combination_ids`` port.
    Items whose results ``skip_file`` reports as already present are only
    forwarded.
    """

    # Initialization

    task.check_conf(["entities", "repositories", "repositories.data", "bin_paths.gitools"])

    conf = task.conf
    log = task.logger()

    combinations_port, combination_ids_port = task.ports("combinations", "combination_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    conditions = ("gain", "loss")
    results_base_path = types.CNV_COMBINATION.replace(".", "/")

    # NOTE: a shortcut that reused the single input file when there was
    # nothing to combine used to be sketched here; it is not active.
    for native in combinations_port:
        combo = DataFactory.from_native(native, key_sep="/")

        combo_id = combo["id"]
        key = (combo["icdo_topography"], combo["icdo_morphology"], combo["id_type"])
        key_label = ", ".join(key)

        log.info("Processing combination for ({}) [{}] ...".format(key_label, combo_id))

        results_path = rpath.join(results_base_path, combo_id + ".tsv.gz")
        results_url = data_repo.url(results_path)

        already_there = skip_file(overwrite, data_repo, results_path, combo.get("results_file"))
        if already_there:
            log.warn("Skipping {} ({}) [{}] as it already exists".format(
                types.CNV_COMBINATION, key_label, combo_id))
        else:
            combo["results_file"] = results_url

            combination(log, conf, rs, combo, data_repo, results_path, conditions)

            # save combination results
            em.persist(combo, types.CNV_COMBINATION)

        # the id is forwarded whether the results were computed or reused
        combination_ids_port.write(combo_id)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
def _read_intersect_results(log, results_file, data_file, eid, data):
    """Fold one intersectBed output file into *data*.

    For every usable line, ``data[(eid, name)]`` is OR-ed with the line value,
    where ``name`` is taken from column 3 and the value from column 12
    (0-based, tab separated). Lines that cannot be parsed, or whose value is
    neither 1 nor 2, are reported through *log* (quoting *data_file*) and
    skipped.
    """
    name_index = 3
    value_index = 12

    reader = FileReader(results_file)
    try:
        # enumerate() keeps the reported line number right after skipped lines
        for line_num, line in enumerate(reader, 1):
            fields = line.rstrip().split("\t")
            try:
                name = fields[name_index]
                value = int(fields[value_index])
            except (IndexError, ValueError):
                log.error("Error parsing line {} of data file {}".format(line_num, data_file))
                continue

            if value not in (1, 2):
                log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
                continue

            k = (eid, name)
            data[k] = data.get(k, 0) | value
    finally:
        reader.close()


def main():
    """Map CNV events to genes and join them into one matrix per tumour unit.

    Uses the module-level ``task``. Tumour unit ids are read from the
    ``evt_tumour_unit_ids`` port. For every events entity of a unit,
    ``intersectBed`` is run between the gene regions file configured in
    ``cnv.mapping.ensg`` and the events file, and its output is folded into an
    in-memory map. A tab separated matrix is then written to the data
    repository with one row per label of the background file
    (``cnv.background.ensg``) and one column per events entity; missing cells
    are written as 0. The unit is persisted and its id written to the
    ``joined_evt_tumour_unit_ids`` port.

    Raises:
        Exception: when ``intersectBed`` exits with a non-zero code.
    """

    # Initialization

    task.check_conf(["entities", "repositories",
                     "cnv.background.ensg", "cnv.mapping.ensg",
                     "bin_paths.bed_tools"])

    conf = task.conf

    log = task.logger()

    evt_tunit_port, joined_evt_tunit_port = \
        task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    mapping_file = conf["cnv.mapping.ensg"]
    log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
    mapping_repo, mapping_path = rs.from_url(mapping_file)
    mapping_local_path = mapping_repo.get_local(mapping_path)

    background_file = conf["cnv.background.ensg"]
    log.info("Loading background from {} ...".format(background_file))

    # one row label per non-empty line
    background = set()
    repo, path = rs.from_url(background_file)
    reader = repo.open_reader(path)
    for line in reader:
        line = line.rstrip()
        if len(line) == 0:
            continue
        background.add(line)
    reader.close()
    repo.close()

    # does not depend on the unit, so it is computed once
    tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")

    for uid in evt_tunit_port:
        u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

        tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

        if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
            log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
            joined_evt_tunit_port.write(uid)
            continue

        log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

        cnv_evt_ids = u["cnv_evt_ids"]

        log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

        # (events id, row name) -> OR of the values seen for that pair
        data = {}

        tmp_path = mkdtemp(prefix="evt_map_and_join_")
        log.debug("Temporary directory: {}".format(tmp_path))

        try:
            for eid in cnv_evt_ids:
                e = em.find(eid, types.CNV_EVENTS)
                if e is None:
                    log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
                    continue

                data_file = e["data_file"]

                log.debug("{} ...".format(data_file))

                repo, path = rs.from_url(data_file)
                try:
                    local_path = repo.get_local(path)
                    try:
                        # Run BED tools to intersect event regions with gene names

                        tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

                        cmd = " ".join([
                            os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
                            "-a", mapping_local_path,
                            "-b", local_path,
                            "-s -wb",
                            ">{}".format(tmp_file2)])

                        log.debug(cmd)

                        retcode = subprocess.call(args=cmd, shell=True)

                        if retcode != 0:
                            raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(
                                ", ".join(key), eid, retcode))
                    finally:
                        # released on failure too, not only after a clean run
                        repo.close_local(local_path)

                    # Read BED tools results and load event data into memory
                    _read_intersect_results(log, tmp_file2, data_file, eid, data)
                finally:
                    repo.close()

        finally:
            if os.path.exists(tmp_path):
                log.debug("Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        # Write events data to data file and merge with background labels

        log.info("Writing data to {} ...".format(tunit_path))

        u["data_file"] = data_repo.url(tunit_path)
        #TODO u["data_timestamp"] = ...

        writer = data_repo.open_writer(tunit_path)

        # header
        for name in cnv_evt_ids:
            writer.write("\t")
            writer.write(name)
        writer.write("\n")

        # data
        for row_name in sorted(background):
            writer.write(row_name)
            for col_name in cnv_evt_ids:
                value = data.get((col_name, row_name), 0)
                writer.write("\t")
                writer.write(str(value))
            writer.write("\n")

        writer.close()

        log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        joined_evt_tunit_port.write(uid)

    em.close()
    es.close()

    mapping_repo.close_local(mapping_local_path)
    mapping_repo.close()
    data_repo.close()
    rs.close()