def write(self, name, values):
    """Write one data row: the row name, then each value tab-prefixed, newline-terminated."""
    to_str = self._value_to_str_func(self.dtype)
    # call the base-class write directly (this method shadows it)
    put = FileWriter.write
    put(self, name)
    for v in values:
        put(self, "\t")
        put(self, to_str(v))
    put(self, "\n")
def write_header(self, header = None):
    """Write a tab-separated header line from a MatrixHeader or a list/tuple of names.

    Raises Exception for any other type (including the default None).
    """
    if isinstance(header, MatrixHeader):
        columns = header.columns
    elif isinstance(header, (list, tuple)):
        columns = header
    else:
        raise Exception("Unsupported headers type: {}".format(str(type(header))))
    FileWriter.write(self, "\t".join(columns))
    FileWriter.write(self, "\n")
def merge(log, input, output, gitools_output):
    """Merge repeated rows by the lowest pvalue; when pvalues tie, keep the one with greater n.

    Reads a tab-separated file whose rows hold an upreg half and a downreg half,
    accumulates them per row name via merge_data, then writes the merged matrix
    to *output* and a flattened (column, row, values) form to *gitools_output*.
    Returns (upreg_count, downreg_count): the number of merged (repeated) rows.
    """
    reader = FileReader(input)
    hdr = reader.readline().rstrip().split("\t")

    mid_index = 8  # fields[1:8] -> upreg columns, fields[8:] -> downreg columns
    upreg, downreg = {}, {}
    upreg_count = downreg_count = 0

    for raw in reader:
        raw = raw.rstrip()
        if not raw:
            continue
        fields = raw.split("\t")
        row_name = fields[0]
        upreg_count += merge_data(row_name, fields[1:mid_index], upreg)
        downreg_count += merge_data(row_name, fields[mid_index:], downreg)
    reader.close()

    upreg_keys = upreg.keys()
    downreg_keys = downreg.keys()
    log.debug("Total rows: upreg = {}, downreg = {}".format(len(upreg_keys), len(downreg_keys)))
    log.debug("Merged rows: upreg = {}, downreg = {}".format(upreg_count, downreg_count))

    ofile = FileWriter(output)
    ofile.write("\t".join(hdr))
    ofile.write("\n")

    gfile = FileWriter(gitools_output)
    # gitools header: strip the "upreg_" prefix to recover the bare attribute names
    gfile.write("column\trow\t")
    gfile.write("\t".join([x[6:] for x in hdr if x.startswith("upreg_")]))
    gfile.write("\n")

    # NOTE(review): assumes upreg and downreg end up with the same key set
    # (every input row feeds both dicts) — confirm merge_data always inserts.
    for row_name in upreg_keys:
        up_join = "\t".join(upreg[row_name])
        down_join = "\t".join(downreg[row_name])

        ofile.write("\t".join((row_name, up_join, down_join)))
        ofile.write("\n")

        gfile.write("upreg\t" + row_name + "\t" + up_join + "\n")
        gfile.write("downreg\t" + row_name + "\t" + down_join + "\n")

    ofile.close()
    gfile.close()
    return (upreg_count, downreg_count)
def __init__(self, obj, dtype=float):
    """Initialize the underlying FileWriter over *obj*.

    *dtype* selects the value-to-string conversion used later by write()
    (via self._value_to_str_func).
    """
    FileWriter.__init__(self, obj)
    self.dtype = dtype
def write_data_map(dmap, path):
    """Write *dmap* as a tab-separated matrix to *path*.

    The header is "id" followed by gain_<field> and loss_<field> columns
    derived from FIELDS. Rows holding only the gain half (len(values) ==
    len(FIELDS)) are padded with "-" for every loss column.
    """
    rf = FileWriter(path)
    hdr = ["id"]
    hdr.extend(["_".join(("gain", f.replace("-", "_").lower())) for f in FIELDS])
    hdr.extend(["_".join(("loss", f.replace("-", "_").lower())) for f in FIELDS])
    rf.write("\t".join(hdr) + "\n")
    for row, values in dmap.iteritems():
        rf.write(row)
        for v in values:
            rf.write("\t")
            rf.write(v)
        if len(values) == len(FIELDS):
            # BUG FIX: a tab separator was missing here, which glued the
            # first "-" pad onto the last written value in the same column.
            rf.write("\t")
            rf.write("\t".join(["-"] * len(FIELDS)))
        rf.write("\n")
    rf.close()
def combination(log, conf, rs, c, data_repo, results_path, conditions):
    """Combine several flattened result files with gitools-combination.

    Joins the inputs listed in c["files"] into one TDM matrix, writes a
    GMT columns file grouping columns by condition, runs the external
    gitools-combination binary, and flattens its results into the data
    repository at *results_path*. Raises Exception if the binary exits
    non-zero. The temporary working directory is always removed.
    """
    cid = c["id"]
    ids = c["source/ids"]
    files = c["files"]
    results_url = data_repo.url(results_path)
    try:
        # prepare temporary path and files
        tmp_path = mkdtemp(prefix = "cnv_combination_")
        data_file = os.path.join(tmp_path, "data.tdm")
        columns_file = os.path.join(tmp_path, "columns.gmt")
        tmp_file = os.path.join(tmp_path, "tmp.tdm")
        log.debug("Temporary directory: {}".format(tmp_path))

        # join files to combine in a single TDM file
        # NOTE(review): the .format(files[0]) here has no {} placeholder, so it is a no-op
        log.info("Joining files ...".format(files[0]))
        outpf = FileWriter(data_file)
        log.debug("\t{} ...".format(files[0]))
        # the first file establishes the reference header for appends below
        repo, path = rs.from_url(files[0])
        local_path = repo.get_local(path)
        ref_hdr = tdm.unflatten(local_path, outpf, row_column = "id", column_and_attr_func = lambda name: unflatten_filtered_names(name, ids[0]))
        #outpf.flush()
        #ref_hdr = tdm.read_header_names(data_file)
        repo.close_local(path)
        # remaining files are unflattened to a temp file and appended
        # column-aligned against ref_hdr
        for i in xrange(1, len(files)):
            log.debug("\t{} ...".format(files[i]))
            repo, path = rs.from_url(files[i])
            local_path = repo.get_local(path)
            hdr = tdm.unflatten(local_path, tmp_file, row_column = "id", column_and_attr_func = lambda name: unflatten_filtered_names(name, ids[i]))
            tdm.append(outpf, tmp_file, ref_hdr)
            repo.close_local(path)
        outpf.close()

        # prepare conditions columns file in GMT format
        # each line: <condition> \t\t <sid>_<condition> for every source id
        outpf = FileWriter(columns_file)
        for cond in conditions:
            outpf.write(cond)
            outpf.write("\t\t")
            outpf.write("\t".join(["_".join((sid, cond)) for sid in ids]))
            outpf.write("\n")
        outpf.close()

        # run gitools-combination with data.tdm
        log.info("Running gitools combination ...")
        log.debug("\tData: {}".format(data_file))
        log.debug("\tColumns: {}".format(columns_file))
        gitools_combination_bin = os.path.join(conf["bin_paths.gitools"], "bin", "gitools-combination")
        cmd = " ".join([ gitools_combination_bin, "-N", cid, "-w", tmp_path, "-d", data_file, "-c", columns_file, "-pn", P_VALUE_FIELD, "-sn n", "-p 1", "-debug"])
        log.debug(cmd)
        # NOTE(review): shell=True with a joined string — paths containing
        # spaces or shell metacharacters would break; confirm inputs are safe
        retcode = subprocess.call(args = cmd, shell = True)
        sys.stdout.write("\n")
        sys.stdout.flush()
        if retcode != 0:
            raise Exception("Combination exit code = {}".format(retcode))

        # flatten results
        log.info("Flattening results into {} ...".format(results_url))
        try:
            results_local_path = data_repo.create_local(results_path)
            tdm.flatten(os.path.join(tmp_path, cid + "-results.tdm.gz"), results_local_path, None, ["N", "z-score", "p-value"])
            data_repo.put_local(results_local_path)
        except:
            # NOTE(review): bare except that releases the local path but
            # swallows the error without logging or re-raising — confirm
            # callers really want flatten failures to pass silently
            data_repo.close_local(results_local_path)
    finally:
        # always remove the temporary working directory
        shutil.rmtree(tmp_path)
def enrichment(log, conf, rs, data_repo, results_path, data_file, e, ec, filtered_columns, filtered_columns_new_names):
    """Run a gitools enrichment analysis for one oncodrive result matrix.

    Filters the pvalue columns out of *data_file*, optionally applies a
    population background via an external script, runs the external
    gitools-enrichment binary, and flattens its results into the data
    repository at *results_path*. Records data/modules/results URLs in the
    mutable dict *e*. Returns True on success, False when the input matrix
    is empty or any step fails (failures other than EmptyResults are logged).
    """
    eid = e["id"]
    key = (e["study_id"], e["platform_id"], e["icdo_topography"], e["icdo_morphology"])

    # determine the modules file
    mod_repo, mod_path = rs.from_url(ec["modules_file"])
    mod_local_path = mod_repo.get_local(mod_path)

    # oncodrive data file
    matrix_repo, matrix_path = rs.from_url(data_file)
    matrix_local_path = matrix_repo.get_local(matrix_path)

    # record provenance on the entity dict (mutated in place)
    e["data_file"] = data_file
    e["modules_file"] = ec["modules_file"]

    results_local_path = None
    tmp_path = mkdtemp(prefix = "enrichment_")
    log.debug("Temporary directory: {}".format(tmp_path))

    valid = True
    try:
        log.info("Filtering pvalue columns from {} ...".format(data_file))

        # filter columns for pvalues
        data_local_path = os.path.join(tmp_path, "data.tsv")
        rf = of = None
        try:
            rf = FileReader(matrix_local_path)
            of = FileWriter(data_local_path)
            row_count = tsv.filter_columns(rf, of, filtered_columns, filtered_columns_new_names)
        finally:
            if rf is not None:
                rf.close()
            if of is not None:
                of.close()

        if row_count == 0:
            log.warn("Oncodrive results are empty: {}".format(matrix_path))
            raise EmptyResults

        # apply background if necessary
        if "population.file" in ec:
            pop_url = ec["population.file"]
            pop_missing_value = ec.get("population.missing_value", "-")
            log.info("Applying background from {} with missing value {} ...".format(pop_url, pop_missing_value))
            data2_local_path = os.path.join(tmp_path, "data-filtered.tsv")
            pop_repo, pop_path = rs.from_url(pop_url)
            pop_local_path = pop_repo.get_local(pop_path)
            cmd = " ".join([ conf["bin_paths.python"], conf["bin_paths.matrix_background"], "--verbose --missing-value", pop_missing_value, "-o", data2_local_path, data_local_path, pop_local_path ])
            log.debug(cmd)
            retcode = subprocess.call(args = cmd, shell = True)
            if retcode != 0:
                raise Exception("Applying population background for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))
            pop_repo.close_local(pop_local_path)
            # from here on the background-filtered file is the data input
            data_local_path = data2_local_path

        # enrichment results
        e["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)

        log.info("Running enrichment ...")
        log.debug("\tData file: {}".format(data_local_path))
        log.debug("\tModules file: {}".format(ec["modules_file"]))

        gitools_enrichment_bin = os.path.join(conf["bin_paths.gitools"], "bin", "gitools-enrichment")
        sb = [ gitools_enrichment_bin, "-N", eid, "-w", tmp_path, "-p 1", "-mf tcm", "-m", mod_local_path, "-df cdm", "-d", data_local_path, "-t", ec["test"] ]
        if "filter" in ec:
            sb += ["-b", ec["filter"]]
        if ec.get("only_mapped_items", False, dtype=bool):
            sb += ["-only-mapped-items"]
        #if "population" in ec:
        #	pop_repo, pop_path = rs.from_url(ec["population"])
        #	pop_local_path = pop_repo.get_local(pop_path)
        #	sb += ["-P", pop_local_path]
        cmd = " ".join(sb)
        log.debug(cmd)
        # NOTE(review): shell=True with a joined string — paths with spaces
        # or shell metacharacters would break; confirm inputs are safe
        retcode = subprocess.call(args = cmd, shell = True)
        sys.stdout.write("\n")
        sys.stdout.flush()
        if retcode != 0:
            raise Exception("Enrichment for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

        # flatten results
        log.info("Flattening results into {} ...".format(e["results_file"]))
        # NOTE(review): rf/of are reused from the filtering step above and not
        # reset to None — if FileReader() raises here, the finally closes the
        # already-closed earlier handles; harmless only if close() is idempotent
        try:
            gitools_results = os.path.join(tmp_path, eid + "-results.tdm.gz")
            rf = FileReader(gitools_results)
            of = FileWriter(results_local_path)
            tdm.flatten(rf, of, { "column" : str, "row" : str, "N" : int, "observed" : int, "expected-mean" : float, "expected-stdev" : float, "probability" : float, "right-p-value" : float, "corrected-right-p-value" : float }, ["N", "observed", "expected-mean", "expected-stdev", "probability", "right-p-value", "corrected-right-p-value"])
        finally:
            if rf is not None:
                rf.close()
            if of is not None:
                of.close()

        # close local paths
        data_repo.put_local(results_local_path)
    except EmptyResults:
        valid = False
    except Exception as ex:
        log.exception(ex)
        if results_local_path is not None:
            data_repo.close_local(results_local_path)
        valid = False
    finally:
        shutil.rmtree(tmp_path)
        mod_repo.close_local(mod_local_path)
        # NOTE(review): matrix_local_path came from matrix_repo.get_local but
        # is released through data_repo — confirm this is not meant to be
        # matrix_repo.close_local(matrix_local_path)
        data_repo.close_local(matrix_local_path)
        #if "population" in ec:
        #	pop_repo.close_local(pop_local_path)
    return valid