def get_convert_func(self, key, keep_name=False, filter_to_these_chromosomes=None):
    """Return a function that lifts a region DataFrame over with chain ``key``.

    The chain file is looked up as ``self.data_path / (key + ".over.chain")``.

    Note that filter_to_these_chromosomes is applied after the chromosome
    replacements have kicked in.

    ``keep_name`` is accepted but not used anywhere in this function.

    The returned ``do_convert(df)``:

    * requires a unique index and ``chr``/``start``/``stop`` columns,
    * rewrites ``df.index`` to strings on the frame that was passed in,
    * prefixes ``"chr"`` to the input chromosomes and strips the first three
      characters from the lifted ones,
    * returns a new frame indexed by the stringified input index (index name
      ``parent``) carrying the lifted coordinates plus every other column
      of ``df``.

    Raises ValueError if the chain file does not exist, and (from
    ``do_convert``) if the index contains duplicates.
    """
    chain_file = self.data_path / (key + ".over.chain")
    if not chain_file.exists():  # pragma: no cover
        raise ValueError("invalid liftover key, file not found: %s" % chain_file)
    if filter_to_these_chromosomes:
        filter_to_these_chromosomes = set(filter_to_these_chromosomes)

    def do_convert(df):
        if df.index.duplicated().any():  # pragma: no cover
            raise ValueError("liftover only works with unique indices")
        # The string index is what the lifted rows are re-aligned on below.
        df.index = [str(x) for x in df.index]

        input_tuples = []
        for idx, row in df.iterrows():
            input_tuples.append(("chr" + row["chr"], row["start"], row["stop"], idx))
        lifted = self.do_liftover(input_tuples, chain_file)
        by_field = list(zip(*lifted))
        # The ids are decoded as UTF-8, i.e. do_liftover hands them back as bytes.
        parents = [raw_id.decode("utf-8") for raw_id in by_field[3]]
        res = pd.DataFrame(
            {
                "chr": by_field[0],
                "start": by_field[1],
                "stop": by_field[2],
                "parent": parents,
            }
        )
        res = res.set_index("parent")

        def translate(lifted_chr):
            """Strip the 'chr' prefix and apply the per-key renames."""
            short = lifted_chr[3:]
            # these are untested as of 2019-03-27
            # NOTE(review): after stripping, UCSC's 'chrM' would be 'M', not
            # 'm' - confirm which spelling do_liftover actually returns.
            if short == "m":  # pragma: no cover
                return "MT"
            if key in self.replacements and short in self.replacements[key]:  # pragma: no cover
                return self.replacements[key][short]
            return short

        res["chr"] = [translate(x) for x in res["chr"]]

        # Carry over every input column that liftover did not produce itself;
        # assignment aligns on the (string) index.
        for col in df.columns:
            if col in res.columns:
                continue
            res = res.assign(**{col: df[col]})

        if filter_to_these_chromosomes:
            keep = res["chr"].isin(filter_to_these_chromosomes)
            res = res[keep]
        return res

    if ppg.inside_ppg():
        # Tie the converter to the chain file and to the liftover code.
        do_convert.dependencies = [
            ppg.FileTimeInvariant(chain_file),
            ppg.FunctionInvariant(
                "genomics.regions.convert.LiftOver.do_liftover",
                LiftOver.do_liftover,
            ),
        ]
    return do_convert
def GenomicRegions_FromTable(
    name,
    filename,
    genome,
    on_overlap="raise",
    summit_annotator=None,
    filter_func=None,
    vid=None,
    sheet_name="FromTable",
    drop_further_columns=True,
    chr_column="chr",
    start_column="start",
    stop_column="stop",
    one_based=False,
    reader=read_pandas,
):
    """Create GenomicRegions from a table file read via ``reader``.

    (The original documentation names csv/tsv/xls as the supported formats of
    the default reader.)

    The columns named by @chr_column, @start_column and @stop_column are
    copied into 'chr' (as str), 'start' and 'stop' (as int).
    With @one_based, 1 is subtracted from 'start'.
    With @drop_further_columns, only chr/start/stop are kept.
    @filter_func(df) -> df, if given, is applied last.
    """

    def load():
        table = reader(filename)
        table["chr"] = table[chr_column].astype(str)
        table["start"] = table[start_column].astype(int)
        if one_based:  # pragma: no cover
            table["start"] -= 1
        table["stop"] = table[stop_column].astype(int)
        if drop_further_columns:  # pragma: no cover
            table = table[["chr", "start", "stop"]]
        if filter_func:  # pragma: no cover
            table = filter_func(table)
        return table

    # Invariants only exist inside a pipegraph; outside there is nothing to track.
    deps = (
        [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_filter_func", filter_func),
        ]
        if ppg.inside_ppg()
        else []
    )
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def GenomicRegions_FromBigBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a BigBed file.

    @chromosome_mangler translates genome chromosomes into the bigbed's
    chromosomes!

    The 'strand' column is dropped when every region has strand 1.
    Loading raises ValueError when no regions were read.
    """
    from mbf_fileformats.bed import read_bigbed

    def load():
        regions = read_bigbed(
            filename, genome.get_chromosome_lengths(), chromosome_mangler
        )
        only_strand_one = (regions["strand"] == 1).all()
        if only_strand_one:
            regions = regions.drop("strand", axis=1)
        if len(regions) == 0:  # pragma: no cover
            raise ValueError(
                "Emtpty BigBed file (or wrong chromosome names)- %s" % filename
            )
        return regions

    deps = []
    if ppg.inside_ppg():
        # Reload when either the file or the mangler changes.
        deps.append(ppg.FileTimeInvariant(filename))
        deps.append(ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler))
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def GenomicRegions_FromWig(
    name,
    filename,
    genome,
    enlarge_5prime=0,
    enlarge_3prime=0,
    on_overlap="raise",
    comment_char=None,
    summit_annotator=None,
    vid=None,
):
    """Create GenomicRegions from a Wiggle file.

    @enlarge_5prime is subtracted from every start and @enlarge_3prime is
    added to every stop, growing the fragments described in the wig in the
    respective direction (for example if a chip-chip array did not cover
    every base).
    @comment_char defines which lines to ignore in the wiggle
    (see {mbf_fileformats.wiggle_to_intervals}).

    Whatever further columns wiggle_to_intervals returns are passed through
    untouched (the original documentation mentions a 'Score' column holding
    the wiggle score).
    """
    from mbf_fileformats.wiggle import wiggle_to_intervals

    def load():
        intervals = wiggle_to_intervals(filename, comment_char=comment_char)
        intervals["chr"] = list(map(to_string, intervals["chr"]))
        intervals["start"] -= enlarge_5prime
        intervals["stop"] += enlarge_3prime
        return intervals

    deps = [ppg.FileTimeInvariant(filename)] if ppg.inside_ppg() else []
    return GenomicRegions(
        name, load, deps, genome, on_overlap, summit_annotator=summit_annotator, vid=vid
    )
def test_accepts(self):
    """Build one job of every kind with pathlib.Path arguments and run the graph."""
    import pathlib

    for input_name in ("aaa", "bbb", "ccc"):
        write(input_name, "hello")

    # Invariants on the input files, given as Path and as mixed Path/str list.
    single_file_inv = ppg.FileTimeInvariant(pathlib.Path("aaa"))
    multi_file_inv = ppg.MultiFileInvariant([pathlib.Path("bbb"), "ccc"])
    job_b = ppg.FileGeneratingJob(
        pathlib.Path("b"),
        lambda of: write(of, "bb" + read("aaa") + read("bbb") + read("ccc")),
    )
    job_b.depends_on(single_file_inv)
    job_b.depends_on(multi_file_inv)

    holder = Dummy()

    def make_c_d_e():
        # Consumes the outputs of the temp jobs and the loaded attribute.
        write("c", "cc" + read("g"))
        write("d", "dd" + read("h") + holder.attr)
        write("e", "ee" + read("i") + read("j"))

    job_c = ppg.MultiFileGeneratingJob(
        [pathlib.Path("c"), "d", pathlib.Path("e")], make_c_d_e
    )
    job_c.depends_on(job_b)

    func_inv = ppg.FunctionInvariant(pathlib.Path("f"), lambda x: x + 1)
    job_c.depends_on(func_inv)

    param_inv = ppg.ParameterInvariant(pathlib.Path("c"), "hello")
    job_c.depends_on(param_inv)

    temp_g = ppg.TempFileGeneratingJob(pathlib.Path("g"), lambda: write("g", "gg"))
    job_c.depends_on(temp_g)

    def make_h_i():
        write("h", "hh")
        write("i", "ii")

    temp_h_i = ppg.MultiTempFileGeneratingJob([pathlib.Path("h"), "i"], make_h_i)
    job_c.depends_on(temp_h_i)

    def make_j_k():
        write("j", "jjjj")
        write("k", "kkkk")

    temp_plus = ppg.TempFilePlusGeneratingJob(
        pathlib.Path("j"), pathlib.Path("k"), make_j_k
    )
    job_c.depends_on(temp_plus)

    cached_load = ppg.CachedDataLoadingJob(
        pathlib.Path("l"), lambda: write("l", "llll"), lambda res: res
    )
    job_c.depends_on(cached_load)

    cached_attr = ppg.CachedAttributeLoadingJob(
        pathlib.Path("m"), holder, "attr", lambda: "55"
    )
    job_c.depends_on(cached_attr)

    ppg.run_pipegraph()

    assert read("aaa") == "hello"
    assert read("b") == "bbhellohellohello"
    assert read("c") == "ccgg"
    assert read("d") == "ddhh55"
    assert read("e") == "eeiijjjj"
    # The temp-job outputs must be gone after the run; 'k' must remain.
    for temp_output in ("g", "h", "i", "j"):
        assert not os.path.exists(temp_output)
    assert read("k") == "kkkk"
def GenomicRegions_FromGFF(
    name,
    filename,
    genome,
    filter_function=None,
    comment_char=None,
    on_overlap="raise",
    chromosome_mangler=None,
    fix_negative_coordinates=False,
    alternative_class=None,
    summit_annotator=None,
    vid=None,
):
    """Create a GenomicRegions from a gff file.

    Options:
    - @filter_function(gff_entry_dict) -> Bool: keep only entries for which
      it returns a true value,
    - @comment_char: remove comment lines starting with this character,
    - @chromosome_mangler(str) -> str: mangle the chromosomes,
    - @fix_negative_coordinates: replace negative start coordinates with 0,
    - @alternative_class: alternative constructor to call instead of
      GenomicRegions.

    The result has chr/start/stop/score/strand columns, and a 'name' column
    only if at least one kept entry carries a non-empty 'Name' attribute.
    """

    def load():
        from mbf_fileformats.gff import gffToDict

        entries = gffToDict(filename, comment_char=comment_char)
        columns = {
            "chr": [],
            "start": [],
            "stop": [],
            "score": [],
            "strand": [],
            "name": [],
        }
        saw_a_name = False
        for entry in entries:
            if filter_function and not filter_function(entry):
                continue
            if chromosome_mangler:
                chromosome = chromosome_mangler(entry["seqname"])
            else:
                chromosome = entry["seqname"]
            columns["chr"].append(to_string(chromosome))

            begin = entry["start"]
            if fix_negative_coordinates and begin < 0:
                begin = 0
            columns["start"].append(begin)
            columns["stop"].append(entry["end"])
            columns["score"].append(entry["score"])
            columns["strand"].append(entry["strand"])

            # First 'Name' attribute value, or "" when the entry has none.
            entry_name = entry["attributes"].get("Name", [""])[0]
            columns["name"].append(entry_name)
            if entry_name:
                saw_a_name = True
        if not saw_a_name:
            # An all-empty name column carries no information.
            del columns["name"]
        return pd.DataFrame(columns)

    if alternative_class is None:  # pragma: no cover
        alternative_class = GenomicRegions

    deps = []
    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.ParameterInvariant(
                name + "_params_GenomicRegions_FromGFF",
                (comment_char, fix_negative_coordinates),
            ),
            ppg.FunctionInvariant(
                name + "_filter_func_GenomicRegions_FromGFF", filter_function
            ),
            ppg.FunctionInvariant(
                name + "_chromosome_manlger_GenomicRegions_FromGFF",
                chromosome_mangler,
            ),
        ]
    return alternative_class(
        name, load, deps, genome, on_overlap, summit_annotator=summit_annotator, vid=vid
    )
def GenomicRegions_FromBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    filter_invalid_chromosomes=False,
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a Bed file.

    Every bed entry becomes one region with
    chr = @chromosome_mangler(refseq), start = position,
    stop = position + length, plus its score, strand and name.

    With @filter_invalid_chromosomes, regions whose (mangled) chromosome is
    not among genome.get_chromosome_lengths() are discarded.

    The 'score' column is dropped when every score is NaN. The 'name' column
    is dropped when there is more than one region and all share one name.

    Loading raises ValueError when no regions remain.
    """
    from mbf_fileformats.bed import read_bed

    def load():
        valid_chromosomes = set(genome.get_chromosome_lengths())
        # Materialise once: the entries are walked once per column below.
        entries = list(read_bed(filename))
        # Builtin object/float replace np.object/np.float, which were
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        data = pd.DataFrame(
            {
                "chr": np.array(
                    [chromosome_mangler(to_string(e.refseq)) for e in entries],
                    dtype=object,
                ),
                "start": np.array([e.position for e in entries], dtype=np.int32),
                "stop": np.array(
                    [e.position + e.length for e in entries], dtype=np.int32
                ),
                "score": np.array([e.score for e in entries], dtype=float),
                "strand": np.array([e.strand for e in entries], dtype=np.int8),
                "name": np.array([to_string(e.name) for e in entries], dtype=object),
            }
        )
        if filter_invalid_chromosomes:  # pragma: no cover
            keep = [x in valid_chromosomes for x in data["chr"]]
            data = data[keep]
        res = data
        if len(res) == 0:
            raise ValueError("Emtpty Bed file - %s" % filename)
        if (np.isnan(res["score"])).all():
            res = res.drop(["score"], axis=1)
        if (len(res["name"]) > 1) and (len(res["name"].unique()) == 1):
            res = res.drop(["name"], axis=1)
        return res

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
        ]
    else:
        deps = []
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )