def GenomicRegions_Union(name, list_of_grs, summit_annotator=None, sheet_name="Overlaps"):
    """Combine several GenomicRegions into one.

    The chr/start/stop columns of all inputs are concatenated and handed to
    GenomicRegions with on_overlap="merge". Do not set on_overlap yourself.
    All inputs must share one genome (checked by verify_same_genome).
    """
    verify_same_genome(list_of_grs)

    def load():
        frames = []
        for gr in list_of_grs:
            frames.append(gr.df[["chr", "start", "stop"]])
        return pd.concat(frames, axis=0)

    deps = []
    if ppg.inside_ppg():
        deps.extend(gr.load() for gr in list_of_grs)
        # the set of input names is part of the job's identity
        input_names = sorted(gr.name for gr in list_of_grs)
        deps.append(ppg.ParameterInvariant(name + "_input_grs", input_names))
    union_vid = ("union", [gr.vid for gr in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="merge",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=union_vid,
    )
def __init__(self, ddf, groups_to_samples, name=None):
    """Set up a comparison container on top of a DelayedDataFrame.

    @ddf must be a DelayedDataFrame (ValueError otherwise).
    @groups_to_samples is passed through self._check_input_dict and stored.
    @name is optional; without it the name is built from the sorted group names.
    In both cases the final name is prefixed with "comparison__".

    Inside a pipegraph, group names are recorded on the global pipegraph and a
    ValueError is raised if a group name was already used by another instance.
    """
    if not isinstance(ddf, DelayedDataFrame):
        raise ValueError("Ddf must be a DelayedDataFrame")
    self.ddf = ddf
    self.groups_to_samples = self._check_input_dict(groups_to_samples)
    self.sample_column_to_group = self._sample_columns_to_group()
    # flatten the per-group sample lists, ordered by group name
    self.samples = functools.reduce(
        list.__add__, [x[1] for x in sorted(self.groups_to_samples.items())]
    )
    if name is None:
        self.name = "comparison__" + "_".join(sorted(self.groups_to_samples.keys()))
    else:
        self.name = "comparison__" + name
    self.result_dir = self.ddf.result_dir / self.name
    self.result_dir.mkdir(exist_ok=True, parents=True)
    if ppg.inside_ppg():
        ppg.assert_uniqueness_of_object(self)
        graph = ppg.util.global_pipegraph
        if not hasattr(graph, "_mbf_comparisons_name_dedup"):
            graph._mbf_comparisons_name_dedup = set()
        seen_groups = graph._mbf_comparisons_name_dedup
        # separate loop variable: do not clobber the `name` parameter
        for group_name in self.groups_to_samples:
            if group_name in seen_groups:
                raise ValueError(
                    f"Comparisons group {group_name} defined in multiple Comparisons - not supported"
                )
        # Previously the set was never filled, so the check above could never
        # trigger. Register only after all names passed the check, so a failed
        # construction leaves nothing behind.
        seen_groups.update(self.groups_to_samples)
    self.register_qc()
def plot(self):
    """Render the heatmap to self.output_filename.

    The input ddf is normalized, then ordered, then handed to the plot
    strategy. Inside a pipegraph a FileGeneratingJob is returned; outside,
    the plot is rendered immediately and the output filename is returned.
    """
    normed = self.normed_ddf(self.ddf)
    ordered = self.ordered_ddf(normed)
    names = self.handle_names()

    def plot():
        p = self.plot_strategy.plot(ordered.df, names, self.plot_options)
        self.plot_strategy.render(str(self.output_filename), p)

    if not ppg.inside_ppg():
        plot()
        return self.output_filename

    ppg.util.global_pipegraph.quiet = False
    deps = [
        ordered.load(),
        # re-plot when the strategy's plot/render implementation changes
        ppg.FunctionInvariant(
            "mbf_heatmap." + self.plot_strategy.name + "plot_func",
            self.plot_strategy.__class__.plot,
        ),
        ppg.FunctionInvariant(
            "mbf_heatmap" + self.plot_strategy.name + "render_func",
            self.plot_strategy.__class__.render,
        ),
        # ... or when names / plot options change
        ppg.ParameterInvariant(
            self.output_filename, freeze((self.names, self.plot_options))
        ),
    ]
    job = ppg.FileGeneratingJob(self.output_filename, plot)
    return job.depends_on(deps)
def __new__(cls, *args, **kwargs):
    """Return one shared instance per (class name, frozen constructor arguments).

    Inside a pipegraph the registry lives on the global pipegraph object,
    otherwise in the module level annotator_singletons. Every newly created
    instance is also appended to the registry's "lookup" list.
    """
    cn = cls.__name__
    if ppg.inside_ppg():
        graph = ppg.util.global_pipegraph
        if not hasattr(graph, "_annotator_singleton_dict"):
            graph._annotator_singleton_dict = {"lookup": []}
        registry = graph._annotator_singleton_dict
    else:
        registry = annotator_singletons
    if cn not in registry:
        registry[cn] = {}
    per_class = registry[cn]

    # positional arguments get synthetic names so they can share one dict
    # with the keyword arguments
    raw = {"arg_%i" % ii: value for ii, value in enumerate(args)}
    raw.update(kwargs)
    key = tuple(sorted((k, freeze(v)) for k, v in raw.items()))

    if key not in per_class:
        instance = object.__new__(cls)
        per_class[key] = instance
        registry["lookup"].append(instance)
    return per_class[key]
def __init__(self, name, loading_function, dependencies=None, result_dir=None):
    """Create the frame and pick the load strategy (direct vs. pipegraph).

    @name names the object and, without @result_dir, its output directory
    (results/<class name>/<name>).
    @loading_function is a callable, or a pandas DataFrame which gets wrapped
    in one.
    @dependencies are handed to the pipegraph load strategy; they are not
    used outside of a pipegraph. None means no dependencies.
    @result_dir overrides the default output directory; it is created if
    missing.
    """
    # assert_uniqueness_of_object is taken care of by the load_strategy
    if dependencies is None:
        # fresh list per instance - a `[]` default would be shared by all calls
        dependencies = []
    self.name = name
    if result_dir:
        self.result_dir = Path(result_dir)
    else:
        self.result_dir = Path("results") / self.__class__.__name__ / self.name
    self.result_dir.mkdir(parents=True, exist_ok=True)
    if isinstance(loading_function, pd.DataFrame):
        # bind the frame as a default argument so the lambda returns it
        # even though the name is rebound right here
        loading_function = lambda loading_function=loading_function: loading_function  # noqa: E731
    if not ppg.inside_ppg():
        self.load_strategy = Load_Direct(self, loading_function)
    else:
        self.load_strategy = Load_PPG(self, loading_function, dependencies)
    self.column_to_annotators = {}
    self.annotators = {}
    self.parent = None
    self.children = []
    # this prevents writing the same file with two different mangler functions
    # but still allows you to call write() in ppg settings multiple times
    # if different parts need to ensure it's being written out
    self.mangler_dict = {self.get_table_filename(): None}
    self.load()
def normed_ddf(self, input_ddf):
    """Return a DelayedDataFrame holding the normalized heatmap columns.

    The columns named by the second element of each entry in self.columns
    are taken from @input_ddf and passed through the normalization strategy.
    The result is named <input name>_heatmap_<strategy name> and shares
    @input_ddf's result_dir.
    """
    strategy = self.normalization_strategy
    output_name = input_ddf.name + "_heatmap_" + strategy.name

    def load():
        selected = input_ddf.df[[ac[1] for ac in self.columns]]
        return self.normalization_strategy.calc(
            selected, [ac[1] for ac in self.columns]
        )

    deps = []
    if ppg.inside_ppg():
        # annotator jobs first (entries without an annotator are skipped) ...
        for ac in self.columns:
            if ac[0] is not None:
                deps.append(self.ddf.add_annotator(ac[0]))
        # ... then the strategy's own dependencies and the input itself
        deps.append(strategy.deps())
        deps.append(input_ddf.load())
        deps.append(ppg.FunctionInvariant(output_name + "_calc", strategy.calc))
    return DelayedDataFrame(output_name, load, deps, input_ddf.result_dir)
def test_do_load_only_happens_once(self):
    """The loading function runs once outside ppg and not at all before the ppg run."""
    record = {
        "gene_stable_id": "fake1",
        "chr": "1",
        "strand": 1,
        "tss": 5000,
        "tes": 5500,
        "description": "bla",
    }
    df = pd.DataFrame([record])
    counter = [0]

    def load():
        counter[0] += 1
        return df

    g = genes.Genes(get_genome_chr_length(), load, name="shu")
    if not ppg.inside_ppg():
        # loaded once on construction, further load() calls are no-ops
        assert counter[0] == 1
        g.load()
        assert counter[0] == 1
        return
    # in ppg mode nothing is loaded while the graph is being defined
    assert counter[0] == 0
    g.load()
    assert counter[0] == 0
    g.load()
    assert counter[0] == 0
    ppg.run_pipegraph()
def test_random_same_number(self):
    """convert() keeps the row count, applies the function and passes on dependencies."""

    def sample_data():
        columns = {
            "chr": ["1", "2", "1"],
            "start": [10, 100, 1000],
            "stop": [12, 110, 1110],
            "column_that_will_disappear": ["A", "b", "c"],
        }
        return pd.DataFrame(columns)

    def convert(df):
        coords = df[["chr", "start", "stop"]]
        return coords.assign(start=coords["start"] + 1)

    # ("hello") is a plain string, not a tuple
    deps = [ppg.ParameterInvariant("shuParam", ("hello"))] if ppg.inside_ppg() else []
    a = regions.GenomicRegions("sharum", sample_data, [], get_genome_chr_length())
    a.add_annotator(Constant("Constant", 5))
    a.annotate()
    b = a.convert("a+1", convert, dependencies=deps)
    force_load(b.load())
    for dep in deps:
        assert dep in b.load().lfg.prerequisites
    run_pipegraph()
    assert len(a.df) == len(b.df)
    assert (a.df["start"] == b.df["start"] - 1).all()
    assert "column_that_will_disappear" in a.df.columns
    assert "column_that_will_disappear" not in b.df.columns
def test_multi_plus_filter(self, clear_annotators):
    """Exercise filter() on a comparison annotator with many filter spellings.

    Each entry of to_test is (filter spec, expected log2FC values) or
    (filter spec, expected exception type).
    """
    frame = pd.DataFrame(
        {
            "a1": [1 / 0.99, 2 / 0.99, 3 / 0.99],
            "a2": [1 * 0.99, 2 * 0.99, 3 * 0.99],
            "b1": [2 * 0.99, 8 * 0.99, (16 * 3) * 0.99],
            "b2": [2 / 0.99, 8 / 0.99, (16 * 3) / 0.99],
            "delta": [10, 20, 30],
        }
    )
    d = DelayedDataFrame("ex1", frame)
    c = Comparisons(d, {"a": ["a1", "a2"], "b": ["b1", "b2"]})
    a = c.a_vs_b("a", "b", Log2FC(), laplace_offset=0)
    anno1 = Constant("shu1", 5)
    # created only so that their columns can be looked up by name
    anno2 = Constant("shu2", 5)  # noqa: F841
    anno3 = Constant("shu3", 5)  # noqa: F841
    to_test = [
        (("log2FC", "==", -1.0), [-1.0]),
        (("log2FC", ">", -2.0), [-1.0]),
        (("log2FC", "<", -2.0), [-4.0]),
        (("log2FC", ">=", -2.0), [-1.0, -2.0]),
        (("log2FC", "<=", -2.0), [-2.0, -4.0]),
        (("log2FC", "|>", 2.0), [-4.0]),
        (("log2FC", "|<", 2.0), [-1.0]),
        (("log2FC", "|>=", 2.0), [-2.0, -4.0]),
        (("log2FC", "|<=", 2.0), [-1.0, -2.0]),
        ((a["log2FC"], "<", -2.0), [-4.0]),
        (("log2FC", "|", -2.0), ValueError),
        ([("log2FC", "|>=", 2.0), ("log2FC", "<=", 0)], [-2.0, -4.0]),
        ((anno1, ">=", 5), [-1, -2.0, -4.0]),
        (((anno1, 0), ">=", 5), [-1, -2.0, -4.0]),
        (("shu2", ">=", 5), [-1, -2.0, -4.0]),
        (("delta", ">", 10), [-2.0, -4.0]),
    ]
    if not ppg.inside_ppg():
        # can't test for missing columns in ppg.
        to_test.append((("log2FC_no_such_column", "<", -2.0), KeyError))

    error_types = (ValueError, KeyError)
    filtered = {}
    for ii, (spec, expected) in enumerate(to_test):
        new_name = "new%i" % ii
        if expected in error_types:
            with pytest.raises(expected):
                a.filter([spec], new_name)
            continue
        key = tuple(spec)
        # a single tuple is one condition, a list already is a list of conditions
        conditions = [spec] if isinstance(spec, tuple) else spec
        filtered[key] = a.filter(conditions, new_name)
        assert filtered[key].name == new_name
        force_load(filtered[key].annotate(), filtered[key].name)

    force_load(d.add_annotator(a), "somethingsomethingjob")
    run_pipegraph()
    c = a["log2FC"]
    assert (d.df[c] == [-1.0, -2.0, -4.0]).all()
    for spec, expected in to_test:
        if expected in error_types:
            continue
        try:
            assert filtered[tuple(spec)].df[c].values == approx(expected)
        except AssertionError:
            # show which filter spec failed before re-raising
            print(spec)
            raise
def get_convert_func(self, key, keep_name=False, filter_to_these_chromosomes=None):
    """Build a function lifting a chr/start/stop DataFrame over via <key>.over.chain.

    Note that filter_to_these_chromosomes is applied after the chromosome
    replacements have kicked in. Raises ValueError if the chain file does
    not exist. Inside a pipegraph the returned function carries a
    `dependencies` attribute (chain file + do_liftover invariants).
    """
    chain_file = self.data_path / (key + ".over.chain")
    if not chain_file.exists():  # pragma: no cover
        raise ValueError("invalid liftover key, file not found: %s" % chain_file)
    if filter_to_these_chromosomes:
        filter_to_these_chromosomes = set(filter_to_these_chromosomes)

    def do_convert(df):
        if df.index.duplicated().any():  # pragma: no cover
            raise ValueError("liftover only works with unique indices")
        # NOTE: this rewrites the index of the passed-in frame in place
        df.index = [str(x) for x in df.index]
        input_tuples = []
        for idx, row in df.iterrows():
            input_tuples.append(("chr" + row["chr"], row["start"], row["stop"], idx))
        output_tuples = self.do_liftover(input_tuples, chain_file)
        # transpose: one sequence per field instead of one tuple per region
        fields = list(zip(*output_tuples))
        parents = [x.decode("utf-8") for x in fields[3]]
        res = pd.DataFrame(
            {
                "chr": fields[0],
                "start": fields[1],
                "stop": fields[2],
                "parent": parents,
            }
        ).set_index("parent")

        def translate(chromosome):
            # strip the "chr" prefix that was added for the liftover
            chromosome = chromosome[3:]
            # these are untested as of 2019-03-27
            if chromosome == "m":  # pragma: no cover
                return "MT"
            if (
                key in self.replacements and chromosome in self.replacements[key]
            ):  # pragma: no cover
                return self.replacements[key][chromosome]
            return chromosome

        res["chr"] = [translate(x) for x in res["chr"]]
        # carry over every further input column
        for col in df.columns:
            if col not in res.columns:
                res = res.assign(**{col: df[col]})
        if filter_to_these_chromosomes:
            res = res[res["chr"].isin(filter_to_these_chromosomes)]
        return res

    if ppg.inside_ppg():
        do_convert.dependencies = [
            ppg.FileTimeInvariant(chain_file),
            ppg.FunctionInvariant(
                "genomics.regions.convert.LiftOver.do_liftover",
                LiftOver.do_liftover,
            ),
        ]
    return do_convert
def calc(self, df):
    """Return one float count per row of @df, looked up by gene_stable_id.

    Inside a pipegraph the previously stored self._data is used, otherwise
    the data is calculated on the spot. Ids missing from the lookup count
    as 0. The returned Series has a fresh default index, not @df's index.
    """
    data = self._data if ppg.inside_ppg() else self.calc_data()
    lookup = self.count_strategy.extract_lookup(data)
    counts = [lookup.get(gene_stable_id, 0) for gene_stable_id in df["gene_stable_id"]]
    # builtin float: the np.float alias was removed in NumPy 1.24
    return pd.Series(np.array(counts, dtype=float))
def calc(self, df):
    """Return one float count per row of @df, looked up by str(index value).

    Inside a pipegraph the previously stored self._data is used, otherwise
    the data is calculated on the spot. Index values missing from the lookup
    count as 0. The returned Series has a fresh default index, not @df's index.
    """
    data = self._data if ppg.inside_ppg() else self.calc_data()
    lookup = self.count_strategy.extract_lookup(data)
    counts = [lookup.get(str(idx), 0) for idx in df.index]
    # builtin float: the np.float alias was removed in NumPy 1.24
    return pd.Series(np.array(counts, dtype=float))
def force_load(job, prefix=None):
    """Make sure a data loading job has been loaded (if applicable).

    Inside a pipegraph a JobGeneratingJob named <prefix>_force_load that
    depends on @job is returned. For a ppg.Job the prefix is always the
    job's job_id; otherwise @prefix is used, or a running counter if it is
    None. Outside a pipegraph nothing happens and None is returned.
    """
    global fl_count
    if not ppg.inside_ppg():
        return None
    if isinstance(job, ppg.Job):
        prefix = job.job_id
    elif prefix is None:
        fl_count += 1
        prefix = "fl_%i" % fl_count
    forcing_job = ppg.JobGeneratingJob(prefix + "_force_load", lambda: None)
    return forcing_job.depends_on(job)
def GenomicRegions_FromTable(
    name,
    filename,
    genome,
    on_overlap="raise",
    summit_annotator=None,
    filter_func=None,
    vid=None,
    sheet_name="FromTable",
    drop_further_columns=True,
    chr_column="chr",
    start_column="start",
    stop_column="stop",
    one_based=False,
    reader=read_pandas,
):
    """Read a table file via @reader and build GenomicRegions from it.

    The chr/start/stop values are taken from @chr_column, @start_column and
    @stop_column. With @one_based, start is decremented by one. With
    @drop_further_columns only chr/start/stop are kept. @filter_func, if
    given, receives the resulting DataFrame and returns the one to use.
    """

    def load():
        df = reader(filename)
        df["chr"] = df[chr_column].astype(str)
        df["start"] = df[start_column].astype(int)
        if one_based:  # pragma: no cover
            df["start"] -= 1
        df["stop"] = df[stop_column].astype(int)
        if drop_further_columns:  # pragma: no cover
            df = df[["chr", "start", "stop"]]
        if filter_func:  # pragma: no cover
            df = filter_func(df)
        return df

    deps = []
    if ppg.inside_ppg():
        deps.append(ppg.FileTimeInvariant(filename))
        deps.append(ppg.FunctionInvariant(name + "_filter_func", filter_func))
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def __init__(self, species, revision, prebuild_manager):
    """Set up a genome identified by @species and @revision.

    @species must look like 'Homo_sapiens' (capitalized genus, underscore,
    lowercase species), otherwise ValueError is raised. @revision must be
    convertible to int and is stored as a string. The name is
    <species>_<revision>. The genome download is triggered right away.
    """
    super().__init__()
    self.prebuild_manager = prebuild_manager
    self.species = species
    species_ok = re.match(r"^[A-Z][a-z]+_[a-z]+$", species)
    if not species_ok:
        raise ValueError("Species must be capitalized like 'Homo_sapiens")
    # int() round trip rejects anything that is not an integer
    self.revision = str(int(revision))
    self.name = f"{self.species}_{self.revision}"
    if ppg.inside_ppg():
        ppg.util.assert_uniqueness_of_object(self)
    self.genetic_code = EukaryoticCode
    self.download_genome()
    self._seq_region_is_canonical = {}
    self._canonical_cache = {}
def test_find_annos_from_column(self, both_ppg_and_no_ppg_no_qc, clear_annotators):
    """find_annos_from_column returns all annotators offering a column, in creation order."""
    a = Constant("shu", 5)
    assert find_annos_from_column("shu") == [a]
    assert find_annos_from_column("shu")[0] is a
    with pytest.raises(KeyError):
        find_annos_from_column("nosuchcolumn")

    b = PolyConstant(["shu"], [10])
    assert find_annos_from_column("shu") == [a, b]

    if not ppg.inside_ppg():
        return
    # a new pipegraph starts without any registered annotators
    both_ppg_and_no_ppg_no_qc.new_pipegraph()
    with pytest.raises(KeyError):
        find_annos_from_column("shu")
def test_filtering_by_definition(self):
    """filter() accepts column names, annotators and column_lookup indirections."""
    a = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    c = XAnno("C", [1, 2])
    a += c
    d = XAnno("D", [4, 5])

    # native column
    a1 = a.filter("a1", ("A", "==", 1))
    # search for the anno
    a2 = a.filter("a2", ("C", "==", 2))
    # extract the column name from the anno - anno not already added
    a4 = a.filter("a4", (d, "==", 5))
    # extract the column name from the anno - anno already added
    a3 = a.filter("a3", (c, "==", 1))
    # lookup column to name
    a6 = a.filter("a6", ("X", "==", 2), column_lookup={"X": "C"})
    # lookup column to anno
    a7 = a.filter("a7", ("X", "==", 2), column_lookup={"X": c})

    if not ppg.inside_ppg():
        e1 = XAnno("E", [6, 7])
        e2 = XAnno("E", [6, 8])
        assert find_annos_from_column("E") == [e1, e2]
        # column name no longer unique
        with pytest.raises(KeyError):
            a.filter("a5", ("E", "==", 5))
        with pytest.raises(KeyError):
            a.filter("a5", ((c, "D"), "==", 5))

    for child in (a1, a2, a3, a4, a6, a7):
        force_load(child.annotate())
    run_pipegraph()

    assert (a1.df["A"] == [1]).all()
    assert (a2.df["A"] == [2]).all()
    assert (a3.df["A"] == [1]).all()
    assert (a4.df["A"] == [2]).all()
    assert (a6.df["A"] == [2]).all()
    assert (a7.df["A"] == [2]).all()
def __iadd__(self, other):
    """Add an annotator and return self.

    Anything that is not an Annotator yields NotImplemented. Inside a
    pipegraph, an annotator that is already present is not added again, and
    a different annotator with the same cache name raises ValueError.
    """
    if not isinstance(other, Annotator):
        return NotImplemented
    if not ppg.inside_ppg():
        self.load_strategy.add_annotator(other)
    elif not self.has_annotator(other):
        self.load_strategy.add_annotator(other)
    elif self.get_annotator(other.get_cache_name()) is not other:
        raise ValueError(
            "trying to add different annotators with identical cache_names\n%s\n%s"
            % (other, self.get_annotator(other.get_cache_name()))
        )
    return self
def GenomicRegions_FromBigBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a BigBed file.

    @chromosome_mangler translates genome chromosomes into the bigbed's
    chromosomes! The 'strand' column is dropped if every entry has strand 1.
    An empty result raises ValueError when loading.
    """
    from mbf_fileformats.bed import read_bigbed

    def load():
        chromosome_lengths = genome.get_chromosome_lengths()
        res = read_bigbed(filename, chromosome_lengths, chromosome_mangler)
        all_forward = (res["strand"] == 1).all()
        if all_forward:
            res = res.drop("strand", axis=1)
        if len(res) == 0:  # pragma: no cover
            raise ValueError(
                "Emtpty BigBed file (or wrong chromosome names)- %s" % filename
            )
        return res

    deps = []
    if ppg.inside_ppg():
        deps.append(ppg.FileTimeInvariant(filename))
        deps.append(ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler))
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def __exit__(self, *tp):
    """Check for the expected exception when the with-block ends.

    Inside a pipegraph the graph is run here and must fail with a
    ppg.RuntimeError whose first contained exception is of the expected
    type (and contains self.search_message, if set). Outside a pipegraph
    the exception must have been raised in the with-block itself; the
    return value then tells Python whether to suppress it.
    """
    from _pytest.outcomes import fail

    if not ppg.inside_ppg():
        __tracebackhide__ = True
        if tp[0] is None:
            fail(self.message)
        self.excinfo.__init__(tp)
        suppress_exception = issubclass(self.excinfo.type, self.expected_exception)
        if sys.version_info[0] == 2 and suppress_exception:
            sys.exc_clear()
        return suppress_exception

    with pytest.raises(ppg.RuntimeError) as e:
        run_pipegraph()
    assert isinstance(e.value.exceptions[0], self.expected_exception)
    if self.search_message:
        assert self.search_message in str(e.value.exceptions[0])
def find_annos_from_column(k):
    """Return all registered annotators that list @k in their columns.

    Inside a pipegraph the registry stored on the global pipegraph is
    searched, otherwise the module level annotator.annotator_singletons.
    Raises KeyError if no annotator offers the column.
    """
    from . import annotator
    import pypipegraph as ppg

    if ppg.inside_ppg():
        graph = ppg.util.global_pipegraph
        if not hasattr(graph, "_annotator_singleton_dict"):
            # The registry needs its "lookup" list from the start. A bare {}
            # made the lookup below fail with KeyError('lookup') and stayed
            # behind on the pipegraph without that key.
            graph._annotator_singleton_dict = {"lookup": []}
        singleton_dict = graph._annotator_singleton_dict
    else:
        singleton_dict = annotator.annotator_singletons

    res = [anno for anno in singleton_dict["lookup"] if k in anno.columns]
    if not res:
        raise KeyError("No anno for column '%s' found" % (k,))
    return res
def GenomicRegions_CommonInAtLeastX(name, list_of_grs, X, summit_annotator=None, sheet_name="Overlaps"):
    """Combine several GRs into one.

    Keep only those (union) regions that overlap regions of at least @X of
    the inputs. All inputs must share one genome (ValueError otherwise).
    Loading raises ValueError if no region is left.
    """

    def load():
        # NOTE(review): unlike GenomicRegions_Common this reset_index() has no
        # drop=True, so the old index ends up as a column - confirm intent.
        union = merge_df_intervals(
            pd.concat([x.df[["chr", "start", "stop"]] for x in list_of_grs])
        ).reset_index()
        # builtin bool: the np.bool alias was removed in NumPy 1.24
        keep = np.zeros((len(union),), dtype=bool)
        for ii, row in union.iterrows():
            count = sum(
                1
                for gr in list_of_grs
                if gr.has_overlapping(row["chr"], row["start"], row["stop"])
            )
            keep[ii] = count >= X
        if not keep.any():  # pragma: no cover
            raise ValueError("Filtered all of them")
        return union.iloc[keep]

    if len(set([x.genome for x in list_of_grs])) > 1:  # pragma: no cover
        raise ValueError("Can only merge GenomicRegions that have the same genome")
    if ppg.inside_ppg():
        deps = [x.load() for x in list_of_grs]
        deps.append(
            ppg.ParameterInvariant(
                name + "_input_grs", sorted([x.name for x in list_of_grs])
            )
        )
    else:
        deps = []
        # outside of ppg the inputs have to be loaded right now
        for x in list_of_grs:
            x.load()
    vid = ("common at least %i" % X, [x.vid for x in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="raise",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def GenomicRegions_FromWig(
    name,
    filename,
    genome,
    enlarge_5prime=0,
    enlarge_3prime=0,
    on_overlap="raise",
    comment_char=None,
    summit_annotator=None,
    vid=None,
):
    """Create GenomicRegions from a Wiggle file.

    @enlarge_5prime and @enlarge_3prime increase the size of the fragments
    described in the wig in the respective direction (for example if a
    chip-chip array did not cover every base): start is reduced by
    @enlarge_5prime, stop is increased by @enlarge_3prime.
    @comment_char defines which lines to ignore in the wiggle
    (see {mbf_fileformats.wiggle_to_intervals}).

    The resulting GenomicRegions has a column 'Score' that contains the
    wiggle score.
    """
    from mbf_fileformats.wiggle import wiggle_to_intervals

    def load():
        intervals = wiggle_to_intervals(filename, comment_char=comment_char)
        intervals["chr"] = [to_string(x) for x in intervals["chr"]]
        intervals["start"] -= enlarge_5prime
        intervals["stop"] += enlarge_3prime
        return intervals

    deps = [ppg.FileTimeInvariant(filename)] if ppg.inside_ppg() else []
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        vid=vid,
    )
def GenomicRegions_Common(name, list_of_grs, summit_annotator=None, sheet_name="Overlaps"):
    """Combine several GRs into one.

    Keep only those (union) regions that overlap regions of every input.
    All inputs must share one genome (checked by verify_same_genome).
    """

    def load():
        union = merge_df_intervals(
            pd.concat([x.df[["chr", "start", "stop"]] for x in list_of_grs])
        ).reset_index(drop=True)
        # builtin bool: the np.bool alias was removed in NumPy 1.24
        keep = np.ones((len(union),), dtype=bool)
        for gr in list_of_grs:
            for ii, row in union.iterrows():
                # no point in checking if we already falsified - short circuit...
                if keep[ii] and not gr.has_overlapping(
                    row["chr"], row["start"], row["stop"]
                ):
                    keep[ii] = False
        return union[keep]

    verify_same_genome(list_of_grs)
    if ppg.inside_ppg():
        deps = [x.load() for x in list_of_grs]
        deps.append(
            ppg.ParameterInvariant(
                name + "_input_grs", sorted([x.name for x in list_of_grs])
            )
        )
    else:
        # outside of ppg the inputs have to be loaded right now
        for x in list_of_grs:
            x.load()
        deps = []
    vid = ("common", [x.vid for x in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="raise",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def run_pipegraph():
    """Run the pipegraph if there is one; do nothing otherwise. Returns None."""
    if not ppg.inside_ppg():
        return
    ppg.run_pipegraph()
def test_both_fixtures(self, both_ppg_and_no_ppg_no_qc):
    """QC must be reported as disabled by the no_qc fixture, with or without ppg."""
    in_ppg = ppg.inside_ppg()
    # same expectation in both modes
    if in_ppg:
        assert qc_disabled()
    else:
        assert qc_disabled()
def GenomicRegions_FromBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    filter_invalid_chromosomes=False,
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a Bed file.

    Chromosome names are passed through @chromosome_mangler. With
    @filter_invalid_chromosomes, entries on chromosomes unknown to @genome
    are dropped. The result has 'score', 'strand' and 'name' columns;
    'score' is dropped if it is NaN throughout, 'name' is dropped if there
    is more than one entry and all names are identical.
    Loading an empty file raises ValueError.
    """
    from mbf_fileformats.bed import read_bed

    def load():
        valid_chromosomes = set(genome.get_chromosome_lengths())
        entries = read_bed(filename)
        # builtin object/float: the np.object / np.float aliases were
        # removed in NumPy 1.24
        data = pd.DataFrame(
            {
                "chr": np.array(
                    [chromosome_mangler(to_string(e.refseq)) for e in entries],
                    dtype=object,
                ),
                "start": np.array([e.position for e in entries], dtype=np.int32),
                "stop": np.array(
                    [e.position + e.length for e in entries], dtype=np.int32
                ),
                "score": np.array([e.score for e in entries], dtype=float),
                "strand": np.array([e.strand for e in entries], dtype=np.int8),
                "name": np.array([to_string(e.name) for e in entries], dtype=object),
            }
        )
        if filter_invalid_chromosomes:  # pragma: no cover
            keep = [x in valid_chromosomes for x in data["chr"]]
            data = data[keep]
        res = data
        if len(res) == 0:
            raise ValueError("Emtpty Bed file - %s" % filename)
        if (np.isnan(res["score"])).all():
            res = res.drop(["score"], axis=1)
        if (len(res["name"]) > 1) and (len(res["name"].unique()) == 1):
            res = res.drop(["name"], axis=1)
        return res

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
        ]
    else:
        deps = []
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def GenomicRegions_FromGFF(
    name,
    filename,
    genome,
    filter_function=None,
    comment_char=None,
    on_overlap="raise",
    chromosome_mangler=None,
    fix_negative_coordinates=False,
    alternative_class=None,
    summit_annotator=None,
    vid=None,
):
    """Create a GenomicRegions from a gff file.

    You can filter entries with @filter_function(gff_entry_dict) -> Bool,
    remove comment lines starting with a specific character with
    @comment_char, mangle the chromosomes with @chromosome_mangler(str) -> str,
    replace negative start coordinates with 0 (@fix_negative_coordinates),
    or provide an alternative constructor to call with @alternative_class.

    The 'name' column (first 'Name' attribute) is only kept if at least one
    entry has a non-empty name.
    """

    def load():
        from mbf_fileformats.gff import gffToDict

        chrs, starts, stops, scores, strands, names = [], [], [], [], [], []
        name_found = False
        for entry in gffToDict(filename, comment_char=comment_char):
            if filter_function and not filter_function(entry):
                continue
            seqname = entry["seqname"]
            if chromosome_mangler:
                seqname = chromosome_mangler(seqname)
            chrs.append(to_string(seqname))
            start = entry["start"]
            if fix_negative_coordinates and start < 0:
                start = 0
            starts.append(start)
            stops.append(entry["end"])
            scores.append(entry["score"])
            strands.append(entry["strand"])
            entry_name = entry["attributes"].get("Name", [""])[0]
            names.append(entry_name)
            if entry_name:
                name_found = True
        data = {
            "chr": chrs,
            "start": starts,
            "stop": stops,
            "score": scores,
            "strand": strands,
        }
        if name_found:
            data["name"] = names
        return pd.DataFrame(data)

    if alternative_class is None:  # pragma: no cover
        alternative_class = GenomicRegions
    deps = []
    if ppg.inside_ppg():
        deps.append(ppg.FileTimeInvariant(filename))
        deps.append(
            ppg.ParameterInvariant(
                name + "_params_GenomicRegions_FromGFF",
                (comment_char, fix_negative_coordinates),
            )
        )
        deps.append(
            ppg.FunctionInvariant(
                name + "_filter_func_GenomicRegions_FromGFF", filter_function
            )
        )
        deps.append(
            ppg.FunctionInvariant(
                name + "_chromosome_manlger_GenomicRegions_FromGFF",
                chromosome_mangler,
            )
        )
    return alternative_class(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        vid=vid,
    )
def prebuild(  # noqa: C901
    self,
    name,
    version,
    input_files,
    output_files,
    calculating_function,
    minimum_acceptable_version=None,
    maximum_acceptable_version=None,
    further_function_deps={},
):
    """Create a job that will prebuild the files if necessary.

    An existing build directory is reused if it has exactly @version, or,
    failing that, a version within
    [@minimum_acceptable_version, @maximum_acceptable_version) whose stored
    function hash still matches @calculating_function. Otherwise the output
    path is prebuilt_path / hostname / @name / @version.

    Inside a pipegraph a PrebuildJob is returned. Outside of one, all
    @output_files must already exist (ValueError otherwise) and a DummyJob
    offering just the lookup interface is returned.

    @minimum_acceptable_version defaults to @version.
    @output_files may be a single str/Path or a list of them.
    @further_function_deps is a dictionary name => func,
    and will end up as PrebuildFunctionInvariantFileStoredExploding
    in the correct directory.
    NOTE(review): @further_function_deps is not read anywhere in this
    function body - confirm whether it is still supported.
    """
    if minimum_acceptable_version is None:
        minimum_acceptable_version = version

    available_versions = self._find_versions(name)
    if version in available_versions:
        # exact version already built (or at least known) - use it
        output_path = available_versions[version]
    else:
        # these are within minimum..maximum_acceptable_version
        acceptable_versions = sort_versions([
            (v, p) for v, p in available_versions.items()
            if ((Version(v) >= minimum_acceptable_version) and (
                maximum_acceptable_version is None or
                (Version(v) < maximum_acceptable_version)))
        ])
        ok_versions = []
        (
            new_source,
            new_funchash,
            new_closure,
        ) = ppg.FunctionInvariant._hash_function(calculating_function)
        for v, p in acceptable_versions:
            func_md5sum_path = p / "mbf_func.md5sum"
            func_md5sum_path2 = p / "mbf_func.md5sum2"
            # prefer the json encoded .md5sum2 file, fall back to the
            # plain text .md5sum file if it can not be read
            try:
                func_md5sum = json.loads(func_md5sum_path2.read_text())
            except OSError:
                func_md5sum = func_md5sum_path.read_text()
            ok = False
            try:
                # a normal return leaves ok at False; only the
                # NothingChanged exception marks this version as usable
                new = ppg.FunctionInvariant._compare_new_and_old(
                    new_source, new_funchash, new_closure, func_md5sum)
                ok = False
            except ppg.NothingChanged:
                ok = True
            if ok:
                ok_versions.append((v, p))
        if ok_versions:
            # last entry in sort_versions order
            version, output_path = ok_versions[-1]
        else:
            # no version that is within the acceptable range and had the same build function
            output_path = self.prebuilt_path / self.hostname / name / version
    if isinstance(output_files, (str, Path)):
        output_files = [output_files]
    output_files = [Path(of) for of in output_files]
    if ppg.inside_ppg():
        job = PrebuildJob(output_files, calculating_function, output_path)
        job.depends_on(
            _PrebuildFileInvariantsExploding(output_path, input_files))
        job.version = version
        return job
    else:
        # without a pipegraph nothing can be built - the files must exist
        for of in output_files:
            if not (output_path / of).exists():
                raise ValueError(
                    "%s was missing and prebuild used outside of ppg - can't build it"
                    % (output_path / of).absolute())

        class DummyJob:
            """Just enough of the Jobs interface to ignore the various calls
            and allow finding the msgpack jobs.
            """

            def __init__(self, output_path, filenames):
                self.output_path = output_path
                self.filenames = PrebuildJob._normalize_output_files(
                    filenames, output_path)
                # self.job_id = ":".join(sorted(str(x) for x in filenames))

            def depends_on(self, _other_job):  # pragma: no cover
                """Ignore the dependency, return self."""
                return self

            def depends_on_func(self, _name, _func):  # pragma: no cover
                """Ignore the dependency, return self."""
                return self

            def depends_on_file(self, _filename):  # pragma: no cover
                """Ignore the dependency, return self."""
                return self

            def name_file(self, output_filename):
                """Adjust path of output_filename by job path"""
                return self.output_path / output_filename

            def find_file(self, output_filename):
                """Search for a file named output_filename in the job's known created files"""
                of = self.name_file(output_filename)
                for fn in self.filenames:
                    if of.resolve() == Path(fn).resolve():
                        return of
                else:
                    # loop finished without a match
                    raise KeyError("file not found: %s" % output_filename)

            def __iter__(self):
                """Iterating over the job yields just the job itself."""
                yield self

        return DummyJob(output_path, output_files)
def qc_disabled():
    """Tell whether QC is turned off.

    Outside of a pipegraph this is always True. Inside, it is True only if
    the global pipegraph's _qc_keep_function attribute is exactly False
    (a missing attribute counts as enabled).
    """
    if ppg.inside_ppg():
        keep_function = getattr(ppg.util.global_pipegraph, "_qc_keep_function", True)
        return keep_function is False
    return True