def merge_df_intervals(df, iv_func=lambda iv: iv.merge_hull()):
    """Merge overlapping intervals of a DataFrame {chr, start, stop, *}.

    The rows are turned into tuples by ``_df_to_tup`` and processed group by
    group (grouping on the first tuple element). For every group an
    ``IntervalSet`` is built and passed through *iv_func*; each resulting
    interval becomes one output row. The remaining columns (*) are taken
    from the row that ``to_tuples_last_id()`` reports for that interval,
    with ``start``/``stop`` overwritten by the merged coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``chr``, ``start`` and ``stop``. If there is no
        ``strand`` column, a temporary one (all 1) is added for the merge
        and dropped again from the result.
    iv_func : callable
        Receives an ``IntervalSet`` and returns the transformed set.
        Defaults to ``merge_hull``.

    Returns
    -------
    pandas.DataFrame
        The merged rows, sorted by ``chr`` and ``start``. An input without
        any rows yields an empty frame with the same columns.
    """
    strand_added = "strand" not in df.columns
    if strand_added:
        df = df.assign(strand=1)
    # presumably (group_key, start, stop, row_position) tuples, ordered by
    # group_key so that groupby sees each key once -- verify in _df_to_tup
    joined = _df_to_tup(df)

    out = []
    for _, sub_group in itertools.groupby(joined, lambda tup: tup[0]):
        args = [x[1:] for x in sub_group]
        iv = IntervalSet.from_tuples_with_id(args)
        new_order = iv_func(iv).to_tuples_last_id()
        # x[2] is the positional row index handed in via from_tuples_with_id
        new_df = df.iloc[[x[2] for x in new_order]].copy()
        new_df.loc[:, "start"] = [x[0] for x in new_order]
        new_df.loc[:, "stop"] = [x[1] for x in new_order]
        out.append(new_df)
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # slice of df so columns and dtypes are preserved.
    res = pd.concat(out) if out else df.iloc[:0].copy()
    if strand_added:
        res = res.drop("strand", axis=1)
    return res.sort_values(["chr", "start"])
示例#2
0
    def do_load():
        """Build a DataFrame (start, stop, chr) of the inverted intervals of ``gr.df``.

        Per chromosome present in ``gr.df`` the intervals are inverted within
        ``0 .. chromosome length``; chromosomes of the genome that do not
        occur in ``gr.df`` contribute one interval spanning their full
        length. The result carries a fresh 0..n-1 index.
        """
        from mbf_nested_intervals import IntervalSet
        import itertools

        frame = gr.df
        rows = [
            ((chrom), begin, end, row_no)
            for row_no, (chrom, begin, end) in enumerate(
                zip(frame["chr"], frame["start"], frame["stop"])
            )
        ]
        # groupby below needs equal chromosomes to be adjacent
        rows.sort(key=lambda row: row[0])

        pieces = []
        chr_lengths = gr.genome.get_chromosome_lengths()
        covered = set()
        for chrom, members in itertools.groupby(rows, lambda row: row[0]):
            interval_set = IntervalSet.from_tuples_with_id(
                [member[1:] for member in members]
            )
            gaps = interval_set.invert(0, chr_lengths[chrom]).to_numpy()
            piece = pd.DataFrame({"start": gaps[0], "stop": gaps[1], "chr": chrom})
            pieces.append(piece)
            covered.add(chrom)

        # chromosomes without any interval are reported as one full-length region
        for chrom in chr_lengths.keys() - covered:
            piece = pd.DataFrame(
                {"start": [0], "stop": [chr_lengths[chrom]], "chr": chrom}
            )
            pieces.append(piece)

        combined = pd.concat(pieces)
        return combined.reset_index(drop=True)
示例#3
0
 def exons_protein_coding_merged(self):
     """Return the merged protein coding exon regions of this gene.

     The intervals in ``self._exons_protein_coding`` are merged with
     ``merge_hull``. The result is a tuple of np arrays, (starts, stops);
     it is empty for non protein coding genes.
     """
     coding_exons = IntervalSet.from_tuples(self._exons_protein_coding)
     merged = coding_exons.merge_hull()
     return merged.to_numpy()
示例#4
0
 def test_invert(self):
     """Inverting (5, 10) within the bounds 0 and 15 leaves the two flanks."""
     original = IntervalSet.from_tuples([(5, 10)])
     inverted = original.invert(0, 15)
     expected = [(0, 5), (10, 15)]
     assert inverted.to_tuples() == expected
示例#5
0
 def introns(self):
     """Return [(start, stop), ...] for all introns of the transcript.

     The tuples are in genomic order. An intron is everything between the
     gene's start and stop that is not an exon, so if the gene extends
     beyond its exons for whatever reason, that region is reported as well.
     """
     lower, upper = self.gene.start, self.gene.stop
     exon_set = IntervalSet.from_tuples(sorted(self.exons_tuples))
     non_exonic = exon_set.invert(lower, upper)
     return non_exonic.to_tuples()
示例#6
0
    def test_merge_hull(self):
        """Three overlapping intervals merge into one hull that keeps all ids."""
        tagged = [(1, 10, 100), (7, 15, 200), (0, 5, 333)]
        merged = IntervalSet.from_tuples_with_id(tagged).merge_hull()
        expected = [(0, 15, [100, 200, 333])]
        assert merged.to_tuples_with_id() == expected
示例#7
0
    def test_from_tuples(self):
        """from_tuples returns the intervals in the order asserted below."""
        raw = [(1, 10), (1, 15), (0, 5)]
        # expected: ascending start; for the shared start 1 the longer one first
        expected = [(0, 5), (1, 15), (1, 10)]
        interval_set = IntervalSet.from_tuples(raw)
        assert interval_set.to_tuples() == expected
示例#8
0
    def test_from_tuples_with_id2(self):
        """Each interval comes back with its own id wrapped in a list."""
        tagged = [(1, 10, 100), (1, 15, 200), (0, 5, 333)]
        # expected: ascending start; for the shared start 1 the longer one first
        expected = [(0, 5, [333]), (1, 15, [200]), (1, 10, [100])]
        interval_set = IntervalSet.from_tuples_with_id(tagged)
        assert interval_set.to_tuples_with_id() == expected
示例#9
0
 def introns_strict(self):
     """Get truly intronic regions, i.e. not covered by any exon of this gene.

     The exons of all transcripts are pooled and inverted within the gene's
     start..stop. The result is a tuple of np arrays, (starts, stops).
     By definition these introns are disjoint.

     Raises
     ------
     ValueError
         If a transcript's ``exons`` cannot be iterated (e.g. it is None).
         The original TypeError is attached as ``__cause__``.
     """
     gene_start = self.start
     gene_stop = self.stop
     exons = []
     for tr in self.transcripts:
         try:
             exons.extend(tr.exons)
         except TypeError as err:  # pragma: no cover
             # chain explicitly so the root cause stays visible in tracebacks
             raise ValueError(
                 f"No exons defined for {tr.transcript_stable_id}"
             ) from err
     return IntervalSet.from_tuples(exons).invert(gene_start, gene_stop).to_numpy()
示例#10
0
 def introns_all(self):
     """Get intronic regions, i.e. an intron in any of the transcripts.

     For every transcript its exons are inverted within the gene's
     start..stop and the resulting starts/stops are appended to two lists.
     Returns the tuple (starts, stops) of those lists. It may contain
     repetitions and overlaps and is not sorted!

     Raises
     ------
     ValueError
         If building the intervals for a transcript raises TypeError
         (e.g. its ``exons`` is None). The TypeError is attached as
         ``__cause__``.
     """
     gene_start = self.start
     gene_stop = self.stop
     introns = [], []
     for tr in self.transcripts:
         try:
             starts, stops = (
                 IntervalSet.from_tuples(tr.exons)
                 .invert(gene_start, gene_stop)
                 .to_numpy()
             )
         except TypeError as err:  # pragma: no cover
             # chain explicitly so the root cause stays visible in tracebacks
             raise ValueError(
                 f"No exons defined for {tr.transcript_stable_id}"
             ) from err
         introns[0].extend(starts)
         introns[1].extend(stops)
     return introns
示例#11
0
def merge_df_intervals_with_callback(df, callback):
    """Merge overlapping intervals of a {chr, start, stop, *} DataFrame via a callback.

    The rows are turned into tuples by ``_df_to_tup`` and grouped on the
    first tuple element. Within each group the intervals are merged with
    ``merge_hull``. For every merged interval (including those made of a
    single row) *callback* is called with a copy of the contributing rows,
    whose ``start``/``stop`` are already set to the merged coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``chr``, ``start`` and ``stop``. If there is no
        ``strand`` column, a temporary one (all 1) is added; it is visible
        to the callback and dropped again from the result.
    callback : callable
        Receives the sub-DataFrame and must return a dict whose keys are
        exactly the columns of that sub-DataFrame. ``start`` and ``stop``
        in the dict are overwritten with the merged coordinates.

    Returns
    -------
    pandas.DataFrame
        One row per merged interval, sorted by ``chr`` and ``start``. An
        input without any rows yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If the callback returns something other than a dict, or a dict
        with the wrong set of keys.
    """
    strand_added = "strand" not in df
    if strand_added:
        df = df.assign(strand=1)
    # presumably (group_key, start, stop, row_position) tuples, ordered by
    # group_key so that groupby sees each key once -- verify in _df_to_tup
    joined = _df_to_tup(df)
    expected_columns = set(df.columns)
    result = []
    for _, sub_group in itertools.groupby(joined, lambda tup: tup[0]):
        args = [x[1:] for x in sub_group]
        iv = IntervalSet.from_tuples_with_id(args)
        for start, stop, row_ids in iv.merge_hull().to_tuples_with_id():
            sub_df = df.iloc[list(row_ids)].copy()
            # .loc is the slice-capable setter; .at only addresses scalars
            sub_df.loc[:, "start"] = start
            sub_df.loc[:, "stop"] = stop
            row_data = callback(sub_df)
            if not isinstance(row_data, dict):
                print("type", type(row_data))
                print(callback)
                raise ValueError(
                    "Merge_function returned something other than dict (writing to the pandas series directly is very slow, call to_dict() on it, then modify it.)"
                )
            if set(row_data) != expected_columns:
                raise ValueError(
                    "Merge_function return wrong columns. Expected %s, was %s"
                    % (df.columns, list(row_data.keys()))
                )
            row_data["start"] = start
            row_data["stop"] = stop

            result.append(row_data)
    if result:
        res = pd.DataFrame(result).sort_values(["chr", "start"])
    else:
        # a frame built from an empty list has no columns to sort by;
        # fall back to an empty slice of df to keep columns and dtypes
        res = df.iloc[:0].copy()
    if strand_added:
        res = res.drop("strand", axis=1)
    return res
示例#12
0
 def exons_merged(self):
     """Get the merged exon regions for a gene given by gene_stable_id
     result is a a tuple of np arrays, (starts, stops)
     """
     return IntervalSet.from_tuples(self._exons).merge_hull().to_numpy()
示例#13
0
            def _get_interval_tuples_by_chr(self, genome):
                """Classify the merged exon/intron regions of all genes, per chromosome.

                Every exon and strict intron of every gene is registered with a
                4-bit id: 0b0100 = exon, 0b1000 = intron, 0b0001 = gene strand
                is 1, 0b0010 = gene strand is anything else. The regions of a
                chromosome are merged with ``merge_hull`` and each merged
                region is tagged from the ids it collected:

                * one distinct id: "exon" or "intron" with that id's strand
                * several ids: "both" if exon and intron bits are present,
                  otherwise "exon" (or "intron"); "_undecidable" is appended
                  when both strand bits are present, in which case strand is
                  reported as 1.

                Returns
                -------
                dict
                    chromosome -> list of (tag, strand, [start], [stop]).
                    Every chromosome of ``genome.get_chromosome_lengths()`` has
                    an entry, possibly an empty list.

                Raises
                ------
                NotImplementedError
                    If the merged set still reports overlaps or a single id is
                    not one of the four known values.
                """
                from mbf_nested_intervals import IntervalSet

                # meaning of a region that carries exactly one distinct id
                single_id_meaning = {
                    0b0101: ("exon", +1),
                    0b0110: ("exon", -1),
                    0b1001: ("intron", +1),
                    0b1010: ("intron", -1),
                }

                coll = {chrom: [] for chrom in genome.get_chromosome_lengths()}
                for g in genome.genes.values():
                    exons = g.exons_overlapping
                    if len(exons[0]) == 0:  # pragma: no cover
                        exons = g.exons_merged
                    exon_id = 0b0101 if g.strand == 1 else 0b0110
                    for start, stop in zip(*exons):
                        coll[g.chr].append((start, stop, exon_id))
                    intron_id = 0b1001 if g.strand == 1 else 0b1010
                    for start, stop in zip(*g.introns_strict):
                        coll[g.chr].append((start, stop, intron_id))

                result = {}
                for chrom, tups in coll.items():
                    iset = IntervalSet.from_tuples_with_id(tups).merge_hull()
                    if iset.any_overlapping():
                        raise NotImplementedError("Should not be reached")
                    result[chrom] = []
                    for start, stop, ids in iset.to_tuples_with_id():
                        ids = set(ids)
                        if len(ids) == 1:
                            (only_id,) = ids
                            meaning = single_id_meaning.get(only_id)
                            if meaning is None:  # pragma: no cover
                                raise NotImplementedError("Should not be reached")
                            tag, strand = meaning
                        else:
                            # OR all ids together: which kinds / strands took part
                            down = 0
                            for one_id in ids:
                                down |= one_id
                            if down & 0b1100 == 0b1100:
                                tag = "both"
                            elif down & 0b0100 == 0b0100:
                                tag = "exon"
                            else:  # pragma: no cover  haven't observed this case in the wild yet.
                                tag = "intron"  # pragma: no cover
                            if down & 0b11 == 0b11:
                                tag += "_undecidable"
                                # doesn't matter, but must be one or the other
                                strand = 1
                            elif down & 0b01:
                                strand = 1
                            else:
                                # only the reverse-strand bit is set; assign it
                                # rather than decrementing a stale value from a
                                # previous region
                                strand = -1

                        result[chrom].append((tag, strand, [start], [stop]))
                return result