Exemplo n.º 1
0
def interval_tree(start_data, stop_data, buffer_len):
    """Build consensus intervals from per-key start/stop coordinates.

    Each input interval is shrunk by ``buffer_len / 2.0`` on both sides and
    stored in an ``IntervalTree``.  Two end segments reaching out to the
    extremes returned by ``get_extremes`` are added without a buffer, the
    overlapping intervals are merged, and any original interval that still
    touches two consensus intervals has its smaller overlap chopped out of
    the tree.

    Parameters
    ----------
    start_data : dict
        Maps a key to a sequence of interval start positions.
    stop_data : dict
        Maps the same keys to the matching sequence of stop positions;
        1 is added to each stop before it is used as an interval end.
    buffer_len : number
        Total buffer removed from each interval (half from each side).

    Returns
    -------
    tuple of (list, list)
        Start points of the consensus intervals in sorted order, and the
        matching stop points.

    Notes
    -----
    Prints the length of every chopped overlap and, at the end, the
    percentage of the chromosome covered by the consensus intervals.
    """
    starts = []
    stops = []
    t = IntervalTree()
    half_buffer = buffer_len / 2.0

    ## Shrink each interval by the buffer size
    # items() exists on both Python 2 and Python 3 dicts, whereas
    # iteritems() was removed in Python 3.
    for key, value in start_data.items():
        for i, raw_start in enumerate(value):
            shrunk_start = raw_start + half_buffer
            shrunk_stop = stop_data[key][i] + 1 - half_buffer
            # Intervals that collapse after shrinking are dropped.
            if shrunk_start < shrunk_stop:
                t[shrunk_start:shrunk_stop] = (shrunk_start, shrunk_stop)

    ## Add chromosome endpoints without buffer
    chrom_start, chrom_stop = get_extremes(start_data, stop_data)
    left_edge = t.begin() + 1
    if chrom_start < left_edge:
        t[chrom_start:left_edge] = (chrom_start, left_edge)
    # Read the tree end only after the left segment has been inserted.
    right_edge = t.end() - 1
    if right_edge < chrom_stop:
        t[right_edge:chrom_stop] = (right_edge, chrom_stop)

    ## Merge intervals that overlap in tree to get consensus
    t.merge_overlaps()

    ## Check that original intervals only overlap with one consensus interval
    for key, value in start_data.items():
        for i, start in enumerate(value):
            stop = stop_data[key][i] + 1
            # Query once; the tree is not modified until chop() below.
            hits = t[start:stop]
            if len(hits) > 1:
                ## If they overlap with more than one
                ## Remove part of consensus interval
                ## This will never be more than the buffer size/2
                assert (len(hits) == 2)
                remove_start = 0
                remove_stop = 0
                min_length = float('inf')
                for interval in hits:
                    overlap_start, overlap_stop = get_overlap(
                        (start, stop), (interval[0], interval[1]))
                    overlap_length = overlap_stop - overlap_start
                    if overlap_length < min_length:
                        min_length = overlap_length
                        remove_start = overlap_start
                        remove_stop = overlap_stop
                # Existing diagnostic output, kept for callers that read it.
                print(min_length)
                t.chop(remove_start, remove_stop)
                assert (min_length <= half_buffer)
                assert (len(t[start:stop]) < 2)

    ## Get consensus start and stop points
    chrom_len = chrom_stop - chrom_start
    covered = 0.0
    for interval in sorted(t):
        starts.append(interval[0])
        stops.append(interval[1])
        covered = covered + (interval[1] - interval[0])

    print("The percentage of the chromosome covered is: %s" % '{0:.2f}'.format(
        (covered / chrom_len) * 100.0))

    return (starts, stops)
Exemplo n.º 2
0
class Sequencer:
    """Holds notes in an interval tree keyed by their [start, start + length) span."""

    # Key function giving the end position of a note.
    sortkey = lambda n: n.start + n.length

    def __init__(self):
        self.notes = IntervalTree()

    def add(self, note):
        """Store ``note`` under the span it occupies."""
        note_end = note.start + note.length
        self.notes.addi(note.start, note_end, note)

    def remove(self, note):
        """Remove the entry that ``add`` created for ``note``."""
        note_end = note.start + note.length
        self.notes.removei(note.start, note_end, note)

    def length(self):
        """Return the end of the note tree."""
        return self.notes.end()

    def sample_at(self, t):
        """Return the mixed sample value of all notes active at ``t``.

        Each active note contributes oscillator * velocity * envelope,
        scaled by one over the number of active notes.
        """
        # again, bad
        active = self.notes.at(t)

        acc = 0
        for entry in active:
            offset = t - entry.begin
            tone = osc.sine(offset, entry.data.pitch)
            weighted = tone * entry.data.velocity
            envelope = adsr(offset, entry.end - entry.begin)
            acc += (weighted * envelope) * (1 / len(active))

        return acc
Exemplo n.º 3
0
def test_empty_queries():
    """Every query on a freshly created tree yields an empty result."""
    tree = IntervalTree()
    nothing = set()

    # Size and emptiness
    assert len(tree) == 0
    assert tree.is_empty()

    # Point and range lookups
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # Bounds of an empty tree are both 0
    assert tree.begin() == 0
    assert tree.end() == 0
    assert tree[tree.begin():tree.end()] == nothing

    # Whole-tree views
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    tree.verify()
Exemplo n.º 4
0
def test_list_init():
    """A tree built from a list keeps both intervals and reports their bounds."""
    t = IntervalTree([Interval(-10, 10), Interval(-20.0, -10.0)])
    t.verify()

    expected = {Interval(-10, 10), Interval(-20.0, -10.0)}
    assert t
    assert len(t) == 2
    assert t.items() == expected

    # Bounds span the lowest begin and the highest end
    assert t.begin() == -20
    assert t.end() == 10
Exemplo n.º 5
0
def test_list_init():
    """Initialising from a list of intervals stores each of them."""
    source = [Interval(-10, 10), Interval(-20.0, -10.0)]
    built = IntervalTree(source)
    built.verify()

    assert built
    assert len(built) == 2
    # Compare against freshly constructed, equal intervals
    assert built.items() == {Interval(-10, 10), Interval(-20.0, -10.0)}
    assert built.begin() == -20
    assert built.end() == 10
Exemplo n.º 6
0
def test_empty_queries():
    """An empty tree returns empty results and a null range."""
    tree = IntervalTree()
    nothing = set()

    # Size and emptiness
    assert len(tree) == 0
    assert tree.is_empty()

    # Point and range lookups
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # Bounds of an empty tree are both 0
    assert tree.begin() == 0
    assert tree.end() == 0
    assert tree[tree.begin():tree.end()] == nothing

    # Whole-tree views
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    # The covering range of nothing is a null interval of length 0
    assert tree.range().is_null()
    assert tree.range().length() == 0

    tree.verify()
Exemplo n.º 7
0
def test_empty_queries():
    """An empty tree answers lookups, overlap and envelop queries with nothing."""
    tree = IntervalTree()
    nothing = set()

    # Size and emptiness
    assert len(tree) == 0
    assert tree.is_empty()

    # Point and range lookups
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # Bounds of an empty tree are both 0
    assert tree.begin() == 0
    assert tree.end() == 0

    # Queries spanning the (empty) bounds
    assert tree[tree.begin():tree.end()] == nothing
    assert tree.overlap(tree.begin(), tree.end()) == nothing
    assert tree.envelop(tree.begin(), tree.end()) == nothing

    # Whole-tree views
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    # The covering range of nothing is a null interval of length 0
    assert tree.range().is_null()
    assert tree.range().length() == 0

    tree.verify()
Exemplo n.º 8
0
class BratEntity(BratAnnotation):
    """
    A text-bound brat annotation describing one entity mention.

    Every entity annotation carries a unique ID, a type (e.g. Person or Organization) and the span of characters
    that holds the mention, given as a "start end" offset pair. For example,

    ::

        T1  Organization 0 4  Sony
        T3  Organization 33 41  Ericsson
        T4  Country 75 81 Sweden

    Each line holds one text-bound annotation identifying an entity mention in the text.

    The standoff representation is "`ID [tab] TYPE START END [tab] TEXT`", where START and END are positive integer
    offsets delimiting the annotation span and `TEXT` is the text it covers. Discontinuous annotations are written
    as "`ID [tab] TYPE START END[;START END]* [tab] TEXT`", i.e. several START END pairs separated by semicolons.
    """
    def __init__(self):
        super().__init__()
        self.text = None  # type: Optional[str]
        # One interval per (possibly discontinuous) span of the mention
        self.locations = IntervalTree()  # type: IntervalTree

    def shift(self, offset: int):
        """Return a new entity whose spans are all moved by ``offset``; ``self`` is left untouched."""
        moved = BratEntity()
        moved.id, moved.type, moved.text = self.id, self.type, self.text
        for span in self.locations:
            moved.add_span(span.begin + offset, span.end + offset, span.data)
        return moved

    def add_span(self, start: int, end: int, data = None):
        """Record the span ``start``-``end`` with optional ``data`` attached."""
        self.locations[start: end] = data

    @property
    def total_span(self) -> Tuple[int, int]:
        """The pair (begin, end) reported by the location tree."""
        first = self.locations.begin()
        last = self.locations.end()
        return first, last

    def __eq__(self, other):
        # Anything that is not a BratEntity never compares equal
        if not isinstance(other, BratEntity):
            return False
        return (self.id == other.id
                and self.type == other.type
                and self.text == other.text
                and self.locations == other.locations)

    def __str__(self):
        fields = (self.id, self.type, self.text, self.locations)
        return 'BratEntity[id=%s,type=%s,text=%s,loc=%s]' % fields
Exemplo n.º 9
0
def test_generator_init():
    """A tree can be built from a generator of intervals."""
    bounds = [(-10, 10), (-20, -10), (10, 20)]
    t = IntervalTree(Interval(lo, hi) for lo, hi in bounds)
    t.verify()

    assert t
    assert len(t) == 3

    expected = {Interval(-20, -10), Interval(-10, 10), Interval(10, 20)}
    assert t.items() == expected

    # Bounds span the lowest begin and the highest end
    assert t.begin() == -20
    assert t.end() == 20
Exemplo n.º 10
0
def test_generator_init():
    """Initialising from a generator expression stores every yielded interval."""
    pairs = [(-10, 10), (-20, -10), (10, 20)]
    built = IntervalTree(Interval(start, stop) for start, stop in pairs)
    built.verify()

    assert built
    assert len(built) == 3
    assert built.items() == {
        Interval(-20, -10),
        Interval(-10, 10),
        Interval(10, 20),
    }
    assert built.begin() == -20
    assert built.end() == 20
Exemplo n.º 11
0
class IntervalGraph(object):
    """Base class for undirected interval graphs.

    The IntervalGraph class allows any hashable object as a node
    and can associate key/value attribute pairs with each undirected edge.

    Each edge must have two integers, begin and end for its interval.

    Self-loops are allowed but multiple edges
    (two or more edges with the same nodes, begin and end interval) are not.

    Two nodes can have more than one edge with different overlapping or non-overlapping intervals.

    Parameters
    ----------
    attr : keyword arguments, optional (default= no attributes)
        Attributes to add to graph as key=value pairs.

    Examples
    --------
    Create an empty graph structure (a "null interval graph") with no nodes and
    no edges.

    >>> G = dnx.IntervalGraph()

    G can be grown in several ways.

    **Nodes:**

    Add one node at a time:

    >>> G.add_node(1)

    Add the nodes from any container (a list, dict, set or
    even the lines from a file or the nodes from another graph).

    >>> G.add_nodes_from([2, 3])
    >>> G.add_nodes_from(range(100, 110))

    **Edges:**

    G can also be grown by adding edges. This can be considered
    the primary way to grow G, since nodes with no edge will not
    appear in G in most cases. See ``G.to_snapshot()``.

    Add one edge, which starts at 0 and ends at 10.
    Keep in mind that the interval is [0, 10).
    Thus, it does not include the end.

    >>> G.add_edge(1, 2, 0, 10)

    a list of edges,

    >>> G.add_edges_from([(1, 2, 0, 10), (1, 3, 3, 11)])

    If some edges connect nodes not yet in the graph, the nodes
    are added automatically. There are no errors when adding
    nodes or edges that already exist.

    **Attributes:**

    Each interval graph, node, and edge can hold key/value attribute pairs
    in an associated attribute dictionary (the keys must be hashable).
    By default these are empty, but can be added or changed using
    add_edge, add_node.

    Keep in mind that the edge interval is not an attribute of the edge.

    >>> G = dnx.IntervalGraph(day="Friday")
    >>> G.graph
    {'day': 'Friday'}

    Add node attributes using add_node(), add_nodes_from()

    >>> G.add_node(1, time='5pm')
    >>> G.add_nodes_from([3], time='2pm')

    Add edge attributes using add_edge(), add_edges_from().

    >>> G.add_edge(1, 2, 0, 10, weight=4.7 )
    >>> G.add_edges_from([(3, 4, 3, 11), (4, 5, 0, 33)], color='red')

    **Shortcuts:**

    Here are a couple examples of available shortcuts:

    >>> 1 in G  # check if node in interval graph during any interval
    True
    >>> len(G)  # number of nodes in the entire interval graph
    5

    **Subclasses (Advanced):**
    Edges in interval graphs are represented by Interval Objects and are kept
    in an IntervalTree. Both are based on
    intervaltree available in pypi (https://pypi.org/project/intervaltree).
    IntervalTree allows for fast interval based search through edges,
    which makes interval graph analyses possible.

    The Graph class uses a dict-of-dict-of-dict data structure.
    The outer dict (node_dict) holds adjacency information keyed by node.
    The next dict (adjlist_dict) represents the adjacency information and holds
    edge data keyed by interval object.  The inner dict (edge_attr_dict) represents
    the edge data and holds edge attribute values keyed by attribute names.
    """
    def __init__(self, **attr):
        """Create an empty interval graph, optionally with graph attributes.

        Parameters
        ----------
        attr : keyword arguments, optional (default= no attributes)
            Key=value pairs stored as attributes of the graph itself.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G = dnx.IntervalGraph(name='my graph')
        >>> G.graph
        {'name': 'my graph'}
        """
        # Node bookkeeping: attributes per node and adjacency per node
        self._node = {}
        self._adj = {}

        # Graph-level attributes, seeded from the keyword arguments
        self.graph = dict(attr)

        # Interval edges, searchable by interval
        self.tree = IntervalTree()

    @property
    def name(self):
        """String identifier of the interval graph.

        This interval graph attribute appears in the attribute dict IG.graph
        keyed by the string `"name"`. as well as an attribute (technically
        a property) `IG.name`. This is entirely user controlled.
        """
        return self.graph.get('name', '')

    @name.setter
    def name(self, s):
        self.graph['name'] = s

    def __str__(self):
        """Return the interval graph name.

        Returns
        -------
        name : string
            The name of the interval graph.

        Examples
        --------
        >>> G = dnx.IntervalGraph(name='foo')
        >>> str(G)
        'foo'
        """
        return self.name

    def __len__(self):
        """Return the number of nodes. Use: 'len(G)'.

        Returns
        -------
        nnodes : int
            The number of nodes in the graph.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_nodes_from([2, 4, 5])
        >>> len(G)
        3

        """
        return len(self._node)

    def __contains__(self, n):
        """Return True if n is a node, False otherwise. Use: 'n in G'.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_node(2)
        >>> 2 in G
        True
        """
        try:
            return n in self._node
        except TypeError:
            return False

    def interval(self):
        """Return a 2-tuple as (begin, end) interval of the entire
         interval graph.

         Note that end is non-inclusive.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 0, 10), (3, 7, 9, 16)])
        >>> G.interval()
        (0, 16)
        """
        return self.tree.begin(), self.tree.end()

    def add_node(self, node_for_adding, **attr):
        """Add a single node `node_for_adding`  and update node attributes.

        Parameters
        ----------
        node_for_adding : node
            A node can be any hashable Python object except None.
        attr : keyword arguments, optional
            Set or change node attributes using key=value.

        See Also
        --------
        add_nodes_from

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_node(1)
        >>> G.add_node('Hello')
        >>> G.number_of_nodes()
        2

        Use keywords set/change node attributes:

        >>> G.add_node(1, size=10)
        >>> G.add_node(3, weight=0.4, UTM=('13S', 382871, 3972649))

        Notes
        -----
        A hashable object is one that can be used as a key in a Python
        dictionary. This includes strings, numbers, tuples of strings
        and numbers, etc.

        On many platforms hashable items also include mutables such as
        NetworkX Graphs, though one should be careful that the hash
        doesn't change on mutables.
        """
        if node_for_adding not in self._node:
            self._adj[node_for_adding] = {}
            self._node[node_for_adding] = attr
        else:  # update attr even if node already exists
            self._node[node_for_adding].update(attr)

    def add_nodes_from(self, nodes_for_adding, **attr):
        """Add multiple nodes.

        Parameters
        ----------
        nodes_for_adding : iterable container
            A container of nodes (list, dict, set, etc.).
            OR
            A container of (node, attribute dict) tuples.
            Node attributes are updated using the attribute dict.
        attr : keyword arguments, optional (default= no attributes)
            Update attributes for all nodes in nodes.
            Node attributes specified in nodes as a tuple take
            precedence over attributes specified via keyword arguments.

        See Also
        --------
        add_node

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_nodes_from('Hello')
        >>> G.has_node('e')
        True

        Use keywords to update specific node attributes for every node.

        >>> G.add_nodes_from([1, 2], size=10)
        >>> G.add_nodes_from([3, 4], weight=0.4)

        Use (node, attrdict) tuples to update attributes for specific nodes.

        >>> G.add_nodes_from([(1, dict(size=11)), (2, {'color':'blue'})])
        """
        for n in nodes_for_adding:
            # keep all this inside try/except because
            # CPython throws TypeError on n not in self._node,
            # while pre-2.7.5 ironpython throws on self._adj[n]
            try:
                if n not in self._node:
                    self._adj[n] = {}
                    self._node[n] = attr.copy()
                else:
                    self._node[n].update(attr)
            except TypeError:
                nn, ndict = n
                if nn not in self._node:
                    self._adj[nn] = {}
                    self._node[nn] = attr.copy()
                    self._node[nn].update(ndict)
                else:
                    self._node[nn].update(attr)
                    self._node[nn].update(ndict)

    def number_of_nodes(self, begin=None, end=None):
        """Return the number of nodes in the interval graph between the given interval.

        Parameters
        ----------
        begin: integer, optional  (default= beginning of the entire interval graph)
            Inclusive beginning time of the node appearing in the interval graph.
        end: integer, optional  (default= end of the entire interval graph + 1)
            Non-inclusive ending time of the node appearing in the interval graph.
            Must be bigger than begin.
            Note that the default value is shifted up by 1 to make it an inclusive end.

        Returns
        -------
        nnodes : int
            The number of nodes in the interval graph.

        See Also
        --------
        __len__

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 0, 5), (3, 4, 8, 11)])
        >>> len(G)
        4
        >>> G.number_of_nodes()
        4
        >>> G.number_of_nodes(begin=6)
        2
        >>> G.number_of_nodes(begin=5, end=8) # end in non-inclusive
        2
        >>> G.number_of_nodes(end=8)
        4
        """

        if begin is None and end is None:
            return len(self._node)

        if begin is None:
            begin = self.tree.begin()

        if end is None:
            end = self.tree.end() + 1

        iedges = self.tree[begin:end]

        inodes = set()

        for iv in iedges:
            inodes.add(iv.data[0])
            inodes.add(iv.data[1])

        return len(inodes)

    def has_node(self, n, begin=None, end=None):
        """Return True if the interval graph contains the node n, during the given interval.

        Identical to `n in G` when 'begin' and 'end' are not defined.

        Parameters
        ----------
        n : node
        begin: integer, optional  (default= beginning of the entire interval graph)
            Inclusive beginning time of the node appearing in the interval graph.
        end: integer, optional  (default= end of the entire interval graph + 1)
            Non-inclusive ending time of the node appearing in the interval graph.
            Must be bigger than begin.
            Note that the default value is shifted up by 1 to make it an inclusive end.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_ndoe(1)
        >>> G.has_node(1)
        True

        It is more readable and simpler to use

        >>> 0 in G
        True

        With interval query:

        >>> G.add_edge(3, 4, 2, 5)
        >>> G.has_node(3)
        True
        >>> G.has_node(3, begin=2)
        True
        >>> G.has_node(3, end=2) # end is non-inclusive
        False
        """
        try:
            exists_node = n in self._node
        except TypeError:
            exists_node = False

        if (begin is None and end is None) or not exists_node:
            return exists_node

        if begin is None:
            begin = self.tree.begin()

        if end is None:
            end = self.tree.end() + 1

        iedges = self._adj[n].keys()

        for iv in iedges:
            if iv.overlaps(begin=begin, end=end):
                return True

        return False

    def nodes(self, begin=None, end=None, data=False, default=None):
        """A NodeDataView of the IntervalGraph nodes.

        A node is considered to be present during an interval if it has
        an edge with an overlapping interval.

        Parameters
        ----------
        begin: integer, optional  (default= beginning of the entire interval graph)
            Inclusive beginning time of the node appearing in the interval graph.
        end: integer, optional  (default= end of the entire interval graph + 1)
            Non-inclusive ending time of the node appearing in the interval graph.
            Must be bigger than begin.
            Note that the default value is shifted up by 1 to make it an inclusive end.
        data : string or bool, optional (default=False)
            The node attribute returned in 2-tuple (n, dict[data]).
            If False, return just the nodes n.
        default : value, optional (default=None)
            Value used for nodes that don't have the requested attribute.
            Only relevant if data is not True or False.

        Returns
        -------
        NodeDataView
            A NodeDataView iterates over `(n, data)` and has no set operations.

            When called, if data is False, an iterator over nodes.
            Otherwise an iterator of 2-tuples (node, attribute value)
            where data is True.

        Examples
        --------
        To list all nodes in the graph:

        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)])
        >>> G.nodes()
        [1, 2, 4, 6]

        To get the node data along with the nodes:

        >>> G.add_nodes_from([(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {'day': 'Friday'})])
        >>> G.nodes(data=True)
        [(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {'day': 'Friday'})]

        >>> G.nodes(data="time")
        [(1, '1pm'), (2, '2pm'), (4, '4pm'), (6, None)]
        >>> G.nodes(data="time", default="5pm")
        [(1, '1pm'), (2, '2pm'), (4, '4pm'), (6, '5pm')]

        To get nodes which appear in a specific interval. Nodes
        without an edge are not considered present.

        >>> G.nodes(begin=11, data=True)
        [(2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {'day': 'Friday'})]
        >>> G.nodes(begin=4, end=12) # non-inclusive end
        [1, 2, 4]
        """
        # No interval given: view over every node of the graph
        if begin is None and end is None:
            return NodeDataView(self._node, data=data, default=default)

        # Fill in whichever bound was left open
        lower = self.tree.begin() if begin is None else begin
        upper = self.tree.end() + 1 if end is None else end

        # Both end points of every edge found in the interval
        present = set()
        for iedge in self.tree[lower:upper]:
            present.update((iedge.data[0], iedge.data[1]))

        # Restrict the attribute mapping to the nodes found above
        selected = {}
        for node in present:
            selected[node] = self._node[node]

        return NodeDataView(selected, data=data, default=default)

    def remove_node(self, n, begin=None, end=None):
        """Remove the presence of a node n within the given interval.

        Removes the presence node n and all adjacent edges within the given interval.

        If interval is specified, all the edges of n will be removed within that interval.

        Quiet if n is not in the interval graph.

        Parameters
        ----------
        n : node
           A node in the graph
        begin: integer, optional  (default= beginning of the entire interval graph)
            Inclusive beginning time of the node appearing in the interval graph.
        end: integer, optional  (default= end of the entire interval graph + 1)
            Non-inclusive ending time of the node appearing in the interval graph.
            Must be bigger than begin.
            Note that the default value is shifted up by 1 to make it an inclusive end.

        Examples
        --------
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)])
        >>> G.add_nodes_from([(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'})])
        >>> G.nodes(begin=4, end=6)
        [1, 2, 4, 6]
        >>> G.remove_node(2, begin=4, end=6)
        >>> G.nodes(begin=4, end=6)
        [4, 6]
        >>> G.nodes(data=True)
        [(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {})]
        >>> G.remove_node(2)
        >>> G.nodes(data=True)
        [(1, {'time': '1pm'}), (4, {'time': '4pm'}), (6, {})]
        """

        if n not in self._node:
            return

        if begin is None and end is None:
            for iedge in list(self._adj[n].keys()):
                self.__remove_iedge(iedge)
        else:
            if begin is None:
                begin = self.tree.begin()

            if end is None:
                end = self.tree.end() + 1

            for iedge in self.tree[begin:end]:
                if iedge.data[0] == n or iedge.data[1] == n:
                    self.__remove_iedge(iedge)

        # delete the node and its attributes if no edge left
        if len(self._adj[n]) == 0:
            self._adj.pop(n, None)
            self._node.pop(n, None)

    def add_edge(self, u, v, begin, end, **attr):
        """Add an edge between u and v, during interval [begin, end).

        The nodes u and v will be automatically added if they are
        not already in the interval graph.

        Edge attributes can be specified with keywords or by directly
        accessing the edge's attribute dictionary. See examples below.

        Parameters
        ----------
        u, v : nodes
            Nodes can be, for example, strings or numbers.
            Nodes must be hashable (and not None) Python objects.
        begin: orderable type
            Inclusive beginning time of the edge appearing in the interval graph.
        end: orderable type
            Non-inclusive ending time of the edge appearing in the interval graph.
            Must be bigger than begin.
        attr : keyword arguments, optional
            Edge data (or labels or objects) can be assigned using
            keyword arguments.

        Raises
        ------
        NetworkXError
            If the interval tree rejects the interval with a ValueError
            (reported as an edge duration that is not strictly positive).
            The graph is left unchanged in that case.

        See Also
        --------
        add_edges_from : add a collection of edges

        Notes
        -----
        Adding an edge that already exists updates the edge data.

        Both begin and end must be the same type across all edges in the interval graph. Also, to create
        snapshots, both must be integers.

        Many NetworkX algorithms designed for weighted graphs use
        an edge attribute (by default `weight`) to hold a numerical value.

        Examples
        --------
        The following all add the edge e=(1, 2, 3, 10) to graph G:

        >>> G = dnx.IntervalGraph()
        >>> e = (1, 2, 3, 10)
        >>> G.add_edge(1, 2, 3, 10)           # explicit two-node form with interval
        >>> G.add_edge(*e)             # single edge as tuple of two nodes and interval
        >>> G.add_edges_from([(1, 2, 3, 10)])  # add edges from iterable container

        Associate data to edges using keywords:

        >>> G.add_edge(1, 2, 3, 10, weight=3)
        >>> G.add_edge(1, 3, 4, 9, weight=7, capacity=15, length=342.7)
        """

        iedge = self.__get_iedge_in_tree(begin, end, u, v)

        # if edge exists, just update attr
        if iedge is not None:
            # since both point to the same attr, updating one is enough
            self._adj[u][iedge].update(attr)
            return

        iedge = Interval(begin, end, (u, v))

        # Add the edge to the tree before touching the node dictionaries, so
        # that a rejected interval does not leave u and v behind as new,
        # edge-less nodes.
        try:
            self.tree.add(iedge)
        except ValueError:
            raise NetworkXError(
                "IntervalGraph: edge duration must be strictly bigger than zero {0}."
                .format(iedge))

        # add nodes
        for node in (u, v):
            if node not in self._node:
                self._adj[node] = {}
                self._node[node] = {}

        # Both end points share one attribute dict for this edge
        self._adj[u][iedge] = self._adj[v][iedge] = attr

    def add_edges_from(self, ebunch_to_add, **attr):
        """Add all the edges in ebunch_to_add.

        Parameters
        ----------
        ebunch_to_add : container of edges
            Each edge given in the container will be added to the
            interval graph. The edges must be given as as 4-tuples (u, v, being, end).
            Both begin and end must be orderable and the same type across all edges.
        attr : keyword arguments, optional
            Edge data (or labels or objects) can be assigned using
            keyword arguments.

        See Also
        --------
        add_edge : add a single edge

        Notes
        -----
        Adding the same edge (with the same interval) twice has no effect
        but any edge data will be updated when each duplicate edge is added.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11)]) # using a list of edge tuples

        Associate data to edges

        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11)], weight=3)
        >>> G.add_edges_from([(3, 4, 2, 19), (1, 4, 1, 3)], label='WN2898')
        """

        for e in ebunch_to_add:
            if len(e) != 4:
                raise NetworkXError(
                    "Edge tuple {0} must be a 4-tuple.".format(e))

            self.add_edge(e[0], e[1], e[2], e[3], **attr)

    def has_edge(self, u, v, begin=None, end=None, overlapping=True):
        """Return True if there exists an edge between u and v
        in the interval graph, during the given interval.

        Parameters
        ----------
        u, v : nodes
            Nodes can be, for example, strings or numbers.
            Nodes must be hashable (and not None) Python objects.
            The adjacency of `u` is looked up directly, so `u` has to be a node
            of the graph.
        begin : integer, optional (default= beginning of the entire interval graph)
            Inclusive beginning time of the node appearing in the interval graph.
        end : integer, optional (default= end of the entire interval graph + 1)
            Non-inclusive ending time of the node appearing in the interval graph.
            Must be bigger than begin.
            Note that the default value is shifted up by 1 to make it an inclusive end.
        overlapping : bool, optional (default= True)
            if True, it returns True if there exists an edge between u and v with
            overlapping interval with `begin` and `end`.
            if False, it returns true only if there exists an edge between u and v
            with the exact interval.
            Note: if False, both `begin` and `end` must be defined, otherwise
            an exception is raised.
            Note: when neither `begin` nor `end` is given this flag is ignored and
            the method only reports whether any edge joins u and v.

        Returns
        -------
        bool
            True if a matching edge exists, False otherwise.

        Raises
        ------
        NetworkXError
            If `overlapping= False` and exactly one of `begin` / `end` is defined.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11)])
        >>> G.has_edge(1, 2)
        True
        >>> G.has_edge(1, 1)
        False

        With specific overlapping interval:

        >>> G.has_edge(1, 2, begin=2)
        True
        >>> G.has_edge(2, 4, begin=12)
        False

        Exact interval match:

        >>> G.has_edge(2, 4, begin=1, end=11, overlapping=False)
        True
        >>> G.has_edge(2, 4, begin=2, end=11, overlapping=False)
        False
        """

        def joins_u_and_v(iv):
            # Both endpoints have to match. Testing only "one endpoint equals v"
            # would accept every edge incident to u when u == v.
            first, second = iv.data[0], iv.data[1]
            return (first == u and second == v) or (first == v and second == u)

        # No interval constraint at all: any edge between u and v qualifies.
        if begin is None and end is None:
            return any(joins_u_and_v(iv) for iv in self._adj[u].keys())

        if not overlapping:
            if begin is None or end is None:
                raise NetworkXError(
                    "For exact interval match (overlapping=False), both begin and end must be defined."
                )

            return self.__get_iedge_in_tree(u, v, begin, end) is not None

        # Open-ended queries fall back to the extent of the whole graph.
        if begin is None:
            begin = self.tree.begin()

        if end is None:
            # + 1 because `end` is non-inclusive.
            end = self.tree.end() + 1

        return any(
            joins_u_and_v(iv) and iv.overlaps(begin=begin, end=end)
            for iv in self._adj[u].keys())

    def edges(self,
              u=None,
              v=None,
              begin=None,
              end=None,
              data=False,
              default=None):
        """A list of Interval objects of the IntervalGraph edges.

        All edges which are present within the given interval.

        All parameters are optional. `u` and `v` can be thought of as constraints.
        If no node is defined, all edges within the interval are returned.
        If one node is defined, all edges which have that node as one end,
        will be returned, and finally if both nodes are defined then all
        edges between the two nodes are returned.

        Parameters
        ----------
        u, v : nodes, optional (default=None)
            Nodes can be, for example, strings or numbers.
            Nodes must be hashable (and not None) Python objects.
            If the node does not exist in the graph, a key error is raised.
        begin: integer, optional  (default= beginning of the entire interval graph)
            Inclusive beginning time of the edge appearing in the interval graph.
        end: integer, optional  (default= end of the entire interval graph + 1)
            Non-inclusive ending time of the edge appearing in the interval graph.
            Must be bigger than begin.
            Note that the default value is shifted up by 1 to make it an inclusive end.
        data : string or bool, optional (default=False)
            If True, return 2-tuple (Interval object, dict of attributes).
            If False, return just the Interval objects.
            If string (name of the attribute), return 2-tuple (Interval object, attribute value).
        default : value, optional (default=None)
            Default Value to be used for edges that don't have the requested attribute.
            Only relevant if `data` is a string (name of an attribute).

        Returns
        -------
        List of Interval objects
            An interval object has the following format: (begin, end, (u, v))

            When called, if `data` is False, a list of interval objects.
            If `data` is True, a list of 2-tuples: (Interval, dict of attribute(s) with values),
            If `data` is a string, a list of 2-tuples (Interval, attribute value).

        Examples
        --------
        To get a list of all edges:

        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)])
        >>> G.edges()
        [Interval(8, 15, (2, 4)), Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4)), Interval(12, 19, (6, 4))]

        To get edges which appear in a specific interval:

        >>> G.edges(begin=10)
        [Interval(12, 19, (6, 4)), Interval(1, 11, (2, 4)), Interval(8, 15, (2, 4))]
        >>> G.edges(end=5)
        [Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4))]
        >>> G.edges(begin=2, end=4)
        [Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4))]

        To get edges with either of the two nodes being defined:

        >>> G.edges(u=2)
        [Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4)), Interval(8, 15, (2, 4))]
        >>> G.edges(u=2, begin=11)
        [Interval(1, 11, (2, 4)), Interval(8, 15, (2, 4))]
        >>> G.edges(u=2, v=4, end=8)
        [Interval(1, 11, (2, 4))]
        >>> G.edges(u=1, v=6)
        []
        >>> G.edges(u=2, v=2)
        []

        To get a list of edges with data:

        >>> G = dnx.IntervalGraph()
        >>> G.add_edge(1, 3, 1, 4, weight=8, height=18)
        >>> G.add_edge(1, 2, 3, 10, weight=10)
        >>> G.add_edge(2, 3, 2, 8)
        >>> G.edges(data="weight")
        [(Interval(2, 8, (2, 3)), None), (Interval(3, 10, (1, 2)), 10), (Interval(1, 4, (1, 3)), 8)]
        >>> G.edges(data="weight", default=5)
        [(Interval(2, 8, (2, 3)), 5), (Interval(3, 10, (1, 2)), 10), (Interval(1, 4, (1, 3)), 8)]
        >>> G.edges(data=True)
        [(Interval(2, 8, (2, 3)), {}), (Interval(3, 10, (1, 2)), {'weight': 10}), (Interval(1, 4, (1, 3)), {'height': 18, 'weight': 8})]
        >>> G.edges(u=1, begin=5, end=9, data="weight")
        [(Interval(3, 10, (1, 2)), 10)]
        """

        # If none of the nodes are defined the interval tree is queried for the
        # list of edges, otherwise the edges are taken from self._adj.
        if u is None and v is None:
            if begin is None and end is None:
                iedges = self.tree.all_intervals
            # interval filtering
            else:
                if begin is None:
                    begin = self.tree.begin()
                if end is None:
                    # + 1 because `end` is non-inclusive.
                    end = self.tree.end() + 1

                iedges = self.tree[begin:end]

        else:
            # Node filtering
            if u is not None and v is not None:
                # Both endpoints must match. Testing only "one endpoint equals v"
                # would return every edge incident to u when u == v.
                iedges = [
                    iv for iv in self._adj[u].keys()
                    if (iv.data[0] == u and iv.data[1] == v) or (
                        iv.data[0] == v and iv.data[1] == u)
                ]
            elif u is not None:
                iedges = self._adj[u].keys()
            else:
                iedges = self._adj[v].keys()

            # Interval filtering
            # NOTE(review): `iv.end >= begin` keeps an edge whose non-inclusive
            # end equals `begin`, as the documented `G.edges(u=2, begin=11)`
            # example shows. That looks more permissive than the overlap test
            # used by has_edge; left unchanged because callers may rely on it.
            if begin is not None and end is not None:
                iedges = [
                    iv for iv in iedges if iv.end >= begin and iv.begin < end
                ]
            elif begin is not None:
                iedges = [iv for iv in iedges if iv.end >= begin]
            elif end is not None:
                iedges = [iv for iv in iedges if iv.begin < end]

        # Appending attribute data if needed
        if data is False:
            return iedges if isinstance(iedges, list) else list(iedges)

        # Edge attributes are stored under the first endpoint recorded in the
        # interval's data tuple.
        if data is True:
            return [(iv, self._adj[iv.data[0]][iv]) for iv in iedges]

        return [(iv, self._adj[iv.data[0]][iv].get(data, default))
                for iv in iedges]

    def remove_edge(self, u, v, begin=None, end=None, overlapping=True):
        """Remove the edge between u and v in the interval graph,
        during the given interval.

        Quiet if the specified edge is not present (including when `u` is not
        a node of the graph).

        Parameters
        ----------
        u, v : nodes
            Nodes can be, for example, strings or numbers.
            Nodes must be hashable (and not None) Python objects.
        begin : integer, optional (default= beginning of the entire interval graph)
            Inclusive beginning time of the edge appearing in the interval graph.
        end : integer, optional (default= end of the entire interval graph + 1)
            Non-inclusive ending time of the edge appearing in the interval graph.
            Must be bigger than begin.
            Note that the default value is shifted up by 1 to make it an inclusive end.
        overlapping : bool, optional (default= True)
            if True, remove the edge between u and v with overlapping interval
            with `begin` and `end`.
            if False, remove the edge between u and v with the exact interval.
            Note: if False, both `begin` and `end` must be defined, otherwise
            an exception is raised.

        Raises
        ------
        NetworkXError
            If `begin` and `end` are not defined and `overlapping= False`

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 5, 9), (1, 2, 8, 15)])
        >>> G.remove_edge(1, 2)
        >>> G.has_edge(1, 2)
        False

        With specific overlapping interval

        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 5, 9), (1, 2, 8, 15)])
        >>> G.remove_edge(1, 2, begin=2, end=4)
        >>> G.has_edge(1, 2, begin=2, end=4)
        False
        >>> G.has_edge(1, 2)
        True

        Exact interval match

        >>> G.remove_edge(2, 4, begin=1, end=11, overlapping=False)
        >>> G.has_edge(2, 4, begin=1, end=11)
        False
        """
        # remove edge between u and v with the exact given interval
        if not overlapping:
            if begin is None or end is None:
                raise NetworkXError(
                    "For exact interval match (overlapping=False), both begin and end must be defined."
                )

            iedge = self.__get_iedge_in_tree(u, v, begin, end)
            if iedge is not None:
                self.__remove_iedge(iedge)
            return

        # An unknown node cannot have edges; stay quiet as documented.
        if u not in self._adj:
            return

        def joins_u_and_v(iv):
            # Both endpoints must match. Testing only "one endpoint equals v"
            # would delete every edge incident to u when u == v.
            first, second = iv.data[0], iv.data[1]
            return (first == u and second == v) or (first == v and second == u)

        if begin is None and end is None:
            # remove every edge between u and v
            iedges_to_remove = [
                iv for iv in self._adj[u].keys() if joins_u_and_v(iv)
            ]
        else:
            # remove edge between u and v with overlapping interval with the given interval
            if begin is None:
                begin = self.tree.begin()

            if end is None:
                # + 1 because `end` is non-inclusive.
                end = self.tree.end() + 1

            iedges_to_remove = [
                iv for iv in self._adj[u].keys()
                if joins_u_and_v(iv) and iv.overlaps(begin=begin, end=end)
            ]

        # The victims are collected first because removal mutates self._adj[u].
        for iv in iedges_to_remove:
            self.__remove_iedge(iv)

    def __remove_iedge(self, iedge):
        """Drop one interval edge from the tree and from both adjacency maps.

        Quiet if the specified edge is not present.

        Parameters
        ----------
        iedge : Interval object
            Interval edge to be removed; its ``data`` holds the two endpoints.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edge(1, 2, 3, 10)
        >>> iedge = Interval(3, 10, (1, 2))   # Interval(begin, end, (u, v))
        >>> G.__remove_iedge(iedge)
        """
        self.tree.discard(iedge)

        # The edge is registered under each endpoint; pop() with a default
        # keeps the call silent when the edge is already gone.
        for position in (0, 1):
            self._adj[iedge.data[position]].pop(iedge, None)

    def __get_iedge_in_tree(self, u, v, begin, end):
        """Look up the edge between u and v that has exactly the given interval.

        Parameters
        ----------
        u, v : nodes
            Nodes can be, for example, strings or numbers.
            Nodes must be hashable (and not None) Python objects.
        begin : integer
            Inclusive beginning time of the edge appearing in the interval graph.
        end : integer
            Non-inclusive ending time of the edge appearing in the interval graph.
            Must be bigger than begin.

        Returns
        -------
        Interval object or None
            The matching interval edge if the tree contains it, otherwise None.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edge(1, 2, 3, 10)
        >>> G.__get_iedge_in_tree(2, 1, 3, 10)
        Interval(3, 10, (1, 2))
        >>> G.__get_iedge_in_tree(2, 1, 4, 10)
        None
        """

        # The endpoints may have been stored in either order, so probe the
        # tree with (u, v) first and (v, u) second.
        for endpoints in ((u, v), (v, u)):
            candidate = Interval(begin, end, endpoints)
            if candidate in self.tree:
                return candidate

        return None

    def to_subgraph(self,
                    begin,
                    end,
                    multigraph=False,
                    edge_data=False,
                    edge_interval_data=False,
                    node_data=False):
        """Return a networkx Graph or MultiGraph which includes all the nodes and
        edges which have overlapping intervals with the given interval.

        Parameters
        ----------
        begin: integer
            Inclusive beginning time of the edge appearing in the interval graph.
        end: integer
            Non-inclusive ending time of the edge appearing in the interval graph.
            Must be bigger than begin.
        multigraph: bool, optional (default= False)
            If True, a networkx MultiGraph will be returned. If False, networkx Graph.
        edge_data: bool, optional (default= False)
            If True, edges will keep their attributes.
        edge_interval_data: bool, optional (default= False)
            If True, each edge's attribute will also include its begin and end interval data.
            If `edge_data= True` and there already exist edge attributes with names begin and end,
            they will be overwritten.
        node_data : bool, optional (default= False)
            if True, each node's attributes will be included.

        Raises
        ------
        NetworkXError
            If `end` is not strictly bigger than `begin`.

        See Also
        --------
        to_snapshots : divide the interval graph to snapshots

        Notes
        -----
        If multigraph= False, and edge_data=True or edge_interval_data=True,
        in case there are multiple edges, only one will show with one of the edge's attributes.

        Note: nodes with no edges will not appear in any subgraph.

        Examples
        --------
        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)])
        >>> H = G.to_subgraph(4, 12)
        >>> type(H)
        <class 'networkx.classes.graph.Graph'>
        >>> list(H.edges(data=True))
        [(1, 2, {}), (2, 4, {})]

        >>> H = G.to_subgraph(4, 12, edge_interval_data=True)
        >>> type(H)
        <class 'networkx.classes.graph.Graph'>
        >>> list(H.edges(data=True))
        [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 15, 'begin': 8})]

        >>> M = G.to_subgraph(4, 12, multigraph=True, edge_interval_data=True)
        >>> type(M)
        <class 'networkx.classes.multigraph.MultiGraph'>
        >>> list(M.edges(data=True))
        [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 11, 'begin': 1}), (2, 4, {'end': 15, 'begin': 8})]
        """

        if end <= begin:
            raise NetworkXError(
                "IntervalGraph: subgraph duration must be strictly bigger than zero: "
                "begin: {}, end: {}.".format(begin, end))

        iedges = self.tree[begin:end]
        G = MultiGraph() if multigraph else Graph()

        def as_edge(iedge):
            # Turn one interval edge into the tuple add_edges_from expects:
            # (u, v) when no attributes are requested, else (u, v, attrs).
            first, second = iedge.data[0], iedge.data[1]
            if edge_data and edge_interval_data:
                # Work on a copy so the graph's own attribute dict is untouched.
                attrs = dict(self._adj[first][iedge])
            elif edge_data:
                return first, second, self._adj[first][iedge].copy()
            elif edge_interval_data:
                attrs = {}
            else:
                return first, second

            # Interval bounds overwrite user attributes of the same name.
            attrs['begin'] = iedge.begin
            attrs['end'] = iedge.end
            return first, second, attrs

        G.add_edges_from(as_edge(iedge) for iedge in iedges)

        # include node attributes
        if node_data:
            G.add_nodes_from((n, self._node[n].copy()) for n in G.nodes)

        return G

    def to_snapshots(self,
                     number_of_snapshots,
                     multigraph=False,
                     edge_data=False,
                     edge_interval_data=False,
                     node_data=False,
                     return_length=False):
        """Return a list of networkx Graph or MultiGraph objects as snapshots
        of the interval graph in consecutive order.

        Parameters
        ----------
        number_of_snapshots : integer
            Number of snapshots to divide the interval graph into.
            Must be bigger than 1.
        multigraph : bool, optional (default= False)
            If True, a networkx MultiGraph will be returned. If False, networkx Graph.
        edge_data: bool, optional (default= False)
            If True, edges will keep their attributes.
        edge_interval_data : bool, optional (default= False)
            If True, each edge's attribute will also include its begin and end interval data.
            If `edge_data= True` and there already exist edge attributes with names begin and end,
            they will be overwritten.
        node_data : bool, optional (default= False)
            if True, each node's attributes will be included.
        return_length : bool, optional (default= False)
            If true, the length of snapshots will be returned as the second argument.

        Returns
        -------
        list, or (list, number) if `return_length` is true
            The snapshots in chronological order, optionally followed by the
            length of one snapshot.

        Raises
        ------
        NetworkXError
            If `number_of_snapshots` is not an integer or is smaller than 2.

        See Also
        --------
        to_subgraph : subgraph based on an interval

        Notes
        -----
        In order to create snapshots, begin and end interval objects of the interval graph must be numbers.

        If multigraph= False, and edge_data=True or edge_interval_data=True,
        in case there are multiple edges, only one will show with one of the edge's attributes.

        Examples
        --------
        Snapshots of NetworkX Graph

        >>> G = dnx.IntervalGraph()
        >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)])
        >>> S, l = G.to_snapshots(2, edge_interval_data=True, return_length=True)
        >>> S
        [<networkx.classes.graph.Graph object at 0x100000>, <networkx.classes.graph.Graph object at 0x150d00>]
        >>> l
        9.0
        >>> for g in S:
        ...     print(g.edges(data=True))
        [(1, 2, {'begin': 3, 'end': 10}), (2, 4, {'begin': 8, 'end': 15})]
        [(2, 4, {'begin': 8, 'end': 15}), (4, 6, {'begin': 12, 'end': 19})]

        Snapshots of NetworkX MultiGraph

        >>> S, l = G.to_snapshots(3, multigraph=True, edge_interval_data=True, return_length=True)
        >>> S
        [<networkx.classes.multigraph.MultiGraph object at 0x1060d40b8>, <networkx.classes.multigraph.MultiGraph object at 0x151020c9e8>, <networkx.classes.multigraph.MultiGraph object at 0x151021d390>]
        >>> l
        6.0
        >>> for g in S:
        ...     print(g.edges(data=True))
        [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 11, 'begin': 1})]
        [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 11, 'begin': 1}), (2, 4, {'end': 15, 'begin': 8}), (4, 6, {'end': 19, 'begin': 12})]
        [(2, 4, {'end': 15, 'begin': 8}), (4, 6, {'end': 19, 'begin': 12})]
        """

        # Check the type before comparing: a non-numeric argument would
        # otherwise fail inside `<` with a TypeError instead of reaching the
        # documented NetworkXError.
        if not isinstance(number_of_snapshots,
                          int) or number_of_snapshots < 2:
            raise NetworkXError(
                "IntervalGraph: number of snapshots must be an integer and 2 or bigger. "
                "{0} was passed.".format(number_of_snapshots))

        begin, end = self.interval()
        snapshot_len = (end - begin) / number_of_snapshots

        snapshots = []
        last_index = number_of_snapshots - 1
        for i in range(number_of_snapshots):
            # since to_subgraph is end non-inclusive, shift the end up by 1 to
            # include end in the last snapshot.
            end_inclusive_addition = 1 if i == last_index else 0

            snapshots.append(
                self.to_subgraph(begin + snapshot_len * i,
                                 begin + snapshot_len * (i + 1) +
                                 end_inclusive_addition,
                                 multigraph=multigraph,
                                 edge_data=edge_data,
                                 edge_interval_data=edge_interval_data,
                                 node_data=node_data))
        if return_length:
            return snapshots, snapshot_len

        return snapshots

    @staticmethod
    def load_from_txt(path, delimiter=" ", nodetype=None, comments="#"):
        """Read interval graph in from path.
           Every line in the file must be an edge in the following format: "node node begin end".
           Both interval times must be integers. Nodes can be any hashable objects.

        Parameters
        ----------
        path : string
           Name of the file to read; it is passed to ``open``.

        nodetype : Python type, optional
           Convert nodes to this type.

        comments : string, optional
           Marker for comment lines. Everything from the marker to the end
           of the line is ignored.

        delimiter : string, optional
           Separator between the four fields of a line. The default is a single space.

        Returns
        -------
        G: IntervalGraph
            The graph corresponding to the lines in edge list.

        Raises
        ------
        TypeError
            If a node cannot be converted with `nodetype`, or if `begin` / `end`
            cannot be converted to int.

        Examples
        --------
        >>> G=dnx.IntervalGraph.load_from_txt("my_dygraph.txt")

        The optional nodetype is a function to convert node strings to nodetype.

        For example

        >>> G=dnx.IntervalGraph.load_from_txt("my_dygraph.txt", nodetype=int)

        will attempt to convert all nodes to integer type.

        Since nodes must be hashable, the function nodetype must return hashable
        types (e.g. int, float, str, frozenset - or tuples of those, etc.)
        """

        ig = IntervalGraph()

        with open(path, 'r') as file:
            for line in file:
                # Cut off a trailing comment, if any.
                p = line.find(comments)
                if p >= 0:
                    line = line[:p]

                # Skip lines with nothing left on them, including blank or
                # whitespace-only lines, which cannot be unpacked below.
                line = line.rstrip()
                if not line.strip():
                    continue

                u, v, begin, end = line.split(delimiter)

                if nodetype is not None:
                    try:
                        u = nodetype(u)
                        v = nodetype(v)
                    except Exception as err:
                        raise TypeError(
                            "Failed to convert node to type {0}".format(
                                nodetype)) from err

                # Interval bounds are always integers, whatever nodetype is.
                try:
                    begin = int(begin)
                    end = int(end)
                except (TypeError, ValueError) as err:
                    raise TypeError(
                        "Failed to convert time to type int") from err

                ig.add_edge(u, v, begin, end)

        return ig
Exemplo n.º 12
0
def smooth_nucleotide(regions, concat_regions_d, mutations, tukey_filter,
                      simulation_window):
    """Generate a smoothing curve for a list of element's mutations in the nucleotide sequence

    Each region gets a zero-filled score array padded on both sides, the
    ``tukey_filter`` is added onto that array around every mutation, and the
    padding is trimmed off again before the result is returned. When
    ``concat_regions_d`` is non-empty the per-region arrays are additionally
    joined, in sorted interval order, into one array stored under a single
    interval.

    Args:
        regions (IntervalTree): IntervalTree with genomic positions of an element
        concat_regions_d (dict): keys are start genomic regions, values are positions (index) relative to the start
            (the code reads the index through the ``.start`` attribute of the value).
            An empty/falsy value selects the non-concatenated mode.
        mutations (list): list of mutations formatted as namedtuple; the fields
            ``position`` and ``region`` (``region[0]`` is used) are read here
        tukey_filter (numpy.ndarray): kde array, length equals smoothing window.
        simulation_window (int): simulation window

    Returns:
        final_smooth_tree (IntervalTree): interval are genomic regions or indexes (concatenate mode),
            data np.array of smoothing score by position
        mutations_in (list): list of mutations in regions
    """
    first_smooth_tree = IntervalTree()
    final_smooth_tree = IntervalTree()
    mutations_in = []

    # Generate smoothing arrays for regions
    for interval in regions:
        # Add extra bases for smoothing of simulated mutations that fall outside regions and tukey_filter
        # (array length = region length + len(tukey_filter) + simulation_window - 2)
        first_smooth_tree.addi(
            interval.begin, interval.end,
            np.zeros((interval.end - interval.begin + len(tukey_filter) +
                      simulation_window - 2)))

    if not concat_regions_d:
        # Smooth
        for mutation in mutations:
            # Look the region up by its start coordinate (mutation.region[0]),
            # not by the mutation position, which may lie outside the region.
            for interval in first_smooth_tree[mutation.region[0]]:
                # Get index of mutation in region
                # new_begin is the coordinate that maps to index 0 of the padded array.
                new_begin = interval.begin - (simulation_window +
                                              len(tukey_filter) -
                                              2) // 2  # always integer
                index = mutation.position - new_begin
                # Start the filter (len(tukey_filter) - 1) // 2 positions before the mutation.
                tukey_begin = index - (len(tukey_filter) - 1) // 2
                # Smooth mutations
                # The array is modified in place through the interval's data reference.
                interval.data[tukey_begin:tukey_begin +
                              len(tukey_filter)] += tukey_filter
            # Get mutations inside regions
            if regions[mutation.position]:
                mutations_in.append(mutation)

        # Remove extra bp
        for interval in first_smooth_tree:
            begin = interval.begin
            end = interval.end
            # Padding added on each side when the arrays were created.
            # NOTE(review): a slicer of 0 would make data[0:-0] an empty array — confirm
            # callers always pass a window/filter combination with slicer >= 1.
            slicer = (simulation_window + len(tukey_filter) - 2) // 2
            final_smooth_tree.addi(begin, end, interval.data[slicer:-slicer])

    else:
        # Smooth simulated mutations outside regions
        for mutation in mutations:
            # Only mutations whose position is covered by no region are handled in this pass.
            if not first_smooth_tree[mutation.position]:
                for interval in first_smooth_tree[mutation.region[0]]:
                    # new_begin is the coordinate that maps to index 0 of the padded array.
                    new_begin = interval.begin - (simulation_window +
                                                  len(tukey_filter) -
                                                  2) // 2  # always integer
                    index = mutation.position - new_begin
                    tukey_begin = index - (len(tukey_filter) - 1) // 2
                    # Smooth mutations
                    interval.data[tukey_begin:tukey_begin +
                                  len(tukey_filter)] += tukey_filter

        # Remove extra bp
        for interval in first_smooth_tree:
            begin = interval.begin
            end = interval.end
            slicer = (simulation_window + len(tukey_filter) - 2) // 2
            final_smooth_tree.addi(begin, end, interval.data[slicer:-slicer])

        # Merge sorted regions (one interval == concatenated sequence) and add tukey//2 to both ends
        concat_tree = IntervalTree()
        # Leading zero padding of (len(tukey_filter) - 1) // 2 positions.
        concat_array = np.zeros((len(tukey_filter) - 1) // 2)
        for interval in sorted(final_smooth_tree):
            concat_array = np.append(concat_array, interval.data)
        # Matching trailing zero padding.
        concat_array = np.append(concat_array,
                                 np.zeros((len(tukey_filter) - 1) // 2))
        # One interval spanning from the first region start to the last region end.
        concat_tree.addi(final_smooth_tree.begin(), final_smooth_tree.end(),
                         concat_array)
        # Start over with an empty result tree; it is refilled from concat_tree below.
        final_smooth_tree = IntervalTree()

        # Smooth mutations inside regions
        for mutation in mutations:
            if first_smooth_tree[mutation.position]:
                for interval in concat_tree[mutation.position]:
                    # Get index of mutation in concatenated sequence
                    # (offset inside its region + index of that region in the concatenation)
                    index = (mutation.position - mutation.region[0]
                             ) + concat_regions_d[mutation.region[0]].start
                    # Smooth mutations
                    # No half-window shift is applied here; presumably the leading zero
                    # padding of the concatenated array accounts for it — TODO confirm.
                    interval.data[index:(index +
                                         len(tukey_filter))] += tukey_filter
                mutations_in.append(mutation)

        # Remove extra bp
        for interval in concat_tree:
            begin = interval.begin
            end = interval.end
            # Strip the zero padding added around the concatenated array.
            slicer = (len(tukey_filter) - 1) // 2
            final_smooth_tree.addi(begin, end, interval.data[slicer:-slicer])

    return final_smooth_tree, mutations_in
Exemplo n.º 13
0
class TemporalPathPyObject(PathPyObject):
    """Base class for a temporal object.

    Events are kept in an ``IntervalTree``; each one carries a start time, an
    end time and a dict of attributes. Iterating the object (or indexing it
    with a number or slice) yields the object itself once per stored event,
    with ``_attributes`` switched to that event's values.
    """
    def __init__(self, uid: Optional[str] = None, **kwargs: Any) -> None:
        """Initialize the temporal object and record ``kwargs`` as first event."""
        super().__init__(uid=uid)

        # Defaults; event() below replaces them with the tree's bounds.
        self._start = float('-inf')
        self._end = float('inf')

        # Storage for the events.
        self._events = IntervalTree()
        self.event(**kwargs)

        # Event count seen at the last clean-up; used to detect changes.
        self._len_events = len(self._events)

    def __iter__(self):
        self._clean_events()

        # Yield the object once per event, in sorted order.
        for interval in sorted(self._events):
            begin, finish, payload = interval
            snapshot = {'start': begin, 'end': finish}
            snapshot.update(payload)
            self._attributes = snapshot
            yield self

        # Drop the temporal keys once the iteration is exhausted.
        for temporal_key in ('start', 'end'):
            self._attributes.pop(temporal_key, None)

    @singledispatchmethod
    def __getitem__(self, key: Any) -> Any:
        self._clean_events()
        # Read the attribute from the last event in sorted order.
        newest_attributes = self.last()[2]
        return newest_attributes.get(key)

    @__getitem__.register(tuple)  # type: ignore
    def _(self, key: tuple) -> Any:
        lower, upper, _ = _get_start_end(key[0])

        # Merge the attribute dicts of every event in the window; events
        # later in sorted order overwrite earlier ones.
        merged: dict = {}
        for _begin, _finish, payload in sorted(self._events[lower:upper]):
            merged.update(payload.items())

        if len(key) == 2:
            return merged.get(key[1], None)
        return merged

    @__getitem__.register(slice)  # type: ignore
    @__getitem__.register(int)  # type: ignore
    @__getitem__.register(float)  # type: ignore
    def _(self, key: Union[int, float, slice]) -> Any:
        lower, upper, _ = _get_start_end(key)
        self._clean_events()

        # Same protocol as __iter__, restricted to the requested window.
        for interval in sorted(self._events[lower:upper]):
            begin, finish, payload = interval
            snapshot = {'start': begin, 'end': finish}
            snapshot.update(payload)
            self._attributes = snapshot
            yield self

        for temporal_key in ('start', 'end'):
            self._attributes.pop(temporal_key, None)

    @singledispatchmethod
    def __setitem__(self, key: Any, value: Any) -> None:
        # A plain key applies to the whole span currently covered by events.
        first = self._events.begin()
        final = self._events.end()
        self.event(start=first, end=final, **{key: value})

    @__setitem__.register(tuple)  # type: ignore
    def _(self, key: tuple, value: Any) -> None:
        # key is (time specification, attribute name).
        lower, upper, _ = _get_start_end(key[0])
        self.event(start=lower, end=upper, **{key[1]: value})

    @property
    def start(self):
        """start of the object"""
        current = self.attributes
        return current.get('start', self._start)

    @property
    def end(self):
        """end of the object"""
        current = self.attributes
        return current.get('end', self._end)

    def _clean_events(self):
        """helper function to clean events"""

        # Original author's note (BUG): the intervaltree library's
        # merge_equals switches old and new data randomly.
        def reducer(old, new):
            combined = dict(old)
            combined.update(new)
            return combined

        # Nothing to do while the number of events is unchanged.
        if len(self._events) == self._len_events:
            return

        # Split overlapping intervals, then fold the dicts of the intervals
        # that became equal into one.
        self._events.split_overlaps()
        self._events.merge_equals(data_reducer=reducer)

        # Remember the new event count.
        self._len_events = len(self._events)

    def event(self, *args, **kwargs) -> None:
        """Add a temporal event, or cut a time span out when ``active`` is false."""

        # 'active' is a control flag, not an attribute of the event.
        is_active = kwargs.pop('active', True)

        # Resolve the time span; the remaining kwargs are the attributes.
        lower, upper, attributes = _get_start_end(*args, **kwargs)

        if not is_active:
            self._events.chop(lower, upper)
        else:
            self._events[lower:upper] = attributes  # type: ignore
            self._attributes = attributes.copy()

        # Keep the cached bounds in sync with the tree.
        self._start = self._events.begin()
        self._end = self._events.end()

    def last(self):
        """return the last interval in sorted order as (begin, end, data)"""
        newest = sorted(self._events)[-1]
        return newest.begin, newest.end, newest.data
def precise_extension(dict_transcript, dict_exon_signal, gene_col,
                      coverage_stringtie):
    """Extend transcripts towards nearby signal, one chromosome at a time.

    For "+" transcripts the ``end`` may be moved forward, for "-" transcripts
    the ``begin`` may be moved backward, in both cases by at most about 5000
    positions. The new boundary is taken from a same-strand interval of
    ``dict_exon_signal``; which window is searched depends on how many exon
    intervals of other same-strand genes were collected in the 5000-position
    window (more than one, exactly one, or none).

    Args:
        dict_transcript: mapping chromosome -> interval tree of transcripts.
            Each interval's ``data`` is a sequence of exon records; the
            fields read here are ``[3]`` (start), ``[4]`` (end), ``[6]``
            (strand) and ``[gene_col]``. Presumably GTF columns -- TODO
            confirm against the caller.
        dict_exon_signal: mapping chromosome -> interval tree of signal
            intervals whose ``data[0]`` is the strand and ``data[1]`` a
            number (presumably a coverage value -- TODO confirm).
        gene_col: index into an exon record used to decide whether two
            transcripts belong to the same gene.
        coverage_stringtie: threshold factor; it is multiplied by 200 and,
            in most branches, a signal is only accepted when
            ``data[1] * (end - begin)`` exceeds the product.

    Returns:
        dict mapping ``str(chromosome)`` to an ``IntervalTree`` holding each
        "+"/"-" transcript either unchanged or as a modified deep copy.
        Transcripts whose strand is neither "+" nor "-" are not copied.

    Side effects:
        Writes ``errors_file.txt`` in the current working directory with the
        first exon record of every transcript found already overlapping
        another gene before extension.
    """
    precisely_extended_dict = {}
    overlapped_transcripts = []
    coverage = coverage_stringtie * 200  # Average length of an exon = 200pb.
    # Boolean: whether the introns of a gene can be the exon of another one.
    # NOTE(review): hard-coded to False, so every branch guarded by
    # ``intron_exon == True`` below is currently never taken.
    intron_exon = False
    for chromosome in dict_transcript:
        # Create a new dictionary with the same model as dict_transcript.
        # NOTE(review): the key is str(chromosome) here but the tree is later
        # looked up with ``chromosome`` itself; this only works when the keys
        # of dict_transcript are already strings -- confirm.
        precisely_extended_dict[str(chromosome)] = IntervalTree()
        for transcript in sorted(dict_transcript[chromosome]):
            overlap_start = 0
            # Introduce the boolean extension with false as default for each transcript.
            extension = False
            # Case where the transcript is from the positive strand.
            if transcript.data[0][6] == "+":
                # Check if there are other transcripts in the area to extend.
                if len(dict_transcript[chromosome][transcript.end +
                                                   1:transcript.end +
                                                   5001]) != 0:
                    exons_it = IntervalTree()
                    introns_it = IntervalTree()
                    max_extension = 0
                    for transcript_in_iv in sorted(
                            dict_transcript[chromosome][transcript.end +
                                                        1:transcript.end +
                                                        5001]):
                        # If other transcripts are from the same strand but not the same gene, store their exons in an interval tree and record where the overlap starts.
                        if transcript_in_iv.data[0][
                                gene_col] != transcript.data[0][
                                    gene_col] and transcript_in_iv.data[0][
                                        6] == "+":
                            # Only the first matching transcript (in sorted order) sets overlap_start.
                            if overlap_start == 0:
                                if transcript_in_iv.begin > transcript.end:
                                    overlap_start = transcript_in_iv.begin
                                # If transcripts are already overlapping before extension, error in the original GTF.
                                else:
                                    overlap_start = transcript.end + 1
                                    overlapped_transcripts.append(transcript)
                            for exon_in_transcript in transcript_in_iv.data:
                                if int(exon_in_transcript[3]) > transcript.end:
                                    exons_it[int(exon_in_transcript[3]) +
                                             1:int(exon_in_transcript[4]
                                                   )] = "exon"
                                else:
                                    continue
                    # Back to the case where there is an overlapping issue.
                    if len(exons_it) > 1:
                        # If there is a signal in the area where overlapping start in the stringtie output.
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [overlap_start:transcript.end +
                                    5001]) != 0 and intron_exon == True:
                                exons_it.merge_overlaps()
                                # Convert the exon interval tree into an intron one (gaps between consecutive exons).
                                for exon_number, exons in enumerate(
                                        sorted(exons_it)):
                                    if exon_number == 0:
                                        previous_end = exons.end
                                    else:
                                        introns_it[previous_end +
                                                   1:exons.begin] = "intron"
                                        previous_end = exons.end
                                # Check if signal overlap introns and assign max extension in consequence.
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [overlap_start:introns_it.end()],
                                        reverse=True):
                                    if signal.data[0] == "+":
                                        for intron in sorted(introns_it,
                                                             reverse=True):
                                            if signal.end > intron.begin and signal.begin < intron.end and signal.end <= transcript.end + 5001:
                                                if signal.end < intron.end:
                                                    max_extension = signal.end
                                                else:
                                                    max_extension = intron.end
                                                extension = True
                                                break
                                        if max_extension != 0:
                                            new_transcript_end = max_extension
                                            break
                                    else:
                                        continue
                                # Case where no signal overlap introns.
                                if max_extension == 0:
                                    if len(dict_exon_signal[chromosome]
                                           [transcript.end +
                                            1:overlap_start]) != 0:
                                        for signal in sorted(
                                                dict_exon_signal[chromosome]
                                            [transcript.end + 1:overlap_start],
                                                reverse=True):
                                            if signal.data[
                                                    0] == "+" and signal.end <= transcript.end + 5001:
                                                new_transcript_end = signal.end
                                                extension = True
                                                break
                                    else:
                                        extension = False
                            # Case where no signal overlap transcripts.
                            else:
                                if len(
                                        dict_exon_signal[chromosome]
                                    [transcript.end + 1:overlap_start]) != 0:
                                    for signal in sorted(
                                            dict_exon_signal[chromosome]
                                        [transcript.end + 1:overlap_start],
                                            reverse=True):
                                        if signal.data[
                                                0] == "+" and signal.end <= transcript.end + 5001 and signal.end < overlap_start:
                                            if signal.data[1] * (
                                                    signal.end -
                                                    signal.begin) > coverage:
                                                new_transcript_end = signal.end
                                                extension = True
                                                break
                        else:
                            extension = False
                    elif len(exons_it) == 1:
                        if chromosome in dict_exon_signal:
                            # NOTE(review): the emptiness test starts at transcript.end + 1
                            # while the loop below starts at transcript.end -- confirm intent.
                            if len(dict_exon_signal[chromosome]
                                   [transcript.end + 1:exons_it.begin()]) != 0:
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [transcript.end:exons_it.begin()],
                                        reverse=True):
                                    if signal.data[
                                            0] == "+" and signal.end <= exons_it.begin(
                                            ) - 1:
                                        if signal.data[1] * (
                                                signal.end -
                                                signal.begin) > coverage:
                                            new_transcript_end = signal.end
                                            extension = True
                                            break
                            else:
                                extension = False
                        else:
                            extension = False

                    else:
                        # If there is a signal present from the stringtie output overlapping from the end of the transcript to an inputted value, save the signal's end.
                        if chromosome in dict_exon_signal:
                            # NOTE(review): same window mismatch as above
                            # (test from end + 1, loop from end) -- confirm intent.
                            if len(dict_exon_signal[chromosome]
                                   [transcript.end + 1:transcript.end +
                                    5001]) != 0:
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [transcript.end:transcript.end + 5001],
                                        reverse=True):
                                    if signal.data[
                                            0] == "+" and signal.end <= transcript.end + 5001:
                                        if signal.data[1] * (
                                                signal.end -
                                                signal.begin) > coverage:
                                            new_transcript_end = signal.end
                                            extension = True
                                            break
                            else:
                                extension = False
                        else:
                            extension = False
                # When extension is true, the end of the transcript is replaced by the signal's end and the result is added to the new dict.
                if extension is True:
                    # Deep copy so the records of the input transcript stay untouched.
                    modified_transcript = copy.deepcopy(transcript)
                    # The last exon record gets the new end, a new field [1] and an "extension +N" note.
                    modified_transcript.data[-1][4] = str(new_transcript_end)
                    modified_transcript.data[-1][1] = "BestScriptEver"
                    modified_transcript.data[-1].append(
                        "extension +" +
                        str(new_transcript_end - transcript.end))
                    precisely_extended_dict[chromosome][
                        int(transcript.begin):int(new_transcript_end
                                                  )] = modified_transcript.data
                # Otherwise, unmodified transcript is added.
                else:
                    precisely_extended_dict[chromosome][
                        int(transcript.begin):int(transcript.end
                                                  )] = transcript.data

            # Case where the transcript is from the negative strand.
            if transcript.data[0][6] == "-":
                # Check if there are other transcripts in the area to extend.
                if len(dict_transcript[chromosome]
                       [transcript.begin - 5000:transcript.begin]) != 0:
                    exons_it = IntervalTree()
                    introns_it = IntervalTree()
                    max_extension = 0
                    for transcript_in_iv in sorted(
                            dict_transcript[chromosome][transcript.begin -
                                                        5000:transcript.begin],
                            reverse=True):
                        # If other transcripts are from the same strand but not the same gene, store their exons in an interval tree and record where the overlap starts.
                        if transcript_in_iv.data[0][
                                gene_col] != transcript.data[0][
                                    gene_col] and transcript_in_iv.data[0][
                                        6] == "-":
                            # Only the first matching transcript (in reverse sorted order) sets overlap_start.
                            if overlap_start == 0:
                                if transcript_in_iv.begin < transcript.begin:
                                    overlap_start = transcript_in_iv.end
                                # If transcripts are already overlapping before extension, error in the original GTF.
                                else:
                                    overlap_start = transcript.begin - 1
                                    overlapped_transcripts.append(transcript)
                            for exon_in_transcript in transcript_in_iv.data:
                                if int(exon_in_transcript[4]
                                       ) < transcript.begin:
                                    exons_it[int(exon_in_transcript[3]) +
                                             1:int(exon_in_transcript[4]
                                                   )] = "exon"
                                else:
                                    continue

                    # Back to the case where there is an overlapping issue.
                    if len(exons_it) > 1:
                        # If there is a signal in the area where overlapping start in the stringtie output.
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [transcript.begin - 5000:overlap_start +
                                    1]) != 0 and intron_exon == True:
                                exons_it.merge_overlaps()
                                # Convert the exon interval tree into an intron one (gaps between consecutive exons).
                                for exon_number, exons in enumerate(
                                        sorted(exons_it)):
                                    if exon_number == 0:
                                        previous_end = exons.end
                                    else:
                                        introns_it[previous_end +
                                                   1:exons.begin] = "intron"
                                        previous_end = exons.end
                                # Check if signal overlap introns and assign max extension in consequence.
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [introns_it.begin():overlap_start + 1]):
                                    if signal.data[0] == "-":
                                        for intron in sorted(introns_it):
                                            if signal.begin < intron.end and signal.end > intron.begin and signal.begin >= transcript.begin - 5000:
                                                if signal.begin > intron.begin:
                                                    max_extension = signal.begin
                                                else:
                                                    max_extension = intron.begin
                                                extension = True
                                                break
                                        if max_extension != 0:
                                            new_transcript_end = max_extension
                                            break
                                    else:
                                        continue
                                # Case where no signal overlap introns.
                                if max_extension == 0:
                                    if len(
                                            dict_exon_signal[chromosome]
                                        [overlap_start:transcript.begin]) != 0:
                                        for signal in sorted(
                                                dict_exon_signal[chromosome]
                                            [overlap_start:transcript.begin]):
                                            if signal.data[
                                                    0] == "-" and signal.begin >= transcript.begin - 5001:
                                                new_transcript_end = signal.begin
                                                extension = True
                                                break
                                    else:
                                        extension = False
                            # Case where no signal overlap transcripts.
                            else:
                                if len(dict_exon_signal[chromosome]
                                       [overlap_start:transcript.begin]) != 0:
                                    for signal in sorted(
                                            dict_exon_signal[chromosome]
                                        [overlap_start:transcript.begin]):
                                        if signal.data[
                                                0] == "-" and signal.begin >= transcript.begin - 5001 and signal.begin > overlap_start:
                                            if signal.data[1] * (
                                                    signal.end -
                                                    signal.begin) > coverage:
                                                new_transcript_end = signal.begin
                                                extension = True
                                                break
                        else:
                            extension = False
                    elif len(exons_it) == 1:
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [exons_it.end():transcript.begin]) != 0:
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [exons_it.end():transcript.begin]):
                                    if signal.data[
                                            0] == "-" and signal.begin >= exons_it.end(
                                            ) + 1:
                                        if signal.data[1] * (
                                                signal.end -
                                                signal.begin) > coverage:
                                            new_transcript_end = signal.begin
                                            extension = True
                                            break
                            else:
                                extension = False
                        else:
                            extension = False
                    else:
                        # If there is a signal present from the stringtie output overlapping from the end of the transcript to an inputted value, save the signal's end.
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [transcript.begin -
                                    5000:transcript.begin]) != 0:
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [transcript.begin -
                                     5000:transcript.begin]):
                                    if signal.data[
                                            0] == "-" and signal.begin >= transcript.begin - 5000:
                                        if signal.data[1] * (
                                                signal.end -
                                                signal.begin) > coverage:
                                            new_transcript_end = signal.begin
                                            extension = True
                                            break
                            else:
                                extension = False
                        else:
                            extension = False
                # When extension is true, the begin of the transcript is replaced by the signal's begin and the result is added to the new dict.
                if extension is True:
                    # Deep copy so the records of the input transcript stay untouched.
                    modified_transcript = copy.deepcopy(transcript)
                    # The first exon record gets the new start, a new field [1] and an "extension N" note
                    # (N is new begin minus old begin, no sign prefix is added here).
                    modified_transcript.data[0][3] = str(new_transcript_end)
                    modified_transcript.data[0][1] = "BestScriptEver"
                    modified_transcript.data[0].append("extension " +
                                                       str(new_transcript_end -
                                                           transcript.begin))
                    precisely_extended_dict[chromosome][
                        int(new_transcript_end
                            ):int(transcript.end)] = modified_transcript.data
                # Otherwise, unmodified transcript is added.
                else:
                    precisely_extended_dict[chromosome][
                        int(transcript.begin):int(transcript.end
                                                  )] = transcript.data
    # Report transcripts that overlapped another gene before any extension.
    with open("errors_file.txt", "w") as filout:
        for ovlp_transcript in overlapped_transcripts:
            filout.write("{}\n".format(ovlp_transcript.data[0]))
    return precisely_extended_dict
Exemplo n.º 15
0
class TemporalNodeCollection(NodeCollection):
    """Node collection that additionally indexes its nodes by event time."""

    def __init__(self, *args, **kwargs) -> None:
        """Set up the base collection, then the temporal index."""
        super().__init__(*args, **kwargs)

        # interval tree whose entries carry node uids as data
        self._events = IntervalTree()

        # default object class of this collection
        self._default_class: Any = TemporalNode

    @singledispatchmethod
    def __getitem__(self, key: Any) -> Any:
        return super().__getitem__(key)

    @__getitem__.register(slice)  # type: ignore
    @__getitem__.register(int)  # type: ignore
    @__getitem__.register(float)  # type: ignore
    def _(self, key: Union[int, float, slice]) -> Any:
        # pylint: disable=arguments-differ
        # Temporal lookup: for every indexed interval hit by ``key`` (in
        # sorted order), yield what the matching node returns for that span.
        query_begin, query_end, _ = _get_start_end(key)
        hits = sorted(self._events[query_begin:query_end])
        for hit_begin, hit_end, uid in hits:
            yield from self[uid][hit_begin:hit_end]

    @property
    def start(self):
        """Begin of the event index."""
        return self._events.begin()

    @property
    def end(self):
        """End of the event index."""
        return self._events.end()

    @property
    def events(self):
        """The interval tree holding the temporal events."""
        return self._events

    @singledispatchmethod
    def add(self, *args, **kwargs: Any) -> None:
        """Add nodes by delegating to the base collection."""
        super().add(*args, **kwargs)

    def _add(self, obj: Any, **kwargs: Any) -> None:
        """Store ``obj`` and index the interval returned by ``obj.last()``."""
        super()._add(obj, **kwargs)
        begin, finish, _ = obj.last()
        self._events[begin:finish] = obj.uid

    def _if_exist(self, obj: Any, **kwargs: Any) -> None:
        """Update the already stored element instead of adding ``obj``."""
        # 'count' is stripped so it is not forwarded to event(); its value
        # is not used here
        kwargs.pop('count', 1)
        existing = self[obj.relations]
        existing.event(**kwargs)
        begin, finish, _ = obj.last()
        self._events[begin:finish] = existing.uid

    def _remove(self, obj) -> None:
        """Drop every indexed interval of ``obj``, then remove ``obj``."""
        stale = [entry for entry in sorted(self._events)
                 if entry.data == obj.uid]
        for entry in stale:
            self._events.remove(entry)
        super()._remove(obj)
Exemplo n.º 16
0
class QueryCompleter(Completer):
    """
    Suggests JMESPath query syntax completions.

    After receiving AWS service and operation names in form
    of awscli command and subcommand, an output shape loaded
    from botocore Session is parsed by the ShapeParser object.
    This object returns a "Dummy response", which is used in
    attempt to provide sensible suggestions.

    At the moment, this completer is unable to provide suggestions
    for JMESPath functions and custom hashes and arrays.
    """
    def __init__(self, session, **kwds):
        """Create the completer for the profile of ``session``.

        A new ``Session`` is built from ``session.profile_name``; remaining
        keyword arguments are passed on to the parent initializer.
        """
        # Long-lived collaborators.
        self._session = Session(profile=session.profile_name)
        self._shape_parser = ShapeParser()
        self._lexer = jmespath.lexer.Lexer()
        self._command_table = None  # built lazily on first access

        # Currently selected service / operation.
        self._service = None
        self._operation = None

        # Everything below changes as the query changes and is used to
        # track state while providing suggestions.
        self._should_reparse = True
        self._shape_dict = None
        self._context = None
        self._last_pos = 0
        self._implicit_context = list()
        self._stack = list()
        self._tree = IntervalTree()
        self._start = 0
        self._colon = False
        self._disable = (False, 0)

        super(QueryCompleter, self).__init__(**kwds)

    @property
    def context(self):
        """
        Return the current context.

        The context tracks the state of the mutating fake API response.
        While it is unset it is (re)initialised from the shape dict.
        """
        if self._context is not None:
            return self._context
        self.context = self._shape_dict
        return self._context

    @context.setter
    def context(self, value):
        """Store ``value`` as the current context."""
        self._context = value

    @property
    def command_table(self):
        """
        Return the command table, building it on first access.

        The table is used to transform an aws-cli command and subcommand
        into their API operation counterpart.
        """
        if self._command_table is not None:
            return self._command_table
        self._command_table = build_command_table(self._session)
        return self._command_table

    def set_shape_dict(self, service, operation):
        """
        Load the fake response (shape dict) and make it the context.

        The shape dict is looked up from the received aws-cli service and
        operation (command, subcommand).
        """
        self._shape_dict = self.context = self._get_shape_dict(
            service, operation)

    def reset(self):
        """Put all query-tracking state back to its initial values."""
        self.context = None
        # bookkeeping for nested contexts
        self._implicit_context = []
        self._stack = []
        self._tree = IntervalTree()
        self._start = 0
        # flags
        self._colon = False
        self._disable = (False, 0)
        self._repeat_suggestion = False

    def get_completions(self, document, c_e):
        """
        Yield suggestions for the JMESPath query in ``document``.

        The existing part of the query is parsed by
        :meth:`_parse_completion`, which returns the candidate
        completions; they are yielded from here in sorted order.

        Nothing is yielded when no shape dict is loaded, when the
        completer is disabled and the cursor is still past the position
        at which it was disabled, or when parsing gives no candidates.
        As the query is being parsed the completer tracks its state
        (last cursor position, whether a suggestion is to be repeated).
        """
        if not self._shape_dict:
            return

        if self._disable[0]:
            cursor_past_mark = document.cursor_position > self._disable[1]
            if cursor_past_mark:
                return
            # cursor moved back to (or before) the mark: enable again
            self._disable = (False, 0)

        nothing_typed = not document.get_word_before_cursor()
        self._repeat_suggestion = c_e.completion_requested or nothing_typed
        suggestions = self._parse_completion(document, c_e)
        self._last_pos = document.cursor_position
        if not suggestions:
            return

        prefix = document.get_word_before_cursor(pattern=_FIND_IDENTIFIER)
        for suggestion in sorted(suggestions):
            # one-character suggestions get start position 0, all others
            # minus the length of the identifier before the cursor
            if len(suggestion) == 1:
                offset = 0
            else:
                offset = -len(prefix)
            yield Completion(text_type(suggestion), start_position=offset)

    def _parse_completion(self, document, c_e):
        text = document.text_before_cursor
        self._text = ' ' if not text else text

        try:
            self._tokens = list(self._lexer.tokenize(self._text))
        except jmespath.exceptions.LexerError:
            return

        if self._tokens[-1]['type'] == 'eof':
            self._tokens.pop()

        if not self._tokens:
            return self.context.keys()

        if self._should_reparse:
            completions = self._reparse_completion()
            self._should_reparse = False
        elif document.cursor_position > self._last_pos:
            completions = self._append_completion()
        elif (document.cursor_position == self._last_pos
              and c_e.completion_requested):
            completions = self._append_completion()
        else:
            completions = self._reparse_completion()
            self._should_reparse = True
        return completions

    def _append_completion(self):
        last_token = self._tokens[-1]
        index = len(self._tokens) - 1
        penultimate_token = self._look_back(index, 1)
        try:
            return self._handle_token(last_token, penultimate_token, index)
        except NullIntervalException as e:
            self._disable = True, e.pos
            return

    def _reparse_completion(self):
        completions = list()
        self.reset()
        for i, token in enumerate(self._tokens):
            if self._disable[0]:
                return
            penultimate_token = self._look_back(i, 1)
            if token['type'] in COMPLEX_SIGNS:
                fake_lbracket = {
                    'type': 'lbracket',
                    'start': token['start'],
                    'end': token['end'] - 1
                }
                self._handle_token(fake_lbracket, penultimate_token, i)
            try:
                completions = self._handle_token(token, penultimate_token, i)
            except NullIntervalException as e:
                self._disable = True, e.pos
                return
        return completions

    def _handle_token(self, token, prev_token, index=None):
        if not index:
            index = len(self._tokens) - 1
        handler = getattr(self, '_handle_%s' % token['type'],
                          self._handle_others)
        return handler(token, prev_token, index)

    def _handle_lbracket(self, token, prev_token, index):
        if not prev_token:
            if isinstance(self.context, dict):
                return self.context.keys()
            return
        if not self._repeat_suggestion:
            self._switch_into_next_implicit_context(token)

        if (prev_token['type'] in IDENTIFIERS
                and isinstance(self.context, dict)):
            value = self.context.get(prev_token['value'], None)
            if isinstance(value, list):
                if not self._repeat_suggestion:
                    self.context = value
                return LBRACKETS_CONTINUATION
            self._disable = (True, token['end'])
            return

        if prev_token['type'] == 'dot':
            if isinstance(self.context, dict):
                return self.context.keys()
            self._disable = (True, token['end'])
            return

        if isinstance(self.context, list):
            return LBRACKETS_CONTINUATION

    def _handle_filter(self, token, prev_token, index):
        if self._repeat_suggestion:
            return self.context.keys()

        _, index = self._stack.pop()
        promise = token['type'], index
        self._stack.append(promise)
        if not isinstance(self.context, list):
            self._disable = (True, token['end'])
            return

        self.context = next(iter(self.context))
        if not isinstance(self.context, dict):
            self._disable = (True, token['end'])
            return

        self._implicit_context = copy.deepcopy(self.context)
        end = self._tree.end() - 1
        start = next(iter(self._tree.at(end))).begin
        self._tree[start:token['start']] = self._implicit_context
        return self.context.keys()

    def _handle_lbrace(self, token, _, index):
        if not isinstance(self.context, dict):
            self._disable = (True, token['end'])
            return
        if not self._repeat_suggestion:
            self._switch_into_next_implicit_context(token)

    def _handle_colon(self, token, prev_token, index):
        if not self._stack:
            self._disable = True, token['end']
            return
        if self._stack[-1][0] == 'lbracket':
            return
        if self._stack[-1][0] == 'lbrace':
            if not self._colon and prev_token['type'] in IDENTIFIERS:
                self._colon = True
                return self.context.keys()

    def _handle_flatten(self, token, _, index):
        if self._repeat_suggestion:
            return
        self.context = JMESPATH_FLATTEN.search(self.context)
        old_end = token['end']
        self._tree[self._start:old_end] = self._implicit_context
        _, returning_context_index = self._stack.pop()
        context_interval = next(iter(self._tree[returning_context_index]))
        self._implicit_context = context_interval.data
        self._start = old_end

    def _handle_rbracket(self, token, prev_token, index):
        if self._repeat_suggestion:
            return
        is_filter = (self._stack and self._stack[-1][0] == 'filter')
        self._switch_from_prev_implicit_context(token)
        # Handle [*] projection and index access (e.g.: lst[1])
        if prev_token and prev_token['type'] in {'star', 'number'}:
            # need antepenultimate (third to last) token info
            apu_token = self._look_back(index, 2)
            if apu_token and apu_token['type'] == 'lbracket':
                if isinstance(self.context, list):
                    self.context = next(iter(self.context))
            else:
                self._disable = (True, token['end'])
        elif prev_token and prev_token['type'] in STRINGS:
            if not is_filter:
                self._disable = (True, token['end'])

    def _handle_rbrace(self, token, _, index):
        self._disable = True, token['end']
        return

    def _handle_dot(self, token, prev_token, index):
        if not prev_token:
            self._disable = (True, token['end'])
            return
        # Applying subexpression to a JSON object
        if isinstance(self.context, dict):
            if self._repeat_suggestion:
                return self.context.keys()
            # Simulate application of * projection
            if prev_token['type'] == 'star':
                new_context = list(self.context.values())
            # Receive the value of identifier
            elif prev_token['type'] in IDENTIFIERS:
                new_context = self.context.get(prev_token['value'], None)
            elif prev_token['type'] in {'rbracket', 'flatten'}:
                new_context = self.context
            # Nothing else is applicable to JSON objects
            else:
                new_context = dict()
            self.context = new_context
            if isinstance(self.context, dict):
                return self.context.keys()
        # Applying subexpression to a JSON list
        if isinstance(self.context, list):
            if prev_token['type'] == 'flatten':
                self.context = next(iter(self.context))
                if isinstance(self.context, dict):
                    return self.context.keys()
            if prev_token['type'] == 'rbracket':
                return LBRACKET
            self._disable = (True, token['end'])

    def _handle_pipe(self, token, _, index):
        if not self._repeat_suggestion:
            if self._stack:
                pos = self._stack[-1][1]
                context_interval = next(iter((self._tree[pos])))
                context = context_interval.data
                lhs = self._text[pos:token['start']]
                tokens = list(self._lexer.tokenize(lhs))
                for a_token in reversed(tokens):
                    if a_token['type'] in {'colon', 'comma'}:
                        lhs = lhs[a_token['end']:]
                        break
            else:
                lhs = self._text[:token['start']]
                context = self._shape_dict

            tokens = list(self._lexer.tokenize(lhs))
            lhs = self._remove_filters(lhs, tokens)
            try:
                result = jmespath.search(lhs, context)
            except jmespath.exceptions.JMESPathError:
                return
            self.context = result

        if isinstance(self.context, list):
            return LBRACKET
        if isinstance(self.context, dict):
            return self.context.keys()

    def _handle_others(self, token, _, index):
        if token['type'] == 'comma':
            if self._stack and self._stack[-1][0] == 'lbrace':
                self._colon = False
                return
            if not self._stack:
                self._disable = (True, token['end'])
                return

        # Drop to fallback context on these... (&& || , > < etc...)
        if token['type'] in CONTEXT_RESET_SIGNS:
            if not self._repeat_suggestion:
                self.context = copy.deepcopy(self._implicit_context)
            if isinstance(self.context, dict):
                return self.context.keys()

        if token['type'] in IDENTIFIERS:
            if (self._stack and self._stack[-1][0] == 'lbrace'
                    and not self._colon):
                return
            identifier = token['value']
            if isinstance(self.context, dict):
                value = self.context.get(identifier, None)
                if isinstance(value, list):
                    return LBRACKET
                completions = [
                    c for c in self.context.keys() if c.startswith(identifier)
                ]
                return completions

    def _switch_into_next_implicit_context(self, token):
        old_end = token['end']
        if self._start == old_end:
            raise NullIntervalException(token['end'])
        self._implicit_context = copy.deepcopy(self.context)
        self._tree[self._start:old_end] = self._implicit_context
        self._start = old_end
        promise = token['type'], old_end - 1
        self._stack.append(promise)

    def _switch_from_prev_implicit_context(self, token):
        if (not self._stack
                or ENCLOSURE_MATCH[self._stack[-1][0]] != token['type']):
            self._disable = (True, token['end'])
            return
        old_end = token['end']
        if self._start == old_end:
            raise NullIntervalException(token['end'])
        self._tree[self._start:old_end] = self._implicit_context
        _, returning_context_index = self._stack.pop()
        self._implicit_context = self._tree[returning_context_index]
        self._start = old_end

    def _look_back(self, index, offset):
        if index < offset:
            return
        index = index - offset
        return self._tokens[index]

    def _remove_filters(self, expression, tokens):
        intervals = self._detect_filters(tokens)
        for interval in reversed(intervals):
            start, end = interval
            expression = expression[:start] + expression[end:]
        return expression

    def _detect_filters(self, tokens):
        in_filter_context = False
        counter = 0
        intervals = list()
        for token in tokens:
            if not in_filter_context and token['type'] == 'filter':
                in_filter_context = True
                start = token['start']
            elif in_filter_context:
                if token['type'] in {'filter', 'lbracket'}:
                    counter += 1
                elif token['type'] == 'rbracket':
                    if counter == 0:
                        in_filter_context = False
                        end = token['end']
                        intervals.append((start, end))
                    else:
                        counter -= 1
        return intervals

    def _get_shape_dict(self, service, operation):
        try:
            service, operation = (self._get_transformed_names(
                service, operation))
        except InvalidShapeData:
            return None

        try:
            return self._parse_shape(service, operation)
        except ModelLoadingError:
            return None

    def _get_transformed_names(self, service, operation):
        if service == 's3api':
            service = 's3'
        service_data = self.command_table.get(service, None)
        if not service_data:
            raise InvalidShapeData()
        operation = service_data.get_operation_name(operation)
        if not operation:
            raise InvalidShapeData()
        return service, operation

    def _parse_shape(self, service, operation):
        if service != self._service:
            self._service_model = self._load_service_model(service)
            operation_model = (self._load_operation_model(
                self._service_model, operation))
            parsed = self._shape_parser.parse(operation_model.output_shape)
            self._service = service
            self._operation = operation
            return parsed

        if operation != self._operation:
            operation_model = (self._load_operation_model(
                self._service_model, operation))
            parsed = self._shape_parser.parse(operation_model.output_shape)
            self._operation = operation
            return parsed

        return self._shape_dict

    def _load_service_model(self, service_name):
        try:
            service_model = self._session.get_service_model(service_name)
        except UnknownServiceError as e:
            raise ModelLoadingError(str(e))
        return service_model

    def _load_operation_model(self, service_model, operation):
        try:
            operation_model = service_model.operation_model(operation)
        except OperationNotFoundError as e:
            raise ModelLoadingError(str(e))
        return operation_model
Exemplo n.º 17
0
class GhostFile:
    """Track deferred truncates and writes on top of a real file.

    The object keeps a logical file size and an interval tree of the byte
    ranges that are backed by the content of the real file. Bytes of the
    logical file that lie outside every interval read as zeros. The real
    file is only modified when a write carries new data (``write``) or when
    the pending state is flushed (``apply``).
    """

    def __init__(self, datapath, clear_cache_callback):
        """
        :param datapath: Path of the real file.
        :param clear_cache_callback: Optional callable invoked, without
            arguments, each time the pending state is written to the real file.
        """
        self.__data_path = datapath
        self._clear_cache_callback = clear_cache_callback

        try:
            self.__filesize = os.path.getsize(datapath)
        except FileNotFoundError:
            self.__filesize = 0

        self.__rewritten_intervals = GhostFile._full_coverage_tree(self.__filesize)

        # Used by _is_same_data() to compare incoming writes with the real file.
        self.__data_path_reader = open(self.__data_path, 'rb')

    def truncate(self, length):
        """
        Set the logical size of the file without touching the real file.

        Example of some subsequent truncates:
         X: original data
         _: null bytes
         |: truncate position

         * Original file: XXXXXXXXXXXX
         * Truncate 1:    XXXXXX|
         * Truncate 2:    XXX|
         * Truncate 3:    XXX______|
         * Writes:        XXX____X_
         * Truncate 4:    |
         * Writes:        _X_X__XX
         * Truncate 5:    _X_X_|

        :param length: The new logical size.
        """
        if length > 0:
            # Cut any interval crossing the new end, then keep what is before it.
            self.__rewritten_intervals.slice(length)
            self.__rewritten_intervals = IntervalTree(self.__rewritten_intervals[0:length])
        else:
            self.__rewritten_intervals = IntervalTree()

        self.__filesize = length

        assert self.__filesize >= self.__rewritten_intervals.end()

    def write(self, buf, offset, fh):
        """
        Write data to this GhostFile.

        When the data equals what the real file already holds at that
        position, nothing is written: the range is only recorded as backed
        by the real file. Otherwise the pending state is flushed to the real
        file and the data is written through ``fh``.

        :param buf: The bytes to write.
        :param offset: Position of the first byte.
        :param fh: File descriptor of the real file, opened for writing.
        :return: The number of bytes written.
        """
        if len(buf) == 0:
            # Nothing to record; an empty range cannot be stored in the tree.
            return 0

        end = offset + len(buf)

        if end <= os.path.getsize(self.__data_path) and self._is_same_data(buf, offset):
            # Ok, we don't write anything. We just remember about it.
            GhostFile._optimized_add_to_intervaltree(self.__rewritten_intervals, offset, end)
            self.__filesize = max(self.__filesize, end)

            # After a growing truncate the logical size may lie beyond the
            # last backed range, so equality cannot be required here.
            assert self.__filesize >= self.__rewritten_intervals.end()
            return len(buf)

        # TODO Do only the write if in the tree there is one contiguous interval from 0 to filesize, because it
        #      means that the previous write was real too

        # Add this write to the intervaltree so that we don't waste time filling it with zeros.
        # We're going to reset the tree anyway.
        GhostFile._optimized_add_to_intervaltree(self.__rewritten_intervals, offset, end)
        self.__filesize = max(self.__filesize, end)

        # Fill all the holes with zeros and write them
        self._write_tree_to_real_file(fh)

        # Write the new data
        os.lseek(fh, offset, os.SEEK_SET)
        GhostFile._write_all(fh, buf)

        # Update the structures
        self.__filesize = os.path.getsize(self.__data_path)
        self.__rewritten_intervals = GhostFile._full_coverage_tree(self.__filesize)

        assert self.__filesize == self.__rewritten_intervals.end() == os.path.getsize(self.__data_path)
        return len(buf)

    def read(self, length, offset, fh):
        """
        Read data from this GhostFile.

        :param length: Maximum number of bytes to read.
        :param offset: Position of the first byte.
        :param fh: File descriptor of the real file, opened for reading.
        :return: The bytes of the logical file in the requested range, cut
            at the logical end of file.
        """
        if offset >= self.__filesize or length == 0:
            return b''

        intervals = IntervalTree(self.__rewritten_intervals[offset:offset+length])
        intervals.merge_overlaps()
        intervals.slice(offset)
        intervals.slice(offset + length)
        intervals = sorted(intervals[offset:offset+length])
        assert offset < self.__filesize
        assert not intervals or (intervals[0].begin >= offset and intervals[-1].end <= offset + length)

        if not intervals:
            return b'\x00' * min(length, self.__filesize - offset)

        chunks = []

        # Used to fill any hole at the start of the read range
        end_prev_interval = offset

        # Read the data
        for interv in intervals:
            # Fill any hole before this interval
            chunks.append(b'\x00' * (interv.begin - end_prev_interval))

            os.lseek(fh, interv.begin, os.SEEK_SET)
            chunks.append(GhostFile._read_up_to(fh, interv.length()))

            end_prev_interval = interv.end

        # Fill any hole at the end of the read range
        chunks.append(b'\x00' * (offset + length - intervals[-1].end))

        data = b''.join(chunks)
        if offset + length > self.__filesize:
            data = data[0:self.__filesize-offset]

        assert len(data) <= length
        assert offset + len(data) <= self.__filesize
        return data

    def apply(self, fh):
        """
        Flush the pending state to the real file.

        :param fh: File descriptor of the real file, opened for writing.
        """
        # Fill all the holes with zeros and write them
        self._write_tree_to_real_file(fh)
        self.__rewritten_intervals = GhostFile._full_coverage_tree(self.__filesize)

    def release(self):
        """
        Releases the resources used by this GhostFile. This object will no longer be valid after
        this method is called, so this should always be the last operation on this object.
        """
        self.__data_path_reader.close()

    @property
    def size(self):
        """The logical size of the file."""
        return self.__filesize

    def _is_same_data(self, buf, offset):
        """Tell whether the real file already holds ``buf`` at ``offset``."""
        self.__data_path_reader.seek(offset)
        olddata = self.__data_path_reader.read(len(buf))

        return buf == olddata

    @staticmethod
    def _full_coverage_tree(size):
        """Build a tree with one interval ``[0, size)``, or an empty tree when ``size`` is 0."""
        return IntervalTree([Interval(0, size)] if size > 0 else None)

    @staticmethod
    def _write_all(fh, data):
        """Write all of ``data`` at the current position of ``fh``, retrying after partial writes."""
        written_bytes = 0
        while written_bytes < len(data):
            written_bytes += os.write(fh, data[written_bytes:])
        assert written_bytes == len(data)

    @staticmethod
    def _read_up_to(fh, length):
        """
        Read ``length`` bytes from the current position of ``fh``.

        ``os.read`` may return fewer bytes than requested, so it is called
        until the request is satisfied or the end of file is reached.
        """
        pieces = []
        remaining = length
        while remaining > 0:
            piece = os.read(fh, remaining)
            if not piece:
                break
            pieces.append(piece)
            remaining -= len(piece)
        return b''.join(pieces)

    @staticmethod
    def _optimized_add_to_intervaltree(tree, start, end):
        """
        Inserts the interval to the provided intervaltree. If the provided interval is adjacent to a previous
        interval or to a next interval, they get merged into a single interval. This method also guarantees
        that, in the end, there are no intervals overlapping within the provided range.

        :param tree: The tree to update in place.
        :param start: First position of the new interval.
        :param end: Position just past the end of the new interval.
        """
        prev_adjacent_intervals = tree[start - 1]
        next_adjacent_intervals = tree[end]

        # This should be true because we always prevent intervals from overlapping
        assert len(prev_adjacent_intervals) <= 1 and len(next_adjacent_intervals) <= 1

        prev_adjacent_interval = next(iter(prev_adjacent_intervals), None)
        next_adjacent_interval = next(iter(next_adjacent_intervals), None)
        assert isinstance(prev_adjacent_interval, Interval) or prev_adjacent_interval is None
        assert isinstance(next_adjacent_interval, Interval) or next_adjacent_interval is None

        chop_from = start
        chop_to = end
        if prev_adjacent_interval is not None:
            chop_from = prev_adjacent_interval.begin
        if next_adjacent_interval is not None:
            chop_to = next_adjacent_interval.end

        # Chopping prevents overlapping intervals
        tree.chop(chop_from, chop_to)
        tree[chop_from:chop_to] = None

    def _write_tree_to_real_file(self, fh):
        """
        Zero-fill, in the real file, every range of the logical file that is
        not backed by an interval, then cut the real file to the logical size.

        :param fh: File descriptor of the real file, opened for writing.
        """
        end_prev_interval = 0

        # The tree iterates in no particular order; the gap computation
        # below needs the intervals in ascending order, otherwise zeros
        # could be written over ranges that are still backed by real data.
        for interval in sorted(self.__rewritten_intervals):
            zeros = b'\x00' * (interval.begin - end_prev_interval)
            os.lseek(fh, end_prev_interval, os.SEEK_SET)
            GhostFile._write_all(fh, zeros)
            end_prev_interval = interval.end

        # Hole between the last interval and the logical end of file
        zeros = b'\x00' * (self.__filesize - end_prev_interval)
        os.lseek(fh, end_prev_interval, os.SEEK_SET)
        GhostFile._write_all(fh, zeros)

        # TODO Find a way to avoid doing all this if nobody did a truncate since the last call to this method
        assert self.__filesize >= self.__rewritten_intervals.end()
        os.ftruncate(fh, self.__filesize)

        # Invoke a callback that should clear the stat cache of all the aliases of this file
        if self._clear_cache_callback:
            self._clear_cache_callback()
Exemplo n.º 18
0
    class CoordinateTranslator(object):
        """Translate between genomic and coding coordinates of a transcript.

        Three interval trees are maintained:

        * ``_genomic_tree``: exons keyed by genomic position,
        * ``_exon_tree``: the same exons keyed by coding position,
        * ``_intron_tree``: introns (plus two "straw" introns before and
          after the exon sequence) keyed by genomic position.

        Every interval carries a ``Leaf`` as data.
        """

        class Leaf(object):
            """A feature together with its genomic and coding coordinates."""

            def __init__(self, feature, coding_start, coding_stop):
                self.feature = feature
                self.start = feature.start
                self.stop = feature.stop
                self.coding_start = coding_start
                self.coding_stop = coding_stop

            def __str__(self):
                return 'genomic: [%s, %s], coding: [%s, %s]' % (
                    self.start, self.stop, self.coding_start, self.coding_stop)

        def __init__(self, exons, introns, strand, coding_offset,
                     coding_length):
            """Build the lookup trees.

            :param exons: Sequence of exon features exposing ``start``,
                ``stop`` and ``length``; walked in reverse on any strand
                other than ``'+'``.
            :param introns: Iterable of intron features; each one must be
                flanked by an exon on both sides.
            :param strand: ``'+'`` for the forward strand.
            :param coding_offset: The first exon walked starts at coding
                position ``-coding_offset``.
            :param coding_length: Used for the coding coordinates of the
                straw intron placed after the last coding position.
            """
            self.strand = strand
            self.coding_offset = coding_offset
            self.coding_length = coding_length
            self._exon_tree = IntervalTree()
            self._intron_tree = IntervalTree()
            self._genomic_tree = IntervalTree()

            _coding_start = -self.coding_offset

            for exon in (exons if self.strand == '+' else exons[::-1]):
                leaf = Transcript.CoordinateTranslator.Leaf(
                    exon, _coding_start, _coding_start + exon.length - 1)

                # stop positions are inclusive, tree intervals are half-open
                self._genomic_tree.addi(leaf.start, leaf.stop + 1, leaf)
                self._exon_tree.addi(leaf.coding_start, leaf.coding_stop + 1,
                                     leaf)

                # increment
                _coding_start = leaf.coding_stop + 1

            for intron in introns:
                # introns don't have coding coordinates, so use those of
                # adjacent exons
                leaf_genomic_upstream = \
                    list(self._genomic_tree[intron.start - 1])[0].data
                leaf_genomic_downstream = \
                    list(self._genomic_tree[intron.stop + 1])[0].data

                # NOTE: always assemble intronic offsets w.r.t. to the
                #  'coding stop' position of the upstream CDS
                if self.strand == '+':
                    leaf = \
                        Transcript.CoordinateTranslator.Leaf(
                            intron,
                            leaf_genomic_upstream.coding_stop,
                            leaf_genomic_downstream.coding_start
                        )
                else:
                    leaf = \
                        Transcript.CoordinateTranslator.Leaf(
                            intron,
                            leaf_genomic_downstream.coding_stop,
                            leaf_genomic_upstream.coding_start
                        )
                self._intron_tree.addi(leaf.start, leaf.stop + 1, leaf)

            # add introns that are upstream and downstream to the exon
            #  sequence
            # TODO: we may not need this, depending on how we choose to handle
            #  [start, stop] ranges that occur outside exon ranges
            # NOTE(review): unlike the introns above, the straw introns are
            #  added as [start, stop) rather than [start, stop + 1) - confirm
            #  that leaving their last position uncovered is intended
            if self.strand == '+':
                # straw upstream (genomic) intron
                straw0 = \
                    Feature('.', 0, self._genomic_tree.begin() - 1, self.strand, None)      # noqa
                leaf0 = \
                    Transcript.CoordinateTranslator.Leaf(straw0, -1, 0)
                self._intron_tree.addi(straw0.start, straw0.stop, leaf0)

                # straw downstream (genomic) intron
                straw1 = \
                    Feature('.', self._genomic_tree.end() + 1, sys.maxint, self.strand, None)      # noqa
                leaf1 = \
                    Transcript.CoordinateTranslator.Leaf(
                    straw1, self.coding_length - 1, self.coding_length)    # noqa
                self._intron_tree.addi(straw1.start, straw1.stop, leaf1)

            else:
                # straw upstream (genomic) intron
                straw0 = \
                    Feature('.', 0, self._genomic_tree.begin() - 1, self.strand, None)      # noqa
                leaf0 = \
                    Transcript.CoordinateTranslator.Leaf(straw0, self.coding_length - 1, self.coding_length)    # noqa

                self._intron_tree.addi(straw0.start, straw0.stop, leaf0)

                # straw downstream (genomic) intron
                straw1 = \
                    Feature('.', self._genomic_tree.end() + 1, sys.maxint, self.strand, None)      # noqa
                leaf1 = \
                    Transcript.CoordinateTranslator.Leaf(straw1, -1, 0)    # noqa
                self._intron_tree.addi(straw1.start, straw1.stop, leaf1)

        def to_coding_range(self, start, stop, hgvs_format=False):
            """Translate the genomic range ``[start, stop]`` to coding coordinates.

            A position inside an exon maps to its coding position. A
            position inside an intron maps to a coding position of an
            adjacent exon plus an intronic offset; with ``hgvs_format`` the
            offset is taken from the nearer side, negative when counted
            back from the downstream side.

            :return: ``Transcript.CodingRange(coding_start, coding_stop,
                intron_coding_offset_start, intron_coding_offset_stop)``.
            """
            #  from above, introns have a coding_length == 1
            # TODO: set 'intron' attribute on leaves in '_intron_tree'
            #  above
            def _is_intron(leaf):
                return leaf.coding_stop - leaf.coding_start == 1

            # coding start
            # NOTE(review): when no exon or intron covers the position the
            #  lookup yields None and the '.data' access below raises
            #  AttributeError - confirm whether that can happen
            range_coding_start = (list(self._genomic_tree[start]
                                       | self._intron_tree[start])
                                  or [None])[0]

            coding_start = None
            intron_coding_offset_start = 0
            leaf = range_coding_start.data
            if _is_intron(leaf):
                if self.strand == '+':
                    delta0 = start - leaf.start + 1
                    delta1 = leaf.stop + 1 - start
                    if hgvs_format and delta0 > delta1:
                        coding_start = leaf.coding_stop
                        intron_coding_offset_start = -delta1
                    else:
                        coding_start = leaf.coding_start
                        intron_coding_offset_start = delta0

                else:
                    delta0 = leaf.stop + 1 - stop
                    delta1 = stop - leaf.start + 1
                    if hgvs_format and delta0 > delta1:
                        coding_start = leaf.coding_stop
                        intron_coding_offset_start = -delta1
                    else:
                        coding_start = leaf.coding_start
                        intron_coding_offset_start = delta0
            else:
                if self.strand == '+':
                    coding_start = \
                        leaf.coding_start + (start - leaf.start)
                else:
                    coding_start = \
                        leaf.coding_start + (leaf.stop - stop)

            # coding stop
            range_coding_stop = (list(self._genomic_tree[stop]
                                      | self._intron_tree[stop]) or [None])[0]

            coding_stop = None
            intron_coding_offset_stop = 0
            leaf = range_coding_stop.data
            if _is_intron(leaf):
                if self.strand == '+':
                    delta0 = stop - leaf.start + 1
                    delta1 = leaf.stop + 1 - stop
                    if hgvs_format and delta0 > delta1:
                        coding_stop = leaf.coding_stop
                        intron_coding_offset_stop = -delta1
                    else:
                        coding_stop = leaf.coding_start
                        intron_coding_offset_stop = delta0

                else:
                    delta0 = leaf.stop + 1 - start
                    delta1 = start - leaf.start + 1
                    if hgvs_format and delta0 > delta1:
                        coding_stop = leaf.coding_stop
                        intron_coding_offset_stop = -delta1
                    else:
                        coding_stop = leaf.coding_start
                        intron_coding_offset_stop = delta0

            else:
                if self.strand == '+':
                    coding_stop = \
                        leaf.coding_stop - (leaf.stop - stop)
                else:
                    coding_stop = \
                        leaf.coding_stop - (start - leaf.start)

            return \
                Transcript.CodingRange(
                    coding_start,
                    coding_stop,
                    intron_coding_offset_start,
                    intron_coding_offset_stop
                )

        def to_genomic_ranges(self, coding_start, coding_stop):
            """Translate the coding range ``[coding_start, coding_stop]``.

            :return: List with one ``Transcript.GenomicRange`` per exon that
                overlaps the coding range, each clipped to the requested
                range; ordered by ascending coding position on the ``'+'``
                strand and descending on the ``'-'`` strand.
            """
            genomic_ranges = []
            list_ranges = sorted(self._exon_tree[coding_start:coding_stop + 1],
                                 reverse=self.strand == '-')

            for leaf in [r.data for r in list_ranges]:
                if self.strand == '+':
                    genomic_ranges.append(
                        Transcript.GenomicRange(
                            leaf.start +
                            max(coding_start - leaf.coding_start, 0),  # noqa
                            leaf.stop -
                            max(leaf.coding_stop - coding_stop, 0)  # noqa
                        ))
                else:
                    genomic_ranges.append(
                        Transcript.GenomicRange(
                            leaf.start +
                            max(leaf.coding_stop - coding_stop, 0),  # noqa
                            leaf.stop -
                            max(coding_start - leaf.coding_start, 0)  # noqa
                        ))

            return genomic_ranges

        def __str__(self):
            # The exons keyed by coding position live in '_exon_tree'; this
            # class has no '_tree' attribute. A list (not 'map') keeps the
            # output readable regardless of the Python version.
            return 'coding sequences: %s' % [
                str(interval) for interval in sorted(self._exon_tree)]
Exemplo n.º 19
0
class ExonCoords:
    """Exons of one gene together with its location and a breakpoint.

    All five values are exposed as read/write properties. The exons passed
    to the constructor are copied into a new ``IntervalTree``.
    """

    def __init__(self, chromosome, strand, breakpoint, gene_name,
                 exons: IntervalTree):
        self.chromosome = chromosome
        self.strand = strand
        self.breakpoint = breakpoint
        self.gene_name = gene_name
        # Copy, so that the caller's tree is not shared with this object.
        self.exons = IntervalTree(exons)

    # ------------------------------------------------------------------
    # alternate constructors
    # ------------------------------------------------------------------
    @classmethod
    def fromTuple(cls, a_tuple):
        """Build an instance from ``(chromosome, strand, breakpoint, gene_name, exons)``."""
        return cls(*(a_tuple[position] for position in range(5)))

    @classmethod
    def copy_without_exons(cls, exc):
        """Build an instance with the metadata of ``exc`` and no exons."""
        no_exons = IntervalTree()
        return cls(exc.chromosome, exc.strand, exc.breakpoint, exc.gene_name,
                   no_exons)

    @classmethod
    def empty(cls):
        """Build a placeholder: empty names, strand 0, breakpoint -1, no exons."""
        no_exons = IntervalTree()
        return cls("", 0, -1, "", no_exons)

    # ------------------------------------------------------------------
    # output
    # ------------------------------------------------------------------
    def print_properties(self):
        """Print a framed, human readable summary of this object."""
        frame = "#########################################"
        span = str(self.exons.begin()) + "-" + str(self.exons.end())
        print(frame)
        print("coordinates :", self.chromosome + ":" + span)
        print("gene        :", self.gene_name)
        print("strand      :", self._strand)
        print("breakpoint  :", self._breakpoint)
        print("exons       :", self._exons)
        print(frame)

    def print_as_bed(self):
        """Print one tab separated ``chromosome, begin, end`` line per exon, sorted."""
        chromosome = self.chromosome
        for exon in sorted(self.exons):
            fields = (chromosome, str(exon.begin), str(exon.end))
            print(fields[0] + "\t" + fields[1] + "\t" + fields[2])

    # ------------------------------------------------------------------
    # properties
    # ------------------------------------------------------------------
    @property
    def gene_name(self):
        return self._gene_name

    @gene_name.setter
    def gene_name(self, value):
        self._gene_name = value

    @property
    def chromosome(self):
        return self._chromosome

    @chromosome.setter
    def chromosome(self, value):
        self._chromosome = value

    @property
    def strand(self):
        return self._strand

    @strand.setter
    def strand(self, value):
        self._strand = value

    @property
    def breakpoint(self):  # int
        return self._breakpoint

    @breakpoint.setter
    def breakpoint(self, value):
        self._breakpoint = value

    @property
    def exons(self):  # IntervalTree()
        return self._exons

    @exons.setter
    def exons(self, exons):
        self._exons = exons

    def begin(self):
        """Return ``begin()`` of the exon tree."""
        exon_tree = self.exons
        return exon_tree.begin()
Exemplo n.º 20
0
class TaskSet(object):
    """
    Holds a set of tasks in a priority queue.
  """
    def __init__(self):
        """Create an empty task set: a priority queue plus an interval tree."""
        # keep r1 < r2 < r3 order (presumably r = task release time — TODO confirm).
        self._tasksQueue = TaskUnitPriorityQueue()  # keep r1 < r2 < r3 order.
        # One interval per task (release..deadline, tagged with the task ID);
        # kept in step with the queue by add() and remove().
        self._intervalTree = IntervalTree()

    @property
    def tasks(self):
        return self._tasksQueue.items()

    def add(self, task):
        if not self._tasksQueue.contains(task.taskID):
            self._addTaskToTree(task)
            self._tasksQueue.push(task)
        else:
            raise DuplicateTaskException

    def _addTaskToTree(self, task):
        """
      Adds task to interval tree.
    """
        self._intervalTree.addi(begin=task.release,
                                end=task.deadline,
                                data=task.taskID)

    def remove(self, task):
        self._intervalTree.discardi(task.release, task.deadline, task.taskID)
        self._tasksQueue.remove(task.taskID)

    def _findLatestInterval(self, intervals):
        """
      Find the latest interval.
    """
        latest = intervals[0]
        for interval in intervals:
            if interval.begin > latest.begin:
                latest = interval
        return latest

    def _orIntervals(self, intervalListA, intervalListB):
        return list(set(intervalListA) | set(intervalListB))

    def _conflictPath(self, interval, intervalTree):
        """
        Collect the chain of conflicting (overlapping) intervals that starts
        at ``interval``.

        @param interval The interval to find conflicts with.
        @param intervalTree The tree holding all intervals.  It is MODIFIED
          by this call (intervals are removed and re-added), so pass a copy
          if the original must stay intact.
        @return A list of intervals without duplicates, in unspecified order
          (it is built via ``_orIntervals``).  An empty list is returned
          when the search yields only ``interval`` itself.

        The chain is followed by repeatedly taking the latest-starting
        interval among the current search results and searching again from
        it.

        For example:
          if A and B conflict and B and C conflict and A is the
          interval we're looking for conflicts with, the returned
          intervals will be A, B, C.
        Another example:
          if D and E conflict and F and G conflict, and we're looking
          for all conflicts with D, only D and E will be returned as
          F and G do not overlap with either D or E.
        """
        # NOTE(review): search() presumably returns every interval in the
        # tree that overlaps `interval`, including `interval` itself — confirm
        # against the intervaltree version in use.
        intervals = list(intervalTree.search(interval))
        # Base case: the only hit is the interval we started from, so it
        # has no conflicts.
        if len(intervals) == 1 and intervals[0] == interval:
            return []
        # Pick the hit with the greatest begin; the chain continues from it.
        latestInterval = self._findLatestInterval(intervals)
        # Remove everything overlapping `interval` from the tree; those hits
        # are already recorded in `intervals` and need not be visited again.
        intervalTree.remove_overlap(interval)
        # Put the latest hit back so the recursive search can start from it.
        intervalTree.add(latestInterval)
        # Recurse from the latest hit and merge its chain with the hits
        # found at this level.
        return self._orIntervals(
            intervals, self._conflictPath(latestInterval, intervalTree))

    def _intervalConflictAlreadyDetected(self, interval, conflicts):
        """
      Checks to see if interval was already detected to conflict.
    """
        for conflict in conflicts:
            for ival in conflict:
                if ival == interval:
                    return True
        return False

    def findConflicts(self):
        """Partition the task intervals into conflicting and conflict-free.

        Returns:
            A 2-tuple ``(conflicts, nonConflicts)`` of ``ConflictSet``
            objects.  The first wraps one ``Conflict`` per group of
            intervals returned by ``_conflictPath``; the second wraps one
            ``Conflict`` per interval for which ``_conflictPath`` returned
            nothing (the interval itself is passed, not a list).
        """
        tree = self._intervalTree
        seenGroups = []  # raw interval lists, for the already-detected check
        conflictObjs = []
        nonConflictsObjs = []

        # Slice the tree over its full extent and visit the result sorted.
        for interval in sorted(tree[tree.begin():tree.end()]):
            if self._intervalConflictAlreadyDetected(interval, seenGroups):
                continue
            # _conflictPath modifies the tree it receives, so give it a copy.
            path = self._conflictPath(interval, tree.copy())
            if not path:
                nonConflictsObjs.append(Conflict(interval))
                continue
            seenGroups.append(path)
            conflictObjs.append(Conflict(path))

        return ConflictSet(conflictObjs), ConflictSet(nonConflictsObjs)

    def __iter__(self):
        return self._tasksQueue