Example #1
class BaseJson(ABC):
    """ Base json representation """
    def __init__(self, path: pathlib.Path):
        self.path = path
        self.data = []
        self.tree = IntervalTree()

    def __str__(self):
        return f"<{self.path}, {len(self.data)} objects>"

    def __repr__(self):
        return f"{len(self.data)}"

    def __load(self):
        raise NotImplementedError

    def query(self, start, stop):
        return self.tree.overlap(start, stop)

    def mark_words(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        for r in results:
            r.data['hit'] = True

    def query_text(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        s = " ".join([r.data["word"] for r in results
                      if not r.data['hit']]).strip()
        return s
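The loader is left to subclasses, but the query/mark pattern only needs an IntervalTree whose payloads carry a "hit" flag. A minimal sketch with made-up word timings:

from intervaltree import Interval, IntervalTree

tree = IntervalTree()
# Hypothetical word timings (start, end, text); a real subclass would load these from JSON.
for start, stop, word in [(0.0, 0.4, "hello"), (0.5, 0.9, "interval"), (1.0, 1.6, "trees")]:
    tree.add(Interval(start, stop, {"word": word, "hit": False}))

# query_text-style join: overlap() returns every Interval intersecting [0.3, 1.1)
hits = sorted(tree.overlap(0.3, 1.1))
print(" ".join(iv.data["word"] for iv in hits))                  # hello interval trees

# mark_words-style marking: the same range now yields no unseen words
for iv in hits:
    iv.data["hit"] = True
print([iv.data["word"] for iv in hits if not iv.data["hit"]])    # []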
Example #2
class FlashReaderContext(DebugContext):
    """! @brief Reads flash memory regions from an ELF file instead of the target."""

    def __init__(self, parent, elf):
        super(FlashReaderContext, self).__init__(parent)
        self._elf = elf

        self._build_regions()

    def _build_regions(self):
        self._tree = IntervalTree()
        for sect in [s for s in self._elf.sections if (s.region and s.region.is_flash)]:
            start = sect.start
            length = sect.length
            sect.data # Go ahead and read the data from the file.
            self._tree.addi(start, start + length, sect)
            LOG.debug("created flash section [%x:%x] for section %s", start, start + length, sect.name)

    def read_memory(self, addr, transfer_size=32, now=True):
        length = transfer_size // 8
        matches = self._tree.overlap(addr, addr + length)
        # Must match only one interval (ELF section).
        if len(matches) != 1:
            return self._parent.read_memory(addr, transfer_size, now)
        section = matches.pop().data
        addr -= section.start

        def read_memory_cb():
            LOG.debug("read flash data [%x:%x] from section %s", section.start + addr, section.start + addr  + length, section.name)
            data = section.data[addr:addr + length]
            if transfer_size == 8:
                return data[0]
            elif transfer_size == 16:
                return conversion.byte_list_to_u16le_list(data)[0]
            elif transfer_size == 32:
                return conversion.byte_list_to_u32le_list(data)[0]
            else:
                raise ValueError("invalid transfer_size (%d)" % transfer_size)

        if now:
            return read_memory_cb()
        else:
            return read_memory_cb

    def read_memory_block8(self, addr, size):
        matches = self._tree.overlap(addr, addr + size)
        # Must match only one interval (ELF section).
        if len(matches) != 1:
            return self._parent.read_memory_block8(addr, size)
        section = matches.pop().data
        addr -= section.start
        data = section.data[addr:addr + size]
        LOG.debug("read flash data [%x:%x]", section.start + addr, section.start + addr  + size)
        return list(data)

    def read_memory_block32(self, addr, size):
        return conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, size))
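The read paths above use the ELF data only when overlap() finds exactly one flash section covering the requested range; anything that falls in a gap or straddles a section boundary is delegated to the parent context. A standalone sketch of that lookup pattern, with invented section contents instead of pyOCD objects:

from intervaltree import IntervalTree

sections = IntervalTree()
sections.addi(0x08000000, 0x08004000, b"\xAA" * 0x4000)   # made-up flash section contents
sections.addi(0x08004000, 0x08008000, b"\xBB" * 0x4000)

def read8(addr, size):
    matches = sections.overlap(addr, addr + size)
    if len(matches) != 1:          # hits a gap or crosses two sections
        return None                # the real class delegates to self._parent here
    section = matches.pop()
    offset = addr - section.begin
    return section.data[offset:offset + size]

print(read8(0x08000010, 4))    # b'\xaa\xaa\xaa\xaa'
print(read8(0x08003ffe, 4))    # None, the range spans both sections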
Example #3
class Chromosome:
    COUNT_N = str.maketrans("ATGC", 'Y' * 4)

    def __init__(self, seq, fraglen, ignore_n=True):
        self.seq = seq
        self.chromlen = self.len = len(seq)
        self.peak_regions = IntervalTree()
        self.blacklist = IntervalTree()
        self.fraglen = fraglen
        if ignore_n:
            pos = 0
            for k, g in groupby(self.seq.translate(self.COUNT_N)):
                l = sum(1 for _ in g)
                if k == 'N':
                    self.blacklist.add(Interval(pos, pos + l))
                    self.len -= l
                pos += l

    def choose_peak_regions(self, n):
        while len(self.peak_regions) < n:
            pos = random.randrange(self.chromlen - self.fraglen)
            peak = Interval(pos, pos + self.fraglen)
            if not self.blacklist.overlap(
                    peak) and not self.peak_regions.overlap(peak):
                self.peak_regions.add(peak)

    def _get_read_from_fragment(self, frag, width, readlen):
        positive_strand = random.random() >= 0.5
        if positive_strand:
            pos = random.randrange(frag.begin, frag.begin + width)
        else:
            pos = random.randrange(frag.end - width, frag.end)
            pos -= readlen - 1

        return pos, positive_strand

    def get_reads_from_peaks(self, width, readlen, n):
        peaks = tuple(self.peak_regions)
        if peaks:
            for _ in range(n):
                peak = random.choice(peaks)
                yield self._get_read_from_fragment(peak, width, readlen)

    def get_reads_as_background(self, width, readlen, n):
        for _ in range(n):
            pos = random.randrange(0, self.chromlen - self.fraglen)
            fragment = Interval(pos, pos + self.fraglen)
            if not self.blacklist.overlap(fragment):
                yield self._get_read_from_fragment(fragment, width, readlen)
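choose_peak_regions is rejection sampling: a candidate fragment is discarded whenever it touches the N blacklist or an already chosen peak. A compact sketch of the same idea with toy coordinates:

import random

from intervaltree import Interval, IntervalTree

blacklist = IntervalTree([Interval(40, 60)])   # e.g. a run of N bases
peaks = IntervalTree()
fraglen, chromlen = 10, 100

while len(peaks) < 3:
    pos = random.randrange(chromlen - fraglen)
    if not blacklist.overlap(pos, pos + fraglen) and not peaks.overlap(pos, pos + fraglen):
        peaks.add(Interval(pos, pos + fraglen))

print(sorted(peaks))   # three non-overlapping fragments, none touching [40, 60)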
Example #4
from intervaltree import Interval, IntervalTree


def scan_tree(intervals):
    """Construct an interval tree from the supplied genomic intervals, check every
    element of the tree against itself and return the data of any that hit 2 or
    more intervals (i.e. itself + at least 1 other)."""

    retlist = set()
    t = IntervalTree(Interval(*iv) for iv in intervals)

    for g in intervals:
        hits = t.overlap(g[0], g[1])
        if len(hits) > 1:
            for x in hits:
                retlist.add(x.data)

    return retlist
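For instance, with hypothetical (start, end, name) tuples, only the entries that overlap something other than themselves come back:

ivs = [(1, 100, "geneA"), (150, 250, "geneB"), (200, 300, "geneC")]
print(scan_tree(ivs))   # {'geneB', 'geneC'}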
Example #5
from intervaltree import IntervalTree


def test_brackets_vs_overlap():
    it = IntervalTree()
    it.addi(1, 3, "dude")
    it.addi(2, 4, "sweet")
    it.addi(6, 9, "rad")
    for iobj in it:
        assert it[iobj.begin:iobj.end] == it.overlap(iobj.begin, iobj.end)
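Both query forms use half-open intervals, so an interval's end point is excluded. A quick illustration:

from intervaltree import IntervalTree

it = IntervalTree()
it.addi(1, 3, "dude")
it.addi(2, 4, "sweet")

print(sorted(iv.data for iv in it.overlap(2, 3)))   # ['dude', 'sweet']
print(sorted(iv.data for iv in it[3:4]))            # ['sweet'] -- Interval(1, 3) excludes point 3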
Example #6
from intervaltree import Interval, IntervalTree


class AslrOracle:
  def __init__(self):
    self.queries = 0

    self.InitCache()

  def CheckAddress(self, address):
    return self.CheckRange(address, 0x1000)

  def InitCache(self):
    self.cached_queries = 0
    self.good_regions = IntervalTree()
    self.bad_regions = IntervalTree()

  def InsertToCache(self, start, end, valid):
    if valid:
      self.good_regions.add(Interval(start, end + 1))
      self.good_regions.merge_overlaps()
    else:
      self.bad_regions.add(Interval(start, end))

  def CheckCache(self, start, end):
    good_overlaps = self.good_regions.overlap(start, end)
    for overlap in good_overlaps:
      if (overlap[0] <= start) and (overlap[1] >= end):
        self.cached_queries += 1
        return True

    bad_overlaps = self.bad_regions.envelop(start, end)
    if len(bad_overlaps) > 0:
      self.cached_queries += 1
      return False

    return None
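CheckCache works because good regions are merged after every insert, so a query is a cache hit only when a single merged interval contains it completely. A minimal sketch of that containment test:

from intervaltree import Interval, IntervalTree

good = IntervalTree()
good.add(Interval(0x1000, 0x3000))
good.add(Interval(0x2000, 0x5000))
good.merge_overlaps()     # collapses to one Interval(0x1000, 0x5000)

start, end = 0x2800, 0x3800
hit = any(iv.begin <= start and iv.end >= end for iv in good.overlap(start, end))
print(hit)   # True: the merged region fully contains the queried range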
Example #7
def get_multilines(spans):
    intervals = Intervals()
    lines = []
    for start, stop, type in spans:
        line = Line(start, stop, type, level=None)
        intervals.addi(start, stop, line)
        lines.append(line)

    # level
    for line in lines:
        selected = intervals.overlap(line.start, line.stop)
        line.level = get_free_level(selected)

    # chunk
    intervals.split_overlaps()

    # group
    groups = defaultdict(list)
    for start, stop, line in intervals:
        groups[start, stop].append(line)

    for start, stop in sorted(groups):
        lines = groups[start, stop]
        lines = sorted(lines, key=lambda _: _.level)
        yield Multiline(start, stop, lines)
Example #8
class SimpleDnMedium(DnMedium):
    def __init__(self) -> None:
        self.msgs = IntervalTree()

    def add_dn(self, msg: LoraMsg) -> None:
        t0 = Simulation.time2ticks(msg.xbeg)
        t1 = t0 + Simulation.time2ticks(msg.tpreamble())
        self.msgs[t0:t1] = msg

    @staticmethod
    def overlap(i1: Interval, i2: Interval) -> int:
        return min(i1.end, i2.end) - max(i1.begin, i2.begin)  # type: ignore

    def get_dn(self,
               rxon: int,
               rxtout: int,
               freq: int,
               rps: int,
               nsym: int = 4) -> Optional[LoraMsg]:
        rxw = Interval(rxon, rxon + rxtout)
        tpn = Simulation.time2ticks(LoraMsg.symtime(rps, nsym))
        for i in self.msgs.overlap(rxw[0], rxw[1]):
            m = i.data  # type: LoraMsg
            if m.match(freq, rps) and SimpleDnMedium.overlap(i, rxw) >= tpn:
                break
        else:
            return None
        self.msgs.remove(i)
        return m

    def prune(self, ticks: int):
        # Returns the set of expired Interval objects (and removes them from the tree).
        exp = self.msgs.envelop(0, ticks)
        if exp:
            self.msgs.remove_envelop(0, ticks)
        return exp
Example #9
from intervaltree import Interval, IntervalTree


def find_len_non_overlap(interval: Interval, itree: IntervalTree) -> int:
    overlaps = IntervalTree(itree.overlap(interval))
    overlaps.merge_overlaps()

    len_overlap = sum([intersection(interval, o).length() for o in overlaps])

    return interval.length() - len_overlap
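The intersection() helper is defined elsewhere in that project; a sketch of a compatible implementation (assuming it returns the common part of two overlapping intervals) plus a sample call:

from intervaltree import Interval, IntervalTree

def intersection(a: Interval, b: Interval) -> Interval:
    # Assumed behaviour of the external helper: the overlapping part of a and b.
    return Interval(max(a.begin, b.begin), min(a.end, b.end))

itree = IntervalTree([Interval(0, 10), Interval(5, 20), Interval(40, 50)])
print(find_len_non_overlap(Interval(8, 45), itree))   # 20: length 37 minus the 17 covered by the tree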
Example #10
    def countIdealOverlaps(self, nodes):
        iTree = IntervalTree()
        for node in nodes:
            iTree.addi(node.idealLeft(), node.idealRight(), data=node)

        for node in nodes:
            overlaps = iTree.overlap(node.idealLeft(), node.idealRight())
            node.overlaps = [x.data for x in overlaps]
            node.overlapCount = len(overlaps)
Example #11
    def test_cosmic_genome_pos_filter(self):
        lung_mut_interval_tree = IntervalTree()

        # Test for positive matches.
        for _, row in self.cosmic_df.iterrows():
            if row["Primary site"] != "lung":
                continue

            genome_pos = GenomePosition.from_str(
                str(row["Mutation genome position"]))

            if genome_pos is None:
                continue

            # Add the genome position to a tree for use in further assertions.
            lung_mut_interval_tree[genome_pos.start:genome_pos.end] = genome_pos.chrom

            self.assertTrue(
                self.mutation_counter._cosmic_subset_contains_genome_pos(
                    genome_pos))

        # Test for negative matches, excluding negative mutation matches which
        # overlap with positive ones.
        for _, row in self.cosmic_df.iterrows():
            genome_pos = GenomePosition.from_str(
                str(row["Mutation genome position"]))

            if genome_pos is None:
                continue

            # genome_pos overlaps with a positive match, so it cannot be assumed
            # that it shouldn't match.
            if any(
                    map(
                        lambda it: it.data == genome_pos.chrom,
                        lung_mut_interval_tree.overlap(genome_pos.start,
                                                       genome_pos.end))):
                continue

            self.assertFalse(
                self.mutation_counter._cosmic_subset_contains_genome_pos(
                    genome_pos))

        # Do some further negative testing to ensure that garbage genome
        # positions don't match the filter.

        negative_tests = [
            GenomePosition("nonexistent-chromosome", 0, 0),
            GenomePosition("1", -10, -1),
        ]

        for test in negative_tests:
            self.assertFalse(
                self.mutation_counter._cosmic_subset_contains_genome_pos(test))
Example #12
class YandexJson:
    """ Kaldi json representation """
    def __init__(self, path: pathlib.Path):
        self.path = path
        self.data = []
        self.tree = IntervalTree()
        self.__load()

    def __str__(self):
        return f"<{self.path}, {len(self.data)} objects>"

    def __repr__(self):
        return f"{len(self.data)}"

    def __load(self):
        with pathlib.Path.open(self.path, 'r') as json_data:
            data = json.load(json_data)
            if data:
                self.data = data

            for chunk in self.data['response']['chunks']:

                if chunk['channelTag'] == '1':
                    for r in chunk['alternatives'][0]['words']:
                        i = Interval(float(r["startTime"][:-1]),
                                     float(r["endTime"][:-1]), r)
                        r['hit'] = False
                        self.tree.add(i)

    def query(self, start, stop):
        return self.tree.overlap(start, stop)

    def mark_words(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        for r in results:
            r.data['hit'] = True

    def query_text(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        s = " ".join([r.data["word"] for r in results
                      if not r.data['hit']]).strip()
        return s
Example #13
from intervaltree import Interval, IntervalTree


def total_intersection(itree: IntervalTree, interval: Interval) -> int:
    if interval.length() <= 0:
        return 0

    total = 0
    ovlps = IntervalTree(itree.overlap(interval))
    ovlps.merge_overlaps()
    for ovlp in ovlps:
        inter = intersect(interval, ovlp)
        total += inter.length()

    return total
Example #14
class YandexJson:
    """ Kaldi json representation """
    def __init__(self, path: pathlib.Path):
        self.path = path
        self.data = []
        self.tree = IntervalTree()
        self.__load()

    def __str__(self):
        return f"<{self.path}, {len(self.data)} objects>"

    def __repr__(self):
        return f"{len(self.data)}"

    def __load(self):
        with pathlib.Path.open(self.path, 'r') as json_data:
            data = json.load(json_data)
            if data:
                self.data = data

            for obj in self.data:
                if obj.get("result"):
                    for r in obj["result"]:
                        i = Interval(r["start"], r["end"], r)
                        r['hit'] = False
                        self.tree.add(i)


    def query(self, start, stop):
        return self.tree.overlap(start, stop)

    def mark_words(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        for r in results:
            r.data['hit'] = True

    def query_text(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        s = " ".join([r.data["word"] for r in results if not r.data['hit']]).strip()
        return s
Example #15
from intervaltree import IntervalTree


def filter_intervals(intervals):
    """Keep only intervals that do not overlap any previously accepted interval."""
    it = IntervalTree()

    intervals_filtered = []
    for start, end in intervals:
        if not it.overlap(start, end):
            it.addi(start, end, 1)
            intervals_filtered.append((start, end))
    return sorted(intervals_filtered, key=lambda tup: tup[0])
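With made-up spans, the first interval seen wins and later overlapping ones are dropped:

print(filter_intervals([(5, 10), (8, 12), (20, 30), (1, 4)]))
# [(1, 4), (5, 10), (20, 30)] -- (8, 12) is discarded because it overlaps (5, 10)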
Example #16
def section_markup(markup, mode=HTML):
    arcs = []
    for source, target, type in markup.deps:
        if type == ROOT:
            continue

        if source < target:
            start, stop = source, target
            direction = RIGHT
        else:
            start, stop = target, source
            direction = LEFT

        arc = Arc(start, stop, direction, type, level=None)
        arcs.append(arc)

    # order
    arcs = sorted(arcs, key=Arc.layout_order)

    # level
    intervals = Intervals()
    for arc in arcs:
        stop = arc.stop
        if mode == ASCII:
            stop += 1  # in ascii mode include stop
        intervals.addi(arc.start, stop, arc)

    for arc in arcs:
        selected = intervals.overlap(arc.start, arc.stop)
        arc.level = get_free_level(selected)

    # group
    sections = defaultdict(list)
    for arc in arcs:
        start, stop, direction, type, level = arc
        parent = id(arc)
        for index in range(start, stop + 1):
            if index == start:
                part = BEGIN if direction == RIGHT else END
            elif index == stop:
                part = END if direction == RIGHT else BEGIN
            else:
                part = INSIDE

            section = ArcSection(part, direction, type, level, parent)
            sections[index].append(section)

    for index, word in enumerate(markup.words):
        arcs = sections[index]
        arcs = sorted(arcs, key=Arc.level_order)
        yield DepMarkupSection(word, arcs)
Example #17
class SimpleMedium(Medium):
    def __init__(self, put_up: Optional[Callable[[LoraMsg], None]]) -> None:
        self._put_up = put_up
        self.msgs = IntervalTree()

    def reset_medium(self) -> None:
        self.msgs.clear()

    def add_dn(self, msg: LoraMsg) -> None:
        t0 = Simulation.time2ticks(msg.xbeg)
        t1 = t0 + Simulation.time2ticks(msg.tpreamble())
        self.msgs[t0:t1] = msg

    @staticmethod
    def overlap(i1: Interval, i2: Interval) -> int:
        return min(i1.end, i2.end) - max(i1.begin, i2.begin)  # type: ignore

    def get_dn(self,
               rxon: int,
               rxtout: int,
               freq: int,
               rps: int,
               nsym: int = 4,
               peek=False) -> Optional[LoraMsg]:
        rxw = Interval(rxon, rxon + rxtout)
        tpn = Simulation.time2ticks(LoraMsg.symtime(rps, nsym))
        for i in self.msgs.overlap(rxw[0], rxw[1]):
            m = i.data  # type: LoraMsg
            if m.match(freq, rps) and (peek
                                       or SimpleMedium.overlap(i, rxw) >= tpn):
                break
        else:
            return None
        if not peek:
            self.msgs.remove(i)
        return m

    def prune(self, ticks: int) -> List[LoraMsg]:
        exp = cast(List[Interval], self.msgs.envelop(0, ticks))
        if exp:
            self.msgs.remove_envelop(0, ticks)
        return [iv[2] for iv in exp]
Example #18
from intervaltree import IntervalTree


def test_empty_queries():
    t = IntervalTree()
    e = set()

    assert len(t) == 0
    assert t.is_empty()
    assert t[3] == e
    assert t[4:6] == e
    assert t.begin() == 0
    assert t.end() == 0
    assert t[t.begin():t.end()] == e
    assert t.overlap(t.begin(), t.end()) == e
    assert t.envelop(t.begin(), t.end()) == e
    assert t.items() == e
    assert set(t) == e
    assert set(t.copy()) == e
    assert t.find_nested() == {}
    assert t.range().is_null()
    assert t.range().length() == 0
    t.verify()
Example #20
def partition_spans(spans: List[Span]) -> Tuple[List[List[Span]], List[Span]]:
    """
    partitions a list of spans into

    1. a list of span clusters, where each cluster contains spans that overlap somehow

    2. a list of spans that are non-overlapping.
    :param spans:
    :return:
    """
    uf = UnionFind()
    spans_so_far = IntervalTree()
    for span in spans:
        start, end = span
        overlaps_with = spans_so_far.overlap(begin=start, end=end)
        if len(overlaps_with) > 0:
            for parent in list(overlaps_with):
                parent_span = parent.begin, parent.end
                # print(parent)
                # print(span)
                uf.union(parent_span, span)
        else:
            spans_so_far.addi(begin=start, end=end)
            uf.union(span)
    # parent to cluster dict
    p2c = {}
    for span in spans:
        parent = uf[span]
        if parent not in p2c:
            p2c[parent] = []
        p2c[parent].append(span)
    # non overlap spans are those whose cluster contain just them
    non_overlap_spans: List[Span] = [
        parent for parent in p2c if len(p2c[parent]) == 1
    ]
    # rest overlap
    overlap_groups: List[List[Span]] = [
        p2c[parent] for parent in p2c if len(p2c[parent]) > 1
    ]
    # print(parent2cluster)
    return overlap_groups, non_overlap_spans
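partition_spans relies on a few module-level names; assuming Span is a (start, end) tuple alias and UnionFind follows the networkx.utils.UnionFind interface (uf[x] returns a group representative, uf.union(*xs) merges groups), a quick check looks like:

from typing import List, Tuple

from intervaltree import IntervalTree
from networkx.utils import UnionFind   # assumed source of the UnionFind used above

Span = Tuple[int, int]

# ... definition of partition_spans as above ...

groups, singles = partition_spans([(0, 5), (3, 8), (10, 12), (20, 25)])
print(groups)    # [[(0, 5), (3, 8)]]
print(singles)   # [(10, 12), (20, 25)]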
Example #21
 def get_cons(self, docta):
     cons_list = []
     tokens_list = list(enumerate(docta.get_tokens))
     spans_so_far = IntervalTree()
     for ngram_size in [4, 3, 2, 1]:
         for ngram in ngrams(tokens_list=tokens_list,
                             ngram_size=ngram_size):
             ngram_start = ngram[0][0]
             ngram_end = ngram[-1][0] + 1
             ngram_string = " ".join([n[1] for n in ngram])
             # print(ngram, ngram_start, ngram_end, ngram_string)
             cands = self.cg.get_candidates(ngram_string, pretokenized=True)
             logging.info("query: %s", ngram_string)
             logging.info("cands found: %d", len(cands))
             if len(cands) == 0:
                 continue
             most_prob_cand = cands[0]
             new_cons = {
                 "end": ngram_end,
                 "label": "MENTION",
                 "score": 1.0,
                 "start": ngram_start,
                 "most_prob_cand": most_prob_cand.en_title,
                 "most_prob_prob": most_prob_cand.p_t_given_s,
                 "ncands": len(cands),
                 "tokens": ngram_string
             }
             overlap_mentions = spans_so_far.overlap(begin=ngram_start,
                                                     end=ngram_end)
             if len(overlap_mentions) > 0:
                 # do not allow overlapping/nested mentions
                 continue
             else:
                 spans_so_far.addi(begin=ngram_start, end=ngram_end)
                 cons_list.append(new_cons)
     logging.info("#mentions found:%d", len(cons_list))
     logging.info("#total tokens:%d", len(tokens_list))
     return cons_list
Example #22
def reprocessForColHeader(lst,months,yrs,firstRow, df_columns_len,baseDataDirectory):
    mainTree = IntervalTree()
    mainList = list()
    # baseDataDirectory = '/home/swaroop/Documents/Projects/SFC_extraction/Data'
    hd=headers(baseDataDirectory,months,yrs)
    count=0

    reversed_list = []

    if lst is not None and len(lst) > 0:
        reversed_list = lst[::-1]
        # tree_start = IntervalTree()

        # removing extra lines from df starting line.
        while True:
            if len(reversed_list) <= 0:
                break
            firstLine = reversed_list.pop(0)
            reference = firstLine[0]['x1']
            line = ''
            for sen_obj in firstLine:
                line = line + sen_obj['text'] + '  '
            cnt = 0
            for val in firstRow:
                if str(val).replace(' ', '').lower() in line.replace(' ', '').lower():
                    cnt += 1
            if cnt == len(firstRow) or cnt >= 3:
                # for sen_obj in firstLine:
                #     tree_start.add(Interval(sen_obj['x0'], sen_obj['x1'], sen_obj))
                break

        cols_start_y = 0
        for line_obj in reversed_list:
            if line_obj[0]['text'].lower().replace(' ', '') == 'Table of Contents'.lower().replace(' ', ''):
                break
            if len(line_obj) == 0:
                continue
            line = ''
            for sen_obj in line_obj:
                line = line + sen_obj['text'] + '  '
            if len(mainTree) == 0:
                if (not hd.checkRegex(line) and (
                        hd.IsHeader(line_obj) or hd.isStringHeader(line)) and not hd.isHalfSentance(line)):
                    for sen_obj in line_obj:
                        if len(line_obj) == 1:
                            if sen_obj['x0'] < reference:
                                count = count + 1
                                if count >= 2:
                                    sTree = sorted(mainTree)
                                    for tr in sTree:
                                        # mainList.append('\n'.join(str(tr.data).split('\n')[::-1])
                                        mainList.append(tr.data['text'])
                                    # print(mainList)
                                    return mainList, len(reversed_list)
                                continue
                        # if len(sen_obj['text'].strip()) > 0 and sen_obj['underline_exists']:
                        if len(sen_obj['text'].strip()) > 0:
                            cols_start_y = sen_obj['top']
                            mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'], sen_obj))
            elif line_obj[0]['x0'] > reference or True:
                if not (hd.checkRegex(line)):
                    for sen_obj in line_obj:
                        if sen_obj['x0'] > 0 and len(sen_obj['text'].strip()) > 0 and \
                                len(sen_obj['text'].strip().split()) < 10:
                            overlapInt = mainTree.overlap(sen_obj['x0'], sen_obj['x1'])
                            dataToAppend = ''
                            if len(overlapInt) > 0:
                                for overLap in overlapInt:
                                    dataToAppend = overLap
                                    if float(dataToAppend.data['top']) - float(sen_obj['bottom']) <= 7 or sen_obj['underline_exists']:
                                        if sen_obj['underline_exists']:
                                            # sen_obj_te = copy.deepcopy(sen_obj)
                                            # sen_obj_te['text'] = sen_obj_te['text'] + '\n' + str(dataToAppend.data['text'])
                                            prev_sen_obj = copy.deepcopy(dataToAppend.data)
                                            prev_sen_obj['text'] = sen_obj['text'] + '\n' + str(prev_sen_obj['text'])
                                            prev_sen_obj['top'] = sen_obj['top']
                                            prev_sen_obj['bottom'] = sen_obj['bottom']
                                            mainTree.remove(dataToAppend)
                                            mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'], prev_sen_obj))
                                        else:
                                            previous_starts = set([overLap.begin for overLap in overlapInt])
                                            previous_ends = set([overLap.end for overLap in overlapInt])
                                            if len(overlapInt) == 1 or (len(previous_starts) == 1 and
                                                                        len(previous_ends) == 1):
                                                # sen_obj_te = copy.deepcopy(sen_obj)
                                                # sen_obj_te['text'] = sen_obj_te['text'] + '\n' + str(
                                                #     dataToAppend.data['text'])
                                                prev_sen_obj = copy.deepcopy(dataToAppend.data)
                                                prev_sen_obj['text'] = sen_obj['text'] + '\n' + str(
                                                    prev_sen_obj['text'])
                                                prev_sen_obj['top'] = sen_obj['top']
                                                prev_sen_obj['bottom'] = sen_obj['bottom']
                                                mainTree.remove(dataToAppend)
                                                mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'], prev_sen_obj))
                                            else:
                                                if len(set([overLap.begin for overLap in overlapInt])) == 1 and \
                                                        len(set([overLap.end for overLap in overlapInt])) == 1:
                                                    # sen_obj_te = copy.deepcopy(sen_obj)
                                                    # sen_obj_te['text'] = sen_obj_te['text'] + '\n' + str(
                                                    #     dataToAppend.data['text'])
                                                    prev_sen_obj = copy.deepcopy(dataToAppend.data)
                                                    prev_sen_obj['text'] = sen_obj['text'] + '\n' + str(
                                                        prev_sen_obj['text'])
                                                    prev_sen_obj['top'] = sen_obj['top']
                                                    prev_sen_obj['bottom'] = sen_obj['bottom']
                                                    mainTree.remove(dataToAppend)
                                                    mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'], prev_sen_obj))
                                                else:
                                                    count = count + 1
                                                    if count >= 1:
                                                        for tr in mainTree:
                                                            dataToAppend = tr
                                                            mainTree.remove(dataToAppend)
                                                            mainTree.add(
                                                                Interval(dataToAppend.data['x0'], dataToAppend.data['x1'], dataToAppend.data))
                                                        sTree = sorted(mainTree)
                                                        for tr in sTree:
                                                            # mainList.append('\n'.join(str(tr.data).split('\n')[::-1])
                                                            mainList.append(tr.data['text'])
                                                        # print(mainList)
                                                        return mainList, len(reversed_list)
                                    else:
                                        count = count + 1
                                        if count >= 1:
                                            for tr in mainTree:
                                                dataToAppend = tr
                                                mainTree.remove(dataToAppend)
                                                mainTree.add(
                                                    Interval(dataToAppend.data['x0'], dataToAppend.data['x1'],
                                                             dataToAppend.data))
                                            sTree = sorted(mainTree)
                                            for tr in sTree:
                                                # mainList.append('\n'.join(str(tr.data).split('\n')[::-1])
                                                mainList.append(tr.data['text'])
                                            # print(mainList)
                                            return mainList, len(reversed_list)
                            else:
                                if len(mainTree) < df_columns_len and ( sen_obj['x0'] >= reference or
                                                                        len(line_obj) > 1) and \
                                        (cols_start_y - sen_obj['bottom'] <= 7):
                                    mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'], sen_obj))
                                else:
                                    count = count + 1
                                    if count >= 1:
                                        for tr in mainTree:
                                            dataToAppend = tr
                                            mainTree.remove(dataToAppend)
                                            mainTree.add(
                                                Interval(dataToAppend.data['x0'], dataToAppend.data['x1'],
                                                         dataToAppend.data))
                                        sTree = sorted(mainTree)
                                        for tr in sTree:
                                            # mainList.append('\n'.join(str(tr.data).split('\n')[::-1])
                                            mainList.append(tr.data['text'])
                                        # print(mainList)
                                        return mainList, len(reversed_list)

                else:
                    if (len(mainTree) > 0) and (line_obj[0]['text'].isupper() or hd.checkRegex(line)):
                        break
            else:
                if (len(mainTree) > 0):
                    count=count+1
                    if count>=1:
                        break
            sorted(mainTree)
        else:
            if len(mainTree) > 0:
                for tr in mainTree:
                    dataToAppend = tr
                    mainTree.remove(dataToAppend)
                    mainTree.add(
                        Interval(dataToAppend.data['x0'], dataToAppend.data['x1'], dataToAppend.data))
                sTree = sorted(mainTree)
                for tr in sTree:
                    # '\n'.join(str(tr.data).split('\n')[::-1])
                    # mainList.append(tr.data)
                    # mainList.append('\n'.join(str(tr.data).split('\n')[::-1]))
                    mainList.append(tr.data['text'])
                return mainList, len(reversed_list)
    for tr in mainTree:
        dataToAppend = tr
        mainTree.remove(dataToAppend)
        mainTree.add(
            Interval(dataToAppend.data['x0'], dataToAppend.data['x1'], dataToAppend.data))
    sTree = sorted(mainTree)
    for tr in sTree:
        # mainList.append('\n'.join(str(tr.data).split('\n')[::-1])
        mainList.append(tr.data['text'])
    # print(mainList)
    return mainList, len(reversed_list)
Example #23
class ClassFunctionDropdown(Panel):
    """
    Class and Function/Method Dropdowns Widget.

    Parameters
    ----------
    editor : :class:`spyder.plugins.editor.widgets.codeeditor.CodeEditor`
        The editor to act on.
    """
    def __init__(self, editor):
        super(ClassFunctionDropdown, self).__init__(editor)

        # Internal data
        self._tree = IntervalTree()
        self._data = None
        self.classes = []
        self.funcs = []

        # Widgets
        self._editor = editor
        self.class_cb = QComboBox()
        self.method_cb = QComboBox()

        # Widget setup
        self.class_cb.addItem(_('<None>'), 0)
        self.method_cb.addItem(_('<None>'), 0)

        # The layout
        hbox = QHBoxLayout()
        hbox.addWidget(self.class_cb)
        hbox.addWidget(self.method_cb)
        hbox.setSpacing(0)
        hbox.setContentsMargins(0, 0, 0, 0)
        self.setLayout(hbox)

        # Signals
        self._editor.sig_cursor_position_changed.connect(
            self._handle_cursor_position_change_event)
        self.class_cb.activated.connect(self.combobox_activated)
        self.method_cb.activated.connect(self.combobox_activated)

    def _getVerticalSize(self):
        """Get the default height of a QComboBox."""
        return self.class_cb.height()

    @Slot(int, int)
    def _handle_cursor_position_change_event(self, linenum, column):
        self.update_selected(linenum)

    def sizeHint(self):
        """Override Qt method."""
        return QSize(0, self._getVerticalSize())

    def combobox_activated(self):
        """Move the cursor to the selected definition."""
        sender = self.sender()
        item = sender.itemData(sender.currentIndex())

        if item:
            line = item['location']['range']['start']['line'] + 1
            self.editor.go_to_line(line)

        if sender == self.class_cb:
            self.method_cb.setCurrentIndex(0)

    def update_selected(self, linenum):
        """Updates the dropdowns to reflect the current class and function."""
        possible_parents = list(sorted(self._tree[linenum]))
        for iv in possible_parents:
            item = iv.data
            kind = item.get('kind')

            if kind in [SymbolKind.CLASS]:
                # Update class combobox
                for idx in range(self.class_cb.count()):
                    if self.class_cb.itemData(idx) == item:
                        self.class_cb.setCurrentIndex(idx)
                        break
                else:
                    self.class_cb.setCurrentIndex(0)
            elif kind in [SymbolKind.FUNCTION, SymbolKind.METHOD]:
                # Update func combobox
                for idx in range(self.method_cb.count()):
                    if self.method_cb.itemData(idx) == item:
                        self.method_cb.setCurrentIndex(idx)
                        break
                else:
                    self.method_cb.setCurrentIndex(0)
            else:
                continue

        if len(possible_parents) == 0:
            self.class_cb.setCurrentIndex(0)
            self.method_cb.setCurrentIndex(0)

    def populate(self, combobox, data, add_parents=False):
        """
        Populate the given ``combobox`` with the class or function names.

        Parameters
        ----------
        combobox : :class:`qtpy.QtWidgets.QComboBox`
            The combobox to populate
        data : list of :class:`dict`
            The data to populate with. There should be one list element per
            class or function definition in the file.
        add_parents : bool
            Add parents to name to create a fully qualified name.

        Returns
        -------
        None
        """
        combobox.clear()
        combobox.addItem(_('<None>'), 0)
        model = combobox.model()
        item = model.item(0)
        item.setFlags(Qt.NoItemFlags)

        cb_data = []
        for item in data:
            fqn = item['name']

            # Create a list of fully-qualified names if requested
            if add_parents:
                begin = item['location']['range']['start']['line']
                end = item['location']['range']['end']['line']
                possible_parents = sorted(self._tree.overlap(begin, end),
                                          reverse=True)
                for iv in possible_parents:
                    if iv.begin == begin and iv.end == end:
                        continue

                    # Check if it is a real parent
                    p_item = iv.data
                    p_begin = p_item['location']['range']['start']['line']
                    p_end = p_item['location']['range']['end']['line']
                    if p_begin <= begin and p_end >= end:
                        fqn = p_item['name'] + "." + fqn

            cb_data.append((fqn, item))

        for fqn, item in cb_data:
            # Set the icon (See: editortools.py)
            icon = None
            name = item['name']
            if item['kind'] in [SymbolKind.CLASS]:
                icon = ima.icon('class')
            else:
                if name.startswith('__'):
                    icon = ima.icon('private2')
                elif name.startswith('_'):
                    icon = ima.icon('private1')
                else:
                    icon = ima.icon('method')

            # Add the combobox item
            if icon is not None:
                combobox.addItem(icon, fqn, item)
            else:
                combobox.addItem(fqn, item)

        line, column = self._editor.get_cursor_line_column()
        self.update_selected(line)

    def update_data(self, data):
        """Update and process symbol data."""
        if data == self._data:
            return

        self._data = data
        self._tree.clear()
        self.classes = []
        self.funcs = []

        for item in data:
            line_start = item['location']['range']['start']['line']
            line_end = item['location']['range']['end']['line']
            kind = item.get('kind')

            block = self._editor.document().findBlockByLineNumber(line_start)
            line_text = block.text() if block else ''

            # The symbol finder returns classes in import statements as well
            # so we filter them out
            if line_start != line_end and ' import ' not in line_text:
                self._tree[line_start:line_end] = item

                if kind in [SymbolKind.CLASS]:
                    self.classes.append(item)
                elif kind in [SymbolKind.FUNCTION, SymbolKind.METHOD]:
                    self.funcs.append(item)

        self.class_cb.clear()
        self.method_cb.clear()
        self.populate(self.class_cb, self.classes, add_parents=False)
        self.populate(self.method_cb, self.funcs, add_parents=True)
Example #24
class OneKg():
    def __init__(self, anno_file, chrom, start, end):
        self.anno_file = anno_file
        self.tree = IntervalTree()
        try:
            for entry in self.anno_file.fetch(chrom, start, end):
                self.tree.addi(entry.start, entry.stop, entry)
        except Exception:
            pass
        self.tree_bts = list(self.tree.boundary_table.keys())
        # Everything past the end would need to be pumped through
        self.tree_bts.append(sys.maxsize)
        self.n_header = None

    def load_header(self, in_vcf):
        """
        Returns the header of the information we'll be adding to the annotated vcfs
        """

        ret = in_vcf.header.copy()
        ret.add_line((
            '##INFO=<ID=OKG_MSTART,Number=1,Type=Integer,Description="Mitochondrial '
            'start coordinate of inserted sequence">'))
        ret.add_line((
            '##INFO=<ID=OKG_MLEN,Number=1,Type=Integer,Description="Estimated length '
            'of mitochondrial insert">'))
        ret.add_line((
            '##INFO=<ID=OKG_MEND,Number=1,Type=Integer,Description="Mitochondrial end'
            ' coordinate of inserted sequence">'))
        ret.add_line((
            '##INFO=<ID=OKG_MEINFO,Number=4,Type=String,Description="Mobile element '
            'info of the form NAME,START,END,POLARITY; If there is only 5\' OR 3\' '
            'support for this call, will be NULL NULL for START and END">'))
        ret.add_line((
            '##INFO=<ID=OKG_AF,Number=.,Type=Float,Description="Estimated allele '
            'frequency in the range (0,1)">'))
        ret.add_line((
            '##INFO=<ID=OKG_EAS_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the EAS populations calculated from AC and AN, in the range (0,1)">'
        ))
        ret.add_line((
            '##INFO=<ID=OKG_EUR_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the EUR populations calculated from AC and AN, in the range (0,1)">'
        ))
        ret.add_line((
            '##INFO=<ID=OKG_AFR_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the AFR populations calculated from AC and AN, in the range (0,1)">'
        ))
        ret.add_line((
            '##INFO=<ID=OKG_AMR_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the AMR populations calculated from AC and AN, in the range (0,1)">'
        ))
        ret.add_line((
            '##INFO=<ID=OKG_SAS_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the SAS populations calculated from AC and AN, in the range (0,1)">'
        ))
        ret.add_line((
            '##INFO=<ID=OKG_SVTYPE,Number=1,Type=String,Description="OneThousandGenome'
            'ALT Type">'))
        self.n_header = ret

    def annotate(self, entry, refdist=500, size_min=50, size_max=50000):
        """
        Given an pyvcf Variant entry do the matching
        """
        # Biggest shortcut, only annotate SVs
        if "SVLEN" not in entry.info:
            return entry

        if entry.stop + refdist < self.tree_bts[0]:
            return entry

        if entry.start - refdist > self.tree_bts[0]:
            self.tree_bts.pop(0)

        if not (size_min <= abs(entry.info["SVLEN"]) <= size_max):
            return entry

        # Don't lookup until we have to
        m_type = None
        candidates = []
        for anno_entry in self.tree.overlap(entry.start - refdist,
                                            entry.stop + refdist):
            anno_entry = anno_entry.data
            a_size = truvari.get_vcf_entry_size(anno_entry)
            if not (size_min <= a_size <= size_max):
                continue

            ps, sd = truvari.entry_size_similarity(entry, anno_entry)
            if not ps >= 0.7:
                continue

            mat1 = sv_alt_match.match(anno_entry.alts[0])
            if mat1 is not None:
                a_type = mat1.groupdict()["SVTYPE"]
            else:
                a_type = truvari.get_vcf_variant_type(anno_entry)

            # Don't make until we have to, and only do so once
            if m_type is None:
                m_type = truvari.get_vcf_variant_type(entry)

            if not (a_type == m_type or
                    ((a_type == "CN0" or a_type.startswith("DEL"))
                     and m_type == "DEL") or
                    (m_type == "INS" and a_type.startswith("INS"))):
                continue

            # RO doesn't work for INS?
            ro = truvari.entry_reciprocal_overlap(entry, anno_entry)
            if m_type != "INS" and ro < 0.5:
                continue

            candidates.append((ro, ps, anno_entry))

        if candidates:
            truvari.match_sorter(candidates)
            return self.add_info(truvari.copy_entry(entry, self.n_header),
                                 candidates[0][-1])
        return entry

    def extract_info(self, annot):
        """MSTART MLEN MEND MEINFO AF EAS_AF EUR_AF AFR_AF AMR_AF SAS_AF ALT"""
        def infoc(key):
            if key in annot.info:
                return key, annot.info[key]
            return None, None

        def altp():
            """reformat the alt seq"""
            ret = []
            for i in annot.alts:
                if i.startswith("<"):
                    ret.append(i[1:-1])
            return "SVTYPE", tuple(ret)

        return [
            infoc("MSTART"),
            infoc("MLEN"),
            infoc("MEND"),
            infoc("MEINFO"),
            infoc("AF"),
            infoc("EAS_AF"),
            infoc("EUR_AF"),
            infoc("AFR_AF"),
            infoc("AMR_AF"),
            infoc("SAS_AF")
        ]

    def add_info(self, entry, annot):
        """
        Put the relevant info fields into the entry to be annotated
        """
        # Get the annotations out of the annot and add them to the entry
        if not annot:
            return entry
        for key, val in self.extract_info(annot):
            if val is not None:
                entry.info["OKG_" + key] = val
        return entry
Example #25
class ActivityTracker:
    def __init__(self, pen_name, base_timestamp):
        self.interval_index = IntervalTree()
        self.pen_name = pen_name
        self.base_timestamp = base_timestamp

        self.current_activities = {'feeding': {}, 'drinking': {}}

    def update_activity(self, frame_id, activity_dict):

        for activity, ids in activity_dict.items():

            ## Iterate through all the pigs in the activity dict and
            ## set the starting frame for the id if the activity is not tracked
            for pig_id in ids:
                if pig_id not in self.current_activities[activity]:
                    self.current_activities[activity][pig_id] = frame_id

            ## For those IDs which were not seen in the current activity dict,
            ## Complete and add their activity to the interval_index
            self.add_activity(
                set(self.current_activities[activity].keys()) - ids, activity,
                frame_id)

    def add_activity(self, completed_ids, activity, end_frame_id):

        for pig_id in completed_ids:
            start_frame_id = self.current_activities[activity].pop(
                pig_id, None)
            self.interval_index[start_frame_id:end_frame_id] = (activity,
                                                                pig_id)

    def export_tracker(self, pigs, frame_id):

        ## Firstly, add all current activities in the interval index
        for activity in self.current_activities.copy():
            self.add_activity(self.current_activities[activity].copy(),
                              activity, frame_id)

        base_dir = 'data/indices/'
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(
                os.path.join(
                    base_dir,
                    "Pen%s-%s.pkl" % (self.pen_name, self.base_timestamp)),
                "wb") as f:
            pickle.dump(self.interval_index, f)
            pickle.dump(pigs, f)

    def import_tracker(self, filename):
        with open(filename, "rb") as f:
            self.interval_index = pickle.load(f)
            self.pigs = pickle.load(f)

    def query(self, q_activity, start_frame, end_frame):
        activities = [
            a.data
            for a in self.interval_index.overlap(start_frame, end_frame)
        ]

        return [(activity, pig_id) for activity, pig_id in activities
                if activity == q_activity]
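Given the class above, a small made-up run shows how overlapping activity intervals come back from query():

tracker = ActivityTracker(pen_name="A", base_timestamp=0)
tracker.update_activity(0, {'feeding': {1, 2}, 'drinking': set()})
tracker.update_activity(10, {'feeding': {2}, 'drinking': set()})    # pig 1 stops feeding at frame 10
tracker.update_activity(25, {'feeding': set(), 'drinking': set()})  # pig 2 stops feeding at frame 25

print(tracker.query('feeding', 5, 20))   # [('feeding', 1), ('feeding', 2)], in either order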
Example #26
def get_indel_testing_candidates(dct):
    mapping = {'A': 0, 'G': 1, 'T': 2, 'C': 3, '-': 4}
    rev_base_map = {0: 'A', 1: 'G', 2: 'T', 3: 'C', 4: '-'}

    init_time = time.time()
    start_time = str(datetime.datetime.now())

    window_before, window_after = 0, 160

    if dct['seq'] == 'pacbio':
        window_after = 260

    chrom = dct['chrom']
    start = dct['start']
    end = dct['end']
    sam_path = dct['sam_path']
    fasta_path = dct['fasta_path']
    samfile = pysam.Samfile(sam_path, "rb")
    fastafile = pysam.FastaFile(fasta_path)

    window_size = dct['win_size']
    small_size = dct['small_win_size']

    mincov, maxcov = dct['mincov'], dct['maxcov']
    ins_t, del_t = dct['ins_t'], dct['del_t']

    include_intervals, exclude_intervals = None, None

    if dct['include_bed']:
        tbx = pysam.TabixFile(dct['include_bed'])
        include_intervals = IntervalTree(
            Interval(int(row[1]), int(row[2]), "%s" % (row[1]))
            for row in tbx.fetch(chrom, parser=pysam.asBed()))

        def in_bed(tree, pos):
            return tree.overlaps(pos)

        include_intervals = IntervalTree(include_intervals.overlap(start, end))

        if not include_intervals:
            return [], [], [], [], []

        else:
            start = max(start, min(x[0] for x in include_intervals))
            end = min(end, max(x[1] for x in include_intervals))

    else:

        def in_bed(tree, pos):
            return True

    if dct['exclude_bed']:
        tbx = pysam.TabixFile(dct['exclude_bed'])
        try:
            exclude_intervals = IntervalTree(
                Interval(int(row[1]), int(row[2]), "%s" % (row[1]))
                for row in tbx.fetch(chrom, parser=pysam.asBed()))

            def ex_bed(tree, pos):
                return tree.overlaps(pos)

        except ValueError:

            def ex_bed(tree, pos):
                return False
    else:

        def ex_bed(tree, pos):
            return False

    ref_dict = {
        j: s.upper() if s in 'AGTC' else ''
        for j, s in zip(
            range(max(1, start - 200), end + 400 +
                  1), fastafile.fetch(chrom,
                                      max(1, start - 200) - 1, end + 400))
    }

    chrom_length = fastafile.get_reference_length(chrom)

    hap_dict = {1: [], 2: []}

    for pread in samfile.fetch(chrom, max(0, dct['start'] - 100000),
                               dct['end'] + 1000):
        if pread.has_tag('HP'):
            hap_dict[pread.get_tag('HP')].append(pread.qname)

    hap_reads_0 = set(hap_dict[1])
    hap_reads_1 = set(hap_dict[2])

    if dct['supplementary']:
        flag = 0x4 | 0x100 | 0x200 | 0x400
    else:
        flag = 0x4 | 0x100 | 0x200 | 0x400 | 0x800

    output_pos,output_data_0,output_data_1,output_data_total,alleles=[],[],[],[],[]

    del_queue_0, del_queue_1 = collections.deque(
        window_size * [set()],
        window_size), collections.deque(window_size * [set()], window_size)
    ins_queue_0, ins_queue_1 = collections.deque(
        window_size * [set()],
        window_size), collections.deque(window_size * [set()], window_size)
    position_queue = collections.deque(window_size * [{}], window_size)

    del_queue_small_0, del_queue_small_1 = collections.deque(
        small_size * [set()],
        small_size), collections.deque(small_size * [set()], small_size)
    ins_queue_small_0, ins_queue_small_1 = collections.deque(
        small_size * [set()],
        small_size), collections.deque(small_size * [set()], small_size)
    position_queue_small = collections.deque(small_size * [{}], small_size)

    variants = {}
    extra_variants = {}

    max_range = {0: max(10, window_size), 1: 10}

    count = 0
    prev = 0
    for pcol in samfile.pileup(chrom,max(0,start-1),end,min_base_quality=0,\
                                           flag_filter=flag,truncate=True):
        v_pos = pcol.pos + 1

        if in_bed(include_intervals,
                  v_pos) and not ex_bed(exclude_intervals, v_pos):
            read_names = pcol.get_query_names()
            read_names_0 = set(read_names) & hap_reads_0
            read_names_1 = set(read_names) & hap_reads_1
            len_seq_tot = len(read_names)
            len_seq_0 = len(read_names_0)
            len_seq_1 = len(read_names_1)

            # Classify reads by the indel reported at this pileup column, split by haplotype
            # and by indel size (large: > 2 bp, small: <= 10 bp).
            seqs = pcol.get_query_sequences(mark_matches=False,
                                            mark_ends=False,
                                            add_indels=True)

            ins_set_0, ins_set_small_0 = set(), set()
            del_set_0, del_set_small_0 = set(), set()
            ins_set_1, ins_set_small_1 = set(), set()
            del_set_1, del_set_small_1 = set(), set()

            for n, s in zip(read_names, seqs):
                if '+' not in s and '-' not in s:
                    continue
                indel_len = int(''.join(filter(str.isdigit, s)))
                for in_hap, ins_large, ins_small, del_large, del_small in (
                    (n in read_names_0, ins_set_0, ins_set_small_0,
                     del_set_0, del_set_small_0),
                    (n in read_names_1, ins_set_1, ins_set_small_1,
                     del_set_1, del_set_small_1),
                ):
                    if not in_hap:
                        continue
                    if '+' in s:
                        if indel_len > 2:
                            ins_large.add(n)
                        if indel_len <= 10:
                            ins_small.add(n)
                    if '-' in s:
                        if indel_len > 2:
                            del_large.add(n)
                        if indel_len <= 10:
                            del_small.add(n)

            del_queue_0.append(del_set_0)
            del_queue_1.append(del_set_1)

            ins_queue_0.append(ins_set_0)
            ins_queue_1.append(ins_set_1)

            del_queue_small_0.append(del_set_small_0)
            del_queue_small_1.append(del_set_small_1)

            ins_queue_small_0.append(ins_set_small_0)
            ins_queue_small_1.append(ins_set_small_1)

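            # Skip positions already covered by a previously emitted candidate window.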
            if v_pos <= prev:
                continue

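            # With enough coverage on both haplotypes, estimate per-haplotype indel support as the
            # fraction of reads carrying an insertion/deletion anywhere within the sliding window.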
            if len_seq_0 >= mincov and len_seq_1 >= mincov:

                del_freq_0 = len(set.union(
                    *del_queue_0)) / len_seq_0 if len_seq_0 > 0 else 0
                ins_freq_0 = len(set.union(
                    *ins_queue_0)) / len_seq_0 if len_seq_0 > 0 else 0

                del_freq_1 = len(set.union(
                    *del_queue_1)) / len_seq_1 if len_seq_1 > 0 else 0
                ins_freq_1 = len(set.union(
                    *ins_queue_1)) / len_seq_1 if len_seq_1 > 0 else 0

                del_freq_small_0 = len(set.union(
                    *del_queue_small_0)) / len_seq_0 if len_seq_0 > 0 else 0
                ins_freq_small_0 = len(set.union(
                    *ins_queue_small_0)) / len_seq_0 if len_seq_0 > 0 else 0

                del_freq_small_1 = len(set.union(
                    *del_queue_small_1)) / len_seq_1 if len_seq_1 > 0 else 0
                ins_freq_small_1 = len(set.union(
                    *ins_queue_small_1)) / len_seq_1 if len_seq_1 > 0 else 0

                if max([del_freq_0, del_freq_1]) >= del_t or max(
                    [ins_freq_0, ins_freq_1]) >= ins_t:
                    prev = v_pos + window_size
                    variants[max(1, v_pos - window_size)] = 0
                    count += 1

                elif max([del_freq_small_0, del_freq_small_1]) >= del_t or max(
                    [ins_freq_small_0, ins_freq_small_1]) >= ins_t or (
                        del_freq_small_0 + ins_freq_small_0) >= 0.9 or (
                            del_freq_small_1 + ins_freq_small_1) >= 0.9:

                    prev = v_pos + 10
                    variants[max(1, v_pos - 10)] = 1
                    count += 1

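            # If phased coverage is insufficient but total coverage is high, optionally impute a
            # phase by clustering reads on their pileup string at this column.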
            elif dct['impute_indel_phase'] and len_seq_tot >= 2 * mincov:
                seq_v2 = [
                    x.upper() for x in pcol.get_query_sequences(
                        mark_matches=False, mark_ends=False, add_indels=True)
                ]
                seq = [x[:2] for x in seq_v2]
                seq_tot = ''.join(seq)

                del_freq_tot = (seq_tot.count('-') + seq_tot.count('*')
                                ) / len_seq_tot if len_seq_tot > 0 else 0
                ins_freq_tot = seq_tot.count(
                    '+') / len_seq_tot if len_seq_tot > 0 else 0
                if (del_t <= del_freq_tot or ins_t <= ins_freq_tot):
                    groups = {}
                    for s, n in zip(seq_v2, read_names):
                        if s not in groups:
                            groups[s] = []
                        groups[s].append(n)

                    counts = sorted([(x, len(groups[x])) for x in groups],
                                    key=lambda x: x[1],
                                    reverse=True)

                    if counts[0][1] <= 0.8 * len_seq_tot:
                        read_names_0 = set(groups[counts[0][0]])
                        read_names_1 = set(groups[
                            counts[1][0]]) if counts[1][1] >= mincov else set(
                                read_names) - read_names_0
                    else:
                        read_names_0 = groups[counts[0][0]][:counts[0][1] // 2]
                        read_names_1 = groups[counts[0][0]][counts[0][1] // 2:]
                    if len(read_names_0) >= mincov and len(
                            read_names_1) >= mincov:
                        prev = v_pos + 10
                        variants[max(1, v_pos - 10)] = 1
                        extra_variants[max(1, v_pos - 10)] = (read_names_0,
                                                              read_names_1)
                        count += 1

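    # Second pass: for each candidate position, collect the read segments around it per haplotype
    # and in total, run them through msa(), and derive candidate alleles.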
    for pcol in samfile.pileup(chrom,
                               max(0, start - 10 - window_size),
                               end,
                               min_base_quality=0,
                               flag_filter=flag,
                               truncate=True):

        v_pos = pcol.pos + 1
        if v_pos in extra_variants:
            read_names = pcol.get_query_names()
            read_names_0, read_names_1 = extra_variants[v_pos]

        elif v_pos in variants:
            read_names = pcol.get_query_names()

            read_names_0 = set(read_names) & hap_reads_0
            read_names_1 = set(read_names) & hap_reads_1

        else:
            continue

        d = {'hap0': {}, 'hap1': {}}
        d_tot = {}

        ref = ''.join([
            ref_dict[p]
            for p in range(v_pos - window_before,
                           min(chrom_length, v_pos + window_after + 1))
        ])

        for pread in pcol.pileups:
            dt = pread.alignment.query_sequence[
                max(0, pread.query_position_or_next -
                    window_before):pread.query_position_or_next + window_after]
            d_tot[pread.alignment.qname] = dt

            if pread.alignment.qname in read_names_0:
                d['hap0'][pread.alignment.qname] = dt

            elif pread.alignment.qname in read_names_1:
                d['hap1'][pread.alignment.qname] = dt

        seq_list = d['hap0']
        flag0, _, data_0, alt_0, ref_seq_0 = msa(seq_list, ref, v_pos, 2,
                                                 dct['maxcov'])

        seq_list = d['hap1']
        flag1, _, data_1, alt_1, ref_seq_1 = msa(seq_list, ref, v_pos, 2,
                                                 dct['maxcov'])

        seq_list = d_tot
        flag_total, indel_flag_total, data_total, alt_total, ref_seq_total = msa(
            seq_list, ref, v_pos, dct['mincov'], dct['maxcov'])

        if flag0 and flag1 and flag_total:
            output_pos.append(v_pos)
            output_data_0.append(data_0)
            output_data_1.append(data_1)
            output_data_total.append(data_total)

            tp = max_range[variants[v_pos]]

            alleles.append([
                allele_prediction(alt_0, ref_seq_0, tp),
                allele_prediction(alt_1, ref_seq_1, tp),
                allele_prediction(alt_total, ref_seq_total, tp)
            ])

    if len(output_pos) == 0:
        return (output_pos, output_data_0, output_data_1, output_data_total,
                alleles)

    output_pos = np.array(output_pos)
    output_data_0 = np.array(output_data_0)
    output_data_1 = np.array(output_data_1)
    output_data_total = np.array(output_data_total)

    return (output_pos, output_data_0, output_data_1, output_data_total,
            alleles)
Exemplo n.º 27
0
class DateIntervalTree:
    """A slight adaption of the intervaltree library to support python dates

    The intervaltree data structure stores integer ranges, fundamentally. Therefore, if we want to
    store dates, we must first convert them to integers in a way that preserves inequalities.
    Luckily, the toordinal() function on datetime.date satisfies this requirement.

    It's important to note that this interval tree structure is, unless otherwise noted, inclusive of
    lower bounds and exclusive of upper bounds. That is to say, an interval from A to B includes the
    value A and excludes the value B.
    """
    def __init__(self):
        self.tree = IntervalTree()

    @staticmethod
    def to_date_interval(begin: date, end: date, data: Any) -> Interval:
        """Convert a date interval (and associated date, if any) into an ordinal interval"""
        return Interval(begin.toordinal(), end.toordinal(), data)

    @staticmethod
    def from_date_interval(ival: Interval) -> Interval:
        """Convert an ordinal interval to a date interval"""
        return Interval(date.fromordinal(ival.begin),
                        date.fromordinal(ival.end), ival.data)

    def add(self, begin: date, end: date, data: Any = None):
        """Add a date interval to the interval tree, along with any associated date"""
        self.tree.add(DateIntervalTree.to_date_interval(begin, end, data))

    def merge_overlaps(self, reducer: Callable = None, strict: bool = True):
        """Merge overlapping date intervals in the tree.

        A reduce function can be specified to determine how data elements are combined for overlapping intervals.
        The strict argument determines whether "kissing" intervals are merged. If true (the default), only "strictly"
        overlapping intervals are merged, otherwise adjacent intervals will also be merged.

        See the intervaltree library documentation for the merge_overlaps function for a more complete description.
        """
        self.tree.merge_overlaps(data_reducer=reducer, strict=strict)

    def intervals(self) -> List[Interval]:
        """Return all date intervals in this tree"""

        # Note we convert from ordinal values to actual date objects
        return [
            DateIntervalTree.from_date_interval(ival)
            for ival in self.tree.items()
        ]

    def overlaps(self, begin: date, end: date, strict: bool = True) -> bool:
        """Determine whether the given date interval overlaps with any interval in the tree.

        According to intervaltree, intervals include the lower bound but not the upper bound:
        2015-07-23 to 2015-08-21 does not overlap 2015-08-21 to 2015-09-21.
        If strict is False, a day is added to the end date so that single-day overlaps return True.
        """
        if strict:
            ival = DateIntervalTree.to_date_interval(begin, end, None)
        else:
            ival = DateIntervalTree.to_date_interval(begin,
                                                     end + timedelta(days=1),
                                                     None)
        return self.tree.overlaps(ival.begin, ival.end)

    def range_query(self, begin: date, end: date) -> List[Interval]:
        """Return all intervals in the tree that strictly overlap with the given interval"""
        ival = DateIntervalTree.to_date_interval(begin, end, None)
        return [
            DateIntervalTree.from_date_interval(ival)
            for ival in self.tree.overlap(ival.begin, ival.end)
        ]

    def point_query(self, point: date) -> List[Interval]:
        return [
            DateIntervalTree.from_date_interval(ival)
            for ival in self.tree.at(point.toordinal())
        ]

    @staticmethod
    def shift_endpoints(date_tree: "DateIntervalTree") -> "DateIntervalTree":
        """Produce a new tree where adjacent intervals are guaranteed to not match at a boundary

        by shifting the end dates of touching intervals
        E.g., the intervals
            (1/1/2000, 1/10/2000), (1/10/2000, 1/20/2000)
        become
            (1/1/2000, 1/9/2000), (1/10/2000, 1/20/2000)
                         ^--A day was subtracted here to avoid matching exactly with the next interval
        Loop earliest -> latest, adjusting end date.
        """
        adjusted = DateIntervalTree()
        work_list = deque(sorted(date_tree.intervals()))
        while work_list:
            cur_ival = work_list.popleft()
            if work_list:
                next_ival = work_list[0]
                if cur_ival.end == next_ival.begin:
                    cur_ival = Interval(cur_ival.begin,
                                        cur_ival.end - timedelta(days=1),
                                        cur_ival.data)

            adjusted.add(cur_ival.begin, cur_ival.end, cur_ival.data)
        return adjusted

    @staticmethod
    def shift_endpoints_start(
            date_tree: "DateIntervalTree") -> "DateIntervalTree":
        """Produce a new tree where adjacent intervals are guaranteed to not match at a boundary

        by shifting the start dates of touching intervals
        E.g., the intervals
            (1/1/2000, 1/10/2000), (1/10/2000, 1/20/2000)
        become
            (1/1/2000, 1/10/2000), (1/11/2000, 1/20/2000)
                                      ^--A day was added here to avoid matching exactly with
                                     the next interval
        Loop latest -> earliest, adjusting start date.
        """
        adjusted = DateIntervalTree()
        work_list = deque(sorted(date_tree.intervals(), reverse=True))
        while work_list:
            cur_ival = work_list.popleft()
            if work_list:
                next_ival = work_list[0]
                if cur_ival.begin == next_ival.end:
                    log.debug(
                        "adjusting start of billing period: %s-%s",
                        cur_ival.begin,
                        cur_ival.end,
                    )
                    cur_ival = Interval(cur_ival.begin + timedelta(days=1),
                                        cur_ival.end, cur_ival.data)
            adjusted.add(cur_ival.begin, cur_ival.end, cur_ival.data)
        return adjusted

    @staticmethod
    def shift_endpoints_end(
            date_tree: "DateIntervalTree") -> "DateIntervalTree":
        """Produce a new tree where adjacent intervals are guaranteed to not match at a boundary
        by shifting the end dates of touching intervals
        E.g., the intervals
            (1/1/2000, 1/10/2000), (1/10/2000, 1/20/2000)
        become
            (1/1/2000, 1/9/2000), (1/10/2000, 1/20/2000)
                         ^--A day was subtracted here to avoid matching exactly with the next interval
        Loop latest -> earliest, adjusting end date.
        """
        adjusted = DateIntervalTree()
        work_list = deque(sorted(date_tree.intervals(), reverse=True))
        prev_ival = None
        while work_list:
            cur_ival = work_list.popleft()
            if prev_ival:
                while cur_ival.end >= prev_ival.begin:
                    new_start, new_end = (
                        cur_ival.begin,
                        cur_ival.end - timedelta(days=1),
                    )

                    if new_start == new_end:
                        # If shrinking would make the interval empty (start == end), shift the start back a day too.
                        new_start = new_start - timedelta(days=1)
                    cur_ival = Interval(new_start, new_end, cur_ival.data)
            prev_ival = cur_ival
            adjusted.add(cur_ival.begin, cur_ival.end, cur_ival.data)
        return adjusted
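
# A minimal usage sketch (illustrative, not part of the original source; assumes the module's
# existing imports such as datetime.date). Intervals are stored via toordinal(), so the usual
# IntervalTree semantics apply to the wrapped dates: lower bound inclusive, upper bound exclusive.
billing = DateIntervalTree()
billing.add(date(2000, 1, 1), date(2000, 1, 10), "period A")
billing.add(date(2000, 1, 10), date(2000, 1, 20), "period B")

assert not billing.overlaps(date(1999, 12, 25), date(2000, 1, 1))             # end is exclusive
assert billing.overlaps(date(1999, 12, 25), date(2000, 1, 1), strict=False)   # single-day touch counts

# Touching periods can be pulled apart by trimming end dates:
shifted = DateIntervalTree.shift_endpoints(billing)
assert sorted(shifted.intervals())[0].end == date(2000, 1, 9)
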
Exemplo n.º 28
0
class MemoryCache(object):
    def __init__(self, context):
        self._context = context
        self._run_token = -1
        self._log = logging.getLogger('memcache')
        self._reset_cache()

    def _reset_cache(self):
        self._cache = IntervalTree()
        self._metrics = CacheMetrics()

    ##
    # @brief Invalidates the cache if appropriate.
    def _check_cache(self):
        if self._context.core.is_running():
            self._log.debug("core is running; invalidating cache")
            self._reset_cache()
        elif self._run_token != self._context.core.run_token:
            self._dump_metrics()
            self._log.debug("out of date run token; invalidating cache")
            self._reset_cache()
            self._run_token = self._context.core.run_token

    ##
    # @brief Splits a memory address range into cached and uncached subranges.
    # @return Returns a 2-tuple with the first element being a set of Interval objects for each
    #   of the cached subranges. The second element is a set of Interval objects for each of the
    #   non-cached subranges.
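    #
    # Worked example (illustrative): with Interval(0x1000, 0x1010) cached, a request for
    # addr=0x0FF8 and count=0x28 yields cached={Interval(0x1000, 0x1010)} and
    # uncached={Interval(0x0FF8, 0x1000), Interval(0x1010, 0x1020)}.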
    def _get_ranges(self, addr, count):
        cached = self._cache.overlap(addr, addr + count)
        uncached = {Interval(addr, addr + count)}
        for cachedIv in cached:
            newUncachedSet = set()
            for uncachedIv in uncached:

                # No overlap.
                if cachedIv.end < uncachedIv.begin or cachedIv.begin > uncachedIv.end:
                    newUncachedSet.add(uncachedIv)
                    continue

                # Begin segment.
                if cachedIv.begin - uncachedIv.begin > 0:
                    newUncachedSet.add(
                        Interval(uncachedIv.begin, cachedIv.begin))

                # End segment.
                if uncachedIv.end - cachedIv.end > 0:
                    newUncachedSet.add(Interval(cachedIv.end, uncachedIv.end))
            uncached = newUncachedSet
        return cached, uncached

    ##
    # @brief Reads uncached memory ranges and updates the cache.
    # @return A list of Interval objects is returned. Each Interval has its @a data attribute set
    #   to a bytearray of the data read from target memory.
    def _read_uncached(self, uncached):
        uncachedData = []
        for uncachedIv in uncached:
            data = self._context.read_memory_block8(
                uncachedIv.begin, uncachedIv.end - uncachedIv.begin)
            iv = Interval(uncachedIv.begin, uncachedIv.end, bytearray(data))
            self._cache.add(iv)  # TODO merge contiguous cached intervals
            uncachedData.append(iv)
        return uncachedData

    def _update_metrics(self, cached, uncached, addr, size):
        cachedSize = 0
        for iv in cached:
            begin = iv.begin
            end = iv.end
            if iv.begin < addr:
                begin = addr
            if iv.end > addr + size:
                end = addr + size
            cachedSize += end - begin

        uncachedSize = sum((iv.end - iv.begin) for iv in uncached)

        self._metrics.reads += 1
        self._metrics.hits += cachedSize
        self._metrics.misses += uncachedSize

    def _dump_metrics(self):
        if self._metrics.total > 0:
            self._log.debug(
                "%d reads, %d bytes [%d%% hits, %d bytes]; %d bytes written",
                self._metrics.reads, self._metrics.total,
                self._metrics.percent_hit, self._metrics.hits,
                self._metrics.writes)
        else:
            self._log.debug("no reads")

    ##
    # @brief Performs a cached read operation of an address range.
    # @return A list of Interval objects sorted by address.
    def _read(self, addr, size):
        # Get the cached and uncached subranges of the requested read.
        cached, uncached = self._get_ranges(addr, size)
        self._update_metrics(cached, uncached, addr, size)

        # Read any uncached ranges.
        uncachedData = self._read_uncached(uncached)

        # Merge cached data with the data we just read.
        combined = list(cached) + uncachedData
        combined.sort(key=lambda x: x.begin)
        return combined

    ##
    # @brief Extracts data from the intersection of an address range across a list of interval objects.
    #
    # The range represented by @a addr and @a size is assumed to overlap the intervals. The first
    # and last interval in the list may have ragged edges not fully contained in the address range, in
    # which case the correct slice of those intervals is extracted.
    #
    # @param self
    # @param combined List of Interval objects forming a contiguous range. The @a data attribute of
    #   each interval must be a bytearray.
    # @param addr Start address. Must be within the range of the first interval.
    # @param size Number of bytes. (@a addr + @a size) must be within the range of the last interval.
    # @return A single bytearray object with all data from the intervals that intersect the address
    #   range.
    def _merge_data(self, combined, addr, size):
        result = bytearray()
        resultAppend = bytearray()

        # Check for fully contained subrange.
        if len(combined) and combined[0].begin < addr and combined[
                0].end > addr + size:
            offset = addr - combined[0].begin
            endOffset = offset + size
            result = combined[0].data[offset:endOffset]
            return result

        # Take slice of leading ragged edge.
        if len(combined) and combined[0].begin < addr:
            offset = addr - combined[0].begin
            result += combined[0].data[offset:]
            combined = combined[1:]
        # Take slice of trailing ragged edge.
        if len(combined) and combined[-1].end > addr + size:
            offset = addr + size - combined[-1].begin
            resultAppend = combined[-1].data[:offset]
            combined = combined[:-1]

        # Merge.
        for iv in combined:
            result += iv.data
        result += resultAppend

        return result

    ##
    # @brief Merges written data with partially overlapping cached intervals into one contiguous interval.
    def _update_contiguous(self, cached, addr, value):
        size = len(value)
        end = addr + size
        leadBegin = addr
        leadData = bytearray()
        trailData = bytearray()
        trailEnd = end

        if cached[0].begin < addr and cached[0].end > addr:
            offset = addr - cached[0].begin
            leadData = cached[0].data[:offset]
            leadBegin = cached[0].begin
        if cached[-1].begin < end and cached[-1].end > end:
            offset = end - cached[-1].begin
            trailData = cached[-1].data[offset:]
            trailEnd = cached[-1].end

        self._cache.remove_overlap(addr, end)

        data = leadData + value + trailData
        self._cache.addi(leadBegin, trailEnd, data)

    ##
    # @return A bool indicating whether the given address range is fully contained within
    #       one known memory region, and that region is cacheable.
    # @exception MemoryAccessError Raised if the access is not entirely contained within a single region.
    def _check_regions(self, addr, count):
        regions = self._context.core.memory_map.get_intersecting_regions(
            addr, length=count)

        # If no regions matched, then allow an uncached operation.
        if len(regions) == 0:
            return False

        # Raise if not fully contained within one region.
        if len(regions) > 1 or not regions[0].contains_range(addr,
                                                             length=count):
            raise MemoryAccessError(
                "individual memory accesses must not cross memory region boundaries"
            )

        # Otherwise return whether the region is cacheable.
        return regions[0].is_cacheable

    def read_memory(self, addr, transfer_size=32, now=True):
        # TODO use more optimal underlying read_memory call
        if transfer_size == 8:
            data = self.read_memory_block8(addr, 1)[0]
        elif transfer_size == 16:
            data = conversion.byte_list_to_u16le_list(
                self.read_memory_block8(addr, 2))[0]
        elif transfer_size == 32:
            data = conversion.byte_list_to_u32le_list(
                self.read_memory_block8(addr, 4))[0]

        if now:
            return data
        else:

            def read_cb():
                return data

            return read_cb

    def read_memory_block8(self, addr, size):
        if size <= 0:
            return []

        self._check_cache()

        # Validate memory regions.
        if not self._check_regions(addr, size):
            self._log.debug("range [%x:%x] is not cacheable", addr,
                            addr + size)
            return self._context.read_memory_block8(addr, size)

        # Get the cached and uncached subranges of the requested read.
        combined = self._read(addr, size)

        # Extract data out of combined intervals.
        result = list(self._merge_data(combined, addr, size))
        assert len(
            result) == size, "result size ({}) != requested size ({})".format(
                len(result), size)
        return result

    def read_memory_block32(self, addr, size):
        return conversion.byte_list_to_u32le_list(
            self.read_memory_block8(addr, size * 4))

    def write_memory(self, addr, value, transfer_size=32):
        if transfer_size == 8:
            return self.write_memory_block8(addr, [value])
        elif transfer_size == 16:
            return self.write_memory_block8(
                addr, conversion.u16le_list_to_byte_list([value]))
        elif transfer_size == 32:
            return self.write_memory_block8(
                addr, conversion.u32le_list_to_byte_list([value]))

    def write_memory_block8(self, addr, value):
        if len(value) <= 0:
            return

        self._check_cache()

        # Validate memory regions.
        cacheable = self._check_regions(addr, len(value))

        # Write to the target first, so if it fails we don't update the cache.
        result = self._context.write_memory_block8(addr, value)

        if cacheable:
            size = len(value)
            end = addr + size
            cached = sorted(self._cache.overlap(addr, end),
                            key=lambda x: x.begin)
            self._metrics.writes += size

            if len(cached):
                # Write data is entirely within a single cached interval.
                if addr >= cached[0].begin and end <= cached[0].end:
                    beginOffset = addr - cached[0].begin
                    endOffset = beginOffset + size
                    cached[0].data[beginOffset:endOffset] = value

                else:
                    self._update_contiguous(cached, addr, bytearray(value))
            else:
                # No cached data in this range, so just add the entire interval.
                self._cache.addi(addr, end, bytearray(value))

        return result

    def write_memory_block32(self, addr, data):
        return self.write_memory_block8(
            addr, conversion.u32le_list_to_byte_list(data))

    def invalidate(self):
        self._reset_cache()
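
# A minimal usage sketch (illustrative only; assumes the surrounding module's CacheMetrics and
# conversion helpers are available). The stub context below is hypothetical and just detailed
# enough to exercise the write-through / read-back caching path.
class _StubRegion(object):
    is_cacheable = True
    def contains_range(self, addr, length):
        return True

class _StubMemoryMap(object):
    def get_intersecting_regions(self, addr, length):
        return [_StubRegion()]

class _StubCore(object):
    run_token = 0
    memory_map = _StubMemoryMap()
    def is_running(self):
        return False

class _StubContext(object):
    core = _StubCore()
    def __init__(self):
        self._mem = bytearray(1024)
    def read_memory_block8(self, addr, size):
        return list(self._mem[addr:addr + size])
    def write_memory_block8(self, addr, value):
        self._mem[addr:addr + len(value)] = bytearray(value)

cache = MemoryCache(_StubContext())
cache.write_memory_block8(0x10, [1, 2, 3, 4])
assert cache.read_memory_block8(0x10, 4) == [1, 2, 3, 4]  # served from the cache
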
Exemplo n.º 29
0
Arquivo: cache.py Projeto: flit/pyOCD
class MemoryCache(object):
    """! @brief Memory cache.
    
    Maintains a cache of target memory. The constructor is passed a backing DebugContext object that
    will be used to fill the cache.
    
    The cache is invalidated whenever the target has run since the last cache operation (based on run
    tokens). If the target is currently running, all accesses cause the cache to be invalidated.
    
    The target's memory map is referenced. All memory accesses must be fully contained within a single
    memory region, or a MemoryAccessError will be raised. However, if an access is outside of all regions,
    the access is passed to the underlying context unmodified. When an access is within a region, that
    region's cacheability flag is honoured.
    """
    
    def __init__(self, context, core):
        self._context = context
        self._core = core
        self._run_token = -1
        self._log = LOG.getChild('memcache')
        self._reset_cache()

    def _reset_cache(self):
        self._cache = IntervalTree()
        self._metrics = CacheMetrics()

    def _check_cache(self):
        """! @brief Invalidates the cache if appropriate."""
        if self._core.is_running():
            self._log.debug("core is running; invalidating cache")
            self._reset_cache()
        elif self._run_token != self._core.run_token:
            self._dump_metrics()
            self._log.debug("out of date run token; invalidating cache")
            self._reset_cache()
            self._run_token = self._core.run_token

    def _get_ranges(self, addr, count):
        """! @brief Splits a memory address range into cached and uncached subranges.
        @return Returns a 2-tuple with the first element being a set of Interval objects for each
          of the cached subranges. The second element is a set of Interval objects for each of the
          non-cached subranges.
        """
        cached = self._cache.overlap(addr, addr + count)
        uncached = {Interval(addr, addr + count)}
        for cachedIv in cached:
            newUncachedSet = set()
            for uncachedIv in uncached:

                # No overlap.
                if cachedIv.end < uncachedIv.begin or cachedIv.begin > uncachedIv.end:
                    newUncachedSet.add(uncachedIv)
                    continue

                # Begin segment.
                if cachedIv.begin - uncachedIv.begin > 0:
                    newUncachedSet.add(Interval(uncachedIv.begin, cachedIv.begin))

                # End segment.
                if uncachedIv.end - cachedIv.end > 0:
                    newUncachedSet.add(Interval(cachedIv.end, uncachedIv.end))
            uncached = newUncachedSet
        return cached, uncached

    def _read_uncached(self, uncached):
        """! "@brief Reads uncached memory ranges and updates the cache.
        @return A list of Interval objects is returned. Each Interval has its @a data attribute set
          to a bytearray of the data read from target memory.
        """
        uncachedData = []
        for uncachedIv in uncached:
            data = self._context.read_memory_block8(uncachedIv.begin, uncachedIv.end - uncachedIv.begin)
            iv = Interval(uncachedIv.begin, uncachedIv.end, bytearray(data))
            self._cache.add(iv) # TODO merge contiguous cached intervals
            uncachedData.append(iv)
        return uncachedData

    def _update_metrics(self, cached, uncached, addr, size):
        cachedSize = 0
        for iv in cached:
            begin = iv.begin
            end = iv.end
            if iv.begin < addr:
                begin = addr
            if iv.end > addr + size:
                end = addr + size
            cachedSize += end - begin

        uncachedSize = sum((iv.end - iv.begin) for iv in uncached)

        self._metrics.reads += 1
        self._metrics.hits += cachedSize
        self._metrics.misses += uncachedSize

    def _dump_metrics(self):
        if self._metrics.total > 0:
            self._log.debug("%d reads, %d bytes [%d%% hits, %d bytes]; %d bytes written",
                self._metrics.reads, self._metrics.total, self._metrics.percent_hit,
                self._metrics.hits, self._metrics.writes)
        else:
            self._log.debug("no reads")

    def _read(self, addr, size):
        """! @brief Performs a cached read operation of an address range.
        @return A list of Interval objects sorted by address.
        """
        # Get the cached and uncached subranges of the requested read.
        cached, uncached = self._get_ranges(addr, size)
        self._update_metrics(cached, uncached, addr, size)

        # Read any uncached ranges.
        uncachedData = self._read_uncached(uncached)

        # Merge cached data with the data we just read.
        combined = list(cached) + uncachedData
        combined.sort(key=lambda x: x.begin)
        return combined

    def _merge_data(self, combined, addr, size):
        """! @brief Extracts data from the intersection of an address range across a list of interval objects.
        
        The range represented by @a addr and @a size is assumed to overlap the intervals. The first
        and last interval in the list may have ragged edges not fully contained in the address range, in
        which case the correct slice of those intervals is extracted.
        
        @param self
        @param combined List of Interval objects forming a contiguous range. The @a data attribute of
          each interval must be a bytearray.
        @param addr Start address. Must be within the range of the first interval.
        @param size Number of bytes. (@a addr + @a size) must be within the range of the last interval.
        @return A single bytearray object with all data from the intervals that intersect the address
          range.
        """
        result = bytearray()
        resultAppend = bytearray()

        # Check for fully contained subrange.
        if len(combined) and combined[0].begin < addr and combined[0].end > addr + size:
            offset = addr - combined[0].begin
            endOffset = offset + size
            result = combined[0].data[offset:endOffset]
            return result
        
        # Take slice of leading ragged edge.
        if len(combined) and combined[0].begin < addr:
            offset = addr - combined[0].begin
            result += combined[0].data[offset:]
            combined = combined[1:]
        # Take slice of trailing ragged edge.
        if len(combined) and combined[-1].end > addr + size:
            offset = addr + size - combined[-1].begin
            resultAppend = combined[-1].data[:offset]
            combined = combined[:-1]

        # Merge.
        for iv in combined:
            result += iv.data
        result += resultAppend

        return result

    def _update_contiguous(self, cached, addr, value):
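        """! @brief Merges written data with partially overlapping cached intervals.

        Cached bytes that precede or follow the written range are preserved, the overlapped
        cache entries are removed, and a single contiguous interval covering the union is
        re-inserted.
        """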
        size = len(value)
        end = addr + size
        leadBegin = addr
        leadData = bytearray()
        trailData = bytearray()
        trailEnd = end

        if cached[0].begin < addr and cached[0].end > addr:
            offset = addr - cached[0].begin
            leadData = cached[0].data[:offset]
            leadBegin = cached[0].begin
        if cached[-1].begin < end and cached[-1].end > end:
            offset = end - cached[-1].begin
            trailData = cached[-1].data[offset:]
            trailEnd = cached[-1].end

        self._cache.remove_overlap(addr, end)

        data = leadData + value + trailData
        self._cache.addi(leadBegin, trailEnd, data)

    def _check_regions(self, addr, count):
        """! @return A bool indicating whether the given address range is fully contained within
              one known memory region, and that region is cacheable.
        @exception MemoryAccessError Raised if the access is not entirely contained within a single region.
        """
        regions = self._core.memory_map.get_intersecting_regions(addr, length=count)

        # If no regions matched, then allow an uncached operation.
        if len(regions) == 0:
            return False

        # Raise if not fully contained within one region.
        if len(regions) > 1 or not regions[0].contains_range(addr, length=count):
            raise MemoryAccessError("individual memory accesses must not cross memory region boundaries")

        # Otherwise return whether the region is cacheable.
        return regions[0].is_cacheable

    def read_memory(self, addr, transfer_size=32, now=True):
        # TODO use more optimal underlying read_memory call
        if transfer_size == 8:
            data = self.read_memory_block8(addr, 1)[0]
        elif transfer_size == 16:
            data = conversion.byte_list_to_u16le_list(self.read_memory_block8(addr, 2))[0]
        elif transfer_size == 32:
            data = conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, 4))[0]

        if now:
            return data
        else:
            def read_cb():
                return data
            return read_cb

    def read_memory_block8(self, addr, size):
        if size <= 0:
            return []

        self._check_cache()

        # Validate memory regions.
        if not self._check_regions(addr, size):
            self._log.debug("range [%x:%x] is not cacheable", addr, addr+size)
            return self._context.read_memory_block8(addr, size)

        # Get the cached and uncached subranges of the requested read.
        combined = self._read(addr, size)

        # Extract data out of combined intervals.
        result = list(self._merge_data(combined, addr, size))
        assert len(result) == size, "result size ({}) != requested size ({})".format(len(result), size)
        return result

    def read_memory_block32(self, addr, size):
        return conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, size*4))

    def write_memory(self, addr, value, transfer_size=32):
        if transfer_size == 8:
            return self.write_memory_block8(addr, [value])
        elif transfer_size == 16:
            return self.write_memory_block8(addr, conversion.u16le_list_to_byte_list([value]))
        elif transfer_size == 32:
            return self.write_memory_block8(addr, conversion.u32le_list_to_byte_list([value]))

    def write_memory_block8(self, addr, value):
        if len(value) <= 0:
            return

        self._check_cache()

        # Validate memory regions.
        cacheable = self._check_regions(addr, len(value))

        # Write to the target first, so if it fails we don't update the cache.
        result = self._context.write_memory_block8(addr, value)

        if cacheable:
            size = len(value)
            end = addr + size
            cached = sorted(self._cache.overlap(addr, end), key=lambda x:x.begin)
            self._metrics.writes += size

            if len(cached):
                # Write data is entirely within a single cached interval.
                if addr >= cached[0].begin and end <= cached[0].end:
                    beginOffset = addr - cached[0].begin
                    endOffset = beginOffset + size
                    cached[0].data[beginOffset:endOffset] = value

                else:
                    self._update_contiguous(cached, addr, bytearray(value))
            else:
                # No cached data in this range, so just add the entire interval.
                self._cache.addi(addr, end, bytearray(value))

        return result

    def write_memory_block32(self, addr, data):
        return self.write_memory_block8(addr, conversion.u32le_list_to_byte_list(data))

    def invalidate(self):
        self._reset_cache()
Exemplo n.º 30
0
class IntervalPrinter:
    def __init__(self, file_type, infile, chrom, faidx, step):
        self.chrom = chrom
        # calculate chromosome length from FASTA index:
        self.chrLength = self._getChrLength(faidx)
        #print("Chromosome "+self.chrom+" length is "+str(self.chrLength))
        self.t = IntervalTree()
        self.step = step

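        # Build the interval tree from one of three tab-separated inputs:
        #   'bedgraph': chrom, start, end, value
        #   'cnvs':     chrom (names without the "chr" prefix), start, end, value
        #   'ratio':    chrom (without "chr"), position, ratio, ..., copy number; consecutive
        #               positions bound each interval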
        if (file_type == 'bedgraph'):
            with open(infile, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                for row in reader:
                    if row[0] == chrom:
                        start = int(row[1])
                        end = int(row[2])
                        data = float(row[3])
                        if start == end:
                            end = start + 1
                        self.t.addi(start, end, data)
        if (file_type == 'cnvs'):
            with open(infile, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                chrom = chrom.replace("chr", "")
                for row in reader:
                    if row[0] == chrom:
                        start = int(row[1])
                        end = int(row[2])
                        data = float(row[3])
                        if start == end:
                            end = start + 1
                        self.t.addi(start, end, data)
        if (file_type == 'ratio'):
            with open(infile, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                start = 0  # beginning of the chromosome
                chrom = chrom.replace("chr", "")
                for row in reader:
                    if row[0] == chrom:
                        end = int(row[1])
                        #data = float(row[2])   # ratio value
                        data = float(row[4])  # copy number
                        if start == end:
                            end = start + 1
                        self.t.addi(start, end, data)
                        # update
                        start = end

    def _getChrLength(self, faidx):
        with open(faidx, 'r') as idx:
            reader = csv.reader(idx, delimiter='\t')
            for row in reader:
                if row[0] == str(self.chrom):
                    return int(row[1])

    def printLine(self):
        sex_re = re.compile(".*[XY]")
        line = ""
        value = 2
        for i in range(0, self.chrLength, self.step):
            # default value is 2 for autosomes and we have to correct for sex chromosomes below
            value = 2
            # get all the values overlapping the current interval
            overlap = self.t.overlap(i, i + self.step)
            if len(overlap) != 0:
                # more than one interval can overlap the current bin
                data = []
                for interval_obj in overlap:
                    data.append(interval_obj.data)
                value = max(data)  # * (1 if not sex_re.match(self.chrom) else 2)
            line = line + str(value) + ","
        line = line + str(value)
        print(line)
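
# A hypothetical invocation (file names are placeholders, not from the original source):
# print one comma-separated value per 10 kb bin of chr1, taking the chromosome length
# from the FASTA index.
printer = IntervalPrinter('bedgraph', 'coverage.bedgraph', 'chr1', 'genome.fa.fai', 10000)
printer.printLine()
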
Exemplo n.º 31
0
def PeakOverlap(genesfile, peaksfile, tssdistance=[0, 0], peakname='null'):
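    """Annotate peaks with the genes they overlap (summary inferred from the code below).

    genesfile and peaksfile are tab-separated BED-like files. tssdistance extends each gene
    interval by (upstream, downstream) bases before testing for overlap. Peak/gene pairs are
    written to <peakname>PeaksInGenes and peaks overlapping no gene to <peakname>_intergenic.bed;
    the two output paths are returned.
    """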
    LuckPeak, LuckGen, LuckTree, LuckBegin, Genlist = {}, {}, {}, {}, {}

####### CREATE AN INTERVALTREE VARIABLE

    tree = IntervalTree()
    n = 0
    m = 0
    intergenic = set()
    intergenic_output = {}
    for lines in open(peaksfile):
        fields = lines.split()
        namegain, chromogain, begingain, endgain = fields[3], fields[0], int(fields[1]), int(fields[2])
        space4, space5 = fields[4], fields[5]
        LuckPeak[namegain] = [chromogain, begingain, endgain, namegain, space4, space5]
        LuckBegin[begingain] = [namegain,begingain,endgain]
        intergenic = intergenic | {namegain}

        if chromogain not in LuckTree:
            print('Chromosome '+chromogain+' of ' +peakname+'...')
            LuckTree[chromogain] = 0
            if n == 1:
                for lines2 in open(genesfile):
                    fields2 = lines2.split()
                    if fields2[0] != k:
                        continue
                    else:
                        nameid = fields2[3]
                        begingen = int(fields2[1]) - tssdistance[0]
                        endgen = int(fields2[2]) + tssdistance[1]
                        chromogen = fields2[0]
                        strand = fields2[5]
                        if tree.overlap(begingen, endgen) != set():
                            for x in tree.overlap(begingen, endgen):
                                LuckGen[m] = [chromogen] + [fields2[1]] + [fields2[2]] + [nameid] + [strand] + LuckBegin[x.begin]
                                intergenic = intergenic - set([LuckBegin[x.begin][0]])
                                m+=1
            else:
                tree[begingain:endgain] = (begingain, endgain)
            n = 1
            ### RESET THE TREE EACH TIME BEFORE START A NEW CHROMOSOME
            tree = IntervalTree()
            tree[begingain:endgain] = (begingain, endgain)
        ### Fill the tree with all peaks of the current chromosome; once the next line belongs to a new
        ### chromosome, compare every interval in the tree with all genes on the completed chromosome.
        else:
            k = chromogain
            tree[begingain:endgain] = (begingain,endgain)
    for lines2 in open(genesfile):
        fields2 = lines2.split()
        if fields2[0] != k:
            continue
        else:
            nameid = fields2[3]
            begingen = int(fields2[1]) - tssdistance[0]
            endgen = int(fields2[2]) + tssdistance[1]
            chromogen = fields2[0]
            strand = fields2[5]
            if tree.overlap(begingen, endgen) != set():
                for x in tree.overlap(begingen, endgen):
                    LuckGen[m] = [chromogen] + [fields2[1]] + [fields2[2]] + [nameid] + [strand] + LuckBegin[x.begin]
                    intergenic = intergenic - set([LuckBegin[x.begin][0]])
                    m += 1


    for x in intergenic:
        intergenic_output[x] = LuckPeak[x]

    ### OUTPUT
    if not os.path.exists(peakname):
        os.makedirs(peakname)

    if len(intergenic) == 0:
        print('No Intergenic peaks')
    else:
        results_intergenic = pd.DataFrame(list(intergenic_output.values())).sort_values(by=[0])
        results_intergenic.to_csv('./' + peakname + '/' + peakname + '_intergenic.bed', index=None, sep='\t', header=False)

    results = pd.DataFrame(list(LuckGen.values()))
    results.to_csv('./' + peakname + '/' + peakname + 'PeaksInGenes', index=None, sep='\t', header=False)
    return ('./' + peakname + '/' + peakname + 'PeaksInGenes',
            './' + peakname + '/' + peakname + '_intergenic.bed')
Exemplo n.º 32
0
def getFirstTable(line_objs, baseDataDirectory):
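    """Reconstruct the first table on a page from PDF line objects (summary inferred from the code).

    Text spans are clustered into columns by the overlap of their horizontal (x0, x1) extents,
    tracked in an IntervalTree; missing cells are padded with NaN and the result is returned
    as a pandas DataFrame with headers taken from getcolHeaders().
    """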
    symbols_to_ignore = ['$', '%', '(', ')', '((', '))', '()']
    mainTree = IntervalTree()
    mainList = list()
    table_start_bbox = get_table_start(line_objs)
    if table_start_bbox == -1:
        return None
    table_end_bbox = -1
    lst = line_objs[table_start_bbox: table_end_bbox]
    lines_com_len = 0
    if lst is not None and len(lst) > 0:
        for line_obj in lst:
            if len(mainTree) == 0:
                for sen_obj in line_obj:
                    if sen_obj['text'].replace(' ', '').lower() in symbols_to_ignore:
                        continue
                    if len(sen_obj['text'].strip()) > 0:
                        if sen_obj['underline_exists']:
                            x0 = sen_obj['x0_or']
                            x1 = sen_obj['x1_or']
                        else:
                            x0 = sen_obj['x0']
                            x1 = sen_obj['x1']
                        if len(sen_obj['text'].strip()) == 1:
                            x0 = x0 - 3
                            x1 = x1 - 3
                        if len(sen_obj['text'].strip()) == 2:
                            x0 = x0 - 3
                        mainTree.add(Interval(x0, x1, [sen_obj['text']]))
                lines_com_len += 1
            else:
                if len(line_obj[-1]['text'].replace('.', '').split()) > 10:
                    break
                for sen_obj in line_obj:
                    if sen_obj['text'].replace(' ', '').lower() in symbols_to_ignore:
                        continue
                    if sen_obj['underline_exists']:
                        x0 = sen_obj['x0_or']
                        x1 = sen_obj['x1_or']
                    else:
                        x0 = sen_obj['x0']
                        x1 = sen_obj['x1']
                    if len(sen_obj['text'].strip()) == 1:
                        x0 = x0 - 3
                        x1 = x1 - 3
                    if len(sen_obj['text'].strip()) == 2:
                        x0 = x0 - 3
                    overlapInt = mainTree.overlap(x0, x1)
                    if len(overlapInt) > 0:
                        if len(overlapInt) == 1:
                            first_col_start = min([overlap.begin for overlap in overlapInt])
                            for overlap in overlapInt:
                                if overlap.begin != first_col_start:
                                    continue
                                dataToAppend = overlap
                                te_arr = dataToAppend.data
                                for k in range(len(te_arr), lines_com_len):
                                    te_arr.append(float('NaN'))
                                te_arr.append(sen_obj['text'])
                                mainTree.remove(dataToAppend)
                                if len(overlapInt) > 1:
                                    mainTree.add(Interval(dataToAppend.begin, dataToAppend.end, te_arr))
                                else:
                                    # Grow the column interval to cover this cell's horizontal extent.
                                    mainTree.add(Interval(min(x0, dataToAppend.begin),
                                                          max(x1, dataToAppend.end), te_arr))
                                break
                    else:
                        # Start a new column, padding the rows seen so far with NaN; x0/x1 were
                        # already adjusted for short cells above.
                        te_arr = [float('NaN')] * lines_com_len
                        te_arr.append(sen_obj['text'])
                        mainTree.add(Interval(x0, x1, te_arr))
                lines_com_len += 1
    sTree = sorted(mainTree)
    rows_to_drop = []
    max_len = max([len(tr.data) for tr in sTree])
    for tr in sTree:
        # mainList.append('\n'.join(str(tr.data).split('\n')[::-1])
        te_lst = tr.data
        for i in range(len(te_lst), max_len):
            te_lst.append(float('NaN'))
        mainList.append(te_lst)

    final_df = pd.DataFrame(mainList).T
    last_row = final_df.iloc[final_df.shape[0]-1].to_list()
    if 'note' in str(last_row[0]).replace(' ','').lower() or \
            'directors' in str(last_row[0]).replace(' ','').lower():
        final_df = final_df.drop([final_df.shape[0]-1], axis=0)
    lstColHeaders = getcolHeaders(table_start_bbox, final_df, line_objs, baseDataDirectory)
    if lstColHeaders is not None:
        # print(lstColHeaders, list(dataFrame.columns), len(lstColHeaders), len(list(dataFrame.columns)))

        if len(lstColHeaders) == len(final_df.columns):
            for index, colld in enumerate(lstColHeaders):
                final_df = final_df.rename(columns={final_df.columns[index]: str(colld)})
        elif len(lstColHeaders) == len(final_df.columns) - 1:
            for index, colld in enumerate(lstColHeaders):
                final_df = final_df.rename(columns={final_df.columns[index + 1]: str(colld)})

    return final_df