class BaseJson(ABC):
    """ Base json representation """

    def __init__(self, path: pathlib.Path):
        self.path = path
        self.data = []
        self.tree = IntervalTree()

    def __str__(self):
        return f"<{self.path}, {len(self.data)} objects>"

    def __repr__(self):
        return f"{len(self.data)}"

    def __load(self):
        raise NotImplementedError

    def query(self, start, stop):
        return self.tree.overlap(start, stop)

    def mark_words(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        for r in results:
            r.data['hit'] = True

    def query_text(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        s = " ".join([r.data["word"] for r in results if not r.data['hit']]).strip()
        return s
class FlashReaderContext(DebugContext):
    """! @brief Reads flash memory regions from an ELF file instead of the target."""

    def __init__(self, parent, elf):
        super(FlashReaderContext, self).__init__(parent)
        self._elf = elf
        self._build_regions()

    def _build_regions(self):
        self._tree = IntervalTree()
        for sect in [s for s in self._elf.sections if (s.region and s.region.is_flash)]:
            start = sect.start
            length = sect.length
            sect.data  # Go ahead and read the data from the file.
            self._tree.addi(start, start + length, sect)
            LOG.debug("created flash section [%x:%x] for section %s",
                      start, start + length, sect.name)

    def read_memory(self, addr, transfer_size=32, now=True):
        length = transfer_size // 8
        matches = self._tree.overlap(addr, addr + length)
        # Must match only one interval (ELF section).
        if len(matches) != 1:
            return self._parent.read_memory(addr, transfer_size, now)
        section = matches.pop().data
        addr -= section.start

        def read_memory_cb():
            LOG.debug("read flash data [%x:%x] from section %s",
                      section.start + addr, section.start + addr + length, section.name)
            data = section.data[addr:addr + length]
            if transfer_size == 8:
                return data[0]
            elif transfer_size == 16:
                return conversion.byte_list_to_u16le_list(data)[0]
            elif transfer_size == 32:
                return conversion.byte_list_to_u32le_list(data)[0]
            else:
                raise ValueError("invalid transfer_size (%d)" % transfer_size)

        if now:
            return read_memory_cb()
        else:
            return read_memory_cb

    def read_memory_block8(self, addr, size):
        matches = self._tree.overlap(addr, addr + size)
        # Must match only one interval (ELF section).
        if len(matches) != 1:
            return self._parent.read_memory_block8(addr, size)
        section = matches.pop().data
        addr -= section.start
        data = section.data[addr:addr + size]
        LOG.debug("read flash data [%x:%x]", section.start + addr, section.start + addr + size)
        return list(data)

    def read_memory_block32(self, addr, size):
        return conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, size))
class Chromosome:
    COUNT_N = str.maketrans("ATGC", 'Y' * 4)

    def __init__(self, seq, fraglen, ignore_n=True):
        self.seq = seq
        self.chromlen = self.len = len(seq)
        self.peak_regions = IntervalTree()
        self.blacklist = IntervalTree()
        self.fraglen = fraglen
        if ignore_n:
            # Translate A/T/G/C to 'Y' so that runs of 'N' stand out to groupby.
            pos = 0
            for k, g in groupby(self.seq.translate(self.COUNT_N)):
                l = sum(1 for _ in g)
                if k == 'N':
                    self.blacklist.add(Interval(pos, pos + l))
                    self.len -= l
                pos += l

    def choose_peak_regions(self, n):
        while len(self.peak_regions) < n:
            pos = random.randrange(self.chromlen - self.fraglen)
            peak = Interval(pos, pos + self.fraglen)
            if not self.blacklist.overlap(peak) and not self.peak_regions.overlap(peak):
                self.peak_regions.add(peak)

    def _get_read_from_fragment(self, frag, width, readlen):
        positive_strand = random.random() >= 0.5
        if positive_strand:
            pos = random.randrange(frag.begin, frag.begin + width)
        else:
            pos = random.randrange(frag.end - width, frag.end)
            pos -= readlen - 1
        return pos, positive_strand

    def get_reads_from_peaks(self, width, readlen, n):
        peaks = tuple(self.peak_regions)
        if peaks:
            for _ in range(n):
                peak = random.choice(peaks)
                yield self._get_read_from_fragment(peak, width, readlen)

    def get_reads_as_background(self, width, readlen, n):
        for _ in range(n):
            pos = random.randrange(0, self.chromlen - self.fraglen)
            fragment = Interval(pos, pos + self.fraglen)
            if not self.blacklist.overlap(fragment):
                yield self._get_read_from_fragment(fragment, width, readlen)
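# A quick usage sketch for the Chromosome class above (not from the original
# source): the toy sequence, fragment length, and read parameters are invented
# for illustration, and the run of N's should land in the blacklist.
import random

random.seed(0)
seq = "ACGT" * 2500 + "N" * 500 + "ACGT" * 2500
chrom = Chromosome(seq, fraglen=200)
chrom.choose_peak_regions(5)
peak_reads = list(chrom.get_reads_from_peaks(width=50, readlen=36, n=10))
background = list(chrom.get_reads_as_background(width=50, readlen=36, n=10))
print(len(peak_reads), len(background))  # 10 peak reads; up to 10 background reads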
def scan_tree(intervals):
    """construct an interval tree using supplied genomic intervals, check all
    elements on the tree against itself and return any that hit 2 or more
    intervals (i.e. itself + 1 other)"""
    retlist = set()
    t = IntervalTree(Interval(*iv) for iv in intervals)
    for g in intervals:
        if len(t.overlap(g[0], g[1])) > 1:
            o = t.overlap(g[0], g[1])
            for x in o:
                retlist.add(x.data)
    return retlist
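# A self-contained example of scan_tree (not from the original source): each
# tuple is (start, end, data), so overlapping intervals report their data tags.
from intervaltree import Interval, IntervalTree

intervals = [
    (100, 200, "a"),   # overlaps "b"
    (150, 250, "b"),   # overlaps "a"
    (500, 600, "c"),   # overlaps nothing
]
print(scan_tree(intervals))  # {'a', 'b'}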
def test_brackets_vs_overlap():
    it = IntervalTree()
    it.addi(1, 3, "dude")
    it.addi(2, 4, "sweet")
    it.addi(6, 9, "rad")
    for iobj in it:
        assert it[iobj.begin:iobj.end] == it.overlap(iobj.begin, iobj.end)
class AslrOracle:
    def __init__(self):
        self.queries = 0
        self.InitCache()

    def CheckAddress(self, address):
        return self.CheckRange(address, 0x1000)

    def InitCache(self):
        self.cached_queries = 0
        self.good_regions = IntervalTree()
        self.bad_regions = IntervalTree()

    def InsertToCache(self, start, end, valid):
        if valid:
            self.good_regions.add(Interval(start, end + 1))
            self.good_regions.merge_overlaps()
        else:
            self.bad_regions.add(Interval(start, end))

    def CheckCache(self, start, end):
        good_overlaps = self.good_regions.overlap(start, end)
        for overlap in good_overlaps:
            if (overlap[0] <= start) and (overlap[1] >= end):
                self.cached_queries += 1
                return True
        bad_overlaps = self.bad_regions.envelop(start, end)
        if len(bad_overlaps) > 0:
            self.cached_queries += 1
            return False
        return None
def get_multilines(spans):
    intervals = Intervals()
    lines = []
    for start, stop, type in spans:
        line = Line(start, stop, type, level=None)
        intervals.addi(start, stop, line)
        lines.append(line)

    # level
    for line in lines:
        selected = intervals.overlap(line.start, line.stop)
        line.level = get_free_level(selected)

    # chunk
    intervals.split_overlaps()

    # group
    groups = defaultdict(list)
    for start, stop, line in intervals:
        groups[start, stop].append(line)

    for start, stop in sorted(groups):
        lines = groups[start, stop]
        lines = sorted(lines, key=lambda _: _.level)
        yield Multiline(start, stop, lines)
class SimpleDnMedium(DnMedium):
    def __init__(self) -> None:
        self.msgs = IntervalTree()

    def add_dn(self, msg: LoraMsg) -> None:
        t0 = Simulation.time2ticks(msg.xbeg)
        t1 = t0 + Simulation.time2ticks(msg.tpreamble())
        self.msgs[t0:t1] = msg

    @staticmethod
    def overlap(i1: Interval, i2: Interval) -> int:
        return min(i1.end, i2.end) - max(i1.begin, i2.begin)  # type: ignore

    def get_dn(self, rxon: int, rxtout: int, freq: int, rps: int,
               nsym: int = 4) -> Optional[LoraMsg]:
        rxw = Interval(rxon, rxon + rxtout)
        tpn = Simulation.time2ticks(LoraMsg.symtime(rps, nsym))
        for i in self.msgs.overlap(rxw[0], rxw[1]):
            m = i.data  # type: LoraMsg
            if m.match(freq, rps) and SimpleDnMedium.overlap(i, rxw) >= tpn:
                break
        else:
            return None
        self.msgs.remove(i)
        return m

    def prune(self, ticks: int) -> Set[Interval]:
        # Note: this returns the expired intervals, so the original
        # `-> None` annotation was wrong.
        exp = self.msgs.envelop(0, ticks)
        if exp:
            self.msgs.remove_envelop(0, ticks)
        return exp
def find_len_non_overlap(interval: Interval, itree: IntervalTree) -> int:
    overlaps = IntervalTree(itree.overlap(interval))
    overlaps.merge_overlaps()
    len_overlap = sum(intersection(interval, o).length() for o in overlaps)
    return interval.length() - len_overlap
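# `intersection` is an external helper not shown above. A minimal sketch of
# what it presumably does (clip one interval to another); it is only valid
# when the two intervals overlap, which the overlap() query guarantees here.
from intervaltree import Interval

def intersection(a: Interval, b: Interval) -> Interval:
    # Overlapping sub-range of two intervals that are known to overlap.
    return Interval(max(a.begin, b.begin), min(a.end, b.end))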
def countIdealOverlaps(self, nodes):
    iTree = IntervalTree()
    for node in nodes:
        iTree.addi(node.idealLeft(), node.idealRight(), data=node)
    for node in nodes:
        overlaps = iTree.overlap(node.idealLeft(), node.idealRight())
        node.overlaps = [x.data for x in overlaps]
        node.overlapCount = len(overlaps)
def test_cosmic_genome_pos_filter(self):
    lung_mut_interval_tree = IntervalTree()

    # Test for positive matches.
    for _, row in self.cosmic_df.iterrows():
        if row["Primary site"] != "lung":
            continue
        genome_pos = GenomePosition.from_str(str(row["Mutation genome position"]))
        if genome_pos is None:
            continue
        # Add the genome position to a tree for use in further assertions.
        lung_mut_interval_tree[genome_pos.start:genome_pos.end] = genome_pos.chrom
        self.assertTrue(
            self.mutation_counter._cosmic_subset_contains_genome_pos(genome_pos))

    # Test for negative matches, excluding negative mutation matches which
    # overlap with positive ones.
    for _, row in self.cosmic_df.iterrows():
        genome_pos = GenomePosition.from_str(str(row["Mutation genome position"]))
        if genome_pos is None:
            continue
        # genome_pos overlaps with a positive match, so it cannot be assumed
        # that it shouldn't match.
        if any(map(lambda it: it.data == genome_pos.chrom,
                   lung_mut_interval_tree.overlap(genome_pos.start, genome_pos.end))):
            continue
        self.assertFalse(
            self.mutation_counter._cosmic_subset_contains_genome_pos(genome_pos))

    # Do some further negative testing to ensure that garbage genome
    # positions don't match the filter.
    negative_tests = [
        GenomePosition("nonexistent-chromosome", 0, 0),
        GenomePosition("1", -10, -1),
    ]
    for test in negative_tests:
        self.assertFalse(
            self.mutation_counter._cosmic_subset_contains_genome_pos(test))
class YandexJson:
    """ Yandex json representation """

    def __init__(self, path: pathlib.Path):
        self.path = path
        self.data = []
        self.tree = IntervalTree()
        self.__load()

    def __str__(self):
        return f"<{self.path}, {len(self.data)} objects>"

    def __repr__(self):
        return f"{len(self.data)}"

    def __load(self):
        with pathlib.Path.open(self.path, 'r') as json_data:
            data = json.load(json_data)
            if data:
                self.data = data
                for chunk in self.data['response']['chunks']:
                    if chunk['channelTag'] == '1':
                        for r in chunk['alternatives'][0]['words']:
                            i = Interval(float(r["startTime"][:-1]),
                                         float(r["endTime"][:-1]), r)
                            r['hit'] = False
                            self.tree.add(i)

    def query(self, start, stop):
        return self.tree.overlap(start, stop)

    def mark_words(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        for r in results:
            r.data['hit'] = True

    def query_text(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        s = " ".join([r.data["word"] for r in results if not r.data['hit']]).strip()
        return s
def total_intersection(itree: IntervalTree, interval: Interval) -> int:
    if interval.length() <= 0:
        return 0
    total = 0
    ovlps = IntervalTree(itree.overlap(interval))
    ovlps.merge_overlaps()
    for ovlp in ovlps:
        inter = intersect(interval, ovlp)
        total += inter.length()
    return total
class KaldiJson:
    """ Kaldi json representation """

    def __init__(self, path: pathlib.Path):
        self.path = path
        self.data = []
        self.tree = IntervalTree()
        self.__load()

    def __str__(self):
        return f"<{self.path}, {len(self.data)} objects>"

    def __repr__(self):
        return f"{len(self.data)}"

    def __load(self):
        with pathlib.Path.open(self.path, 'r') as json_data:
            data = json.load(json_data)
            if data:
                self.data = data
                for obj in self.data:
                    if obj.get("result"):
                        for r in obj["result"]:
                            i = Interval(r["start"], r["end"], r)
                            r['hit'] = False
                            self.tree.add(i)

    def query(self, start, stop):
        return self.tree.overlap(start, stop)

    def mark_words(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        for r in results:
            r.data['hit'] = True

    def query_text(self, start, stop):
        results = sorted(self.tree.overlap(start, stop))
        s = " ".join([r.data["word"] for r in results if not r.data['hit']]).strip()
        return s
def filter_intervals(intervals):
    # Keep an interval only if it does not overlap any previously kept one.
    # intervaltree 3.x: overlap() replaces the old search(), and addi()
    # takes (begin, end, data) while add() takes an Interval.
    it = IntervalTree()
    intervals_filtered = []
    for start, end in intervals:
        if not it.overlap(start, end):
            it.addi(start, end, 1)
            intervals_filtered.append((start, end))
    return sorted(intervals_filtered, key=lambda tup: tup[0])
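# A small made-up input showing filter_intervals' first-come-first-kept
# behavior (not from the original source):
print(filter_intervals([(5, 10), (8, 12), (20, 30), (1, 4)]))
# [(1, 4), (5, 10), (20, 30)] -- (8, 12) is dropped because it overlaps (5, 10)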
def section_markup(markup, mode=HTML):
    arcs = []
    for source, target, type in markup.deps:
        if type == ROOT:
            continue
        if source < target:
            start, stop = source, target
            direction = RIGHT
        else:
            start, stop = target, source
            direction = LEFT
        arc = Arc(start, stop, direction, type, level=None)
        arcs.append(arc)

    # order
    arcs = sorted(arcs, key=Arc.layout_order)

    # level
    intervals = Intervals()
    for arc in arcs:
        stop = arc.stop
        if mode == ASCII:
            stop += 1  # in ascii mode include stop
        intervals.addi(arc.start, stop, arc)
    for arc in arcs:
        selected = intervals.overlap(arc.start, arc.stop)
        arc.level = get_free_level(selected)

    # group
    sections = defaultdict(list)
    for arc in arcs:
        start, stop, direction, type, level = arc
        parent = id(arc)
        for index in range(start, stop + 1):
            if index == start:
                part = BEGIN if direction == RIGHT else END
            elif index == stop:
                part = END if direction == RIGHT else BEGIN
            else:
                part = INSIDE
            section = ArcSection(part, direction, type, level, parent)
            sections[index].append(section)

    for index, word in enumerate(markup.words):
        arcs = sections[index]
        arcs = sorted(arcs, key=Arc.level_order)
        yield DepMarkupSection(word, arcs)
class SimpleMedium(Medium):
    def __init__(self, put_up: Optional[Callable[[LoraMsg], None]]) -> None:
        self._put_up = put_up
        self.msgs = IntervalTree()

    def reset_medium(self) -> None:
        self.msgs.clear()

    def add_dn(self, msg: LoraMsg) -> None:
        t0 = Simulation.time2ticks(msg.xbeg)
        t1 = t0 + Simulation.time2ticks(msg.tpreamble())
        self.msgs[t0:t1] = msg

    @staticmethod
    def overlap(i1: Interval, i2: Interval) -> int:
        return min(i1.end, i2.end) - max(i1.begin, i2.begin)  # type: ignore

    def get_dn(self, rxon: int, rxtout: int, freq: int, rps: int,
               nsym: int = 4, peek=False) -> Optional[LoraMsg]:
        rxw = Interval(rxon, rxon + rxtout)
        tpn = Simulation.time2ticks(LoraMsg.symtime(rps, nsym))
        for i in self.msgs.overlap(rxw[0], rxw[1]):
            m = i.data  # type: LoraMsg
            if m.match(freq, rps) and (peek or SimpleMedium.overlap(i, rxw) >= tpn):
                break
        else:
            return None
        if not peek:
            self.msgs.remove(i)
        return m

    def prune(self, ticks: int) -> List[LoraMsg]:
        exp = cast(List[Interval], self.msgs.envelop(0, ticks))
        if exp:
            self.msgs.remove_envelop(0, ticks)
        return [iv[2] for iv in exp]
def test_empty_queries():
    t = IntervalTree()
    e = set()
    assert len(t) == 0
    assert t.is_empty()
    assert t[3] == e
    assert t[4:6] == e
    assert t.begin() == 0
    assert t.end() == 0
    assert t[t.begin():t.end()] == e
    assert t.overlap(t.begin(), t.end()) == e
    assert t.envelop(t.begin(), t.end()) == e
    assert t.items() == e
    assert set(t) == e
    assert set(t.copy()) == e
    assert t.find_nested() == {}
    assert t.range().is_null()
    assert t.range().length() == 0
    t.verify()
def partition_spans(spans: List[Span]) -> Tuple[List[List[Span]], List[Span]]:
    """
    partitions a list of spans into
    1. a list of span clusters, where each cluster contains spans that overlap somehow
    2. a list of spans that are non-overlapping.
    :param spans:
    :return:
    """
    uf = UnionFind()
    spans_so_far = IntervalTree()
    for span in spans:
        start, end = span
        overlaps_with = spans_so_far.overlap(begin=start, end=end)
        if len(overlaps_with) > 0:
            for parent in list(overlaps_with):
                parent_span = parent.begin, parent.end
                uf.union(parent_span, span)
        else:
            spans_so_far.addi(begin=start, end=end)
            uf.union(span)

    # parent to cluster dict
    p2c = {}
    for span in spans:
        parent = uf[span]
        if parent not in p2c:
            p2c[parent] = []
        p2c[parent].append(span)

    # non-overlapping spans are those whose cluster contains just them
    non_overlap_spans: List[Span] = [
        parent for parent in p2c if len(p2c[parent]) == 1
    ]
    # the rest overlap
    overlap_groups: List[List[Span]] = [
        p2c[parent] for parent in p2c if len(p2c[parent]) > 1
    ]
    return overlap_groups, non_overlap_spans
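# A worked example for partition_spans with invented spans. It assumes a
# UnionFind whose union() accepts one or two items and whose uf[x] returns the
# cluster representative, as the function above implies.
spans = [(0, 5), (3, 8), (10, 12), (20, 25)]
groups, singles = partition_spans(spans)
print(groups)   # [[(0, 5), (3, 8)]]
print(singles)  # [(10, 12), (20, 25)], in some order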
def get_cons(self, docta):
    cons_list = []
    tokens_list = list(enumerate(docta.get_tokens))
    spans_so_far = IntervalTree()
    # Scan longer ngrams first so that shorter, nested mentions lose.
    for ngram_size in [4, 3, 2, 1]:
        for ngram in ngrams(tokens_list=tokens_list, ngram_size=ngram_size):
            ngram_start = ngram[0][0]
            ngram_end = ngram[-1][0] + 1
            ngram_string = " ".join([n[1] for n in ngram])
            cands = self.cg.get_candidates(ngram_string, pretokenized=True)
            logging.info("query: %s", ngram_string)
            logging.info("cands found: %d", len(cands))
            if len(cands) == 0:
                continue
            most_prob_cand = cands[0]
            new_cons = {
                "end": ngram_end,
                "label": "MENTION",
                "score": 1.0,
                "start": ngram_start,
                "most_prob_cand": most_prob_cand.en_title,
                "most_prob_prob": most_prob_cand.p_t_given_s,
                "ncands": len(cands),
                "tokens": ngram_string
            }
            overlap_mentions = spans_so_far.overlap(begin=ngram_start, end=ngram_end)
            if len(overlap_mentions) > 0:
                # do not allow overlapping/nested mentions
                continue
            spans_so_far.addi(begin=ngram_start, end=ngram_end)
            cons_list.append(new_cons)
    logging.info("#mentions found:%d", len(cons_list))
    logging.info("#total tokens:%d", len(tokens_list))
    return cons_list
def reprocessForColHeader(lst, months, yrs, firstRow, df_columns_len, baseDataDirectory):
    mainTree = IntervalTree()
    mainList = list()
    hd = headers(baseDataDirectory, months, yrs)
    count = 0
    reversed_list = lst[::-1] if lst else []
    reference = 0

    def merge_with_previous(prev_interval, sen_obj):
        # Prepend this sentence's text to the overlapping header cell and
        # replace the old interval with the merged one.
        prev_sen_obj = copy.deepcopy(prev_interval.data)
        prev_sen_obj['text'] = sen_obj['text'] + '\n' + str(prev_sen_obj['text'])
        prev_sen_obj['top'] = sen_obj['top']
        prev_sen_obj['bottom'] = sen_obj['bottom']
        mainTree.remove(prev_interval)
        mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'], prev_sen_obj))

    def flush_tree():
        # Re-key every interval on the original x0/x1 of its data (iterating a
        # copy, since the tree is mutated), then emit the collected header
        # texts in left-to-right order.
        for tr in list(mainTree):
            mainTree.remove(tr)
            mainTree.add(Interval(tr.data['x0'], tr.data['x1'], tr.data))
        for tr in sorted(mainTree):
            mainList.append(tr.data['text'])
        return mainList, len(reversed_list)

    # Drop lines until the row matching `firstRow` (the first data row) is found.
    while reversed_list:
        firstLine = reversed_list.pop(0)
        reference = firstLine[0]['x1']
        line = ''.join(sen_obj['text'] + ' ' for sen_obj in firstLine)
        cnt = sum(1 for val in firstRow
                  if str(val).replace(' ', '').lower() in line.replace(' ', '').lower())
        if cnt == len(firstRow) or cnt >= 3:
            break

    cols_start_y = 0
    for line_obj in reversed_list:
        if line_obj[0]['text'].lower().replace(' ', '') == 'tableofcontents':
            break
        if len(line_obj) == 0:
            continue
        line = ''.join(sen_obj['text'] + ' ' for sen_obj in line_obj)

        if len(mainTree) == 0:
            # First header line: seed the tree with one interval per cell.
            if (not hd.checkRegex(line)
                    and (hd.IsHeader(line_obj) or hd.isStringHeader(line))
                    and not hd.isHalfSentance(line)):
                for sen_obj in line_obj:
                    if len(line_obj) == 1 and sen_obj['x0'] < reference:
                        count += 1
                        if count >= 2:
                            for tr in sorted(mainTree):
                                mainList.append(tr.data['text'])
                            return mainList, len(reversed_list)
                        continue
                    if len(sen_obj['text'].strip()) > 0:
                        cols_start_y = sen_obj['top']
                        mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'], sen_obj))
        elif line_obj[0]['x0'] > reference or True:  # NOTE: `or True` makes this branch always taken
            if not hd.checkRegex(line):
                for sen_obj in line_obj:
                    if (sen_obj['x0'] > 0 and len(sen_obj['text'].strip()) > 0
                            and len(sen_obj['text'].strip().split()) < 10):
                        overlapInt = mainTree.overlap(sen_obj['x0'], sen_obj['x1'])
                        if len(overlapInt) > 0:
                            dataToAppend = None
                            for overLap in overlapInt:
                                dataToAppend = overLap
                            if (float(dataToAppend.data['top']) - float(sen_obj['bottom']) <= 7
                                    or sen_obj['underline_exists']):
                                if sen_obj['underline_exists']:
                                    merge_with_previous(dataToAppend, sen_obj)
                                else:
                                    previous_starts = {overLap.begin for overLap in overlapInt}
                                    previous_ends = {overLap.end for overLap in overlapInt}
                                    if (len(overlapInt) == 1
                                            or (len(previous_starts) == 1
                                                and len(previous_ends) == 1)):
                                        merge_with_previous(dataToAppend, sen_obj)
                                    else:
                                        # The original re-tested the same begin/end
                                        # uniqueness here; that test can never hold
                                        # in this branch, so only the flush path remains.
                                        count += 1
                                        if count >= 1:
                                            return flush_tree()
                            else:
                                count += 1
                                if count >= 1:
                                    return flush_tree()
                        else:
                            if (len(mainTree) < df_columns_len
                                    and (sen_obj['x0'] >= reference or len(line_obj) > 1)
                                    and (cols_start_y - sen_obj['bottom'] <= 7)):
                                mainTree.add(Interval(sen_obj['x0_or'], sen_obj['x1_or'],
                                                      sen_obj))
                            else:
                                count += 1
                                if count >= 1:
                                    return flush_tree()
            else:
                if len(mainTree) > 0 and (line_obj[0]['text'].isupper() or hd.checkRegex(line)):
                    break
        else:
            # Unreachable because of the `or True` above; kept from the original.
            if len(mainTree) > 0:
                count += 1
                if count >= 1:
                    break
    else:
        # The loop ran out of lines without hitting a break.
        if len(mainTree) > 0:
            return flush_tree()

    return flush_tree()
class ClassFunctionDropdown(Panel):
    """
    Class and Function/Method Dropdowns Widget.

    Parameters
    ----------
    editor : :class:`spyder.plugins.editor.widgets.codeeditor.CodeEditor`
        The editor to act on.
    """

    def __init__(self, editor):
        super(ClassFunctionDropdown, self).__init__(editor)

        # Internal data
        self._tree = IntervalTree()
        self._data = None
        self.classes = []
        self.funcs = []

        # Widgets
        self._editor = editor
        self.class_cb = QComboBox()
        self.method_cb = QComboBox()

        # Widget setup
        self.class_cb.addItem(_('<None>'), 0)
        self.method_cb.addItem(_('<None>'), 0)

        # The layout
        hbox = QHBoxLayout()
        hbox.addWidget(self.class_cb)
        hbox.addWidget(self.method_cb)
        hbox.setSpacing(0)
        hbox.setContentsMargins(0, 0, 0, 0)
        self.setLayout(hbox)

        # Signals
        self._editor.sig_cursor_position_changed.connect(
            self._handle_cursor_position_change_event)
        self.class_cb.activated.connect(self.combobox_activated)
        self.method_cb.activated.connect(self.combobox_activated)

    def _getVerticalSize(self):
        """Get the default height of a QComboBox."""
        return self.class_cb.height()

    @Slot(int, int)
    def _handle_cursor_position_change_event(self, linenum, column):
        self.update_selected(linenum)

    def sizeHint(self):
        """Override Qt method."""
        return QSize(0, self._getVerticalSize())

    def combobox_activated(self):
        """Move the cursor to the selected definition."""
        sender = self.sender()
        item = sender.itemData(sender.currentIndex())
        if item:
            line = item['location']['range']['start']['line'] + 1
            self.editor.go_to_line(line)
        if sender == self.class_cb:
            self.method_cb.setCurrentIndex(0)

    def update_selected(self, linenum):
        """Updates the dropdowns to reflect the current class and function."""
        possible_parents = list(sorted(self._tree[linenum]))
        for iv in possible_parents:
            item = iv.data
            kind = item.get('kind')

            if kind in [SymbolKind.CLASS]:
                # Update class combobox
                for idx in range(self.class_cb.count()):
                    if self.class_cb.itemData(idx) == item:
                        self.class_cb.setCurrentIndex(idx)
                        break
                else:
                    self.class_cb.setCurrentIndex(0)
            elif kind in [SymbolKind.FUNCTION, SymbolKind.METHOD]:
                # Update func combobox
                for idx in range(self.method_cb.count()):
                    if self.method_cb.itemData(idx) == item:
                        self.method_cb.setCurrentIndex(idx)
                        break
                else:
                    self.method_cb.setCurrentIndex(0)
            else:
                continue

        if len(possible_parents) == 0:
            self.class_cb.setCurrentIndex(0)
            self.method_cb.setCurrentIndex(0)

    def populate(self, combobox, data, add_parents=False):
        """
        Populate the given ``combobox`` with the class or function names.

        Parameters
        ----------
        combobox : :class:`qtpy.QtWidgets.QComboBox`
            The combobox to populate
        data : list of :class:`dict`
            The data to populate with. There should be one list element per
            class or function definition in the file.
        add_parents : bool
            Add parents to name to create a fully qualified name.

        Returns
        -------
        None
        """
        combobox.clear()
        combobox.addItem(_('<None>'), 0)
        model = combobox.model()
        item = model.item(0)
        item.setFlags(Qt.NoItemFlags)

        cb_data = []
        for item in data:
            fqn = item['name']

            # Create a list of fully-qualified names if requested
            if add_parents:
                begin = item['location']['range']['start']['line']
                end = item['location']['range']['end']['line']
                possible_parents = sorted(self._tree.overlap(begin, end),
                                          reverse=True)
                for iv in possible_parents:
                    if iv.begin == begin and iv.end == end:
                        continue

                    # Check if it is a real parent
                    p_item = iv.data
                    p_begin = p_item['location']['range']['start']['line']
                    p_end = p_item['location']['range']['end']['line']
                    if p_begin <= begin and p_end >= end:
                        fqn = p_item['name'] + "." + fqn

            cb_data.append((fqn, item))

        for fqn, item in cb_data:
            # Set the icon (See: editortools.py)
            icon = None
            name = item['name']
            if item['kind'] in [SymbolKind.CLASS]:
                icon = ima.icon('class')
            else:
                if name.startswith('__'):
                    icon = ima.icon('private2')
                elif name.startswith('_'):
                    icon = ima.icon('private1')
                else:
                    icon = ima.icon('method')

            # Add the combobox item
            if icon is not None:
                combobox.addItem(icon, fqn, item)
            else:
                combobox.addItem(fqn, item)

        line, column = self._editor.get_cursor_line_column()
        self.update_selected(line)

    def update_data(self, data):
        """Update and process symbol data."""
        if data == self._data:
            return

        self._data = data
        self._tree.clear()
        self.classes = []
        self.funcs = []

        for item in data:
            line_start = item['location']['range']['start']['line']
            line_end = item['location']['range']['end']['line']
            kind = item.get('kind')
            block = self._editor.document().findBlockByLineNumber(line_start)
            line_text = block.text() if block else ''

            # The symbol finder returns classes in import statements as well
            # so we filter them out
            if line_start != line_end and ' import ' not in line_text:
                self._tree[line_start:line_end] = item

                if kind in [SymbolKind.CLASS]:
                    self.classes.append(item)
                elif kind in [SymbolKind.FUNCTION, SymbolKind.METHOD]:
                    self.funcs.append(item)

        self.class_cb.clear()
        self.method_cb.clear()
        self.populate(self.class_cb, self.classes, add_parents=False)
        self.populate(self.method_cb, self.funcs, add_parents=True)
class OneKg():
    def __init__(self, anno_file, chrom, start, end):
        self.anno_file = anno_file
        self.tree = IntervalTree()
        try:
            for entry in self.anno_file.fetch(chrom, start, end):
                self.tree.addi(entry.start, entry.stop, entry)
        except Exception:
            pass
        self.tree_bts = list(self.tree.boundary_table.keys())
        # Everything past the end would need to be pumped through
        self.tree_bts.append(sys.maxsize)
        self.n_header = None

    def load_header(self, in_vcf):
        """
        Returns the header of the information we'll be adding to the annotated vcfs
        """
        ret = in_vcf.header.copy()
        ret.add_line((
            '##INFO=<ID=OKG_MSTART,Number=1,Type=Integer,Description="Mitochondrial '
            'start coordinate of inserted sequence">'))
        ret.add_line((
            '##INFO=<ID=OKG_MLEN,Number=1,Type=Integer,Description="Estimated length '
            'of mitochondrial insert">'))
        ret.add_line((
            '##INFO=<ID=OKG_MEND,Number=1,Type=Integer,Description="Mitochondrial end'
            ' coordinate of inserted sequence">'))
        ret.add_line((
            '##INFO=<ID=OKG_MEINFO,Number=4,Type=String,Description="Mobile element '
            'info of the form NAME,START,END<POLARITY; If there is only 5\' OR 3\' '
            'support for this call, will be NULL NULL for START and END">'))
        ret.add_line((
            '##INFO=<ID=OKG_AF,Number=.,Type=Float,Description="Estimated allele '
            'frequency in the range (0,1)">'))
        ret.add_line((
            '##INFO=<ID=OKG_EAS_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the EAS populations calculated from AC and AN, in the range (0,1)">'))
        ret.add_line((
            '##INFO=<ID=OKG_EUR_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the EUR populations calculated from AC and AN, in the range (0,1)">'))
        ret.add_line((
            '##INFO=<ID=OKG_AFR_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the AFR populations calculated from AC and AN, in the range (0,1)">'))
        ret.add_line((
            '##INFO=<ID=OKG_AMR_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the AMR populations calculated from AC and AN, in the range (0,1)">'))
        ret.add_line((
            '##INFO=<ID=OKG_SAS_AF,Number=.,Type=Float,Description="Allele frequency '
            'in the SAS populations calculated from AC and AN, in the range (0,1)">'))
        ret.add_line((
            '##INFO=<ID=OKG_SVTYPE,Number=1,Type=String,Description="OneThousandGenome'
            'ALT Type">'))
        self.n_header = ret

    def annotate(self, entry, refdist=500, size_min=50, size_max=50000):
        """
        Given a pyvcf variant entry, do the matching
        """
        # Biggest shortcut, only annotate SVs
        if "SVLEN" not in entry.info:
            return entry
        if entry.stop + refdist < self.tree_bts[0]:
            return entry
        if entry.start - refdist > self.tree_bts[0]:
            self.tree_bts.pop(0)
        if not (size_min <= abs(entry.info["SVLEN"]) <= size_max):
            return entry

        # Don't lookup until we have to
        m_type = None
        candidates = []
        for anno_entry in self.tree.overlap(entry.start - refdist, entry.stop + refdist):
            anno_entry = anno_entry.data
            a_size = truvari.get_vcf_entry_size(anno_entry)
            if not (size_min <= a_size <= size_max):
                continue
            ps, sd = truvari.entry_size_similarity(entry, anno_entry)
            if not ps >= 0.7:
                continue
            mat1 = sv_alt_match.match(anno_entry.alts[0])
            if mat1 is not None:
                a_type = mat1.groupdict()["SVTYPE"]
            else:
                a_type = truvari.get_vcf_variant_type(anno_entry)
            # Don't make until we have to, and only do so once
            if m_type is None:
                m_type = truvari.get_vcf_variant_type(entry)
            if not (a_type == m_type
                    or ((a_type == "CN0" or a_type.startswith("DEL")) and m_type == "DEL")
                    or (m_type == "INS" and a_type.startswith("INS"))):
                continue
            # RO doesn't work for INS?
            ro = truvari.entry_reciprocal_overlap(entry, anno_entry)
            if m_type != "INS" and ro < 0.5:
                continue
            candidates.append((ro, ps, anno_entry))
        if candidates:
            truvari.match_sorter(candidates)
            return self.add_info(truvari.copy_entry(entry, self.n_header),
                                 candidates[0][-1])
        return entry

    def extract_info(self, annot):
        """MSTART MLEN MEND MEINFO AF EAS_AF EUR_AF AFR_AF AMR_AF SAS_AF ALT"""
        def infoc(key):
            if key in annot.info:
                return key, annot.info[key]
            return None, None

        def altp():
            """reformat the alt seq"""
            ret = []
            for i in annot.alts:
                if i.startswith("<"):
                    ret.append(i[1:-1])
            return "SVTYPE", tuple(ret)

        return [
            infoc("MSTART"),
            infoc("MLEN"),
            infoc("MEND"),
            infoc("MEINFO"),
            infoc("AF"),
            infoc("EAS_AF"),
            infoc("EUR_AF"),
            infoc("AFR_AF"),
            infoc("AMR_AF"),
            infoc("SAS_AF"),
            altp(),
        ]

    def add_info(self, entry, annot):
        """
        Put the relevant info fields into the entry to be annotated
        """
        # Get the annotations out of the annot and add them to the entry
        if not annot:
            return entry
        for key, val in self.extract_info(annot):
            if val is not None:
                entry.info["OKG_" + key] = val
        return entry
class ActivityTracker:
    def __init__(self, pen_name, base_timestamp):
        self.interval_index = IntervalTree()
        self.pen_name = pen_name
        self.base_timestamp = base_timestamp
        self.current_activities = {'feeding': {}, 'drinking': {}}

    def update_activity(self, frame_id, activity_dict):
        for activity, ids in activity_dict.items():
            ## Iterate through all the pigs in the activity dict and
            ## set the starting frame for the id if the activity is not tracked
            for pig_id in ids:
                if pig_id not in self.current_activities[activity]:
                    self.current_activities[activity][pig_id] = frame_id
            ## For those IDs which were not seen in the current activity dict,
            ## complete and add their activity to the interval_index
            self.add_activity(
                set(self.current_activities[activity].keys()) - ids,
                activity, frame_id)

    def add_activity(self, completed_ids, activity, end_frame_id):
        for pig_id in completed_ids:
            start_frame_id = self.current_activities[activity].pop(pig_id, None)
            self.interval_index[start_frame_id:end_frame_id] = (activity, pig_id)

    def export_tracker(self, pigs, frame_id):
        ## First, add all current activities to the interval index
        for activity in self.current_activities.copy():
            self.add_activity(self.current_activities[activity].copy(),
                              activity, frame_id)
        base_dir = 'data/indices/'
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        with open(os.path.join(base_dir,
                               "Pen%s-%s.pkl" % (self.pen_name, self.base_timestamp)),
                  "wb") as f:
            pickle.dump(self.interval_index, f)
            pickle.dump(pigs, f)

    def import_tracker(self, filename):
        with open(filename, "rb") as f:
            self.interval_index = pickle.load(f)
            self.pigs = pickle.load(f)

    def query(self, q_activity, start_frame, end_frame):
        activities = [a.data for a in self.interval_index.overlap(start_frame, end_frame)]
        return [(activity, pig_id) for activity, pig_id in activities
                if activity == q_activity]
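# A minimal, made-up usage sketch for ActivityTracker (pen name, frames, and
# pig IDs invented). The activity_dict values must be sets, since
# update_activity takes a set difference against them.
tracker = ActivityTracker(pen_name="A", base_timestamp="20210101")
tracker.update_activity(0, {'feeding': {1, 2}, 'drinking': set()})    # pigs 1, 2 start feeding
tracker.update_activity(100, {'feeding': {1}, 'drinking': set()})     # pig 2 stopped by frame 100
tracker.update_activity(200, {'feeding': set(), 'drinking': set()})   # pig 1 stopped by frame 200
print(tracker.query('feeding', 50, 150))  # [('feeding', 2), ('feeding', 1)], in some order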
def get_indel_testing_candidates(dct):
    mapping = {'A': 0, 'G': 1, 'T': 2, 'C': 3, '-': 4}
    rev_base_map = {0: 'A', 1: 'G', 2: 'T', 3: 'C', 4: '-'}

    init_time = time.time()
    start_time = str(datetime.datetime.now())

    window_before, window_after = 0, 160
    if dct['seq'] == 'pacbio':
        window_after = 260

    chrom = dct['chrom']
    start = dct['start']
    end = dct['end']
    sam_path = dct['sam_path']
    fasta_path = dct['fasta_path']
    samfile = pysam.Samfile(sam_path, "rb")
    fastafile = pysam.FastaFile(fasta_path)

    window_size = dct['win_size']
    small_size = dct['small_win_size']
    mincov, maxcov = dct['mincov'], dct['maxcov']
    ins_t, del_t = dct['ins_t'], dct['del_t']

    include_intervals, exclude_intervals = None, None

    if dct['include_bed']:
        tbx = pysam.TabixFile(dct['include_bed'])
        include_intervals = IntervalTree(
            Interval(int(row[1]), int(row[2]), "%s" % (row[1]))
            for row in tbx.fetch(chrom, parser=pysam.asBed()))

        def in_bed(tree, pos):
            return tree.overlaps(pos)

        include_intervals = IntervalTree(include_intervals.overlap(start, end))
        if not include_intervals:
            return [], [], [], [], []
        else:
            start = max(start, min(x[0] for x in include_intervals))
            end = min(end, max(x[1] for x in include_intervals))
    else:
        def in_bed(tree, pos):
            return True

    if dct['exclude_bed']:
        tbx = pysam.TabixFile(dct['exclude_bed'])
        try:
            exclude_intervals = IntervalTree(
                Interval(int(row[1]), int(row[2]), "%s" % (row[1]))
                for row in tbx.fetch(chrom, parser=pysam.asBed()))

            def ex_bed(tree, pos):
                return tree.overlaps(pos)
        except ValueError:
            def ex_bed(tree, pos):
                return False
    else:
        def ex_bed(tree, pos):
            return False

    ref_dict = {
        j: s.upper() if s in 'AGTC' else ''
        for j, s in zip(range(max(1, start - 200), end + 400 + 1),
                        fastafile.fetch(chrom, max(1, start - 200) - 1, end + 400))
    }

    chrom_length = fastafile.get_reference_length(chrom)

    hap_dict = {1: [], 2: []}
    for pread in samfile.fetch(chrom, max(0, dct['start'] - 100000), dct['end'] + 1000):
        if pread.has_tag('HP'):
            hap_dict[pread.get_tag('HP')].append(pread.qname)

    hap_reads_0 = set(hap_dict[1])
    hap_reads_1 = set(hap_dict[2])

    if dct['supplementary']:
        flag = 0x4 | 0x100 | 0x200 | 0x400
    else:
        flag = 0x4 | 0x100 | 0x200 | 0x400 | 0x800

    output_pos, output_data_0, output_data_1, output_data_total, alleles = [], [], [], [], []

    del_queue_0 = collections.deque(window_size * [set()], window_size)
    del_queue_1 = collections.deque(window_size * [set()], window_size)
    ins_queue_0 = collections.deque(window_size * [set()], window_size)
    ins_queue_1 = collections.deque(window_size * [set()], window_size)
    position_queue = collections.deque(window_size * [{}], window_size)

    del_queue_small_0 = collections.deque(small_size * [set()], small_size)
    del_queue_small_1 = collections.deque(small_size * [set()], small_size)
    ins_queue_small_0 = collections.deque(small_size * [set()], small_size)
    ins_queue_small_1 = collections.deque(small_size * [set()], small_size)
    position_queue_small = collections.deque(small_size * [{}], small_size)

    variants = {}
    extra_variants = {}

    max_range = {0: max(10, window_size), 1: 10}

    count = 0
    prev = 0
    for pcol in samfile.pileup(chrom, max(0, start - 1), end, min_base_quality=0,
                               flag_filter=flag, truncate=True):
        v_pos = pcol.pos + 1
        if in_bed(include_intervals, v_pos) and not ex_bed(exclude_intervals, v_pos):
            read_names = pcol.get_query_names()
            read_names_0 = set(read_names) & hap_reads_0
            read_names_1 = set(read_names) & hap_reads_1

            len_seq_tot = len(read_names)
            len_seq_0 = len(read_names_0)
            len_seq_1 = len(read_names_1)

            seqs = pcol.get_query_sequences(mark_matches=False, mark_ends=False,
                                            add_indels=True)

            def indel_reads(symbol, hap_reads, size_test):
                # Reads in `hap_reads` whose pileup string carries an indel of
                # the given kind ('+' insertion, '-' deletion) whose
                # digit-encoded length passes size_test. The original spelled
                # these out as eight separate set comprehensions.
                return {n for n, s in zip(read_names, seqs)
                        if symbol in s and n in hap_reads
                        and size_test(int(''.join(filter(str.isdigit, s))))}

            ins_set_0 = indel_reads('+', read_names_0, lambda l: l > 2)
            ins_set_small_0 = indel_reads('+', read_names_0, lambda l: l <= 10)
            del_set_0 = indel_reads('-', read_names_0, lambda l: l > 2)
            del_set_small_0 = indel_reads('-', read_names_0, lambda l: l <= 10)
            ins_set_1 = indel_reads('+', read_names_1, lambda l: l > 2)
            ins_set_small_1 = indel_reads('+', read_names_1, lambda l: l <= 10)
            del_set_1 = indel_reads('-', read_names_1, lambda l: l > 2)
            del_set_small_1 = indel_reads('-', read_names_1, lambda l: l <= 10)

            del_queue_0.append(del_set_0)
            del_queue_1.append(del_set_1)
            ins_queue_0.append(ins_set_0)
            ins_queue_1.append(ins_set_1)

            del_queue_small_0.append(del_set_small_0)
            del_queue_small_1.append(del_set_small_1)
            ins_queue_small_0.append(ins_set_small_0)
            ins_queue_small_1.append(ins_set_small_1)

            if v_pos <= prev:
                continue

            if len_seq_0 >= mincov and len_seq_1 >= mincov:
                del_freq_0 = len(set.union(*del_queue_0)) / len_seq_0 if len_seq_0 > 0 else 0
                ins_freq_0 = len(set.union(*ins_queue_0)) / len_seq_0 if len_seq_0 > 0 else 0
                del_freq_1 = len(set.union(*del_queue_1)) / len_seq_1 if len_seq_1 > 0 else 0
                ins_freq_1 = len(set.union(*ins_queue_1)) / len_seq_1 if len_seq_1 > 0 else 0

                del_freq_small_0 = len(set.union(*del_queue_small_0)) / len_seq_0 if len_seq_0 > 0 else 0
                ins_freq_small_0 = len(set.union(*ins_queue_small_0)) / len_seq_0 if len_seq_0 > 0 else 0
                del_freq_small_1 = len(set.union(*del_queue_small_1)) / len_seq_1 if len_seq_1 > 0 else 0
                ins_freq_small_1 = len(set.union(*ins_queue_small_1)) / len_seq_1 if len_seq_1 > 0 else 0

                if (max([del_freq_0, del_freq_1]) >= del_t
                        or max([ins_freq_0, ins_freq_1]) >= ins_t):
                    prev = v_pos + window_size
                    variants[max(1, v_pos - window_size)] = 0
                    count += 1
                elif (max([del_freq_small_0, del_freq_small_1]) >= del_t
                      or max([ins_freq_small_0, ins_freq_small_1]) >= ins_t
                      or (del_freq_small_0 + ins_freq_small_0) >= 0.9
                      or (del_freq_small_1 + ins_freq_small_1) >= 0.9):
                    prev = v_pos + 10
                    variants[max(1, v_pos - 10)] = 1
                    count += 1

            elif dct['impute_indel_phase'] and len_seq_tot >= 2 * mincov:
                seq_v2 = [x.upper() for x in seqs]
                seq = [x[:2] for x in seq_v2]
                seq_tot = ''.join(seq)

                del_freq_tot = (seq_tot.count('-') + seq_tot.count('*')) / len_seq_tot if len_seq_tot > 0 else 0
                ins_freq_tot = seq_tot.count('+') / len_seq_tot if len_seq_tot > 0 else 0

                if del_t <= del_freq_tot or ins_t <= ins_freq_tot:
                    groups = {}
                    for s, n in zip(seq_v2, read_names):
                        if s not in groups:
                            groups[s] = []
                        groups[s].append(n)
                    counts = sorted([(x, len(groups[x])) for x in groups],
                                    key=lambda x: x[1], reverse=True)
                    if counts[0][1] <= 0.8 * len_seq_tot:
                        read_names_0 = set(groups[counts[0][0]])
                        read_names_1 = (set(groups[counts[1][0]])
                                        if counts[1][1] >= mincov
                                        else set(read_names) - read_names_0)
                    else:
                        read_names_0 = groups[counts[0][0]][:counts[0][1] // 2]
                        read_names_1 = groups[counts[0][0]][counts[0][1] // 2:]

                    if len(read_names_0) >= mincov and len(read_names_1) >= mincov:
                        prev = v_pos + 10
                        variants[max(1, v_pos - 10)] = 1
                        extra_variants[max(1, v_pos - 10)] = (read_names_0, read_names_1)
                        count += 1

    for pcol in samfile.pileup(chrom, max(0, start - 10 - window_size), end,
                               min_base_quality=0, flag_filter=flag, truncate=True):
        v_pos = pcol.pos + 1
        if v_pos in extra_variants:
            read_names = pcol.get_query_names()
            read_names_0, read_names_1 = extra_variants[v_pos]
        elif v_pos in variants:
            read_names = pcol.get_query_names()
            read_names_0 = set(read_names) & hap_reads_0
            read_names_1 = set(read_names) & hap_reads_1
        else:
            continue

        d = {'hap0': {}, 'hap1': {}}
        d_tot = {}

        ref = ''.join([ref_dict[p]
                       for p in range(v_pos - window_before,
                                      min(chrom_length, v_pos + window_after + 1))])

        for pread in pcol.pileups:
            dt = pread.alignment.query_sequence[
                max(0, pread.query_position_or_next - window_before):
                pread.query_position_or_next + window_after]
            d_tot[pread.alignment.qname] = dt
            if pread.alignment.qname in read_names_0:
                d['hap0'][pread.alignment.qname] = dt
            elif pread.alignment.qname in read_names_1:
                d['hap1'][pread.alignment.qname] = dt

        seq_list = d['hap0']
        flag0, _, data_0, alt_0, ref_seq_0 = msa(seq_list, ref, v_pos, 2, dct['maxcov'])

        seq_list = d['hap1']
        flag1, _, data_1, alt_1, ref_seq_1 = msa(seq_list, ref, v_pos, 2, dct['maxcov'])

        seq_list = d_tot
        flag_total, indel_flag_total, data_total, alt_total, ref_seq_total = msa(
            seq_list, ref, v_pos, dct['mincov'], dct['maxcov'])

        if flag0 and flag1 and flag_total:
            output_pos.append(v_pos)
            output_data_0.append(data_0)
            output_data_1.append(data_1)
            output_data_total.append(data_total)
            tp = max_range[variants[v_pos]]
            alleles.append([
                allele_prediction(alt_0, ref_seq_0, max_range[variants[v_pos]]),
                allele_prediction(alt_1, ref_seq_1, max_range[variants[v_pos]]),
                allele_prediction(alt_total, ref_seq_total, max_range[variants[v_pos]]),
            ])

    if len(output_pos) == 0:
        return (output_pos, output_data_0, output_data_1, output_data_total, alleles)

    output_pos = np.array(output_pos)
    output_data_0 = np.array(output_data_0)
    output_data_1 = np.array(output_data_1)
    output_data_total = np.array(output_data_total)

    return (output_pos, output_data_0, output_data_1, output_data_total, alleles)
class DateIntervalTree:
    """A slight adaptation of the intervaltree library to support python dates

    The intervaltree data structure stores integer ranges, fundamentally.
    Therefore, if we want to store dates, we must first convert them to
    integers in a way that preserves inequalities. Luckily, the toordinal()
    function on datetime.date satisfies this requirement.

    It's important to note that this interval tree structure is, unless
    otherwise noted, inclusive of lower bounds and exclusive of upper bounds.
    That is to say, an interval from A to B includes the value A and excludes
    the value B.
    """

    def __init__(self):
        self.tree = IntervalTree()

    @staticmethod
    def to_date_interval(begin: date, end: date, data: Any) -> Interval:
        """Convert a date interval (and associated data, if any) into an ordinal interval"""
        return Interval(begin.toordinal(), end.toordinal(), data)

    @staticmethod
    def from_date_interval(ival: Interval) -> Interval:
        """Convert an ordinal interval to a date interval"""
        return Interval(date.fromordinal(ival.begin), date.fromordinal(ival.end), ival.data)

    def add(self, begin: date, end: date, data: Any = None):
        """Add a date interval to the interval tree, along with any associated data"""
        self.tree.add(DateIntervalTree.to_date_interval(begin, end, data))

    def merge_overlaps(self, reducer: Callable = None, strict: bool = True):
        """Merge overlapping date intervals in the tree.

        A reducer function can be specified to determine how data elements are
        combined for overlapping intervals.

        The strict argument determines whether "kissing" intervals are merged.
        If true (the default), only "strictly" overlapping intervals are
        merged; otherwise adjacent intervals will also be merged. See the
        intervaltree library documentation for the merge_overlaps function for
        a more complete description.
        """
        self.tree.merge_overlaps(data_reducer=reducer, strict=strict)

    def intervals(self) -> List[Interval]:
        """Return all date intervals in this tree"""
        # Note we convert from ordinal values to actual date objects
        return [DateIntervalTree.from_date_interval(ival) for ival in self.tree.items()]

    def overlaps(self, begin: date, end: date, strict: bool = True) -> bool:
        """Determine whether the given date interval overlaps with any interval in the tree.

        According to intervaltree, intervals include the lower bound but not
        the upper bound: 2015-07-23 - 2015-08-21 does not overlap
        2015-08-21 - 2015-09-21. If strict is false, add a day to the end date
        to return True for single day overlaps.
        """
        if strict:
            ival = DateIntervalTree.to_date_interval(begin, end, None)
        else:
            ival = DateIntervalTree.to_date_interval(begin, end + timedelta(days=1), None)
        return self.tree.overlaps(ival.begin, ival.end)

    def range_query(self, begin: date, end: date) -> List[Interval]:
        """Return all intervals in the tree that strictly overlap with the given interval"""
        ival = DateIntervalTree.to_date_interval(begin, end, None)
        return [DateIntervalTree.from_date_interval(ival)
                for ival in self.tree.overlap(ival.begin, ival.end)]

    def point_query(self, point: date) -> List[Interval]:
        return [DateIntervalTree.from_date_interval(ival)
                for ival in self.tree.at(point.toordinal())]

    @staticmethod
    def shift_endpoints(date_tree: "DateIntervalTree") -> "DateIntervalTree":
        """Produce a new tree where adjacent intervals are guaranteed to not match at a
        boundary, by shifting the end dates of touching intervals

        E.g., the intervals (1/1/2000, 1/10/2000), (1/10/2000, 1/20/2000) become
        (1/1/2000, 1/9/2000), (1/10/2000, 1/20/2000)
                       ^-- A day was subtracted here to avoid matching exactly
                           with the next interval

        Loop earliest -> latest, adjusting end date.
        """
        adjusted = DateIntervalTree()
        work_list = deque(sorted(date_tree.intervals()))
        while work_list:
            cur_ival = work_list.popleft()
            if work_list:
                next_ival = work_list[0]
                if cur_ival.end == next_ival.begin:
                    cur_ival = Interval(cur_ival.begin,
                                        cur_ival.end - timedelta(days=1),
                                        cur_ival.data)
            adjusted.add(cur_ival.begin, cur_ival.end, cur_ival.data)
        return adjusted

    @staticmethod
    def shift_endpoints_start(date_tree: "DateIntervalTree") -> "DateIntervalTree":
        """Produce a new tree where adjacent intervals are guaranteed to not match at a
        boundary, by shifting the start dates of touching intervals

        E.g., the intervals (1/1/2000, 1/10/2000), (1/10/2000, 1/20/2000) become
        (1/1/2000, 1/10/2000), (1/11/2000, 1/20/2000)
                        ^-- A day was added here to avoid matching exactly with
                            the previous interval

        Loop latest -> earliest, adjusting start date.
        """
        adjusted = DateIntervalTree()
        work_list = deque(sorted(date_tree.intervals(), reverse=True))
        while work_list:
            cur_ival = work_list.popleft()
            if work_list:
                next_ival = work_list[0]
                if cur_ival.begin == next_ival.end:
                    log.debug("adjusting start of billing period: %s-%s",
                              cur_ival.begin, cur_ival.end)
                    cur_ival = Interval(cur_ival.begin + timedelta(days=1),
                                        cur_ival.end, cur_ival.data)
            adjusted.add(cur_ival.begin, cur_ival.end, cur_ival.data)
        return adjusted

    @staticmethod
    def shift_endpoints_end(date_tree: "DateIntervalTree") -> "DateIntervalTree":
        """Produce a new tree where adjacent intervals are guaranteed to not match at a
        boundary, by shifting the end dates of touching intervals

        E.g., the intervals (1/1/2000, 1/10/2000), (1/10/2000, 1/20/2000) become
        (1/1/2000, 1/9/2000), (1/10/2000, 1/20/2000)
                       ^-- A day was subtracted here to avoid matching exactly
                           with the next interval

        Loop latest -> earliest, adjusting end date.
        """
        adjusted = DateIntervalTree()
        work_list = deque(sorted(date_tree.intervals(), reverse=True))
        prev_ival = None
        while work_list:
            cur_ival = work_list.popleft()
            if prev_ival:
                while cur_ival.end >= prev_ival.begin:
                    new_start, new_end = (cur_ival.begin,
                                          cur_ival.end - timedelta(days=1))
                    if new_start == new_end:
                        # If the new interval is one day long, shift the start
                        # date back one day too.
                        new_start = new_start - timedelta(days=1)
                    cur_ival = Interval(new_start, new_end, cur_ival.data)
            prev_ival = cur_ival
            adjusted.add(cur_ival.begin, cur_ival.end, cur_ival.data)
        return adjusted
class MemoryCache(object):
    def __init__(self, context):
        self._context = context
        self._run_token = -1
        self._log = logging.getLogger('memcache')
        self._reset_cache()

    def _reset_cache(self):
        self._cache = IntervalTree()
        self._metrics = CacheMetrics()

    ##
    # @brief Invalidates the cache if appropriate.
    def _check_cache(self):
        if self._context.core.is_running():
            self._log.debug("core is running; invalidating cache")
            self._reset_cache()
        elif self._run_token != self._context.core.run_token:
            self._dump_metrics()
            self._log.debug("out of date run token; invalidating cache")
            self._reset_cache()
            self._run_token = self._context.core.run_token

    ##
    # @brief Splits a memory address range into cached and uncached subranges.
    # @return Returns a 2-tuple with the first element being a set of Interval objects for each
    #   of the cached subranges. The second element is a set of Interval objects for each of the
    #   non-cached subranges.
    def _get_ranges(self, addr, count):
        cached = self._cache.overlap(addr, addr + count)
        uncached = {Interval(addr, addr + count)}
        for cachedIv in cached:
            newUncachedSet = set()
            for uncachedIv in uncached:
                # No overlap.
                if cachedIv.end < uncachedIv.begin or cachedIv.begin > uncachedIv.end:
                    newUncachedSet.add(uncachedIv)
                    continue
                # Begin segment.
                if cachedIv.begin - uncachedIv.begin > 0:
                    newUncachedSet.add(Interval(uncachedIv.begin, cachedIv.begin))
                # End segment.
                if uncachedIv.end - cachedIv.end > 0:
                    newUncachedSet.add(Interval(cachedIv.end, uncachedIv.end))
            uncached = newUncachedSet
        return cached, uncached

    ##
    # @brief Reads uncached memory ranges and updates the cache.
    # @return A list of Interval objects is returned. Each Interval has its @a data attribute set
    #   to a bytearray of the data read from target memory.
    def _read_uncached(self, uncached):
        uncachedData = []
        for uncachedIv in uncached:
            data = self._context.read_memory_block8(uncachedIv.begin,
                                                    uncachedIv.end - uncachedIv.begin)
            iv = Interval(uncachedIv.begin, uncachedIv.end, bytearray(data))
            self._cache.add(iv)  # TODO merge contiguous cached intervals
            uncachedData.append(iv)
        return uncachedData

    def _update_metrics(self, cached, uncached, addr, size):
        cachedSize = 0
        for iv in cached:
            begin = iv.begin
            end = iv.end
            if iv.begin < addr:
                begin = addr
            if iv.end > addr + size:
                end = addr + size
            cachedSize += end - begin

        uncachedSize = sum((iv.end - iv.begin) for iv in uncached)

        self._metrics.reads += 1
        self._metrics.hits += cachedSize
        self._metrics.misses += uncachedSize

    def _dump_metrics(self):
        if self._metrics.total > 0:
            self._log.debug("%d reads, %d bytes [%d%% hits, %d bytes]; %d bytes written",
                            self._metrics.reads, self._metrics.total,
                            self._metrics.percent_hit, self._metrics.hits,
                            self._metrics.writes)
        else:
            self._log.debug("no reads")

    ##
    # @brief Performs a cached read operation of an address range.
    # @return A list of Interval objects sorted by address.
    def _read(self, addr, size):
        # Get the cached and uncached subranges of the requested read.
        cached, uncached = self._get_ranges(addr, size)
        self._update_metrics(cached, uncached, addr, size)

        # Read any uncached ranges.
        uncachedData = self._read_uncached(uncached)

        # Merge cached with data we just read.
        combined = list(cached) + uncachedData
        combined.sort(key=lambda x: x.begin)
        return combined

    ##
    # @brief Extracts data from the intersection of an address range across a list of interval objects.
    #
    # The range represented by @a addr and @a size are assumed to overlap the intervals. The first
    # and last interval in the list may have ragged edges not fully contained in the address range, in
    # which case the correct slice of those intervals is extracted.
    #
    # @param self
    # @param combined List of Interval objects forming a contiguous range. The @a data attribute of
    #   each interval must be a bytearray.
    # @param addr Start address. Must be within the range of the first interval.
    # @param size Number of bytes. (@a addr + @a size) must be within the range of the last interval.
    # @return A single bytearray object with all data from the intervals that intersects the address
    #   range.
    def _merge_data(self, combined, addr, size):
        result = bytearray()
        resultAppend = bytearray()

        # Check for fully contained subrange.
        if len(combined) and combined[0].begin < addr and combined[0].end > addr + size:
            offset = addr - combined[0].begin
            endOffset = offset + size
            result = combined[0].data[offset:endOffset]
            return result

        # Take slice of leading ragged edge.
        if len(combined) and combined[0].begin < addr:
            offset = addr - combined[0].begin
            result += combined[0].data[offset:]
            combined = combined[1:]
        # Take slice of trailing ragged edge.
        if len(combined) and combined[-1].end > addr + size:
            offset = addr + size - combined[-1].begin
            resultAppend = combined[-1].data[:offset]
            combined = combined[:-1]

        # Merge.
        for iv in combined:
            result += iv.data
        result += resultAppend

        return result

    ##
    # @brief Replaces a contiguous run of cached intervals with newly written data.
    def _update_contiguous(self, cached, addr, value):
        size = len(value)
        end = addr + size
        leadBegin = addr
        leadData = bytearray()
        trailData = bytearray()
        trailEnd = end

        if cached[0].begin < addr and cached[0].end > addr:
            offset = addr - cached[0].begin
            leadData = cached[0].data[:offset]
            leadBegin = cached[0].begin
        if cached[-1].begin < end and cached[-1].end > end:
            offset = end - cached[-1].begin
            trailData = cached[-1].data[offset:]
            trailEnd = cached[-1].end

        self._cache.remove_overlap(addr, end)

        data = leadData + value + trailData
        self._cache.addi(leadBegin, trailEnd, data)

    ##
    # @return A bool indicating whether the given address range is fully contained within
    #   one known memory region, and that region is cacheable.
    # @exception MemoryAccessError Raised if the access is not entirely contained within a single region.
    def _check_regions(self, addr, count):
        regions = self._context.core.memory_map.get_intersecting_regions(addr, length=count)

        # If no regions matched, then allow an uncached operation.
        if len(regions) == 0:
            return False

        # Raise if not fully contained within one region.
        if len(regions) > 1 or not regions[0].contains_range(addr, length=count):
            raise MemoryAccessError(
                "individual memory accesses must not cross memory region boundaries")

        # Otherwise return whether the region is cacheable.
        return regions[0].is_cacheable

    def read_memory(self, addr, transfer_size=32, now=True):
        # TODO use more optimal underlying read_memory call
        if transfer_size == 8:
            data = self.read_memory_block8(addr, 1)[0]
        elif transfer_size == 16:
            data = conversion.byte_list_to_u16le_list(self.read_memory_block8(addr, 2))[0]
        elif transfer_size == 32:
            data = conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, 4))[0]

        if now:
            return data
        else:
            def read_cb():
                return data
            return read_cb

    def read_memory_block8(self, addr, size):
        if size <= 0:
            return []

        self._check_cache()

        # Validate memory regions.
        if not self._check_regions(addr, size):
            self._log.debug("range [%x:%x] is not cacheable", addr, addr + size)
            return self._context.read_memory_block8(addr, size)

        # Get the cached and uncached subranges of the requested read.
        combined = self._read(addr, size)

        # Extract data out of combined intervals.
        result = list(self._merge_data(combined, addr, size))
        assert len(result) == size, "result size ({}) != requested size ({})".format(
            len(result), size)
        return result

    def read_memory_block32(self, addr, size):
        return conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, size * 4))

    def write_memory(self, addr, value, transfer_size=32):
        if transfer_size == 8:
            return self.write_memory_block8(addr, [value])
        elif transfer_size == 16:
            return self.write_memory_block8(addr, conversion.u16le_list_to_byte_list([value]))
        elif transfer_size == 32:
            return self.write_memory_block8(addr, conversion.u32le_list_to_byte_list([value]))

    def write_memory_block8(self, addr, value):
        if len(value) <= 0:
            return

        self._check_cache()

        # Validate memory regions.
        cacheable = self._check_regions(addr, len(value))

        # Write to the target first, so if it fails we don't update the cache.
        result = self._context.write_memory_block8(addr, value)

        if cacheable:
            size = len(value)
            end = addr + size
            cached = sorted(self._cache.overlap(addr, end), key=lambda x: x.begin)
            self._metrics.writes += size

            if len(cached):
                # Write data is entirely within a single cached interval.
                if addr >= cached[0].begin and end <= cached[0].end:
                    beginOffset = addr - cached[0].begin
                    endOffset = beginOffset + size
                    cached[0].data[beginOffset:endOffset] = value
                else:
                    self._update_contiguous(cached, addr, bytearray(value))
            else:
                # No cached data in this range, so just add the entire interval.
                self._cache.addi(addr, end, bytearray(value))

        return result

    def write_memory_block32(self, addr, data):
        return self.write_memory_block8(addr, conversion.u32le_list_to_byte_list(data))

    def invalidate(self):
        self._reset_cache()
class MemoryCache(object):
    """! @brief Memory cache.

    Maintains a cache of target memory. The constructor is passed a backing DebugContext object
    that will be used to fill the cache.

    The cache is invalidated whenever the target has run since the last cache operation (based on
    run tokens). If the target is currently running, all accesses cause the cache to be
    invalidated.

    The target's memory map is referenced. All memory accesses must be fully contained within a
    single memory region, or a MemoryAccessError will be raised. However, if an access is outside
    of all regions, the access is passed to the underlying context unmodified. When an access is
    within a region, that region's cacheability flag is honoured.
    """

    def __init__(self, context, core):
        self._context = context
        self._core = core
        self._run_token = -1
        self._log = LOG.getChild('memcache')
        self._reset_cache()

    def _reset_cache(self):
        self._cache = IntervalTree()
        self._metrics = CacheMetrics()

    def _check_cache(self):
        """! @brief Invalidates the cache if appropriate."""
        if self._core.is_running():
            self._log.debug("core is running; invalidating cache")
            self._reset_cache()
        elif self._run_token != self._core.run_token:
            self._dump_metrics()
            self._log.debug("out of date run token; invalidating cache")
            self._reset_cache()
            self._run_token = self._core.run_token

    def _get_ranges(self, addr, count):
        """! @brief Splits a memory address range into cached and uncached subranges.

        @return Returns a 2-tuple with the first element being a set of Interval objects for each
          of the cached subranges. The second element is a set of Interval objects for each of the
          non-cached subranges.
        """
        cached = self._cache.overlap(addr, addr + count)
        uncached = {Interval(addr, addr + count)}
        for cachedIv in cached:
            newUncachedSet = set()
            for uncachedIv in uncached:
                # No overlap.
                if cachedIv.end < uncachedIv.begin or cachedIv.begin > uncachedIv.end:
                    newUncachedSet.add(uncachedIv)
                    continue

                # Begin segment.
                if cachedIv.begin - uncachedIv.begin > 0:
                    newUncachedSet.add(Interval(uncachedIv.begin, cachedIv.begin))

                # End segment.
                if uncachedIv.end - cachedIv.end > 0:
                    newUncachedSet.add(Interval(cachedIv.end, uncachedIv.end))
            uncached = newUncachedSet
        return cached, uncached

    def _read_uncached(self, uncached):
        """! @brief Reads uncached memory ranges and updates the cache.

        @return A list of Interval objects is returned. Each Interval has its @a data attribute
          set to a bytearray of the data read from target memory.
        """
        uncachedData = []
        for uncachedIv in uncached:
            data = self._context.read_memory_block8(uncachedIv.begin,
                                                    uncachedIv.end - uncachedIv.begin)
            iv = Interval(uncachedIv.begin, uncachedIv.end, bytearray(data))
            self._cache.add(iv)  # TODO merge contiguous cached intervals
            uncachedData.append(iv)
        return uncachedData

    def _update_metrics(self, cached, uncached, addr, size):
        cachedSize = 0
        for iv in cached:
            begin = iv.begin
            end = iv.end
            if iv.begin < addr:
                begin = addr
            if iv.end > addr + size:
                end = addr + size
            cachedSize += end - begin

        uncachedSize = sum((iv.end - iv.begin) for iv in uncached)

        self._metrics.reads += 1
        self._metrics.hits += cachedSize
        self._metrics.misses += uncachedSize

    def _dump_metrics(self):
        if self._metrics.total > 0:
            self._log.debug("%d reads, %d bytes [%d%% hits, %d bytes]; %d bytes written",
                            self._metrics.reads, self._metrics.total,
                            self._metrics.percent_hit, self._metrics.hits,
                            self._metrics.writes)
        else:
            self._log.debug("no reads")

    def _read(self, addr, size):
        """! @brief Performs a cached read operation of an address range.

        @return A list of Interval objects sorted by address.
        """
        # Get the cached and uncached subranges of the requested read.
        cached, uncached = self._get_ranges(addr, size)
        self._update_metrics(cached, uncached, addr, size)

        # Read any uncached ranges.
        uncachedData = self._read_uncached(uncached)

        # Merge the cached data with the data we just read.
        combined = list(cached) + uncachedData
        combined.sort(key=lambda x: x.begin)
        return combined

    def _merge_data(self, combined, addr, size):
        """! @brief Extracts data from the intersection of an address range across a list of
          interval objects.

        The range represented by @a addr and @a size is assumed to overlap the intervals. The
        first and last interval in the list may have ragged edges not fully contained in the
        address range, in which case the correct slice of those intervals is extracted.

        @param self
        @param combined List of Interval objects forming a contiguous range. The @a data attribute
          of each interval must be a bytearray.
        @param addr Start address. Must be within the range of the first interval.
        @param size Number of bytes. (@a addr + @a size) must be within the range of the last
          interval.
        @return A single bytearray object with all data from the intervals that intersects the
          address range.
        """
        result = bytearray()
        resultAppend = bytearray()

        # Check for a fully contained subrange.
        if len(combined) and combined[0].begin < addr and combined[0].end > addr + size:
            offset = addr - combined[0].begin
            endOffset = offset + size
            result = combined[0].data[offset:endOffset]
            return result

        # Take the slice of the leading ragged edge.
        if len(combined) and combined[0].begin < addr:
            offset = addr - combined[0].begin
            result += combined[0].data[offset:]
            combined = combined[1:]
        # Take the slice of the trailing ragged edge.
        if len(combined) and combined[-1].end > addr + size:
            offset = addr + size - combined[-1].begin
            resultAppend = combined[-1].data[:offset]
            combined = combined[:-1]

        # Merge.
        for iv in combined:
            result += iv.data
        result += resultAppend

        return result

    def _update_contiguous(self, cached, addr, value):
        size = len(value)
        end = addr + size
        leadBegin = addr
        leadData = bytearray()
        trailData = bytearray()
        trailEnd = end

        if cached[0].begin < addr and cached[0].end > addr:
            offset = addr - cached[0].begin
            leadData = cached[0].data[:offset]
            leadBegin = cached[0].begin
        if cached[-1].begin < end and cached[-1].end > end:
            offset = end - cached[-1].begin
            trailData = cached[-1].data[offset:]
            trailEnd = cached[-1].end

        self._cache.remove_overlap(addr, end)

        data = leadData + value + trailData
        self._cache.addi(leadBegin, trailEnd, data)

    def _check_regions(self, addr, count):
        """! @return A bool indicating whether the given address range is fully contained within
          one known memory region, and that region is cacheable.
        @exception MemoryAccessError Raised if the access is not entirely contained within a
          single region.
        """
        regions = self._core.memory_map.get_intersecting_regions(addr, length=count)

        # If no regions matched, then allow an uncached operation.
        if len(regions) == 0:
            return False

        # Raise if not fully contained within one region.
        if len(regions) > 1 or not regions[0].contains_range(addr, length=count):
            raise MemoryAccessError("individual memory accesses must not cross memory region boundaries")

        # Otherwise return whether the region is cacheable.
        return regions[0].is_cacheable

    def read_memory(self, addr, transfer_size=32, now=True):
        # TODO use more optimal underlying read_memory call
        if transfer_size == 8:
            data = self.read_memory_block8(addr, 1)[0]
        elif transfer_size == 16:
            data = conversion.byte_list_to_u16le_list(self.read_memory_block8(addr, 2))[0]
        elif transfer_size == 32:
            data = conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, 4))[0]
        else:
            raise ValueError("invalid transfer_size (%d)" % transfer_size)

        if now:
            return data
        else:
            def read_cb():
                return data
            return read_cb

    def read_memory_block8(self, addr, size):
        if size <= 0:
            return []

        self._check_cache()

        # Validate memory regions.
        if not self._check_regions(addr, size):
            self._log.debug("range [%x:%x] is not cacheable", addr, addr + size)
            return self._context.read_memory_block8(addr, size)

        # Get the cached and uncached subranges of the requested read.
        combined = self._read(addr, size)

        # Extract data out of the combined intervals.
        result = list(self._merge_data(combined, addr, size))
        assert len(result) == size, "result size ({}) != requested size ({})".format(len(result), size)
        return result

    def read_memory_block32(self, addr, size):
        return conversion.byte_list_to_u32le_list(self.read_memory_block8(addr, size * 4))

    def write_memory(self, addr, value, transfer_size=32):
        if transfer_size == 8:
            return self.write_memory_block8(addr, [value])
        elif transfer_size == 16:
            return self.write_memory_block8(addr, conversion.u16le_list_to_byte_list([value]))
        elif transfer_size == 32:
            return self.write_memory_block8(addr, conversion.u32le_list_to_byte_list([value]))

    def write_memory_block8(self, addr, value):
        if len(value) <= 0:
            return

        self._check_cache()

        # Validate memory regions.
        cacheable = self._check_regions(addr, len(value))

        # Write to the target first, so if it fails we don't update the cache.
        result = self._context.write_memory_block8(addr, value)

        if cacheable:
            size = len(value)
            end = addr + size
            cached = sorted(self._cache.overlap(addr, end), key=lambda x: x.begin)
            self._metrics.writes += size

            if len(cached):
                # Write data is entirely within a single cached interval.
                if addr >= cached[0].begin and end <= cached[0].end:
                    beginOffset = addr - cached[0].begin
                    endOffset = beginOffset + size
                    cached[0].data[beginOffset:endOffset] = value
                else:
                    self._update_contiguous(cached, addr, bytearray(value))
            else:
                # No cached data in this range, so just add the entire interval.
                self._cache.addi(addr, end, bytearray(value))

        return result

    def write_memory_block32(self, addr, data):
        return self.write_memory_block8(addr, conversion.u32le_list_to_byte_list(data))

    def invalidate(self):
        self._reset_cache()
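# A minimal standalone sketch of the subrange-splitting idea used by MemoryCache._get_ranges()
# above: given a requested address range, split it into cached hits and uncached gaps. The
# helper name split_ranges and all addresses/contents below are invented for illustration; it
# only needs the intervaltree package, not the surrounding debugger classes.
from intervaltree import Interval, IntervalTree

def split_ranges(cache, addr, count):
    # Returns (cached hits, uncached gaps) for the half-open range [addr, addr + count).
    cached = cache.overlap(addr, addr + count)
    uncached = {Interval(addr, addr + count)}
    for civ in cached:
        remaining = set()
        for uiv in uncached:
            if civ.end < uiv.begin or civ.begin > uiv.end:
                remaining.add(uiv)  # no overlap; keep the gap as-is
                continue
            if civ.begin > uiv.begin:
                remaining.add(Interval(uiv.begin, civ.begin))  # leading gap
            if uiv.end > civ.end:
                remaining.add(Interval(civ.end, uiv.end))  # trailing gap
        uncached = remaining
    return cached, uncached

cache = IntervalTree()
cache.addi(0x1000, 0x1010, bytearray(16))  # pretend 16 bytes are already cached
hits, gaps = split_ranges(cache, 0x0FF8, 0x20)
print(sorted(hits))  # the cached interval [0x1000:0x1010]
print(sorted(gaps))  # the gaps [0xFF8:0x1000] and [0x1010:0x1018]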
class IntervalPrinter:
    def __init__(self, file_type, infile, chrom, faidx, step):
        self.chrom = chrom
        # Calculate the chromosome length from the FASTA index.
        self.chrLength = self._getChrLength(faidx)
        #print("Chromosome " + self.chrom + " length is " + str(self.chrLength))
        self.t = IntervalTree()
        self.step = step
        if file_type == 'bedgraph':
            with open(infile, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                for row in reader:
                    if row[0] == chrom:
                        start = int(row[1])
                        end = int(row[2])
                        data = float(row[3])
                        if start == end:
                            end = start + 1
                        self.t.addi(start, end, data)
        elif file_type == 'cnvs':
            with open(infile, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                chrom = chrom.replace("chr", "")
                for row in reader:
                    if row[0] == chrom:
                        start = int(row[1])
                        end = int(row[2])
                        data = float(row[3])
                        if start == end:
                            end = start + 1
                        self.t.addi(start, end, data)
        elif file_type == 'ratio':
            with open(infile, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                start = 0  # beginning of the chromosome
                chrom = chrom.replace("chr", "")
                for row in reader:
                    if row[0] == chrom:
                        end = int(row[1])
                        #data = float(row[2])  # ratio value
                        data = float(row[4])  # copy number
                        if start == end:
                            end = start + 1
                        self.t.addi(start, end, data)
                        # Update the running start position.
                        start = end

    def _getChrLength(self, faidx):
        with open(faidx, 'r') as idx:
            reader = csv.reader(idx, delimiter='\t')
            for row in reader:
                if row[0] == str(self.chrom):
                    return int(row[1])

    def printLine(self):
        sex_re = re.compile(".*[XY]")
        line = ""
        value = 2
        for i in range(0, self.chrLength, self.step):
            # The default value is 2 for autosomes; sex chromosomes would need the
            # correction commented out below.
            value = 2
            # Get all the values overlapping the current interval.
            overlap = self.t.overlap(i, i + self.step)
            if len(overlap) != 0:
                # More than one interval can overlap the current one.
                data = []
                for interval_obj in overlap:
                    data.append(interval_obj.data)
                value = max(data)  #* (1 if not sex_re.match(self.chrom) else 2)
            line = line + str(value) + ","
        line = line + str(value)
        print(line)
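# A small usage sketch of the per-step max-over-overlaps logic in IntervalPrinter.printLine()
# above. The bedgraph-style rows, step size, and coordinates are invented for illustration;
# only the intervaltree package is required.
from intervaltree import IntervalTree

tree = IntervalTree()
for start, end, depth in [(0, 150, 2.0), (100, 300, 4.0), (400, 450, 1.0)]:
    tree.addi(start, end, depth)

step = 100
for i in range(0, 500, step):
    overlap = tree.overlap(i, i + step)
    # Take the maximum value over all overlapping intervals; fall back to 2 (the
    # autosome default used above) when nothing overlaps.
    value = max((iv.data for iv in overlap), default=2)
    print(i, value)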
def PeakOverlap(genesfile, peaksfile, tssdistance=(0, 0), peakname='null'):
    LuckPeak, LuckGen, LuckTree, LuckBegin, Genlist = {}, {}, {}, {}, {}
    ### Create an IntervalTree variable.
    tree = IntervalTree()
    n = 0
    m = 0
    intergenic = set()
    intergenic_output = {}
    for lines in open(peaksfile):
        fields = lines.split()
        namegain, chromogain, begingain, endgain = fields[3], fields[0], int(fields[1]), int(fields[2])
        space4, space5 = fields[4], fields[5]
        LuckPeak[namegain] = [chromogain, begingain, endgain, namegain, space4, space5]
        LuckBegin[begingain] = [namegain, begingain, endgain]
        intergenic = intergenic | set([namegain])
        if chromogain not in LuckTree:
            print('Chromosome ' + chromogain + ' of ' + peakname + '...')
            LuckTree[chromogain] = 0
            if n == 1:
                for lines2 in open(genesfile):
                    fields2 = lines2.split()
                    if fields2[0] != k:
                        continue
                    else:
                        nameid = fields2[3]
                        begingen = int(fields2[1]) - tssdistance[0]
                        endgen = int(fields2[2]) + tssdistance[1]
                        chromogen = fields2[0]
                        strand = fields2[5]
                        if tree.overlap(begingen, endgen) != set():
                            for x in tree.overlap(begingen, endgen):
                                LuckGen[m] = [chromogen] + [fields2[1]] + [fields2[2]] + [nameid] + [strand] + LuckBegin[x.begin]
                                intergenic = intergenic - set([LuckBegin[x.begin][0]])
                                m += 1
            else:
                tree[begingain:endgain] = (begingain, endgain)
                n = 1
            ### Reset the tree each time before starting a new chromosome.
            tree = IntervalTree()
            tree[begingain:endgain] = (begingain, endgain)
            ### Fill the tree with all the peaks of the chromosome until the next row belongs
            ### to another chromosome, then compare every item of the tree with all the genes
            ### in the same chromosome.
        else:
            k = chromogain
            tree[begingain:endgain] = (begingain, endgain)
    for lines2 in open(genesfile):
        fields2 = lines2.split()
        if fields2[0] != k:
            continue
        else:
            nameid = fields2[3]
            begingen = int(fields2[1]) - tssdistance[0]
            endgen = int(fields2[2]) + tssdistance[1]
            chromogen = fields2[0]
            strand = fields2[5]
            if tree.overlap(begingen, endgen) != set():
                for x in tree.overlap(begingen, endgen):
                    LuckGen[m] = [chromogen] + [fields2[1]] + [fields2[2]] + [nameid] + [strand] + LuckBegin[x.begin]
                    intergenic = intergenic - set([LuckBegin[x.begin][0]])
                    m += 1
    for x in intergenic:
        intergenic_output[x] = LuckPeak[x]
    ### OUTPUT
    if not os.path.exists(peakname):
        os.makedirs(peakname)
    if len(intergenic) == 0:
        print('No intergenic peaks')
    else:
        results_intergenic = pd.DataFrame(list(intergenic_output.values())).sort_values(by=[0])
        results_intergenic.to_csv('./' + peakname + '/' + peakname + '_intergenic.bed',
                                  index=None, sep='\t', header=False)
    results = pd.DataFrame(list(LuckGen.values()))
    results.to_csv('./' + peakname + '/' + peakname + 'PeaksInGenes', index=None, sep='\t', header=False)
    return ('./' + peakname + '/' + peakname + 'PeaksInGenes',
            './' + peakname + '/' + peakname + '_intergenic.bed')
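# A minimal sketch of the per-chromosome peak/gene overlap test that PeakOverlap() performs.
# The coordinates below are invented; in the real function the tree is rebuilt for each
# chromosome from the peaks file before the genes are scanned.
from intervaltree import IntervalTree

peaks = IntervalTree()
peaks[100:200] = (100, 200)  # same slice-assignment style as above
peaks[500:650] = (500, 650)

gene_start, gene_end = 180, 560  # a gene body, already widened by any TSS distance
for iv in sorted(peaks.overlap(gene_start, gene_end)):
    print("peak", iv.data, "overlaps gene", (gene_start, gene_end))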
def getFirstTable(line_objs, baseDataDirectory):
    symbols_to_ignore = ['$', '%', '(', ')', '((', '))', '()']
    mainTree = IntervalTree()
    mainList = list()
    table_start_bbox = get_table_start(line_objs)
    if table_start_bbox == -1:
        return None
    table_end_bbox = -1
    lst = line_objs[table_start_bbox:table_end_bbox]
    lines_com_len = 0
    if lst is not None and len(lst) > 0:
        for line_obj in lst:
            if len(mainTree) == 0:
                for sen_obj in line_obj:
                    if sen_obj['text'].replace(' ', '').lower() in symbols_to_ignore:
                        continue
                    if len(sen_obj['text'].strip()) > 0:
                        if sen_obj['underline_exists']:
                            x0 = sen_obj['x0_or']
                            x1 = sen_obj['x1_or']
                        else:
                            x0 = sen_obj['x0']
                            x1 = sen_obj['x1']
                        if len(sen_obj['text'].strip()) == 1:
                            x0 = x0 - 3
                            x1 = x1 - 3
                        if len(sen_obj['text'].strip()) == 2:
                            x0 = x0 - 3
                        mainTree.add(Interval(x0, x1, [sen_obj['text']]))
                lines_com_len += 1
            else:
                if len(line_obj[-1]['text'].replace('.', '').split()) > 10:
                    break
                for sen_obj in line_obj:
                    if sen_obj['text'].replace(' ', '').lower() in symbols_to_ignore:
                        continue
                    if sen_obj['underline_exists']:
                        x0 = sen_obj['x0_or']
                        x1 = sen_obj['x1_or']
                    else:
                        x0 = sen_obj['x0']
                        x1 = sen_obj['x1']
                    if len(sen_obj['text'].strip()) == 1:
                        x0 = x0 - 3
                        x1 = x1 - 3
                    if len(sen_obj['text'].strip()) == 2:
                        x0 = x0 - 3
                    overlapInt = mainTree.overlap(x0, x1)
                    if len(overlapInt) > 0:
                        # Append to the leftmost overlapping column.
                        first_col_start = min([overlap.begin for overlap in overlapInt])
                        for overlap in overlapInt:
                            if overlap.begin != first_col_start:
                                continue
                            dataToAppend = overlap
                            te_arr = dataToAppend.data
                            for k in range(len(te_arr), lines_com_len):
                                te_arr.append(float('NaN'))
                            te_arr.append(sen_obj['text'])
                            mainTree.remove(dataToAppend)
                            if len(overlapInt) > 1:
                                mainTree.add(Interval(dataToAppend.begin, dataToAppend.end, te_arr))
                            else:
                                mainTree.add(Interval(min(x0, dataToAppend.begin),
                                                      max(x1, dataToAppend.end), te_arr))
                            break
                    else:
                        te_arr = []
                        for k in range(len(te_arr), lines_com_len):
                            te_arr.append(float('NaN'))
                        te_arr.append(sen_obj['text'])
                        if len(sen_obj['text'].strip()) == 1:
                            x0 = x0 - 3
                            x1 = x1 - 3
                        if len(sen_obj['text'].strip()) == 2:
                            x0 = x0 - 3
                        mainTree.add(Interval(x0, x1, te_arr))
                lines_com_len += 1
    sTree = sorted(mainTree)
    rows_to_drop = []
    max_len = max([len(tr.data) for tr in sTree])
    for tr in sTree:
        #mainList.append('\n'.join(str(tr.data).split('\n')[::-1]))
        te_lst = tr.data
        for i in range(len(te_lst), max_len):
            te_lst.append(float('NaN'))
        mainList.append(te_lst)
    final_df = pd.DataFrame(mainList).T
    last_row = final_df.iloc[final_df.shape[0] - 1].to_list()
    if 'note' in str(last_row[0]).replace(' ', '').lower() or \
            'directors' in str(last_row[0]).replace(' ', '').lower():
        final_df = final_df.drop([final_df.shape[0] - 1], axis=0)
    lstColHeaders = getcolHeaders(table_start_bbox, final_df, line_objs, baseDataDirectory)
    if lstColHeaders is not None:
        #print(lstColHeaders, list(dataFrame.columns), len(lstColHeaders), len(list(dataFrame.columns)))
        if len(lstColHeaders) == len(final_df.columns):
            for index, colld in enumerate(lstColHeaders):
                final_df = final_df.rename(columns={final_df.columns[index]: str(colld)})
        elif len(lstColHeaders) == len(final_df.columns) - 1:
            for index, colld in enumerate(lstColHeaders):
                final_df = final_df.rename(columns={final_df.columns[index + 1]: str(colld)})
    return final_df
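# A toy sketch of the column-clustering idea in getFirstTable(): a cell whose x-extent
# overlaps an existing interval is appended to that column, otherwise a new column is
# started. The cell coordinates and texts are invented; only intervaltree is required.
from intervaltree import Interval, IntervalTree

columns = IntervalTree()
cells = [(10, 40, 'Revenue'), (12, 38, '1,200'), (60, 90, 'Cost'), (58, 92, '800')]
for x0, x1, text in cells:
    hit = columns.overlap(x0, x1)
    if hit:
        # Merge the cell into the overlapping column and widen the column's extent.
        iv = hit.pop()
        columns.remove(iv)
        columns.add(Interval(min(x0, iv.begin), max(x1, iv.end), iv.data + [text]))
    else:
        columns.add(Interval(x0, x1, [text]))

for iv in sorted(columns):
    print(iv.begin, iv.end, iv.data)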