def test_update():
    """Exercise ``IntervalTree.update`` and ``IntervalTree.extend``.

    Both calls are checked to leave the tree an ``IntervalTree`` holding the
    expected intervals, with repeated insertion of the same interval not
    increasing the length.
    """
    tree = IntervalTree()
    first = Interval(0, 1)
    single = {first}

    # update() with a set holding one interval.
    tree.update(single)
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 1
    assert set(tree).pop() == first

    # A cleared tree is falsy; extending twice with the same set must still
    # leave exactly one interval.
    tree.clear()
    assert not tree
    for _ in range(2):
        tree.extend(single)
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 1
    assert set(tree).pop() == first

    # update() with a list holding a second interval.
    second = Interval(2, 3)
    tree.update([second])
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 2
    assert sorted(tree)[1] == second

    # extend() on a tree built directly from the initial set.
    tree = IntervalTree(single)
    tree.extend([second])
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 2
    assert sorted(tree)[1] == second
def test_update():
    """Check in-place growth of an ``IntervalTree`` via ``update``/``extend``."""
    tree = IntervalTree()
    unit = Interval(0, 1)
    seed = {unit}

    # Insert one interval through update().
    tree.update(seed)
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 1
    assert set(tree).pop() == unit

    # Empty the tree, then insert the same interval twice through extend().
    tree.clear()
    assert not tree
    tree.extend(seed)
    tree.extend(seed)
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 1
    assert set(tree).pop() == unit

    # Add a second interval through update() using a list argument.
    later = Interval(2, 3)
    tree.update([later])
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 2
    assert sorted(tree)[1] == later

    # Same end state when starting from the constructor and using extend().
    tree = IntervalTree(seed)
    tree.extend([later])
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 2
    assert sorted(tree)[1] == later
def get_merged_variants(self, variants, key=None):
    # type: (List[vcfio.Variant], str) -> Iterable[vcfio.Variant]
    """Yield the merged variants followed by the split non-variant records.

    Every input record is first passed to ``_align_with_window``. Records
    for which ``_is_non_variant`` is true are collected in an interval tree;
    the rest are grouped by the first key produced by
    ``_move_to_calls.get_merge_keys``. Both groups are merged by their
    respective helpers. Each merged variant that overlaps a merged
    non-variant receives that non-variant's calls before being yielded.
    """
    non_variant_tree = IntervalTree()
    variants_by_key = collections.defaultdict(list)
    for record in variants:
        self._align_with_window(record, key)
        if not self._is_non_variant(record):
            merge_key = next(self._move_to_calls.get_merge_keys(record))
            variants_by_key[merge_key].append(record)
            continue
        non_variant_tree.addi(record.start, record.end, record)

    merged_non_variants = self._merge_non_variants(non_variant_tree)
    merged_variants = self._merge_variants(variants_by_key)

    # Re-populate the tree so it holds only the merged non-variant records.
    non_variant_tree.clear()
    for merged in merged_non_variants:
        non_variant_tree.addi(merged.start, merged.end, merged)

    splits = IntervalTree()
    for variant in merged_variants:
        overlapping = non_variant_tree.search(variant.start, variant.end)
        if overlapping:
            # Only the first overlapping non-variant contributes its calls.
            donor = next(iter(overlapping)).data
            variant.calls.extend(donor.calls)
            variant.calls = sorted(variant.calls)
            # NOTE(review): indentation was lost in the source this was
            # reconstructed from; confirm that _update_splits belongs inside
            # this branch rather than running for every variant.
            self._update_splits(splits, variant)
        yield variant

    for non_variant in self._split_non_variants(non_variant_tree, splits):
        yield non_variant
def get_bitmap_iv_tree(curr_path, prev_path):
    """Build an LBA interval tree from one or two snapshot bitmap files.

    When ``prev_path`` is falsy the result of ``get_single_iv_tree(curr_path)``
    is returned. Otherwise the bitmap at ``curr_path`` is read into a tree
    with the ``add_by_qcow2_cb`` callback, the bitmap at ``prev_path`` is read
    into the same tree with the ``exclude_by_qcow2`` callback, and the result
    is converted with ``qcow2_to_lba``.
    (Presumably this yields the ranges changed since ``prev_path`` -- confirm
    against the callbacks.)

    Returns:
        The converted tree, or ``None`` when the conversion fails (or when a
        read fails and ``xlogging.raise_and_logging_error`` returns).
    """
    if not prev_path:
        return get_single_iv_tree(curr_path)

    curr_tree = IntervalTree()

    if not read_snapshot_bitmap(curr_path, add_by_qcow2_cb, curr_tree):
        xlogging.raise_and_logging_error(
            r'读取位图文件 1 失败',
            r'[get_snapshot_inc_bitmap] get curr_path failed')
        return None

    if not read_snapshot_bitmap(prev_path, exclude_by_qcow2, curr_tree):
        # The debug text previously repeated "curr_path" here, which pointed
        # at the wrong file when the *previous* bitmap could not be read.
        xlogging.raise_and_logging_error(
            r'读取位图文件 2 失败',
            r'[get_snapshot_inc_bitmap] get prev_path failed')
        return None

    lba_tree = IntervalTree()
    converted = qcow2_to_lba(curr_tree, lba_tree)
    # The intermediate tree is no longer needed whether or not conversion
    # succeeded.
    curr_tree.clear()
    if not converted:
        log_err_msg("[get_bitmap_iv_tree] qcow2_to_lba failed")
        return None
    return lba_tree
class InMemporyTimeIntervals(TimeIntervals):
    """``TimeIntervals`` implementation that keeps its data in an ``IntervalTree``."""

    def __init__(self):
        self.tree = IntervalTree()

    def clear(self) -> None:
        """Drop every stored interval."""
        self.tree.clear()

    def add(self, start: datetime, end: datetime, data: object) -> None:
        """Store ``data`` under the interval from ``start`` to ``end``.

        When both bounds are equal, the end bound is moved one second later
        before the interval is stored.
        """
        effective_end = end + timedelta(seconds=1) if start == end else end
        self.tree.add(Interval(start, effective_end, data))

    def is_inside(self, time: datetime) -> bool:
        """Return whether the tree reports at least one interval for ``time``."""
        matches = self.tree[time]
        return bool(len(matches))
def write_transcripts_to_gtf(transcripts, gtf_file, dangling_edge_threshold=5, strand_specific=False):
    """Group consecutive overlapping transcripts into genes and write each gene.

    Transcripts are visited in order. The transcripts of the gene currently
    being assembled are kept in an interval tree spanning
    ``start_coord``..``end_coord`` (inclusive). A transcript joins the current
    gene when its start coordinate hits an interval in that tree and it shares
    the previous transcript's chromosome (and strand, if ``strand_specific``).
    Otherwise the current gene is written with
    ``write_transcripts_of_gene_to_gtf`` and a new gene is started.

    Args:
        transcripts: Indexable sequence of transcript objects. Presumably
            sorted by chromosome and start coordinate -- verify against the
            caller.
        gtf_file: Forwarded unchanged to ``write_transcripts_of_gene_to_gtf``.
        dangling_edge_threshold: Forwarded to ``is_contained_in`` and used as
            the maximum start-coordinate difference for replacing an
            overlapping transcript.
        strand_specific: When true, strands must also match for a transcript
            to join the current gene.

    An empty ``transcripts`` is a no-op (previously it raised ``IndexError``).
    """
    if not transcripts:
        return

    gene_id_number = 1
    transcript_intervals_of_same_gene = IntervalTree()

    first_transcript = transcripts[0]
    gene_id = first_transcript.prefix + "GENE" + str(gene_id_number)
    first_transcript.set_gene_id(gene_id)
    transcript_intervals_of_same_gene[
        first_transcript.start_coord:first_transcript.end_coord + 1] = first_transcript

    for prev_transcript, curr_transcript in zip(transcripts, transcripts[1:]):
        overlapping_intervals = transcript_intervals_of_same_gene[curr_transcript.start_coord]
        part_of_same_gene = (
            len(overlapping_intervals) != 0
            and curr_transcript.chromosome == prev_transcript.chromosome
            and (curr_transcript.strand == prev_transcript.strand if strand_specific else True)
        )

        if not part_of_same_gene:
            # Flush the finished gene and start a new one with this transcript.
            write_transcripts_of_gene_to_gtf(gtf_file, transcript_intervals_of_same_gene)
            gene_id_number += 1
            gene_id = curr_transcript.prefix + "GENE" + str(gene_id_number)
            curr_transcript.set_gene_id(gene_id)
            transcript_intervals_of_same_gene.clear()
            transcript_intervals_of_same_gene[
                curr_transcript.start_coord:curr_transcript.end_coord + 1] = curr_transcript
            continue

        is_contained = False
        for overlapping_transcript_interval in overlapping_intervals:
            overlapping_transcript = overlapping_transcript_interval.data
            containment = curr_transcript.is_contained_in(
                overlapping_transcript, dangling_edge_threshold)
            if containment == Containment.CONTAINED:
                is_contained = True
                break
            elif containment == Containment.LAST_EXON_LONGER:
                replace_overlapping_transcript = (
                    len(overlapping_transcript.exons) == len(curr_transcript.exons)
                    and curr_transcript.start_coord - overlapping_transcript.start_coord
                    <= dangling_edge_threshold
                )
                if replace_overlapping_transcript:
                    # The current transcript supersedes the stored one.
                    transcript_intervals_of_same_gene.remove(overlapping_transcript_interval)
                    # NOTE(review): indentation was lost in the source this
                    # was reconstructed from; confirm this break belongs to
                    # the replacement branch only.
                    break

        if not is_contained:
            curr_transcript.set_gene_id(gene_id)
            transcript_intervals_of_same_gene[
                curr_transcript.start_coord:curr_transcript.end_coord + 1] = curr_transcript

    # Write whatever gene is still open after the last transcript.
    write_transcripts_of_gene_to_gtf(gtf_file, transcript_intervals_of_same_gene)
class SimpleMedium(Medium):
    """``Medium`` that stores messages in an ``IntervalTree`` indexed by ticks."""

    def __init__(self, put_up: Optional[Callable[[LoraMsg], None]]) -> None:
        self.msgs = IntervalTree()
        self._put_up = put_up

    def reset_medium(self) -> None:
        """Discard every stored message."""
        self.msgs.clear()

    def add_dn(self, msg: LoraMsg) -> None:
        """Store ``msg`` over the tick range covered by its preamble."""
        begin = Simulation.time2ticks(msg.xbeg)
        preamble = Simulation.time2ticks(msg.tpreamble())
        self.msgs[begin:begin + preamble] = msg

    @staticmethod
    def overlap(i1: Interval, i2: Interval) -> int:
        """Return the length of the intersection of ``i1`` and ``i2``.

        The value is negative when the two intervals do not intersect.
        """
        earliest_end = min(i1.end, i2.end)
        latest_begin = max(i1.begin, i2.begin)
        return earliest_end - latest_begin  # type: ignore

    def get_dn(self, rxon: int, rxtout: int, freq: int, rps: int, nsym: int = 4, peek=False) -> Optional[LoraMsg]:
        """Return a stored message receivable in the given window, if any.

        A message qualifies when it overlaps the window starting at ``rxon``
        and lasting ``rxtout`` ticks, ``match(freq, rps)`` is true, and either
        ``peek`` is set or the overlap is at least the tick length of ``nsym``
        symbols. Unless ``peek`` is set, the returned message is removed from
        the medium. Returns ``None`` when nothing qualifies.
        """
        window = Interval(rxon, rxon + rxtout)
        min_overlap = Simulation.time2ticks(LoraMsg.symtime(rps, nsym))
        for candidate in self.msgs.overlap(window.begin, window.end):
            msg = candidate.data  # type: LoraMsg
            if not msg.match(freq, rps):
                continue
            if peek or SimpleMedium.overlap(candidate, window) >= min_overlap:
                if not peek:
                    self.msgs.remove(candidate)
                return msg
        return None

    def prune(self, ticks: int) -> List[LoraMsg]:
        """Remove and return the messages whose interval lies within ``0..ticks``."""
        expired = cast(List[Interval], self.msgs.envelop(0, ticks))
        if not expired:
            return []
        self.msgs.remove_envelop(0, ticks)
        return [iv.data for iv in expired]
class ClassFunctionDropdown(Panel):
    """
    Class and Function/Method Dropdowns Widget.

    Parameters
    ----------
    editor : :class:`spyder.plugins.editor.widgets.codeeditor.CodeEditor`
        The editor to act on.
    """

    def __init__(self, editor):
        super(ClassFunctionDropdown, self).__init__(editor)

        # Internal data: symbol ranges indexed by line, the last raw symbol
        # payload, and the symbols shown in each combobox.
        self._tree = IntervalTree()
        self._data = None
        self.classes = []
        self.funcs = []

        # Widgets
        self._editor = editor
        self.class_cb = QComboBox()
        self.method_cb = QComboBox()

        # Each combobox starts with a '<None>' placeholder entry.
        for combobox in (self.class_cb, self.method_cb):
            combobox.addItem(_('<None>'), 0)

        # Lay both comboboxes out side by side without spacing or margins.
        layout = QHBoxLayout()
        for combobox in (self.class_cb, self.method_cb):
            layout.addWidget(combobox)
        layout.setSpacing(0)
        layout.setContentsMargins(0, 0, 0, 0)
        self.setLayout(layout)

        # Signals
        self._editor.sig_cursor_position_changed.connect(
            self._handle_cursor_position_change_event)
        for combobox in (self.class_cb, self.method_cb):
            combobox.activated.connect(self.combobox_activated)

    def _getVerticalSize(self):
        """Get the default height of a QComboBox."""
        return self.class_cb.height()

    @Slot(int, int)
    def _handle_cursor_position_change_event(self, linenum, column):
        """Refresh the selection for the line the cursor moved to."""
        self.update_selected(linenum)

    def sizeHint(self):
        """Override Qt method."""
        return QSize(0, self._getVerticalSize())

    def combobox_activated(self):
        """Move the cursor to the selected definition."""
        sender = self.sender()
        item = sender.itemData(sender.currentIndex())

        if item:
            # Symbol line numbers are shifted by one before being handed to
            # the editor.
            target_line = item['location']['range']['start']['line'] + 1
            self.editor.go_to_line(target_line)

        if sender == self.class_cb:
            self.method_cb.setCurrentIndex(0)

    @staticmethod
    def _select_item(combobox, item):
        """Select the entry whose data equals ``item``, else the first entry."""
        for idx in range(combobox.count()):
            if combobox.itemData(idx) == item:
                combobox.setCurrentIndex(idx)
                return
        combobox.setCurrentIndex(0)

    def update_selected(self, linenum):
        """Updates the dropdowns to reflect the current class and function."""
        enclosing = sorted(self._tree[linenum])

        if not enclosing:
            # The line is outside every known symbol: reset both dropdowns.
            self.class_cb.setCurrentIndex(0)
            self.method_cb.setCurrentIndex(0)
            return

        for iv in enclosing:
            item = iv.data
            kind = item.get('kind')
            if kind in [SymbolKind.CLASS]:
                self._select_item(self.class_cb, item)
            elif kind in [SymbolKind.FUNCTION, SymbolKind.METHOD]:
                self._select_item(self.method_cb, item)

    def populate(self, combobox, data, add_parents=False):
        """
        Populate the given ``combobox`` with the class or function names.

        Parameters
        ----------
        combobox : :class:`qtpy.QtWidgets.QComboBox`
            The combobox to populate
        data : list of :class:`dict`
            The data to populate with. There should be one list element per
            class or function definition in the file.
        add_parents : bool
            Add parents to name to create a fully qualified name.

        Returns
        -------
        None
        """
        combobox.clear()
        combobox.addItem(_('<None>'), 0)

        # Strip all item flags from the '<None>' placeholder entry.
        placeholder = combobox.model().item(0)
        placeholder.setFlags(Qt.NoItemFlags)

        entries = []
        for symbol in data:
            fqn = symbol['name']

            # Create a fully-qualified name if requested
            if add_parents:
                symbol_range = symbol['location']['range']
                begin = symbol_range['start']['line']
                end = symbol_range['end']['line']

                candidates = sorted(self._tree.overlap(begin, end),
                                    reverse=True)
                for iv in candidates:
                    # Skip the interval that is the symbol itself.
                    if iv.begin == begin and iv.end == end:
                        continue

                    # Only a symbol that fully encloses this one is a parent.
                    parent = iv.data
                    parent_begin = parent['location']['range']['start']['line']
                    parent_end = parent['location']['range']['end']['line']
                    if parent_begin <= begin and parent_end >= end:
                        fqn = parent['name'] + "." + fqn

            entries.append((fqn, symbol))

        for fqn, symbol in entries:
            # Pick the icon name (See: editortools.py)
            name = symbol['name']
            if symbol['kind'] in [SymbolKind.CLASS]:
                icon_name = 'class'
            elif name.startswith('__'):
                icon_name = 'private2'
            elif name.startswith('_'):
                icon_name = 'private1'
            else:
                icon_name = 'method'
            icon = ima.icon(icon_name)

            # Add the combobox item
            if icon is None:
                combobox.addItem(fqn, symbol)
            else:
                combobox.addItem(icon, fqn, symbol)

        line, _column = self._editor.get_cursor_line_column()
        self.update_selected(line)

    def update_data(self, data):
        """Update and process symbol data."""
        if data == self._data:
            return

        self._data = data
        self._tree.clear()
        self.classes = []
        self.funcs = []

        document = self._editor.document
        for symbol in data:
            symbol_range = symbol['location']['range']
            line_start = symbol_range['start']['line']
            line_end = symbol_range['end']['line']
            kind = symbol.get('kind')

            block = document().findBlockByLineNumber(line_start)
            line_text = block.text() if block else ''

            # The symbol finder returns classes in import statements as well
            # so we filter them out (along with single-line symbols).
            if line_start == line_end or ' import ' in line_text:
                continue

            self._tree[line_start:line_end] = symbol
            if kind in [SymbolKind.CLASS]:
                self.classes.append(symbol)
            elif kind in [SymbolKind.FUNCTION, SymbolKind.METHOD]:
                self.funcs.append(symbol)

        self.class_cb.clear()
        self.method_cb.clear()
        self.populate(self.class_cb, self.classes, add_parents=False)
        self.populate(self.method_cb, self.funcs, add_parents=True)
class MainClient:
    """Client that routes requests to region servers via a cached region map.

    Regions are cached in an interval tree keyed by ``'table,start_key'`` ..
    ``'table,stop_key'``; region-server clients are cached by ``'host:port'``.
    """

    def __init__(self, zkquorum, pool_size):
        # Location of the ZooKeeper quorum (csv)
        self.zkquorum = zkquorum
        # Connection pool size per region server (and master!)
        self.pool_size = pool_size
        # Persistent connection to the master server.
        self.master_client = None
        # IntervalTree data structure that allows me to create ranges
        # representing known row keys that fall within a specific region. Any
        # 'region look up' is then O(logn)
        self.region_cache = IntervalTree()
        # Takes a client's host:port as key and maps it to a client instance.
        self.reverse_client_cache = {}
        # Mutex used for all caching operations.
        self._cache_lock = Lock()
        # Mutex used so only one thread can request meta information from
        # the master at a time.
        self._master_lookup_lock = Lock()

    # ------------------------------------------------------------------
    # HERE LAY CACHE OPERATIONS
    # ------------------------------------------------------------------

    def _add_to_region_cache(self, new_region):
        """Insert ``new_region`` into the cache, evicting overlapping entries."""
        stop_key = new_region.stop_key
        if stop_key == '':
            # This is hacky but our interval tree requires hard interval stops.
            # So what's the largest char out there? chr(255) -> '\xff'. If
            # you're using '\xff' as a prefix for your rows then this'll cause
            # a cache miss on every request.
            stop_key = '\xff'
        # Keys are formatted like: 'tablename,key'
        start_key = new_region.table + ',' + new_region.start_key
        stop_key = new_region.table + ',' + stop_key

        # Only let one person touch the cache at once.
        with self._cache_lock:
            # Get all overlapping regions (overlapping == stale)
            overlapping_regions = self.region_cache[start_key:stop_key]
            # Close the overlapping regions.
            self._close_old_regions(overlapping_regions)
            # Remove the overlapping regions.
            self.region_cache.remove_overlap(start_key, stop_key)
            # Insert my region.
            self.region_cache[start_key:stop_key] = new_region
            # Add this region to the region_client's internal
            # list of all the regions it serves.
            new_region.region_client.regions.append(new_region)

    def _get_from_region_cache(self, table, key):
        """Return the cached region serving ``key`` in ``table``, or ``None``."""
        # Only let one person touch the cache at once.
        with self._cache_lock:
            # We don't care about the last two characters ',:' in the meta_key.
            # 'table,key,:' --> 'table,key'
            meta_key = self._construct_meta_key(table, key)[:-2]
            # Fetch the region that serves this key
            regions = self.region_cache[meta_key]
            try:
                # Returns a set. Pop the element from the set.
                # (there shouldn't be more than 1 elem in the set)
                return regions.pop().data
            except KeyError:
                # Returned set is empty? Cache miss!
                return None

    def _delete_from_region_cache(self, table, start_key):
        """Remove the cache entries overlapping ``'table,start_key'``."""
        # Don't acquire the lock because the calling function should have done
        # so already
        self.region_cache.remove_overlap(table + "," + start_key)

    # ------------------------------------------------------------------
    # HERE LAY REQUESTS
    # ------------------------------------------------------------------

    def get(self, table, key, families=None, filters=None):
        """
        get a row or specified cell with optional filter

        :param table: hbase table
        :param key: row key
        :param families: (optional) specifies columns to get,
          e.g., {"columnFamily1":["col1","col2"], "colFamily2": "col3"}.
          ``None`` (the default) is treated as an empty dict.
        :param filters: (optional) column filters
        :return: response with cells
        """
        # A fresh dict per call: a shared ``{}`` default would leak state
        # between calls if anything downstream mutated it.
        if families is None:
            families = {}
        try:
            # Step 0. Set dest_region to None so if an exception is
            # thrown in _find_hosting_region, the exception handling
            # doesn't break trying to reference dest_region.
            dest_region = None
            # Step 1. Figure out where to send it.
            dest_region = self._find_hosting_region(table, key)
            # Step 2. Build the appropriate pb message.
            rq = request.get_request(dest_region, key, families, filters)
            # Step 3. Send the message and twiddle our thumbs.
            response = dest_region.region_client._send_request(rq)
            # Step 4. Success.
            return Result(response)
        except PyBaseException as e:
            # Step X. Houston, we have an error. The cool thing about how
            # this is coded is that exceptions know how to handle themselves.
            # All we need to do is call _handle_exception and everything should
            # be happy! If it cannot handle itself (unrecoverable) then it will
            # re-raise the exception in the handle method and we'll die too.
            #
            # We pass dest_region in because the handling code needs to know
            # which region or region_client it needs to reestablish.
            e._handle_exception(self, dest_region=dest_region)
            # Everything should be dandy now. Repeat the request!
            return self.get(table, key, families=families, filters=filters)

    def put(self, table, key, values):
        """Send a put request built from ``values`` for ``key`` in ``table``."""
        return self._mutate(table, key, values, request.put_request)

    def delete(self, table, key, values):
        """Send a delete request built from ``values`` for ``key`` in ``table``."""
        return self._mutate(table, key, values, request.delete_request)

    def append(self, table, key, values):
        """Send an append request built from ``values`` for ``key`` in ``table``."""
        return self._mutate(table, key, values, request.append_request)

    def increment(self, table, key, values):
        """Send an increment request built from ``values`` for ``key`` in ``table``."""
        return self._mutate(table, key, values, request.increment_request)

    def _mutate(self, table, key, values, rq_type):
        """Build a request with ``rq_type`` and send it, retrying after handled errors."""
        # Same exact methodology as 'get'. Because all mutate requests have
        # equivalent code I've combined them into a single function.
        try:
            dest_region = None
            dest_region = self._find_hosting_region(table, key)
            rq = rq_type(dest_region, key, values)
            response = dest_region.region_client._send_request(rq)
            return Result(response)
        except PyBaseException as e:
            e._handle_exception(self, dest_region=dest_region)
            return self._mutate(table, key, values, rq_type)

    # Scan can get a bit gnarly - be prepared.
    def scan(self, table, start_key='', stop_key=None, families=None, filters=None):
        """Scan ``table`` region by region and return the accumulated ``Result``.

        :param table: hbase table
        :param start_key: key to start scanning from (default ``''``)
        :param stop_key: (optional) scanning stops once a region's stop key
          is greater than this key
        :param families: (optional) columns to scan; ``None`` (the default)
          is treated as an empty dict
        :param filters: (optional) filters; converted once with
          ``_to_filter`` unless already a ``Filter``
        """
        # See ``get``: avoid sharing one default dict across calls.
        if families is None:
            families = {}
        # We convert the filter immediately such that it doesn't have to be done
        # for every region. However if the filter has already been converted then
        # we can't convert it again. This means that even though we send out N RPCs
        # we only have to package the filter pb type once.
        if filters is not None and type(filters).__name__ != "Filter":
            filters = _to_filter(filters)
        previous_stop_key = start_key
        # Holds the contents of all responses. We return this at the end.
        result_set = Result(None)
        # We're going to need to loop over every relevant region. Break out
        # of this loop once we discover there are no more regions left to scan.
        while True:
            # Finds the first region and sends the initial message to it.
            first_response, cur_region = self._scan_hit_region_once(
                previous_stop_key, table, start_key, stop_key, families, filters)
            try:
                # Now we need to keep pinging this region for more results until
                # it has no more results to return. We can change how many rows it
                # returns for each call in the Requests module but I picked a
                # pseudo-arbitrary figure (alright, fine, I stole it from
                # asynchbase)
                #
                # We pass in first_response so it can pull out the scanner_id
                # from the first response.
                second_response = self._scan_region_while_more_results(
                    cur_region, first_response)
            except PyBaseException as e:
                # Something happened to the region/region client in the middle
                # of a scan. We're going to handle it by...
                #
                # Handle the exception.
                e._handle_exception(self, dest_region=cur_region)
                # Recursively scan JUST this range of keys in the region (it could have been split
                # or merged so this recursive call may be scanning multiple regions or only half
                # of one region).
                result_set._append_response(self.scan(
                    table, start_key=previous_stop_key, stop_key=cur_region.stop_key,
                    families=families, filters=filters))
                # We continue here because we don't want to append the
                # first_response results to the result_set. When we did the
                # recursive scan it rescanned whatever the first_response
                # initially contained. Appending both will produce duplicates.
                previous_stop_key = cur_region.stop_key
                if previous_stop_key == '' or (stop_key is not None and previous_stop_key > stop_key):
                    break
                continue
            # Both calls succeeded! Append the results to the result_set.
            result_set._append_response(first_response)
            result_set._append_response(second_response)
            # Update the new previous_stop_key (so the next iteration can
            # lookup the next region to scan)
            previous_stop_key = cur_region.stop_key
            # Stopping criteria. This region is either the end ('') or the end of this region is
            # beyond the specific stop_key.
            if previous_stop_key == '' or (stop_key is not None and previous_stop_key > stop_key):
                break
        return result_set

    def _scan_hit_region_once(self, previous_stop_key, table, start_key, stop_key, families, filters):
        """Open a scanner on the region hosting ``previous_stop_key``.

        Returns a ``(response, region)`` tuple; retries after handled errors.
        """
        try:
            # Lookup the next region to scan by searching for the
            # previous_stop_key (region keys are inclusive on the start and
            # exclusive on the end)
            cur_region = self._find_hosting_region(
                table, previous_stop_key)
        except PyBaseException as e:
            # This means that either Master is down or something's funky with the META region. Try handling it
            # and recursively perform the same call again.
            e._handle_exception(self)
            return self._scan_hit_region_once(previous_stop_key, table, start_key, stop_key, families, filters)
        # Create the scan request object. The last two values are 'Close' and
        # 'Scanner_ID' respectively.
        rq = request.scan_request(
            cur_region, start_key, stop_key, families, filters, False, None)
        try:
            # Send the request.
            response = cur_region.region_client._send_request(rq)
        except PyBaseException as e:
            # Uh oh. Probably a region/region server issue. Handle it and try
            # again.
            e._handle_exception(self, dest_region=cur_region)
            return self._scan_hit_region_once(previous_stop_key, table, start_key, stop_key, families, filters)
        return response, cur_region

    def _scan_region_while_more_results(self, cur_region, response):
        """Drain the scanner opened by ``response`` and then close it."""
        # Create our own intermediate response set.
        response_set = Result(None)
        # Grab the scanner_id from the first_response.
        scanner_id = response.scanner_id
        # We only need to specify the scanner_id here because the region we're
        # pinging remembers our query based on the scanner_id.
        rq = request.scan_request(
            cur_region, None, None, None, None, False, scanner_id)
        while response.more_results_in_region:
            # Repeatedly hit it until empty. Note that we're not handling any
            # exceptions here, instead letting them bubble up because if any
            # of these calls fail we need to rescan the whole region (it seems
            # like a lot of work to search the results for the max row key that
            # we've received so far and rescan from there up)
            response = cur_region.region_client._send_request(rq)
            response_set._append_response(response)
        # Now close the scanner.
        rq = request.scan_request(
            cur_region, None, None, None, None, True, scanner_id)
        _ = cur_region.region_client._send_request(rq)
        # Close it and return the results!
        return response_set

    # ------------------------------------------------------------------
    # HERE LAY REGION AND CLIENT DISCOVERY
    # ------------------------------------------------------------------

    def _find_hosting_region(self, table, key):
        """Return the region serving ``key``, asking the master on a cache miss."""
        # Check if it's in the cache already.
        dest_region = self._get_from_region_cache(table, key)
        if dest_region is None:
            # We have to reach out to master for the results.
            with self._master_lookup_lock:
                # Not ideal that we have to lock every thread however we limit
                # concurrent meta requests to one. This is because of the case
                # where 1000 greenlets all fail simultaneously we don't want
                # 1000 requests shot off to the master (all looking for the
                # same response). My solution is to only let one through at a
                # time and then when it's your turn, check the cache again to
                # see if one of the greenlets let in before you already fetched
                # the meta or not. We can't bucket greenlets and selectively
                # wake them up simply because we have no idea which key falls
                # into which region. We can bucket based on key but that's a
                # lot of overhead for an unlikely scenario.
                dest_region = self._get_from_region_cache(table, key)
                if dest_region is None:
                    # Nope, still not in the cache.
                    logger.debug(
                        'Region cache miss! Table: %s, Key: %s', table, key)
                    # Ask master for region information.
                    dest_region = self._discover_region(table, key)
        return dest_region

    def _discover_region(self, table, key):
        """Ask the master which region hosts ``key`` and cache the answer.

        Raises ``MasterServerException`` when the master request fails.
        """
        meta_key = self._construct_meta_key(table, key)
        # Create the appropriate meta request given a meta_key.
        meta_rq = request.master_request(meta_key)
        try:
            # This will throw standard Region/RegionServer exceptions.
            # We need to catch them and convert them to the Master equivalent.
            response = self.master_client._send_request(meta_rq)
        except (AttributeError, RegionServerException, RegionException):
            if self.master_client is None:
                # I don't know why this can happen but it does.
                raise MasterServerException(None, None)
            raise MasterServerException(
                self.master_client.host, self.master_client.port)
        # Master gave us a response. We need to run and parse the response,
        # then do all necessary work for entering it into our structures.
        return self._create_new_region(response, table)

    def _create_new_region(self, response, table):
        """Build a region (and its client, if needed) from a master response.

        Raises ``NoSuchTableException`` when the response holds no cells and
        ``RegionServerException`` when no client could be created.
        """
        cells = response.result.cell
        # We have a valid response but no cells? Apparently that means the
        # table doesn't exist!
        if len(cells) == 0:
            raise NoSuchTableException("Table does not exist.")
        # We get ~4 cells back each holding different information. We only care
        # about two of them.
        # NOTE(review): if either the "regioninfo" or the "server" cell is
        # absent, the names assigned below stay unbound and the code after
        # the loop fails with an unbound-local error -- confirm whether the
        # master can ever return such a response.
        for cell in cells:
            if cell.qualifier == "regioninfo":
                # Take the regioninfo information and parse it into our own
                # Region representation.
                new_region = region_from_cell(cell)
            elif cell.qualifier == "server":
                # Grab the host, port of the Region Server that this region is
                # hosted on.
                server_loc = cell.value
                host, port = cell.value.split(':')
            else:
                continue
        # Do we have an existing client for this region server already?
        if server_loc in self.reverse_client_cache:
            # If so, grab it!
            new_region.region_client = self.reverse_client_cache[server_loc]
        else:
            # Otherwise we need to create a new region client instance.
            new_client = region.NewClient(host, port, self.pool_size)
            if new_client is None:
                # Welp. We can't connect to the server that the Master
                # supplied. Raise an exception.
                raise RegionServerException(host=host, port=port)
            logger.info("Created new Client for RegionServer %s", server_loc)
            # Add it to the host,port -> instance of region client map.
            self.reverse_client_cache[server_loc] = new_client
            # Attach the region_client to the region.
            new_region.region_client = new_client
        # Region's set up! Add this puppy to the cache so future requests can
        # use it.
        self._add_to_region_cache(new_region)
        logger.info("Successfully discovered new region %s", new_region)
        return new_region

    def _recreate_master_client(self):
        """Close the current master client (if any) and connect to a new one.

        Raises ``MasterServerException`` when the new connection fails.
        """
        if self.master_client is not None:
            # yep, still no idea why self.master_client can be set to None.
            self.master_client.close()
        # Ask ZooKeeper for the location of the Master.
        ip, port = zk.LocateMaster(self.zkquorum)
        try:
            # Try creating a new client instance and setting it as the new
            # master_client.
            self.master_client = region.NewClient(ip, port, self.pool_size)
        except RegionServerException:
            # We can't connect to the address that ZK supplied. Raise an
            # exception.
            raise MasterServerException(ip, port)

    # ------------------------------------------------------------------
    # HERE LAY THE MISCELLANEOUS
    # ------------------------------------------------------------------

    def _close_old_regions(self, overlapping_region_intervals):
        """Close the client attached to each region in the given intervals."""
        # Loop over the regions to close and close whoever their
        # attached client is.
        #
        # TODO: ...should we really be killing a client unnecessarily?
        for reg in overlapping_region_intervals:
            reg.data.region_client.close()

    def _purge_client(self, region_client):
        """Forget ``region_client`` and all regions it hosts, then close it."""
        # Given a client to close, purge all of its known hosted regions from
        # our cache, delete the reverse lookup entry and close the client
        # clearing up any file descriptors.
        with self._cache_lock:
            for reg in region_client.regions:
                self._delete_from_region_cache(reg.table, reg.start_key)
            self.reverse_client_cache.pop(
                region_client.host + ":" + region_client.port, None)
            region_client.close()

    def _purge_region(self, reg):
        """Drop ``reg`` from the cache and from its client's region list."""
        # Given a region, deletes its entry from the cache and removes itself
        # from its region client's region list.
        with self._cache_lock:
            self._delete_from_region_cache(reg.table, reg.start_key)
            try:
                reg.region_client.regions.remove(reg)
            except ValueError:
                # The client no longer lists this region; nothing to remove.
                pass

    def _construct_meta_key(self, table, key):
        """Return the meta lookup key ``'table,key,:'``."""
        return table + "," + key + ",:"

    def close(self):
        """Close the master client and every region client; empty both caches."""
        logger.info("Main client received close request.")
        # Close the master client.
        if self.master_client is not None:
            self.master_client.close()
        # Clear the region cache.
        self.region_cache.clear()
        # Close each open region client.
        for client in self.reverse_client_cache.values():
            client.close()
        self.reverse_client_cache = {}
class MainClient:
    """Entry point that routes each request to the region hosting its row key.

    Discovered regions are cached in an interval tree keyed by
    'table,start_key' .. 'table,stop_key', and region clients are cached by
    their 'host:port' location. On a cache miss the master client is asked
    for the region and a region client is created or reused.
    """

    def __init__(self, zkquorum, pool_size):
        """Initialise empty caches and locks.

        :param zkquorum: location of the ZooKeeper quorum (csv)
        :param pool_size: connection pool size per region server (and master)
        """
        self.zkquorum = zkquorum
        self.pool_size = pool_size
        # Persistent connection to the master server. Stays None until
        # _recreate_master_client() has assigned one.
        self.master_client = None
        # IntervalTree data structure that maps ranges of row keys to the
        # region serving them, so a 'region look up' is O(log n).
        self.region_cache = IntervalTree()
        # Takes a client's host:port as key and maps it to a client instance.
        self.reverse_client_cache = {}
        # Mutex used for all caching operations.
        self._cache_lock = Lock()
        # Mutex used so only one thread can request meta information from
        # the master at a time.
        self._master_lookup_lock = Lock()

    # ------------------------------------------------------------------
    # HERE LAY CACHE OPERATIONS
    # ------------------------------------------------------------------

    def _add_to_region_cache(self, new_region):
        """Insert ``new_region`` into the cache, evicting overlapping regions.

        Any cached region overlapping the new key range is considered stale:
        its region client is closed and its interval is removed before the
        new region is inserted.

        :param new_region: region exposing ``table``, ``start_key``,
            ``stop_key`` and an attached ``region_client``
        """
        stop_key = new_region.stop_key
        if stop_key == '':
            # This is hacky but our interval tree requires hard interval
            # stops. So what's the largest char out there? chr(255) -> '\xff'.
            # If you're using '\xff' as a prefix for your rows then this'll
            # cause a cache miss on every request.
            stop_key = '\xff'
        # Keys are formatted like: 'tablename,key'
        start_key = new_region.table + ',' + new_region.start_key
        stop_key = new_region.table + ',' + stop_key

        # Only let one person touch the cache at once.
        with self._cache_lock:
            # Get all overlapping regions (overlapping == stale), close
            # their clients and drop them from the tree.
            overlapping_regions = self.region_cache[start_key:stop_key]
            self._close_old_regions(overlapping_regions)
            self.region_cache.remove_overlap(start_key, stop_key)
            # Insert my region.
            self.region_cache[start_key:stop_key] = new_region
            # Add this region to the region_client's internal list of all
            # the regions it serves.
            new_region.region_client.regions.append(new_region)

    def _get_from_region_cache(self, table, key):
        """Return the cached region serving ``key`` in ``table``, else None.

        :param table: table name
        :param key: row key
        :return: the cached region object, or None on a cache miss
        """
        # Only let one person touch the cache at once.
        with self._cache_lock:
            # We don't care about the last two characters ',:' in the
            # meta_key. 'table,key,:' --> 'table,key'
            meta_key = self._construct_meta_key(table, key)[:-2]
            # Fetch the intervals that contain this key (a set; there
            # shouldn't be more than one element in it).
            regions = self.region_cache[meta_key]
            try:
                interval = regions.pop()
            except KeyError:
                # Returned set is empty? Cache miss!
                return None
            return interval.data

    def _delete_from_region_cache(self, table, start_key):
        """Remove every cached interval overlapping 'table,start_key'.

        Does not acquire ``_cache_lock``: the calling function is expected
        to hold it already.
        """
        self.region_cache.remove_overlap(table + "," + start_key)

    # ------------------------------------------------------------------
    # HERE LAY REQUESTS
    # ------------------------------------------------------------------

    def get(self, table, key, families=None, filters=None):
        """
        get a row or specified cell with optional filter

        :param table: hbase table
        :param key: row key
        :param families: (optional) specifies columns to get,
          e.g., {"columnFamily1":["col1","col2"], "colFamily2": "col3"}.
          Defaults to an empty dict.
        :param filters: (optional) column filters
        :return: response with cells
        """
        # Build a fresh dict per call instead of sharing one mutable default
        # object between every invocation.
        if families is None:
            families = {}
        # Retry until the request succeeds or the exception handler decides
        # the failure is unrecoverable and re-raises. A loop (rather than
        # recursion) keeps the stack flat however many retries are needed.
        while True:
            # Step 0. Reset dest_region so that if _find_hosting_region
            # raises, the handler below doesn't reference a stale region.
            dest_region = None
            try:
                # Step 1. Figure out where to send it.
                dest_region = self._find_hosting_region(table, key)
                # Step 2. Build the appropriate pb message.
                rq = request.get_request(dest_region, key, families, filters)
                # Step 3. Send the message and twiddle our thumbs.
                response = dest_region.region_client._send_request(rq)
                # Step 4. Success.
                return Result(response)
            except PyBaseException as e:
                # Step X. Houston, we have an error. Exceptions know how to
                # handle themselves: if it cannot be handled (unrecoverable)
                # the handler re-raises and we die too. We pass dest_region
                # in because the handling code needs to know which region or
                # region_client it needs to reestablish.
                e._handle_exception(self, dest_region=dest_region)
                # Everything should be dandy now. Loop to repeat the request.

    def put(self, table, key, values):
        """Send a put mutation for ``key`` in ``table``; see ``_mutate``."""
        return self._mutate(table, key, values, request.put_request)

    def delete(self, table, key, values):
        """Send a delete mutation for ``key`` in ``table``; see ``_mutate``."""
        return self._mutate(table, key, values, request.delete_request)

    def append(self, table, key, values):
        """Send an append mutation for ``key`` in ``table``; see ``_mutate``."""
        return self._mutate(table, key, values, request.append_request)

    def increment(self, table, key, values):
        """Send an increment mutation for ``key`` in ``table``; see ``_mutate``."""
        return self._mutate(table, key, values, request.increment_request)

    def _mutate(self, table, key, values, rq_type):
        """Build and send a mutate request, retrying after handled errors.

        Same exact methodology as ``get``. Because all mutate requests have
        equivalent code they are combined into a single function.

        :param table: hbase table
        :param key: row key
        :param values: values forwarded to the request builder
        :param rq_type: request builder called as
            ``rq_type(dest_region, key, values)``
        :return: ``Result`` wrapping the response
        """
        while True:
            dest_region = None
            try:
                dest_region = self._find_hosting_region(table, key)
                rq = rq_type(dest_region, key, values)
                response = dest_region.region_client._send_request(rq)
                return Result(response)
            except PyBaseException as e:
                # Let the exception repair what it can (or re-raise), then
                # loop to retry the mutation.
                e._handle_exception(self, dest_region=dest_region)

    # Scan can get a bit gnarly - be prepared.
    def scan(self, table, start_key='', stop_key=None, families=None,
             filters=None):
        """Scan ``table`` region by region and collect every response.

        :param table: hbase table
        :param start_key: row key to start scanning from
        :param stop_key: (optional) row key bounding the scan; None means
            scan until the last region
        :param families: (optional) columns to fetch; defaults to an empty
            dict
        :param filters: (optional) filters; converted once up front unless
            already a ``Filter``
        :return: ``Result`` holding the contents of all responses
        """
        if families is None:
            families = {}
        # We convert the filter immediately such that it doesn't have to be
        # done for every region. However if the filter has already been
        # converted then we can't convert it again. This means that even
        # though we send out N RPCs we only have to package the filter pb
        # type once.
        if filters is not None and type(filters).__name__ != "Filter":
            filters = _to_filter(filters)
        previous_stop_key = start_key
        # Holds the contents of all responses. We return this at the end.
        result_set = Result(None)
        # We're going to need to loop over every relevant region. Break out
        # of this loop once we discover there are no more regions left to
        # scan.
        while True:
            # Finds the first region and sends the initial message to it.
            first_response, cur_region = self._scan_hit_region_once(
                previous_stop_key, table, start_key, stop_key, families,
                filters)
            try:
                # Now we need to keep pinging this region for more results
                # until it has no more results to return. We pass in
                # first_response so it can pull out the scanner_id from the
                # first response.
                second_response = self._scan_region_while_more_results(
                    cur_region, first_response)
            except PyBaseException as e:
                # Something happened to the region/region client in the
                # middle of a scan. Handle the exception, then rescan.
                e._handle_exception(self, dest_region=cur_region)
                # Recursively scan JUST this range of keys in the region (it
                # could have been split or merged so this recursive call may
                # be scanning multiple regions or only half of one region).
                # The rescan must never run past the caller's own stop_key,
                # otherwise rows beyond the requested range are returned when
                # that bound falls inside this region.
                retry_stop_key = cur_region.stop_key
                if stop_key and (retry_stop_key == ''
                                 or stop_key < retry_stop_key):
                    retry_stop_key = stop_key
                result_set._append_response(
                    self.scan(table, start_key=previous_stop_key,
                              stop_key=retry_stop_key,
                              families=families, filters=filters))
                # We continue here because we don't want to append the
                # first_response results to the result_set. When we did the
                # recursive scan it rescanned whatever the first_response
                # initially contained. Appending both will produce
                # duplicates.
                previous_stop_key = cur_region.stop_key
                if previous_stop_key == '' or (
                        stop_key is not None
                        and previous_stop_key > stop_key):
                    break
                continue

            # Both calls succeeded! Append the results to the result_set.
            result_set._append_response(first_response)
            result_set._append_response(second_response)
            # Update the new previous_stop_key (so the next iteration can
            # lookup the next region to scan)
            previous_stop_key = cur_region.stop_key
            # Stopping criteria. This region is either the end ('') or the
            # end of this region is beyond the specific stop_key.
            if previous_stop_key == '' or (
                    stop_key is not None and previous_stop_key > stop_key):
                break
        return result_set

    def _scan_hit_region_once(self, previous_stop_key, table, start_key,
                              stop_key, families, filters):
        """Open a scanner on the region containing ``previous_stop_key``.

        Retries (after letting the exception handle itself) until both the
        region lookup and the initial scan request succeed.

        :return: tuple ``(response, cur_region)``
        """
        while True:
            try:
                # Lookup the next region to scan by searching for the
                # previous_stop_key (region keys are inclusive on the start
                # and exclusive on the end)
                cur_region = self._find_hosting_region(
                    table, previous_stop_key)
            except PyBaseException as e:
                # This means that either Master is down or something's funky
                # with the META region. Try handling it and perform the same
                # lookup again.
                e._handle_exception(self)
                continue
            # Create the scan request object. The last two values are
            # 'Close' and 'Scanner_ID' respectively.
            rq = request.scan_request(
                cur_region, start_key, stop_key, families, filters, False,
                None)
            try:
                # Send the request.
                response = cur_region.region_client._send_request(rq)
            except PyBaseException as e:
                # Uh oh. Probably a region/region server issue. Handle it
                # and try again from the region lookup.
                e._handle_exception(self, dest_region=cur_region)
                continue
            return response, cur_region

    def _scan_region_while_more_results(self, cur_region, response):
        """Drain an open scanner on ``cur_region`` and then close it.

        :param cur_region: region the scanner was opened on
        :param response: first scan response; supplies ``scanner_id`` and
            the initial ``more_results_in_region`` flag
        :return: ``Result`` with every follow-up response (the first
            response itself is not included)
        """
        # Create our own intermediate response set.
        response_set = Result(None)
        # Grab the scanner_id from the first_response.
        scanner_id = response.scanner_id
        # We only need to specify the scanner_id here because the region
        # we're pinging remembers our query based on the scanner_id.
        rq = request.scan_request(
            cur_region, None, None, None, None, False, scanner_id)
        while response.more_results_in_region:
            # Repeatedly hit it until empty. Note that we're not handling
            # any exceptions here, instead letting them bubble up because if
            # any of these calls fail we need to rescan the whole region (it
            # seems like a lot of work to search the results for the max row
            # key that we've received so far and rescan from there up)
            response = cur_region.region_client._send_request(rq)
            response_set._append_response(response)
        # Now close the scanner; the reply to the close request is unused.
        rq = request.scan_request(
            cur_region, None, None, None, None, True, scanner_id)
        cur_region.region_client._send_request(rq)
        # Closed it; return the results!
        return response_set

    # ------------------------------------------------------------------
    # HERE LAY REGION AND CLIENT DISCOVERY
    # ------------------------------------------------------------------

    def _find_hosting_region(self, table, key):
        """Return the region serving ``key``, discovering it on a cache miss.

        :param table: table name
        :param key: row key
        :return: region object, from the cache or freshly discovered
        """
        # Check if it's in the cache already.
        dest_region = self._get_from_region_cache(table, key)
        if dest_region is None:
            # We have to reach out to master for the results.
            with self._master_lookup_lock:
                # Not ideal that we have to lock every thread however we
                # limit concurrent meta requests to one. This is because of
                # the case where 1000 greenlets all fail simultaneously we
                # don't want 1000 requests shot off to the master (all
                # looking for the same response). The solution is to only
                # let one through at a time and then when it's your turn,
                # check the cache again to see if one of the greenlets let
                # in before you already fetched the meta or not. We can't
                # bucket greenlets and selectively wake them up simply
                # because we have no idea which key falls into which region.
                # We can bucket based on key but that's a lot of overhead
                # for an unlikely scenario.
                dest_region = self._get_from_region_cache(table, key)
                if dest_region is None:
                    # Nope, still not in the cache.
                    logger.debug(
                        'Region cache miss! Table: %s, Key: %s', table, key)
                    # Ask master for region information.
                    dest_region = self._discover_region(table, key)
        return dest_region

    def _discover_region(self, table, key):
        """Ask the master which region hosts ``key`` and cache the answer.

        :raises MasterServerException: if the request to the master fails
            or no master client exists
        :return: the newly created region
        """
        meta_key = self._construct_meta_key(table, key)
        # Create the appropriate meta request given a meta_key.
        meta_rq = request.master_request(meta_key)
        try:
            # This will throw standard Region/RegionServer exceptions.
            # We need to catch them and convert them to the Master
            # equivalent.
            response = self.master_client._send_request(meta_rq)
        except (AttributeError, RegionServerException, RegionException):
            if self.master_client is None:
                # master_client is None until a master client has been
                # assigned (it is initialised to None in __init__), in which
                # case the attribute access above raises AttributeError.
                raise MasterServerException(None, None)
            raise MasterServerException(
                self.master_client.host, self.master_client.port)
        # Master gave us a response. We need to run and parse the response,
        # then do all necessary work for entering it into our structures.
        return self._create_new_region(response, table)

    def _create_new_region(self, response, table):
        """Parse a master response into a region and register it.

        :param response: master response whose ``result.cell`` holds the
            region cells
        :param table: table name (not used by the parsing itself)
        :raises NoSuchTableException: if the response holds no cells
        :raises RegionServerException: if no client could be created for the
            region server named in the response
        :return: the new region, with its region client attached
        """
        cells = response.result.cell
        # We have a valid response but no cells? Apparently that means the
        # table doesn't exist!
        if len(cells) == 0:
            raise NoSuchTableException("Table does not exist.")
        # We get ~4 cells back each holding different information. We only
        # care about two of them.
        # NOTE(review): if the "regioninfo" or "server" cell is absent, the
        # names bound below stay undefined and the code after the loop
        # fails with UnboundLocalError -- confirm both cells are guaranteed.
        for cell in cells:
            if cell.qualifier == "regioninfo":
                # Take the regioninfo information and parse it into our own
                # Region representation.
                new_region = region_from_cell(cell)
            elif cell.qualifier == "server":
                # Grab the host, port of the Region Server that this region
                # is hosted on.
                server_loc = cell.value
                host, port = cell.value.split(':')
            else:
                continue
        # Do we have an existing client for this region server already?
        if server_loc in self.reverse_client_cache:
            # If so, grab it!
            new_region.region_client = self.reverse_client_cache[server_loc]
        else:
            # Otherwise we need to create a new region client instance.
            new_client = region.NewClient(host, port, self.pool_size)
            if new_client is None:
                # Welp. We can't connect to the server that the Master
                # supplied. Raise an exception.
                raise RegionServerException(host=host, port=port)
            logger.info("Created new Client for RegionServer %s", server_loc)
            # Add it to the host,port -> instance of region client map.
            self.reverse_client_cache[server_loc] = new_client
            # Attach the region_client to the region.
            new_region.region_client = new_client
        # Region's set up! Add this puppy to the cache so future requests
        # can use it.
        self._add_to_region_cache(new_region)
        logger.info("Successfully discovered new region %s", new_region)
        return new_region

    def _recreate_master_client(self):
        """Close the current master client (if any) and connect to a new one.

        The master location is looked up through ZooKeeper.

        :raises MasterServerException: if creating the client raises
            ``RegionServerException``
        """
        if self.master_client is not None:
            self.master_client.close()
        # Ask ZooKeeper for the location of the Master.
        ip, port = zk.LocateMaster(self.zkquorum)
        try:
            # Try creating a new client instance and setting it as the new
            # master_client.
            # NOTE(review): _create_new_region treats a None return from
            # region.NewClient as "could not connect"; here such a return
            # would be stored as master_client -- possibly one way
            # master_client ends up None. Confirm against region.NewClient.
            self.master_client = region.NewClient(ip, port, self.pool_size)
        except RegionServerException:
            # We can't connect to the address that ZK supplied. Raise an
            # exception.
            raise MasterServerException(ip, port)

    # ------------------------------------------------------------------
    # HERE LAY THE MISCELLANEOUS
    # ------------------------------------------------------------------

    def _close_old_regions(self, overlapping_region_intervals):
        """Close the region client attached to each overlapping interval.

        :param overlapping_region_intervals: iterable of intervals whose
            ``data`` is a region with a ``region_client``
        """
        # TODO: ...should we really be killing a client unnecessarily?
        for reg in overlapping_region_intervals:
            reg.data.region_client.close()

    def _purge_client(self, region_client):
        """Forget ``region_client`` and every region it hosts, then close it.

        Purges all of the client's known hosted regions from the cache,
        deletes the reverse lookup entry and closes the client, clearing up
        any file descriptors.
        """
        with self._cache_lock:
            for reg in region_client.regions:
                self._delete_from_region_cache(reg.table, reg.start_key)
            self.reverse_client_cache.pop(
                region_client.host + ":" + region_client.port, None)
            region_client.close()

    def _purge_region(self, reg):
        """Drop ``reg`` from the cache and from its client's region list."""
        with self._cache_lock:
            self._delete_from_region_cache(reg.table, reg.start_key)
            try:
                reg.region_client.regions.remove(reg)
            except ValueError:
                # Best effort: the client no longer lists this region.
                pass

    def _construct_meta_key(self, table, key):
        """Return the meta lookup key for ``key``: 'table,key,:'."""
        return table + "," + key + ",:"

    def close(self):
        """Close the master client and every region client; empty the caches."""
        logger.info("Main client received close request.")
        # Close the master client.
        if self.master_client is not None:
            self.master_client.close()
        # Clear the region cache.
        self.region_cache.clear()
        # Close each open region client. Iterate over a snapshot so the loop
        # is unaffected should closing a client alter the mapping.
        for client in list(self.reverse_client_cache.values()):
            client.close()
        self.reverse_client_cache = {}