def test_collect(self):
    """Samples written under two pids must merge correctly on collect()."""
    pid = 0
    # The multiprocess value class calls this lambda for the current pid,
    # so rebinding ``pid`` below simulates a second worker process.
    core._ValueClass = core._MultiProcessValue(lambda: pid)
    labels = {name: name for name in 'abcd'}

    def labels_with(key, value):
        # Copy of the base label set with one extra label added.
        return dict(labels, **{key: value})

    c = Counter('c', 'help', labelnames=labels.keys(), registry=None)
    g = Gauge('g', 'help', labelnames=labels.keys(), registry=None)
    h = Histogram('h', 'help', labelnames=labels.keys(), registry=None)

    # One round of observations per simulated process.
    for current_pid, observed in ((0, 1), (1, 5)):
        pid = current_pid
        c.labels(**labels).inc(1)
        g.labels(**labels).set(1)
        h.labels(**labels).observe(observed)

    metrics = {m.name: m for m in self.collector.collect()}

    # Counters accumulate across pids into a single sample.
    self.assertEqual(
        metrics['c'].samples, [Sample('c_total', labels, 2.0)]
    )

    # Gauges keep one sample per pid.
    metrics['g'].samples.sort(key=lambda s: s[1]['pid'])
    self.assertEqual(metrics['g'].samples, [
        Sample('g', labels_with('pid', '0'), 1.0),
        Sample('g', labels_with('pid', '1'), 1.0),
    ])

    # Histogram buckets accumulate across pids; sort by (name, le).
    metrics['h'].samples.sort(
        key=lambda s: (s[0], float(s[1].get('le', 0)))
    )
    bucket_counts = [
        ('0.005', 0.0), ('0.01', 0.0), ('0.025', 0.0), ('0.05', 0.0),
        ('0.075', 0.0), ('0.1', 0.0), ('0.25', 0.0), ('0.5', 0.0),
        ('0.75', 0.0), ('1.0', 1.0), ('2.5', 1.0), ('5.0', 2.0),
        ('7.5', 2.0), ('10.0', 2.0), ('+Inf', 2.0),
    ]
    expected_histogram = [
        Sample('h_bucket', labels_with('le', le), value)
        for le, value in bucket_counts
    ]
    expected_histogram.append(Sample('h_count', labels, 2.0))
    expected_histogram.append(Sample('h_sum', labels, 6.0))
    self.assertEqual(metrics['h'].samples, expected_histogram)
def test_reset_registry_with_labels(self):
    """registry.reset() must zero every labelled child of every metric type."""
    registry = CollectorRegistry()

    gauge = Gauge('g', 'help', ['l'], registry=registry)
    gauge.labels('a').inc()
    self.assertEqual(1, registry.get_sample_value('g', {'l': 'a'}))

    counter = Counter('c_total', 'help', ['l'], registry=registry)
    counter.labels('a').inc()
    self.assertEqual(1, registry.get_sample_value('c_total', {'l': 'a'}))

    summary = Summary('s', 'help', ['l'], registry=registry)
    summary.labels('a').observe(10)
    self.assertEqual(1, registry.get_sample_value('s_count', {'l': 'a'}))
    self.assertEqual(10, registry.get_sample_value('s_sum', {'l': 'a'}))

    histogram = Histogram('h', 'help', ['l'], registry=registry)
    histogram.labels('a').observe(2)
    # 2 lands in the 2.5 bucket and all larger ones, but not in le=1.0.
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '1.0', 'l': 'a'}))
    for le in ('2.5', '5.0', '+Inf'):
        self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': le, 'l': 'a'}))
    self.assertEqual(1, registry.get_sample_value('h_count', {'l': 'a'}))
    self.assertEqual(2, registry.get_sample_value('h_sum', {'l': 'a'}))

    registry.reset()

    # Every previously incremented series must read back as zero.
    for name in ('g', 'c_total', 's_count', 's_sum'):
        self.assertEqual(0, registry.get_sample_value(name, {'l': 'a'}))
    for le in ('1.0', '2.5', '5.0', '+Inf'):
        self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': le, 'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_count', {'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_sum', {'l': 'a'}))
def buckets_to_metrics(self, metric_name, buckets):
    """Expose a raw libmedida bucket metric as _count/_sum/_bucket series.

    Boundaries are normalised to seconds and the per-range bucket counts
    are converted into the cumulative counts Prometheus expects for "le".
    """
    unit = buckets['boundary_unit']
    description = 'libmedida metric type: ' + buckets['type']

    count_metric = Counter(metric_name + '_count', description,
                           self.label_names, registry=self.registry)
    sum_metric = Counter(metric_name + '_sum', description,
                         self.label_names, registry=self.registry)
    bucket_metric = Gauge(metric_name + '_bucket', description,
                          self.label_names + ['le'], registry=self.registry)

    # Normalise boundaries to seconds so buckets sort numerically.
    measurements = [
        {
            'boundary': self.duration_to_seconds(raw['boundary'], unit),
            'count': raw['count'],
            'sum': raw['sum'],
        }
        for raw in buckets['buckets']
    ]

    cumulative = 0
    for measurement in sorted(measurements, key=lambda m: m['boundary']):
        # Buckets from core contain only values from their respective ranges.
        # Prometheus expects "le" buckets to be cumulative, so keep a running total.
        cumulative += measurement['count']
        count_metric.labels(*self.labels).inc(measurement['count'])
        sum_metric.labels(*self.labels).inc(self.duration_to_seconds(
            measurement['sum'], unit))
        # Treat buckets larger than 30d as infinity
        if float(measurement['boundary']) > 30 * 86400:
            bucket_metric.labels(*self.labels + ['+Inf']).inc(cumulative)
        else:
            bucket_metric.labels(*self.labels + [measurement['boundary']]).inc(cumulative)
def metrics():
    """Scrape one CITA-Cloud node over gRPC and render Prometheus metrics.

    Builds a fresh registry per request, populates all node/chain gauges
    from gRPC calls, and returns a Flask ``Response`` with the text
    exposition produced by ``prometheus_client.generate_latest``.
    """
    # Gauge definitions — a new registry per request so series from a
    # previous scrape never leak into this one.
    registry = CollectorRegistry(auto_describe=False)
    service_status = Gauge("Node_Get_ServiceStatus", SERVICE_STATUS_TITLE,
                           ["NodeIP", "NodePort"], registry=registry)
    genesis_block_details = Gauge("Node_Get_GenesisBlockNumberDetails", GENESIS_BLOCK_DETAILS_TITLE,
                                  ["NodeIP", "NodePort", "GenesisBlockNumberHash"], registry=registry)
    chain_info = Gauge("Node_Get_ChainInfo", CHAIN_INFO_TITLE,
                       ["NodeIP", "NodePort", "ChainName", "Operator", "TokenName",
                        "TokenSymbol", "Version"], registry=registry)
    node_peers = Gauge("Node_Get_NodePeers", NODE_PEERS_TITLE,
                       ["NodeIP", "NodePort"], registry=registry)
    chain_nodes = Gauge("Node_Get_ChainNodes", CHAIN_NODES_TITLE,
                        ["NodeIP", "NodePort"], registry=registry)
    last_block_number = Gauge("Node_Get_LastBlockNumber", LAST_BLOCK_NUMBER_TITLE,
                              ["NodeIP", "NodePort", "GenesisBlockNumberHash",
                               "NodeID", "NodeAddress"], registry=registry)
    check_proposer = Gauge("Node_CheckProposer", CHECK_PROPOSER_TITLE,
                           ["NodeIP", "NodePort"], registry=registry)
    last_block_details = Gauge("Node_Get_LastBlockNumberDetails", LAST_BLOCK_DETAILS_TITLE,
                               ["NodeIP", "NodePort", "LastBlocknumber", "LastBlockProposer",
                                "LastBlockHash", "NodeID", "HostPlatform", "HostName",
                                "ConsensusStatus", "SoftVersion"], registry=registry)
    vote_node = Gauge("Node_Get_VoteNode", VOTE_NODE_TITLE,
                      ["NodeIP", "NodePort", "NodeID", "Voter"], registry=registry)
    block_height_difference = Gauge("Node_Get_BlockDifference", BLOCK_HEIGHT_DIFFERENCE_TITLE,
                                    ["NodeIP", "NodePort", "CurrentHeight",
                                     "PreviousHeight"], registry=registry)
    block_interval = Gauge("Node_Get_BlockTimeDifference", BLOCK_INTERVAL_TITLE,
                           ["NodeIP", "NodePort"], registry=registry)
    last_block_transactions = Gauge("Node_Get_LastBlockNumberTransactions",
                                    LAST_BLOCK_TRANSACTIONS_TITLE,
                                    ["NodeIP", "NodePort"], registry=registry)
    last_block_quota_used = Gauge("Node_Get_LastBlockNumberQuotaUsed",
                                  LAST_BLOCK_QUOTA_USED_TITLE,
                                  ["NodeIP", "NodePort"], registry=registry)
    chain_quota_price = Gauge("Node_Get_QuotaPrice", CHAIN_QUOTA_PRICE_TITLE,
                              ["NodeIP", "NodePort"], registry=registry)
    block_quota_limit = Gauge("Node_Get_BlockQuotaLimit", BLOCK_QUOTA_LIMIT_TITLE,
                              ["NodeIP", "NodePort"], registry=registry)
    local_voter = Gauge("Node_Get_LocalVoter", LOCAL_VOTE_TITLE,
                        ["NodeIP", "NodePort"], registry=registry)
    vote_number = Gauge("Block_Vote_Number", BLOCK_VOTE_NUMBER_TITLE,
                        ["NodeIP", "NodePort"], registry=registry)

    # Run the exporter: all data comes from the node's gRPC interface.
    grpc_wrapper = GrpcWrapper(GRPC_HOST, GRPC_PORT)
    node_address = grpc_wrapper.get_node_address()
    logger.debug("Node Address: %s" % (node_address))

    ## Exporter Status — unconditionally 1; reaching this line means the
    ## gRPC wrapper was constructed (errors would have raised earlier).
    service_status.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(1)

    ## Genesis Block — NOTE(review): the hash exposed is the genesis
    ## header's *prevhash* field decoded from base64; confirm that is the
    ## intended "genesis block hash".
    genesis_block_info = grpc_wrapper.GetBlockByNumber(0)
    genesis_block_hash = base64.b64decode(genesis_block_info['header']['prevhash']).hex()
    genesis_block_time = genesis_block_info['header']['timestamp']
    genesis_block_details.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT,
                                 GenesisBlockNumberHash=genesis_block_hash).set(genesis_block_time)
    logger.debug("Genesis Block - Hash: %s, Time: %s" % (genesis_block_hash, genesis_block_time))

    ## Last Block
    block_number_info = grpc_wrapper.block_number()
    last_block_number_int = int(block_number_info["blockNumber"])
    prev_block_number_int = last_block_number_int - 1
    last_block_number.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT, NodeID=node_address,
                             GenesisBlockNumberHash=genesis_block_hash,
                             NodeAddress=node_address).set(last_block_number_int)
    logger.debug("Block Number - Last: %s, Previous: %s" % (last_block_number_int, prev_block_number_int))

    ## Metadata — several fields are placeholders until the metadata RPC
    ## exposes them (see TODOs); only the version is read today.
    metadata_info = grpc_wrapper.metadata()
    chain_name = None  # TODO metadata_info['chainName']
    operator = None  # TODO metadata_info['operator']
    token_name = None  # TODO metadata_info['tokenName']
    token_symbol = None  # TODO metadata_info['tokenSymbol']
    economical_model = 0  # TODO metadata_info['economicalModel']
    chain_version = metadata_info['version']
    chain_info.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT, ChainName=chain_name,
                      Operator=operator, TokenName=token_name, TokenSymbol=token_symbol,
                      Version=chain_version).set(economical_model)

    ## Chain Nodes
    consensus_node_list = [
        base64.b64decode(validator).hex()
        for validator in metadata_info['validators']]
    consensus_node_count = len(consensus_node_list)
    chain_nodes.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(consensus_node_count)

    ## Vote / commit information for the latest block.
    block_info = grpc_wrapper.GetBlockByNumber(last_block_number_int)
    previous_block_info = grpc_wrapper.GetBlockByNumber(prev_block_number_int)
    block_head_info = block_info['header']
    # block_commits is a stub: every vote gauge below reads as 0 until the
    # commits list is wired up (see TODO).
    block_commits = []  # TODO list(block_info['header']['proof']['Bft']['commits'].keys())
    consensus_nodes_count = len(consensus_node_list)
    for i in range(consensus_nodes_count):
        voter_address = consensus_node_list[i]
        vote_status = 1 if voter_address in block_commits else 0
        vote_node.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT, NodeID=node_address,
                         Voter=voter_address).set(vote_status)
    is_committer = 1 if node_address in block_commits else 0
    local_voter.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(is_committer)
    block_vote_number = len(block_commits)
    vote_number.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(block_vote_number)
    logger.debug("Vote Number - block_vote_number: %s " % (block_vote_number))

    ## Last block details.
    last_block_hash_base64 = grpc_wrapper.cli_request(
        "GetBlockHash",
        CitaCloudController.BlockNumber(block_number=last_block_number_int))['hash']
    last_block_hash = base64.b64decode(last_block_hash_base64).hex()
    block_time = int(block_head_info['timestamp'])
    block_proposer = base64.b64decode(block_head_info['proposer']).hex()
    previous_block_time = int(previous_block_info['header']['timestamp'])
    consensus = 1 if node_address in consensus_node_list else 0
    node_software_version = grpc_wrapper.cli_request("GetVersion", common.Empty())["version"]
    last_block_details.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT, NodeID=node_address,
                              LastBlocknumber=last_block_number_int,
                              LastBlockProposer=block_proposer,
                              LastBlockHash=last_block_hash,
                              HostPlatform=EXPORTER_PLATFORM, HostName=AGENT_NAME,
                              ConsensusStatus=consensus,
                              SoftVersion=node_software_version).set(block_time)
    logger.debug("Last Block Details - Last Block Hash: %s " % (last_block_hash))

    # NOTE(review): both of the following gauges are set to the *time*
    # interval between the last two blocks, even though
    # Node_Get_BlockDifference sounds like a height difference — confirm
    # this is intentional (the sibling exporter() does the same).
    interval = abs(block_time - previous_block_time)
    block_height_difference.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT,
                                   CurrentHeight=last_block_number_int,
                                   PreviousHeight=prev_block_number_int).set(interval)
    block_interval.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(interval)

    ## Last Block Transactions
    block_transactions = len(block_info.get('body').get('txHashes'))
    last_block_transactions.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(block_transactions)

    ## Last Block Quota Used — hex string in the header when present.
    if block_head_info.get('quotaUsed'):
        block_quota_used = int(block_head_info['quotaUsed'], 16)
    else:
        block_quota_used = 0  # TODO int(block_head_info['gasUsed'], 16) #Get the previous version of CITA v0.19.1 gasUsed
    last_block_quota_used.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(block_quota_used)

    ## Check Proposer — 1 when this node proposed the latest block.
    proposer = 1 if node_address == block_proposer else 0
    check_proposer.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(proposer)
    logger.debug("CheckProposer - Node Address: %s, Block Proposer: %s" % (node_address, block_proposer))

    # Peer Info
    peer_count = grpc_wrapper.cli_request("GetPeerCount", common.Empty())["peerCount"]
    node_peers.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(peer_count)

    ## Quota Price — disabled; the RPC is not available here yet.
    # quota_price = grpc_wrapper.quota_price()
    # price = quota_price
    # chain_quota_price.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(int(price, 16))

    ## Block Limit — disabled; the RPC is not available here yet.
    # block_limit = grpc_wrapper.block_limit()
    # limit = block_limit
    # block_quota_limit.labels(NodeIP=GRPC_HOST, NodePort=GRPC_PORT).set(int(limit, 16))

    # Response — plain-text Prometheus exposition format.
    return Response(prometheus_client.generate_latest(registry), mimetype="text/plain")
def do_GET(self):
    """Handle a scrape request.

    Pulls stellar-core's /metrics and /info JSON endpoints, translates
    them into Prometheus metrics on ``self.registry``, and writes the
    text exposition back to the HTTP client. On any upstream or parse
    failure the handler responds with 5xx via ``self.error`` and returns.
    """
    import calendar  # local: used for the portable startedOn conversion below

    self.set_vars()

    ###########################################
    # Export metrics from the /metrics endpoint
    ###########################################
    try:
        response = requests.get(self.metrics_url)
    except requests.ConnectionError:
        self.error(504, 'Error retrieving data from {}'.format(self.metrics_url))
        return
    if not response.ok:
        self.error(504, 'Error retrieving data from {}'.format(self.metrics_url))
        return
    try:
        metrics = response.json()['metrics']
    except ValueError:
        self.error(500, 'Error parsing metrics JSON data')
        return

    # iterate over all metrics
    for k in metrics:
        # Raw strings: '\.'/'\s' in a plain literal are invalid escape
        # sequences (DeprecationWarning today, SyntaxError eventually).
        metric_name = re.sub(r'\.|-|\s', '_', k).lower()
        metric_name = 'stellar_core_' + metric_name
        if metrics[k]['type'] == 'timer':
            # we have a timer, expose as a Prometheus Summary
            # we convert stellar-core time units to seconds, as per Prometheus best practices
            metric_name = metric_name + '_seconds'
            if 'sum' in metrics[k]:
                # use libmedida sum value
                total_duration = metrics[k]['sum']
            else:
                # compute sum value
                total_duration = (metrics[k]['mean'] * metrics[k]['count'])
            c = Counter(metric_name + '_count',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names, registry=self.registry)
            c.labels(*self.labels).inc(metrics[k]['count'])
            s = Counter(metric_name + '_sum',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names, registry=self.registry)
            s.labels(*self.labels).inc(
                self.duration_to_seconds(total_duration, metrics[k]['duration_unit']))
            # add stellar-core calculated quantiles to our summary
            summary = Gauge(metric_name,
                            'libmedida metric type: ' + metrics[k]['type'],
                            self.label_names + ['quantile'], registry=self.registry)
            summary.labels(*self.labels + ['0.75']).set(
                self.duration_to_seconds(metrics[k]['75%'], metrics[k]['duration_unit']))
            summary.labels(*self.labels + ['0.99']).set(
                self.duration_to_seconds(metrics[k]['99%'], metrics[k]['duration_unit']))
        elif metrics[k]['type'] == 'histogram':
            # 'elif' (was a separate 'if'): a metric has exactly one type,
            # so the chain is mutually exclusive either way.
            if 'count' not in metrics[k]:
                # Stellar-core version too old, we don't have required data
                continue
            c = Counter(metric_name + '_count',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names, registry=self.registry)
            c.labels(*self.labels).inc(metrics[k]['count'])
            s = Counter(metric_name + '_sum',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names, registry=self.registry)
            s.labels(*self.labels).inc(metrics[k]['sum'])
            # add stellar-core calculated quantiles to our summary
            summary = Gauge(metric_name,
                            'libmedida metric type: ' + metrics[k]['type'],
                            self.label_names + ['quantile'], registry=self.registry)
            summary.labels(*self.labels + ['0.75']).set(metrics[k]['75%'])
            summary.labels(*self.labels + ['0.99']).set(metrics[k]['99%'])
        elif metrics[k]['type'] == 'counter':
            # we have a counter, this is a Prometheus Gauge
            g = Gauge(metric_name, 'libmedida metric type: ' + metrics[k]['type'],
                      self.label_names, registry=self.registry)
            g.labels(*self.labels).set(metrics[k]['count'])
        elif metrics[k]['type'] == 'meter':
            # we have a meter, this is a Prometheus Counter
            c = Counter(metric_name, 'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names, registry=self.registry)
            c.labels(*self.labels).inc(metrics[k]['count'])

    ########################################
    # Export metrics from the info endpoint
    ########################################
    try:
        response = requests.get(self.info_url)
    except requests.ConnectionError:
        self.error(504, 'Error retrieving data from {}'.format(self.info_url))
        return
    if not response.ok:
        self.error(504, 'Error retrieving data from {}'.format(self.info_url))
        return
    try:
        info = response.json()['info']
    except ValueError:
        self.error(500, 'Error parsing info JSON data')
        return
    if not all(i in info for i in self.info_keys):
        self.error(500, 'Error - info endpoint did not return all required fields')
        return

    # Ledger metrics
    for core_name, prom_name in self.ledger_metrics.items():
        g = Gauge('stellar_core_ledger_{}'.format(prom_name),
                  'Stellar core ledger metric name: {}'.format(core_name),
                  self.label_names, registry=self.registry)
        g.labels(*self.labels).set(info['ledger'][core_name])

    # Version 11.2.0 and later report quorum metrics in the following format:
    # "quorum" : {
    #   "qset" : {
    #     "agree": 3
    #
    # Older versions use this format:
    # "quorum" : {
    #   "758110" : {
    #     "agree" : 3,
    if 'qset' in info['quorum']:
        tmp = info['quorum']['qset']
    else:
        # BUG FIX: dict.values() returns a non-subscriptable view on
        # Python 3, so .values()[0] raised TypeError. Take the first
        # (only) per-ledger entry via an iterator instead.
        tmp = next(iter(info['quorum'].values()), None)
    if not tmp:
        self.error(500, 'Error - missing quorum data')
        return
    for metric in self.quorum_metrics:
        g = Gauge('stellar_core_quorum_{}'.format(metric),
                  'Stellar core quorum metric: {}'.format(metric),
                  self.label_names, registry=self.registry)
        g.labels(*self.labels).set(tmp[metric])
    for metric in self.quorum_phase_metrics:
        g = Gauge('stellar_core_quorum_phase_{}'.format(metric),
                  'Stellar core quorum phase {}'.format(metric),
                  self.label_names, registry=self.registry)
        # One-hot: exactly the gauge matching the current phase is 1.
        if tmp['phase'].lower() == metric:
            g.labels(*self.labels).set(1)
        else:
            g.labels(*self.labels).set(0)

    # Versions >=11.2.0 expose more info about quorum
    if 'transitive' in info['quorum']:
        g = Gauge('stellar_core_quorum_transitive_intersection',
                  'Stellar core quorum transitive intersection',
                  self.label_names, registry=self.registry)
        if info['quorum']['transitive']['intersection']:
            g.labels(*self.labels).set(1)
        else:
            g.labels(*self.labels).set(0)
        g = Gauge('stellar_core_quorum_transitive_last_check_ledger',
                  'Stellar core quorum transitive last_check_ledger',
                  self.label_names, registry=self.registry)
        g.labels(*self.labels).set(info['quorum']['transitive']['last_check_ledger'])
        g = Gauge('stellar_core_quorum_transitive_node_count',
                  'Stellar core quorum transitive node_count',
                  self.label_names, registry=self.registry)
        g.labels(*self.labels).set(info['quorum']['transitive']['node_count'])

    # Peers metrics
    g = Gauge('stellar_core_peers_authenticated_count',
              'Stellar core authenticated_count count',
              self.label_names, registry=self.registry)
    g.labels(*self.labels).set(info['peers']['authenticated_count'])
    g = Gauge('stellar_core_peers_pending_count',
              'Stellar core pending_count count',
              self.label_names, registry=self.registry)
    g.labels(*self.labels).set(info['peers']['pending_count'])

    g = Gauge('stellar_core_protocol_version',
              'Stellar core protocol_version',
              self.label_names, registry=self.registry)
    g.labels(*self.labels).set(info['protocol_version'])

    for metric in self.state_metrics:
        name = re.sub(r'\s', '_', metric)
        g = Gauge('stellar_core_{}'.format(name),
                  'Stellar core state {}'.format(metric),
                  self.label_names, registry=self.registry)
        if info['state'].lower().startswith(metric):  # Use startswith to work around "!"
            g.labels(*self.labels).set(1)
        else:
            g.labels(*self.labels).set(0)

    g = Gauge('stellar_core_started_on', 'Stellar core start time in epoch',
              self.label_names, registry=self.registry)
    date = datetime.strptime(info['startedOn'], "%Y-%m-%dT%H:%M:%SZ")
    # BUG FIX: startedOn is UTC (trailing 'Z'). strftime('%s') is a
    # non-portable glibc extension and converts in the *local* timezone;
    # calendar.timegm interprets the parsed struct_time as UTC.
    g.labels(*self.labels).set(calendar.timegm(date.timetuple()))

    output = generate_latest(self.registry)
    if not output:
        self.error(500, 'Error - no metrics were genereated')
        return
    self.send_response(200)
    self.send_header('Content-Type', CONTENT_TYPE_LATEST)
    self.end_headers()
    self.wfile.write(output)
def write(
    self,
    path,
    groups_scanned=0,
    projects_scanned=0,
    projects_skipped=0,
    projects_no_language=0,
):
    """Populate the registry with scan gauges and write them to a textfile.

    Args:
        path: Target file, or a directory (then ``metrics.txt`` inside it).
        groups_scanned: Total groups scanned.
        projects_scanned: Total projects scanned; divisor for the
            per-language relative percentages.
        projects_skipped: Total projects skipped.
        projects_no_language: Projects where no language was detected.
    """
    total_languages = len(self.metrics)
    total_percent = sum(float(percent) for _, percent in self.metrics.items())
    logger.debug(f"{total_percent}% total scanned")

    # BUG FIX: projects_scanned defaults to 0, which previously raised
    # ZeroDivisionError whenever any language had been recorded.
    if projects_scanned:
        relative_languages = {
            language_name: (float(language) / projects_scanned)
            for language_name, language in self.metrics.items()
        }
    else:
        relative_languages = {language_name: 0.0 for language_name in self.metrics}

    gauge = Gauge(
        "languages_percent",
        "Languages scanned in percent",
        labelnames=["language"],
        registry=self.registry,
    )
    for language_name, language in relative_languages.items():
        logger.info(f"Adding {language_name} as label")
        gauge.labels(language_name).set(round(language, 2))

    total_languages_scanned_gauge = Gauge("languages_scanned_total",
                                          "Total languages scanned",
                                          registry=self.registry)
    total_languages_scanned_gauge.set(total_languages)
    project_scanned_gauge = Gauge("projects_scanned_total",
                                  "Total projects scanned",
                                  registry=self.registry)
    project_scanned_gauge.set(projects_scanned)
    projects_skipped_gauge = Gauge("projects_skipped_total",
                                   "Total projects skipped",
                                   registry=self.registry)
    projects_skipped_gauge.set(projects_skipped)
    projects_no_language_gauge = Gauge(
        "projects_no_language_total",
        "Projects without language detected",
        registry=self.registry,
    )
    projects_no_language_gauge.set(projects_no_language)
    groups_scanned_gauge = Gauge("groups_scanned_total",
                                 "Total groups scanned",
                                 registry=self.registry)
    groups_scanned_gauge.set(groups_scanned)

    path = Path(path)
    if path.is_dir():
        path = path / "metrics.txt"
    # Only attempt the write when the parent directory exists.
    if path.parent.exists():
        write_to_textfile(path, self.registry)
        logger.info(f"Metrics written to {path}")
    else:
        logger.error(f"Could not write metrics to {path}")
def exporter():
    """Scrape one CITA node over JSON-RPC and render Prometheus metrics.

    Builds a fresh registry per request, populates node/chain/disk gauges,
    and returns a Flask ``Response`` with the text exposition format.
    Failed RPCs are printed and their gauges are simply skipped.
    """
    # Gauge definitions — a new registry per request so series from a
    # previous scrape never leak into this one.
    registry = CollectorRegistry(auto_describe=False)
    service_status = Gauge("Node_Get_ServiceStatus", SERVICE_STATUS_TITLE,
                           ["NodeIP", "NodePort"], registry=registry)
    genesis_block_details = Gauge("Node_Get_GenesisBlockNumberDetails", GENESIS_BLOCK_DETAILS_TITLE,
                                  ["NodeIP", "NodePort", "GenesisBlockNumberHash"], registry=registry)
    chain_info = Gauge("Node_Get_ChainInfo", CHAIN_INFO_TITLE,
                       ["NodeIP", "NodePort", "ChainName", "Operator", "TokenName",
                        "TokenSymbol", "Version"], registry=registry)
    node_peers = Gauge("Node_Get_NodePeers", NODE_PEERS_TITLE,
                       ["NodeIP", "NodePort"], registry=registry)
    chain_nodes = Gauge("Node_Get_ChainNodes", CHAIN_NODES_TITLE,
                        ["NodeIP", "NodePort"], registry=registry)
    last_block_number = Gauge("Node_Get_LastBlockNumber", LAST_BLOCK_NUMBER_TITLE,
                              ["NodeIP", "NodePort", "GenesisBlockNumberHash",
                               "NodeID", "NodeAddress"], registry=registry)
    check_proposer = Gauge("Node_CheckProposer", CHECK_PROPOSER_TITLE,
                           ["NodeIP", "NodePort"], registry=registry)
    last_block_details = Gauge("Node_Get_LastBlockNumberDetails", LAST_BLOCK_DETAILS_TITLE, [
        "NodeIP", "NodePort", "LastBlocknumber", "LastBlockProposer",
        "LastBlockHash", "NodeID", "HostPlatform", "HostName",
        "ConsensusStatus", "SoftVersion"
    ], registry=registry)
    vote_node = Gauge("Node_Get_VoteNode", VOTE_NODE_TITLE,
                      ["NodeIP", "NodePort", "NodeID", "Voter"], registry=registry)
    block_height_difference = Gauge("Node_Get_BlockDifference", BLOCK_HEIGHT_DIFFERENCE_TITLE,
                                    ["NodeIP", "NodePort", "CurrentHeight",
                                     "PreviousHeight"], registry=registry)
    dir_total_size = Gauge("Node_Get_DirInfo_TotalFileSize", NODE_DIR_TOTAL_SIZE_TITLE,
                           ["NodeIP", "NodePort", "NodeDir"], registry=registry)
    dir_data_size = Gauge("Node_Get_DirInfo_DataFileSize", NODE_DIR_DATA_SIZE_TITLE,
                          ["NodeIP", "NodePort", "NodeDir"], registry=registry)
    disk_used_size = Gauge("Node_Get_DiskInfo_UsedSize", NODE_DISK_USED_SIZE_TITLE,
                           ["NodeIP", "NodePort", "NodeDir"], registry=registry)
    disk_free_size = Gauge("Node_Get_DiskInfo_FreeSize", NODE_DISK_FREE_SIZE_TITLE,
                           ["NodeIP", "NodePort", "NodeDir"], registry=registry)
    block_interval = Gauge("Node_Get_BlockTimeDifference", BLOCK_INTERVAL_TITLE,
                           ["NodeIP", "NodePort"], registry=registry)
    last_block_transactions = Gauge("Node_Get_LastBlockNumberTransactions",
                                    LAST_BLOCK_TRANSACTIONS_TITLE,
                                    ["NodeIP", "NodePort"], registry=registry)
    last_block_quota_used = Gauge("Node_Get_LastBlockNumberQuotaUsed",
                                  LAST_BLOCK_QUOTA_USED_TITLE,
                                  ["NodeIP", "NodePort"], registry=registry)
    chain_quota_price = Gauge("Node_Get_QuotaPrice", CHAIN_QUOTA_PRICE_TITLE,
                              ["NodeIP", "NodePort"], registry=registry)
    block_quota_limit = Gauge("Node_Get_BlockQuotaLimit", BLOCK_QUOTA_LIMIT_TITLE,
                              ["NodeIP", "NodePort"], registry=registry)
    local_voter = Gauge("Node_Get_LocalVoter", LOCAL_VOTE_TITLE,
                        ["NodeIP", "NodePort"], registry=registry)
    vote_number = Gauge("Block_Vote_Number", BLOCK_VOTE_NUMBER_TITLE,
                        ["NodeIP", "NodePort"], registry=registry)

    # Run exporter. NODE is "host:port".
    node_ip = str(NODE.split(':')[0])
    node_port = str(NODE.split(':')[1])

    # Liveness check: if no cita-chain process is running, report status 0
    # and short-circuit — no RPCs are attempted.
    check_process = os.popen("ps alx |grep 'cita-chain' |grep -c -v grep")
    if check_process.read() == '0\n':
        service_status.labels(NodeIP=node_ip, NodePort=node_port).set(0)
        return Response(prometheus_client.generate_latest(registry), mimetype="text/plain")
    service_status.labels(NodeIP=node_ip, NodePort=node_port).set(1)
    class_result = ExporterFunctions(node_ip, node_port)

    # Disk/dir sizes: dir_analysis() populates the module-level FILE_TOTAL_SIZE,
    # DATA_TOTAL_SIZE, DISK_USED and DISK_FREE globals read below.
    dir_analysis(NODE_FILE_PATH)
    dir_total_size.labels(
        NodeIP=node_ip,
        NodePort=node_port,
        NodeDir=NODE_FILE_PATH,
    ).set(FILE_TOTAL_SIZE)
    dir_data_size.labels(
        NodeIP=node_ip,
        NodePort=node_port,
        NodeDir=NODE_FILE_PATH,
    ).set(DATA_TOTAL_SIZE)
    disk_used_size.labels(
        NodeIP=node_ip,
        NodePort=node_port,
        NodeDir=NODE_FILE_PATH,
    ).set(DISK_USED)
    disk_free_size.labels(
        NodeIP=node_ip,
        NodePort=node_port,
        NodeDir=NODE_FILE_PATH,
    ).set(DISK_FREE)

    # Genesis block. NOTE(review): genesis_block_hash is only bound in the
    # success branch but is read again in the last_block_number block below
    # — a failed genesis RPC followed by a successful block_number RPC
    # raises NameError. Confirm/harden upstream.
    genesis_block_info = class_result.block_number_detail('0x0')
    if 'result' in genesis_block_info:
        genesis_block_hash = genesis_block_info['result']['hash']
        genesis_block_time = genesis_block_info['result']['header']['timestamp']
        genesis_block_details.labels(NodeIP=node_ip, NodePort=node_port,
                                     GenesisBlockNumberHash=genesis_block_hash).set(genesis_block_time)
    else:
        print(genesis_block_info)

    # Current block height (hex string). NOTE(review): hex_number /
    # previous_hex_number are likewise only bound on success but used
    # unconditionally afterwards.
    block_number_info = class_result.block_number()
    if 'result' in block_number_info:
        hex_number = block_number_info['result']
        previous_hex_number = hex(int(hex_number, 16) - 1)
        last_block_number.labels(NodeIP=node_ip, NodePort=node_port,
                                 GenesisBlockNumberHash=genesis_block_hash,
                                 NodeID=NODE_ID, NodeAddress=ADDRESS).set(int(hex_number, 16))
    else:
        print(block_number_info)

    # Chain metadata; economical model is exposed as the gauge value.
    metadata_info = class_result.metadata(hex_number)
    if 'result' in metadata_info:
        chain_name = metadata_info['result']['chainName']
        operator = metadata_info['result']['operator']
        token_name = metadata_info['result']['tokenName']
        token_symbol = metadata_info['result']['tokenSymbol']
        economical_model = metadata_info['result']['economicalModel']
        chain_version = metadata_info['result']['version']
        chain_info.labels(NodeIP=node_ip, NodePort=node_port, ChainName=chain_name,
                          Operator=operator, TokenName=token_name,
                          TokenSymbol=token_symbol, Version=chain_version).set(economical_model)
        consensus_node_list = metadata_info['result']['validators']
        consensus_node_count = len(consensus_node_list)
        chain_nodes.labels(NodeIP=node_ip, NodePort=node_port).set(consensus_node_count)
    else:
        print(metadata_info)

    # Latest and previous block details (votes, proposer, quota, interval).
    block_info = class_result.block_number_detail(hex_number)
    previous_block_info = class_result.block_number_detail(previous_hex_number)
    if 'result' in block_info and 'result' in previous_block_info:
        block_head_info = block_info['result']['header']
        if block_head_info.get('quotaUsed'):
            block_quota_used = int(block_head_info['quotaUsed'], 16)
        else:
            #Get the previous version of CITA v0.19.1 gasUsed
            # NOTE(review): the bare .get() call below is a no-op; only the
            # int(...) on the next line actually reads gasUsed.
            block_head_info.get('gasUsed')
            block_quota_used = int(block_head_info['gasUsed'], 16)
        block_commits = list(block_info['result']['header']['proof']['Bft']['commits'].keys())
        block_vote_number = len(block_commits)
        # Per-validator vote status for the latest block.
        consensus_nodes_count = len(consensus_node_list)
        for i in range(consensus_nodes_count):
            voter_address = consensus_node_list[i]
            if voter_address in block_commits:
                vote_status = 1
            else:
                vote_status = 0
            vote_node.labels(NodeIP=node_ip, NodePort=node_port, NodeID=NODE_ID,
                             Voter=voter_address).set(vote_status)
        if ADDRESS in block_commits:
            is_committer = 1
        else:
            is_committer = 0
        vote_number.labels(NodeIP=node_ip, NodePort=node_port).set(block_vote_number)
        local_voter.labels(NodeIP=node_ip, NodePort=node_port).set(is_committer)
        block_hash = block_info['result']['hash']
        block_time = int(block_head_info['timestamp'])
        block_transactions = int(len(block_info['result']['body']['transactions']))
        block_proposer = block_head_info['proposer']
        previous_block_time = int(previous_block_info['result']['header']['timestamp'])
        interval = abs(block_time - previous_block_time)
        if ADDRESS in consensus_node_list:
            consensus = 1
        else:
            consensus = 0
        # Software version from a shell command; IndexError covers an
        # unexpected output shape.
        try:
            soft_version_exec = os.popen(SOFT_VERSION_TXT)
            soft_version = str(soft_version_exec.read().split(' ')[1].split('\n')[0])
        except IndexError:
            soft_version = 'null'
        last_block_details.labels(NodeIP=node_ip, NodePort=node_port,
                                  LastBlocknumber=int(hex_number, 16),
                                  LastBlockProposer=block_proposer,
                                  LastBlockHash=block_hash, NodeID=NODE_ID,
                                  HostPlatform=EXPORTER_PLATFORM, HostName=AGENT_NAME,
                                  ConsensusStatus=consensus,
                                  SoftVersion=soft_version).set(block_time)
        # NOTE(review): both gauges below carry the *time* interval, even
        # though Node_Get_BlockDifference sounds like a height difference —
        # confirm intent (the sibling metrics() does the same).
        block_height_difference.labels(NodeIP=node_ip, NodePort=node_port,
                                       CurrentHeight=int(hex_number, 16),
                                       PreviousHeight=int(previous_hex_number, 16)).set(interval)
        block_interval.labels(NodeIP=node_ip, NodePort=node_port).set(interval)
        last_block_transactions.labels(NodeIP=node_ip, NodePort=node_port).set(block_transactions)
        last_block_quota_used.labels(NodeIP=node_ip, NodePort=node_port).set(block_quota_used)
        if ADDRESS == block_proposer:
            proposer = 1
        else:
            proposer = 0
        check_proposer.labels(NodeIP=node_ip, NodePort=node_port).set(proposer)
    else:
        print(block_info)
        print(previous_block_info)

    # Peer count (hex string in 'result').
    peer_info = class_result.peer_count()
    if 'result' in peer_info:
        peers = peer_info['result']
        node_peers.labels(NodeIP=node_ip, NodePort=node_port).set(int(peers, 16))
    else:
        print(peer_info)

    # Quota price (hex string in 'result').
    quota_price = class_result.quota_price()
    if 'result' in quota_price:
        price = quota_price['result']
        chain_quota_price.labels(NodeIP=node_ip, NodePort=node_port).set(int(price, 16))
    else:
        print(quota_price)

    # Block quota limit (hex string in 'result').
    block_limit = class_result.block_limit()
    if 'result' in block_limit:
        limit = block_limit['result']
        block_quota_limit.labels(NodeIP=node_ip, NodePort=node_port).set(int(limit, 16))
    else:
        print(block_limit)

    # Plain-text Prometheus exposition format.
    return Response(prometheus_client.generate_latest(registry), mimetype="text/plain")
def do_GET(self):
    """Handle a scrape request.

    Pulls the digitalbits-core /metrics, info and getcursors endpoints,
    converts libmedida metric types to Prometheus types in a fresh
    registry, and writes the rendered exposition text to the client.
    Any upstream failure is reported via self.error() and aborts the
    scrape.
    """
    self.set_vars()
    ###########################################
    # Export metrics from the /metrics endpoint
    ###########################################
    try:
        response = requests.get(self.metrics_url)
    except requests.ConnectionError:
        self.error(
            504, 'Error retrieving data from {}'.format(self.metrics_url))
        return
    if not response.ok:
        self.error(
            504, 'Error retrieving data from {}'.format(self.metrics_url))
        return
    try:
        metrics = response.json()['metrics']
    except ValueError:
        self.error(500, 'Error parsing metrics JSON data')
        return
    # iterate over all metrics
    for k in metrics:
        # sanitize the libmedida name into a Prometheus-legal one
        metric_name = re.sub(r'\.|-|\s', '_', k).lower()
        metric_name = 'digitalbits_core_' + metric_name
        if metrics[k]['type'] == 'timer':
            # we have a timer, expose as a Prometheus Summary
            # we convert digitalbits-core time units to seconds, as per Prometheus best practices
            metric_name = metric_name + '_seconds'
            if 'sum' in metrics[k]:
                # use libmedida sum value
                total_duration = metrics[k]['sum']
            else:
                # compute sum value
                total_duration = (metrics[k]['mean'] * metrics[k]['count'])
            c = Counter(metric_name + '_count',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names,
                        registry=self.registry)
            c.labels(*self.labels).inc(metrics[k]['count'])
            s = Counter(metric_name + '_sum',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names,
                        registry=self.registry)
            s.labels(*self.labels).inc(
                self.duration_to_seconds(total_duration,
                                         metrics[k]['duration_unit']))
            # add digitalbits-core calculated quantiles to our summary
            summary = Gauge(metric_name,
                            'libmedida metric type: ' + metrics[k]['type'],
                            self.label_names + ['quantile'],
                            registry=self.registry)
            summary.labels(*self.labels + ['0.75']).set(
                self.duration_to_seconds(metrics[k]['75%'],
                                         metrics[k]['duration_unit']))
            summary.labels(*self.labels + ['0.99']).set(
                self.duration_to_seconds(metrics[k]['99%'],
                                         metrics[k]['duration_unit']))
        elif metrics[k]['type'] == 'histogram':
            if 'count' not in metrics[k]:
                # DigitalBits-core version too old, we don't have required data
                continue
            c = Counter(metric_name + '_count',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names,
                        registry=self.registry)
            c.labels(*self.labels).inc(metrics[k]['count'])
            s = Counter(metric_name + '_sum',
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names,
                        registry=self.registry)
            s.labels(*self.labels).inc(metrics[k]['sum'])
            # add digitalbits-core calculated quantiles to our summary
            summary = Gauge(metric_name,
                            'libmedida metric type: ' + metrics[k]['type'],
                            self.label_names + ['quantile'],
                            registry=self.registry)
            summary.labels(*self.labels + ['0.75']).set(metrics[k]['75%'])
            summary.labels(*self.labels + ['0.99']).set(metrics[k]['99%'])
        elif metrics[k]['type'] == 'counter':
            # we have a counter, this is a Prometheus Gauge
            g = Gauge(metric_name,
                      'libmedida metric type: ' + metrics[k]['type'],
                      self.label_names,
                      registry=self.registry)
            g.labels(*self.labels).set(metrics[k]['count'])
        elif metrics[k]['type'] == 'meter':
            # we have a meter, this is a Prometheus Counter
            c = Counter(metric_name,
                        'libmedida metric type: ' + metrics[k]['type'],
                        self.label_names,
                        registry=self.registry)
            c.labels(*self.labels).inc(metrics[k]['count'])
        elif metrics[k]['type'] == 'buckets':
            # We have a bucket, this is a Prometheus Histogram
            self.buckets_to_metrics(metric_name, metrics[k])

    #######################################
    # Export metrics from the info endpoint
    #######################################
    try:
        response = requests.get(self.info_url)
    except requests.ConnectionError:
        self.error(504,
                   'Error retrieving data from {}'.format(self.info_url))
        return
    if not response.ok:
        self.error(504,
                   'Error retrieving data from {}'.format(self.info_url))
        return
    try:
        info = response.json()['info']
    except ValueError:
        self.error(500, 'Error parsing info JSON data')
        return
    if not all([i in info for i in self.info_keys]):
        self.error(
            500, 'Error - info endpoint did not return all required fields')
        return

    # Ledger metrics
    for core_name, prom_name in self.ledger_metrics.items():
        g = Gauge(
            'digitalbits_core_ledger_{}'.format(prom_name),
            'DigitalBits core ledger metric name: {}'.format(core_name),
            self.label_names,
            registry=self.registry)
        g.labels(*self.labels).set(info['ledger'][core_name])

    # Version 11.2.0 and later report quorum metrics in the following format:
    # "quorum" : {
    #   "qset" : {
    #     "agree": 3
    #
    # Older versions use this format:
    # "quorum" : {
    #   "758110" : {
    #     "agree" : 3,
    if 'qset' in info['quorum']:
        tmp = info['quorum']['qset']
    else:
        # fix: dict views are not subscriptable on Python 3 — take the
        # single quorum-set entry via list()
        tmp = list(info['quorum'].values())[0]
    if not tmp:
        self.error(500, 'Error - missing quorum data')
        return
    for metric in self.quorum_metrics:
        g = Gauge('digitalbits_core_quorum_{}'.format(metric),
                  'DigitalBits core quorum metric: {}'.format(metric),
                  self.label_names,
                  registry=self.registry)
        g.labels(*self.labels).set(tmp[metric])
    for metric in self.quorum_phase_metrics:
        # one boolean-style gauge per possible phase
        g = Gauge('digitalbits_core_quorum_phase_{}'.format(metric),
                  'DigitalBits core quorum phase {}'.format(metric),
                  self.label_names,
                  registry=self.registry)
        if tmp['phase'].lower() == metric:
            g.labels(*self.labels).set(1)
        else:
            g.labels(*self.labels).set(0)

    # Versions >=11.2.0 expose more info about quorum
    if 'transitive' in info['quorum']:
        g = Gauge('digitalbits_core_quorum_transitive_intersection',
                  'DigitalBits core quorum transitive intersection',
                  self.label_names,
                  registry=self.registry)
        if info['quorum']['transitive']['intersection']:
            g.labels(*self.labels).set(1)
        else:
            g.labels(*self.labels).set(0)
        g = Gauge('digitalbits_core_quorum_transitive_last_check_ledger',
                  'DigitalBits core quorum transitive last_check_ledger',
                  self.label_names,
                  registry=self.registry)
        g.labels(*self.labels).set(
            info['quorum']['transitive']['last_check_ledger'])
        g = Gauge('digitalbits_core_quorum_transitive_node_count',
                  'DigitalBits core quorum transitive node_count',
                  self.label_names,
                  registry=self.registry)
        g.labels(*self.labels).set(
            info['quorum']['transitive']['node_count'])
        # Versions >=11.3.0 expose "critical" key
        if 'critical' in info['quorum']['transitive']:
            g = Gauge('digitalbits_core_quorum_transitive_critical',
                      'DigitalBits core quorum transitive critical',
                      self.label_names + ['critical_validators'],
                      registry=self.registry)
            if info['quorum']['transitive']['critical']:
                for peer_list in info['quorum']['transitive']['critical']:
                    critical_peers = ','.join(
                        sorted(peer_list)
                    )  # label value is comma separated listof peers
                    l = self.labels + [critical_peers]
                    g.labels(*l).set(1)
            else:
                l = self.labels + ['null']
                g.labels(*l).set(0)

    # Peers metrics
    g = Gauge('digitalbits_core_peers_authenticated_count',
              'DigitalBits core authenticated_count count',
              self.label_names,
              registry=self.registry)
    g.labels(*self.labels).set(info['peers']['authenticated_count'])
    g = Gauge('digitalbits_core_peers_pending_count',
              'DigitalBits core pending_count count',
              self.label_names,
              registry=self.registry)
    g.labels(*self.labels).set(info['peers']['pending_count'])

    g = Gauge('digitalbits_core_protocol_version',
              'DigitalBits core protocol_version',
              self.label_names,
              registry=self.registry)
    g.labels(*self.labels).set(info['protocol_version'])

    for metric in self.state_metrics:
        name = re.sub(r'\s', '_', metric)
        g = Gauge('digitalbits_core_{}'.format(name),
                  'DigitalBits core state {}'.format(metric),
                  self.label_names,
                  registry=self.registry)
        if info['state'].lower().startswith(
                metric):  # Use startswith to work around "!"
            g.labels(*self.labels).set(1)
        else:
            g.labels(*self.labels).set(0)

    g = Gauge('digitalbits_core_started_on',
              'DigitalBits core start time in epoch',
              self.label_names,
              registry=self.registry)
    date = datetime.strptime(info['startedOn'], "%Y-%m-%dT%H:%M:%SZ")
    # NOTE(review): strftime('%s') is platform-specific and interprets the
    # naive datetime in local time although startedOn is UTC ("Z") — kept
    # for value compatibility; confirm before changing.
    g.labels(*self.labels).set(int(date.strftime('%s')))

    #######################################
    # Export cursor metrics
    #######################################
    try:
        response = requests.get(self.cursors_url)
    except requests.ConnectionError:
        self.error(
            504, 'Error retrieving data from {}'.format(self.cursors_url))
        return
    # Some server modes we want to scrape do not support 'getcursors' command at all.
    # These just respond with a 404 and the non-json informative unknown-commands output.
    if not response.ok and response.status_code != 404:
        self.error(
            504, 'Error retrieving data from {}'.format(self.cursors_url))
        return
    if "Supported HTTP commands" not in str(response.content):
        try:
            cursors = response.json()['cursors']
        except ValueError:
            self.error(500, 'Error parsing cursor JSON data')
            return
        g = Gauge('digitalbits_core_active_cursors',
                  'DigitalBits core active cursors',
                  self.label_names + ['cursor_name'],
                  registry=self.registry)
        for cursor in cursors:
            if not cursor:
                continue
            l = self.labels + [cursor.get('id').strip()]
            g.labels(*l).set(cursor['cursor'])

    #######################################
    # Render output
    #######################################
    output = generate_latest(self.registry)
    if not output:
        self.error(500, 'Error - no metrics were generated')
        return
    self.send_response(200)
    self.send_header('Content-Type', CONTENT_TYPE_LATEST)
    self.end_headers()
    self.wfile.write(output)
def do_GET(self):
    """Handle a scrape request.

    For every configured network/account pair, fetches the account from
    Horizon and exports balance, liability and sponsorship gauges into a
    fresh registry, then writes the rendered exposition text to the
    client. Configuration or upstream errors are reported via
    self.error() and abort the scrape.
    """
    self.registry = CollectorRegistry()
    label_names = ["network", "account_id", "account_name", "asset_type"]
    # Sponsorship counts are per-account, not per-asset: declare those two
    # gauges WITHOUT the asset_type label. (Declaring them with all four
    # labels while calling .labels() with three values raises ValueError.)
    account_label_names = label_names[:3]
    m_balance = Gauge("stellar_account_balance",
                      "Stellar core account balance",
                      label_names,
                      registry=self.registry)
    m_buying_liabilities = Gauge("stellar_account_buying_liabilities",
                                 "Stellar core account buying liabilities",
                                 label_names,
                                 registry=self.registry)
    m_selling_liabilities = Gauge(
        "stellar_account_selling_liabilities",
        "Stellar core account selling liabilities",
        label_names,
        registry=self.registry)
    m_num_sponsored = Gauge(
        "stellar_account_num_sponsored",
        "Stellar core account number of sponsored entries",
        account_label_names,
        registry=self.registry)
    m_num_sponsoring = Gauge(
        "stellar_account_num_sponsoring",
        "Stellar core account number of sponsoring entries",
        account_label_names,
        registry=self.registry)
    for network in config["networks"]:
        # validate network-level configuration
        if "accounts" not in network or "name" not in network or "horizon_url" not in network:
            self.error(
                500,
                'Error - invalid network configuration: {}'.format(network))
            return
        for account in network["accounts"]:
            # validate account-level configuration
            if "account_id" not in account or "account_name" not in account:
                self.error(
                    500,
                    'Error - invalid account configuration: {}'.format(
                        account))
                return
            url = network["horizon_url"] + "/accounts/" + account[
                "account_id"]
            try:
                r = requests.get(url)
            except requests.ConnectionError:
                self.error(504, 'Error retrieving data from {}'.format(url))
                return
            if not r.ok:
                self.error(504, 'Error retrieving data from {}'.format(url))
                return
            # parse the response body once instead of re-parsing it on
            # every field access
            data = r.json()
            if "balances" not in data:
                self.error(
                    500, "Error - no balances found for account {}".format(
                        account["account_id"]))
                return
            if "num_sponsored" not in data:
                self.error(
                    500,
                    "Error - no num_sponsored found for account {}".format(
                        account["account_id"]))
                return
            if "num_sponsoring" not in data:
                self.error(
                    500,
                    "Error - no num_sponsoring found for account {}".
                    format(account["account_id"]))
                return
            labels = [
                network["name"], account["account_id"],
                account["account_name"]
            ]
            m_num_sponsored.labels(*labels).set(data["num_sponsored"])
            m_num_sponsoring.labels(*labels).set(data["num_sponsoring"])
            for balance in data["balances"]:
                labels = [
                    network["name"], account["account_id"],
                    account["account_name"], balance["asset_type"]
                ]
                m_balance.labels(*labels).set(balance["balance"])
                m_buying_liabilities.labels(*labels).set(
                    balance["buying_liabilities"])
                m_selling_liabilities.labels(*labels).set(
                    balance["selling_liabilities"])
    output = generate_latest(self.registry)
    self.send_response(200)
    self.send_header('Content-Type', CONTENT_TYPE_LATEST)
    self.end_headers()
    self.wfile.write(output)
class Indexer(RaftNode):
    """Raft-replicated full-text index node.

    Manages a set of Whoosh indices (file- or RAM-backed), replicates
    mutating operations via the @replicated decorator, serves gRPC and
    HTTP APIs, and snapshots index files plus raft data into a zip file
    for log compaction / node recovery.
    """

    # NOTE(review): the mutable/stateful default arguments below
    # (SyncObjConf(), getLogger(), CollectorRegistry()) are evaluated once
    # at class definition time and shared across instances — confirm this
    # is intended before constructing more than one Indexer per process.
    def __init__(self,
                 host='localhost',
                 port=7070,
                 seed_addr=None,
                 conf=SyncObjConf(),
                 data_dir='/tmp/cockatrice/index',
                 grpc_port=5050,
                 grpc_max_workers=10,
                 http_port=8080,
                 logger=getLogger(),
                 http_logger=getLogger(),
                 metrics_registry=CollectorRegistry()):
        self.__host = host
        self.__port = port
        self.__seed_addr = seed_addr
        self.__conf = conf
        self.__data_dir = data_dir
        self.__grpc_port = grpc_port
        self.__grpc_max_workers = grpc_max_workers
        self.__http_port = http_port
        self.__logger = logger
        self.__http_logger = http_logger
        self.__metrics_registry = metrics_registry

        # metrics
        self.__metrics_core_documents = Gauge(
            '{0}_indexer_index_documents'.format(NAME),
            'The number of documents.', [
                'index_name',
            ],
            registry=self.__metrics_registry)
        self.__metrics_requests_total = Counter(
            '{0}_indexer_requests_total'.format(NAME),
            'The number of requests.', ['func'],
            registry=self.__metrics_registry)
        self.__metrics_requests_duration_seconds = Histogram(
            '{0}_indexer_requests_duration_seconds'.format(NAME),
            'The invocation duration in seconds.', ['func'],
            registry=self.__metrics_registry)

        # cluster addressing: discover peers through the seed node if given
        self.__self_addr = '{0}:{1}'.format(self.__host, self.__port)
        self.__peer_addrs = [] if self.__seed_addr is None else get_peers(
            bind_addr=self.__seed_addr, timeout=10)
        self.__other_addrs = [
            peer_addr for peer_addr in self.__peer_addrs
            if peer_addr != self.__self_addr
        ]
        # wire this node's snapshot hooks into the raft configuration
        self.__conf.serializer = self.__serialize
        self.__conf.deserializer = self.__deserialize
        self.__conf.validate()

        # per-index runtime state, all keyed by index name
        self.__indices = {}
        self.__index_configs = {}
        self.__writers = {}
        self.__auto_commit_timers = {}

        # guards index/writer/snapshot state across threads
        self.__lock = RLock()

        # create data dir
        os.makedirs(self.__data_dir, exist_ok=True)
        self.__file_storage = FileStorage(self.__data_dir,
                                          supports_mmap=True,
                                          readonly=False,
                                          debug=False)
        self.__ram_storage = RamStorage()

        # if seed addr specified and self node does not exist in the cluster, add self node to the cluster
        if self.__seed_addr is not None and self.__self_addr not in self.__peer_addrs:
            Thread(target=add_node,
                   kwargs={
                       'node_name': self.__self_addr,
                       'bind_addr': self.__seed_addr,
                       'timeout': 10
                   }).start()

        # copy snapshot from the leader node
        if self.__seed_addr is not None:
            try:
                metadata = get_metadata(bind_addr=get_leader(
                    bind_addr=self.__seed_addr, timeout=10),
                                        timeout=10)
                response = requests.get('http://{0}/snapshot'.format(
                    metadata['http_addr']))
                if response.status_code == HTTPStatus.OK:
                    with open(self.__conf.fullDumpFile, 'wb') as f:
                        f.write(response.content)
            except Exception as ex:
                # best-effort: a missing snapshot only slows recovery
                self.__logger.error('failed to copy snapshot: {0}'.format(ex))

        # start node
        metadata = {
            'grpc_addr': '{0}:{1}'.format(self.__host, self.__grpc_port),
            'http_addr': '{0}:{1}'.format(self.__host, self.__http_port)
        }
        self.__logger.info('starting raft state machine')
        super(Indexer, self).__init__(self.__self_addr,
                                      self.__peer_addrs,
                                      conf=self.__conf,
                                      metadata=metadata)
        self.__logger.info('raft state machine has started')

        if os.path.exists(self.__conf.fullDumpFile):
            self.__logger.debug('snapshot exists: {0}'.format(
                self.__conf.fullDumpFile))
        else:
            pass

        # block until the raft cluster reports ready
        while not self.isReady():
            # recovering data
            self.__logger.debug('waiting for cluster ready')
            self.__logger.debug(self.getStatus())
            time.sleep(1)
        self.__logger.info('cluster ready')
        self.__logger.debug(self.getStatus())

        # open existing indices on startup
        for index_name in self.get_index_names():
            self.__open_index(index_name, index_config=None)

        # record index metrics timer
        # NOTE(review): threading.Timer is one-shot, so document-count
        # metrics are recorded only once, 10s after startup — confirm
        # whether a recurring timer was intended.
        self.metrics_timer = Timer(10, self.__record_index_metrics)
        self.metrics_timer.start()

        # start gRPC
        self.__grpc_server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=self.__grpc_max_workers))
        add_IndexServicer_to_server(
            IndexGRPCServicer(self,
                              logger=self.__logger,
                              metrics_registry=self.__metrics_registry),
            self.__grpc_server)
        self.__grpc_server.add_insecure_port('{0}:{1}'.format(
            self.__host, self.__grpc_port))
        self.__grpc_server.start()
        self.__logger.info('gRPC server has started')

        # start HTTP server
        self.__http_servicer = IndexHTTPServicer(self, self.__logger,
                                                 self.__http_logger,
                                                 self.__metrics_registry)
        self.__http_server = HTTPServer(self.__host, self.__http_port,
                                        self.__http_servicer)
        self.__http_server.start()
        self.__logger.info('HTTP server has started')

        self.__logger.info('indexer has started')

    def stop(self):
        """Shut down servers, timers and indices, then leave the cluster."""
        # stop HTTP server
        self.__http_server.stop()
        self.__logger.info('HTTP server has stopped')

        # stop gRPC server
        self.__grpc_server.stop(grace=0.0)
        self.__logger.info('gRPC server has stopped')

        self.metrics_timer.cancel()

        # close indices
        for index_name in list(self.__indices.keys()):
            self.__close_index(index_name)

        self.destroy()

        self.__logger.info('index core has stopped')

    def __record_index_metrics(self):
        """Set the per-index document-count gauge for every open index."""
        for index_name in list(self.__indices.keys()):
            try:
                self.__metrics_core_documents.labels(
                    index_name=index_name).set(self.get_doc_count(index_name))
            except Exception as ex:
                self.__logger.error(ex)

    def __record_metrics(self, start_time, func_name):
        """Record a request count and its duration for ``func_name``."""
        self.__metrics_requests_total.labels(func=func_name).inc()
        self.__metrics_requests_duration_seconds.labels(
            func=func_name).observe(time.time() - start_time)

    # def __serialize_indices(self, filename):
    #     with self.__lock:
    #         try:
    #             self.__logger.info('starting serialize indices')
    #
    #         except Exception as ex:
    #             self.__logger.error('failed to create snapshot: {0}'.format(ex))
    #         finally:
    #             self.__logger.info('serialize indices has finished')

    # def __serialize_raft_data(self, filename, raft_data):
    #     with self.__lock:
    #         pass

    # index serializer
    def __serialize(self, filename, raft_data):
        """Raft snapshot hook: zip all index files, configs and raft data.

        Each index is committed first so the on-disk/in-RAM files are
        consistent before being copied into the archive.
        """
        with self.__lock:
            try:
                self.__logger.debug('serializer has started')

                # store the index files and raft logs to the snapshot file
                with zipfile.ZipFile(filename, 'w',
                                     zipfile.ZIP_DEFLATED) as f:
                    for index_name in self.get_index_names():
                        self.__commit_index(index_name)

                        # with self.__get_writer(index_name).writelock:
                        # with self.__indices[index_name].lock('WRITELOCK'):

                        # index files
                        for index_filename in self.get_index_files(
                                index_name):
                            if self.__index_configs.get(
                                    index_name).get_storage_type() == "ram":
                                with self.__ram_storage.open_file(
                                        index_filename) as r:
                                    f.writestr(index_filename, r.read())
                            else:
                                f.write(
                                    os.path.join(self.__file_storage.folder,
                                                 index_filename),
                                    index_filename)
                            self.__logger.debug('{0} has stored in {1}'.format(
                                index_filename, filename))

                        # index config file
                        f.write(
                            os.path.join(
                                self.__file_storage.folder,
                                self.get_index_config_file(index_name)),
                            self.get_index_config_file(index_name))
                        self.__logger.debug('{0} has stored in {1}'.format(
                            self.get_index_config_file(index_name), filename))

                    # store the raft data
                    f.writestr(RAFT_DATA_FILE, pickle.dumps(raft_data))
                    # NOTE(review): "has restored" here looks like a
                    # copy-paste from the deserializer — "has stored" is
                    # presumably meant.
                    self.__logger.debug(
                        '{0} has restored'.format(RAFT_DATA_FILE))

                self.__logger.debug('snapshot has created')
            except Exception as ex:
                self.__logger.error(
                    'failed to create snapshot: {0}'.format(ex))
            finally:
                self.__logger.debug('serializer has stopped')

    # index deserializer
    def __deserialize(self, filename):
        """Raft snapshot hook: restore index files/configs from the zip.

        Returns the unpickled raft data on success; returns None (falling
        out of the except path) on failure.
        """
        with self.__lock:
            try:
                self.__logger.debug('deserializer has started')

                with zipfile.ZipFile(filename, 'r') as zf:
                    # get file names in snapshot file
                    filenames = list(zf.namelist())

                    # get index names in snapshot file
                    index_names = []
                    pattern_toc = re.compile(r'^_(.+)_\d+\.toc$')
                    for f in filenames:
                        match = pattern_toc.search(f)
                        if match and match.group(1) not in index_names:
                            index_names.append(match.group(1))

                    for index_name in index_names:
                        # extract the index config first
                        zf.extract(self.get_index_config_file(index_name),
                                   path=self.__file_storage.folder)
                        index_config = pickle.loads(
                            zf.read(self.get_index_config_file(index_name)))

                        # get index files
                        pattern_toc = re.compile(r'^_{0}_(\d+)\..+$'.format(
                            index_name))  # ex) _myindex_0.toc
                        pattern_seg = re.compile(
                            r'^{0}_([a-z0-9]+)\..+$'.format(index_name)
                        )  # ex) myindex_zseabukc2nbpvh0u.seg
                        pattern_lock = re.compile(r'^{0}_WRITELOCK$'.format(
                            index_name))  # ex) myindex_WRITELOCK
                        index_files = []
                        for file_name in filenames:
                            if re.match(pattern_toc, file_name):
                                index_files.append(file_name)
                            elif re.match(pattern_seg, file_name):
                                index_files.append(file_name)
                            elif re.match(pattern_lock, file_name):
                                index_files.append(file_name)

                        # extract the index files
                        for index_file in index_files:
                            if index_config.get_storage_type() == 'ram':
                                with self.__ram_storage.create_file(
                                        index_file) as r:
                                    r.write(zf.read(index_file))
                            else:
                                zf.extract(index_file,
                                           path=self.__file_storage.folder)
                            self.__logger.debug(
                                '{0} has restored from {1}'.format(
                                    index_file, filename))

                        self.__logger.debug(
                            '{0} has restored'.format(index_name))

                    # extract the raft data
                    raft_data = pickle.loads(zf.read(RAFT_DATA_FILE))
                    self.__logger.debug(
                        '{0} has restored'.format(RAFT_DATA_FILE))
                    return raft_data
            except Exception as ex:
                self.__logger.error(
                    'failed to restore indices: {0}'.format(ex))
            finally:
                self.__logger.debug('deserializer has stopped')

    def is_healthy(self):
        # delegate to the raft layer
        return self.isHealthy()

    def is_alive(self):
        return self.isAlive()

    def is_ready(self):
        return self.isReady()

    def get_addr(self):
        """Return this node's "host:port" raft address."""
        return self.__self_addr

    def get_index_files(self, index_name):
        """List storage file names belonging to ``index_name``.

        Matches Whoosh's toc/segment/writelock naming conventions in
        whichever storage (RAM or file) the index is configured to use.
        """
        index_files = []

        pattern_toc = re.compile(
            r'^_{0}_(\d+)\..+$'.format(index_name))  # ex) _myindex_0.toc
        pattern_seg = re.compile(r'^{0}_([a-z0-9]+)\..+$'.format(
            index_name))  # ex) myindex_zseabukc2nbpvh0u.seg
        pattern_lock = re.compile(
            r'^{0}_WRITELOCK$'.format(index_name))  # ex) myindex_WRITELOCK

        if self.__index_configs.get(index_name).get_storage_type() == "ram":
            storage = self.__ram_storage
        else:
            storage = self.__file_storage

        for file_name in list(storage.list()):
            if re.match(pattern_toc, file_name):
                index_files.append(file_name)
            elif re.match(pattern_seg, file_name):
                index_files.append(file_name)
            elif re.match(pattern_lock, file_name):
                index_files.append(file_name)

        return index_files

    @staticmethod
    def get_index_config_file(index_name):
        """Return the config file name stored alongside an index."""
        return '{0}_CONFIG'.format(index_name)

    def get_index_names(self):
        """Return names of all indices found in file or RAM storage."""
        index_names = []

        pattern_toc = re.compile(r'^_(.+)_\d+\.toc$')

        for filename in list(self.__file_storage.list()):
            match = pattern_toc.search(filename)
            if match and match.group(1) not in index_names:
                index_names.append(match.group(1))
        for filename in list(self.__ram_storage.list()):
            match = pattern_toc.search(filename)
            if match and match.group(1) not in index_names:
                index_names.append(match.group(1))

        return index_names

    def is_index_exist(self, index_name):
        return self.__file_storage.index_exists(
            indexname=index_name) or self.__ram_storage.index_exists(
                indexname=index_name)

    def is_index_open(self, index_name):
        return index_name in self.__indices

    @replicated
    def open_index(self, index_name, index_config=None):
        # replicated entry point; the real work is in __open_index
        return self.__open_index(index_name, index_config=index_config)

    def __open_index(self, index_name, index_config=None):
        """Open an index (and its writer); no-op if already open.

        When ``index_config`` is None the config pickled next to the index
        files is loaded instead. Returns the index or None on failure.
        """
        start_time = time.time()

        index = None
        try:
            # open the index
            index = self.__indices.get(index_name)
            if index is None:
                self.__logger.debug('opening {0}'.format(index_name))
                if index_config is None:
                    # set saved index config
                    with open(
                            os.path.join(
                                self.__file_storage.folder,
                                self.get_index_config_file(index_name)),
                            'rb') as f:
                        self.__index_configs[index_name] = pickle.loads(
                            f.read())
                else:
                    # set given index config
                    self.__index_configs[index_name] = index_config

                if self.__index_configs[index_name].get_storage_type(
                ) == 'ram':
                    index = self.__ram_storage.open_index(
                        indexname=index_name,
                        schema=self.__index_configs[index_name].get_schema())
                else:
                    index = self.__file_storage.open_index(
                        indexname=index_name,
                        schema=self.__index_configs[index_name].get_schema())
                self.__indices[index_name] = index

                self.__logger.info('{0} has opened'.format(index_name))

                # open the index writer
                self.__open_writer(index_name)
        except Exception as ex:
            self.__logger.error('failed to open {0}: {1}'.format(
                index_name, ex))
        finally:
            self.__record_metrics(start_time, 'open_index')

        return index

    @replicated
    def close_index(self, index_name):
        return self.__close_index(index_name)

    def __close_index(self, index_name):
        """Close an index's writer then the index itself.

        Returns the closed index object or None on failure.
        """
        start_time = time.time()

        index = None
        try:
            # close the index writer
            self.__close_writer(index_name)

            # close the index
            index = self.__indices.pop(index_name)
            if index is not None:
                self.__logger.debug('closing {0}'.format(index_name))
                index.close()
                self.__logger.info('{0} has closed'.format(index_name))
        except Exception as ex:
            self.__logger.error('failed to close {0}: {1}'.format(
                index_name, ex))
        finally:
            self.__record_metrics(start_time, 'close_index')

        return index

    @replicated
    def create_index(self, index_name, index_config):
        return self.__create_index(index_name, index_config)

    def __create_index(self, index_name, index_config):
        """Create a new index, or open the existing one with this config."""
        if self.is_index_exist(index_name):
            # open the index
            return self.__open_index(index_name, index_config=index_config)

        start_time = time.time()

        index = None
        with self.__lock:
            try:
                self.__logger.debug('creating {0}'.format(index_name))

                # set index config
                self.__index_configs[index_name] = index_config
                self.__logger.debug(
                    self.__index_configs[index_name].get_storage_type())

                # create the index
                if self.__index_configs[index_name].get_storage_type(
                ) == 'ram':
                    index = self.__ram_storage.create_index(
                        self.__index_configs[index_name].get_schema(),
                        indexname=index_name)
                else:
                    index = self.__file_storage.create_index(
                        self.__index_configs[index_name].get_schema(),
                        indexname=index_name)
                self.__indices[index_name] = index
                self.__logger.info('{0} has created'.format(index_name))

                # save the index config
                with open(
                        os.path.join(self.__file_storage.folder,
                                     self.get_index_config_file(index_name)),
                        'wb') as f:
                    f.write(pickle.dumps(index_config))

                # open the index writer
                self.__open_writer(index_name)
            except Exception as ex:
                self.__logger.error('failed to create {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'create_index')

        return index

    @replicated
    def delete_index(self, index_name):
        return self.__delete_index(index_name)

    def __delete_index(self, index_name):
        """Close an index and remove its files and persisted config."""
        # close index
        index = self.__close_index(index_name)

        start_time = time.time()

        with self.__lock:
            try:
                self.__logger.debug('deleting {0}'.format(index_name))

                # delete index files
                for filename in self.get_index_files(index_name):
                    self.__file_storage.delete_file(filename)
                    self.__logger.debug('{0} was deleted'.format(filename))

                self.__logger.info('{0} has deleted'.format(index_name))

                # delete the index config
                self.__index_configs.pop(index_name, None)
                os.remove(
                    os.path.join(self.__file_storage.folder,
                                 self.get_index_config_file(index_name)))
            except Exception as ex:
                self.__logger.error('failed to delete {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'delete_index')

        return index

    def get_index(self, index_name):
        return self.__get_index(index_name)

    def __get_index(self, index_name):
        """Look up an open index by name (None when not open)."""
        start_time = time.time()

        try:
            index = self.__indices.get(index_name)
        except Exception as ex:
            raise ex
        finally:
            self.__record_metrics(start_time, 'get_index')

        return index

    def __start_auto_commit_timer(self, index_name, period):
        # schedule the next auto-commit; __auto_commit_index reschedules
        # itself, making this effectively periodic
        timer = self.__auto_commit_timers.get(index_name, None)
        if timer is None:
            self.__auto_commit_timers[index_name] = threading.Timer(
                period,
                self.__auto_commit_index,
                kwargs={
                    'index_name': index_name,
                    'period': period
                })
            self.__auto_commit_timers[index_name].start()
            self.__logger.debug(
                'auto commit timer for {0} were started'.format(index_name))

    def __stop_auto_commit_timer(self, index_name):
        timer = self.__auto_commit_timers.pop(index_name, None)
        if timer is not None:
            timer.cancel()
            self.__logger.debug(
                'auto commit timer for {0} were stopped'.format(index_name))

    def __auto_commit_index(self, index_name, period):
        # stop, commit, reschedule — runs on the timer thread
        self.__stop_auto_commit_timer(index_name)
        self.__commit_index(index_name)
        self.__start_auto_commit_timer(index_name, period=period)

    def __open_writer(self, index_name):
        """Open (or reuse) the writer for an index and start auto-commit."""
        writer = None
        try:
            writer = self.__writers.get(index_name, None)
            if writer is None or writer.is_closed:
                self.__logger.debug(
                    'opening writer for {0}'.format(index_name))
                writer = self.__indices.get(index_name).writer()
                self.__writers[index_name] = writer
                self.__logger.debug(
                    'writer for {0} has opened'.format(index_name))
                self.__start_auto_commit_timer(
                    index_name,
                    period=self.__index_configs.get(
                        index_name).get_writer_auto_commit_period())
        except Exception as ex:
            self.__logger.error('failed to open writer for {0}: {1}'.format(
                index_name, ex))

        return writer

    def __close_writer(self, index_name):
        """Stop auto-commit and commit+close the index writer."""
        writer = None
        try:
            self.__stop_auto_commit_timer(index_name)

            # close the index
            writer = self.__writers.pop(index_name, None)
            if writer is not None:
                self.__logger.debug(
                    'closing writer for {0}'.format(index_name))
                writer.commit()
                self.__logger.debug(
                    'writer for {0} has closed'.format(index_name))
        except Exception as ex:
            self.__logger.error('failed to close writer for {0}: {1}'.format(
                index_name, ex))

        return writer

    def __get_writer(self, index_name):
        return self.__writers.get(index_name, None)

    def __get_searcher(self, index_name, weighting=None):
        # a fresh searcher per call; optional scoring model override
        try:
            if weighting is None:
                searcher = self.__indices.get(index_name).searcher()
            else:
                searcher = self.__indices.get(index_name).searcher(
                    weighting=weighting)
        except Exception as ex:
            raise ex

        return searcher

    @replicated
    def commit_index(self, index_name):
        return self.__commit_index(index_name)

    def __commit_index(self, index_name):
        """Commit pending writes; returns True on success."""
        start_time = time.time()

        success = False
        with self.__lock:
            try:
                self.__logger.debug('committing {0}'.format(index_name))

                self.__get_writer(index_name).commit()
                self.__open_writer(index_name)  # reopen writer

                self.__logger.info('{0} has committed'.format(index_name))
                success = True
            except Exception as ex:
                self.__logger.error('failed to commit index {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'commit_index')

        return success

    @replicated
    def rollback_index(self, index_name):
        return self.__rollback_index(index_name)

    def __rollback_index(self, index_name):
        """Discard pending writes; returns True on success."""
        start_time = time.time()

        success = False
        with self.__lock:
            try:
                self.__logger.debug('rolling back {0}'.format(index_name))

                self.__get_writer(index_name).cancel()
                self.__open_writer(index_name)  # reopen writer

                self.__logger.info('{0} has rolled back'.format(index_name))
                success = True
            except Exception as ex:
                self.__logger.error('failed to rollback index {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'rollback_index')

        return success

    @replicated
    def optimize_index(self, index_name):
        return self.__optimize_index(index_name)

    def __optimize_index(self, index_name):
        """Commit with optimize=True (segment merge); True on success."""
        start_time = time.time()

        success = False
        with self.__lock:
            try:
                self.__logger.debug('optimizing {0}'.format(index_name))

                self.__get_writer(index_name).commit(optimize=True,
                                                     merge=False)
                self.__open_writer(index_name)  # reopen writer

                self.__logger.info('{0} has optimized'.format(index_name))
                success = True
            except Exception as ex:
                self.__logger.error('failed to optimize {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'optimize_index')

        return success

    def get_doc_count(self, index_name):
        try:
            cnt = self.__indices.get(index_name).doc_count()
        except Exception as ex:
            raise ex

        return cnt

    def get_schema(self, index_name):
        try:
            schema = self.__indices.get(index_name).schema
        except Exception as ex:
            raise ex

        return schema

    @replicated
    def put_document(self, index_name, doc_id, fields):
        return self.__put_document(index_name, doc_id, fields)

    def __put_document(self, index_name, doc_id, fields):
        # inject the configured doc-id field into a copy of the fields,
        # then reuse the bulk path
        doc = copy.deepcopy(fields)
        doc[self.__index_configs.get(index_name).get_doc_id_field()] = doc_id

        return self.__put_documents(index_name, [doc])

    @replicated
    def put_documents(self, index_name, docs):
        return self.__put_documents(index_name, docs)

    def __put_documents(self, index_name, docs):
        """Upsert documents; returns the number written, or -1 on error."""
        start_time = time.time()

        with self.__lock:
            try:
                self.__logger.debug(
                    'putting documents to {0}'.format(index_name))

                # count = self.__get_writer(index_name).update_documents(docs)
                count = 0
                for doc in docs:
                    self.__get_writer(index_name).update_document(**doc)
                    count += 1

                self.__logger.info('{0} documents has put to {1}'.format(
                    count, index_name))
            except Exception as ex:
                self.__logger.error(
                    'failed to put documents to {0}: {1}'.format(
                        index_name, ex))
                count = -1
            finally:
                self.__record_metrics(start_time, 'put_documents')

        return count

    def get_document(self, index_name, doc_id):
        """Fetch one document by id via a one-result search page."""
        try:
            results_page = self.search_documents(
                index_name,
                doc_id,
                self.__index_configs.get(index_name).get_doc_id_field(),
                1,
                page_len=1)
            if results_page.total > 0:
                self.__logger.debug('{0} was got from {1}'.format(
                    doc_id, index_name))
            else:
                self.__logger.debug('{0} did not exist in {1}'.format(
                    doc_id, index_name))
        except Exception as ex:
            raise ex

        return results_page

    @replicated
    def delete_document(self, index_name, doc_id):
        return self.__delete_document(index_name, doc_id)

    def __delete_document(self, index_name, doc_id):
        return self.__delete_documents(index_name, [doc_id])

    @replicated
    def delete_documents(self, index_name, doc_ids):
        return self.__delete_documents(index_name, doc_ids)

    def __delete_documents(self, index_name, doc_ids):
        """Delete documents by id; returns the count deleted, -1 on error."""
        start_time = time.time()

        with self.__lock:
            try:
                self.__logger.debug(
                    'deleting documents from {0}'.format(index_name))

                # count = self.__get_writer(index_name).delete_documents(doc_ids, doc_id_field=self.__index_configs.get(
                #     index_name).get_doc_id_field())
                count = 0
                for doc_id in doc_ids:
                    count += self.__get_writer(index_name).delete_by_term(
                        self.__index_configs.get(
                            index_name).get_doc_id_field(), doc_id)

                self.__logger.info('{0} documents has deleted from {1}'.format(
                    count, index_name))
            except Exception as ex:
                self.__logger.error(
                    'failed to delete documents in bulk to {0}: {1}'.format(
                        index_name, ex))
                count = -1
            finally:
                self.__record_metrics(start_time, 'delete_documents')

        return count

    def search_documents(self,
                         index_name,
                         query,
                         search_field,
                         page_num,
                         page_len=10,
                         weighting=None,
                         **kwargs):
        """Parse ``query`` against ``search_field`` and return one page.

        Extra keyword args are forwarded to Whoosh's search_page.
        Raises whatever the parser/searcher raises.
        """
        start_time = time.time()

        try:
            searcher = self.__get_searcher(index_name, weighting=weighting)
            query_parser = QueryParser(search_field,
                                       self.get_schema(index_name))
            query_obj = query_parser.parse(query)
            results_page = searcher.search_page(query_obj,
                                                page_num,
                                                pagelen=page_len,
                                                **kwargs)
            self.__logger.info('{0} documents ware searched from {1}'.format(
                results_page.total, index_name))
        except Exception as ex:
            raise ex
        finally:
            self.__record_metrics(start_time, 'search_documents')

        return results_page

    @replicated
    def create_snapshot(self):
        self.__create_snapshot()

    def __create_snapshot(self):
        # triggers the raft log compaction, which calls __serialize
        self.forceLogCompaction()

    def get_snapshot_file_name(self):
        return self.__conf.fullDumpFile

    def is_snapshot_exist(self):
        return os.path.exists(self.get_snapshot_file_name())

    def open_snapshot_file(self):
        """Open the snapshot file for binary reading; caller must close."""
        with self.__lock:
            try:
                file = open(self.get_snapshot_file_name(), mode='rb')
            except Exception as ex:
                raise ex

        return file