-
Notifications
You must be signed in to change notification settings - Fork 1
/
hbase_collector.py
155 lines (126 loc) · 5.14 KB
/
hbase_collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import os
import signal
import subprocess
import time

import collectd
# Staging files where the shell dumps are written before being parsed.
HBASE_METRICS_TMP_FILE = "/tmp/hbase_metrics.tmp"
HBASE_SIZE_TMP_FILE = "/tmp/hbase_size.tmp"
HBASE_PORT = "60020" # HBASE Port
HBASE_DOMAIN_DNS = ".VPN.EXAMPLE.COM" # PARENT DOMAIN FOR HBASE NODES (example: hbase01.vpn.example.com)
# collectd reporting settings: plugin name prefix, host used for
# cluster-global metrics, dispatch interval (seconds) and value type.
collectd_plugin_name = "hbase"
collectd_hostname = "hbase01" # Where do you want to report the global metrics
collectd_interval = 30
collectd_type = "gauge"
def wait_for_collection():
    """Block until the HBase status dump has been fully written.

    Polls once per second until HBASE_METRICS_TMP_FILE exists and contains
    the word 'load', which marks a complete `status 'simple'` report.
    """
    while True:
        if os.path.isfile(HBASE_METRICS_TMP_FILE):
            # Context manager closes the handle on every poll iteration
            # (the original open(...).read() leaked one handle per loop).
            with open(HBASE_METRICS_TMP_FILE) as fp:
                if 'load' in fp.read():
                    return
        collectd.info("Sleeping...")
        time.sleep(1)
def hbase_status():
    """Parse the `status 'simple'` dump into a metrics dict.

    Returns a dict with:
      'alive' - live-server count as grep/awk emitted it (string, e.g. "2\n")
      'dead'  - dead-server count, same raw form
      'nodes' - {hostname: {stat_key: stat_value}} per region server
    """
    # Block until stats are written to file
    wait_for_collection()
    metrics = {}
    # The summary counts come straight from the shell dump.
    grep_alive_servers = "grep live "+HBASE_METRICS_TMP_FILE+" | awk {'print $1'}"
    grep_alive_cmd = subprocess.Popen(grep_alive_servers,shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
    grep_dead_servers = "grep dead "+HBASE_METRICS_TMP_FILE+" | awk {'print $1'}"
    grep_dead_cmd = subprocess.Popen(grep_dead_servers,shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
    # Everything between the "live" and "dead" summary lines is the
    # per-node section: one host line followed by a "k=v, k=v, ..." line.
    awk_nodes_info = "awk '/dead/{f=0} f; /live/{f=1}' " + HBASE_METRICS_TMP_FILE
    awk_nodes_info_cmd = subprocess.Popen(awk_nodes_info,shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
    nodes_info_output = awk_nodes_info_cmd.communicate()[0].split('\n')
    nodes_stats = {}
    for idx, cmd_line in enumerate(nodes_info_output):
        if HBASE_DOMAIN_DNS not in cmd_line:
            continue
        host = cmd_line.split(HBASE_DOMAIN_DNS+":"+HBASE_PORT)[0].strip()
        nodes_stats[host] = {}
        if idx + 1 >= len(nodes_info_output):
            # Host line with no following stats line; the original
            # raised IndexError here.
            continue
        for stat in nodes_info_output[idx + 1].split(", "):
            key_value = stat.split("=")
            if len(key_value) < 2:
                continue  # malformed "k=v" entry; skip instead of crashing
            nodes_stats[host][key_value[0].strip()] = key_value[1].strip()
    nodes_alive = grep_alive_cmd.communicate()[0]
    nodes_dead = grep_dead_cmd.communicate()[0]
    metrics["alive"] = nodes_alive
    metrics["dead"] = nodes_dead
    metrics["nodes"] = nodes_stats
    collectd.info("Alive Servers " + nodes_alive + " Dead Servers " + nodes_dead)
    return metrics
def hbase_disk_usage():
    """Return {table_name: size} parsed from the `hadoop fs -du /hbase/` dump.

    Size is kept as the raw string from the dump. Entries whose name starts
    with '.' or '-' (HBase internal/system entries) are skipped.
    """
    # Block until stats are written to file
    wait_for_collection()
    metrics = {}
    with open(HBASE_SIZE_TMP_FILE) as fp:
        for line in fp:
            fields = line.split()
            # Expect "<size> <path>"; skip blank/malformed lines
            # (the original raised IndexError on them).
            if len(fields) < 2:
                continue
            table_name = fields[1].replace("/hbase/", "")
            if table_name.startswith(('.', '-')):
                continue  # internal entry, not a user table
            metrics[table_name] = fields[0]
    return metrics
def remove_temp_files():
    """Best-effort removal of both tmp dump files.

    Each file is removed independently so a failure on the first (e.g. it
    was never created) no longer prevents removing the second, as happened
    with the original single try block.
    """
    for path in (HBASE_METRICS_TMP_FILE, HBASE_SIZE_TMP_FILE):
        try:
            os.remove(path)
        except OSError:
            pass
def write_tmp_hbase_stats():
    """Dump HBase disk usage and cluster status into the tmp files.

    Both shell pipelines redirect stdout into their tmp file and discard
    stderr. They are started concurrently and then waited on, so the files
    are complete when this returns. The original used a fixed 5s sleep,
    which raced on slow clusters and left zombie child processes.
    """
    hbase_size_cmd = "hadoop fs -du /hbase/ 1> "+HBASE_SIZE_TMP_FILE+" 2> /dev/null"
    size_proc = subprocess.Popen(hbase_size_cmd, shell=True)
    hbase_status_cmd = "echo \"status 'simple'\" | hbase shell 1> "+HBASE_METRICS_TMP_FILE+" 2> /dev/null"
    status_proc = subprocess.Popen(hbase_status_cmd, shell=True)
    # Reap both children; also guarantees the redirected output is flushed.
    size_proc.wait()
    status_proc.wait()
def restore_sigchld():
    """Restore default SIGCHLD handling (collectd init callback).

    Required so the subprocess module can reap its children correctly
    inside the collectd daemon.
    """
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
def _dispatch_metric(host, plugin, type_instance, value):
    """Build and dispatch a single collectd gauge value."""
    metric = collectd.Values()
    metric.plugin = plugin
    metric.host = host
    metric.interval = collectd_interval
    metric.type = collectd_type
    metric.type_instance = type_instance
    metric.values = [value]
    metric.dispatch()

def read_callback(data=None):
    """collectd read callback: refresh the tmp dumps and dispatch metrics.

    Dispatches per-table disk usage (host = collectd_hostname), the global
    alive/dead server counts (host = collectd_hostname), and per-node
    stats (host = the region server's own hostname).
    """
    collectd.info("Hbase collector starting...")
    write_tmp_hbase_stats()
    status_metrics = hbase_status()
    disk_usage_metrics = hbase_disk_usage()
    # Raw size strings are dispatched as-is, matching the original behavior.
    for table, size in disk_usage_metrics.items():
        _dispatch_metric(collectd_hostname, collectd_plugin_name + "_table-usage",
                         table, size)
    for key, value in status_metrics.items():
        if key != 'nodes':
            _dispatch_metric(collectd_hostname, collectd_plugin_name + "_status",
                             key, int(value))
        else:
            # Per-node stats are reported under each node's own hostname.
            for node, stats in value.items():
                for stat_key, stat_value in stats.items():
                    _dispatch_metric(node, collectd_plugin_name + "_nodestatus",
                                     stat_key, int(stat_value))
    collectd.info("HBase collection ended!")
# Init: restore default SIGCHLD handling at startup (presumably the collectd
# daemon's own SIGCHLD handling breaks subprocess reaping — confirm against
# the running collectd version), then register the periodic read callback.
collectd.register_init(restore_sigchld)
collectd.register_read(read_callback)