Example #1
def analyze_hbase_region_server_metrics(metric_task, metrics):
  region_server_name = None
  region_operation_metrics_dict = {}
  replication_metrics_dict = {}
  for bean in metrics['beans']:
    try:
      # because the root and meta regions always have the same names, we must use
      # the region server name together with the region name to locate a region
      if bean['name'] == REGION_SERVER_BEAN_NAME:
        region_server_name = bean['ServerName']
      elif bean['name'] == REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME:
        for metricName in bean.keys():
          if Region.is_region_operation_metric_name(metricName):
            encodeName = Region.get_encode_name_from_region_operation_metric_name(metricName)
            region_operation_metrics = region_operation_metrics_dict.setdefault(encodeName, {})
            region_operation_metrics[metricName] = bean[metricName]
      elif bean['name'].startswith(REGION_SERVER_REPLICATION_BEAN_NAME_PREFIX):
        peerId = metric_helper.parse_replication_source(bean['name'])
        replication_metrics = replication_metrics_dict.setdefault(peerId, {})
        for metricName in bean.keys():
          replication_metrics[metricName] = bean[metricName]
    except Exception as e:
      logger.warning("%r failed to analyze metrics: %r", metric_task, e)
      continue

  region_server = None
  if region_server_name is None:
    return
  else:
    try:
      region_server = RegionServer.objects.get(name = region_server_name)
    except RegionServer.DoesNotExist:
      logger.warning("%r failed to find region_server with region_server_name=%s",
        metric_task, region_server_name)
      return

  # save replication metrics for region server
  region_server.replication_last_attempt_time = metric_task.last_attempt_time
  region_server.replicationMetrics = json.dumps(replication_metrics_dict)
  region_server.save()

  region_record_need_save = []
  for encodeName, operationMetrics in region_operation_metrics_dict.iteritems():
    region_record = dbutil.get_region_by_regionserver_and_encodename(
      region_server, encodeName)
    # the region record is created when the master task is analyzed; skip regions not saved yet
    if region_record is None:
      continue
    region_record.analyze_from_region_server_operation_metrics(operationMetrics,
      metric_task.last_attempt_time)
    # buffer the regions that need updating, then do one batch update
    region_record_need_save.append(region_record)

  # we do batch update
  begin = datetime.datetime.now()
  dbutil.update_regions_for_region_server_metrics(region_record_need_save)
  logger.info("%r batch save region record for region_server, " \
    "saved regions=%d, consume=%s",
    metric_task, len(region_record_need_save),
    str((datetime.datetime.now() - begin).total_seconds()))
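
This parser only assumes that metrics is the decoded JSON from the region server's /jmx endpoint: a dict whose 'beans' list holds one dict per MBean, each carrying a 'name' plus its metric attributes. A minimal, hypothetical payload in that shape is sketched below; the bean names are placeholders, since the real values come from the project's REGION_SERVER_BEAN_NAME, REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME and REGION_SERVER_REPLICATION_BEAN_NAME_PREFIX constants.

metrics = {
  'beans': [
    # identifies the region server; matched against REGION_SERVER_BEAN_NAME
    {'name': 'hadoop:service=RegionServer,name=RegionServerInfo',
     'ServerName': 'rs-01.example.com,12200,1400000000000'},
    # per-region operation metrics; matched against REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME,
    # with the encoded region name recovered from each metric key by
    # Region.get_encode_name_from_region_operation_metric_name()
    {'name': 'hadoop:service=RegionServer,name=RegionServerDynamicStatistics',
     'tbl.user_table.region.1588230740ab.get_AvgTime': 1.3},
    # one bean per replication peer; matched by the replication prefix and
    # parsed by metric_helper.parse_replication_source()
    {'name': 'hadoop:service=Replication,name=ReplicationSource for 1',
     'ageOfLastShippedOp': 0},
  ],
}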
Example #2
File: collect.py Project: tomzhang/minos
  def analyze_hbase_region_server_metrics(self, metrics):
    region_server_name = None
    region_operation_metrics_dict = {}
    replication_metrics_dict = {}
    for bean in metrics['beans']:
      try:
        # because the root and meta regions always have the same names, we must use
        # the region server name together with the region name to locate a region
        if bean['name'] == REGION_SERVER_BEAN_NAME:
          region_server_name = bean['ServerName']
        elif bean['name'] == REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME:
          for metricName in bean.keys():
            if Region.is_region_operation_metric_name(metricName):
              encodeName = Region.get_encode_name_from_region_operation_metric_name(metricName)
              region_operation_metrics = region_operation_metrics_dict.setdefault(encodeName, {})
              region_operation_metrics[metricName] = bean[metricName]
        elif bean['name'].startswith(REGION_SERVER_REPLICATION_BEAN_NAME_PREFIX):
          peerId = metric_helper.parse_replication_source(bean['name'])
          replication_metrics = replication_metrics_dict.setdefault(peerId, {})
          for metricName in bean.keys():
            replication_metrics[metricName] = bean[metricName]
      except Exception as e:
        logger.warning("%r failed to analyze metrics: %r", self.task, e)
        continue

    region_server = None
    if region_server_name is None:
      return
    else:
      try:
        region_server = RegionServer.objects.get(name = region_server_name)
      except RegionServer.DoesNotExist:
        logger.warning("%r failed to find region_server with region_server_name=%s", self.task, region_server_name)
        return

    # save replication metrics for region server
    region_server.replication_last_attempt_time = self.task.last_attempt_time
    region_server.replicationMetrics = json.dumps(replication_metrics_dict)
    region_server.save()

    region_record_need_save = []
    for encodeName, operationMetrics in region_operation_metrics_dict.iteritems():
      region_record = dbutil.get_region_by_regionserver_and_encodename(region_server, encodeName)
      # the region record is created when the master task is analyzed; skip regions not saved yet
      if region_record is None:
        continue
      region_record.analyze_from_region_server_operation_metrics(operationMetrics,
                                                                 self.task.last_attempt_time)
      # buffer the regions that need updating, then do one batch update
      region_record_need_save.append(region_record)

    # we do batch update
    begin = datetime.datetime.now()
    dbutil.update_regions_for_region_server_metrics(region_record_need_save)
    logger.info("%r batch save region record for region_server, saved regions=%d, consume=%s",
        self.task, len(region_record_need_save), str((datetime.datetime.now() - begin).total_seconds()))
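
This is the same function as example #1, written as a method that reads its task from self.task. Both rely on the same buffer-then-batch-save pattern: modified region records are only collected in region_record_need_save, and a single dbutil.update_regions_for_region_server_metrics() call persists them all. A minimal sketch of what such a batch helper could look like with Django is shown below; it simply saves every buffered row inside one transaction, which is an assumption for illustration rather than the project's actual implementation.

from django.db import transaction

def update_regions_for_region_server_metrics(region_records):
  # Hypothetical batch helper: persist all buffered Region rows inside a
  # single transaction instead of one autocommitted UPDATE per region.
  with transaction.atomic():
    for region_record in region_records:
      region_record.save()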
Example #3
  def analyze_hbase_master_metrics(self, metrics):
    cluster = self.task.job.cluster
    hbase_cluster_record, created = HBaseCluster.objects.get_or_create(cluster = cluster)
    self.reset_aggregated_metrics(hbase_cluster_record)
    tables = {}
    region_record_need_save = []
    for bean in metrics['beans']:
      try:
        if 'RegionServers' not in bean:
          continue
        for rs_metrics in bean['RegionServers']:
          rs_name = rs_metrics['key']
          [rs_hostname, rs_port] = self.get_host_and_port_from_region_server_name(rs_name)
          rs_task = dbutil.get_task_by_host_and_port(rs_hostname, rs_port)
          rs_record, created = RegionServer.objects.get_or_create(cluster = cluster,
                                                                  task = rs_task)
          # the region server name includes startTime, so the same region server
          # would produce a different RegionServer record after every restart.
          # Therefore, we don't key the RegionServer record on its name.
          rs_record.name = rs_name

          rs_value = rs_metrics['value']
          rs_record.last_attempt_time = self.task.last_attempt_time
          rs_record.load = int(rs_value['load'])
          rs_record.numberOfRegions = int(rs_value['numberOfRegions'])
          self.reset_aggregated_metrics(rs_record)

          # read out all regions belonging to this region server and build a map
          all_regions_in_rs = Region.objects.filter(region_server = rs_record)
          all_regions_map = {}
          for region in all_regions_in_rs:
            all_regions_map[region.name] = region

          regionsLoad = rs_value['regionsLoad']
          for region_metrics in regionsLoad:
            region_value = region_metrics['value']
            region_name = region_value['nameAsString']
            table_name, startkey, region_id = region_name.split(',')
            region_metrics = {}

            if table_name not in tables:
              table_record, created = Table.objects.get_or_create(cluster = cluster,
                                                                  name = table_name)
              self.reset_aggregated_metrics(table_record)
              tables[table_name] = table_record

            table_record = tables[table_name]

            region_record = None
            if region_name in all_regions_map:
              region_record = all_regions_map[region_name]
            else:
              # if the region record is not in the buffer, get_or_create it from the db
              begin = datetime.datetime.now()
              region_record, created = Region.objects.get_or_create(table = table_record,
                                                                    name = region_name,
                                                                    encodeName = Region.get_encode_name(region_name),
                                                                    defaults={"region_server":rs_record})
              logger.info("%r get_or_create region in region_server from mysql, consume=%s, region_name=%s, buffered_rs=%s, get_rs=%s",
                self.task, str((datetime.datetime.now() - begin).total_seconds()), region_name, rs_record.name, region_record.region_server.name)


            region_record.region_server = rs_record
            region_record.analyze_region_record(region_value, self.task.last_attempt_time)
            # buffer the regions that need updating for the batch update
            region_record_need_save.append(region_record)
            self.aggregate_metrics(region_record, rs_record)
            self.aggregate_metrics(region_record, table_record)
            self.aggregate_metrics(region_record, hbase_cluster_record)

          rs_record.save()

        for table_record in tables.itervalues():
          table_record.last_attempt_time = self.task.last_attempt_time
          table_record.availability = dbutil.getTableAvailability(table_record.cluster.name, table_record.name)
          table_record.save()

        hbase_cluster_record.save()

        # do batch update
        begin = datetime.datetime.now()
        dbutil.update_regions_for_master_metrics(region_record_need_save)
        logger.info("%r batch save region record for master, saved regions=%d, consume=%s", self.task,
            len(region_record_need_save), str((datetime.datetime.now() - begin).total_seconds()))
      except Exception as e:
        traceback.print_exc()
        logger.warning("%r failed to analyze metrics: %r", self.task, e)
        continue
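
get_host_and_port_from_region_server_name() is not shown in this example, but an HBase region server name has the form "hostname,port,startTime", so a minimal sketch under that assumption looks like the following (the project's own helper may do more validation):

def get_host_and_port_from_region_server_name(rs_name):
  # "hostname,port,startTime" -> [hostname, port]; the trailing startTime is
  # exactly why the code above does not key RegionServer records on the name.
  hostname, port, _start_time = rs_name.split(',')
  return [hostname, port]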
Example #4
def analyze_hbase_master_metrics(metric_task, metrics):
    cluster = metric_task.job.cluster
    hbase_cluster_record, created = HBaseCluster.objects.get_or_create(
        cluster=cluster)
    reset_aggregated_metrics(hbase_cluster_record)
    tables = {}
    region_record_need_save = []
    for bean in metrics['beans']:
        try:
            if 'RegionServers' not in bean:
                continue
            for rs_metrics in bean['RegionServers']:
                rs_name = rs_metrics['key']
                [rs_hostname,
                 rs_port] = get_host_and_port_from_region_server_name(rs_name)
                rs_task = dbutil.get_task_by_host_and_port(
                    rs_hostname, rs_port)
                rs_record, created = RegionServer.objects.get_or_create(
                    cluster=cluster, task=rs_task)
                # the region server name includes startTime, so the same region server
                # would produce a different RegionServer record after every restart.
                # Therefore, we don't key the RegionServer record on its name.
                rs_record.name = rs_name

                rs_value = rs_metrics['value']
                rs_record.last_attempt_time = metric_task.last_attempt_time
                rs_record.load = int(rs_value['load'])
                rs_record.numberOfRegions = int(rs_value['numberOfRegions'])
                reset_aggregated_metrics(rs_record)

                # read out all regions belonging to this region server and build a map
                all_regions_in_rs = Region.objects.filter(
                    region_server=rs_record)
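                # note: the queryset above is never evaluated; it is replaced by the dbutil call below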
                all_regions_in_rs = dbutil.get_alive_regions_by_rs(rs_record)
                all_regions_map = {}
                logger.info("%r Finish get region: %d", metric_task,
                            len(all_regions_in_rs))
                for region in all_regions_in_rs:
                    all_regions_map[region.name] = region

                regionsLoad = rs_value['regionsLoad']
                for region_metrics in regionsLoad:
                    region_value = region_metrics['value']
                    region_name = region_value['nameAsString']
                    try:
                        table_name = region_name.split(',')[0]
                    except Exception as e:
                        logger.warning("%r failed to get region name: %r, %s",
                                       metric_task, e, region_name)
                        continue

                    region_metrics = {}

                    if table_name not in tables:
                        table_record, created = Table.objects.get_or_create(
                            cluster=cluster, name=table_name)
                        reset_aggregated_metrics(table_record)
                        tables[table_name] = table_record

                    table_record = tables[table_name]

                    region_record = None
                    if region_name in all_regions_map:
                        region_record = all_regions_map[region_name]
                    else:
                        # if the region record is not in the buffer, get_or_create it from the db
                        begin = datetime.datetime.now()
                        region_record, created = Region.objects.get_or_create(
                            table=table_record,
                            name=region_name,
                            encodeName=Region.get_encode_name(region_name),
                            defaults={"region_server": rs_record})
                        logger.info("%r get_or_create region in region_server from mysql, " \
                          "consume=%s, region_name=%s, buffered_rs=%s, get_rs=%s",
                          metric_task, str((datetime.datetime.now() - begin).total_seconds()),
                          region_name, rs_record.name, region_record.region_server.name)

                    logger.info("%r Finish analyze regionsLoad", metric_task)

                    region_record.region_server = rs_record
                    region_record.analyze_region_record(
                        region_value, metric_task.last_attempt_time)
                    # buffer the regions that need updating for the batch update
                    region_record_need_save.append(region_record)
                    aggregate_metrics(region_record, rs_record)
                    aggregate_metrics(region_record, table_record)
                    aggregate_metrics(region_record, hbase_cluster_record)

                rs_record.save()

            for table_record in tables.itervalues():
                table_record.last_attempt_time = metric_task.last_attempt_time
                table_record.availability = dbutil.getTableAvailability(
                    table_record.cluster.name, table_record.name)
                table_record.save()

            hbase_cluster_record.save()

            # do batch update
            begin = datetime.datetime.now()
            dbutil.update_regions_for_master_metrics(region_record_need_save)
            logger.info("%r batch save region record for master, " \
              "saved regions=%d, consume=%s",
              metric_task, len(region_record_need_save),
              str((datetime.datetime.now() - begin).total_seconds()))
        except Exception as e:
            traceback.print_exc()
            logger.warning("%r failed to analyze metrics: %r", metric_task, e)
            continue
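
Compared with example #3, this module-level variant takes only the table name from the region name (region_name.split(',')[0]), which tolerates commas inside region start keys, and it loads the live regions through dbutil.get_alive_regions_by_rs(). The roll-up itself is unchanged: each region's counters are added into its region server, its table and the cluster record via aggregate_metrics(), after reset_aggregated_metrics() has zeroed the parents. A minimal sketch of that pair is given below; the aggregated field names are assumed for illustration and are not taken from the project.

AGGREGATED_METRIC_FIELDS = ('readRequestsCountPerSec', 'writeRequestsCountPerSec')  # assumed names

def reset_aggregated_metrics(record):
    # zero the roll-up counters before a new round of aggregation
    for field in AGGREGATED_METRIC_FIELDS:
        setattr(record, field, 0)

def aggregate_metrics(region_record, parent_record):
    # add one region's counters into a parent record
    # (its region server, its table, or the cluster)
    for field in AGGREGATED_METRIC_FIELDS:
        setattr(parent_record, field,
                getattr(parent_record, field) + getattr(region_record, field, 0))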