Example #1

import argparse
import itertools
import logging
import logging.handlers
import os
import pprint
import sys
import threading
import time
import traceback

import yaml

# Project-local modules this example relies on; the package layout below is an
# assumption for illustration, so adjust the imports to your project's layout.
from metric_collector import (collector, f5_rest_collector, host_manager,
                              netconf_collector, parser_manager, scheduler,
                              utils)
from metric_collector.collector import global_measurement_prefix

# select_hosts is assumed to be defined elsewhere in this module.

logger = logging.getLogger(__name__)

def main():

    time_start = time.time()

    ### ------------------------------------------------------------------------------
    ### Create and Parse Arguments
    ### -----------------------------------------------------------------------------
    # if getattr(sys, 'frozen', False):
    #     # frozen
    #     BASE_DIR = os.path.dirname(sys.executable)
    # else:
    #     # unfrozen
    #     BASE_DIR = os.path.dirname(os.path.realpath(__file__))

    BASE_DIR = os.getcwd()

    full_parser = argparse.ArgumentParser()
    full_parser.add_argument(
        "--tag",
        nargs='+',
        help="Collect data from hosts that matches the tag")
    full_parser.add_argument(
        "--cmd-tag",
        nargs='+',
        help="Collect data from command that matches the tag")

    full_parser.add_argument("-s",
                             "--start",
                             action='store_true',
                             help="Start collecting (default 'no')")

    full_parser.add_argument("--loglvl",
                             default=20,
                             help="Logs verbosity, 10-debug, 50 Critical")

    full_parser.add_argument("--logdir",
                             default="",
                             help="Directory where to store logs")

    full_parser.add_argument(
        "--sharding",
        help="If this script is one shard of a set, specify its position in "
        "the set and the set size, e.g. 0/3")
    full_parser.add_argument(
        "--sharding-offset",
        default=True,
        help="Define an offset to be applied to the shard_id")

    full_parser.add_argument("--parserdir",
                             default="parsers",
                             help="Directory where to find parsers")
    full_parser.add_argument(
        "--collector-timeout",
        type=int,
        default=15,
        help="Timeout (seconds) for collector device RPC/REST calls")
    full_parser.add_argument("--retry", default=5, help="Max retry")

    full_parser.add_argument("--host", default=None, help="Host DNS or IP")
    full_parser.add_argument("--hosts",
                             default="hosts.yaml",
                             help="Hosts file in yaml")
    full_parser.add_argument("--commands",
                             default="commands.yaml",
                             help="Commands file in Yaml")
    full_parser.add_argument("--credentials",
                             default="credentials.yaml",
                             help="Credentials file in Yaml")

    full_parser.add_argument(
        "--no-facts",
        action='store_false',
        help="Disable facts collection on device (removes version and "
        "product name from results)")
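    # With action='store_false', the parsed key 'no_facts' defaults to True
    # and becomes False when --no-facts is passed; it is consumed below as
    # the collector's collect_facts flag.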

    full_parser.add_argument("--output-format",
                             default="influxdb",
                             help="Format of the output")
    full_parser.add_argument("--output-type",
                             default="stdout",
                             choices=['stdout', 'http'],
                             help="Type of output")
    full_parser.add_argument("--output-addr",
                             default="http://localhost:8186/write",
                             help="Addr information for output action")

    full_parser.add_argument(
        "--no-collector-threads",
        action='store_true',
        help="Don't spawn multiple threads to collect the information from "
        "the devices")
    full_parser.add_argument(
        "--nbr-collector-threads",
        type=int,
        default=10,
        help="Maximum number of collector threads to spawn (default 10)")
    full_parser.add_argument(
        "--max-worker-threads",
        type=int,
        default=1,
        help="Maximum number of worker threads per interval for scheduler")
    full_parser.add_argument("--use-scheduler",
                             action='store_true',
                             help="Use scheduler")
    full_parser.add_argument(
        "--hosts-refresh-interval",
        type=int,
        default=3 * 60 * 60,
        help="Interval to periodically refresh dynamic host inventory")
    full_parser.add_argument("--allow-zero-hosts",
                             action='store_true',
                             help="Allow scheduler to run even with 0 hosts")

    dynamic_args = vars(full_parser.parse_args())
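    # argparse normalizes dashes to underscores, so e.g. --cmd-tag is read
    # back as dynamic_args['cmd_tag'] and --no-collector-threads as
    # dynamic_args['no_collector_threads'].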

    # Print help if no parameters are provided
    if len(sys.argv) == 1:
        full_parser.print_help()
        sys.exit(1)

    ### ------------------------------------------------------------------------------
    # Loading YAML Default Variables
    ### ------------------------------------------------------------------------------
    max_connection_retries = dynamic_args['retry']
    logging_level = int(dynamic_args['loglvl'])

    ### ------------------------------------------------------------------------------
    ### Validate Arguments
    ### ------------------------------------------------------------------------------
    pp = pprint.PrettyPrinter(indent=4)

    tag_list = []
    ###  Known and fixed arguments
    if dynamic_args['tag']:
        tag_list = dynamic_args['tag']
    else:
        tag_list = [".*"]

    if not (dynamic_args['start']):
        print('Missing <start> option, so nothing to do')
        sys.exit(0)

    ### ------------------------------------------------------------------------------
    ### Logging
    ### ------------------------------------------------------------------------------
    formatter = logging.Formatter(
        '%(asctime)s %(name)s: %(levelname)s:  %(message)s')
    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    handlers = [sh]
    if dynamic_args['logdir']:
        log_dir = BASE_DIR + "/" + dynamic_args['logdir']
        ## Check that logs directory exist, create it if needed
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        filename = log_dir + "/" + 'metric_collector.log'
        # Rotate the log file at 10 MiB, keeping 5 backups
        fh = logging.handlers.RotatingFileHandler(filename,
                                                  maxBytes=10 * 1024 * 1024,
                                                  backupCount=5)
        fh.setFormatter(formatter)
        handlers.append(fh)

    logging.basicConfig(level=logging_level, handlers=handlers)
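    # basicConfig with an explicit handlers list (Python 3.3+) attaches the
    # stream handler and, when --logdir is set, the rotating file handler.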

    ### ------------------------------------------------------------------------------
    ### LOAD all credentials in a dict
    ### ------------------------------------------------------------------------------
    credentials = {}
    credentials_yaml_file = ''

    if os.path.isfile(dynamic_args['credentials']):
        credentials_yaml_file = dynamic_args['credentials']
    else:
        credentials_yaml_file = BASE_DIR + "/" + dynamic_args['credentials']

    logger.info('Importing credentials file: %s', credentials_yaml_file)
    try:
        with open(credentials_yaml_file) as f:
            credentials = yaml.full_load(f)
    except Exception as e:
        logger.error('Error importing credentials file: %s: %s',
                     credentials_yaml_file, str(e))
        sys.exit(1)

    ### ------------------------------------------------------------------------------
    ### LOAD all commands with their tags in a dict
    ### ------------------------------------------------------------------------------
    commands_yaml_file = ''
    commands = []

    if os.path.isfile(dynamic_args['commands']):
        commands_yaml_file = dynamic_args['commands']
    else:
        commands_yaml_file = BASE_DIR + "/" + dynamic_args['commands']

    logger.info('Importing commands file: %s', commands_yaml_file)
    with open(commands_yaml_file) as f:
        try:
            for document in yaml.load_all(f, yaml.FullLoader):
                commands.append(document)
        except Exception as e:
            logger.error('Error importing commands file: %s, %s',
                         commands_yaml_file, str(e))
            sys.exit(1)

    general_commands = commands[0]
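    # commands.yaml may contain multiple YAML documents; only the first
    # (general) document is used here.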

    use_threads = not (dynamic_args['no_collector_threads'])

    if dynamic_args['cmd_tag']:
        command_tags = dynamic_args['cmd_tag']
    else:
        command_tags = ['.*']

    sharding = dynamic_args.get('sharding')
    sharding_offset = dynamic_args.get('sharding_offset')
    max_worker_threads = dynamic_args.get('max_worker_threads', 1)
    max_collector_threads = dynamic_args.get('nbr_collector_threads')

    if dynamic_args.get('use_scheduler', False):
        device_scheduler = scheduler.Scheduler(
            credentials,
            general_commands,
            dynamic_args['parserdir'],
            dynamic_args['output_type'],
            dynamic_args['output_addr'],
            max_worker_threads=max_worker_threads,
            use_threads=use_threads,
            num_threads_per_worker=max_collector_threads,
            collector_timeout=dynamic_args['collector_timeout'])
        hri = dynamic_args.get('hosts_refresh_interval', 6 * 60 * 60)
        select_hosts(
            dynamic_args['hosts'],
            tag_list,
            sharding,
            sharding_offset,
            scheduler=device_scheduler,
            refresh_interval=float(hri),
            allow_zero_hosts=dynamic_args.get('allow_zero_hosts', False),
        )
        device_scheduler.start()  # blocking call
        return

    ### ------------------------------------------------------------------------------
    ### LOAD all parsers
    ### ------------------------------------------------------------------------------
    parsers_manager = parser_manager.ParserManager(
        parser_dirs=dynamic_args['parserdir'])
    hosts_conf = select_hosts(dynamic_args['hosts'], tag_list, sharding,
                              sharding_offset)
    hosts_manager = host_manager.HostManager(credentials=credentials,
                                             commands=general_commands)
    hosts_manager.update_hosts(hosts_conf)
    coll = collector.Collector(hosts_manager=hosts_manager,
                               parser_manager=parsers_manager,
                               output_type=dynamic_args['output_type'],
                               output_addr=dynamic_args['output_addr'],
                               collect_facts=dynamic_args.get(
                                   'no_facts', True),
                               timeout=dynamic_args['collector_timeout'])
    target_hosts = hosts_manager.get_target_hosts(tags=tag_list)

    if use_threads:
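        # Split target_hosts into at most max_collector_threads slices of
        # near-equal size; each slice gets its own collector thread.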
        target_hosts_lists = [
            target_hosts[x:x +
                         int(len(target_hosts) / max_collector_threads + 1)]
            for x in range(0, len(target_hosts),
                           int(len(target_hosts) / max_collector_threads + 1))
        ]

        jobs = []

        for (i, target_hosts_list) in enumerate(target_hosts_lists, 1):
            logger.info(
                'Collector Thread-%s scheduled with following hosts: %s', i,
                target_hosts_list)
            thread = threading.Thread(target=coll.collect,
                                      args=('global', ),
                                      kwargs={
                                          "hosts": target_hosts_list,
                                          "cmd_tags": command_tags
                                      })
            jobs.append(thread)

        # Start the threads
        for j in jobs:
            j.start()

        # Ensure all of the threads have finished
        for j in jobs:
            j.join()

    else:
        # Execute everything in the main thread
        coll.collect('global', hosts=target_hosts, cmd_tags=command_tags)

    ### -----------------------------------------------------
    ### Collect Global Statistics
    ### -----------------------------------------------------
    time_end = time.time()
    time_execution = time_end - time_start

    global_datapoint = [{
        'measurement': global_measurement_prefix + '_stats_agent',
        'tags': {},
        'fields': {
            'execution_time_sec': "%.4f" % time_execution,
            'nbr_devices': len(target_hosts)
        },
        'timestamp': time.time_ns(),
    }]

    if dynamic_args.get('sharding') is not None:
        global_datapoint[0]['tags']['sharding'] = dynamic_args['sharding']

    if use_threads:
        global_datapoint[0]['fields']['nbr_threads'] = dynamic_args[
            'nbr_collector_threads']

    ### Send results to the right output
    try:
        if dynamic_args['output_type'] == 'stdout':
            utils.print_format_influxdb(global_datapoint)
        elif dynamic_args['output_type'] == 'http':
            utils.post_format_influxdb(
                global_datapoint,
                dynamic_args['output_addr'],
            )
        else:
            logger.warning('Output format unknown: %s',
                           dynamic_args['output_type'])
    except Exception as ex:
        logger.warning("Hit error trying to post to influx: %s", str(ex))

Example #2

    def run(self):
        ''' Main run loop '''
        while True:
            if not self._run:
                return
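            # The lock presumably guards self.hostcmds against concurrent
            # updates (e.g. a host-inventory refresh) while a collection
            # pass is in flight.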
            self._lock.acquire()
            logger.info('{}: Starting collection for {} hosts'.format(
                self.name, len(self.hostcmds)))
            hosts = list(self.hostcmds.keys())
            time_start = time.time()
            if self.use_threads:
                target_hosts_lists = [
                    hosts[x:x +
                          int(len(hosts) / self.num_collector_threads + 1)]
                    for x in range(
                        0, len(hosts),
                        int(len(hosts) / self.num_collector_threads + 1))
                ]
                jobs = []
                for i, target_hosts_list in enumerate(target_hosts_lists, 1):
                    logger.info(
                        '{}: Collector Thread-{} scheduled with following hosts: {}'
                        .format(self.name, i, target_hosts_list))
                    hostcmds = {}
                    for host in target_hosts_list:
                        hostcmds[host] = self.hostcmds[host]
                    job = threading.Thread(target=self.collector.collect,
                                           args=(self.name, ),
                                           kwargs={"host_cmds": hostcmds})
                    job.start()
                    jobs.append(job)

                # Ensure all of the threads have finished
                for j in jobs:
                    j.join()

            else:
                # Execute everything in the main thread
                self.collector.collect(self.name, host_cmds=self.hostcmds)

            time_end = time.time()
            time_execution = time_end - time_start
            worker_datapoint = [{
                'measurement':
                collector.global_measurement_prefix + '_worker_stats',
                'tags': {
                    'worker_name': self.name
                },
                'fields': {
                    'execution_time_sec': "%.4f" % time_execution,
                    'nbr_devices': len(self.hostcmds),
                    'nbr_threads': self.num_collector_threads
                },
                'timestamp':
                time.time_ns(),
            }]
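
            # When running under HashiCorp Nomad, tag the datapoint with the
            # job name and allocation index from Nomad's environment variables.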
            if os.environ.get('NOMAD_JOB_NAME'):
                worker_datapoint[0]['tags']['nomad_job_name'] = os.environ[
                    'NOMAD_JOB_NAME']
            if os.environ.get('NOMAD_ALLOC_INDEX'):
                worker_datapoint[0]['tags']['nomad_alloc_index'] = os.environ[
                    'NOMAD_ALLOC_INDEX']

            ### Send results to the right output
            try:
                if self.output_type == 'stdout':
                    utils.print_format_influxdb(worker_datapoint)
                elif self.output_type == 'http':
                    utils.post_format_influxdb(worker_datapoint,
                                               self.output_addr)
                else:
                    logger.warning('{}: Output format unknown: {}'.format(
                        self.name, self.output_type))
            except Exception as ex:
                logger.exception("Hit exception trying to post to influx")

            logger.info('Worker {} took {} seconds to run'.format(
                self.name, time_execution))
            self._lock.release()
            # sleep until next interval
            time.sleep(self.interval)

Example #3

    def collect(self, worker_name, hosts=None, host_cmds=None, cmd_tags=None):
        if not hosts and not host_cmds:
            logger.error('Collector: Nothing to collect')
            return
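        # When called with bare hostnames, resolve each host's command list
        # from the host manager using the supplied command tags.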
        if hosts:
            host_cmds = {}
            tags = cmd_tags or ['.*']
            for host in hosts:
                cmds = self.hosts_manager.get_target_commands(host, tags=tags)
                target_cmds = []
                for c in cmds:
                    target_cmds += c['commands']
                host_cmds[host] = target_cmds

        for host, target_commands in host_cmds.items():
            values = []
            credential = self.hosts_manager.get_credentials(host)

            host_reachable = False

            logger.info('Collector starting for: %s', host)
            host_address = self.hosts_manager.get_address(host)
            host_context = self.hosts_manager.get_context(host)
            device_type = self.hosts_manager.get_device_type(host)

            if device_type == 'juniper':
                dev = netconf_collector.NetconfCollector(
                    host=host,
                    address=host_address,
                    credential=credential,
                    parsers=self.parser_manager,
                    context=host_context)
            elif device_type == 'f5':
                dev = f5_rest_collector.F5Collector(
                    host=host,
                    address=host_address,
                    credential=credential,
                    parsers=self.parser_manager,
                    context=host_context)
            else:
                logger.error('Unsupported device type %s for %s, skipping',
                             device_type, host)
                continue
            dev.connect()

            if dev.is_connected():
                dev.collect_facts()
                host_reachable = True

            else:
                logger.error('Unable to connect to %s, skipping', host)
                host_reachable = False

            time_execution = 0
            cmd_successful = 0
            cmd_error = 0

            if host_reachable:
                time_start = time.time()

                ### Execute commands on the device
                for command in target_commands:
                    try:
                        logger.info('[%s] Collecting > %s' % (host, command))
                        data = dev.collect(command)  # returns a generator
                        if data:
                            values.append(data)
                            cmd_successful += 1

                    except Exception as err:
                        cmd_error += 1
                        logger.error(
                            'An issue happened while collecting %s on %s > %s'
                            % (command, host, err))
                        logger.error(traceback.format_exc())

                ### Save collector statistics
                time_end = time.time()
                time_execution = time_end - time_start

            host_time_datapoint = [{
                'measurement':
                global_measurement_prefix + '_host_collector_stats',
                'tags': {
                    'device': dev.hostname,
                    'worker_name': worker_name
                },
                'fields': {
                    'execution_time_sec': "%.4f" % time_execution,
                    'nbr_commands': cmd_successful + cmd_error,
                    'nbr_successful_commands': cmd_successful,
                    'nbr_error_commands': cmd_error,
                    'reachable': int(host_reachable),
                    'unreachable': int(not host_reachable)
                },
                'timestamp': time.time_ns(),
            }]

            host_time_datapoint[0]['tags'].update(dev.context)

            if os.environ.get('NOMAD_JOB_NAME'):
                host_time_datapoint[0]['tags']['nomad_job_name'] = os.environ[
                    'NOMAD_JOB_NAME']
            if os.environ.get('NOMAD_ALLOC_INDEX'):
                host_time_datapoint[0]['tags'][
                    'nomad_alloc_index'] = os.environ['NOMAD_ALLOC_INDEX']

            # Append the stats datapoint, then flatten the per-command
            # generators into a single iterable of points.
            values.append(host_time_datapoint)
            values = itertools.chain(*values)

            ### Send results to the right output
            try:
                if self.output_type == 'stdout':
                    utils.print_format_influxdb(values)
                elif self.output_type == 'http':
                    utils.post_format_influxdb(values, self.output_addr)
                else:
                    logger.warning(
                        'Collector: Output format unknown: {}'.format(
                            self.output_type))
            except Exception as ex:
                logger.exception("Hit exception trying to post to influx")

            if host_reachable:
                dev.close()
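

# The host-slicing expression above appears in both run() and collect(); a
# minimal standalone sketch of the same logic (the helper name chunk_hosts is
# illustrative, not part of the project's API):
def chunk_hosts(hosts, num_threads):
    """Split hosts into at most num_threads slices of near-equal size."""
    size = int(len(hosts) / num_threads + 1)
    return [hosts[x:x + size] for x in range(0, len(hosts), size)]

# chunk_hosts(['h1', 'h2', 'h3', 'h4', 'h5'], 2)
# -> [['h1', 'h2', 'h3'], ['h4', 'h5']]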