def create_cluster(self, cluster, filename, blueprint=''):
    # log.debug('create_cluster(%s, %s)' % (filename, name))
    validate_file(filename, 'cluster hosts mapping', nolog=True)
    try:
        _ = open(str(filename))
        file_data = _.read()
    except IOError as _:
        err = "failed to read Ambari cluster host mapping from file '%s': %s" % (filename, _)
        # log.critical(err)
        qquit('CRITICAL', err)
    log.info("creating cluster '%s' using file '%s'" % (cluster, filename))
    if not isJson(file_data):
        qquit('CRITICAL', "invalid json found in file '%s'" % filename)
    # don't have access to a blueprint name to enforce reset here
    # json_data = json.loads(file_data)
    # try:
    #     json_data['Blueprints']['blueprint_name'] = blueprint
    # except KeyError, e:
    #     qquit('CRITICAL', 'failed to (re)set blueprint name in cluster/hostmapping data before creating cluster')
    if blueprint:
        try:
            log.info("setting blueprint in cluster creation to '%s'" % blueprint)
            json_data = json.loads(file_data)
            json_data['blueprint'] = blueprint
            file_data = json.dumps(json_data)
        except KeyError as _:
            log.warn("failed to inject blueprint name '%s' in to cluster creation" % blueprint)
    response = self.send('clusters/%s' % cluster, file_data)
    log.info("Cluster creation submitted, see Ambari web UI to track progress")
    return response
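# Abridged sketch of the cluster host-mapping JSON create_cluster() expects
# (values illustrative - see the Ambari blueprints documentation for the full format):
#
#   {
#       "blueprint": "my-blueprint",
#       "default_password": "changeme",
#       "host_groups": [
#           {"name": "host_group_1", "hosts": [{"fqdn": "node1.example.com"}]}
#       ]
#   }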
def process_perfdata(self):
    perfdata_raw = None
    if '|' in self.message:
        self.message, perfdata_raw = self.message.split('|', 1)
    if perfdata_raw:
        log.debug("raw perfdata: %s", perfdata_raw)
        for item in perfdata_raw.split():
            if '=' in item:
                header, data = item.split('=', 1)
                data = data.split(';')[0]
                match = self.perfdata_regex.search(data)
                if match:
                    val = match.group(1)
                    log.debug("found numeric value '%s' in item '%s'", val, item)
                    if match.group(2):
                        units = match.group(2)
                        log.debug("found units '%s' in item '%s'", units, item)
                        header += " ({0})".format(units)
                    header = header.strip('"')
                    header = header.strip("'")
                    header = header.replace(self.separator, '_')
                    self.headers += [header.upper()]
                    self.perfdata += [val]
                else:
                    log.warn("no valid numeric value to extract found in perfdata item '%s'", item)
            else:
                log.warn("no key=value format detected in item '%s'", item)
    self.message = self.message.strip()
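# Standalone, runnable sketch of the same perfdata-splitting technique as
# process_perfdata() above; the regex here is an assumption, not necessarily
# the class's actual perfdata_regex.
import re

_PERFDATA_REGEX = re.compile(r'^(\d+(?:\.\d+)?)([A-Za-z%]*)$')

def split_perfdata(message):
    """Split a Nagios-style 'message|perfdata' line into (message, [(label, value, units)])."""
    perfdata = []
    if '|' in message:
        message, perfdata_raw = message.split('|', 1)
        for item in perfdata_raw.split():
            if '=' not in item:
                continue
            label, data = item.split('=', 1)
            # ';' separates the value from warn/crit/min/max thresholds
            match = _PERFDATA_REGEX.match(data.split(';')[0])
            if match:
                perfdata.append((label.strip('"\''), match.group(1), match.group(2)))
    return message.strip(), perfdata

# split_perfdata('DISK OK|/=2643MB;5948;5958;0;5968')
# -> ('DISK OK', [('/', '2643', 'MB')])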
def __parse_args__(self):
    try:
        (self.options, self.args) = self.__parser.parse_args()
    # I don't agree with zero exit code from OptionParser for help/usage,
    # and want UNKNOWN not CRITICAL(2) for switch mis-usage...
    except SystemExit:  # pragma: no cover
        sys.exit(ERRORS['UNKNOWN'])
    if self.options.help:  # pragma: no cover
        self.usage()
    if self.options.version:  # pragma: no cover
        print('%(version)s' % self.__dict__)
        sys.exit(ERRORS['UNKNOWN'])
    if 'timeout' in dir(self.options):
        self.timeout = self.get_opt('timeout')
    env_verbose = os.getenv('VERBOSE')
    if isInt(env_verbose):
        env_verbose = int(env_verbose)  # cast before comparing - os.getenv() returns a string
        if env_verbose > self.verbose:
            log.debug('environment variable $VERBOSE = %s, increasing verbosity', env_verbose)
            self.verbose = env_verbose
    elif env_verbose is None:
        pass
    else:
        log.warn("$VERBOSE environment variable is not an integer ('%s')", env_verbose)
    self.parse_args()
    return self.options, self.args
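# Minimal standalone sketch of the $VERBOSE override behaviour above (the
# helper name is hypothetical, not part of the class):
import os

def effective_verbosity(cli_verbose):
    env_verbose = os.getenv('VERBOSE')
    if env_verbose is None:
        return cli_verbose
    try:
        return max(cli_verbose, int(env_verbose))  # environment can only raise verbosity
    except ValueError:
        return cli_verbose  # non-integer $VERBOSE is ignored (the class logs a warning)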
def run(self):
    json_file = self.get_opt('json')
    parquet_dir = self.get_opt('parquet_dir')
    # let Spark fail if json/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other uri scheme Spark supports
    log.info("Json Source: %s" % json_file)
    log.info("Parquet Destination: %s" % parquet_dir)
    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    if self.verbose < 3 and 'setLogLevel' in dir(sc):
        sc.setLogLevel('WARN')
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        df = sqlContext.read.json(json_file)  # pylint: disable=invalid-name
        df.write.parquet(parquet_dir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        df = sqlContext.jsonFile(json_file)  # pylint: disable=invalid-name
        df.saveAsParquetFile(parquet_dir)
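# Hypothetical invocation of a script built around this run() method (script
# name, paths and option names are placeholders):
#
#   spark-submit --master 'local[4]' spark_json_to_parquet.py \
#       --json data.json --parquet-dir data.parquet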
def process_event_selectors(self, client, trail_list):
    total_event_selectors = 0
    num_management = 0
    num_readwrite_all = 0
    trails_without_selectors = 0
    found = False
    for trail in trail_list:
        name = trail['Name']
        if self.trail_name and self.trail_name != name:
            continue
        found = True
        trail_info = client.get_event_selectors(TrailName=name)
        log.debug('%s', jsonpp(trail_info))
        event_selectors = trail_info['EventSelectors']
        num_event_selectors = len(event_selectors)
        total_event_selectors += num_event_selectors
        if num_event_selectors < 1:
            log.warn('cloud trail %s has no event selectors', name)
            self.warning()
            trails_without_selectors += 1
        for event_selector in event_selectors:
            if event_selector['IncludeManagementEvents']:
                num_management += 1
            if event_selector['ReadWriteType'].lower() == 'all':  # All
                num_readwrite_all += 1
        if num_management < num_event_selectors or \
           num_readwrite_all < num_event_selectors:
            self.warning()
    if self.trail_name and not found:
        raise CriticalError('cloud trail \'{}\' not found'.format(self.trail_name))
    if total_event_selectors == 0:
        self.warning()
    return (total_event_selectors, num_management, num_readwrite_all, trails_without_selectors)
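# Abridged shape of the boto3 CloudTrail get_event_selectors() response this
# method walks (values illustrative):
#
#   {'TrailARN': 'arn:aws:cloudtrail:...',
#    'EventSelectors': [{'ReadWriteType': 'All',
#                        'IncludeManagementEvents': True,
#                        'DataResources': []}]}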
def send_blueprint_file(self, filename, name=''):
    # log.debug('send_blueprint_file(%s, %s)' % (filename, name))
    validate_file(filename, 'blueprint', nolog=True)
    try:
        _ = open(str(filename))
        file_data = _.read()
    except IOError as _:
        err = "failed to read Ambari Blueprint from file '%s': %s" % (filename, _)
        # log.critical(err)
        qquit('CRITICAL', err)
    if not name:
        try:
            name = self.parse_blueprint_name(file_data)
            log.info("name not specified, determined blueprint name from file contents as '%s'" % name)
        except KeyError as _:
            pass
    if not name:
        name = os.path.splitext(os.path.basename(filename))[0]
        log.info("name not specified and couldn't determine blueprint name from blueprint data, reverting to using filename without extension '%s'" % name)  # pylint: disable=line-too-long
    # this solves the issue of having duplicate Blueprint.blueprint_name keys
    data = file_data  # default to the raw file data in case the name can't be reset below
    try:
        json_data = json.loads(file_data)
        json_data['Blueprints']['blueprint_name'] = name
        data = json.dumps(json_data)
        log.info("reset blueprint field name to '%s'" % name)
    except ValueError as _:
        qquit('CRITICAL', "invalid json found in file '%s': %s" % (filename, _))
    except KeyError as _:
        log.warn('failed to reset the Blueprint name: %s' % _)
    return self.send_blueprint(name, data)
def extract_response_message(response_dict):
    try:
        return '{0}: {1}. '.format(response_dict['status']['responseCode'],
                                   response_dict['status']['responseMessage'])
    except KeyError:
        log.warn('failed to extract responseCode/responseMessage for additional error information. ' +
                 support_msg_api())
        return ''
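# Behaviour by example (dicts are illustrative):
#
#   extract_response_message({'status': {'responseCode': 200, 'responseMessage': 'OK'}})
#   # -> '200: OK. '
#   extract_response_message({'status': {}})
#   # -> '' (and logs a warning about the missing fields)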
def run(self):
    if not self.args:
        self.usage('no Dockerfile / directory args given')
    args = uniq_list_ordered(self.args)
    self.branch_prefix = self.get_opt('branch_prefix')
    if self.branch_prefix is not None:
        validate_regex(self.branch_prefix, 'branch prefix')
        self.branch_prefix = re.compile(self.branch_prefix)
    for arg in args:
        if not os.path.exists(arg):
            print("'%s' not found" % arg)
            sys.exit(ERRORS['WARNING'])
        if os.path.isfile(arg):
            log_option('file', arg)
        elif os.path.isdir(arg):
            log_option('directory', arg)
        else:
            die("path '%s' could not be determined as either a file or directory" % arg)
    for arg in args:
        self.check_git_branches_dockerfiles(arg)
    log.info('Total Branches: %s', len(self.branches))
    log.info('Selected Branches: %s', len(self.selected_branches))
    log.info('Branches checked: %s', self.branches_checked)
    log.info('Branches with Dockerfile checked: %s', len(self.branches_dockerfile_checked))
    branches_skipped = len(self.branches_skipped)
    if branches_skipped > 0:
        log.warn('{0} branches skipped for not matching expected naming format'
                 .format(branches_skipped))
    branches_not_checked = len(self.selected_branches) - len(self.branches_dockerfile_checked)
    if branches_not_checked > 1:
        log.warn('{0} branches not checked (no matching Dockerfile found?)'
                 .format(branches_not_checked))
        if log.isEnabledFor(logging.DEBUG):
            log.debug('Branches with no corresponding Dockerfile found:\n%s',
                      '\n'.join(set(self.selected_branches) - set(self.branches_dockerfile_checked)))
    log.info('{0} Dockerfiles checked'.format(len(self.dockerfiles_checked)))
    branches_failed = len(self.branches_failed)
    _ = '{0} Dockerfiles failed validation across {1} branches'.format(self.dockerfiles_failed, branches_failed)
    if branches_failed > 0:
        log.error(_)
    else:
        log.info(_)
    if self.failed:
        log.error('Dockerfile validation FAILED')
        sys.exit(ERRORS['CRITICAL'])
    log.info('Dockerfile validation SUCCEEDED')
def send_blueprint(self, name, data):
    # log.debug('save_blueprint(%s, %s)' % (name, data))
    blueprints = self.get_blueprints()
    if name in blueprints:
        log.warn("blueprint with name '%s' already exists" % name)
    log.info("sending blueprint '%s'" % name)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("blueprint data = '%s'" % data)
    # not exposing this to user via switches - shouldn't be using this right now
    # return self.send('blueprints/%s?validate_topology=false' % name, data)
    return self.send('blueprints/%s' % name, data)
def query(self, url):
    log.debug('GET %s' % url)
    try:
        verify = True
        # workaround for Travis CI and older pythons - we're not exchanging secret data so this is ok
        #if os.getenv('TRAVIS'):
        #    verify = False
        if os.getenv('SSL_NOVERIFY') == '1':
            log.warn('disabling SSL verification')
            verify = False
        auth = None
        if self.user and self.password:
            auth = (self.user, self.password)
            log.debug('setting basic HTTP authentication using username: %s, password: <omitted>', self.user)
        req = requests.get(url, auth=auth, verify=verify)
    except requests.exceptions.RequestException as _:
        die(_)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(), '=' * 80)
    if req.status_code != 200:
        die("%s %s" % (req.status_code, req.reason))
    if not isJson(req.content):
        die('invalid non-JSON response from Docker Registry!')
    if log.isEnabledFor(logging.DEBUG):
        print(jsonpp(req.content))
        print('=' * 80)
    tag_list = []
    try:
        json_data = json.loads(req.content)
        # DockerHub returns like this
        if 'results' in json_data:
            tag_list = [result['name'] for result in json_data['results']]
        # Docker Registry returns like this
        elif 'tags' in json_data:
            tag_list = json_data['tags']
        else:
            raise UnknownError('failed to parse response, found neither results nor tags fields. {0}'
                               .format(support_msg_api()))
        # could perhaps stack overflow in some scenario
        # not as functional programming 'cool' but will do own tail recursion and just while loop instead
        next_page_url = None
        if 'next' in json_data and json_data['next']:
            # tag_list += self.query(json_data['next'])
            next_page_url = json_data['next']
        return (tag_list, next_page_url)
    except KeyError as _:
        die('failed to parse output from Docker Registry (format may have changed?): {0}'.format(_))
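# Sketch of a pagination loop driving query() above, following the
# (tag_list, next_page_url) contract it returns (the url is a placeholder):
#
#   url = 'https://registry.example.com/v2/myimage/tags/list'
#   tags = []
#   while url:
#       (page_of_tags, url) = self.query(url)
#       tags += page_of_tags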
def is_file_dup_by_size(self, filepath):
    size = os.stat(filepath).st_size
    log.debug("file '%s' size '%s'", filepath, size)
    if size == 0:
        log.warn("skipping zero byte file '%s'", filepath)
        return 0
    if size in self.sizes:
        if self.compare_by_size:
            self.dups_by_size[size] = self.dups_by_size.get(size, set())
            # self.sizes[size] only ever holds the first file seen at this size,
            # since later files return here before being recorded below,
            # so unpacking its single key into add() is safe
            self.dups_by_size[size].add(*self.sizes[size])
            self.dups_by_size[size].add(filepath)
        return size
    self.sizes[size] = self.sizes.get(size, {})
    self.sizes[size][filepath] = None
    return False
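# Illustrative walk-through (paths hypothetical, compare_by_size enabled):
#
#   is_file_dup_by_size('/tmp/a')      # first 100-byte file  -> False (recorded in self.sizes)
#   is_file_dup_by_size('/tmp/b')      # second 100-byte file -> 100, dups_by_size[100] = {'/tmp/a', '/tmp/b'}
#   is_file_dup_by_size('/tmp/empty')  # zero bytes -> 0, skipped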
def get_latest_complete_report(reports):
    if not isList(reports):
        code_error('non-list passed to get_latest_complete_report()')
    if not reports:
        qquit('UNKNOWN', 'no reports passed to get_latest_complete_report()')
    num_reports = len(reports)
    index = 0
    report = reports[index]
    while report['status'] == 'INCOMPLETE':
        index += 1
        if index < num_reports:
            report = reports[index]
        else:
            log.warn('only incomplete workflows detected, will have to use latest incomplete workflow')
            report = reports[0]
            break  # every report is INCOMPLETE - stop here rather than looping forever
    return report
def check_git_branches_upstream(self, target):
    target = os.path.abspath(target)
    gitroot = find_git_root(target)
    if gitroot is None:
        die('Failed to find git root for target {0}'.format(target))
    log.debug("finding branches for target '{0}'".format(target))
    repo = git.Repo(gitroot)
    branches = repo.branches
    if self.branch_prefix is not None:
        log.debug('restricting to branches matching branch prefix')
        branches = [x for x in branches if self.branch_prefix.match(str(x))]
        if not branches:
            log.error("No branches matching '%s' for target '%s'", self.get_opt('branch_prefix'), target)
            self.status = 'NO BRANCHES'
    #if log.isEnabledFor(logging.DEBUG):
    #    log.debug('\n\nbranches for target %s:\n\n%s\n', target, '\n'.join(list(branches)))
    for branch in branches:
        expected = '{0}/{1}'.format(self.origin, branch)
        # have to str() this as it returns an object that will fail equality match otherwise
        tracking_branch = str(branch.tracking_branch())
        if tracking_branch == expected:
            log.info("OK: repo '{0}' branch '{1}' is tracking '{2}'"
                     .format(gitroot, branch, tracking_branch))
        elif self.get_opt('fix') and tracking_branch == 'None':
            log.warn("WARN: setting repo '{0}' unconfigured branch '{1}' to track '{2}'"
                     .format(gitroot, branch, expected))
            #print(list(repo.remotes.origin.refs))
            branch.set_tracking_branch(git.refs.remote.RemoteReference(repo, 'refs/remotes/' + expected))
        elif self.get_opt('force_fix'):
            log.warn("WARN: forcibly resetting repo '{0}' branch '{1}' to track '{2}'"
                     .format(gitroot, branch, expected))
            branch.set_tracking_branch(git.refs.remote.RemoteReference(repo, 'refs/remotes/' + expected))
        else:
            self.status = "ERROR"
            log.error("BAD: branch '{0}' is tracking '{1}' (expected '{2}')"
                      .format(branch, tracking_branch, expected))
def check_ping(host, count=None, wait=None):
    if count is None:
        count = 1
    if wait is None:
        wait = 3
    if not isInt(count):
        raise UnknownError("passed invalid count '{0}' to check_ping method, must be a valid integer!"
                           .format(count))
    if not isInt(wait):
        raise UnknownError("passed invalid wait '{0}' to check_ping method, must be a valid integer!"
                           .format(wait))
    log.info("pinging host '%s' (count=%s, wait=%s)", host, count, wait)
    count_switch = '-c'
    if platform.system().lower() == 'windows':
        count_switch = '-n'
    wait_switch = '-w'
    if platform.system().lower() == 'darwin':
        wait_switch = '-W'
    # causes hang if count / wait are not cast to string
    cmd = ['ping', count_switch, '{0}'.format(count), wait_switch, '{0}'.format(wait), host]
    log.debug('cmd: %s', ' '.join(cmd))
    #log.debug('args: %s', cmd)
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        #log.debug('communicating')
        (stdout, stderr) = process.communicate()
        #log.debug('waiting for child process')
        process.wait()
        exitcode = process.returncode
        log.debug('stdout: %s', stdout)
        log.debug('stderr: %s', stderr)
        log.debug('exitcode: %s', exitcode)
        if exitcode == 0:
            log.info("host '%s' responded to ping", host)
            return host
    except subprocess.CalledProcessError as _:
        log.warn('ping failed: %s', _.output)
    except OSError as _:
        die('error calling ping: {0}'.format(_))
    return None
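# Usage by example:
#
#   check_ping('127.0.0.1', count=2, wait=1)  # -> '127.0.0.1' if it responds, else None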
def __init__(self):
    super(SerfEventHandler, self).__init__()
    # allow shorter default 10 sec timeout
    # self.timeout_default = 30
    self.events = ['member-join', 'member-leave', 'member-failed',
                   'member-update', 'member-reap', 'user', 'query']
    self.node = os.getenv('SERF_SELF_NAME', '')
    self.role = os.getenv('SERF_SELF_ROLE', None)
    self.event = os.getenv('SERF_EVENT', None)
    self.query_name = os.getenv('SERF_QUERY_NAME', None)
    self.user_event = os.getenv('SERF_USER_EVENT', None)
    # self.user_ltime = os.getenv('SERF_USER_LTIME', 0)
    # self.query_ltime = os.getenv('SERF_QUERY_LTIME', 0)
    self.command = None
    # "expected to exit within a reasonable amount of time" according to docs, this seems like a reasonable
    # safeguard and is configurable on the command line via --timeout <secs>
    if self.event is None:
        log.warn('SERF_EVENT environment variable was None!!')
    elif self.event not in self.events:
        log.warn("SERF_EVENT environment variable passed unrecognized event type '%s'" % self.event)
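# Serf exports these variables when it invokes an event handler, e.g. a
# member-join event might run the handler roughly as (values illustrative):
#
#   SERF_EVENT=member-join SERF_SELF_NAME=node1 SERF_SELF_ROLE=web ./handler.py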
def check_ping(host, count=1, wait=1):
    log.info("pinging host '%s' (count=%s, wait=%s)", host, count, wait)
    # pass switch and value as separate argv elements - combined '-c 1' style
    # arguments are not parsed reliably across platforms
    ping_count = ['-c', '{0}'.format(count)]
    if platform.system().lower() == 'windows':
        ping_count = ['-n', '1']
    ping_wait = ['-w', '{0}'.format(wait)]
    if platform.system().lower() == 'darwin':
        ping_wait = ['-W', '1']
    try:
        exitcode = subprocess.call(['ping'] + ping_count + ping_wait + [host],
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if exitcode == 0:
            log.info("host '%s' responded to ping", host)
            return host
    except subprocess.CalledProcessError as _:
        log.warn('ping failed: %s', _.output)
    except OSError as _:
        die('error calling ping: {0}'.format(_))
    return None
def branch_version(self, branch):
    branch_base = None
    branch_versions = []
    # if ...-x.y-x.y
    match = self.branch_regex.match(branch)
    if match:
        groups = match.groups()
        #log.debug('groups = %s', groups)
        branch_base = groups[0]
        for version in groups[1:]:
            if version is None:
                continue
            branch_versions.append(version)
    else:
        log.warn("Failed to match branch format for branch '{0}'".format(branch) +
                 ", code needs extension for this branch naming format")
        self.branches_skipped.add(branch)
        return ('', [])
    log.debug('branch_base = %s', branch_base)
    log.debug('branch_versions = %s', branch_versions)
    return (branch_base, branch_versions)
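# Illustrative behaviour, assuming branch_regex captures a base name followed
# by optional version suffixes (the actual pattern is defined elsewhere in the
# class, so these examples are assumptions):
#
#   branch_version('centos-7.3')     # -> ('centos', ['7.3'])
#   branch_version('not_a_version')  # -> ('', []) and the branch is recorded as skipped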
def run(self):
    if not self.args:
        self.usage('no Dockerfile / directory args given')
    args = uniq_list_ordered(self.args)
    self.branch_prefix = self.get_opt('branch_prefix')
    if self.branch_prefix is not None:
        validate_regex(self.branch_prefix, 'branch prefix')
        self.branch_prefix = re.compile(self.branch_prefix)
    for arg in args:
        if not os.path.exists(arg):
            print("'%s' not found" % arg)
            sys.exit(ERRORS['WARNING'])
        if os.path.isfile(arg):
            log_option('file', arg)
        elif os.path.isdir(arg):
            log_option('directory', arg)
        else:
            die("path '%s' could not be determined as either a file or directory" % arg)
    for arg in args:
        self.check_git_branches_dockerfiles(arg)
    branches_skipped = len(self.branches_skipped)
    if branches_skipped > 0:
        log.warn('{0} branches skipped for not matching expected naming format'
                 .format(branches_skipped))
    log.info('{0} Dockerfiles checked across {1} branches'
             .format(len(self.dockerfiles_checked), self.branches_checked))
    branches_failed = len(self.branches_failed)
    _ = '{0} Dockerfiles failed validation across {1} branches'.format(self.dockerfiles_failed, branches_failed)
    if branches_failed > 0:
        log.error(_)
    else:
        log.info(_)
    if self.failed:
        log.error('Dockerfile validation FAILED')
        sys.exit(ERRORS['CRITICAL'])
    log.info('Dockerfile validation SUCCEEDED')
def process_file(self, filename, file_handle):
    for line in file_handle:
        # log.debug(line)
        match = self.re_line.match(line)
        if not match:
            err_msg = "ERROR in file '{0}' on line: {1}".format(filename, line)
            if not self.skip_errors:
                die(err_msg)
            printerr()
            log.warn(err_msg)
            continue
        metric = match.group(1)
        timestamp = match.group(2)
        # don't have a need for this right now
        # value = match.group(3)
        tags = match.group(4)
        key = metric
        if self.include_timestamps:
            timestamp = int(timestamp)
            # remove millis
            if len(str(timestamp)) >= 15:
                timestamp = round(timestamp / 1000)
            hour = time.strftime('%Y-%m-%d %H:00', time.gmtime(timestamp))
            key += ' ' + hour
        for tag in sorted(tags.split()):
            key += ' ' + tag.strip()
        if self.prefix_length is None:
            prefix = key
        else:
            prefix = key[0:min(self.prefix_length, len(key))]
        # prefix = self.bytes_to_str(prefix)
        if not self.keys.get(prefix):
            self.keys[prefix] = {'count': 0}
        self.keys[prefix]['count'] += 1
        self.total_keys += 1
        if self.verbose < 2 and self.total_keys % 10000 == 0:
            print('.', file=sys.stderr, end='')
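# Example of the line format this parses (OpenTSDB-style data points; the
# actual pattern is self.re_line, defined elsewhere, so this is an assumption):
#
#   sys.cpu.user 1479496100 42.5 host=web01 cpu=0
#
# which is bucketed under the key 'sys.cpu.user cpu=0 host=web01' (tags
# sorted), plus an hour column when include_timestamps is set.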
def run(self):
    csv_file = self.get_opt('csv')
    parquet_dir = self.get_opt('parquet_dir')
    has_header = self.get_opt('has_header')
    # I don't know why the Spark guys made this a string instead of a bool
    header_str = 'false'
    if has_header:
        header_str = 'true'
    schema = self.get_opt('schema')
    # let Spark fail if csv/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other uri scheme Spark supports
    log.info("CSV Source: %s" % csv_file)
    log.info("Parquet Destination: %s" % parquet_dir)
    if schema:
        def get_type(arg):
            arg = str(arg).lower()
            if arg not in self.types_mapping:
                self.usage("invalid type '%s' defined in --schema, must be one of: %s"
                           % (arg, ', '.join(sorted(self.types_mapping.keys()))))
            # return self.types_mapping[arg]
            module = __import__('pyspark.sql.types', globals(), locals(), ['types'], -1)
            class_ = getattr(module, self.types_mapping[arg])
            _ = class_()
            return _

        def create_struct(arg):
            name = str(arg).strip()
            data_type = 'string'
            if ':' in arg:
                (name, data_type) = arg.split(':', 1)
            data_class = get_type(data_type)
            return StructField(name, data_class, True)

        # see https://github.com/databricks/spark-csv#python-api
        self.schema = StructType([create_struct(_) for _ in schema.split(',')])
        log.info('generated CSV => Spark schema')
    conf = SparkConf().setAppName('HS PySpark CSV => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    df = None
    if isMinVersion(spark_version, 1.4):
        if has_header and not schema:
            log.info('inferring schema from CSV headers')
            df = sqlContext.read.format('com.databricks.spark.csv')\
                           .options(header=header_str, inferschema='true')\
                           .load(csv_file)
        else:
            log.info('using explicitly defined schema')
            df = sqlContext.read\
                           .format('com.databricks.spark.csv')\
                           .options(header=header_str)\
                           .load(csv_file, schema=self.schema)
        df.write.parquet(parquet_dir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        if has_header and not schema:
            log.info('inferring schema from CSV headers')
            df = sqlContext.load(source="com.databricks.spark.csv",
                                 path=csv_file, header=header_str, inferSchema='true')
        elif self.schema:
            log.info('using explicitly defined schema')
            schema = self.schema
            df = sqlContext.load(source="com.databricks.spark.csv",
                                 path=csv_file, header=header_str, schema=schema)
        else:
            die('no header and no schema, caught late')
        df.saveAsParquetFile(parquet_dir)
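# Example --schema argument accepted by create_struct()/get_type() above
# (field names illustrative; each type must be a key in self.types_mapping):
#
#   --schema 'name:string,age:int,balance:double,created:timestamp'
#
# Fields without an explicit ':type' default to string.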
def check_workflow(self, workflow_name, workflow_id, max_age=None, max_runtime=None):
    log.info("checking workflow '%s' id '%s'", workflow_name, workflow_id)
    # GET /workflow/fetchWorkflowStatus/<instance_id> is also available but only uses wfId, doesn't support wfName
    # returns ['result']['list'] = [ {}, {}, ... ]
    (req, self.query_time) = self.req(
        url='{url_base}/workflow/publish/getWorkflowExecutionHistory'.format(url_base=self.url_base),
        # orders by newest first, but seems to return last 10 anyway
        body=json.dumps({'chunk_size': 1,
                         'currentPage': 1,
                         'wfName': workflow_name,
                         'wfId': workflow_id}))
    info = ''
    if workflow_name:
        info += " name '{0}'".format(workflow_name)
    if workflow_id:
        info += " id '{0}'".format(workflow_id)
    try:
        json_dict = json.loads(req.content)
        result = json_dict['result']
        not_found_err = '{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                        'Perhaps you specified the wrong name/id or the workflow hasn\'t run yet? ' + \
                        'Use --list to see existing workflows'
        if result is None:
            if self._all:
                return None
            qquit('CRITICAL', "no results found for workflow{0}".format(not_found_err))
        reports = result['jobExecutionReports']
        if not isList(reports):
            raise ValueError('jobExecutionReports is not a list')
        if not reports:
            qquit('CRITICAL', "no reports found for workflow{0}".format(not_found_err))
        # orders by newest first by default, checking last run only
        report = reports[0]
        num_reports = len(reports)
        index = 0
        while report['status'] == 'INCOMPLETE':
            index += 1
            if index >= num_reports:
                log.warn('only incomplete workflows detected')
                report = reports[0]
                break  # every report is INCOMPLETE - stop here rather than looping forever
            report = reports[index]
        status = report['status']
        if status == 'SUCCESS':
            pass
        elif status == 'INCOMPLETE':
            self.warning()
        else:
            self.critical()
        self.msg += "workflow '{workflow}' id '{id}' status = '{status}'".format(
            workflow=report['wfName'], id=report['wfId'], status=status)
        if not self._all:
            self.check_times(report['startDate'], report['endDate'], max_age, max_runtime)
        return status
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))
def run(self):
    csv_file = self.options.csv
    parquet_dir = self.options.parquet_dir
    has_header = self.options.has_header
    # I don't know why the Spark guys made this a string instead of a bool
    header_str = 'false'
    if has_header:
        header_str = 'true'
    schema = self.options.schema
    # let Spark fail if csv/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other uri scheme Spark supports
    log.info("CSV Source: %s" % csv_file)
    log.info("Parquet Destination: %s" % parquet_dir)
    if schema:
        def get_type(arg):
            arg = str(arg).lower()
            if arg not in self.types_mapping:
                self.usage("invalid type '%s' defined in --schema, must be one of: %s"
                           % (arg, ', '.join(sorted(self.types_mapping.keys()))))
            # return self.types_mapping[arg]
            module = __import__('pyspark.sql.types', globals(), locals(), ['types'], -1)
            class_ = getattr(module, self.types_mapping[arg])
            _ = class_()
            return _

        def create_struct(arg):
            name = arg
            data_type = 'string'
            if ':' in arg:
                (name, data_type) = arg.split(':', 1)
            data_class = get_type(data_type)
            return StructField(name, data_class, True)

        # see https://github.com/databricks/spark-csv#python-api
        self.schema = StructType([create_struct(_) for _ in schema.split(',')])
        log.info('generated CSV => Spark schema')
    conf = SparkConf().setAppName('HS PySpark CSV => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    df = None
    if isMinVersion(spark_version, 1.4):
        if has_header and not schema:
            log.info('inferring schema from CSV headers')
            df = sqlContext.read.format('com.databricks.spark.csv')\
                           .options(header=header_str, inferschema='true')\
                           .load(csv_file)
        else:
            log.info('using explicitly defined schema')
            df = sqlContext.read\
                           .format('com.databricks.spark.csv')\
                           .options(header=header_str)\
                           .load(csv_file, schema=self.schema)
        df.write.parquet(parquet_dir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        if has_header and not schema:
            log.info('inferring schema from CSV headers')
            df = sqlContext.load(source="com.databricks.spark.csv",
                                 path=csv_file,
                                 header=header_str,
                                 inferSchema='true')
        elif self.schema:
            log.info('using explicitly defined schema')
            schema = self.schema
            df = sqlContext.load(source="com.databricks.spark.csv",
                                 path=csv_file,
                                 header=header_str,
                                 schema=schema)
        else:
            die('no header and no schema, caught late')
        df.saveAsParquetFile(parquet_dir)