def run_background_job(cls, runSmokeTestSetup=True, config=None):
    '''
    Runs background long running Flume Job
    :param runSmokeTestSetup: Runs smoke test setup if set to true
    :param config: expected configuration location
    :return: Total number of long running jobs started
    '''
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    HDFS.createDirectory(cls._hdfs_test_dir, perm="777", force=True)
    UpgradePerNode.reportProgress("[INFO][FLUME][BGJob] Long running job for Flume component started")
    logger.info("Starting the Flume Agent Topology")
    addlParams = "-Dflume.log.dir=%s -Dflume.log.file=agent2.log" % cls._local_work_dir
    agent2.start("agent2", cls._flume_test_src, addlParams=addlParams, enableDebugLogOnConsole=False)
    logger.info("Sleeping for 10 seconds before starting the other Flume agent")
    time.sleep(10)
    addlParams = "-Dflume.log.dir=%s -Dflume.log.file=agent.log" % cls._local_work_dir
    agent1.start("agent", cls._flume_test_src, addlParams=addlParams, enableDebugLogOnConsole=False)
    time.sleep(5)
    return 1
def run_background_job(cls, runSmokeTestSetup=True, config=None, flagFile="/tmp/flagFile"):
    '''
    Uploads files to HDFS before upgrade starts and runs a long running sleep job in the background
    :return: number of applications started
    '''
    # start long running application which performs I/O operations (BUG-23838)
    #from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    #UpgradePerNode.reportProgress("### Background application for HDFS started ####")
    #jobArgs = {"mapred.job.queue.name" : cls._queue}
    #HadoopJobHelper.runSleepJob(numOfMaps = 1, numOfReduce = 1, mapSleepTime = "10000000", reduceSleepTime = "100", extraJobArg = jobArgs, runInBackground = True, config = config, directoutput = False )
    #MAPRED.triggerSleepJob("1", "0", "100000", "1000000", 1, background = True)
    # load generator
    HADOOP_TEST_JAR = cls.get_hadoop_test_jar()
    TEST_USER = Config.get('hadoop', 'HADOOPQA_USER')
    HDFS.deleteDirectory(flagFile)
    slavelist = HDFS.getDatanodes()
    jobCmd = 'jar %s NNloadGenerator -Dmapred.job.queue.name=%s -mr 3 %s -root %s -numOfThreads 5 -maxDelayBetweenOps 1000 -elapsedTime 36000 -flagFile %s' % (
        HADOOP_TEST_JAR, cls._queue, cls._lgTestOutputDir, cls._lgTestDataDir, flagFile
    )
    Hadoop.runInBackground(jobCmd)
    time.sleep(15)
    return 1
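# Hypothetical companion sketch (not part of the original module): NNloadGenerator keeps running
# until its elapsed time expires or the flag file appears, so a matching teardown would normally
# just create the flag file that run_background_job() passed on the command line above.
#
# def stop_background_job(cls, flagFile="/tmp/flagFile"):
#     HDFS.touchz(flagFile)   # signal the load generator mappers to wind down
#     time.sleep(15)          # give the MR job a moment to notice the flag and exit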
def setup():
    out = HDFS.deleteFile(CREATE_FILE_PATH_IN_HADOOP, user=HDFS_USER)
    assert out[0] == 0
    out = HDFS.deleteDirectory(OUT_PATH_IN_HADOOP, user=HDFS_USER)
    assert out[0] == 0
    out = HDFS.deleteDirectory(CREATE_FILE_2_PATH_IN_HADOOP, user=HDFS_USER)
    assert out[0] == 0
def ru_prepare_save_state_for_upgrade(cls):
    '''
    Prepare Namenode to save state for upgrade
    '''
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    UpgradePerNode.reportProgress("[INFO][HDFS][Prepare] Preparing state for HDFS upgrade")
    # BUG-26726: we need to be in safemode only in non HA cluster
    if not HDFS.isHAEnabled():
        exit_code, output = HDFS.runasAdmin("dfsadmin -safemode enter")
        ruAssert("HDFS", exit_code == 0, '[Preparation] enter safemode failed')
    exit_code, output = HDFS.runas(
        Config.get('hadoop', 'HDFS_USER'),
        "dfsadmin -Ddfs.client.test.drop.namenode.response.number=0 -rollingUpgrade prepare"
    )
    ruAssert("HDFS", exit_code == 0, '[Preparation] -rollingUpgrade prepare failed')
    if not HDFS.isHAEnabled():
        exit_code, output = HDFS.runasAdmin("dfsadmin -safemode leave")
        ruAssert("HDFS", exit_code == 0, '[Preparation] leave safemode failed')
    UpgradePerNode.reportProgress("[INFO][HDFS][Prepare] Preparing state for HDFS upgrade finished")
def createState4Rollback2(cls):
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), "dfs -rm -skipTrash rollback_state1")
    ruAssert("HDFS", exit_code == 0, "can't remove file rollback_state1")
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), "dfs -touchz rollback_state2")
    ruAssert("HDFS", exit_code == 0, "can't create file rollback_state2")
    # truncate the file and validate the truncated size
    logger.info("**** Truncate file to 1 byte ****")
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), "dfs -truncate 1 testFileTr")
    ruAssert("HDFS", exit_code == 0, "can't truncate file testFileTr")
    if os.path.isfile(cls.localTestFileTr):
        os.remove(cls.localTestFileTr)
    logger.info("**** Wait 30 seconds for file to be recovered ****")
    time.sleep(30)
    command = "dfs -copyToLocal testFileTr " + cls.localTestFileTr
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), command)
    ruAssert("HDFS", exit_code == 0, "can't copy file testFileTr")
    size = os.path.getsize(cls.localTestFileTr)
    ruAssert("HDFS", size == 1, "size not 1. Actual size: " + str(size))
def getNameNodeURL(nameservice2=False):
    if Hadoop.isEncrypted():
        baseUrl = "https://%s" % HDFS.getNamenodeHttpsAddress(nameservice2)
    else:
        baseUrl = "http://%s" % HDFS.getNamenodeHttpAddress(nameservice2)
    logger.info("URL being returned is - %s" % baseUrl)
    return baseUrl
def setupHS2ConcurrencyDataset():
    logger.info("Setup test data")
    data_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "hs2concur-test-data")
    data_tgz = os.path.join(Config.getEnv('WORKSPACE'), "hs2concur-test-data.tgz")
    if not os.path.isfile(data_tgz):
        assert util.downloadUrl(Config.get('hive', 'HS2CONCURR_TEST_DATA'), data_tgz)
    Machine.tarExtractAll(data_tgz, data_dir)
    # load data into HDFS
    hdfs_user = Config.get("hadoop", 'HDFS_USER')
    HDFS.createDirectory("/tmp/hs2data", user=hdfs_user, perm='777', force=True)
    HDFS.createDirectory("/tmp/hs2data/student", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'studenttab10k'), "/tmp/hs2data/student")
    HDFS.createDirectory("/tmp/hs2data/voter", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'votertab10k'), "/tmp/hs2data/voter")
    query = """drop table if exists student_txt;
create external table student_txt (name string, age int, gpa double) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/student';
drop table if exists voter_txt;
create external table voter_txt (name string, age int, registration string, contributions float) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/voter';
drop table if exists student;
create table student (name string, age int, gpa double) CLUSTERED BY (name) INTO 20 BUCKETS STORED AS ORC TBLPROPERTIES('transactional'='true');
drop table if exists voter;
create table voter (name string, age int, registration string, contributions float) CLUSTERED BY (name) INTO 20 BUCKETS STORED AS ORC TBLPROPERTIES('transactional'='true');
Insert into table student select * from student_txt;
Insert into table voter select * from voter_txt;"""
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(query, readFromFile=True, logoutput=True)
    assert exit_code == 0, "Test data creation failed"
def updateJobProperties(cls, propFile, properties=None, haEnabled=False, debug=False):
    fileSystemName = Hadoop.getFSDefaultValue()
    jobTrackerIP = MAPRED.getJobtrackerAddress()
    jobTracker = jobTrackerIP[0] + ":" + jobTrackerIP[1]

    if not properties:
        properties = {}
    if not properties.has_key('nameNode'):
        properties['nameNode'] = fileSystemName
    if not properties.has_key('jobTracker'):
        properties['jobTracker'] = jobTracker

    if "hcatalog" in propFile:
        if Hadoop.isSecure():
            kerberosPrincipal = Hive.getConfigValue("hive.metastore.kerberos.principal")
            properties['hive.metastore.kerberos.principal'] = kerberosPrincipal
        logger.info("Updating for hcatalog workflow")
        hcatNode = Hive.getConfigValue("hive.metastore.uris").replace('thrift', 'hcat')
        logger.info("Hcat node is " + hcatNode)
        properties['hcatNode'] = hcatNode

    if Hadoop.isSecure():
        # determine the namenode and the jobtracker principal
        nnPrincipal = None
        if haEnabled:
            nnPrincipal = HDFS.getNameNodePrincipal().replace('_HOST', HDFS.getNamenodeByState('active'))
        else:
            nnPrincipal = HDFS.getNameNodePrincipal().replace('_HOST', HDFS.getNamenodeHttpAddress()[0])
        jtPrincipal = MAPRED.getMasterPrincipal().replace('_HOST', jobTrackerIP[0])
        properties['dfs.namenode.kerberos.principal'] = nnPrincipal
        properties['mapreduce.jobtracker.kerberos.principal'] = jtPrincipal

    wfPath = util.getPropertyValueFromFile(propFile, "oozie.wf.application.path")
    if wfPath != None and wfPath.find("hdfs://localhost:9000") != -1:
        wfPath = wfPath.replace("hdfs://localhost:9000", fileSystemName)
        logger.info("Value of replaced oozie.wf.application.path is " + wfPath)
        properties['oozie.wf.application.path'] = wfPath

    util.writePropertiesToFile(propFile, propFile, properties)

    if debug:
        logger.info('Content of properties file %s' % propFile)
        f = open(propFile, 'r')
        # print the file to the console
        logger.info(f.read())
        f.close()
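# Hypothetical usage sketch (the workflow directory and extra properties below are illustrative,
# not taken from this module): point the helper at an Oozie job.properties file and let it fill
# in the cluster-specific values before submitting the workflow.
#
# propFile = os.path.join(Config.getEnv('ARTIFACTS_DIR'), 'wordcount-wf', 'job.properties')
# cls.updateJobProperties(
#     propFile,
#     properties={'queueName': 'default'},   # merged with the generated nameNode/jobTracker entries
#     haEnabled=HDFS.isHAEnabled(),
#     debug=True                             # dumps the rewritten file into the test log
# )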
def run_client_smoketest(cls, config=None, env=None):
    '''
    Run wordcount job passing env variables
    :param config: Configuration location
    :param env: Set Environment variables
    '''
    logger.info("**** Running HDFS CLI Test ****")
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    UpgradePerNode.reportProgress("[INFO][HDFS][ClientSmoke] CLI test for HDFS started")
    if not cls._SmokeInputDir:
        cls._SmokeInputDir = cls._base_hdfs_dir + "/smokeHdfsInput"
    SmokeOutputDir = cls._base_hdfs_dir + '/smokeHdfsOutput_cli'
    HDFS.deleteDirectory(SmokeOutputDir, Config.get('hadoop', 'HADOOPQA_USER'))
    jobCmd = 'jar %s wordcount \"-Dmapreduce.reduce.input.limit=-1\" \"-D%s=%s\" %s %s' % (
        Config.get('hadoop', 'HADOOP_EXAMPLES_JAR'), "mapred.job.queue.name", cls._queue,
        cls._SmokeInputDir, SmokeOutputDir
    )
    exit_code, stdout = Hadoop.run(jobCmd, env=env)
    ruAssert("HDFS", exit_code == 0, "[ClientSmoke] Hdfs smoketest failed")
    # check the exit code of the cleanup itself, not the stale wordcount exit code
    exit_code, stdout = HDFS.deleteDirectory(SmokeOutputDir)
    ruAssert("HDFS", exit_code == 0, "[ClientSmoke] could not delete: " + SmokeOutputDir)
    UpgradePerNode.reportProgress("[INFO][HDFS][ClientSmoke] CLI test for HDFS finished")
def background_job_teardown(cls):
    '''
    Cleanup for long running Yarn job
    '''
    HDFS.deleteDirectory(cls._hdfs_input, user=Config.get('hadoop', 'HADOOPQA_USER'))
    HDFS.deleteDirectory(cls._hdfs_output, user=Config.get('hadoop', 'HADOOPQA_USER'))
def background_job_setup(cls, runSmokeTestSetup=True, config=None):
    '''
    Setup for background long running job
    :param runSmokeTestSetup: Runs smoke test setup if set to true
    '''
    logger.info("runSmokeTestSetup = %s, config = %s", runSmokeTestSetup, config)
    HDFS.createDirectory(cls.HDFS_CLUSTER_INPUT_DIR)
def background_job_teardown(cls):
    '''
    Cleanup of HDFS background job
    '''
    HDFS.deleteDirectory(cls._base_hdfs_dir)
    command = "rm -rf " + cls._lgStructureDir
    exit_code, stdout = Machine.runas(
        Machine.getAdminUser(), command, None, None, None, "True", Machine.getAdminPasswd()
    )
def run(self):
    """
    Move files to the HDFS input dir after each interval period, n times.
    """
    for count in range(0, self.times):
        text = "hello world \n Testing HDFS Word count Spark application"
        random_name = ''.join(random.choice(string.lowercase) for i in range(5))
        filename = os.path.join(Config.getEnv('ARTIFACTS_DIR'), random_name)
        util.writeToFile(text, filename, isAppend=False)
        max_retry = 3
        retry_count = 0
        while retry_count < max_retry:
            try:
                if "hdfs://ns2" in self.hdfs_input_dir:
                    cp_status = HDFS.copyFromLocal(filename, "hdfs://ns2/tmp", enableDebug=True)
                else:
                    cp_status = HDFS.copyFromLocal(filename, "/tmp", enableDebug=True)
                assert cp_status[0] == 0, "Failed to copy file to HDFS 'tmp'"
                logger.info("copyFromLocal command finished for %s" % filename)
                if "hdfs://ns2" in self.hdfs_input_dir:
                    mv_status = HDFS.mv(None, "hdfs://ns2/tmp/" + random_name, self.hdfs_input_dir, config=None)
                else:
                    mv_status = HDFS.mv(None, "/tmp/" + random_name, self.hdfs_input_dir, config=None)
                assert mv_status[0] == 0, "Failed to move file from 'tmp' to test directory"
            except:
                if retry_count < max_retry:
                    retry_count = retry_count + 1
                    logger.info(
                        "File copy into HDFS test directory failed on attempt %s, retrying after 120s sleep interval" % retry_count
                    )
                    time.sleep(120)
                else:
                    logger.error("Failed to copy file into HDFS test directory, expect failures in HDFSWordCount")
            else:
                break
        logger.info("%s moved to %s" % (filename, self.hdfs_input_dir))
        logger.info("sleeping for %s seconds" % self.interval)
        time.sleep(self.interval)
def generate_test_data(cls, hdfs_test_dir, num_of_rows):
    test_data_file = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "sqooptest.dat")
    f = open(test_data_file, 'w')
    userid = 100000
    for i in xrange(num_of_rows):
        f.write("%d,%d\n" % (userid + i, random.randint(10, 80)))
    f.close()
    HDFS.createDirectory(hdfs_test_dir, user=cls._hdfs_user, perm='777', force=True)
    HDFS.copyFromLocal(test_data_file, hdfs_test_dir)
def background_job_teardown(cls):
    '''
    Cleanup directories for long running Tez job
    '''
    for input in cls._hdfsInputList:
        HDFS.deleteDirectory(input)
    for output in cls._hdfsOutputList:
        HDFS.deleteDirectory(output)
    Machine.rm(user=HADOOPQA_USER, host=None, filepath=LOCAL_WORK_DIR, isdir=True)
    logger.info("**** Completed background job teardown for Tez ****")
def stopYarnLongRunningJob(cls):
    '''
    Stop long running Yarn Dshell job
    '''
    logger.info("**** Touch the file ****")
    HDFS.createDirectory(cls._multi_version_signal_file_dir, user=None, perm="777", force=False)
    multi_version_signal_file_path = cls._multi_version_signal_file_dir + "/signal"
    HDFS.touchz(multi_version_signal_file_path)
    #YARN.waitForApplicationFinish(cls._background_job_appId)
    time.sleep(2)
    logger.info("**** Done checking status ****")
def getKnoxHDFSURL(nameservice2=False):
    KNOX_HOST = Config.get('knox', 'KNOX_HOST').split(',')[0]
    if HDFS.isFederated():
        if nameservice2:
            baseUrl = "https://%s:8443/gateway/ui_%s/hdfs/" % (KNOX_HOST, HDFS.getNameServices()[1])
        else:
            baseUrl = "https://%s:8443/gateway/ui_%s/hdfs/" % (KNOX_HOST, HDFS.getNameServices()[0])
    else:
        baseUrl = "https://%s:8443/gateway/ui/hdfs/" % KNOX_HOST
    baseUrlWithNNHost = "%s?host=%s" % (baseUrl, getNameNodeURL(nameservice2))
    logger.info("URL being returned is - %s" % baseUrlWithNNHost)
    return baseUrl, baseUrlWithNNHost
def background_job_setup(cls, runSmokeTestSetup=True, config=None):
    '''
    Create 4 input datasets (HDFS_INPUT0..HDFS_INPUT3) for TestOrderedWordCount
    :param runSmokeTestSetup: Runs smoke test setup if set to true
    '''
    logger.info("*** Start background job setup for Tez ***")
    Machine.rm(user=HADOOPQA_USER, host=None, filepath=LOCAL_WORK_DIR, isdir=True)
    os.mkdir(LOCAL_WORK_DIR)
    for i in range(0, 4, 1):
        inputDirName = "HDFS_INPUT%d" % i
        inputDirPath = os.path.join(LOCAL_WORK_DIR, inputDirName)
        HadoopJobHelper.runCustomWordWriter(LOCAL_WORK_DIR, inputDirPath, 10, 400, 10000)
        hdfsInputDir = "/user/%s/Input%d" % (HADOOPQA_USER, i)
        hdfsOutputDir = "/user/%s/output%d" % (HADOOPQA_USER, i)
        # In case already present, delete the input directory
        HDFS.deleteDirectory(hdfsInputDir)
        HDFS.createDirectory(hdfsInputDir)
        HDFS.deleteDirectory(hdfsOutputDir)
        HDFS.copyFromLocal(inputDirPath, hdfsInputDir)
        cls._hdfsInputList.append(hdfsInputDir + "/" + inputDirName)
        cls._hdfsOutputList.append(hdfsOutputDir)
        logger.info("Created data for input %d", i)
    logger.info("*** End background job setup for Tez ***")
def ru_downgrade_state(cls):
    '''
    Downgrades the Namenode.
    A downgrade is done when the saved state may need to be converted back to the previous
    version, or the state is compatible and the upgrade is being abandoned.
    NOTE: this command will not return until the namenode shuts down
    '''
    command = "sudo su - -c 'hadoop namenode -rollingUpgrade downgrade' hdfs"
    if HDFS.isHAEnabled():
        nodes = []
        nodes.append(HDFS.getNamenodeByState('standby'))
        nodes.append(HDFS.getNamenodeByState('active'))
        for node in nodes:
            HDFS.resetNamenode('stop', host=node)
            (exitcode, stdout) = Machine.runas(
                Machine.getAdminUser(), command, node, None, None, "True", Machine.getAdminPasswd()
            )
            ruAssert(
                "HDFS", exitcode == 0,
                "[NNDowngrade] hadoop namenode -rollingUpgrade downgrade command failed"
            )
        return

    HDFS.stopNamenode()
    node = HDFS.getNamenode()
    (exitcode, stdout) = Machine.runas(
        Machine.getAdminUser(), command, node, None, None, "True", Machine.getAdminPasswd()
    )
    ruAssert(
        "HDFS", exitcode == 0,
        "[NNDowngrade] hadoop namenode -rollingUpgrade downgrade command failed"
    )
def createState4Rollback1(cls):
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), "dfs -rm -skipTrash rollback_state1")
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), "dfs -rm -skipTrash rollback_state2")
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), "dfs -rm -skipTrash testFileTr")
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), "dfs -touchz rollback_state1")
    ruAssert("HDFS", exit_code == 0, "can't create file rollback_state1")
    command = "dfs -put " + cls.testFileTr + " testFileTr"
    exit_code, stdout = HDFS.runas(Config.get('hadoop', 'HADOOPQA_USER'), command)
    ruAssert("HDFS", exit_code == 0, "can't upload " + cls.testFileTr)
def ensure_all_jns_are_up(cls, nodes):
    # run roll edits
    HDFS.rollEdits()
    time.sleep(5)
    # capture LastAppliedOrWrittenTxId from the NN JMX
    nn_url = HDFS.getNNWebAppAddress() + '/jmx'
    nn_data = util.getJMXData(nn_url, 'Hadoop:service=NameNode,name=NameNodeInfo', 'JournalTransactionInfo')
    json_data = json.loads(nn_data)
    last_tx_id = int(json_data['LastAppliedOrWrittenTxId'])
    logger.info('******************** NN LAST TX ID: %s *************************' % last_tx_id)
    cls.ensure_jns_have_new_txn(nodes, last_tx_id)
def setupSchemaEvolutionDataset():
    logger.info("Setup Schema Evolution dataset")
    HDFS.createDirectory(HCAT_TEST_DIR, user=HDFS_USER, perm='777', force=True)
    HDFS.createDirectory(HDFS_TEST_DIR, user=HDFS_USER, perm='777', force=True)

    HIVE_TEST_CMD = "-Dhive.use.beeline=true -Dhadoop.home=%s -Dhive.home=%s -Dhcat.home=%s -Dpig.home=%s -Dhbase.home=%s" % (
        HADOOP_HOME, HIVE_HOME, HCATALOG_HOME, PIG_HOME, HIVE_HOME
    )
    if Hadoop.isHadoop2():
        HIVE_TEST_CMD += " -Dmapred.home=%s -Dhadoop.conf.dir=%s" % (Config.get('hadoop', 'MAPRED_HOME'), HADOOP_CONF)

    hiveServer2Url = str(Hive.getHiveServer2Url())
    exit_code, stdout = Ant.run(
        HIVE_TEST_CMD + " deploy-schemaevolution", cwd=SRC_DIR, env={"HIVE_SERVER2_URL": hiveServer2Url}
    )
    assert exit_code == 0
def perform_post_upgrade_steps(self):
    if Config.getEnv("HDP_STACK_INSTALLED").lower() == "true":
        from beaver.component.hadoop import Hadoop, HDFS
        from beaver.component.hive import Hive
        COMPONENT = str(self.COMPONENT)
        HDFS_USER = Config.get('hadoop', 'HDFS_USER')
        if 'experiment' in COMPONENT and Hive.isInstalled():
            HIVE_WAREHOUSE_DIR = Hive.getConfigValue(
                "hive.metastore.warehouse.dir", defaultValue="/apps/hive/warehouse"
            )
            HDFS.chmod(HDFS_USER, 777, HIVE_WAREHOUSE_DIR, True)
        else:
            UpgradeLogger.reportProgress("No additional post-upgrade steps defined for EU", True)
    else:
        logger.info("No additional post-upgrade steps defined for EU on HDF")
def verifyLongRunningQuery(cls, file_to_verify):
    lfile = os.path.join(Config.getEnv('ARTIFACTS_DIR'), file_to_verify)
    exit_code, stdout = HDFS.copyToLocal(cls._hdfs_bgjtest_dir + "/" + file_to_verify, lfile)
    if exit_code != 0:
        logger.info("Error fetching the timestamp file from HDFS")
        return False
    lines = open(lfile, 'r').readlines()
    if len(lines) == 0:
        logger.info("Empty timestamp file")
        return False
    try:
        ts = int(lines[-1])
        # Shutdown gracefully
        if ts == -1:
            return True
        # Timestamp should be less than 5 minutes old, which indicates
        # the UDF wrote something at least once in the last 5 minutes
        timegap = time.time() - (ts / 1000)
        if timegap > 300:
            logger.info("Time gap is %d seconds, last line in the timestamp file was '%d'" % (timegap, ts))
            return False
    except ValueError:
        logger.info("Error parsing last line in the timestamp file => '" + lines[-1] + "'")
        return False
    return True
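# Hypothetical caller sketch: while the upgrade is in flight, poll the heartbeat file that the
# long-running Hive UDF writes into cls._hdfs_bgjtest_dir (the file name and the
# upgrade_in_progress() helper below are illustrative, not defined in this module).
#
# while upgrade_in_progress():
#     ruAssert("Hive", cls.verifyLongRunningQuery("udf_timestamps.txt"),
#              "[BGJobCheck] long running query stopped writing timestamps")
#     time.sleep(60)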
def wait4DNLive(cls, node):
    i = 1
    maxTries = 30  # i.e. 150 sec - note the delay in QE configs for initial BR is 120 sec
    logger.info('*** Waiting for DN %s to become live ****' % node)
    while i < maxTries:
        livenodes = HDFS.getDatanodesFromJmx()
        if node in livenodes:
            return True
        # saw strange behaviour where the dns were ip addresses sometimes; convert
        livenodesIp = []
        for iNode in livenodes:
            # convert to ip addresses
            livenodesIp.append(util.getIpAddress(iNode))
        if node in livenodesIp:
            return True
        logger.info('*** Waiting for DN %s to become live ****' % node)
        logger.info('*** Live nodes list is: %s %s ****' % (livenodes, livenodesIp))
        time.sleep(5)
        i = i + 1
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    UpgradePerNode.reportProgress(
        "[WARNING][HDFS][XXX] Datanode %s did not become live after 150 secs of restart, continuing " % node
    )
    return False
def submit_storm_hive_topology(cls, tcId, className, args, useStandaloneCmd):
    if Hadoop.isSecure():
        user_realm = None
        if Config.hasOption('machine', 'USER_REALM'):
            user_realm = Config.get('machine', 'USER_REALM', '')
        else:
            nnKerbPrincipal = HDFS.getNameNodePrincipal(defaultValue='')
            atloc = nnKerbPrincipal.find("@")
            if atloc != -1:
                # skip the '@' itself so the realm is not doubled when the principal is built below
                user_realm = nnKerbPrincipal[atloc + 1:]
        if user_realm is not None:
            args += " " + Machine.getHeadlessUserKeytab(Config.getEnv('USER')) + " " \
                    + Config.getEnv('USER') + '@' + user_realm

    exit_code, stdout = Storm.runStormHdfsTopology(
        TARGET_HIVE_STORM_JAR,
        className,
        args,
        None,
        logoutput=True,
        inBackground=False,
        useStandaloneCmd=useStandaloneCmd
    )
    logger.info(exit_code)

    ruAssert("Storm", exit_code == 0, "[StormHiveSubmit] %s Failed" % tcId)
def validate_wordcount_written_to_HDFS(cls, hdfs_dir, patterns, expected_count, appId=None):
    """
    Validate the wordcount results written into HDFS directories by a streaming job.
    Use wildcards in 'hdfs_dir' to recursively read sub-directories.
    :param hdfs_dir: HDFS directory from where contents will be read
    :param patterns: list of words to check
    :param expected_count: the expected number of occurrences for each word in 'patterns'
    :param appId: application ID (optional parameter)
    :return:
    """
    word_count = {}
    # initialize the word_count dictionary
    for p in patterns:
        word_count[p] = 0
    exit_code, cat_content = HDFS.cat(hdfs_dir, logoutput=True)
    assert exit_code == 0, "Could not read from %s, Error: %s, appId: %s" % (hdfs_dir, cat_content, appId)
    for line in cat_content:
        words = line.split()
        for word in words:
            if word in word_count.keys():
                word_count[word] = word_count[word] + 1
    logger.info(word_count)
    for key, value in word_count.iteritems():
        assert value >= expected_count, "%s wordcount is %s. expected_count is %s, appId: %s" % \
            (key, value, expected_count, appId)
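# Hypothetical usage sketch: after the streaming wordcount application has produced output,
# assert that every expected word appears at least 'expected_count' times (the paths, word list
# and appId below are illustrative).
#
# cls.validate_wordcount_written_to_HDFS(
#     "/user/hrt_qa/streaming-out/*/part*",    # wildcard walks the per-batch sub-directories
#     ["hello", "world", "Testing"],           # words emitted by the input generator
#     expected_count=1,
#     appId="application_0000000000000_0001"   # only used to make the assert messages clearer
# )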
def createClusterEntities(cls, colo, desc, name):
    try:
        from beaver.component.falcon import Falcon
    except ImportError:
        ## Import fails when Falcon is not installed on this machine. Nothing to do
        return

    from beaver.component.hadoop import Hadoop, HDFS, YARN
    write_endpoint = Hadoop.getFSDefaultValue()
    webhdfs_scheme = 'webhdfs'
    if HDFS.isHttpsEnabled():
        webhdfs_scheme = 'swebhdfs'
    read_endpoint = '%s://%s:%s' % (
        webhdfs_scheme, write_endpoint.split('/')[2].split(':')[0], HDFS.getNNWebPort()
    )
    execute_endpoint = YARN.getResourceManager()
    falconNode = Falcon.get_falcon_server()

    from beaver.component.oozie import Oozie
    oozieUrl = Oozie.getOozieUrl()
    entityText = "<?xml version=\"1.0\"?>" \
                 "<cluster colo=\"" + colo + "\" description=\"" + desc + "\" name=\"" + name + "\" " \
                 "xmlns=\"uri:falcon:cluster:0.1\"> " \
                 "<interfaces> " \
                 "<interface type=\"readonly\" endpoint=\"" + read_endpoint + "\" version=\"0.20.2\"/> " \
                 "<interface type=\"write\" endpoint=\"" + write_endpoint + "\" version=\"0.20.2\"/> " \
                 "<interface type=\"execute\" endpoint=\"" + execute_endpoint + "\" version=\"0.20.2\"/> " \
                 "<interface type=\"workflow\" endpoint=\"" + oozieUrl + "\" version=\"3.1\"/>" \
                 "<interface type=\"messaging\" endpoint=\"" \
                 "tcp://" + falconNode + ":61616?daemon=true\" version=\"5.1.6\"/>" \
                 "</interfaces>" \
                 "<locations>" \
                 "<location name=\"staging\" path=\"/apps/falcon/" + name + "/staging\" />" \
                 "<location name=\"temp\" path=\"/tmp\" />" \
                 "<location name=\"working\" path=\"/apps/falcon/" + name + "/working\" />" \
                 "</locations>" \
                 "<ACL owner=\"" + cls._job_user + "\" group=\"users\" permission=\"0755\"/>"
    if Hadoop.isSecure():
        realm = HDFS.getConfigValue('dfs.namenode.kerberos.principal').split('@')[1]
        entityText += "<properties> <property name=\"dfs.namenode.kerberos.principal\" value=\"nn/_HOST@" + realm + "\"/> </properties>"
    entityText += "</cluster>"
    textFile = open(os.path.join(cls._local_workspace, name + ".xml"), "w")
    textFile.write("%s" % entityText)
    textFile.close()
    return
def smoke_test_setup(cls):
    '''
    Setup function for HDFS smoke test
    '''
    if not cls._SmokeInputDir:
        cls._SmokeInputDir = cls._base_hdfs_dir + "/smokeHdfsInput"
    HDFS.deleteDirectory(cls._SmokeInputDir, Config.get('hadoop', 'HADOOPQA_USER'))
    jobCmd = 'jar %s randomtextwriter \"-D%s=%s\" \"-D%s=%s\" %s' % (
        Config.get('hadoop', 'HADOOP_EXAMPLES_JAR'), "mapreduce.randomtextwriter.totalbytes", "4096",
        "mapred.job.queue.name", cls._queue, cls._SmokeInputDir
    )
    exit_code, stdout = Hadoop.run(jobCmd)
    ruAssert(
        "HDFS", exit_code == 0,
        '[SmokeSetup] Randomtextwriter job failed and could not create data on hdfs'
    )
def setupTableauDataset():
    LOCAL_DATA_DIR = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "tableau")
    DATA_DIR = os.path.join(LOCAL_DATA_DIR, 'data')
    SCHEMA_SQL_DIR = os.path.join(LOCAL_DATA_DIR, 'schema_3.0')
    HIVE_TABLES = [
        'Batters', 'Calcs', 'DateBins', 'DateTime', 'Election', 'FischerIris', 'Loan', 'NumericBins', 'REI',
        'SeattleCrime', 'Securities', 'SpecialData', 'Staples', 'Starbucks', 'UTStarcom', 'xy'
    ]
    TABLEAU_TEST_DIR = "/user/hrt_qa/tableau"
    DATABASE_NAME = 'tableau'

    logger.info("Setup Tableau dataset")

    if not os.path.exists(LOCAL_DATA_DIR):
        TABLEAU_DATA_TGZ = LOCAL_DATA_DIR + ".tgz"
        assert util.downloadUrl(Config.get('hive', 'TABLEAU_DATASET'), TABLEAU_DATA_TGZ)
        Machine.tarExtractAll(TABLEAU_DATA_TGZ, Config.getEnv('ARTIFACTS_DIR'))
        assert os.path.isdir(LOCAL_DATA_DIR)

    logger.info("create test directory on hdfs to store tableau data files")
    HDFS.createDirectory(TABLEAU_TEST_DIR, user=HDFS_USER, perm='777', force=True)

    logger.info("create tableau database before creating tables")
    Hive.runQueryOnBeeline("DROP DATABASE IF EXISTS %s" % DATABASE_NAME)
    Hive.runQueryOnBeeline("CREATE DATABASE IF NOT EXISTS %s" % DATABASE_NAME)

    for tbl in HIVE_TABLES:
        hdfsDir = TABLEAU_TEST_DIR + '/%s' % tbl
        hdfsFile = hdfsDir + '/%s' % tbl
        localFile = os.path.join(DATA_DIR, '%s.tbl' % tbl)
        sqlFile = os.path.join(SCHEMA_SQL_DIR, '%s.sql' % tbl)

        logger.info("create directory for %s table" % tbl)
        exit_code, stdout = HDFS.createDirectory(hdfsDir, perm='777', force=True)
        assert exit_code == 0, 'Could not create dir for table %s on hdfs.' % tbl

        logger.info("copy file for table %s to hdfs" % tbl)
        exit_code, stdout = HDFS.copyFromLocal(localFile, hdfsFile)
        assert exit_code == 0, 'Could not copy file for table %s to hdfs.' % tbl

        logger.info("create %s table" % tbl)
        # thing-to-do: Modify Hive.runQueryOnBeeline to accept a query file name
        exit_code, stdout, stderr = Hive.runQueryOnBeeline(
            ReadFromFile(sqlFile), readFromFile=True, hivevar={'HDFS_LOCATION': hdfsDir}, logoutput=True
        )
        assert exit_code == 0, '%s table creation failed' % tbl
def insertFileIntoHdfs(fileName):
    pathFileName = '/user/' + HADOOPQA_USER + '/' + fileName
    if not HDFS.fileExists(pathFileName):
        sourceFile = DATA_PATH + '/' + fileName
        destFile = '/user/' + HADOOPQA_USER + '/' + fileName
        putCmd = "dfs -put " + sourceFile + ' ' + destFile
        out = Hadoop.run(putCmd)
        return out
def HDFS_getGateway(cls, logoutput=True):
    try:
        from beaver.component.hadoop import HDFS
        return HDFS.getGateway()
    except Exception:
        if logoutput:
            logger.error("Exception occurred during HDFS_getGateway() call")
            logger.error(traceback.format_exc())
        return None
def generateTestReportConf(infile, outfile, results):
    config = ConfigParser()
    config.optionxform = str
    config.read(infile)
    if config.has_section(SECTION):
        for option, value in config.items(SECTION):
            if value != "":
                continue
            elif option == "BUILD_ID" and config.has_option(SECTION, "REPO_URL"):
                config.set(SECTION, option, getBuildId(config.get(SECTION, "REPO_URL")))
                config.remove_option(SECTION, "REPO_URL")
            elif option == "HOSTNAME":
                config.set(SECTION, option, socket.getfqdn())
            elif option == "COMPONENT_VERSION":
                if not config.has_option(SECTION, "COMPONENT") or config.get(SECTION, "COMPONENT") == "":
                    config.set(SECTION, "COMPONENT", "Hadoop")
                config.set(SECTION, option, getComponentVersion(config.get(SECTION, "COMPONENT")))
            elif option == "OS":
                config.set(SECTION, option, platform.platform())
            elif option == "SECURE" and Config.hasOption('hadoop', 'IS_SECURE'):
                config.set(SECTION, option, Config.get('hadoop', 'IS_SECURE').lower())
            elif option == "BLOB":
                pass
            elif option == "RAN":
                config.set(SECTION, option, results[0] + len(results[1]))
            elif option == "PASS":
                config.set(SECTION, option, results[0])
            elif option == "FAIL":
                config.set(SECTION, option, len(results[1]))
            elif option == "SKIPPED":
                config.set(SECTION, option, results[2])
            elif option == "ABORTED":
                config.set(SECTION, option, results[3])
            elif option == "FAILED_TESTS":
                config.set(SECTION, option, ",".join(results[1]))
            elif option == "SINGLE_NODE":
                from beaver.component.hadoop import HDFS
                if HDFS.getDatanodeCount() > 1:
                    config.set(SECTION, option, "false")
                else:
                    config.set(SECTION, option, "true")
    config.write(open(outfile, 'w'))
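# Hypothetical usage sketch. From the branches above, 'results' is treated as a sequence where
# results[0] is the passed count, results[1] the list of failed test names, results[2] the
# skipped count and results[3] the aborted count; the input .conf supplies SECTION with blank
# values for the report fields to be filled in (file names below are illustrative).
#
# results = (42, ["test_safemode_enter", "test_rolling_downgrade"], 3, 0)
# generateTestReportConf("test_report.conf.in", "test_report.conf", results)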