def validate_wordcount_written_to_HDFS(cls, hdfs_dir, patterns, expected_count, appId=None):
    """
    Validate the wordcount results written into HDFS directories by a streaming job.
    Use wildcards in 'hdfs_dir' to recursively read sub-directories.
    :param hdfs_dir: HDFS directory from which contents will be read
    :param patterns: list of words to check
    :param expected_count: the expected number of occurrences of each word in 'patterns'
    :param appId: application ID (optional)
    :return:
    """
    # initialize the word_count dictionary
    word_count = {}
    for p in patterns:
        word_count[p] = 0
    exit_code, cat_content = HDFS.cat(hdfs_dir, logoutput=True)
    assert exit_code == 0, "Could not read from %s, Error: %s, appId: %s" % (hdfs_dir, cat_content, appId)
    # iterate over the concatenated file contents line by line
    for line in cat_content.splitlines():
        for word in line.split():
            if word in word_count:
                word_count[word] += 1
    logger.info(word_count)
    for key, value in word_count.iteritems():
        assert value >= expected_count, "%s wordcount is %s. expected_count is %s, appId: %s" % \
            (key, value, expected_count, appId)
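# Illustrative sketch (not part of the test suite): the counting step above,
# shown standalone without the HDFS helpers, assuming the concatenated job
# output is already available as a plain string. The function name is hypothetical.
def _count_patterns_sketch(text, patterns):
    counts = dict((p, 0) for p in patterns)
    for line in text.splitlines():
        for word in line.split():
            if word in counts:
                counts[word] += 1
    return counts

# e.g. _count_patterns_sketch("apple banana\napple", ["apple", "banana"])
# returns {'apple': 2, 'banana': 1}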
def checkClasspathVersion(cls, Version_Num, config=None):
    Local_Test_dir = os.path.join(Config.getEnv("WORKSPACE"), "tests", "rolling_upgrade", "yarn")
    Multi_Version_App_Dir = os.path.join(Local_Test_dir, "data")
    Mapper = "data/versionVerifyMapper.py"
    Reducer = "data/versionVerifyReducer.py"
    Verify_File_Name = "test.txt"
    Verify_Test_File = os.path.join(Multi_Version_App_Dir, Verify_File_Name)

    # Set up the job environment with the current MR framework path and classpath
    mapred_app_path = MAPRED.getConfigValue("mapreduce.application.framework.path", None)
    mapred_classpath = MAPRED.getConfigValue("mapreduce.application.classpath", None)
    env = {
        "mapreduce.application.framework.path": mapred_app_path,
        "mapreduce.application.classpath": mapred_classpath
    }

    verifyInput = cls._hdfs_input + "/verify"
    HDFS.createDirectory(verifyInput, None, "777", False)

    # Copy template files for the verifier streaming job
    templateFile = open(Verify_Test_File, 'w')
    templateFile.write(Version_Num)
    templateFile.close()
    HDFS.copyFromLocal(Verify_Test_File, verifyInput, user=Config.get('hadoop', 'HADOOPQA_USER'))

    # Submit the special streaming job
    shortStreamingId = HadoopJobHelper.runStreamJob(
        Mapper,
        Reducer,
        verifyInput,
        cls._hdfs_output_verify,
        files=Multi_Version_App_Dir,
        config=config,
        extraJobArg=cls._jobArgs,
        env=env,
        proposedJobName=cls._shortStreamingName)
    MAPRED.waitForJobDoneOrTimeout(shortStreamingId, timeoutInSec=180)

    # Make sure the task succeeded
    #assert YARN.getAppFinalStateFromID(appId) == 'SUCCEEDED'

    # Check the result content
    retVal, checkContent = HDFS.cat(cls._hdfs_output_verify + '/part-00000')
    logger.info("CHECK CLASSPATH VERSION OUTPUT")
    logger.info(retVal)
    logger.info(checkContent)
    ruAssert("YARN", retVal == 0)
    ruAssert("YARN", 'True' in checkContent, "[VersionVerify] Stream job returns false: " + checkContent)
    #assert retVal == 0
    #assert 'True' in checkContent, "Stream job returns false: " + checkContent
    #assert 'False' not in checkContent, "Stream job returns false: " + checkContent
    HDFS.deleteDirectory(cls._hdfs_output_verify, user=Config.get('hadoop', 'HADOOPQA_USER'))
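# Illustrative sketch (assumption): the real data/versionVerifyMapper.py used
# above is not shown in this module. A minimal version-verifying streaming
# mapper could look roughly like this function: read the expected version
# (written to test.txt) from stdin and emit True/False depending on whether
# that version appears in the classpath visible to the task. The name and the
# use of the CLASSPATH environment variable are assumptions for illustration only.
def _version_verify_mapper_sketch(stdin=None):
    import os
    import sys
    stdin = stdin if stdin is not None else sys.stdin
    for line in stdin:
        expected = line.strip()
        classpath = os.environ.get("CLASSPATH", "")
        print(str(expected in classpath))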
def verify_hdfs_topology(cls, topologyName, targetDir, lines, type, useStandaloneCmd):
    """
    Verifies that the HDFS topologies produced the expected output.
    """
    # The Slider app is killed before the long-running job verification, so topology
    # activation checks are disabled in that mode.
    if useStandaloneCmd == True:
        ruAssert(
            "Storm",
            Storm.getTopologyStatus(topologyName, logoutput=True, useStandaloneCmd=useStandaloneCmd) == 'ACTIVE')

    exit_code, stdout = HDFS.lsr(targetDir, False, True)
    hdfsListOutput = stdout.splitlines()

    # Pick the second-to-last file: the first file might not have enough content yet,
    # and the last file can run into transient HDFS issues.
    if len(hdfsListOutput) >= 2:
        fileLine = hdfsListOutput[-2]
        sampleoutfile = fileLine.split(" ")[-1].strip()
        # Hacky solution, as the test code for trident and core topologies writes under the same directory.
        # if fileLine.endswith(".txt") and type == "cat":
        #     sampleoutfile = fileLine.split(" ")[-1].strip()
        # if fileLine.endswith(".seq") and type == "text":
        #     sampleoutfile = fileLine.split(" ")[-1].strip()
        logger.info("Taking sample output file: %s" % (sampleoutfile))
        if type == "text":
            exit_code, stdout = HDFS.text(sampleoutfile, None)
        else:
            exit_code, stdout = HDFS.cat(sampleoutfile, None)
        for line in lines:
            ruAssert(
                "Storm", stdout.find(line) >= 0,
                "[StormHDFSVerify] expected line : %s in %s" % (line, sampleoutfile))
    else:
        ruAssert("Storm", False, "hdfsListOutput must have at least 2 lines")
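# Illustrative sketch (assumption): the second-to-last-file selection used in
# verify_hdfs_topology, shown standalone. Assumes an lsr-style listing where the
# file path is the last whitespace-separated field of each line. The name is
# hypothetical.
def _pick_sample_file_sketch(lsr_output):
    listing = lsr_output.splitlines()
    if len(listing) < 2:
        return None
    # skip the newest file (it may still be mid-write) and take the one before it
    return listing[-2].split(" ")[-1].strip()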
def verifyLongRunningJob(cls, config=None):
    '''
    Verify the long-running background job after it finishes.
    :return:
    '''
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode

    retVal, checkContent = HDFS.cat(cls._hdfs_output + '/part-00000')
    #assert retVal == 0
    if retVal == 0:
        UpgradePerNode.reportProgress(
            "[PASSED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, retVal = 0. Successful check ")
    else:
        UpgradePerNode.reportProgress(
            "[FAILED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, retVal != 0. Failed check ")
    #assert 'true' in checkContent, "Stream job returns false: " + checkContent
    if 'true' in checkContent:
        UpgradePerNode.reportProgress(
            "[PASSED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, true in checkContent. Successful check ")
    else:
        UpgradePerNode.reportProgress(
            "[FAILED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, true not in checkContent. Failed check ")

    # Verify that the application's attempt count does not increase and that there are no failed tasks.
    appID = cls._background_job_appId
    jobID = cls._background_job_jobId
    # temporarily skipping check
    #assert YARN.getNumAttemptsForApp(appID) == 1
    #YARN.verifyMRTasksCount(jobID, appID, 0, skipAssert=True)

    from beaver.component.rollingupgrade.ruCommon import hdpRelease
    Version_Num = hdpRelease.getCurrentRelease("hadoop-client")
    cls.checkClasspathVersion(Version_Num, config)
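# Illustrative sketch (assumption): the report-instead-of-assert pattern used in
# verifyLongRunningJob. During a rolling upgrade a failed check is recorded via
# reportProgress rather than raising an assertion, so the remaining checks still
# run. The name and signature are hypothetical.
def _report_check_sketch(passed, message, report=logger.info):
    status = "PASSED" if passed else "FAILED"
    report("[%s][YARN][BGJobCheck] %s" % (status, message))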