def setupAcidDataset(testsuite, LOCAL_DIR):
    ddl_location = None
    if testsuite == 'acid':
        ddl_location = os.path.join(LOCAL_DIR, "ddl", "acid-tpch-tablesetup.sql")
    elif testsuite == 'unbucketed':
        ddl_location = os.path.join(LOCAL_DIR, "ddl", "acid-tpch-unbucketed-tablesetup.sql")
    else:
        assert False, "Invalid testsuite '%s'. Please use 'acid' or 'unbucketed'" % testsuite

    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download the TPCH ACID data if not there
    tpch_newdata_dir = os.path.join(LOCAL_DIR, "tpch_newdata_5G")
    TPCH_STAGE_TGZ = os.path.join(LOCAL_DIR, "tpch_newdata_5G.tgz")
    if not os.path.isfile(TPCH_STAGE_TGZ):
        assert util.downloadUrl(Config.get('hive', 'TPCH_NEWDATA_5G_DNLD_URL'), TPCH_STAGE_TGZ)
    Machine.tarExtractAll(TPCH_STAGE_TGZ, LOCAL_DIR)

    # Load the ACID tables in Hive
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')
    HDFS.createDirectory("/tmp/lineitem_acid", user=HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(tpch_newdata_dir, "lineitem*"), "/tmp/lineitem_acid", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/lineitem_acid", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        ddl_location, hivevar={'HDFS_LOCATION': '/tmp'}, logoutput=True, queryIsFile=True
    )
    assert exit_code == 0, "Failed to populate the TPCH ACID data in Hive"
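# --- Usage sketch (illustrative only, not part of the original suite) ---
# A minimal example of how setupAcidDataset() might be driven; the example_local_dir
# value below is an assumption and would normally come from the test harness config.
def example_setup_acid_dataset():
    example_local_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "hive_acid_tests")  # assumed path
    # 'acid' loads the bucketed ACID TPCH tables; 'unbucketed' loads the unbucketed variant
    setupAcidDataset('acid', example_local_dir)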
def setupMondrianDataset():
    DATABASE_NAME = 'foodmart'
    LOCAL_DATA_DIR = os.path.join(Config.getEnv('ARTIFACTS_DIR'), DATABASE_NAME)
    FOODMART_DDL = os.path.join(LOCAL_DATA_DIR, "foodmart.ddl")
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')

    logger.info("Setup Mondrian dataset")
    if not os.path.exists(LOCAL_DATA_DIR):
        MONDRIAN_DATA_TGZ = LOCAL_DATA_DIR + ".tgz"
        assert util.downloadUrl(Config.get('hive', 'MONDRIAN_DATASET'), MONDRIAN_DATA_TGZ)
        Machine.tarExtractAll(MONDRIAN_DATA_TGZ, Config.getEnv('ARTIFACTS_DIR'))
        assert os.path.isdir(LOCAL_DATA_DIR)

    logger.info("create foodmart database and tables")
    HDFS.createDirectory("/tmp/mondrian", HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(LOCAL_DATA_DIR, "/tmp/mondrian", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/mondrian", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        FOODMART_DDL,
        hivevar={
            'DB': DATABASE_NAME,
            'LOCATION': '/tmp/mondrian/foodmart'
        },
        logoutput=True,
        queryIsFile=True
    )
    assert exit_code == 0, "Unable to deploy foodmart dataset"
def doSetup(cls, hdfs_test_dir, tbl_name, num_of_rows, type):
    from beaver.component.hive import Hive
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode

    logger.info("Generating test table dataset with %d rows" % num_of_rows)
    test_data_file = os.path.join(Config.getEnv('ARTIFACTS_DIR'), tbl_name + ".dat")
    f = open(test_data_file, 'w')
    userid = 100000
    for i in xrange(num_of_rows):
        for j in range(random.randint(3, 8)):
            f.write("%d|%d\n" % (userid + i, random.randint(10, 80)))
    f.close()

    hdfs_tbl_dir = hdfs_test_dir + "/" + tbl_name
    logger.info("Copying the test dataset to HDFS directory '%s'" % hdfs_tbl_dir)
    HDFS.createDirectory(hdfs_test_dir, user=cls._hdfs_user, perm='777', force=True)
    HDFS.createDirectory(hdfs_tbl_dir, perm='777')
    HDFS.copyFromLocal(test_data_file, hdfs_tbl_dir)
    HDFS.chmod(cls._hdfs_user, '777', hdfs_tbl_dir)

    logger.info("Creating table '%s' and verification tables" % tbl_name)
    query = "drop table if exists %s;\n" % tbl_name
    query += "create external table %s (userid string, age int) row format delimited fields terminated by '|' stored as textfile location '%s';\n" % (
        tbl_name, hdfs_tbl_dir
    )
    query += "drop table if exists %s_hive_verify;\n" % tbl_name
    query += "create table %s_hive_verify (userid string, age int);\n" % tbl_name
    if type == "Long running":
        for i in range(cls._num_of_webhcat_bgj):
            query += "drop table if exists %s_wh_%d;\n" % (tbl_name, i + 1)
            query += "create table %s_wh_%d (userid string, age int);\n" % (tbl_name, i + 1)
    hivesetupfile = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "hivesetup.sql")
    util.writeToFile(query, hivesetupfile)
    exit_code, stdout = Hive.run("-f " + hivesetupfile, logoutput=False)
    if type:
        msg = "%s job setup for Hive component" % type
        if exit_code != 0:
            UpgradePerNode.reportProgress("[FAILED][Hive][Setup] %s failed due to exitcode = %d" % (msg, exit_code))
        else:
            UpgradePerNode.reportProgress("[PASSED][Hive][Setup] %s finished successfully" % msg)
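# Data-format note (illustrative): each row written by doSetup() above is "<userid>|<age>", e.g.
#     100000|34
#     100000|57
#     100001|12
# i.e. 3-8 rows per synthetic user, matching the '|' field delimiter declared on the
# external table. With type="Long running", one extra verification table is created per
# WebHCat background job (cls._num_of_webhcat_bgj).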
def perform_post_upgrade_steps(self):
    if Config.getEnv("HDP_STACK_INSTALLED").lower() == "true":
        from beaver.component.hadoop import Hadoop, HDFS
        from beaver.component.hive import Hive

        COMPONENT = str(self.COMPONENT)
        HDFS_USER = Config.get('hadoop', 'HDFS_USER')
        if 'experiment' in COMPONENT and Hive.isInstalled():
            HIVE_WAREHOUSE_DIR = Hive.getConfigValue(
                "hive.metastore.warehouse.dir", defaultValue="/apps/hive/warehouse"
            )
            HDFS.chmod(HDFS_USER, 777, HIVE_WAREHOUSE_DIR, True)
        else:
            UpgradeLogger.reportProgress("No additional post-upgrade steps defined for EU", True)
    else:
        logger.info("No additional post-upgrade steps defined for EU on HDF")
def setupMergeScaleDataset(LOCAL_DIR):
    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download the TPCH dataset if not there
    tpch_data_dir = os.path.join(LOCAL_DIR, "data")
    TPCH_DATA_TGZ = os.path.join(LOCAL_DIR, "tpch_data.tgz")
    if not os.path.isfile(TPCH_DATA_TGZ):
        assert util.downloadUrl(Config.get('hive', 'TPCH_DNLD_URL'), TPCH_DATA_TGZ)
    Machine.tarExtractAll(TPCH_DATA_TGZ, LOCAL_DIR)

    # Load the tables in Hive
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')
    HDFS.createDirectory("/tmp/tpch", user=HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(tpch_data_dir, "/tmp/tpch", user=HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/tpch", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        os.path.join(LOCAL_DIR, "ddl", "merge-tpch-tablesetup.sql"),
        hivevar={'HDFS_LOCATION': '/tmp/tpch/data'},
        logoutput=True,
        queryIsFile=True
    )
    assert exit_code == 0, "Failed to populate the TPCH data in Hive"

    # Download TPCH staging data if not there
    tpch_stage_dir = os.path.join(LOCAL_DIR, "tpch_newdata_5G")
    TPCH_STAGE_TGZ = os.path.join(LOCAL_DIR, "tpch_newdata_5G.tgz")
    if not os.path.isfile(TPCH_STAGE_TGZ):
        assert util.downloadUrl(Config.get('hive', 'TPCH_NEWDATA_5G_DNLD_URL'), TPCH_STAGE_TGZ)
    Machine.tarExtractAll(TPCH_STAGE_TGZ, LOCAL_DIR)

    # Load the staged tables in Hive
    HDFS.createDirectory(
        "/tmp/lineitem_stage /tmp/orders_stage /tmp/delete_stage", user=HADOOPQA_USER, perm='777', force=True
    )
    HDFS.copyFromLocal(os.path.join(tpch_stage_dir, "lineitem*"), "/tmp/lineitem_stage", HADOOPQA_USER)
    HDFS.copyFromLocal(os.path.join(tpch_stage_dir, "order*"), "/tmp/orders_stage", HADOOPQA_USER)
    HDFS.copyFromLocal(os.path.join(tpch_stage_dir, "delete*"), "/tmp/delete_stage", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/lineitem_stage /tmp/orders_stage /tmp/delete_stage", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        os.path.join(LOCAL_DIR, "ddl", "merge-staged-tpch-tablesetup.sql"),
        hivevar={'HDFS_LOCATION': '/tmp'},
        logoutput=True,
        queryIsFile=True
    )
    assert exit_code == 0, "Failed to populate the TPCH staging data in Hive"
def downloadDataset(dataDir, dataTgz, downloadUrl, hdfsLocalCopy, textDataDir):
    HDFS.createDirectory(HCAT_TEST_DIR, user=HDFS_USER, perm='777', force=True)
    HDFS.createDirectory(HDFS_TEST_DIR, user=HDFS_USER, perm='777', force=True)

    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download the TPCDS dataset if not there
    if not os.path.isfile(dataTgz):
        assert util.downloadUrl(downloadUrl, dataTgz)
    Machine.tarExtractAll(dataTgz, dataDir)

    # Stage one local directory per table (dropping the .dat extension) and push the tree to HDFS
    os.makedirs(hdfsLocalCopy)
    for filename in os.listdir(textDataDir):
        hdfs_localcopy_table_dir = os.path.join(hdfsLocalCopy, filename[:-4])
        os.mkdir(hdfs_localcopy_table_dir)
        shutil.copy(os.path.join(textDataDir, filename), hdfs_localcopy_table_dir)
    HDFS.copyFromLocal(hdfsLocalCopy, HDFS_TEST_DIR)
    HDFS.chmod(None, '777', HDFS_TEST_DIR, recursive=True)
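# --- Invocation sketch (illustrative only) ---
# Shows how the downloadDataset() arguments relate to each other; the directory layout and
# the 'TPCDS_DNLD_URL' config key below are assumptions for illustration, not part of this module.
def example_download_tpcds():
    data_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "tpcds")  # extraction target (assumed)
    downloadDataset(
        dataDir=data_dir,
        dataTgz=data_dir + ".tgz",                                    # local tarball path (assumed)
        downloadUrl=Config.get('hive', 'TPCDS_DNLD_URL'),             # assumed config key
        hdfsLocalCopy=os.path.join(data_dir, "hdfs_copy"),            # staging dir created by the function
        textDataDir=os.path.join(data_dir, "text")                    # where the extracted .dat files land (assumed)
    )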
def formatNN_SetupHDFS(duReservedValue, mod_conf_path):
    """
    Format NN. Setup HDFS dir for MR jobs.
    Note that this permission is too wide for default HDP use.
    """
    datanodes = HDFS.getDatanodes()
    logger.info("datanodes = %s" % datanodes)
    HDFS.stopDatanodes()
    HDFS.stopNamenode()
    HDFS.formatNN(force=True, logoutput=True)
    for dn in datanodes:
        Machine.rm(
            user=Machine.getAdminUser(),
            host=dn,
            filepath="%s/current" % HDFS.getConfigValue("dfs.datanode.data.dir"),
            isdir=True
        )
    balancerModifyConfig(duReservedValue)
    HDFS.startNamenode(mod_conf_path)
    HDFS.startDatanodes(mod_conf_path)
    sleepTime = 45
    logger.info("sleep for %s sec" % sleepTime)
    time.sleep(sleepTime)

    version = Hadoop.getShortVersion()
    paths = ["/hdp", "/hdp/apps", "/hdp/apps/%s" % version, "/hdp/apps/%s/mapreduce" % version]
    for path in paths:
        HDFS.mkdir(path=path, user=HDFS_USER)
    HDFS.chmod(runasUser=HDFS_USER, perm="777", directory="/hdp", recursive=True)
    HDFS.copyFromLocal(
        localpath="/usr/hdp/current/hadoop-client/mapreduce.tar.gz",
        hdfspath="/hdp/apps/%s/mapreduce/" % version
    )
    sleepTime = 45
    logger.info("sleep for %s sec for MR tarball replication" % sleepTime)
    time.sleep(sleepTime)

    paths = ["/app-logs", "/app-logs/hrt_qa", "/app-logs/hrt_qa/logs", "/mr-history"]
    for path in paths:
        HDFS.mkdir(path=path, user=HDFS_USER)
    HDFS.chmod(runasUser=HDFS_USER, perm="777", directory="/app-logs", recursive=True)
    HDFS.chmod(runasUser=HDFS_USER, perm="777", directory="/mr-history", recursive=True)

    HDFS.mkdir(path="/user", user=HDFS_USER)
    HDFS.mkdir(path="/user/hrt_qa", user=HDFS_USER)
    HDFS.chown(runasUser=HDFS_USER, new_owner="hrt_qa:hrt_qa", directory="/user/hrt_qa", recursive=False)
    HDFS.chmod(runasUser="******", perm="770", directory="/user/hrt_qa", recursive=True)
def setup_storm_hive_topology(cls, useStandaloneCmd):
    from beaver.component.hive import Hive

    global HIVE_METASTORE_URI
    global HIVE_HOST
    global HIVE_PORT
    global HIVE_WAREHOUSE_DIR

    storm_version = Storm.getVersion(useStandaloneCmd=True)
    hive_version = Hive.getVersion()
    HIVE_METASTORE_URI = Hive.getConfigValue("hive.metastore.uris", defaultValue="thrift://localhost:9083")
    HIVE_WAREHOUSE_DIR = Hive.getConfigValue("hive.metastore.warehouse.dir", defaultValue="/apps/hive/warehouse")
    HIVE_HOST = Hive.getHiveHost()
    HIVE_PORT = Hive.getMetastoreThriftPort()

    if Storm.isDalorBeyond():
        JAVA_HIVE_SRC_DIR = os.path.join(
            Config.getEnv('WORKSPACE'), 'tests', 'rolling_upgrade', 'Storm', '2_3', 'storm-hive', 'java'
        )
    else:
        JAVA_HIVE_SRC_DIR = os.path.join(
            Config.getEnv('WORKSPACE'), 'tests', 'rolling_upgrade', 'Storm', '2_2', 'storm-hive', 'java'
        )

    # hive.txn.manager and hive.support.concurrency are set through Ambari as per bug-40500
    #logger.info("Restart Hive")
    #changes = {'hive-site.xml': {'hive.txn.manager': 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager',
    #                             'hive.support.concurrency': 'true'}}
    #Hive.modifyConfig(changes, services=['metastore'], restartService=True)

    logger.info("Create test database in Hive")
    exit_code, stdout = Hive.runQuery(
        cls.get_set_queue_cmd(useStandaloneCmd) + " drop database if exists stormdb cascade; create database stormdb;"
    )
    ruAssert("Storm", exit_code == 0, "[StormHiveSetup] Failed to create test database" + stdout)
    HDFS.chmod(runasUser=HDFS.getHDFSUser(), perm=777, directory=HIVE_WAREHOUSE_DIR + "/" + DATABASE_NAME + ".db")

    # copy tests/storm/storm-hive/java to artifacts/storm-hive-tests
    logger.info("JAVA_SRC_DIR " + JAVA_HIVE_SRC_DIR)
    logger.info("LOCAL_WORK_DIR " + LOCAL_HIVE_WORK_DIR)
    Machine.copy(JAVA_HIVE_SRC_DIR, LOCAL_HIVE_WORK_DIR, user=None, passwd=None)

    # mvn package
    if Machine.isWindows():
        (_, _) = Maven.run(
            'package -D%s=%s -D%s=%s -D%s=%s -D%s=%s -D%s=%s -D%s=%s -D%s=%s -D%s=%s' % (
                HADOOP_VERSION_MAVEN_PARAMETER, HADOOP_VERSION,
                STORM_VERSION_MAVEN_PARAMETER, storm_version,
                HIVE_VERSION_MAVEN_PARAMETER, hive_version,
                PUBLIC_REPO_MAVEN_PARAMETER, Maven.getPublicRepoUrl(),
                CORE_FILE_MAVEN_PARAMETER, CORE_FILE,
                HADOOP_CORE_MAVEN_PARAMETER, HADOOP_CONF,
                HIVE_CORE_MAVEN_PARAMETER, HIVE_CORE_DIR,
                HIVE_FILE_MAVEN_PARAMETER, HIVE_FILE
            ),
            cwd=LOCAL_HIVE_WORK_DIR
        )
    else:
        (_, _) = Maven.run(
            'package',
            cwd=LOCAL_HIVE_WORK_DIR,
            env={
                HADOOP_VERSION_MAVEN_PARAMETER: HADOOP_VERSION,
                STORM_VERSION_MAVEN_PARAMETER: storm_version,
                HIVE_VERSION_MAVEN_PARAMETER: hive_version,
                PUBLIC_REPO_MAVEN_PARAMETER: Maven.getPublicRepoUrl(),
                CORE_FILE_MAVEN_PARAMETER: CORE_FILE,
                HADOOP_CONF_MAVEN_PARAMETER: HADOOP_CONF,
                HDFS_FILE_MAVEN_PARAMETER: HDFS_FILE,
                HADOOP_CORE_MAVEN_PARAMETER: HADOOP_CONF,
                HIVE_CORE_MAVEN_PARAMETER: HIVE_CORE_DIR,
                HIVE_FILE_MAVEN_PARAMETER: HIVE_FILE
            }
        )

    create_table_q = "use %s; " \
                     "drop table if exists %s; " \
                     "create table %s (id int, name string, phone string, street string) " \
                     "partitioned by (city string, state string) " \
                     "clustered by (id) into %s buckets " \
                     "stored as orc " \
                     "tblproperties ('transactional'='true');" % (
                         DATABASE_NAME, HIVE_TABLE_NAME, HIVE_TABLE_NAME, "5"
                     )
    exit_code, stdout = Hive.runQuery(cls.get_set_queue_cmd(useStandaloneCmd) + create_table_q)
    ruAssert("Storm", exit_code == 0, "[StormHiveSetup] Failed to create test table userdata_partitioned")
    HDFS.chmod(
        runasUser=HDFS.getHDFSUser(),
        perm=777,
        directory=HIVE_WAREHOUSE_DIR + "/" + DATABASE_NAME + ".db/" + HIVE_TABLE_NAME
    )