Example #1
def setupAcidDataset(testsuite, LOCAL_DIR):
    ddl_location = None
    if testsuite == 'acid':
        ddl_location = os.path.join(LOCAL_DIR, "ddl", "acid-tpch-tablesetup.sql")
    elif testsuite == 'unbucketed':
        ddl_location = os.path.join(LOCAL_DIR, "ddl", "acid-tpch-unbucketed-tablesetup.sql")
    else:
        assert False, "The testsuite value passed in is not valid. Please use 'acid' or 'unbucketed'"
    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download the TPCH acid dataset
    tpch_newdata_dir = os.path.join(LOCAL_DIR, "tpch_newdata_5G")
    TPCH_STAGE_TGZ = os.path.join(LOCAL_DIR, "tpch_newdata_5G.tgz")
    if not os.path.isfile(TPCH_STAGE_TGZ):
        assert util.downloadUrl(Config.get('hive', 'TPCH_NEWDATA_5G_DNLD_URL'), TPCH_STAGE_TGZ)
        Machine.tarExtractAll(TPCH_STAGE_TGZ, LOCAL_DIR)

    # Load the acid tables in Hive
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')
    HDFS.createDirectory("/tmp/lineitem_acid", user=HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(tpch_newdata_dir, "lineitem*"), "/tmp/lineitem_acid", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/lineitem_acid", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        ddl_location, hivevar={'HDFS_LOCATION': '/tmp'}, logoutput=True, queryIsFile=True
    )
    assert exit_code == 0, "Failed to populate the TPCH acid data in Hive"
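
A minimal sketch of how this helper might be invoked from a test. The expected layout (a ddl/ folder and the downloaded tarball under LOCAL_DIR) comes from the function above, while the tpch_acid subdirectory name is a hypothetical choice.

# Hypothetical invocation; the 'tpch_acid' subdirectory name is an assumption, not from the original.
LOCAL_DIR = os.path.join(Config.getEnv('ARTIFACTS_DIR'), 'tpch_acid')
setupAcidDataset('acid', LOCAL_DIR)   # or 'unbucketed' for the unbucketed table setup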
Example #2
def setupMondrianDataset():
    DATABASE_NAME = 'foodmart'
    LOCAL_DATA_DIR = os.path.join(Config.getEnv('ARTIFACTS_DIR'), DATABASE_NAME)
    FOODMART_DDL = os.path.join(LOCAL_DATA_DIR, "foodmart.ddl")
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')

    logger.info("Setup Mondrian dataset")
    if not os.path.exists(LOCAL_DATA_DIR):
        MONDRIAN_DATA_TGZ = LOCAL_DATA_DIR + ".tgz"
        assert util.downloadUrl(Config.get('hive', 'MONDRIAN_DATASET'), MONDRIAN_DATA_TGZ)
        Machine.tarExtractAll(MONDRIAN_DATA_TGZ, Config.getEnv('ARTIFACTS_DIR'))
        assert os.path.isdir(LOCAL_DATA_DIR)

    logger.info("create foodmart database and tables")
    HDFS.createDirectory("/tmp/mondrian", HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(LOCAL_DATA_DIR, "/tmp/mondrian", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/mondrian", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        FOODMART_DDL,
        hivevar={
            'DB': 'foodmart',
            'LOCATION': '/tmp/mondrian/foodmart'
        },
        logoutput=True,
        queryIsFile=True
    )
    assert exit_code == 0, "Unable to deploy foodmart dataset"
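
The hivevar values passed to Hive.runQueryOnBeeline are resolved inside the DDL file through Hive variable substitution (${hivevar:DB}, ${hivevar:LOCATION}). Below is a sketch of the kind of statement foodmart.ddl is assumed to contain, written as a Python string to keep the example in one language; the table and columns are hypothetical, since the real file contents are not shown above.

# Hypothetical DDL fragment illustrating hivevar substitution; not the actual foodmart.ddl.
FOODMART_DDL_SKETCH = """
CREATE DATABASE IF NOT EXISTS ${hivevar:DB};
USE ${hivevar:DB};
CREATE EXTERNAL TABLE IF NOT EXISTS sales_fact (product_id INT, store_id INT, unit_sales INT)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    LOCATION '${hivevar:LOCATION}/sales_fact';
"""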
Example #3
    def doSetup(cls, hdfs_test_dir, tbl_name, num_of_rows, type):

        from beaver.component.hive import Hive
        from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode

        logger.info("Generating test table dataset with %d rows" % num_of_rows)
        test_data_file = os.path.join(Config.getEnv('ARTIFACTS_DIR'),
                                      tbl_name + ".dat")
        with open(test_data_file, 'w') as f:
            userid = 100000
            for i in xrange(num_of_rows):
                # emit 3-8 rows per user id, each with a random age between 10 and 80
                for j in range(random.randint(3, 8)):
                    f.write("%d|%d\n" % (userid + i, random.randint(10, 80)))

        hdfs_tbl_dir = hdfs_test_dir + "/" + tbl_name
        logger.info("Copying the test dataset to HDFS directory '%s'" %
                    hdfs_tbl_dir)
        HDFS.createDirectory(hdfs_test_dir,
                             user=cls._hdfs_user,
                             perm='777',
                             force=True)
        HDFS.createDirectory(hdfs_tbl_dir, perm='777')
        HDFS.copyFromLocal(test_data_file, hdfs_tbl_dir)
        HDFS.chmod(cls._hdfs_user, '777', hdfs_tbl_dir)

        logger.info("Creating table '%s' and verification tables" % tbl_name)
        query = "drop table if exists %s;\n" % tbl_name
        query += "create external table %s (userid string, age int) row format delimited fields terminated by '|' stored as textfile location '%s';\n" % (
            tbl_name, hdfs_tbl_dir)
        query += "drop table if exists %s_hive_verify;\n" % tbl_name
        query += "create table %s_hive_verify (userid string, age int);\n" % tbl_name
        if type == "Long running":
            for i in range(cls._num_of_webhcat_bgj):
                query += "drop table if exists %s_wh_%d;\n" % (tbl_name, i + 1)
                query += "create table %s_wh_%d (userid string, age int);\n" % (
                    tbl_name, i + 1)
        hivesetupfile = os.path.join(Config.getEnv('ARTIFACTS_DIR'),
                                     "hivesetup.sql")
        util.writeToFile(query, hivesetupfile)
        exit_code, stdout = Hive.run("-f " + hivesetupfile, logoutput=False)
        if type:
            msg = "%s job setup for Hive component" % type
            if exit_code != 0:
                UpgradePerNode.reportProgress(
                    "[FAILED][Hive][Setup] %s failed due to exitcode = %d" %
                    (msg, exit_code))
            else:
                UpgradePerNode.reportProgress(
                    "[PASSED][Hive][Setup] %s finished successfully" % msg)
Example #4
    def perform_post_upgrade_steps(self):
        if Config.getEnv("HDP_STACK_INSTALLED").lower() == "true":
            from beaver.component.hadoop import Hadoop, HDFS
            from beaver.component.hive import Hive
            COMPONENT = str(self.COMPONENT)
            HDFS_USER = Config.get('hadoop', 'HDFS_USER')
            if 'experiment' in COMPONENT and Hive.isInstalled():
                HIVE_WAREHOUSE_DIR = Hive.getConfigValue(
                    "hive.metastore.warehouse.dir", defaultValue="/apps/hive/warehouse"
                )
                HDFS.chmod(HDFS_USER, 777, HIVE_WAREHOUSE_DIR, True)
            else:
                UpgradeLogger.reportProgress("No additional post-upgrade steps defined for EU", True)
        else:
            logger.info("No additional post-upgrade steps defined for EU on HDF")
Example #5
def setupMergeScaleDataset(LOCAL_DIR):
    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download the TPCH dataset if not there
    tpch_data_dir = os.path.join(LOCAL_DIR, "data")
    TPCH_DATA_TGZ = os.path.join(LOCAL_DIR, "tpch_data.tgz")
    if not os.path.isfile(TPCH_DATA_TGZ):
        assert util.downloadUrl(Config.get('hive', 'TPCH_DNLD_URL'), TPCH_DATA_TGZ)
        Machine.tarExtractAll(TPCH_DATA_TGZ, LOCAL_DIR)

    # Load the tables in Hive
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')
    HDFS.createDirectory("/tmp/tpch", user=HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(tpch_data_dir, "/tmp/tpch", user=HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/tpch", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        os.path.join(LOCAL_DIR, "ddl", "merge-tpch-tablesetup.sql"),
        hivevar={'HDFS_LOCATION': '/tmp/tpch/data'},
        logoutput=True,
        queryIsFile=True
    )
    assert exit_code == 0, "Failed to populate the TPCH data in Hive"

    # Download TPCH staging data
    tpch_stage_dir = os.path.join(LOCAL_DIR, "tpch_newdata_5G")
    TPCH_STAGE_TGZ = os.path.join(LOCAL_DIR, "tpch_newdata_5G.tgz")
    if not os.path.isfile(TPCH_STAGE_TGZ):
        assert util.downloadUrl(Config.get('hive', 'TPCH_NEWDATA_5G_DNLD_URL'), TPCH_STAGE_TGZ)
        Machine.tarExtractAll(TPCH_STAGE_TGZ, LOCAL_DIR)

    # Load the staged tables in Hive
    HDFS.createDirectory(
        "/tmp/lineitem_stage /tmp/orders_stage /tmp/delete_stage", user=HADOOPQA_USER, perm='777', force=True
    )
    HDFS.copyFromLocal(os.path.join(tpch_stage_dir, "lineitem*"), "/tmp/lineitem_stage", HADOOPQA_USER)
    HDFS.copyFromLocal(os.path.join(tpch_stage_dir, "order*"), "/tmp/orders_stage", HADOOPQA_USER)
    HDFS.copyFromLocal(os.path.join(tpch_stage_dir, "delete*"), "/tmp/delete_stage", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/lineitem_stage /tmp/orders_stage /tmp/delete_stage", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        os.path.join(LOCAL_DIR, "ddl", "merge-staged-tpch-tablesetup.sql"),
        hivevar={'HDFS_LOCATION': '/tmp'},
        logoutput=True,
        queryIsFile=True
    )
    assert exit_code == 0, "Failed to populate the TPCH staging data in Hive"
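
As with the ACID setup above, a minimal sketch of a call site; the merge_scale directory name is a hypothetical choice, and the ddl/ scripts are expected to already sit under LOCAL_DIR.

# Hypothetical invocation; the 'merge_scale' subdirectory name is an assumption.
LOCAL_DIR = os.path.join(Config.getEnv('ARTIFACTS_DIR'), 'merge_scale')
setupMergeScaleDataset(LOCAL_DIR)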
Example #6
def downloadDataset(dataDir, dataTgz, downloadUrl, hdfsLocalCopy, textDataDir):
    HDFS.createDirectory(HCAT_TEST_DIR, user=HDFS_USER, perm='777', force=True)
    HDFS.createDirectory(HDFS_TEST_DIR, user=HDFS_USER, perm='777', force=True)

    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download the TPCDS dataset if not there
    if not os.path.isfile(dataTgz):
        assert util.downloadUrl(downloadUrl, dataTgz)
        Machine.tarExtractAll(dataTgz, dataDir)

    os.makedirs(hdfsLocalCopy)
    # one table directory per data file, named by stripping the 4-character extension (e.g. ".dat")
    for filename in os.listdir(textDataDir):
        hdfs_localcopy_table_dir = os.path.join(hdfsLocalCopy, filename[:-4])
        os.mkdir(hdfs_localcopy_table_dir)
        shutil.copy(os.path.join(textDataDir, filename), hdfs_localcopy_table_dir)
    HDFS.copyFromLocal(hdfsLocalCopy, HDFS_TEST_DIR)
    HDFS.chmod(None, '777', HDFS_TEST_DIR, recursive=True)
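
A sketch of how downloadDataset might be wired up, assuming the module-level HCAT_TEST_DIR, HDFS_TEST_DIR and HDFS_USER globals it relies on are already defined; the config key and directory names below are hypothetical.

# Hypothetical call; the 'TPCDS_DNLD_URL' config key and the local paths are assumptions.
data_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), 'tpcds')
downloadDataset(
    dataDir=data_dir,
    dataTgz=os.path.join(data_dir, 'tpcds_data.tgz'),
    downloadUrl=Config.get('hive', 'TPCDS_DNLD_URL'),
    hdfsLocalCopy=os.path.join(data_dir, 'hdfs_local_copy'),
    textDataDir=os.path.join(data_dir, 'data'),   # assumed to hold the extracted text data files
)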
Example #7
def formatNN_SetupHDFS(duReservedValue, mod_conf_path):
    """
    Format NN. Setup HDFS dir for MR jobs.

    Note that this permission is too wide for default HDP use.
    """
    datanodes = HDFS.getDatanodes()
    logger.info("datanodes = %s" % datanodes)
    HDFS.stopDatanodes()
    HDFS.stopNamenode()
    HDFS.formatNN(force=True, logoutput=True)

    for dn in datanodes:
        Machine.rm(user=Machine.getAdminUser(),
                   host=dn,
                   filepath="%s/current" %
                   HDFS.getConfigValue("dfs.datanode.data.dir"),
                   isdir=True)

    balancerModifyConfig(duReservedValue)
    HDFS.startNamenode(mod_conf_path)
    HDFS.startDatanodes(mod_conf_path)
    sleepTime = 45
    logger.info("sleep for %s sec" % sleepTime)
    time.sleep(sleepTime)

    version = Hadoop.getShortVersion()
    paths = [
        "/hdp", "/hdp/apps",
        "/hdp/apps/%s" % version,
        "/hdp/apps/%s/mapreduce" % version
    ]
    for path in paths:
        HDFS.mkdir(path=path, user=HDFS_USER)
    HDFS.chmod(runasUser=HDFS_USER,
               perm="777",
               directory="/hdp",
               recursive=True)
    HDFS.copyFromLocal(
        localpath="/usr/hdp/current/hadoop-client/mapreduce.tar.gz",
        hdfspath="/hdp/apps/%s/mapreduce/" % version)
    sleepTime = 45
    logger.info("sleep for %s sec for MR tarball replication" % sleepTime)
    time.sleep(sleepTime)
    paths = [
        "/app-logs", "/app-logs/hrt_qa", "/app-logs/hrt_qa/logs", "/mr-history"
    ]
    for path in paths:
        HDFS.mkdir(path=path, user=HDFS_USER)
    HDFS.chmod(runasUser=HDFS_USER,
               perm="777",
               directory="/app-logs",
               recursive=True)
    HDFS.chmod(runasUser=HDFS_USER,
               perm="777",
               directory="/mr-history",
               recursive=True)
    HDFS.mkdir(path="/user", user=HDFS_USER)
    HDFS.mkdir(path="/user/hrt_qa", user=HDFS_USER)
    HDFS.chown(runasUser=HDFS_USER,
               new_owner="hrt_qa:hrt_qa",
               directory="/user/hrt_qa",
               recursive=False)
    HDFS.chmod(runasUser="******",
               perm="770",
               directory="/user/hrt_qa",
               recursive=True)
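
A minimal invocation sketch, assuming duReservedValue is the reserved-space setting applied by balancerModifyConfig; the value and the modified-configuration path below are hypothetical.

# Hypothetical invocation; the reserved-space value and config path are assumptions.
formatNN_SetupHDFS(
    duReservedValue=1024 * 1024 * 1024,   # reserve ~1 GB per data volume
    mod_conf_path=os.path.join(Config.getEnv('ARTIFACTS_DIR'), 'balancer_conf'),
)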
Example #8
    def setup_storm_hive_topology(cls, useStandaloneCmd):
        from beaver.component.hive import Hive

        storm_version = Storm.getVersion(useStandaloneCmd=True)
        hive_version = Hive.getVersion()
        # declare the globals before the first assignment; assigning and only then
        # declaring 'global' (as in the original ordering) raises a SyntaxWarning in
        # Python 2 and is a SyntaxError in Python 3
        global HIVE_METASTORE_URI
        global HIVE_HOST
        global HIVE_PORT
        global HIVE_WAREHOUSE_DIR
        HIVE_METASTORE_URI = Hive.getConfigValue(
            "hive.metastore.uris", defaultValue="thrift://localhost:9083")
        HIVE_WAREHOUSE_DIR = Hive.getConfigValue(
            "hive.metastore.warehouse.dir",
            defaultValue="/apps/hive/warehouse")
        HIVE_HOST = Hive.getHiveHost()
        HIVE_PORT = Hive.getMetastoreThriftPort()
        if Storm.isDalorBeyond():
            JAVA_HIVE_SRC_DIR = os.path.join(Config.getEnv('WORKSPACE'),
                                             'tests', 'rolling_upgrade',
                                             'Storm', '2_3', 'storm-hive',
                                             'java')
        else:
            JAVA_HIVE_SRC_DIR = os.path.join(Config.getEnv('WORKSPACE'),
                                             'tests', 'rolling_upgrade',
                                             'Storm', '2_2', 'storm-hive',
                                             'java')
        # hive.txn.manager and hive.support.concurrency are set through ambari as per bug-40500
        #logger.info("Restart Hive")
        #changes = {'hive-site.xml': {'hive.txn.manager': 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager',
        #                             'hive.support.concurrency': 'true'}}
        #Hive.modifyConfig(changes, services=['metastore'], restartService=True)
        logger.info("Create test database in Hive")

        exit_code, stdout = Hive.runQuery(
            cls.get_set_queue_cmd(useStandaloneCmd) +
            " drop database if exists stormdb cascade; \
                                               create database stormdb;")
        ruAssert("Storm", exit_code == 0,
                 "[StormHiveSetup] Failed to create test database" + stdout)
        HDFS.chmod(runasUser=HDFS.getHDFSUser(),
                   perm=777,
                   directory=HIVE_WAREHOUSE_DIR + "/" + DATABASE_NAME + ".db")
        #copy tests/storm/storm-hive/java to artifacts/storm-hive-tests
        logger.info("JAVA_SRC_DIR " + JAVA_HIVE_SRC_DIR)
        logger.info("LOCAL_WORK_DIR " + LOCAL_HIVE_WORK_DIR)
        Machine.copy(JAVA_HIVE_SRC_DIR,
                     LOCAL_HIVE_WORK_DIR,
                     user=None,
                     passwd=None)
        #mvn package
        if Machine.isWindows():
            (_, _) = Maven.run(
                'package -D%s=%s -D%s=%s -D%s=%s -D%s=%s' %
                (HADOOP_VERSION_MAVEN_PARAMETER, HADOOP_VERSION,
                 STORM_VERSION_MAVEN_PARAMETER, storm_version,
                 HIVE_VERSION_MAVEN_PARAMETER,
                 hive_version, PUBLIC_REPO_MAVEN_PARAMETER,
                 Maven.getPublicRepoUrl(), CORE_FILE_MAVEN_PARAMETER,
                 CORE_FILE, HADOOP_CORE_MAVEN_PARAMETER, HADOOP_CONF,
                 HIVE_CORE_MAVEN_PARAMETER, HIVE_CORE_DIR,
                 HIVE_FILE_MAVEN_PARAMETER, HIVE_FILE),
                cwd=LOCAL_HIVE_WORK_DIR)
        else:
            (_, _) = Maven.run('package',
                               cwd=LOCAL_HIVE_WORK_DIR,
                               env={
                                   HADOOP_VERSION_MAVEN_PARAMETER:
                                   HADOOP_VERSION,
                                   STORM_VERSION_MAVEN_PARAMETER:
                                   storm_version,
                                   HIVE_VERSION_MAVEN_PARAMETER:
                                   hive_version,
                                   PUBLIC_REPO_MAVEN_PARAMETER:
                                   Maven.getPublicRepoUrl(),
                                   CORE_FILE_MAVEN_PARAMETER:
                                   CORE_FILE,
                                   HADOOP_CONF_MAVEN_PARAMETER:
                                   HADOOP_CONF,
                                   HDFS_FILE_MAVEN_PARAMETER:
                                   HDFS_FILE,
                                   HADOOP_CORE_MAVEN_PARAMETER:
                                   HADOOP_CONF,
                                   HIVE_CORE_MAVEN_PARAMETER:
                                   HIVE_CORE_DIR,
                                   HIVE_FILE_MAVEN_PARAMETER:
                                   HIVE_FILE
                               })
        create_table_q = "use %s; \
          drop table if exists %s; \
          create table %s (id int, name string, phone string, street string) \
          partitioned by (city string, state string) \
          clustered by (id) into %s buckets \
          stored as orc \
          tblproperties ('transactional'='true');" % (
            DATABASE_NAME, HIVE_TABLE_NAME, HIVE_TABLE_NAME, "5")

        exit_code, stdout = Hive.runQuery(
            cls.get_set_queue_cmd(useStandaloneCmd) + create_table_q)
        ruAssert(
            "Storm", exit_code == 0,
            "[StormHiveSetup] Failed to create test table userdata_partitioned"
        )
        HDFS.chmod(runasUser=HDFS.getHDFSUser(),
                   perm=777,
                   directory=HIVE_WAREHOUSE_DIR + "/" + DATABASE_NAME +
                   ".db/" + HIVE_TABLE_NAME)