def integrate(self):
    """
    Integrates Hadoop and GPDB by performing the following:
    1. Setup kerberos server
    2. Setup hadoop cluster
    3. Setup GPDB configurations
    4. Create sql specific test data

    Raises:
        HadoopIntegrationException: if GPHOME or MASTER_DATA_DIRECTORY is
            unset, or if self.hadoop_type is not one of "phd", "cdh",
            "apache".
    """
    # Check for GPHOME and MASTER_DATA_DIRECTORY; fail fast if not set,
    # since every later GPDB configuration step depends on them.
    gphome = os.getenv("GPHOME")
    if not gphome:
        raise HadoopIntegrationException("GPHOME not set!!")
    mdd = os.getenv("MASTER_DATA_DIRECTORY")
    if not mdd:
        raise HadoopIntegrationException("MASTER_DATA_DIRECTORY not set!!")

    self.fqdn = self.hostname + '.' + self.domain

    # Check if hostname is present in /etc/hosts;
    # if not, append the hostname to the file.
    self._validate_hostname()

    # Set up the kerberos server only when security is enabled, and obtain
    # tickets for the principals the tests run as.
    if self.secure_hadoop:
        self.kerberos_template_conf = local_path(
            os.path.join(self.template_conf_dir, "kerberos"))
        self.kerberos_util = KerberosUtil(self.fqdn, self.domain,
                                          self.kerberos_template_conf,
                                          self.node_list)
        self.kerberos_util.configure_server()
        self.kerberos_util.get_kerberos_ticket("hdfs")
        self.kerberos_util.get_kerberos_ticket("gpadmin")

    # Choose the hadoop distribution utility and the matching gphdfs GUC
    # value. hadoop_guc is consumed by _setup_gpdb_configurations below.
    hadoop_conf_dir = local_path(os.path.join(self.template_conf_dir, "hdfs/rpm"))
    if self.hadoop_type == "phd":
        self.hadoop_util = PHDRpmUtil(self.hadoop_artifact_url,
                                      self.hadoop_install_dir,
                                      self.hadoop_data_dir,
                                      hadoop_conf_dir,
                                      self.fqdn,
                                      self.secure_hadoop)
        hadoop_guc = "gphd-2.0"
    elif self.hadoop_type == "cdh":
        self.hadoop_util = CDHRpmUtil(self.hadoop_artifact_url,
                                      self.hadoop_install_dir,
                                      self.hadoop_data_dir,
                                      hadoop_conf_dir,
                                      self.fqdn,
                                      self.secure_hadoop)
        hadoop_guc = "cdh4.1"
    elif self.hadoop_type == "apache":
        self.hadoop_util = ApacheTarUtil(self.hadoop_artifact_url,
                                         self.hadoop_install_dir,
                                         self.hadoop_data_dir,
                                         hadoop_conf_dir,
                                         self.fqdn,
                                         self.secure_hadoop)
        hadoop_guc = "gphd-2.0"
    else:
        # Previously an unknown type fell through and raised a confusing
        # NameError on hadoop_guc further down; fail explicitly instead.
        raise HadoopIntegrationException(
            "Unsupported hadoop_type: %s" % self.hadoop_type)

    # Set up hadoop cluster.
    self.hadoop_util.init_cluster()

    # Fetch the env dict once instead of calling get_hadoop_env() per key.
    hadoop_env = self.hadoop_util.get_hadoop_env()
    hadoop_home = hadoop_env['HADOOP_HOME']
    hadoop_common_home = hadoop_env['HADOOP_COMMON_HOME']
    if self.hadoop_type == "apache":
        # Apache tarball layout nests the common libs one level deeper.
        hadoop_common_home = hadoop_common_home + "common"

    # Set up GPDB configurations & test data.
    gpdb_template_conf = local_path(os.path.join(self.template_conf_dir, "gpdb"))
    self._setup_gpdb_configurations(gphome, mdd, gpdb_template_conf,
                                    hadoop_home, hadoop_common_home,
                                    hadoop_guc)

    export_env = ("export HADOOP_HOME=%s; source %s/lib/hadoop/hadoop_env.sh;"
                  % (hadoop_home, gphome))
    java_classpath = ".:$CLASSPATH:%s/lib/hadoop/%s" % (gphome, self.gphdfs_connector)
    self._create_test_jars(export_env, java_classpath)
    self.java_cmd = self._create_java_cmd_string(export_env, java_classpath)

    # One dataset per SQL type exercised by the gphdfs tests.
    test_data_types = ['regression', 'time', 'timestamp', 'date',
                       'bigint', 'int', 'smallint', 'real', 'float',
                       'boolean', 'varchar', 'bpchar', 'numeric', 'text',
                       'all']
    datasize = 5000
    largedatasize = str(int(datasize) * 2000)
    self._create_test_data(datasize, largedatasize, test_data_types)