def wrapper(test_class_obj):
    """ Inner wrapper: run the wrapped test case once per supported filesystem. """
    _first_run = True
    if test_class_obj.pipeline_type in modes:
        for filesystem in expect_filesystems:
            if filesystem in test_class_obj.support_file_system:
                if _skip_filesystem_test(filesystem):
                    continue
                test_class_obj.root_path = test_class_obj.root_path_dict[filesystem]
                test_class_obj.running_on_filesystem = filesystem
                logger.info("running case [%s.%s] root_path=[%s], filesystem=[%s]"
                            % (type(test_class_obj).__name__,
                               fn.func_name,
                               test_class_obj.root_path,
                               test_class_obj.running_on_filesystem))
                if not _first_run:
                    # re-initialize the fixture before every additional run
                    test_class_obj.tearDown()
                    test_class_obj.setUp()
                fn(test_class_obj)
                _first_run = False
            else:
                logger.warn('\033[01;31mWarning!!! %s not executed because '
                            'filesystem %s is not supported.\033[00m'
                            % (fn.__name__, filesystem))

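# Illustrative sketch (assumption, not from the original source): `wrapper` above is
# presumably returned by a decorator factory that closes over `fn`, `modes` and
# `expect_filesystems`. A minimal, self-contained version of that pattern, with
# hypothetical names (run_on_filesystems, FakeCase), looks like this:
def run_on_filesystems(modes, expect_filesystems):
    def decorator(fn):
        def wrapper(test_class_obj):
            if test_class_obj.pipeline_type not in modes:
                return
            for filesystem in expect_filesystems:
                if filesystem in test_class_obj.support_file_system:
                    test_class_obj.running_on_filesystem = filesystem
                    fn(test_class_obj)
        return wrapper
    return decorator

class FakeCase(object):
    pipeline_type = 'local'
    support_file_system = ['local']

    @run_on_filesystems(modes=['local'], expect_filesystems=['local', 'hdfs'])
    def test_something(self):
        print('running on %s' % self.running_on_filesystem)

FakeCase().test_something()  # prints: running on local
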
def setUp(self):
    """ Prepare the per-case test environment from environment variables. """
    if self.shortDescription() is None:
        case_msg = self.id()
    else:
        case_msg = "%s - %s" % (self.shortDescription(), self.id())
    logger.info('setUp for case: %s' % case_msg)
    self._tmp_path = []
    self.pipeline_type = os.environ.get('PIPELINE_TYPE', 'local').lower()
    if self.pipeline_type == 'hadoop':
        self.pipeline_type = 'dagmr'
    if not hasattr(self, "running_on_filesystem"):
        if self.pipeline_type == "local":
            self.running_on_filesystem = "local"
        else:
            self.running_on_filesystem = "hdfs"
    if self.pipeline_type not in self._supported_pipeline_type():
        self.skipTest("pipeline type is not supported in this case")
    hdfs_root_path = os.environ.get('HDFS_ROOT_PATH', '').strip()
    self.support_file_system = ['local']
    if hdfs_root_path:
        self.support_file_system.append("hdfs")
    self.root_path_dict = {'local': '.', 'hdfs': hdfs_root_path}
    self.root_path = self.root_path_dict[self.running_on_filesystem]
    self.setConfig()
    self._conditions = []

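# For example (values are illustrative, not taken from the source), running the
# suite against a cluster is driven purely by environment variables:
#
#     PIPELINE_TYPE=hadoop HDFS_ROOT_PATH=hdfs://host:port/tmp/bigflow_test \
#         python -m unittest some_test_module
#
# which yields pipeline_type 'dagmr', support_file_system ['local', 'hdfs'] and
# root_path_dict {'local': '.', 'hdfs': 'hdfs://host:port/tmp/bigflow_test'}.
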
def _upload_file(self, local_path):
    """ Upload every local file or directory matched by `local_path` to the
    job's tmp_data_path on HDFS and return the list of uploaded HDFS paths. """
    user_provided_config = self._hadoop_config
    hdfs_path = []
    tmp_data_path = self._job_config.tmp_data_path
    for path in glob.glob(local_path):
        if os.path.isdir(path):
            target = os.path.join(tmp_data_path, "local_input", str(uuid.uuid4()))
            logger.info("Uploading input directory [%s] to [%s]" % (path, target))
        elif os.path.isfile(path):
            # do not change the basename of the input file, to keep the suffix.
            file_name = os.path.basename(path)
            target = os.path.join(tmp_data_path, "local_input", str(uuid.uuid4()), file_name)
            logger.info("Uploading input file [%s] to [%s]" % (path, target))
        else:
            raise error.BigflowRuntimeException(
                "file [%s] (matched by pattern [%s]) "
                "is neither a dir nor a regular file" % (path, local_path))
        hdfs_path.append(target)
        self._client.fs_put(path, target, user_provided_config)
        self._remote_temp_files.append(target)
    return hdfs_path

def act(self, line):
    pos = 0
    for level in _SparkDriverParser.spark_log_levels:
        pos = line.find(level)
        if pos != -1:
            break
    self._con_lines_with_space.reset_mismatch_line()
    # log from one character past the matched level tag's position;
    # if no tag was found, pos is -1 and the whole line is logged
    logger.info(line[pos + 1:])

def accept(self, line):
    """ Return True if `line` is a continuation of the previous record;
    otherwise flush anything buffered and remember the mismatched line. """
    # line starts with at least `space_num` white spaces or starts with '\t'
    matched = len(line[:self.space_num].strip()) == 0 or \
        line.startswith(self.leading_w_space)
    if not matched:
        if not self.empty():
            # buffered log should be flushed
            logger.info(self.msg())
            self.reset()
        self.pre_mismatched_line = line
    return matched

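# A minimal, self-contained sketch (assumption, not the project's class) of the
# buffering behaviour that `accept` above implements: indented lines are treated
# as continuations of the previous record and glued together before being
# emitted. Names (_LineBuffer, buf) are hypothetical; print stands in for logger.
class _LineBuffer(object):
    def __init__(self, space_num=4):
        self.space_num = space_num
        self.leading_w_space = '\t'
        self.pre_mismatched_line = None
        self._lines = []

    def empty(self):
        return not self._lines

    def msg(self):
        return '\n'.join(self._lines)

    def reset(self):
        self._lines = []

    def accept(self, line):
        matched = len(line[:self.space_num].strip()) == 0 or \
            line.startswith(self.leading_w_space)
        if matched:
            self._lines.append(line)
        else:
            if not self.empty():
                print(self.msg())
                self.reset()
            self.pre_mismatched_line = line
        return matched

buf = _LineBuffer()
buf.accept("Exception in thread main ...")    # not indented: remembered as mismatch
buf.accept("        at Foo.bar(Foo.java:1)")  # indented: buffered as continuation
buf.accept("next ordinary log line")          # flushes the buffered continuation
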
def run_tests(self):
    """execute tests"""
    for _, value in self.tests.iteritems():
        lhs, rhs, expr, need_sorted = value
        logger.info("I am testing: %s" % str(expr))
        if isinstance(lhs, list):
            if need_sorted:
                self.assertItemsEqual(lhs, rhs.get(), expr)
            else:
                self.assertListEqual(lhs, rhs.get(), expr)
        elif isinstance(lhs, dict):
            self.assertDictEqual(lhs, rhs.get(), expr)
        else:
            self.assertEqual(lhs, rhs.get(), expr)
    # clear all the tests
    self.tests.clear()

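# A hedged example (assumption about the registration side, which is not shown
# in this excerpt) of what each value in self.tests holds, inferred from how
# run_tests unpacks it: (expected, deferred result exposing .get(), description,
# order-insensitive flag). The helper name `passert_equal` is hypothetical.
def passert_equal(self, expect, presult, need_sorted=False):
    expr = "expect %r" % (expect,)
    self.tests[len(self.tests)] = (expect, presult, expr, need_sorted)
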
def _print_counters(self):
    # print counters after run
    c_dict = counter._get_all(grouped=True)
    if len(c_dict) > 0:
        logger.info("=========================================================")
        logger.info("all counters:")
        for group in sorted(c_dict.iterkeys()):
            logger.info("\t%s:" % group)
            for k, v in c_dict[group].iteritems():
                logger.info("\t\t%s=%d" % (k, v))

def _after_run(self):
    super(SparkPipeline, self)._after_run()
    for local_uri_info in self._local_uri_infos:
        local_uri = local_uri_info['local_uri']
        hdfs_uri = local_uri_info['hdfs_uri']
        if local_uri_info['overwrite']:
            logger.info("Preparing local directory: %s" % local_uri)
            if not self._force_delete_file(local_uri):
                raise error.BigflowHDFSException(
                    "Failed to remove target path: %s" % local_uri)
        else:
            if self._path_exists(local_uri):
                raise error.BigflowHDFSException(
                    "Failed to output target path: %s, target path already exists" % local_uri)
        os.makedirs(local_uri)
        self._client.fs_get(hdfs_uri + "/*", local_uri, self._hadoop_config)
    self._local_uri_infos = []
    if SparkPipeline.output_dir_conf_key in self._config["spark_conf"]:
        del self._config["spark_conf"][SparkPipeline.output_dir_conf_key]

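# For reference, each element of self._local_uri_infos consumed above has this
# shape (keys inferred from the loop; the example values are illustrative only):
#
#     {'local_uri': './output/wordcount',
#      'hdfs_uri': 'hdfs://host:port/tmp/bigflow/job-xxxx/output/0',
#      'overwrite': True}
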
def run(self):
    """
    Run the Pipeline immediately and wait for it to finish.

    Raises:
      BigflowRuntimeException: raised if an error occurs at runtime
    """
    self._before_run()
    try:
        commit_args = []
        for key, value in self._hadoop_config.iteritems():
            commit_args.extend(["-D", key + "=" + value])
        requests.launch(self._id, self._plan_message, self._resource_message, commit_args)
        logger.info("Job ran successfully")
    except Exception as e:
        self._handle_serialized_exception()
        raise
    self._after_run()

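# For example (assuming a typical self._hadoop_config, which is not shown here),
# {'mapred.job.queue.name': 'default', 'mapred.job.priority': 'HIGH'} would be
# flattened into:
#
#     commit_args == ['-D', 'mapred.job.queue.name=default',
#                     '-D', 'mapred.job.priority=HIGH']
#
# (dict iteration order is arbitrary in Python 2, so the pair order may differ).
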
def end_serde_test(self):
    """ run all buffered serde round-trip checks in a single pipeline job """
    import sys
    from bigflow.core import entity
    logger.info(str(self._checking_condition))
    values = map(lambda condition: condition[1], self._checking_condition)
    # pack all values into a single record so the number of map nodes
    # stays under 32 (a Hadoop limitation)
    p_values = self._pipeline.parallelize([values])
    out = []
    for (i, (sd, value)) in enumerate(self._checking_condition):
        sd1 = serde.of(int)
        sd2 = sd
        cpp_deserialize_fn = entity.KVDeserializeFn(sd1, sd2)
        cpp_serialize_fn = entity.KVSerializeFn(sd1, sd2)
        python_deserialize_fn = lambda kv: (sd1.deserialize(kv[0]), sd2.deserialize(kv[1]))
        python_serialize_fn = lambda kv: (sd1.serialize(kv[0]), sd2.serialize(kv[1]))
        serialize_fns = [cpp_serialize_fn, python_serialize_fn]
        deserialize_fns = [cpp_deserialize_fn, python_deserialize_fn]
        kv_val = (1, value)

        def _assert_eq_val(v):
            assert v == kv_val

        # check every (serialize, deserialize) combination: C++/C++, C++/Python,
        # Python/C++ and Python/Python
        for serialize_fn in serialize_fns:
            for deserialize_fn in deserialize_fns:
                out.append(p_values.map(lambda x: (1, x[i]))
                                   .map(serialize_fn)
                                   .map(deserialize_fn)
                                   .map(_assert_eq_val))
    if out:
        transforms.union(*out).cache()
    else:
        print >> sys.stderr, "SKIP a test!!!"
    self._pipeline.run()

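# For reference, each element of self._checking_condition is a (serde, sample
# value) pair; a hedged example of how the buffering side presumably looks
# (the registration helper itself is not part of this excerpt):
#
#     self._checking_condition.append((serde.of(int), 42))
#     self._checking_condition.append((serde.of(str), "hello"))
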
def _handle_new_writtens(self):
    if len(self._uri_to_write) > 0:
        logger.info("=========================================================")
        logger.info("all outputs:")
        for uri in self._uri_to_write:
            logger.info("\t%s" % uri)
        self._uri_to_write[:] = []

def _prepare_cache_archive(self):
    """ Build and upload the Bigflow PreparedArchive to HDFS if it is missing
    or if re-preparation is explicitly requested. """
    logger.info("Checking PreparedArchive for Spark Pipeline...")
    existed = self._client.fs_test(self.prepared_archive_path, self._hadoop_config)
    tmp_path = self.prepared_archive_path + '-' + str(uuid.uuid4())
    self._job_config.prepared_archive_path = self.prepared_archive_path
    self._job_config.tmp_data_path = tmp_path
    if self._config['reprepare_cache_archive'] or not existed:
        if self._config['reprepare_cache_archive']:
            if not existed:
                logger.info("Bigflow PreparedArchive does not exist")
            else:
                logger.info("Re-prepare Bigflow PreparedArchive")
                self._client.fs_rmr(self.prepared_archive_path, self._hadoop_config)
        import subprocess
        bigflow_home = self._get_bigflow_python_home()
        local_cache_archive = "bigflow_python_%s.tar.gz" % str(uuid.uuid4())
        cmd = "tar czf %s -C %s --exclude=flume/worker python_runtime flume" % (
            local_cache_archive, bigflow_home)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            raise error.BigflowPlanningException("Cannot make PreparedArchive file")
        try:
            self._client.fs_put(local_cache_archive, tmp_path, self._hadoop_config)
            self._client.fs_mv(tmp_path, self.prepared_archive_path, self._hadoop_config)
        except error.BigflowHDFSException:
            # the temporary archive path only needs cleanup when an exception occurs.
            self._remote_temp_files.append(tmp_path)
            if not self._client.fs_test(self.prepared_archive_path, self._hadoop_config):
                msg = "Unable to upload Bigflow PreparedArchive, please " \
                      "make sure you have write permission to " \
                      "tmp_data_path['%s']" % self._config['tmp_data_path']
                raise error.BigflowHDFSException(msg)
        finally:
            ret = subprocess.call("rm %s" % local_cache_archive, shell=True)
            self._client.fs_rmr(tmp_path, self._hadoop_config)
    else:
        logger.info("Bigflow PreparedArchive exists already")

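# For example, with a hypothetical bigflow_home of /opt/bigflow/python, the
# generated archive command expands to:
#
#     tar czf bigflow_python_<uuid>.tar.gz -C /opt/bigflow/python \
#         --exclude=flume/worker python_runtime flume
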
def act(self, line):
    """ Log the input split URI reported in the worker output. """
    pos = line.find("split uri : ") + len("split uri : ")
    logger.info("Reading input: %s" % line[pos:])

def act(self, line):
    """ Log the configured replacement message(s) instead of the matched line. """
    if isinstance(self.replace, list):
        for r in self.replace:
            logger.info(r)
    else:
        logger.info(self.replace)

def act(self, line):
    """ Log the line starting from the Hadoop stderr message marker. """
    pos = line.find(_HadoopLogParser.hadoop_stderr_msg)
    logger.info(line[pos:])