def test_add_remote_file_for_local(self):
    """test add remote file when running on a local filesystem"""
    import subprocess
    prog_dir = self.generate_tmp_path()
    data_dir = self.generate_tmp_path()
    prog_list = ["#!/bin/env python", "#-*- coding:utf-8 -*-"]
    prog_list += ["def map(x):", " return (x, 1)"]
    words = ["zhu", "xi", "da", "fa", "hao"]
    prog = self._pipeline.parallelize(prog_list)
    data = self._pipeline.parallelize(words)
    # set partition number to 1: `mv *` can only handle a single source
    # file when the target is a file
    self._pipeline.write(prog, output.TextFile(prog_dir).partition(n=1))
    self._pipeline.write(data, output.TextFile(data_dir).partition(n=1))
    self._pipeline.run()

    # rename
    file_prog = os.path.join(prog_dir, "*")
    file_data = os.path.join(data_dir, "*")
    target_prog = os.path.join(prog_dir, "remote_map.py")
    target_data = os.path.join(data_dir, "words.txt")
    if self.running_on_filesystem == "local":
        # there is only one file in each folder
        subprocess.Popen("mv %s %s" % (file_prog, target_prog), shell=True).wait()
        subprocess.Popen("mv %s %s" % (file_data, target_data), shell=True).wait()
    else:
        hadoop = "{bin} fs -conf {conf_path} -mv".format(
            bin=self._pipeline._config['hadoop_client_path'],
            conf_path=self._pipeline._config['hadoop_config_path'])
        subprocess.Popen("{hadoop} {source} {target}".format(
            hadoop=hadoop, source=file_prog, target=target_prog),
            shell=True).wait()
        subprocess.Popen("{hadoop} {source} {target}".format(
            hadoop=hadoop, source=file_data, target=target_data),
            shell=True).wait()

    self._pipeline.add_file(target_prog, "remote_map.py")
    self._pipeline.add_file(target_data, "words.txt")

    def _remote_map(x):
        """inner map"""
        local_words = []
        with open("words.txt") as fd:
            for line in fd:
                local_words.append(line.strip())
        assert local_words == words, "local words and remote words not equal"
        import remote_map
        return remote_map.map(x)

    p_words = self._pipeline.parallelize(words)
    p_words_map = p_words.map(_remote_map)
    p_res = p_words_map.get()
    p_ori = map(lambda w: (w, 1), words)
    self.assertItemsEqual(p_ori, p_res)
def test_write_binary_set_by_user(self):
    # Set record delimiter by user
    raw_data = ["aaa", "bbb", "ccc"]
    special_record_delimiter = chr(2) + chr(3) + chr(4)
    record_delimiters = [
        "\t",
        "\r\n",
        special_record_delimiter,
    ]
    expect_data = [
        ["aaa\tbbb\tccc\t"],
        ["aaa", "bbb", "ccc"],
        [special_record_delimiter.join(raw_data) + special_record_delimiter],
    ]
    self.tmp_output_dirs = []
    for record_delimiter in record_delimiters:
        data = self._pipeline.parallelize(raw_data)
        output_dir = self.generate_tmp_path()
        self.tmp_output_dirs.append(output_dir)
        self._pipeline.write(
            data,
            output.TextFile(output_dir, record_delimiter=record_delimiter))
    self._pipeline.run()
    for idx, output_dir in enumerate(self.tmp_output_dirs):
        self._compare_expect_data_and_output(expect_data[idx], output_dir)
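
# Illustration only: the expected data above assumes the writer terminates
# every record with record_delimiter rather than merely separating records
# with it, so "\t" yields the single chunk "aaa\tbbb\tccc\t".
def _record_delimiter_demo():
    raw_data = ["aaa", "bbb", "ccc"]
    assert "".join(d + "\t" for d in raw_data) == "aaa\tbbb\tccc\t"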
def test_partition(self):
    """
    Test partition output
    """
    import os
    try:
        p = self._pipeline.parallelize(["1", "2", "3"])
        self._pipeline.write(p, output.TextFile('./output-1').partition(5))
        self._pipeline.write(
            p,
            output.SequenceFile('./output-2').partition(
                2, lambda x, n: int(x) % n))
        self._pipeline.run()

        o1 = self._pipeline.read(input.SequenceFile('./output-2/part-00000'))
        o1.cache()
        o2 = self._pipeline.read(input.SequenceFile('./output-2/part-00001'))
        o2.cache()
        self.assertEqual(["2"], o1.get())
        self.assertItemsEqual(["1", "3"], o2.get())

        n = os.popen('ls output-1/[^_]* | wc -l').read()
        self.assertEqual(5, int(n))
        o = self._pipeline.read(input.TextFile('output-1')).get()
        self.assertItemsEqual(["1", "2", "3"], o)
    finally:
        os.system("rm -r output-1 output-2")
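
# Illustration only (not part of the test suite): a plain-Python check of how
# the custom partition_fn above routes records. int(x) % n sends "2" to
# partition 0 and "1", "3" to partition 1, matching the part-00000/part-00001
# assertions in test_partition.
def _partition_fn_demo():
    buckets = {}
    for x in ["1", "2", "3"]:
        buckets.setdefault(int(x) % 2, []).append(x)
    assert buckets == {0: ["2"], 1: ["1", "3"]}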
def test_add_remote_file(self):
    """test add remote file"""
    prog_dir = self.generate_tmp_path()
    data_dir = self.generate_tmp_path()
    prog_list = ["#!/bin/env python", "#-*- coding:utf-8 -*-"]
    prog_list += ["def map(x):", " return (x, 1)"]
    words = ["zhu", "xi", "da", "fa", "hao"]
    prog = self._pipeline.parallelize(prog_list)
    data = self._pipeline.parallelize(words)
    # set partition number to 1 so that all the data ends up in part-00000
    self._pipeline.write(prog, output.TextFile(prog_dir).partition(n=1))
    self._pipeline.write(data, output.TextFile(data_dir).partition(n=1))
    self._pipeline.run()

    target_prog = os.path.join(prog_dir, "part-00000")
    # Currently spark fails to deal with cache files that share a file name.
    # In this test, move part-00000 to part-data temporarily. Should revert
    # this change when spark can handle cache files with the same name.
    origin_target_data = os.path.join(data_dir, "part-00000")
    if self.running_on_filesystem == "local":
        target_data = origin_target_data
    else:
        target_data = os.path.join(data_dir, "part-data")
        self._pipeline._client.fs_mv(origin_target_data, target_data)

    # reset the pipeline, so this can work on the spark pipeline.
    # todo: when spark_pipeline provides addFile, use addFile to handle
    # todo: add_file requests after the first run.
    self.setConfig()
    self._pipeline.add_file(target_prog, "remote_map.py")
    self._pipeline.add_file(target_data, "words.txt")

    def _remote_map(x):
        """inner map"""
        local_words = []
        with open("words.txt") as fd:
            for line in fd:
                local_words.append(line.strip())
        assert local_words == words, "local words and remote words not equal"
        import remote_map
        return remote_map.map(x)

    p_words = self._pipeline.parallelize(words)
    p_words_map = p_words.map(_remote_map)
    p_ori = p_words.map(lambda x: (x, 1))
    self.passertEqual(0, p_words_map.diff(p_ori).count())
def test_text_file(self):
    """
    Case: test text file
    """
    data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(local_file))
    self._pipeline.run()
    result = self._pipeline.read(input.TextFile(local_file))
    self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())

    # test reading a file with \0 in it
    null_data = data.map(lambda x: x + "\0")
    null_file = self.generate_tmp_path()
    self._pipeline.write(null_data, output.TextFile(null_file))
    self._pipeline.run()
    null_read = self._pipeline.read(input.TextFile(null_file))
    self.passertEqual(0, null_read.diff(null_data).count())
def test_write_binary_use_default(self):
    # Use the default record delimiter
    raw_data = ["aaa", "bbb", "ccc"]
    data = self._pipeline.parallelize(raw_data)
    output_dir = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(output_dir))
    self._pipeline.run()
    self._compare_expect_data_and_output(raw_data, output_dir)
def test_text_file_sync(self):
    """
    Case: test text file in synchronous mode
    """
    data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(local_file, async_mode=False))
    self._pipeline.run()
    result = self._pipeline.read(input.TextFile(local_file))
    self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())
def test_sequence_file_invalid(self):
    """
    Case: test sequence file invalid
    """
    data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(local_file))
    self._pipeline.run()
    result = self._pipeline.read(input.SequenceFile(local_file))
    with self.assertRaises(error.BigflowRuntimeException):
        result.get()
def test_text_file_sort(self):
    """
    Case: test text file sort
    """
    data = self._pipeline.parallelize([500, 2, 100, 600, 7])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(local_file).sort())
    self._pipeline.run()
    data_file = local_file + '/part-00000'
    with open(data_file) as fd:
        self.assertItemsEqual(['2\n', '7\n', '100\n', '500\n', '600\n'],
                              fd.readlines())
def test_output_sort(self):
    """
    Test sorted output combined with custom partitioning
    """
    self.setConfig(spark_conf={
        "spark.default.parallelism": "1",
    })
    lines = self._pipeline.parallelize([5, 1, 2, 0, 3, 4])\
        .map(lambda x: str(x), serde=serde.of(str))
    out1_path = self.generate_tmp_path() + '/output-1/'
    out2_path = self.generate_tmp_path() + '/output-2/'
    self._pipeline.write(
        lines,
        output.TextFile(out1_path).sort().partition(
            n=2, partition_fn=lambda x, n: int(x) % n))
    self._pipeline.write(
        lines,
        output.TextFile(out2_path).sort(reverse=True).partition(
            n=2, partition_fn=lambda x, n: int(x) % n))
    self._pipeline.run()

    l11 = self._pipeline.read(input.TextFile(out1_path + '/part-00000'))\
        .accumulate('', lambda x, y: x + y)
    l12 = self._pipeline.read(input.TextFile(out1_path + '/part-00001'))\
        .accumulate('', lambda x, y: x + y)
    l21 = self._pipeline.read(input.TextFile(out2_path + '/part-00000'))\
        .accumulate('', lambda x, y: x + y)
    l22 = self._pipeline.read(input.TextFile(out2_path + '/part-00001'))\
        .accumulate('', lambda x, y: x + y)
    l11.cache()
    l12.cache()
    l21.cache()
    l22.cache()
    self.assertEqual('024', l11.get())
    self.assertEqual('135', l12.get())
    self.assertEqual('420', l21.get())
    self.assertEqual('531', l22.get())
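
# Illustration only: a standalone sanity check of the expected strings above.
# Records are partitioned by int(x) % 2, sorted ascending (out1) or descending
# (out2) within each partition, then concatenated by accumulate.
def _sort_partition_demo():
    xs = [str(x) for x in [5, 1, 2, 0, 3, 4]]
    even = sorted(x for x in xs if int(x) % 2 == 0)
    odd = sorted(x for x in xs if int(x) % 2 == 1)
    assert "".join(even) == "024" and "".join(odd) == "135"
    assert "".join(reversed(even)) == "420" and "".join(reversed(odd)) == "531"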
def wildcard_case(self):
    """
    Case: test wildcard
    """
    input_data = [['1', '2', '3'], ['400', '5'], ['401', '501']]
    index = 0
    root_path = self.generate_tmp_path()
    for tmp_data in input_data:
        data = self._pipeline.parallelize(tmp_data)
        path = root_path + '/X' + str(index)
        self._pipeline.write(data, output.TextFile(path))
        index = index + 1
    self._pipeline.run()
    match_path = root_path + '/*'
    result = self._pipeline.read(input.TextFile(match_path))
    self.assertItemsEqual(['401', '501', '1', '2', '3', '400', '5'],
                          result.get())
def test_broadcast(self):
    """
    Case: test map with broadcast side inputs
    """
    output_path = self.generate_tmp_path()
    pc = self._pipeline.parallelize([1, 2, 3])
    pc1 = self._pipeline.parallelize([1, 2, 3])
    pc2 = self._pipeline.parallelize([4, 2, 6])
    pc3 = pc.map(lambda x, y, z: (x, (x in y) and (x in z)), pc1, pc2)
    pc4 = pc3.map(lambda x: "\t".join(map(str, x)))
    self._pipeline.write(pc4, output.TextFile(output_path).partition(n=2))
    self._pipeline.run()
    parts = ['part-00000', 'part-00001']
    input_path = map(lambda path: os.path.join(output_path, path), parts)
    result = self._pipeline.read(input.TextFile(*input_path))
    target = ['1\tFalse', '2\tTrue', '3\tFalse']
    self.assertItemsEqual(result.get(), target)
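
# Illustration only: the side-input semantics of the map above rendered in
# plain Python. Each element x of pc sees the broadcast pcollections pc1 and
# pc2 as whole lists.
def _broadcast_demo():
    pc, pc1, pc2 = [1, 2, 3], [1, 2, 3], [4, 2, 6]
    rows = ["%d\t%s" % (x, (x in pc1) and (x in pc2)) for x in pc]
    assert rows == ['1\tFalse', '2\tTrue', '3\tFalse']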
def test_gzip_file(self):
    """
    Test read/write gzip files
    """
    import os
    try:
        mem_testdata = ['1', '2', '3']
        p = self._pipeline.parallelize(mem_testdata)
        target = output.TextFile('output-gzip').with_compression(
            "gzip").partition(2)
        self._pipeline.write(p, target)
        self._pipeline.run()
        self.assertTrue(os.path.isdir('output-gzip'))
        read = os.popen('gzip -cd output-gzip/*').read()
        self.assertItemsEqual(mem_testdata, read.rstrip('\n').split('\n'))
    finally:
        os.system("rm -r output-gzip")
def test_write_binary_none(self):
    # Don't set record_delimiter; write binary.
    from random import shuffle
    from random import randint
    chars = [chr(i) for i in xrange(1, 10)]
    shuffle(chars)
    raw_data = []
    for cnt in xrange(100):
        index1 = randint(0, len(chars) - 1)
        index2 = randint(0, len(chars) - 1)
        raw_data.append("".join(
            chars[min(index1, index2):max(index1, index2)]))
    data = self._pipeline.parallelize(raw_data)
    output_dir = self.generate_tmp_path()
    self._pipeline.write(
        data, output.TextFile(output_dir, record_delimiter=None))
    self._pipeline.run()
    # with record_delimiter=None the records are concatenated as-is,
    # so the whole output is a single binary blob
    expect_data = ["".join(raw_data)]
    self._compare_expect_data_and_output(expect_data, output_dir)
compute the uv of each website (the number of distinct visitors):
    g.cn        1
    qq.com      2
    baidu.com   3
    163.com     1
"""
import os

from bigflow import base, input, output, transforms


# the input is a pcollection; apply distinct and count to get each website's uv
def count_distinct(p):
    return p.distinct().count()


# create the pipeline
_pipeline = base.Pipeline.create("LOCAL")

dir = os.path.dirname(os.path.abspath(__file__)) + "/data"
input_path = dir + "/" + "uv.text"

# read the input and format it
col = _pipeline.read(input.TextFile(input_path))
col = col.map(lambda x: x.split())

# group by website and compute each website's uv
col = col.group_by_key().apply_values(count_distinct).flatten()
col = col.map(lambda x: x[0] + "\t" + str(x[1]))

# write the output
_pipeline.write(col, output.TextFile("/tmp/website_uv"))
_pipeline.run()
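
# Illustration only: a plain-Python equivalent of the pipeline above. Each
# input line is assumed to be "website user"; distinct().count() per group is
# the same as counting the distinct users of each website.
def _uv_demo(lines):
    users = {}
    for line in lines:
        website, user = line.split()
        users.setdefault(website, set()).add(user)
    return dict((w, len(us)) for w, us in users.items())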
def end(self):
    record = (self._word, self._sum)
    self._emitter.emit(record)


pipeline = base.Pipeline.create('local')
plan = pipeline.plan()
plan.set_environment(entity.PythonEnvironment())

input_path = sys.path[0] + "/" + __file__
input_urls = [input_path]
output_path = sys.path[0] + "/" + "output"

single_word = plan.load(input_urls)\
    .by(input.TextFile(input_urls[0]).input_format).as_type(record_objector.RecordObjector())\
    .process_by(PythonFromRecordProcessor()).as_type(serde.any())\
    .process_by(WordSpliter()).as_type(serde.any())

result = plan.shuffle(single_word.scope(), [single_word])\
    .with_concurrency(10)\
    .node(0).match_by(WordIdentity(lambda x: x[0], serde.any()))\
    .process_by(WordCount()).as_type(serde.any())\
    .input(0).allow_partial_processing().done()\
    .process_by(WordCount()).as_type(serde.any())

plan.shuffle(plan.global_scope(), [result]).node(0).distribute_by_default()\
    .process_by(PythonToRecordProcessor()).as_type(record_objector.RecordObjector())\
    .sink_by(output.TextFile(output_path).output_format)

pipeline.run()
pipeline = base.Pipeline.create(
    # set the compute engine to "spark" or "SPARK"
    "spark",
    # set tmp_data_path
    tmp_data_path="hdfs:///app/dc/bigflow/tmp",
    # pass the spark configuration
    spark_conf=spark_conf,
    # default_concurrency is optional; this example's data is small,
    # so a small value is fine
    default_concurrency=250,
)

#case_str = "case4_2"
input_path = sys.argv[1]
output_path = sys.argv[2]

# P types can also be constructed via parallelize
data = pipeline.read(input.TextFile(input_path))

# apply transforms on the P type
result = data.map(lambda x: x.split()).group_by_key()\
    .apply_values(lambda x: x.max_elements(5, lambda x: x)).flatten()\
    .map(lambda t: "%s %s" % (t[0], t[1]))

# the current preview release does not support get(); P types can only be
# written to a filesystem via the pipeline's write method
pipeline.write(result, output.TextFile(output_path))
pipeline.run()
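
# Illustration only: a plain-Python equivalent of the transforms above, which
# keep the 5 largest values per key. The input format ("key value" per line)
# is an assumption based on the map/group_by_key usage.
def _top5_per_key(lines):
    groups = {}
    for line in lines:
        fields = line.split()
        groups.setdefault(fields[0], []).append(fields[1])
    result = []
    for key in groups:
        for value in sorted(groups[key], reverse=True)[:5]:
            result.append("%s %s" % (key, value))
    return result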
def test_schema_text_file(self):
    """
    Case: test schema text file
    """
    data = self._pipeline.parallelize(
        ['www.baidu.com,3', 'www.sina.com,6'])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(local_file))
    self._pipeline.run()

    result_dict_sd = self._pipeline.read(
        input.SchemaTextFile(local_file,
                             columns=[("website", str), ("clicknum", int)],
                             separator=","))
    result_dict = self._pipeline.read(
        input.SchemaTextFile(local_file,
                             columns=["website", "clicknum"],
                             separator=","))
    result_tuple = self._pipeline.read(
        input.SchemaTextFile(local_file, columns=2, separator=","))
    result_tuple_type = self._pipeline.read(
        input.SchemaTextFile(local_file, columns=[str, int], separator=","))

    expect_dict_sd = \
        [{'clicknum': 3, 'website': 'www.baidu.com'},
         {'clicknum': 6, 'website': 'www.sina.com'}]
    self.assertItemsEqual(expect_dict_sd, result_dict_sd.get())
    expect_dict = \
        [{'clicknum': '3', 'website': 'www.baidu.com'},
         {'clicknum': '6', 'website': 'www.sina.com'}]
    self.assertItemsEqual(expect_dict, result_dict.get())
    expect_tuple = \
        [('www.baidu.com', '3'), ('www.sina.com', '6')]
    self.assertItemsEqual(expect_tuple, result_tuple.get())
    expect_tuple_type = \
        [('www.baidu.com', 3), ('www.sina.com', 6)]
    self.assertItemsEqual(expect_tuple_type, result_tuple_type.get())

    self._pipeline.write(
        result_dict,
        output.SchemaTextFile(local_file,
                              columns=["website", "clicknum"],
                              separator=","))
    self._pipeline.run()
    result_dict = self._pipeline.read(
        input.SchemaTextFile(local_file,
                             columns=["website", "clicknum"],
                             separator=","))
    self.assertItemsEqual(expect_dict, result_dict.get())

    self._pipeline.write(
        result_dict_sd,
        output.SchemaTextFile(local_file,
                              columns=["website", "clicknum"],
                              separator=","))
    self._pipeline.run()
    result_dict_sd = self._pipeline.read(
        input.SchemaTextFile(local_file,
                             columns=[("website", str), ("clicknum", int)],
                             separator=","))
    self.assertItemsEqual(expect_dict_sd, result_dict_sd.get())

    self._pipeline.write(result_tuple, output.SchemaTextFile(local_file))
    self._pipeline.run()
    result_tuple = self._pipeline.read(
        input.SchemaTextFile(local_file, columns=2))
    self.assertItemsEqual(expect_tuple, result_tuple.get())

    result_tuple_type = result_tuple_type.map(lambda (w, c): (w, c))
    self._pipeline.write(result_tuple_type,
                         output.SchemaTextFile(local_file, columns=2))
    self._pipeline.run()
    result_tuple_type = self._pipeline.read(
        input.SchemaTextFile(local_file, columns=[str, int]))
    self.assertItemsEqual(expect_tuple_type, result_tuple_type.get())
                            task_name, DATE)
#job_name = 'feed_production_day_relerec_state' + "_" + DATE
pipeline = base.Pipeline.create("local" if ISTEST else "DAGMR",
                                job_name=job_name,
                                tmp_data_path=afs_tmp,
                                hadoop_job_conf=job_conf)

# core job logic
pipeline.add_file("./bigflow_python/proto/sample_pb2.py", "./sample_pb2.py")

# to run in local mode, run the code below first, then read from the local file
#pipeline = base.Pipeline.create("DAGMR",
#                                job_name=job_name,
#                                tmp_data_path=afs_tmp,
#                                hadoop_job_conf=job_conf)
#pbs = pipeline.read(input.SequenceFile(*input_path, serde=serde.StrSerde()))
#pipeline.write(pbs, output.SequenceFile(output_path, serde=serde.StrSerde()))
#pipeline.run()

pbs = pipeline.read(
    input.SequenceFile(*input_path,
                       serde=serde.ProtobufSerde(sample_pb2.Sample)))
p = pbs.flat_map(emit_features)\
    .group_by(key_extractor=lambda x: x[0], value_extractor=lambda x: x[1])\
    .apply_values(transforms.reduce, lambda a, b: map(operator.add, a, b)).flatten()\
    .map(lambda x: [x[0], x[1] + [float(x[1][2]) / (x[1][1]) if x[1][1] > 0 else 0]])\
    .group_by(key_extractor=lambda x: x[0].split('\t')[0], value_extractor=lambda x: x[1])\
    .apply_values(average).flatten()\
    .map(lambda x: '\t'.join(x[0].split('#') + map(str, x[1])))

# output
pipeline.write(p, output.TextFile(output_path).partition(n=1))
pipeline.run()