def test_get_key_values_stream2(self):
    stream = get_stream(STREAM_2_DATA)
    kvs_stream = get_key_values_stream(stream, private_encoding=False)
    kvs2_stream = it.groupby(STREAM_2_DATA, keyfunc)
    for (k1, vals1), (k2, itx) in czip(kvs_stream, kvs2_stream):
        self.assertEqual(k1, k2)
        if k1 == 'key2':
            vals2 = cmap(
                lambda t: t[1],
                cfilter(lambda t: t[0] == streams.REDUCE_VALUE, itx)
            )
            for v1, v2 in czip(vals1, vals2):
                self.assertEqual(v1, v2)
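
# Illustration (not part of the suite): the grouping pattern the test
# checks, shown in miniature with itertools.groupby on toy records.
# Consecutive records sharing a key collapse into one (key, values)
# pair, which is what get_key_values_stream is expected to produce;
# under Python 3 the czip/cmap/cfilter helpers are presumably plain
# zip/map/filter.
import itertools as it

records = [('V', 'key1', 1), ('V', 'key1', 2), ('V', 'key2', 3)]
for key, group in it.groupby(records, key=lambda t: t[1]):
    print(key, [t[2] for t in group])
# key1 [1, 2]
# key2 [3]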
def test_avro_reader(self):
    N = 500
    fn = self.write_avro_file(avro_user_record, N, 1024)
    url = hdfs.path.abspath(fn, local=True)

    class FunkyCtx(object):
        def __init__(self, isplit):
            self.input_split = isplit

    def get_areader(offset, length):
        isplit = InputSplit(InputSplit.to_string(url, offset, length))
        ctx = FunkyCtx(isplit)
        return AvroReader(ctx)

    # a split that ends inside the file header must yield no records
    areader = get_areader(0, 14)
    file_length = areader.reader.file_length
    with self.assertRaises(StopIteration):
        next(areader)
    # a split covering the whole file must yield every record
    areader = get_areader(0, file_length)
    with SeekableDataFileReader(open(fn, 'rb'), DatumReader()) as sreader:
        for (o, a), s in czip(areader, sreader):
            self.assertEqual(a, s)
    # two adjoining splits must cover each record exactly once
    mid_len = int(file_length / 2)
    lows = [x for x in get_areader(0, mid_len)]
    highs = [x for x in get_areader(mid_len, file_length)]
    self.assertEqual(N, len(lows) + len(highs))
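
# For reference, reading the same container file with the stock avro
# API looks like this (a sketch; the local 'users.avro' path is an
# assumption).  AvroReader adds split handling on top of it: each
# (offset, length) split is aligned to the next sync marker, which is
# why the lows and highs halves above cover each record exactly once.
from avro.datafile import DataFileReader
from avro.io import DatumReader

with DataFileReader(open('users.avro', 'rb'), DatumReader()) as reader:
    for record in reader:
        print(record)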
def test_seekable(self):
    fn = self.write_avro_file(avro_user_record, 500, 1024)
    with open(fn, 'rb') as f:
        sreader = SeekableDataFileReader(f, DatumReader())
        # pair each record with the file offset sampled just before
        # it is read
        res = [t for t in czip(cmap(
            lambda _: f.tell(), it.repeat(1)
        ), sreader)]
        sreader.align_after(res[-1][0])
        with self.assertRaises(StopIteration):
            r = next(sreader)
        sreader.align_after(0)
        r = next(sreader)
        self.assertEqual(r, res[0][1])

        def offset_iterator():
            s = -1
            for o, r in res:
                sreader.align_after(o)
                t = f.tell()
                if t == s:
                    continue
                s = t
                x = next(sreader)
                yield (t, x)

        i = 0
        for xo, x in offset_iterator():
            sreader.align_after(xo)
            for o, r in res[i:]:
                if o >= xo:
                    self.assertEqual(x, r)
                    break
                i += 1
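
# Usage sketch for SeekableDataFileReader, with its semantics inferred
# from the test above (the import path and the 'users.avro' file are
# assumptions): align_after(pos) repositions the reader so that the
# next record returned is the first one starting after byte offset pos.
from avro.io import DatumReader
from pydoop.avrolib import SeekableDataFileReader

f = open('users.avro', 'rb')
sreader = SeekableDataFileReader(f, DatumReader())
first = next(sreader)
sreader.align_after(0)           # realign just past the file header
assert next(sreader) == first    # the first record comes back again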
def link_helper(self, mod, Writer, DownStreamAdapter):
    fname = self._mkfn('foo.' + ('bin' if mod == 'b' else 'txt'))
    stream_writer(fname, STREAM_1, mod, Writer)
    with open(fname, 'r' + mod) as f:
        stream = DownStreamAdapter(f)
        try:
            at_least_once_in_loop = False
            for (cmd, args), vals in czip(stream, STREAM_1):
                at_least_once_in_loop = True
                self.assertEqual(cmd, vals[0])
                vals = vals[1:]
                if mod == 'b':
                    # in binary mode, text fields come back as UTF-8 bytes
                    vals = [
                        x.encode('utf-8') if isinstance(x, unicode) else x
                        for x in vals
                    ]
                vals = tuple(vals)
                if cmd == streams.SET_JOB_CONF:
                    self.assertEqual(len(args), 1)
                    self.assertEqual(args[0], vals)
                else:
                    self.assertTrue(
                        (len(vals) == 0 and not args) or (vals == args)
                    )
            self.assertTrue(at_least_once_in_loop)
        except ProtocolError as e:
            print('error -- %s' % e)
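
# The binary-mode branch above, in miniature: in 'b' mode the adapter
# yields text fields as UTF-8 bytes, so the expected values must be
# encoded before comparison ('unicode' in the test is presumably the
# suite's Python 2/3 alias for what is plain 'str' under Python 3).
vals = (u'k\u00e9y', u'value')
encoded = tuple(
    x.encode('utf-8') if isinstance(x, str) else x for x in vals
)
print(encoded)  # (b'k\xc3\xa9y', b'value')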
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
    # check semantics when target dir already exists
    hdfs.rmr(copy_on_wd)
    hdfs.mkdir(copy_on_wd)
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
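
# The two hdfs.cp calls above exercise cp -r style semantics: if the
# target does not exist, the source tree is copied *as* the target; if
# the target is an existing directory, the source tree is copied *into*
# it.  A minimal sketch (the paths here are hypothetical):
import pydoop.hdfs as hdfs

hdfs.dump(b'data', 'tree/leaf', mode='wb')
hdfs.cp('tree', 'tree_copy', mode='wb')    # -> tree_copy/leaf
hdfs.mkdir('tree_copy2')
hdfs.cp('tree', 'tree_copy2', mode='wb')   # -> tree_copy2/tree/leaf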
def __ls(self, ls_func, path_transform):
    for wd, paths in czip(
            (self.local_wd, self.hdfs_wd),
            (self.local_paths, self.hdfs_paths)
    ):
        for p in paths:
            hdfs.dump(self.data, p, mode="wb")
        test_dir = "%s/%s" % (wd, "test_dir")
        test_path = "%s/%s" % (test_dir, "test_path")
        hdfs.dump(self.data, test_path, mode="wb")
        paths.append(test_dir)
        for recursive in False, True:
            if recursive:
                paths.append(test_path)
            dir_list = [
                path_transform(p) for p in ls_func(wd, recursive=recursive)
            ]
            self.assertEqual(sorted(dir_list), sorted(paths))
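
# Usage sketch for the ls_func/path_transform pair the test is driven
# with: presumably hdfs.ls (plain path strings) and hdfs.lsl (stat-like
# dicts, from which path_transform would pick a field such as 'name').
# The dict keys shown here are assumptions.
import pydoop.hdfs as hdfs

for path in hdfs.ls('test_dir', recursive=True):
    print(path)                          # one path string per entry
for info in hdfs.lsl('test_dir', recursive=True):
    print(info['name'], info['kind'])    # e.g. 'file' or 'directory'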
def test_get_key_value_stream(self):
    stream = get_stream(STREAM_1_DATA)
    kv_stream = get_key_value_stream(stream)
    for ((k, v), (cmd, k1, v1)) in czip(kv_stream, STREAM_1_DATA):
        self.assertEqual(k, k1)
        self.assertEqual(v, v1)
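
# What the test asserts, in miniature: the key/value stream preserves
# both content and order of the (command, key, value) records it was
# built from (toy data here, not the real STREAM_1_DATA).
raw = [('mapItem', 'k1', 'v1'), ('mapItem', 'k2', 'v2')]
kv = [(k, v) for _, k, v in raw]
assert kv == [('k1', 'v1'), ('k2', 'v2')]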