Example #1
 def test_get_key_values_stream2(self):
     stream = get_stream(STREAM_2_DATA)
     kvs_stream = get_key_values_stream(stream, private_encoding=False)
     kvs2_stream = it.groupby(STREAM_2_DATA, keyfunc)
     for (k1, vals1), (k2, itx) in czip(kvs_stream, kvs2_stream):
         self.assertEqual(k1, k2)
         if k1 == 'key2':
             vals2 = cmap(lambda t: t[1],
                          cfilter(lambda t: t[0] == streams.REDUCE_VALUE,
                                  itx))
             for v1, v2 in czip(vals1, vals2):
                 self.assertEqual(v1, v2)
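
All of these examples iterate with czip, cmap and cfilter rather than the builtins: they are the project's Python 2/3 compatibility aliases for the lazy zip, map and filter (izip, imap, ifilter on Python 2). A minimal sketch of such a shim, assuming the names used here (the actual module layout is not shown in the examples; the unicode alias is likewise needed by Examples #4 and #5 on Python 3):

    # Minimal compatibility shim; an assumption about how czip/cmap/cfilter
    # and unicode are provided, not the project's actual module.
    import sys
    import itertools as it

    if sys.version_info[0] >= 3:
        czip, cmap, cfilter = zip, map, filter      # already lazy on Python 3
        unicode = str                               # the unicode type is gone on Python 3
    else:
        czip, cmap, cfilter = it.izip, it.imap, it.ifilter
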
Example #2
    def test_avro_reader(self):

        N = 500
        fn = self.write_avro_file(avro_user_record, N, 1024)
        url = hdfs.path.abspath(fn, local=True)

        class FunkyCtx(object):
            def __init__(self, isplit):
                self.input_split = isplit

        def get_areader(offset, length):
            isplit = InputSplit(InputSplit.to_string(url, offset, length))
            ctx = FunkyCtx(isplit)
            return AvroReader(ctx)

        areader = get_areader(0, 14)
        file_length = areader.reader.file_length
        with self.assertRaises(StopIteration):
            next(areader)
        areader = get_areader(0, file_length)
        with SeekableDataFileReader(open(fn, 'rb'), DatumReader()) as sreader:
            for (o, a), s in czip(areader, sreader):
                self.assertEqual(a, s)
        mid_len = int(file_length / 2)
        lows = [x for x in get_areader(0, mid_len)]
        highs = [x for x in get_areader(mid_len, file_length)]
        self.assertEqual(N, len(lows) + len(highs))
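
The three checks above cover a split that ends before the first sync point (no records), a split spanning the whole file (identical to a sequential read), and two adjacent splits that must partition the N records. The last check is the core invariant of split-based record readers, and it generalizes to any number of splits; a hedged sketch reusing the get_areader helper defined in the test:

    # Sketch only: partitions [0, file_length) into n_splits ranges and
    # counts the records each bounded reader yields; the sum must equal
    # the number of records written to the file.
    def count_records_in_splits(get_areader, file_length, n_splits):
        bounds = [i * file_length // n_splits for i in range(n_splits + 1)]
        return sum(
            sum(1 for _ in get_areader(offset, end - offset))
            for offset, end in czip(bounds, bounds[1:])
        )

    # e.g.: self.assertEqual(N, count_records_in_splits(get_areader, file_length, 4))
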
Example #3
    def test_seekable(self):
        fn = self.write_avro_file(avro_user_record, 500, 1024)
        with open(fn, 'rb') as f:
            sreader = SeekableDataFileReader(f, DatumReader())
            res = [t for t in czip(cmap(
                lambda _: f.tell(), it.repeat(1)
            ), sreader)]
            sreader.align_after(res[-1][0])
            with self.assertRaises(StopIteration):
                r = next(sreader)
            sreader.align_after(0)
            r = next(sreader)
            self.assertEqual(r, res[0][1])

            def offset_iterator():
                s = -1
                for o, r in res:
                    sreader.align_after(o)
                    t = f.tell()
                    if t == s:
                        continue
                    s = t
                    x = next(sreader)
                    yield (t, x)

            i = 0
            for xo, x in offset_iterator():
                sreader.align_after(xo)
                for o, r in res[i:]:
                    if o >= xo:
                        self.assertEqual(x, r)
                        break
                    i += 1
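
align_after(offset) is what makes the bounded reads in Example #2 possible: it skips forward to the next Avro sync marker, so iteration resumes at a record boundary rather than mid-record. That is also why the final loop matches each re-read record against the first entry of res whose recorded offset is at or past the one it aligned after. A minimal usage sketch, assuming the same imports as above (records_after is a hypothetical helper):

    def records_after(fn, offset):
        # Return the records from the first block boundary past `offset`
        # to the end of the file.
        with open(fn, 'rb') as f:
            reader = SeekableDataFileReader(f, DatumReader())
            reader.align_after(offset)
            return [r for r in reader]
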
Example #4
 def link_helper(self, mod, Writer, DownStreamAdapter):
     fname = self._mkfn('foo.' + ('bin' if mod == 'b' else 'txt'))
     stream_writer(fname, STREAM_1, mod, Writer)
     with open(fname, 'r' + mod) as f:
         stream = DownStreamAdapter(f)
         try:
             at_least_once_in_loop = False
             for (cmd, args), vals in czip(stream, STREAM_1):
                 at_least_once_in_loop = True
                 self.assertEqual(cmd, vals[0])
                 vals = vals[1:]
                 if mod == 'b':
                     vals = [
                         x.encode('utf-8') if isinstance(x, unicode) else x
                         for x in vals
                     ]
                 vals = tuple(vals)
                 if cmd == streams.SET_JOB_CONF:
                     self.assertEqual(len(args), 1)
                     self.assertEqual(args[0], vals)
                 else:
                     self.assertTrue((len(vals) == 0 and not args)
                                     or (vals == args))
             self.assertTrue(at_least_once_in_loop)
         except ProtocolError as e:
             print('error -- %s' % e)
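
Note the at_least_once_in_loop sentinel: czip, like the built-in zip, simply stops when the shorter input is exhausted, so a DownStreamAdapter that yielded nothing would skip every assertion in the loop and the test would still pass without it. A minimal illustration of that truncation behaviour:

    # czip truncates to the shortest input, so an empty parsed stream means
    # zero loop iterations and therefore zero assertions.
    assert list(czip([], [1, 2, 3])) == []
    assert list(czip('ab', [1, 2, 3])) == [('a', 1), ('b', 2)]
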
Example #5
 def link_helper(self, mod, Writer, DownStreamAdapter):
     fname = self._mkfn('foo.' + ('bin' if mod == 'b' else 'txt'))
     stream_writer(fname, STREAM_1, mod, Writer)
     with open(fname, 'r' + mod) as f:
         stream = DownStreamAdapter(f)
         try:
             at_least_once_in_loop = False
             for (cmd, args), vals in czip(stream, STREAM_1):
                 at_least_once_in_loop = True
                 self.assertEqual(cmd, vals[0])
                 vals = vals[1:]
                 if mod == 'b':
                     vals = [x.encode('utf-8')
                             if isinstance(x, unicode) else x
                             for x in vals]
                 vals = tuple(vals)
                 if cmd == streams.SET_JOB_CONF:
                     self.assertEqual(len(args), 1)
                     self.assertEqual(args[0], vals)
                 else:
                     self.assertTrue((len(vals) == 0 and not args) or
                                     (vals == args))
             self.assertTrue(at_least_once_in_loop)
         except ProtocolError as e:
             print('error -- %s' % e)
Example #6
 def __cp_recursive(self, wd):
     src_t = self.__make_tree(wd)
     src = src_t.name
     copy_on_wd = "%s_copy" % src
     src_bn, copy_on_wd_bn = [
         hdfs.path.basename(d) for d in (src, copy_on_wd)
     ]
     hdfs.cp(src, copy_on_wd, mode="wb")
     exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
     for t, exp_t in czip(src_t.walk(), exp_t.walk()):
         self.assertTrue(hdfs.path.exists(exp_t.name))
         if t.kind == 0:
             self.assertEqual(hdfs.load(exp_t.name), self.data)
     # check semantics when target dir already exists
     hdfs.rmr(copy_on_wd)
     hdfs.mkdir(copy_on_wd)
     hdfs.cp(src, copy_on_wd, mode="wb")
     exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
     for t, exp_t in czip(src_t.walk(), exp_t.walk()):
         self.assertTrue(hdfs.path.exists(exp_t.name))
         if t.kind == 0:
             self.assertEqual(hdfs.load(exp_t.name), self.data)
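
The second half of the test pins down the shell-like semantics of hdfs.cp: if the destination does not exist it becomes a copy of the source, while if it already exists as a directory the source is copied into it under its own basename. A hedged sketch of the same two cases with a single leaf file (wd is a placeholder for an existing working directory):

    data = b"some data"
    src = "%s/tree" % wd
    hdfs.mkdir(src)
    hdfs.dump(data, "%s/leaf" % src, mode="wb")

    dst = "%s/tree_copy" % wd
    hdfs.cp(src, dst, mode="wb")        # dst absent: dst becomes a copy of src
    assert hdfs.load("%s/leaf" % dst) == data

    hdfs.rmr(dst)
    hdfs.mkdir(dst)
    hdfs.cp(src, dst, mode="wb")        # dst exists: src lands inside it
    assert hdfs.load("%s/%s/leaf" % (dst, hdfs.path.basename(src))) == data
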
Example #7
 def __ls(self, ls_func, path_transform):
     for wd, paths in czip((self.local_wd, self.hdfs_wd),
                           (self.local_paths, self.hdfs_paths)):
         for p in paths:
             hdfs.dump(self.data, p, mode="wb")
         test_dir = "%s/%s" % (wd, "test_dir")
         test_path = "%s/%s" % (test_dir, "test_path")
         hdfs.dump(self.data, test_path, mode="wb")
         paths.append(test_dir)
         for recursive in False, True:
             if recursive:
                 paths.append(test_path)
             dir_list = [
                 path_transform(p) for p in ls_func(wd, recursive=recursive)
             ]
             self.assertEqual(sorted(dir_list), sorted(paths))
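
The helper is parameterized by the listing function and by a per-entry transform so that one walk can validate both the plain and the detailed listing. One plausible way to drive it, assuming hdfs.ls returns path names and hdfs.lsl returns info dictionaries with a "name" key (the wrapper tests themselves are an assumption):

    def test_ls(self):
        self.__ls(hdfs.ls, lambda x: x)           # plain listing: paths as-is

    def test_lsl(self):
        self.__ls(hdfs.lsl, lambda x: x["name"])  # detailed listing: extract the path
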
Example #8
 def __ls(self, ls_func, path_transform):
     for wd, paths in czip(
         (self.local_wd, self.hdfs_wd), (self.local_paths, self.hdfs_paths)
     ):
         for p in paths:
             hdfs.dump(self.data, p, mode="wb")
         test_dir = "%s/%s" % (wd, "test_dir")
         test_path = "%s/%s" % (test_dir, "test_path")
         hdfs.dump(self.data, test_path, mode="wb")
         paths.append(test_dir)
         for recursive in False, True:
             if recursive:
                 paths.append(test_path)
             dir_list = [
                 path_transform(p) for p in ls_func(wd, recursive=recursive)
             ]
             self.assertEqual(sorted(dir_list), sorted(paths))
Example #9
 def test_get_key_value_stream(self):
     stream = get_stream(STREAM_1_DATA)
     kv_stream = get_key_value_stream(stream)
     for ((k, v), (cmd, k1, v1)) in czip(kv_stream, STREAM_1_DATA):
         self.assertEqual(k, k1)
         self.assertEqual(v, v1)
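
Compared with Example #1, this shows the two access patterns offered over the same down-stream data: get_key_value_stream yields flat (key, value) pairs, while get_key_values_stream yields one (key, values-iterator) pair per key, which is the relationship itertools.groupby establishes over flat pairs (Example #1 uses groupby as its reference). A self-contained illustration with placeholder data:

    import itertools as it

    # Placeholder flat pairs, standing in for what get_key_value_stream yields.
    flat = [('key1', 1), ('key1', 2), ('key2', 3)]

    # Grouping them reproduces the shape get_key_values_stream yields.
    grouped = [(k, [v for _, v in group])
               for k, group in it.groupby(flat, key=lambda kv: kv[0])]
    assert grouped == [('key1', [1, 2]), ('key2', [3])]
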