예제 #1
0
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem: cat, ls, du, mkdir, exists and rm."""

    def setUp(self):
        """Create the filesystem under test and the mock hadoop setup."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Create the fake hadoop directories and record them in self.env."""
        env = self.env = {}

        # setup fake hadoop home
        env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        jar_rel_path = os.path.join(
            'mock_hadoop_home', 'contrib', 'streaming',
            'hadoop-0.X.Y-streaming.jar')
        self.makefile(jar_rel_path, 'i are java bytecode')

        env['MOCK_HADOOP_TMP'] = self.makedirs('mock_hadoop')
        env['MOCK_HADOOP_VERSION'] = '2.7.1'

        env['USER'] = '******'

    def make_mock_file(self, name, contents='contents'):
        """Create *name* under the mock HDFS root, holding *contents*.

        Returns the path returned by self.makefile().
        """
        hdfs_root = get_mock_hdfs_root(self.env)
        return self.makefile(os.path.join(hdfs_root, name), contents)

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        uri = self.fs.join('hdfs:///data', 'foo')
        data = b''.join(self.fs._cat_file(uri))

        self.assertEqual(data, b'foo\nfoo\n')

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        uri = self.fs.join('hdfs:///data', 'foo.bz2')
        data = b''.join(self.fs._cat_file(uri))

        self.assertEqual(data, b'foo\n' * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        uri = self.fs.join('hdfs:///data', 'foo.gz')
        data = b''.join(self.fs._cat_file(uri))

        self.assertEqual(data, b'foo\n' * 10000)

    def test_ls_empty(self):
        listing = list(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        listing = list(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///f'])

    def test_ls_basic_2(self):
        for name in ('f', 'f2'):
            self.make_mock_file(name)
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        for name in ('f', 'd/f2'):
            self.make_mock_file(name)
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        listing = sorted(self.fs.ls('s3n://bucket/'))
        self.assertEqual(listing, ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_ls_s3a(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        listing = sorted(self.fs.ls('s3a://bucket/'))
        self.assertEqual(listing, ['s3a://bucket/f', 's3a://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///foo  bar'])

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        # (URI, expected total size in bytes), checked in this order
        expected_sizes = [
            ('hdfs:///', 12),
            ('hdfs:///data1', 4),
            ('hdfs:///more', 8),
            ('hdfs:///more/*', 8),
            ('hdfs:///more/data2', 4),
            ('hdfs:///more/data3', 4),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_du_non_existent(self):
        size = self.fs.du('hdfs:///does-not-exist')
        self.assertEqual(size, 0)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d/ave')
        hdfs_root = get_mock_hdfs_root(self.env)
        created = os.path.isdir(os.path.join(hdfs_root, 'd', 'ave'))
        self.assertEqual(created, True)

    def test_exists_no(self):
        found = self.fs.exists('hdfs:///f')
        self.assertEqual(found, False)

    def test_exists_yes(self):
        self.make_mock_file('f')
        found = self.fs.exists('hdfs:///f')
        self.assertEqual(found, True)

    def test_rm(self):
        on_disk = self.make_mock_file('f')
        self.assertEqual(os.path.exists(on_disk), True)

        self.fs.rm('hdfs:///f')

        self.assertEqual(os.path.exists(on_disk), False)

    def test_rm_recursive(self):
        on_disk = self.make_mock_file('foo/bar')
        self.assertEqual(os.path.exists(on_disk), True)

        # remove containing directory
        self.fs.rm('hdfs:///foo')

        self.assertEqual(os.path.exists(on_disk), False)

    def test_rm_nonexistent(self):
        # no assertion: the call just has to return without raising
        self.fs.rm('hdfs:///baz')

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
예제 #2
0
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem: ls, cat, du, mkdir and rm."""

    def setUp(self):
        """Create the filesystem under test and the mock hadoop setup."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Create the fake hadoop directories and record them in self.env."""
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join(
                'mock_hadoop_home',
                'contrib',
                'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents):
        """Create *name* under mock_hdfs_root, holding *contents*.

        Returns the path returned by self.makefile().
        """
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('f2', 'contents')
        # sort so the test doesn't depend on the order ls() yields paths in
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('d/f2', 'contents')
        # sort so the test doesn't depend on the order ls() yields paths in
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for it.
        # this is only a sanity check anyway.
        self.make_hdfs_file('data/foo', 'foo\nfoo\n')
        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_du(self):
        # make_hdfs_file() also creates the 'more' directory, the same way
        # test_ls_recurse relies on it to create 'd'
        self.make_hdfs_file('data1', 'abcd')
        self.make_hdfs_file('more/data2', 'defg')
        self.make_hdfs_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertTrue(os.path.isdir(local_path))

    def test_rm(self):
        local_path = self.make_hdfs_file('f', 'contents')
        self.assertTrue(os.path.exists(local_path))
        self.fs.rm('hdfs:///f')
        self.assertFalse(os.path.exists(local_path))

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
예제 #3
0
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem: ls, cat, du, mkdir, path_exists and rm."""

    def setUp(self):
        """Create the filesystem under test and the mock hadoop setup."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(["hadoop"])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Create the fake hadoop directories and record them in self.env."""
        # setup fake hadoop home
        self.env = {}
        self.env["HADOOP_HOME"] = self.makedirs("mock_hadoop_home")

        self.makefile(
            os.path.join("mock_hadoop_home", "contrib", "streaming", "hadoop-0.X.Y-streaming.jar"),
            "i are java bytecode",
        )

        self.env["MOCK_HDFS_ROOT"] = self.makedirs("mock_hdfs_root")
        self.env["MOCK_HADOOP_OUTPUT"] = self.makedirs("mock_hadoop_output")
        self.env["USER"] = "******"
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_mock_file(self, name, contents="contents"):
        """Create *name* under mock_hdfs_root, holding *contents*.

        Returns the path returned by self.makefile().
        """
        return self.makefile(os.path.join("mock_hdfs_root", name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls("hdfs:///")), [])

    def test_ls_basic(self):
        self.make_mock_file("f")
        self.assertEqual(list(self.fs.ls("hdfs:///")), ["hdfs:///f"])

    # The multi-file ls tests below compare sorted listings rather than using
    # assertItemsEqual(), which only exists in Python 2's unittest.

    def test_ls_basic_2(self):
        self.make_mock_file("f")
        self.make_mock_file("f2")
        self.assertEqual(sorted(self.fs.ls("hdfs:///")), ["hdfs:///f", "hdfs:///f2"])

    def test_ls_recurse(self):
        self.make_mock_file("f")
        self.make_mock_file("d/f2")
        self.assertEqual(sorted(self.fs.ls("hdfs:///")), ["hdfs:///d/f2", "hdfs:///f"])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file("f", "foo")
        self.make_mock_file("f3 win", "foo" * 10)
        self.assertEqual(sorted(self.fs.ls("s3n://bucket/")), ["s3n://bucket/f", "s3n://bucket/f3 win"])

    def test_single_space(self):
        self.make_mock_file("foo bar")
        self.assertEqual(sorted(self.fs.ls("hdfs:///")), ["hdfs:///foo bar"])

    def test_double_space(self):
        self.make_mock_file("foo  bar")
        self.assertEqual(sorted(self.fs.ls("hdfs:///")), ["hdfs:///foo  bar"])

    def test_cat_uncompressed(self):
        self.make_mock_file("data/foo", "foo\nfoo\n")

        remote_path = self.fs.path_join("hdfs:///data", "foo")

        self.assertEqual(list(self.fs._cat_file(remote_path)), ["foo\n", "foo\n"])

    def test_cat_bz2(self):
        self.make_mock_file("data/foo.bz2", bz2.compress("foo\n" * 1000))

        remote_path = self.fs.path_join("hdfs:///data", "foo.bz2")

        self.assertEqual(list(self.fs._cat_file(remote_path)), ["foo\n"] * 1000)

    def test_cat_gz(self):
        self.make_mock_file("data/foo.gz", gzip_compress("foo\n" * 10000))

        remote_path = self.fs.path_join("hdfs:///data", "foo.gz")

        self.assertEqual(list(self.fs._cat_file(remote_path)), ["foo\n"] * 10000)

    def test_du(self):
        self.make_mock_file("data1", "abcd")
        self.make_mock_file("more/data2", "defg")
        self.make_mock_file("more/data3", "hijk")

        self.assertEqual(self.fs.du("hdfs:///"), 12)
        self.assertEqual(self.fs.du("hdfs:///data1"), 4)
        self.assertEqual(self.fs.du("hdfs:///more"), 8)
        self.assertEqual(self.fs.du("hdfs:///more/*"), 8)
        self.assertEqual(self.fs.du("hdfs:///more/data2"), 4)
        self.assertEqual(self.fs.du("hdfs:///more/data3"), 4)

    def test_mkdir(self):
        """mkdir() creates a directory under each mocked hadoop version."""
        versions = ["0.20.0", "0.23.0", "1.2.0", "2.0.0"]
        for index, hadoop_version in enumerate(versions):
            self.env["MOCK_HADOOP_VERSION"] = hadoop_version
            # Use a fresh directory per version: re-using one would leave it
            # in place after the first iteration, so the later isdir() checks
            # would pass even if mkdir() did nothing.
            dir_name = "d%d" % index
            local_path = os.path.join(self.tmp_dir, "mock_hdfs_root", dir_name)
            self.assertFalse(os.path.exists(local_path))
            self.fs.mkdir("hdfs:///" + dir_name)
            self.assertTrue(os.path.isdir(local_path))

    def test_path_exists_no(self):
        path = "hdfs:///f"
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_mock_file("f")
        path = "hdfs:///f"
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file("f")
        self.assertTrue(os.path.exists(local_path))
        self.fs.rm("hdfs:///f")
        self.assertFalse(os.path.exists(local_path))

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
예제 #4
0
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem: ls, cat, du, mkdir and rm."""

    def setUp(self):
        """Create the filesystem under test and the mock hadoop setup."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Create the fake hadoop directories and record them in self.env."""
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join('mock_hadoop_home', 'contrib', 'streaming',
                         'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents):
        """Create *name* under mock_hdfs_root, holding *contents*.

        Returns the path returned by self.makefile().
        """
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('f2', 'contents')
        # sort so the test doesn't depend on the order ls() yields paths in
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('d/f2', 'contents')
        # sort so the test doesn't depend on the order ls() yields paths in
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for it.
        # this is only a sanity check anyway.
        self.make_hdfs_file('data/foo', 'foo\nfoo\n')
        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_du(self):
        # make_hdfs_file() also creates the 'more' directory, the same way
        # test_ls_recurse relies on it to create 'd'
        self.make_hdfs_file('data1', 'abcd')
        self.make_hdfs_file('more/data2', 'defg')
        self.make_hdfs_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertTrue(os.path.isdir(local_path))

    def test_rm(self):
        local_path = self.make_hdfs_file('f', 'contents')
        self.assertTrue(os.path.exists(local_path))
        self.fs.rm('hdfs:///f')
        self.assertFalse(os.path.exists(local_path))

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
예제 #5
0
파일: test_hadoop.py 프로젝트: Yelp/mrjob
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem: cat, ls, du, exists, mkdir, put, rm, touchz."""

    def setUp(self):
        """Create the filesystem under test and the mock hadoop setup."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Create the fake hadoop directories and record them in self.env."""
        env = self.env = {}

        # setup fake hadoop home
        env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        jar_rel_path = os.path.join(
            'mock_hadoop_home', 'contrib', 'streaming',
            'hadoop-0.X.Y-streaming.jar')
        self.makefile(jar_rel_path, 'i are java bytecode')

        env['MOCK_HADOOP_TMP'] = self.makedirs('mock_hadoop')
        env['MOCK_HADOOP_VERSION'] = '2.7.1'

        env['USER'] = '******'

    def make_mock_file(self, name, contents='contents'):
        """Create *name* under the mock HDFS root, holding *contents*.

        Returns the path returned by self.makefile().
        """
        hdfs_root = get_mock_hdfs_root(self.env)
        return self.makefile(os.path.join(hdfs_root, name), contents)

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        uri = self.fs.join('hdfs:///data', 'foo')
        data = b''.join(self.fs._cat_file(uri))

        self.assertEqual(data, b'foo\nfoo\n')

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        uri = self.fs.join('hdfs:///data', 'foo.bz2')
        data = b''.join(self.fs._cat_file(uri))

        self.assertEqual(data, b'foo\n' * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        uri = self.fs.join('hdfs:///data', 'foo.gz')
        data = b''.join(self.fs._cat_file(uri))

        self.assertEqual(data, b'foo\n' * 10000)

    def test_ls_empty(self):
        listing = list(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        listing = list(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///f'])

    def test_ls_basic_2(self):
        for name in ('f', 'f2'):
            self.make_mock_file(name)
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        for name in ('f', 'd/f2'):
            self.make_mock_file(name)
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        listing = sorted(self.fs.ls('s3n://bucket/'))
        self.assertEqual(listing, ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_ls_s3a(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        listing = sorted(self.fs.ls('s3a://bucket/'))
        self.assertEqual(listing, ['s3a://bucket/f', 's3a://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///foo  bar'])

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        # (URI, expected total size in bytes), checked in this order
        expected_sizes = [
            ('hdfs:///', 12),
            ('hdfs:///data1', 4),
            ('hdfs:///more', 8),
            ('hdfs:///more/*', 8),
            ('hdfs:///more/data2', 4),
            ('hdfs:///more/data3', 4),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_du_non_existent(self):
        size = self.fs.du('hdfs:///does-not-exist')
        self.assertEqual(size, 0)

    def test_exists_no(self):
        found = self.fs.exists('hdfs:///f')
        self.assertEqual(found, False)

    def test_exists_yes(self):
        self.make_mock_file('f')
        found = self.fs.exists('hdfs:///f')
        self.assertEqual(found, True)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d/ave')
        hdfs_root = get_mock_hdfs_root(self.env)
        created = os.path.isdir(os.path.join(hdfs_root, 'd', 'ave'))
        self.assertEqual(created, True)

    def test_put(self):
        src = self.makefile('foo', contents=b'bar')
        target = 'hdfs:///bar'

        self.fs.put(src, target)

        uploaded = b''.join(self.fs.cat(target))
        self.assertEqual(uploaded, b'bar')

    def test_no_put_to_dir(self):
        src = self.makefile('foo', contents=b'bar')

        with self.assertRaises(ValueError):
            self.fs.put(src, 'hdfs:///')

    def test_rm(self):
        on_disk = self.make_mock_file('f')
        self.assertEqual(os.path.exists(on_disk), True)

        self.fs.rm('hdfs:///f')

        self.assertEqual(os.path.exists(on_disk), False)

    def test_rm_recursive(self):
        on_disk = self.make_mock_file('foo/bar')
        self.assertEqual(os.path.exists(on_disk), True)

        # remove containing directory
        self.fs.rm('hdfs:///foo')

        self.assertEqual(os.path.exists(on_disk), False)

    def test_rm_nonexistent(self):
        # no assertion: the call just has to return without raising
        self.fs.rm('hdfs:///baz')

    def test_touchz(self):
        root = 'hdfs:///'
        self.assertEqual(list(self.fs.ls(root)), [])

        self.fs.touchz('hdfs:///empty')

        self.assertEqual(list(self.fs.ls(root)), ['hdfs:///empty'])
예제 #6
0
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem: ls, cat, du, mkdir, path_exists and rm."""

    def setUp(self):
        """Create the filesystem under test and the mock hadoop setup."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Create the fake hadoop directories and record them in self.env."""
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join('mock_hadoop_home', 'contrib', 'streaming',
                         'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_mock_file(self, name, contents='contents'):
        """Create *name* under mock_hdfs_root, holding *contents*.

        Returns the path returned by self.makefile().
        """
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    # The multi-file ls tests below compare sorted listings rather than using
    # assertItemsEqual(), which only exists in Python 2's unittest.

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress('foo\n' * 1000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.bz2')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n'] * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress('foo\n' * 10000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.gz')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n'] * 10000)

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        """mkdir() creates a directory under each mocked hadoop version."""
        versions = ['0.20.0', '0.23.0', '1.2.0', '2.0.0']
        for index, hadoop_version in enumerate(versions):
            self.env['MOCK_HADOOP_VERSION'] = hadoop_version
            # Use a fresh directory per version: re-using one would leave it
            # in place after the first iteration, so the later isdir() checks
            # would pass even if mkdir() did nothing.
            dir_name = 'd%d' % index
            local_path = os.path.join(
                self.tmp_dir, 'mock_hdfs_root', dir_name)
            self.assertFalse(os.path.exists(local_path))
            self.fs.mkdir('hdfs:///' + dir_name)
            self.assertTrue(os.path.isdir(local_path))

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file('f')
        self.assertTrue(os.path.exists(local_path))
        self.fs.rm('hdfs:///f')
        self.assertFalse(os.path.exists(local_path))

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
예제 #7
0
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem: ls, cat, write, copy_from_local, du,
    mkdir, path_exists and rm."""

    def setUp(self):
        """Create the filesystem under test and the mock hadoop setup."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Create the fake hadoop directories and record them in self.env."""
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join(
                'mock_hadoop_home',
                'contrib',
                'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents='contents'):
        """Create *name* under mock_hdfs_root, holding *contents*.

        Returns the path returned by self.makefile().
        """
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def make_hdfs_dir(self, name):
        """Create directory *name* under mock_hdfs_root.

        Returns the path returned by self.makedirs().
        """
        return self.makedirs(os.path.join('mock_hdfs_root', name))

    def make_hdfs_tree(self, path, files=None):
        """Create *files* below *path* and check that ls() reports them.

        :param path: directory, relative to the mock HDFS root
        :param files: relative file names to create below *path*; defaults
                      to ``('f', 'g/a/b', 'g/a/a/b')``

        Each file's contents are its own relative path. Returns *path*.
        """
        if files is None:
            files = ('f', 'g/a/b', 'g/a/a/b')
        expected_uris = []
        for name in sorted(files):
            rel_path = os.path.join(path, name)
            self.make_hdfs_file(rel_path, rel_path)
            expected_uris.append("hdfs:///" + rel_path)
        self.assertEqual(
            sorted(self.fs.ls("hdfs:///" + path.rstrip('/') + '/*')),
            expected_uris
        )
        return path

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    # The multi-file ls tests below compare sorted listings rather than using
    # assertItemsEqual(), which only exists in Python 2's unittest.

    def test_ls_basic_2(self):
        self.make_hdfs_file('f')
        self.make_hdfs_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f')
        self.make_hdfs_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_hdfs_file('f', 'foo')
        self.make_hdfs_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_hdfs_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_hdfs_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for it.
        # this is only a sanity check anyway.
        self.make_hdfs_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_write_str(self):
        path = 'hdfs:///write-test-str'
        content = 'some content!'
        self.fs.write(path, content)
        self.assertEqual("".join(self.fs.cat(path)), content)

    def test_write_file(self):
        path = 'hdfs:///write-test-fileobj'
        content = StringIO('some content!')
        self.fs.write(path, content)
        self.assertEqual("".join(self.fs.cat(path)), content.getvalue())

    def test_write_overwrite(self):
        self.make_hdfs_file('existing', 'this file already exists')
        self.assertRaises(OSError, self.fs.write, 'hdfs:///existing',
                          'can not overwrite')

    def test_copy_from_local(self):
        content = 'file filler'
        dst = 'hdfs:///hadoop-copy'
        src = self.makefile('local-source', content)

        self.fs.copy_from_local(dst, src)
        self.assertEqual("".join(self.fs.cat(dst)), content)

    def test_copy_from_local_override(self):
        src = self.makefile('local-source', 'source')
        self.make_hdfs_file('existing', 'this file already exists')
        self.assertRaises(OSError, self.fs.copy_from_local,
                          'hdfs:///existing', src)

    def test_du(self):
        self.make_hdfs_file('data1', 'abcd')
        self.make_hdfs_file('more/data2', 'defg')
        self.make_hdfs_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertTrue(os.path.isdir(local_path))

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_hdfs_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_hdfs_file('f')
        self.assertTrue(os.path.exists(local_path))
        self.fs.rm('hdfs:///f')
        self.assertFalse(os.path.exists(local_path))

    def test_rm_tree_noslash_files(self):
        path = "icio/goodbye-1"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree(path)

        self.fs.rm(hdfs_path.rstrip("/"))

        # Check that the directory and its files have been removed.
        # path_exists() is given the hdfs:// URI, as in the other rm_tree
        # tests, not the bare relative path.
        self.assertFalse(os.path.isdir(real_path))
        self.assertEqual(self.fs.path_exists(hdfs_path), False)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_rm_tree_slash_files(self):
        path = "icio/goodbye-2"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree(path)

        self.fs.rm(hdfs_path.rstrip("/") + "/")

        # Check that the directory and its files have been removed
        self.assertFalse(os.path.isdir(real_path))
        self.assertEqual(self.fs.path_exists(hdfs_path), False)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_rm_tree_star_files(self):
        path = "icio/goodbye-3"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree(path)

        self.fs.rm(hdfs_path.rstrip("/") + "/*")

        # Check that the files have been removed but not the root directory
        self.assertTrue(os.path.isdir(real_path))
        self.assertEqual(self.fs.path_exists(hdfs_path), True)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
예제 #8
0
class HadoopFSTestCase(MockSubprocessTestCase):
    """Tests for HadoopFilesystem, run against the mock hadoop set up here.

    setUp() hands fs_hadoop, mock_hadoop_main and self.env to mock_popen();
    presumably that makes the filesystem's hadoop subprocess calls hit the
    mock rather than a real binary (confirm against MockSubprocessTestCase).
    """

    def setUp(self):
        """Create the filesystem under test and wire up the mock hadoop."""
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        """Fill self.env and create the fake hadoop directories and jar."""
        env = self.env = {}

        # fake hadoop home, holding a stand-in streaming jar
        env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')
        streaming_jar = os.path.join(
            'mock_hadoop_home', 'contrib', 'streaming',
            'hadoop-0.X.Y-streaming.jar')
        self.makefile(streaming_jar, 'i are java bytecode')

        env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        env['USER'] = '******'
        # MOCK_HADOOP_LOG is deliberately left unset; command history is
        # obtained other ways

    def make_mock_file(self, name, contents='contents'):
        """Create *name* under 'mock_hdfs_root' via makefile().

        Returns whatever makefile() returns for the new file.
        """
        relative_path = os.path.join('mock_hdfs_root', name)
        return self.makefile(relative_path, contents)

    def test_ls_empty(self):
        # nothing has been created, so the root listing is empty
        listing = list(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, [])

    def test_ls_basic(self):
        # a single file shows up under its hdfs:/// URI
        self.make_mock_file('f')
        listing = list(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///f'])

    def test_ls_basic_2(self):
        # two sibling files are both listed
        for name in ('f', 'f2'):
            self.make_mock_file(name)
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        # files inside subdirectories are listed too
        for name in ('f', 'd/f2'):
            self.make_mock_file(name)
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        listing = sorted(self.fs.ls('s3n://bucket/'))
        self.assertEqual(listing, ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        # a file name containing one space survives listing intact
        self.make_mock_file('foo bar')
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///foo bar'])

    def test_double_space(self):
        # consecutive spaces in a file name must not be collapsed
        self.make_mock_file('foo  bar')
        listing = sorted(self.fs.ls('hdfs:///'))
        self.assertEqual(listing, ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        # plain file: _cat_file() yields the content line by line
        self.make_mock_file('data/foo', 'foo\nfoo\n')
        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        lines = list(self.fs._cat_file(remote_path))
        self.assertEqual(lines, [b'foo\n', b'foo\n'])

    def test_cat_bz2(self):
        # .bz2 file: _cat_file() yields the decompressed lines
        payload = b'foo\n' * 1000
        self.make_mock_file('data/foo.bz2', bz2.compress(payload))
        remote_path = self.fs.path_join('hdfs:///data', 'foo.bz2')

        lines = list(self.fs._cat_file(remote_path))
        self.assertEqual(lines, [b'foo\n'] * 1000)

    def test_cat_gz(self):
        # .gz file: _cat_file() yields the decompressed lines
        payload = b'foo\n' * 10000
        self.make_mock_file('data/foo.gz', gzip_compress(payload))
        remote_path = self.fs.path_join('hdfs:///data', 'foo.gz')

        lines = list(self.fs._cat_file(remote_path))
        self.assertEqual(lines, [b'foo\n'] * 10000)

    def test_du(self):
        # three 4-byte files, one at the root and two under more/
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        expected_sizes = [
            ('hdfs:///', 12),
            ('hdfs:///data1', 4),
            ('hdfs:///more', 8),
            ('hdfs:///more/*', 8),
            ('hdfs:///more/data2', 4),
            ('hdfs:///more/data3', 4),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_mkdir(self):
        # mkdir() is attempted once per MOCK_HADOOP_VERSION value
        for version in ['0.20.0', '0.23.0', '1.2.0', '2.0.0']:
            self.env['MOCK_HADOOP_VERSION'] = version
            self.fs.mkdir('hdfs:///d')
            created_dir = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
            self.assertTrue(os.path.isdir(created_dir))

    def test_path_exists_no(self):
        # nothing was created, so the URI must not exist
        self.assertEqual(self.fs.path_exists('hdfs:///f'), False)

    def test_path_exists_yes(self):
        # once the file has been created, its URI must exist
        self.make_mock_file('f')
        self.assertEqual(self.fs.path_exists('hdfs:///f'), True)

    def test_rm(self):
        # rm() on the hdfs:/// URI removes the file from the local disk
        backing_file = self.make_mock_file('f')
        self.assertTrue(os.path.exists(backing_file))

        self.fs.rm('hdfs:///f')

        self.assertFalse(os.path.exists(backing_file))

    def test_touchz(self):
        # Intentionally empty placeholder: mockhadoop doesn't implement
        # touchz, so there is nothing to run against here.
        pass