def test_delete_dir(self):
    """Copy a local directory to HDFS, then verify recursive delete removes it."""
    source = LocalFS(os.path.dirname(os.path.realpath(__file__)))
    target = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    source.copy_to_hdfs(target.path)
    self.assertTrue(target.exists(), "Target HDFS dir does not exists")
    target.delete(recursive=True)
    self.assertFalse(target.exists(), "Target HDFS dir was not deleted")
def apply_hdfs_snapshot(config):
    """Creates initial directory structure on HDFS and applies ACL rules."""
    # Build the snapshot description from the configured dirs and ACL sections.
    snapshot = FsSnapshot.load_from_config(
        config,
        fs_section=CONFIG_HDFS_DIRS_KEY,
        acl_section=CONFIG_ACLS_KEY)
    # Materialize each directory (including parents) and attach its ACLs.
    snapshot.apply(
        mkdir_command=lambda path: HDFS(path).create_directory(recursive=True),
        apply_acls_command=lambda path, acls: HDFS(path).apply_acl(acls))
def test_file_size(self):
    """A file copied to HDFS must report the same size as the local original."""
    local_file = LocalFS(os.path.realpath(__file__))
    remote_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        local_file.copy_to_hdfs(remote_file.path)
        self.assertTrue(remote_file.exists(), "Local file was not copied to HDFS")
        self.assertEqual(remote_file.size(), local_file.size())
    finally:
        # Always remove the temp file from HDFS, even if an assertion failed.
        remote_file.delete()
def should_raise_error_mkdir_not_recursive(self):
    """Non-recursive mkdir of a nested path must raise and create nothing.

    Bug fix: the original called ``.exists()`` on the plain string returned
    by ``os.path.join`` (would raise AttributeError at runtime); the path
    must be wrapped in ``HDFS`` first.
    """
    _base_dir = os.path.join("/tmp", str(uuid.uuid4()))
    _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
    _dir = HDFS(_path)
    # The random base dir must not pre-exist, otherwise the scenario is invalid.
    self.assertFalse(HDFS(_base_dir).exists(), "Folder is already exists")
    try:
        # Creating a multi-level directory without recursive=True must fail.
        self.assertRaises(FileSystemException, _dir.create_directory, recursive=False)
    finally:
        self.assertFalse(_dir.exists(), "File was created")
def should_raise_error_mkdir_not_recursive(self):
    """Non-recursive mkdir of a nested path must raise and create nothing.

    Bug fix: the original called ``.exists()`` on the plain string returned
    by ``os.path.join`` (would raise AttributeError at runtime); the path
    must be wrapped in ``HDFS`` first.
    """
    _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
    _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
    _dir = HDFS(_path)
    # The random base dir must not pre-exist, otherwise the scenario is invalid.
    self.assertFalse(HDFS(_base_dir).exists(), "Folder is already exists")
    try:
        # Creating a multi-level directory without recursive=True must fail.
        self.assertRaises(FileSystemException, _dir.create_directory, recursive=False)
    finally:
        self.assertFalse(_dir.exists(), "File was created")
def _create_non_empty_dir_(self, path):
    """Create a directory at *path*, populate it with five entries, return it."""
    target = HDFS(path)
    target.create_directory()
    self.assertTrue(target.exists(), "source directory not found")
    # Alternate sub-directories (even index) and plain files (odd index).
    for index in range(5):
        entry = HDFS(os.path.join(path, str(uuid.uuid4())))
        entry.create(directory=(index % 2 == 0))
        self.assertTrue(entry.exists(), "File was not created")
    return target
def test_get_permissions(self):
    """Verify permission strings reported for well-known HDFS paths."""
    self.assertEqual("drwxr-xr-x", HDFS("/").permissions(),
                     "Root dir permissions should be 'drwxr-xr-x'")
    # Permissions to '/tmp' folder are different on different CDH versions
    # self.assertEqual("drwxrwxrwt", HDFS("/tmp").permissions(), "Tmp dir permissions should be 'drwxrwxrwxt'")
    hbase_file = HDFS("/hbase/hbase.id")
    # Only assert on hbase.id when an HBase deployment is actually present.
    if not hbase_file.exists():
        return
    self.assertEqual("-rw-r--r--", hbase_file.permissions(),
                     "/hbase/hbase.id permissions should be '-rw-r--r--'")
def test_get_permissions(self):
    """Check permissions of the root dir and (when present) /hbase/hbase.id."""
    root_permissions = HDFS("/").permissions()
    self.assertEqual("drwxr-xr-x", root_permissions,
                     "Root dir permissions should be 'drwxr-xr-x'")
    # Permissions to '/tmp' folder are different on different CDH versions
    # self.assertEqual("drwxrwxrwt", HDFS("/tmp").permissions(), "Tmp dir permissions should be 'drwxrwxrwxt'")
    hbase_id = HDFS("/hbase/hbase.id")
    if hbase_id.exists():
        self.assertEqual("-rw-r--r--", hbase_id.permissions(),
                         "/hbase/hbase.id permissions should be '-rw-r--r--'")
def test_create_directory(self):
    """create_directory makes a new dir; cleanup must remove it again."""
    target = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    self.assertFalse(target.exists(), "Directory is already exists")
    try:
        target.create_directory()
        self.assertTrue(target.exists(), "Directory was not created")
        self.assertTrue(target.is_directory())
    finally:
        target.delete(recursive=True)
        self.assertFalse(target.exists(), "Directory was not removed")
def test_create_file(self):
    """create_file makes a new plain file (not a directory); cleanup removes it."""
    target = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    self.assertFalse(target.exists(), "File is already exists")
    try:
        target.create_file()
        self.assertTrue(target.exists(), "File was not created")
        self.assertFalse(target.is_directory(), "New file should not be a folder")
    finally:
        target.delete()
        self.assertFalse(target.exists(), "File was not removed")
def test_dir_size(self):
    """A directory copied to HDFS must report the same total size as local."""
    fixture_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                "resources", "test_dir_size")
    local_dir = LocalFS(fixture_path)
    remote_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        local_dir.copy_to_hdfs(remote_dir.path)
        self.assertTrue(remote_dir.exists(), "Local file was not copied to HDFS")
        self.assertEqual(remote_dir.size(), local_dir.size())
    finally:
        remote_dir.delete(recursive=True)
def should_create_file_recursively(self):
    """create_file(recursive=True) must create missing parent directories.

    Fix: corrected the "Bse dir" typo in the final assertion message.
    """
    _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
    _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()), 'file.txt')
    _file = HDFS(_path)
    self.assertFalse(_file.exists(), "File is already exists")
    try:
        _file.create_file(recursive=True)
        self.assertTrue(_file.exists(), "File was not created")
        self.assertFalse(_file.is_directory(), "New file should not be a directory")
    finally:
        # Removing the base dir removes everything that was created beneath it.
        HDFS(_base_dir).delete_directory()
        self.assertFalse(_file.exists(), "File was not removed")
        self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")
def test_copy_to_local(self):
    """copy_to_local must download an HDFS file to the local filesystem.

    Fix: the ``finally`` block unconditionally called ``os.remove`` on the
    local path; if the copy never happened that raised OSError and masked
    the real failure. The removal is now guarded by an existence check.
    """
    new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    local_path = os.path.join("/tmp", "copied_from_hdfs")
    self.assertFalse(os.path.exists(local_path))
    try:
        new_file.create_file()
        self.assertTrue(new_file.exists(), "File was not created")
        new_file.copy_to_local(local_path)
        self.assertTrue(os.path.exists(local_path), "File was not copied from HDFS")
    finally:
        new_file.delete()
        self.assertFalse(new_file.exists(), "File was not removed")
        if os.path.exists(local_path):
            os.remove(local_path)
        self.assertFalse(os.path.exists(local_path))
def test_list_files(self):
    """A newly created file must show up in its parent dir's list_files()."""
    parent = HDFS("/tmp")
    new_file = HDFS("/tmp/test.txt")
    try:
        new_file.create(directory=False)
        self.assertTrue(new_file.exists(), "File was not created")
        self.assertTrue(new_file in parent.list_files())
    finally:
        new_file.delete()
        self.assertFalse(new_file.exists(), "File was not deleted")
def test_streaming_job_without_reducer(self):
    """Run a map-only streaming job and verify its status and counters."""
    base_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=base_dir.path, map_only_job=True)
        result = job.run()
        result.if_failed_raise(AssertionError("Cannot run map-only job"))
        status = job.status()
        self.assertTrue(status is not None and status.is_succeeded())
        # check counters
        for expected, group, counter in (
                (2, 'Job Counters', 'Launched map tasks'),
                (11, 'Map-Reduce Framework', 'Map input records'),
                (3252, 'File Input Format Counters', 'Bytes Read')):
            self.assertEqual(expected, status.counter(group=group, counter=counter))
    finally:
        base_dir.delete_directory()
def test_get_description(self):
    """get_description must expose name, size, owner and create date.

    Fix: the final assertion called ``directory.delete()`` (deleting a
    second time and asserting on its return value) instead of verifying
    that the directory no longer exists.
    """
    directory = HDFS("/tmp/bar")
    try:
        directory.create()
        description = directory.get_description()
        self.assertEqual(description.name, "/tmp/bar")
        self.assertEqual(description.size, 0)
        self.assertEqual(description.owner, getpass.getuser())
        self.assertEqual(description.create_date, None)
    finally:
        directory.delete(recursive=True)
        self.assertFalse(directory.exists(), "File was not deleted")
def test_run_preconfigured_job_without_parameters_substitution(self):
    """Run a Pig job built entirely from a Configuration object.

    The Pig commands already contain concrete input/output paths, so the
    parameter values stored in the config should be ignored by the job.
    """
    _test_id = str(uuid.uuid4())
    _job_name = "TEST_PIG_{}".format(_test_id)
    # Stage a small CSV fixture on HDFS to act as the job input.
    _input_dir = self.copy_file_from_local(
        self.temp_file("hello,world,world", ".txt"))
    _output_dir = "/tmp/data_{}".format(_test_id)
    # Inline the concrete paths directly into the Pig script text.
    _commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
    _commands += "B = foreach A generate \$0 as id;"
    _commands += "STORE B into '{}';".format(_output_dir)
    # create job configuration. can also be loaded from .ini file
    _config = Configuration.create()
    _config.set(_job_name, TaskOptions.CONFIG_KEY_COMMANDS_STRING, _commands)
    _config.set(_job_name, TaskOptions.CONFIG_KEY_LOG_BRIEF, 'enabled')
    # Parameters are provided but should not be substituted into the script.
    _config.set(
        _job_name, TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
        'input_dir={}\noutput_dir={}'.format(_input_dir, _output_dir))
    try:
        _pig = Pig.load_preconfigured_job(config=_config, job_name=_job_name)
        _result = _pig.run()
        _result.if_failed_raise(
            AssertionError("test_run_preconfigured_job failed"))
        self.assertTrue(
            HDFS(_output_dir).exists(), "Cannot find job output")
    finally:
        # Remove both the staged input and the job output from HDFS.
        self.delete_file_in_hdfs(_input_dir)
        self.delete_file_in_hdfs(_output_dir)
def test_streaming_job(self):
    """Run the template streaming job and verify its framework counters."""
    base_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=base_dir.path)
        result = job.run()
        result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
        status = job.status()
        self.assertTrue(status is not None and status.is_succeeded())
        # counters
        for expected, counter in ((740, 'Spilled Records'),
                                  (143, 'Reduce output records'),
                                  (370, 'Reduce input records')):
            self.assertEqual(
                expected,
                status.counter(group='Map-Reduce Framework', counter=counter),
                "counters['Map-Reduce Framework']['%s']" % counter)
    finally:
        base_dir.delete_directory()
def test_mr_job_command_generation_with_arguments(self):
    """Submit the example wordcount MR job and verify its status and counters.

    Fix: the jar-existence check now runs *before* any HDFS directories are
    created or data copied; the original only called ``skipTest`` after the
    cluster-side setup was already done (and then cleaned up).
    """
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    jar = os.path.join(os.path.dirname(__file__), 'resources',
                       'mapreduce', 'hadoop-mapreduce-examples.jar')
    if not os.path.exists(jar):
        self.skipTest("'%s' not found" % jar)
    _base_dir = HDFS(os.path.join("/tmp", _job_name))
    _base_dir.create_directory()
    try:
        # configure job inputs
        _job_input = HDFS(os.path.join(_base_dir.path, "input"))
        _job_input.create_directory()
        LocalFS(os.path.join(os.path.dirname(__file__), 'resources',
                             'mapreduce', 'raw-data.txt')
                ).copy_to_hdfs(_job_input.path)
        # configure job output
        _job_output = HDFS(os.path.join(_base_dir.path, "output"))
        job = MapReduce.prepare_mapreduce_job(jar=jar,
                                              main_class="wordcount",
                                              name=_job_name) \
            .with_config_option("split.by", "'\\t'") \
            .with_number_of_reducers(3) \
            .with_arguments(_job_input.path, _job_output.path)
        _command_submission_result = job.run()
        _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
        self.assertTrue(_job_output.exists(), "Error: empty job output")
        # check counters
        self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                counter='HDFS: Number of write operations'))
        self.assertEqual(1, _job_status.counter(group='Job Counters',
                                                counter='Launched map tasks'))
        self.assertEqual(3, _job_status.counter(group='Job Counters',
                                                counter='Launched reduce tasks'))
        self.assertEqual(2168, _job_status.counter(group='File Input Format Counters',
                                                   counter='Bytes Read'))
    finally:
        _base_dir.delete_directory()
def test_get_replicas(self):
    """replicas() must report the replication factor of dirs and files.

    Fixes: the ``assertNotEqual`` message read "dir replicas should be 0"
    (the assertion checks the opposite), and file cleanup now happens in a
    ``finally`` block so the temp file is removed even when an assertion
    fails or the file was never observed via ``exists()``.
    """
    self.assertEqual("0", HDFS("/").replicas(), "Root dir replicas should be 0")
    self.assertNotEqual("0", HDFS("/tmp").replicas(), "dir replicas should not be 0")
    name = uuid.uuid4()
    hdfs_file = HDFS("/tmp/{0}".format(name))
    try:
        hdfs_file.create_file()
        # Force the replication factor to 1 and wait (-w) for it to apply.
        shell.execute_shell_command("hadoop dfs", "-setrep -w 1 /tmp/{0}".format(name))
        if hdfs_file.exists():
            self.assertEqual("1", hdfs_file.replicas(), "Number replicas of file must be 1")
    finally:
        hdfs_file.delete()
        self.assertFalse(hdfs_file.exists())
def load_file_from_local_to_hdfs(context):
    """Copy files from the local resources/tmp folder into partitioned
    /tmp/raw/<partition> directories on HDFS and record the created paths.

    :param context: workflow context dict; created HDFS paths are appended
        under the 'new_pathes' key (spelling kept -- it is part of the
        interface consumed elsewhere).
    """
    context['new_pathes'] = []
    # NOTE(review): LocalFS is iterated directly here; presumably it yields
    # entries contained in resources/tmp -- confirm against LocalFS.__iter__.
    for _file in LocalFS(
            os.path.join(os.path.dirname(__file__), "resources/tmp")):
        # Create the HDFS partition dir (partition derived from the file path).
        HDFS("/tmp/raw/{0}".format(parser_partition(_file.path))) \
            .create(directory=True)
        # NOTE(review): .format() is applied to the already-joined template,
        # so "{0}" is filled with _file.path; looks suspicious if _file.path
        # is absolute -- verify the intended local source path.
        LocalFS(os.path.join(os.path.dirname(__file__), "resources/tmp/{0}").format(_file.path)) \
            .copy_to_hdfs(hdfs_path="/tmp/raw/{0}/".format(parser_partition(_file.path)))
        context['new_pathes'].append("/tmp/raw/{0}".format(
            parser_partition(_file.path)))
def test_import_to_hive(self):
    """Import the 'table_name' table from MySQL into Hive via Sqoop.

    The target HDFS dir is removed up front (Sqoop fails on a pre-existing
    output path); the Hive table is dropped in ``finally`` either way.
    """
    _path = HDFS(os.path.join('/user', getpass.getuser(), 'table_name'))
    try:
        # Sqoop refuses to write into an existing target -- clear it first.
        if _path.exists():
            _path.delete(recursive=_path.is_directory())
        # shell.execute_shell_command('hadoop fs', '-rm -r /user/', getpass.getuser(), '/table_name')
        cmd = Sqoop.import_data().from_rdbms(
            host=MYSQL_SERVER,
            rdbms="mysql",
            username="******",
            password_file="{0}/rdbms.password".format(BASE_DIR),
            database="sqoop_tests"
        ).table(
            table="table_name"
        ).to_hive().run()
        # NOTE(review): result checks are commented out -- the test currently
        # only verifies that the import command runs without raising.
        # self.assertEquals(cmd.status, 0, cmd.stderr)
        # result = shell.execute_shell_command('hadoop fs', '-du -s /user/hive/warehouse/table_name/part-m-*')
        # self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
    finally:
        # Always drop the Hive table so reruns start from a clean state.
        shell.execute_shell_command('hive', "-e 'DROP TABLE IF EXISTS table_name'")
def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
    """Build (but do not run) a streaming MR job over the raw-data fixture.

    Stages the fixture into ``<base_dir>/input`` on HDFS and returns a
    prepared streaming job writing to ``<base_dir>/output``.

    :param base_dir: HDFS base directory for the job's input/output dirs.
    :param map_only_job: when True, no reducer is attached to the job.

    Fix: ``unittest.TestCase`` has no ``skip`` method -- replaced
    ``self.skip(...)`` with ``self.skipTest(...)``, matching the other
    tests in this file.
    """
    if not os.path.exists(HADOOP_STREAMING_JAR):
        self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
    _hdfs_basdir = HDFS(base_dir)
    if not _hdfs_basdir.exists():
        _hdfs_basdir.create_directory()
    _job_input = HDFS(os.path.join(_hdfs_basdir.path, "input"))
    _job_input.create_directory()
    # The output dir must not be pre-created; the job creates it itself.
    _job_output = HDFS(os.path.join(_hdfs_basdir.path, "output"))
    home = os.path.dirname(__file__)
    _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
    _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')
    LocalFS(
        os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
    ).copy_to_hdfs(
        _job_input.path)
    return MapReduce.prepare_streaming_job(
        name="test-mr-streaming-job{}".format(str(uuid.uuid4())),
        jar=HADOOP_STREAMING_JAR) \
        .take(_job_input.path) \
        .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \
        .save(_job_output.path)
def should_create_file_recursively(self):
    """create_file(recursive=True) must create missing parent directories.

    Fix: corrected the "Bse dir" typo in the final assertion message.
    """
    _base_dir = os.path.join("/tmp", str(uuid.uuid4()))
    _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()), "file.txt")
    _file = HDFS(_path)
    self.assertFalse(_file.exists(), "File is already exists")
    try:
        _file.create_file(recursive=True)
        self.assertTrue(_file.exists(), "File was not created")
        self.assertFalse(_file.is_directory(), "New file should not be a directory")
    finally:
        # Removing the base dir removes everything that was created beneath it.
        HDFS(_base_dir).delete_directory()
        self.assertFalse(_file.exists(), "File was not removed")
        self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")
def test_merge(self):
    """Merging an HDFS directory must produce a single local file."""
    resources_root = os.path.dirname(os.path.realpath(__file__))
    source = LocalFS(os.path.join(resources_root, "resources", "test_merge"))
    remote = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    merged = LocalFS(os.path.join(resources_root, "resources", "merged.txt"))
    try:
        source.copy_to_hdfs(remote.path)
        self.assertTrue(remote.exists(), "Local file was not copied to HDFS")
        remote.merge(merged.path)
        self.assertTrue(merged.exists(), "merged file was not copied to local fs")
    finally:
        remote.delete_directory()
        merged.delete()
def test_move_non_empty_dir(self):
    """Moving a populated directory relocates it and removes the original."""
    dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
    src = None
    try:
        src = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
        self.assertFalse(dst.exists(), "dst directory is already exists")
        src.move(dst.path)
        self.assertFalse(src.exists(), "original directory should be deleted")
        self.assertTrue(dst.exists(), "directory move operation failed")
    finally:
        # Cleanup only applies when the source dir was actually created.
        if src:
            src.delete_directory()
            self.assertFalse(src.exists(), "Folder was not deleted")
            dst.delete_directory()
            self.assertFalse(dst.exists(), "Dst Folder was not deleted")
def test_run_commands_from_file(self):
    """Load Pig commands from a file, substitute parameters, and run them.

    Fixes: ``_output_dir`` is now defined before the ``try`` so the
    ``finally`` cleanup cannot hit a NameError, and the argument-less
    ``self.delete_file_in_hdfs()`` call now actually targets the job
    output directory.
    """
    _test_id = str(uuid.uuid4())
    _output_dir = "/tmp/data_{}".format(_test_id)
    # Stage a small CSV fixture on HDFS to act as the job input.
    _inputs = self.copy_file_from_local(
        self.temp_file("hello,world,world", ".txt"))
    commands = "A = load '$input_dir' using PigStorage(',');"
    commands += "B = foreach A generate \$0 as id;"
    commands += "STORE B into '$output_dir';"
    files_s = self.temp_file(commands)
    try:
        pig = Pig.load_commands_from_file(files_s) \
            .with_parameter("input_dir", _inputs) \
            .with_parameter("output_dir", _output_dir)
        self.assertTrue(pig.run().is_ok())
        self.assertTrue(HDFS(_output_dir).exists())
    finally:
        self.delete_local(files_s)
        self.delete_file_in_hdfs(_output_dir)
        self.delete_file_in_hdfs(_inputs)
def test_run_commands_from_string_without_param_substitution(self):
    """Pig commands with concrete paths inlined (no substitution) must run."""
    _test_id = str(uuid.uuid4())
    _output_dir = "/tmp/data_{}".format(_test_id)
    _input_dir = self.copy_file_from_local(
        self.temp_file("hello,world,world", ".txt"))
    # Build the script with the real paths already baked in.
    script = "A = load '{}' using PigStorage(',');".format(_input_dir)
    script += "B = foreach A generate \$0 as id;"
    script += "STORE B into '{}';".format(_output_dir)
    try:
        result = Pig.load_commands_from_string(script).run()
        result.if_failed_raise(
            AssertionError("test_run_commands_from_string failed"))
        self.assertTrue(HDFS(_output_dir).exists(), "Cannot find job output")
    finally:
        self.delete_file_in_hdfs(_input_dir)
        self.delete_file_in_hdfs(_output_dir)
def test_streaming_job_with_multiple_inputs(self):
    """A streaming job fed a second input dir must consume records from both."""
    base_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=base_dir.path)
        # Stage a second copy of the fixture as an additional input.
        extra_input = HDFS(os.path.join(base_dir.path, "input2"))
        extra_input.create_directory()
        LocalFS(os.path.join(os.path.dirname(__file__),
                             'resources', 'mapreduce', 'raw-data.txt')
                ).copy_to_hdfs(extra_input.path)
        job.take(extra_input.path)
        result = job.run()
        result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
        status = job.status()
        self.assertTrue(status is not None and status.is_succeeded())
        # check counters
        self.assertEqual(
            740,
            status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
            "counters['Map-Reduce Framework']['Reduce input records']")
    finally:
        base_dir.delete_directory()
def test_import_to_hive(self):
    """Import the 'table_name' table from MySQL into Hive via Sqoop.

    The target HDFS dir is removed up front (Sqoop fails on a pre-existing
    output path); the Hive table is dropped in ``finally`` either way.
    """
    _path = HDFS(os.path.join('/user', getpass.getuser(), 'table_name'))
    try:
        # Sqoop refuses to write into an existing target -- clear it first.
        if _path.exists():
            _path.delete(recursive=_path.is_directory())
        # shell.execute_shell_command('hadoop fs', '-rm -r /user/', getpass.getuser(), '/table_name')
        cmd = Sqoop.import_data().from_rdbms(
            host=MYSQL_SERVER,
            rdbms="mysql",
            username="******",
            password_file="{0}/rdbms.password".format(BASE_DIR),
            database="sqoop_tests").table(
            table="table_name").to_hive().run()
        # NOTE(review): result checks are commented out -- the test currently
        # only verifies that the import command runs without raising.
        # self.assertEquals(cmd.status, 0, cmd.stderr)
        # result = shell.execute_shell_command('hadoop fs', '-du -s /user/hive/warehouse/table_name/part-m-*')
        # self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
    finally:
        # Always drop the Hive table so reruns start from a clean state.
        shell.execute_shell_command(
            'hive', "-e 'DROP TABLE IF EXISTS table_name'")
def test_recursive_list_files(self):
    """recursive_list_files must return entries from nested directories.

    Fix: the cleanup assertion said "Folder was not deleted" while checking
    the *file*; it now verifies both the file and the folder are gone, with
    matching messages.
    """
    basedir = HDFS("/tmp")
    new_folder = HDFS("/tmp/test123")
    new_file = HDFS("/tmp/test123/test.txt")
    try:
        new_folder.create(directory=True)
        self.assertTrue(new_folder.exists(), "Folder was not created")
        new_file.create(directory=False)
        self.assertTrue(new_file.exists(), "File was not created")
        files = basedir.recursive_list_files()
        self.assertTrue(new_file in files)
        self.assertTrue(new_folder in files)
    finally:
        new_folder.delete(recursive=True)
        self.assertFalse(new_file.exists(), "File was not deleted")
        self.assertFalse(new_folder.exists(), "Folder was not deleted")
def test_get_modification_time(self):
    """Modification dates of a freshly created dir and file must be today."""
    today = datetime.now().strftime("%Y-%m-%d")
    fresh_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    fresh_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        fresh_dir.create_directory()
        fresh_file.create_file()
        self.assertTrue(fresh_dir.exists(), "Dir was not created")
        self.assertTrue(fresh_file.exists(), "File was not created")
        self.assertEqual(today, fresh_dir.modification_time().strftime("%Y-%m-%d"),
                         "Error: dir modification time")
        self.assertEqual(today, fresh_file.modification_time().strftime("%Y-%m-%d"),
                         "Error: File modification time")
    finally:
        fresh_dir.delete_directory()
        fresh_file.delete()
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

from merlin.fs.hdfs import HDFS

BASE_DIR = "/tmp"

if __name__ == "__main__":
    # Cleans resources after flow: drop temp datasets (recursively) and the
    # Sqoop password file (plain delete).
    for _name, _recursive in (("data_to_export", True),
                              ("data_from_import", True),
                              ("rdbms.password", False)):
        _node = HDFS("{0}/{1}".format(BASE_DIR, _name))
        if _node.exists():
            if _recursive:
                _node.delete(recursive=True)
            else:
                _node.delete()
def on_flow_failed(context):
    """Failure hook: drop the raw staging dir on HDFS if it was created."""
    raw_dir = HDFS("{0}/raw".format(BASE_DIR))
    if raw_dir.exists():
        raw_dir.delete(recursive=True)
def load_file_on_hdfs(context):
    """Collect descriptions of all regular files under /tmp/raw into context.

    Directories returned by the recursive listing are skipped; only file
    descriptions are stored under the 'files_on_HDFS' key.
    """
    raw_root = HDFS('/tmp/raw')
    context['files_on_HDFS'] = [
        entry.get_description()
        for entry in raw_root.recursive_list_files()
        if not entry.is_directory()
    ]
def test_distcp(self):
    """distcp between full hdfs:// URIs must copy the file on the cluster.

    Fix: the last assertion called ``directory.delete()`` (a second delete,
    asserting on its return value) instead of checking that the directory
    no longer exists; the message was also corrected to mention the
    directory rather than the file.
    """
    directory = HDFS("/tmp/bar")
    directory.create()
    new_file = HDFS("/tmp/test_dist.txt")
    new_file.create(directory=False)
    _host = "sandbox.hortonworks.com"
    try:
        self.assertTrue(new_file.exists(), "File was not created")
        _file = HDFS("hdfs://{host}:8020/tmp/test_dist.txt".format(host=_host))
        _file.distcp(dest="hdfs://{host}:8020/tmp/bar/test_dist.txt".format(host=_host))
        file_after_copy = HDFS("/tmp/bar/test_dist.txt")
        self.assertTrue(file_after_copy.exists(), "File was not copied")
    finally:
        new_file.delete()
        directory.delete(recursive=True)
        self.assertFalse(new_file.exists(), "File was not deleted")
        self.assertFalse(directory.exists(), "Directory was not deleted")
def test_apply_hdfs_snapshot(self):
    """Bootstrap from the config file: dirs are created and ACLs applied.

    Fix: the '/tmp/raw/users' pii assertion inspected ``sales_dir_acls``
    (a copy-paste slip contradicting its own message); it now checks
    ``users_dir_acls``.
    """
    _config_file = os.path.join(os.path.dirname(__file__),
                                'resources', 'bootsrap', 'bootstrap.ini')
    _raw_sales_dir = HDFS('/tmp/raw/sales')
    _raw_users_dir = HDFS('/tmp/raw/users')
    _raw_tmp_dir = HDFS('/tmp/raw/tmp')
    try:
        # run bootstrap script
        metastore = IniFileMetaStore(file=_config_file)
        _config = Configuration.load(metastore)
        apply_hdfs_snapshot(_config)
        # assert directories were created
        self.assertTrue(_raw_sales_dir.exists(), "Directory '/tmp/raw/sales' was not created")
        self.assertTrue(_raw_users_dir.exists(), "Directory '/tmp/raw/users' was not created")
        self.assertTrue(_raw_tmp_dir.exists(), "Directory '/tmp/raw/tmp' was not created")
        # assert acls were applied
        sales_dir_acls = _raw_sales_dir.get_acls()
        users_dir_acls = _raw_users_dir.get_acls()
        self.assertIsNotNone(sales_dir_acls, '/tmp/raw/sales : ACL were not applied')
        self.assertTrue('group:sys-pii:r-x' in sales_dir_acls,
                        '/tmp/raw/sales : pii acl was not applied')
        self.assertTrue('group:sales:r--' in sales_dir_acls,
                        '/tmp/raw/sales : salse acl was not applied')
        self.assertIsNotNone(users_dir_acls, '/tmp/raw/users : ACL were not applied')
        self.assertTrue('group:sys-pii:r-x' in users_dir_acls,
                        '/tmp/raw/users : pii acl was not applied')
    finally:
        _test_basedir = HDFS('/tmp/raw')
        _test_basedir.delete_directory()
        self.assertFalse(_test_basedir.exists(), "ERROR: clean up failed")
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

import os

from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS

if __name__ == "__main__":
    # Stage the active SCD snapshot into /tmp/scd.active on HDFS.
    _target_dir = HDFS(os.path.join('/tmp', 'scd.active'))
    _target_dir.create_directory()
    _snapshot_csv = os.path.join(os.path.dirname(__file__),
                                 'resources', 'scd.active.csv')
    LocalFS(_snapshot_csv).copy_to_hdfs(_target_dir.path)
file.write(action_name) file.close() if __name__ == '__main__': log = get_logger("SCD") # Prepare paths _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig') _scd_active_snapshot = '/tmp/scd.active/scd.active.csv' _scd_updates = os.path.join(os.path.dirname(__file__), 'resources', 'scd.update.csv') _hdfs_job_output = '/tmp/scd.updated' _local_folder_to_monitor = LocalFS(os.path.join(os.path.dirname(__file__), 'resources')) _hdfs_basedir = HDFS('/tmp/scd.active') _hdfs_tmpdir = HDFS('/tmp/scd.tmp') _hdfs_tmpdir.create_directory() if _scd_updates and LocalFS(_scd_updates).exists(): # Checks if file with last failed step is exists # and reads this step step = 'Copying scd updates to raw area on HDFS' if os.path.isfile('resources/step'): file = open('resources/step', 'r') step = file.read() file.close() flow = FlowRegistry.flow('Flow') # Runs flow
# for additional information regarding copyright ownership and licensing. # from merlin.tools.hive import Hive from ConfigParser import RawConfigParser from merlin.fs.localfs import LocalFS from merlin.fs.hdfs import HDFS from merlin.fs.ftp import ftp_client import os BASE_DIR = "/tmp" if __name__ == "__main__": # create empty directory '/tmp/raw' on HDFS hdfs_file = HDFS("{0}/raw".format(BASE_DIR)) if hdfs_file.exists(): hdfs_file.delete(recursive=True) hdfs_file.create(directory=True) # create empty directory '/tmp/base_dir' on FTP config = RawConfigParser() config.read( os.path.join(os.path.dirname(__file__), "resources/ftp_config.ini")) host_download = config.get("ftp", "host.download") user_name = config.get("ftp", "user.name") password = config.get("ftp", "password") path = config.get("ftp", "path") ftp = ftp_client(host=host_download, login=user_name, password=password,
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

import os

from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS

if __name__ == "__main__":
    # Prepare the HDFS landing dir, then upload the active SCD snapshot.
    _active_dir = HDFS(os.path.join('/tmp', 'scd.active'))
    _active_dir.create_directory()
    _local_snapshot = LocalFS(os.path.join(os.path.dirname(__file__),
                                           'resources',
                                           'scd.active.csv'))
    _local_snapshot.copy_to_hdfs(_active_dir.path)
def test_create(self):
    """create(directory=...) must create either a plain file or a directory."""
    file_node = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    dir_node = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    # test new file creation
    try:
        file_node.create(directory=False)
        self.assertTrue(file_node.exists(), "File was not created")
        self.assertFalse(file_node.is_directory(), "New file should not be a directory")
    finally:
        file_node.delete()
        self.assertFalse(file_node.exists(), "File was not removed")
    # test new folder creation
    try:
        dir_node.create(directory=True)
        self.assertTrue(dir_node.exists(), "Directory was not created")
        self.assertTrue(dir_node.is_directory(), "New file should be a directory")
    finally:
        dir_node.delete(recursive=True)
        self.assertFalse(dir_node.exists(), "Directory was not removed")
# See the NOTICE file and the LICENSE file distributed with this work # for additional information regarding copyright ownership and licensing. # from merlin.tools.hive import Hive from ConfigParser import RawConfigParser from merlin.fs.localfs import LocalFS from merlin.fs.hdfs import HDFS from merlin.fs.ftp import ftp_client import os BASE_DIR = "/tmp" if __name__ == "__main__": hdfs_file = HDFS("{0}/raw".format(BASE_DIR)) if hdfs_file.exists(): hdfs_file.delete(recursive=True) config = RawConfigParser() config.read(os.path.join(os.path.dirname(__file__), "resources/ftp_config.ini")) host_download = config.get("ftp", "host.download") user_name = config.get("ftp", "user.name") password = config.get("ftp", "password") path = config.get("ftp", "path") ftp = ftp_client(host=host_download, login=user_name, password=password, path="/tmp") if ftp.exists():
def test_get_owner(self):
    """owner() must report 'hdfs' for system dirs and 'hbase' for hbase.id."""
    self.assertEqual("hdfs", HDFS("/").owner(), "ERROR: Root dir owner")
    self.assertEqual("hdfs", HDFS("/tmp").owner(), "ERROR: /tmp dir owner")
    hbase_id = HDFS("/hbase/hbase.id")
    # Only meaningful on clusters where HBase is deployed.
    if hbase_id.exists():
        self.assertEqual("hbase", hbase_id.owner(),
                         "ERROR: /hbase/hbase.id dir owner")