def test_delete_dir(self):
    local = LocalFS(os.path.dirname(os.path.realpath(__file__)))
    hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    local.copy_to_hdfs(hdfs_file.path)
    self.assertTrue(hdfs_file.exists(), "Target HDFS dir does not exist")
    hdfs_file.delete(recursive=True)
    self.assertFalse(hdfs_file.exists(), "Target HDFS dir was not deleted")
def download_dir(self, path, local_path, predicate=lambda path, connector: True, recursive=True):
    """
    Copies a remote directory (path) and its files from the SFTP server to local_path
    on the local host. Also copies inner directories if 'recursive' is True, and
    filters files with 'predicate' if one is given.
    :param path: path to directory on ftp
    :param local_path: path to directory on local file system
    :param predicate: predicate used to filter files
    :param recursive: copies all inner directories at the given path if True
    :type path: str
    :type local_path: str
    :type recursive: bool
    """
    path = self.__normalize_path(path)
    self.__assert_exists(path)
    self.__assert_is_dir(path)
    local_path = self.__normalize_path(local_path)
    LocalFS(local_path).assert_exists()
    LocalFS(local_path).assert_is_dir()
    local_path = os.path.join(local_path, self.__get_name(path))
    LocalFS(local_path).create_directory()
    list_ = self.list_files(path)
    if predicate:
        list_ = [path for path in list_ if predicate(path, self)]
    for tmp in list_:
        if recursive and self.is_directory(tmp):
            self.download_dir(tmp, local_path, predicate, recursive)
        elif not self.is_directory(tmp):
            self.download_file(tmp, local_path)
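# A minimal usage sketch for download_dir, assuming `sftp` is an already connected
# SFTP connector exposing the API above; remote and local paths are hypothetical.
def example_download_dir(sftp):
    # copy /data and all of its sub-directories into /tmp/downloads,
    # keeping only the files that end with '.csv'
    only_csv = lambda remote_path, connector: remote_path.endswith('.csv')
    sftp.download_dir('/data', '/tmp/downloads', predicate=only_csv, recursive=True)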
def download_file(self, path, local_path):
    """
    Copies a remote file (path) from the SFTP server to the local host as local_path
    :param path: path to file on ftp
    :param local_path: path to the future file or to an existing directory
    :type path: str
    :type local_path: str
    """
    path = self.__normalize_path(path)
    self.__assert_exists(path)
    local_path = self.__normalize_path(local_path)
    self.__assert_is_not_dir(path)
    if LocalFS(local_path).exists():
        _local_dst = local_path if not LocalFS(local_path).is_directory() else \
            os.path.join(local_path, self.__get_name(path))
        self.__ftp_instance.get(remotepath=path, localpath=_local_dst)
    elif LocalFS(self.__get_basename(local_path)).exists():
        local = LocalFS(self.__get_basename(local_path))
        if local.is_directory():
            self.__ftp_instance.get(remotepath=path, localpath=local_path)
        else:
            raise FTPFileError("'{0}' is not a directory".format(local_path))
    else:
        raise FTPFileError("'{0}' does not exist".format(local_path))
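# A minimal usage sketch for download_file, assuming `sftp` is an already connected
# SFTP connector exposing the API above; the paths are hypothetical.
def example_download_file(sftp):
    # local_path is an existing directory: the file keeps its remote name
    sftp.download_file('/data/report.csv', '/tmp/downloads')
    # local_path names the target file explicitly inside an existing directory
    sftp.download_file('/data/report.csv', '/tmp/downloads/report_copy.csv')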
def test_file_size(self):
    local = LocalFS(os.path.realpath(__file__))
    hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        local.copy_to_hdfs(hdfs_file.path)
        self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
        self.assertEqual(hdfs_file.size(), local.size())
    finally:
        hdfs_file.delete()
def test_dir_size(self):
    local_basedir = os.path.dirname(os.path.realpath(__file__))
    local = LocalFS(os.path.join(local_basedir, "resources", "test_dir_size"))
    hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        local.copy_to_hdfs(hdfs_file.path)
        self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
        expected_fsize = local.size()
        self.assertEqual(hdfs_file.size(), expected_fsize)
    finally:
        hdfs_file.delete(recursive=True)
def _run_(self, application, test_id=None):
    # avoid an eagerly evaluated default argument: generate a fresh id per call
    test_id = test_id if test_id else str(uuid.uuid4())
    basedir = LocalFS(os.path.join("/tmp", "test_spark", test_id))
    try:
        basedir.create_directory()
        _app_input = self.input_path
        _app_output_dir = os.path.join(basedir.path, "output")
        status = application.run('file:' + _app_input, 'file:' + _app_output_dir)
        self.assertTrue(status.is_ok(), status.stderr())
        self.assertTrue(os.path.exists(_app_output_dir), status.stderr())
    finally:
        basedir.delete_directory()
def load_file_from_local_to_hdfs(context):
    context['new_pathes'] = []
    for _file in LocalFS(os.path.join(os.path.dirname(__file__), "resources/tmp")):
        HDFS("/tmp/raw/{0}".format(parser_partition(_file.path))).create(directory=True)
        LocalFS(os.path.join(os.path.dirname(__file__), "resources/tmp/{0}").format(_file.path)) \
            .copy_to_hdfs(hdfs_path="/tmp/raw/{0}/".format(parser_partition(_file.path)))
        context['new_pathes'].append("/tmp/raw/{0}".format(parser_partition(_file.path)))
def __create_local_dir(self, path, local_path):
    """
    Creates a local directory named after the given remote path
    :param path: path to file on ftp
    :param local_path: path to the parent directory on the local file system
    :return: path to the created local directory
    """
    LocalFS(local_path).assert_exists()
    name = os.path.basename(path)
    local_path = os.path.join(local_path, name)
    LocalFS(local_path).create_directory()
    return local_path
def upload(self, path, local_path, update=False):
    """
    Copies a local file (local_path) to the SFTP server as path.
    Overwrites an existing file if parameter 'update' is True
    :param path: path to file on ftp
    :param local_path: path to file on local file system
    :param update: overwrites the file on ftp if True
    :type path: str
    :type local_path: str
    :type update: bool
    """
    path = self.__normalize_path(path)
    local_path = self.__normalize_path(local_path)
    LocalFS(local_path).assert_exists()
    if self.exists(path):
        if not self.is_directory(path):
            if update:
                self.__ftp_instance.put(localpath=local_path, remotepath=path)
            else:
                raise FTPFileError("'{0}' already exists".format(path))
        else:
            path_name = self.__get_name(local_path)
            self.__ftp_instance.put(localpath=local_path,
                                    remotepath=os.path.join(path, path_name))
    else:
        path_name = self.__get_basename(path)
        if self.exists(path_name):
            self.__ftp_instance.put(localpath=local_path, remotepath=path)
        else:
            raise FileNotFoundException(
                "'{path}' does not exist".format(path=path_name))
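# A minimal usage sketch for upload, assuming `sftp` is an already connected SFTP
# connector exposing the API above; the paths are hypothetical.
def example_upload(sftp):
    # upload into an existing remote directory, keeping the local file name
    sftp.upload('/remote/dir', '/tmp/report.csv')
    # overwrite an existing remote file; without update=True this raises FTPFileError
    sftp.upload('/remote/dir/report.csv', '/tmp/report.csv', update=True)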
def upload(self, path, local_path, update=False):
    """
    Copies a local file (local_path) to the SFTP server as path.
    Overwrites an existing file if parameter 'update' is True
    :param path: path to file on ftp
    :param local_path: path to file on local file system
    :param update: overwrites the file on ftp if True
    :type path: str
    :type local_path: str
    :type update: bool
    """
    base_path = self.base_dir(path)
    LocalFS(local_path).assert_exists()
    if self.exists(path):
        if self.is_directory(path):
            name_files = os.path.basename(local_path)
            self.__copy_file_from_local(local_path, "/".join([path, name_files]))
        else:
            self.__assert_is_update(update)
            self.__copy_file_from_local(local_path, path)
    else:
        self.__assert_exists(base_path)
        name_files = path[len(base_path):]
        self.__copy_file_from_local(local_path, "/".join([base_path, name_files]))
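# A minimal usage sketch for this variant of upload (the one delegating to
# __copy_file_from_local), assuming `ftp` is an already connected connector of the
# same class; the paths are hypothetical.
def example_ftp_upload(ftp):
    # target is an existing remote directory: the local file name is appended to it
    ftp.upload('/remote/dir', '/tmp/report.csv')
    # target does not exist yet: the file is stored under its existing base directory
    ftp.upload('/remote/dir/new_name.csv', '/tmp/report.csv')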
def test_merge(self):
    basedir = os.path.dirname(os.path.realpath(__file__))
    local = LocalFS(os.path.join(basedir, "resources", "test_merge"))
    hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    merged_file = LocalFS(os.path.join(basedir, "resources", "merged.txt"))
    try:
        local.copy_to_hdfs(hdfs_file.path)
        self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
        hdfs_file.merge(merged_file.path)
        self.assertTrue(merged_file.exists(), "merged file was not copied to local fs")
    finally:
        hdfs_file.delete_directory()
        merged_file.delete()
def test_apply_local_fs_snapshot(self):
    _config_file = os.path.join(os.path.dirname(__file__),
                                'resources', 'bootsrap', 'bootstrap.ini')
    test_dir = LocalFS('/tmp/data_tmp')
    if test_dir.exists():
        test_dir.delete_directory()
    try:
        metastore = IniFileMetaStore(file=_config_file)
        _config = Configuration.load(metastore)
        apply_localfs_snapshot(_config)
        self.assertTrue(test_dir.exists(), "Folder was not created")
    finally:
        test_dir.delete_directory()
def __copy_file_from_local(self, local_path, path, create_parents=False):
    """
    Copies a file from the local file system to the server
    """
    LocalFS(local_path).assert_exists()
    base_dir = self.base_dir(path)
    if not self.exists(base_dir):
        self.__assert_recursive(create_parents)
        self.create_dir(base_dir)
    # close the local file handle once the transfer is done
    with open(local_path, "rb") as local_file:
        self.__ftp_driver.storbinary("STOR {0}".format(path), local_file)
def test_broker(self):
    shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
    local = LocalFS("/tmp/kafka-test")
    if not local.exists():
        local.create_directory()
    thread = KafkaThreadBroker()
    thread.daemon = True
    thread.start()
    sleep(TIME)
    cmd = shell.execute_shell_command('netstat -lntu')
    self.assertTrue("9010" in cmd.stdout, cmd.stdout)
    local.delete_directory()
    shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
def download_file(self, path, local_path):
    """
    Copies a remote file (path) from the SFTP server to the local host as local_path
    :param path: path to file on ftp
    :param local_path: path to an existing local directory
    :type path: str
    :type local_path: str
    """
    self.__assert_exists(path)
    LocalFS(local_path).assert_exists()
    self.__assert_is_not_dir(path)
    # write the remote file under its base name and close the handle afterwards
    with open(os.path.join(local_path, os.path.basename(path)), "w+b") as local_file:
        self.__ftp_driver.retrbinary("RETR {0}".format(path), local_file.write)
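# A minimal usage sketch for this variant of download_file, assuming `ftp` is an
# already connected connector of the same class; local_path must be an existing
# local directory and the paths are hypothetical.
def example_ftp_download_file(ftp):
    # the remote file is written into /tmp/downloads under its base name
    ftp.download_file('/remote/dir/report.csv', '/tmp/downloads')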
def test_mr_job_command_generation_with_arguments(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _base_dir = HDFS(os.path.join("/tmp", _job_name))
    _base_dir.create_directory()
    try:
        jar = os.path.join(os.path.dirname(__file__), 'resources',
                           'mapreduce', 'hadoop-mapreduce-examples.jar')
        # configure job inputs
        _job_input = HDFS(os.path.join(_base_dir.path, "input"))
        _job_input.create_directory()
        LocalFS(os.path.join(os.path.dirname(__file__), 'resources',
                             'mapreduce', 'raw-data.txt')).copy_to_hdfs(_job_input.path)
        # configure job output
        _job_output = HDFS(os.path.join(_base_dir.path, "output"))
        if not os.path.exists(jar):
            self.skipTest("'%s' not found" % jar)
        job = MapReduce.prepare_mapreduce_job(jar=jar,
                                              main_class="wordcount",
                                              name=_job_name) \
            .with_config_option("split.by", "'\\t'") \
            .with_number_of_reducers(3) \
            .with_arguments(_job_input.path, _job_output.path)
        _command_submission_result = job.run()
        _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
        self.assertTrue(_job_output.exists(), "Error: empty job output")
        # check counters
        self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                counter='HDFS: Number of write operations'))
        self.assertEqual(1, _job_status.counter(group='Job Counters',
                                                counter='Launched map tasks'))
        self.assertEqual(3, _job_status.counter(group='Job Counters',
                                                counter='Launched reduce tasks'))
        self.assertEqual(2168, _job_status.counter(group='File Input Format Counters',
                                                   counter='Bytes Read'))
    finally:
        _base_dir.delete_directory()
def test_streaming_job_with_multiple_inputs(self):
    _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=_job_basedir.path)
        _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
        _additional_datasource.create_directory()
        LocalFS(os.path.join(os.path.dirname(__file__), 'resources',
                             'mapreduce', 'raw-data.txt')).copy_to_hdfs(_additional_datasource.path)
        job.take(_additional_datasource.path)
        command_result = job.run()
        command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        # check counters
        self.assertEqual(740,
                         _job_status.counter(group='Map-Reduce Framework',
                                             counter='Reduce input records'),
                         "counters['Map-Reduce Framework']['Reduce input records']")
    finally:
        _job_basedir.delete_directory()
def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
    if not os.path.exists(HADOOP_STREAMING_JAR):
        # use unittest's skipTest to skip when the streaming jar is missing
        self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
    _hdfs_basdir = HDFS(base_dir)
    if not _hdfs_basdir.exists():
        _hdfs_basdir.create_directory()
    _job_input = HDFS(os.path.join(_hdfs_basdir.path, "input"))
    _job_input.create_directory()
    _job_output = HDFS(os.path.join(_hdfs_basdir.path, "output"))
    home = os.path.dirname(__file__)
    _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
    _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')
    LocalFS(os.path.join(home, 'resources', 'mapreduce', 'raw-data.txt')).copy_to_hdfs(_job_input.path)
    return MapReduce.prepare_streaming_job(name="test-mr-streaming-job{}".format(str(uuid.uuid4())),
                                           jar=HADOOP_STREAMING_JAR) \
        .take(_job_input.path) \
        .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \
        .save(_job_output.path)
if ftp.exists():
    ftp.delete(recursive=True)
ftp.create(make_dir=True)

# upload files to the directory on FTP
ftp.upload(local_path=os.path.join(os.path.dirname(__file__), "resources/file_12.11.2014_.txt"))
ftp.upload(local_path=os.path.join(os.path.dirname(__file__), "resources/file_13.11.2014_.txt"))
ftp.upload(local_path=os.path.join(os.path.dirname(__file__), "resources/file_14.11.2014_.txt"))

# upload files to HDFS / create directories
hdfs_file = HDFS("{0}/raw/12.11.2014".format(BASE_DIR))
hdfs_file.create(directory=True)
local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/file_12.11.2014_.txt'))
local_file.copy_to_hdfs(hdfs_path="{0}/raw/12.11.2014".format(BASE_DIR))

hdfs_file = HDFS("{0}/raw/13.11.2014".format(BASE_DIR))
hdfs_file.create(directory=True)
local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/file_13.11.2014_.txt'))
local_file.copy_to_hdfs(hdfs_path="{0}/raw/13.11.2014".format(BASE_DIR))

# create an empty local directory 'tmp' in folder 'resources'
local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/tmp'))
if local_file.exists():
    local_file.delete_directory()
local_file.create(directory=True)
def on_flow_failed(context):
    local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/tmp'))
    if local_file.exists():
        local_file.delete_directory()
def on_flow_failed(context):
    local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/tmp'))
    if local_file.exists():
        local_file.delete_directory()
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

import os

from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS

if __name__ == "__main__":
    _basedir = HDFS(os.path.join('/tmp', 'scd.active'))
    _basedir.create_directory()
    _scd_active_snapshot = LocalFS(os.path.join(os.path.dirname(__file__),
                                                'resources', 'scd.active.csv'))
    _scd_active_snapshot.copy_to_hdfs(_basedir.path)
                 password=password, path=path)
if ftp.exists():
    ftp.delete(recursive=True)
ftp.create(make_dir=True)

# upload files to the directory on FTP
ftp.upload(local_path=os.path.join(os.path.dirname(__file__), "resources/file_12.11.2014_.txt"))
ftp.upload(local_path=os.path.join(os.path.dirname(__file__), "resources/file_13.11.2014_.txt"))
ftp.upload(local_path=os.path.join(os.path.dirname(__file__), "resources/file_14.11.2014_.txt"))

# upload files to HDFS / create directories
hdfs_file = HDFS("{0}/raw/12.11.2014".format(BASE_DIR))
hdfs_file.create(directory=True)
local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/file_12.11.2014_.txt'))
local_file.copy_to_hdfs(hdfs_path="{0}/raw/12.11.2014".format(BASE_DIR))

hdfs_file = HDFS("{0}/raw/13.11.2014".format(BASE_DIR))
hdfs_file.create(directory=True)
local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/file_13.11.2014_.txt'))
local_file.copy_to_hdfs(hdfs_path="{0}/raw/13.11.2014".format(BASE_DIR))

# create an empty local directory 'tmp' in folder 'resources'
local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/tmp'))
if local_file.exists():
    local_file.delete_directory()
local_file.create(directory=True)
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

import os

from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS

if __name__ == "__main__":
    _basedir = HDFS(os.path.join('/tmp', 'scd.active'))
    _basedir.create_directory()
    _scd_active_snapshot = LocalFS(
        os.path.join(os.path.dirname(__file__), 'resources', 'scd.active.csv'))
    _scd_active_snapshot.copy_to_hdfs(_basedir.path)
def upload_to_hdfs_updates(context):
    LocalFS(_scd_updates).copy_to_hdfs(_hdfs_tmpdir.path)
import os

from merlin.fs.localfs import LocalFS
from merlin.fs.hdfs import HDFS

BASE_DIR = "/tmp"


def clean_resources():
    """
    Cleans up resources left over from a previous flow run.
    """
    hdfs_file = HDFS("{0}/data_to_export".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)
    hdfs_file = HDFS("{0}/data_from_import".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)
    hdfs_file = HDFS("{0}/rdbms.password".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete()


if __name__ == "__main__":
    clean_resources()
    # Copies new configuration files from the 'resources' folder
    LocalFS(path=os.path.join(os.path.dirname(__file__),
                              'resources/rdbms.password')).copy_to_hdfs(hdfs_path=BASE_DIR)
def apply_localfs_snapshot(config):
    """Creates the initial directory structure on the local file system"""
    _localfs_snapshot = FsSnapshot.load_from_config(
        config, fs_section=CONFIG_LOCAL_FS_DIRS_KEY)
    _localfs_snapshot.apply(
        mkdir_command=lambda path: LocalFS(path).create_directory())
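# A minimal usage sketch mirroring test_apply_local_fs_snapshot above: load a
# Configuration from an ini-backed metastore and create every directory listed in
# its local-fs section. The ini path is hypothetical; the section name comes from
# CONFIG_LOCAL_FS_DIRS_KEY and its layout is not shown in these snippets.
def example_apply_localfs_snapshot():
    metastore = IniFileMetaStore(file='/path/to/bootstrap.ini')
    config = Configuration.load(metastore)
    apply_localfs_snapshot(config)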
if __name__ == "__main__":
    hdfs_file = HDFS("{0}/raw".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)

    config = RawConfigParser()
    config.read(os.path.join(os.path.dirname(__file__), "resources/ftp_config.ini"))
    host_download = config.get("ftp", "host.download")
    user_name = config.get("ftp", "user.name")
    password = config.get("ftp", "password")
    path = config.get("ftp", "path")

    ftp = ftp_client(host=host_download, login=user_name,
                     password=password, path="/tmp")
    if ftp.exists():
        ftp.delete(recursive=True)

    local_file = LocalFS(path=os.path.join(os.path.dirname(__file__), 'resources/tmp'))
    if local_file.exists():
        local_file.delete_directory()

    hive = Hive.load_queries_from_string(query="DROP DATABASE IF EXISTS hive_monitoring CASCADE;")
    hive.run()
    # record the name of the current step so a failed flow can be resumed
    with open('resources/step', 'w') as step_file:
        step_file.write(action_name)


if __name__ == '__main__':
    log = get_logger("SCD")

    # Prepare paths
    _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig')
    _scd_active_snapshot = '/tmp/scd.active/scd.active.csv'
    _scd_updates = os.path.join(os.path.dirname(__file__), 'resources', 'scd.update.csv')
    _hdfs_job_output = '/tmp/scd.updated'

    _local_folder_to_monitor = LocalFS(os.path.join(os.path.dirname(__file__), 'resources'))
    _hdfs_basedir = HDFS('/tmp/scd.active')
    _hdfs_tmpdir = HDFS('/tmp/scd.tmp')
    _hdfs_tmpdir.create_directory()

    if _scd_updates and LocalFS(_scd_updates).exists():
        # If a file with the last failed step exists, read that step
        step = 'Copying scd updates to raw area on HDFS'
        if os.path.isfile('resources/step'):
            with open('resources/step', 'r') as step_file:
                step = step_file.read()

        flow = FlowRegistry.flow('Flow')