def _create_non_empty_dir_(self, path):
    _dir = HDFS(path)
    _dir.create_directory()
    self.assertTrue(_dir.exists(), "source directory not found")
    for i in range(5):
        _file = HDFS(os.path.join(path, str(uuid.uuid4())))
        _file.create(directory=(i % 2 == 0))
        self.assertTrue(_file.exists(), "File was not created")
    return _dir
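# A minimal usage sketch, not part of the original suite: a hypothetical test
# (the name is assumed) that exercises the `_create_non_empty_dir_` helper above
# and cleans up with the same HDFS calls used elsewhere in this file.
def test_delete_non_empty_directory(self):
    _path = os.path.join("/tmp", str(uuid.uuid4()))
    _dir = self._create_non_empty_dir_(_path)
    try:
        self.assertTrue(_dir.exists(), "Directory was not created")
    finally:
        # recursive delete is required because the helper populates the directory
        _dir.delete(recursive=True)
        self.assertFalse(_dir.exists(), "Directory was not removed")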
def test_create_directory(self):
    new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    self.assertFalse(new_dir.exists(), "Directory already exists")
    try:
        new_dir.create_directory()
        self.assertTrue(new_dir.exists(), "Directory was not created")
        self.assertTrue(new_dir.is_directory())
    finally:
        new_dir.delete(recursive=True)
        self.assertFalse(new_dir.exists(), "Directory was not removed")
def should_create_directory_recursively(self):
    _base_dir = os.path.join("/tmp", str(uuid.uuid4()))
    _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
    _dir = HDFS(_path)
    self.assertFalse(_dir.exists(), "Folder already exists")
    try:
        _dir.create_directory(recursive=True)
        self.assertTrue(_dir.exists(), "Folder was not created")
        self.assertTrue(_dir.is_directory(), "New file should be a directory")
    finally:
        HDFS(_base_dir).delete_directory()
        self.assertFalse(_dir.exists(), "Directory was not removed")
        self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")
def test_get_modification_time(self):
    now = datetime.now().strftime("%Y-%m-%d")
    _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        _dir.create_directory()
        _file.create_file()
        self.assertTrue(_dir.exists(), "Dir was not created")
        self.assertTrue(_file.exists(), "File was not created")
        self.assertEqual(now, _dir.modification_time().strftime("%Y-%m-%d"),
                         "Error: dir modification time")
        self.assertEqual(now, _file.modification_time().strftime("%Y-%m-%d"),
                         "Error: file modification time")
    finally:
        _dir.delete_directory()
        _file.delete()
def test_mr_job_command_generation_with_arguments(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _base_dir = HDFS(os.path.join("/tmp", _job_name))
    _base_dir.create_directory()
    try:
        jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce',
                           'hadoop-mapreduce-examples.jar')
        # configure job inputs
        _job_input = HDFS(os.path.join(_base_dir.path, "input"))
        _job_input.create_directory()
        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(_job_input.path)
        # configure job output
        _job_output = HDFS(os.path.join(_base_dir.path, "output"))
        if not os.path.exists(jar):
            self.skipTest("'%s' not found" % jar)
        job = MapReduce.prepare_mapreduce_job(jar=jar,
                                              main_class="wordcount",
                                              name=_job_name) \
            .with_config_option("split.by", "'\\t'") \
            .with_number_of_reducers(3) \
            .with_arguments(_job_input.path, _job_output.path)
        _command_submission_result = job.run()
        _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job failed")
        self.assertTrue(_job_output.exists(), "Error: empty job output")
        # check counters
        self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                counter='HDFS: Number of write operations'))
        self.assertEqual(1, _job_status.counter(group='Job Counters',
                                                counter='Launched map tasks'))
        self.assertEqual(3, _job_status.counter(group='Job Counters',
                                                counter='Launched reduce tasks'))
        self.assertEqual(2168, _job_status.counter(group='File Input Format Counters',
                                                   counter='Bytes Read'))
    finally:
        _base_dir.delete_directory()
def test_streaming_job_with_multiple_inputs(self):
    _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=_job_basedir.path)
        _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
        _additional_datasource.create_directory()
        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(_additional_datasource.path)
        job.take(_additional_datasource.path)
        command_result = job.run()
        command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        # check counters
        self.assertEqual(740,
                         _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                         "counters['Map-Reduce Framework']['Reduce input records']")
    finally:
        _job_basedir.delete_directory()
def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False): if not os.path.exists(HADOOP_STREAMING_JAR): self.skip("Cannot allocate %s" % HADOOP_STREAMING_JAR) _hdfs_basdir = HDFS(base_dir) if not _hdfs_basdir.exists(): _hdfs_basdir.create_directory() _job_input = HDFS(os.path.join(_hdfs_basdir.path, "input")) _job_input.create_directory() _job_output = HDFS(os.path.join(_hdfs_basdir.path, "output")) home = os.path.dirname(__file__) _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py') _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py') LocalFS( os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt') ).copy_to_hdfs( _job_input.path ) return MapReduce.prepare_streaming_job(name="test-mr-streaming-job{}".format(str(uuid.uuid4())), jar=HADOOP_STREAMING_JAR) \ .take(_job_input.path) \ .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \ .save(_job_output.path)
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

import os

from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS

if __name__ == "__main__":
    _basedir = HDFS(os.path.join('/tmp', 'scd.active'))
    _basedir.create_directory()
    _scd_active_snapshot = LocalFS(
        os.path.join(os.path.dirname(__file__), 'resources', 'scd.active.csv'))
    _scd_active_snapshot.copy_to_hdfs(_basedir.path)
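    # A small verification sketch, an assumption rather than part of the original
    # bootstrap script: the SCD flow script below reads the snapshot from
    # /tmp/scd.active/scd.active.csv, so the upload could be checked at that path
    # (this assumes copy_to_hdfs keeps the local file name).
    _uploaded = HDFS(os.path.join(_basedir.path, 'scd.active.csv'))
    assert _uploaded.exists(), "scd.active.csv was not copied to HDFS"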
        file.close()


if __name__ == '__main__':
    log = get_logger("SCD")

    # Prepare paths
    _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig')
    _scd_active_snapshot = '/tmp/scd.active/scd.active.csv'
    _scd_updates = os.path.join(os.path.dirname(__file__), 'resources', 'scd.update.csv')
    _hdfs_job_output = '/tmp/scd.updated'

    _local_folder_to_monitor = LocalFS(os.path.join(os.path.dirname(__file__), 'resources'))
    _hdfs_basedir = HDFS('/tmp/scd.active')
    _hdfs_tmpdir = HDFS('/tmp/scd.tmp')
    _hdfs_tmpdir.create_directory()

    if _scd_updates and LocalFS(_scd_updates).exists():
        # Check whether a file with the last failed step exists and, if so,
        # resume from that step
        step = 'Copying scd updates to raw area on HDFS'
        if os.path.isfile('resources/step'):
            file = open('resources/step', 'r')
            step = file.read()
            file.close()

        flow = FlowRegistry.flow('Flow')
        # Run the flow
        _context = flow.run(action=step,