def test_rename(self):
    utils.create_dir(self.path_)
    test_exists = utils.path_exists(self.path_)
    self.assertTrue(test_exists)

    utils.rename(self.path_, '/temp')
    test_exists = utils.path_exists(self.path_)
    self.assertFalse(test_exists)

    temp_exists = utils.path_exists('/temp')
    self.assertTrue(temp_exists)
    utils.delete_dir('/temp')
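# The tests in this file reference `self.path_` and `self.temp_path_`, which are
# defined on the test class outside this excerpt. A minimal sketch of the
# scaffolding they assume (the class name, base class, and paths are assumptions,
# not the project's actual fixtures):
import unittest

class HDFSUtilsTestCase(unittest.TestCase):
    path_ = '/test'            # hypothetical source directory used by the tests
    temp_path_ = '/test_temp'  # hypothetical destination directory

    def tearDown(self):
        # Remove anything a test left behind so tests stay isolated.
        for p in (self.path_, self.temp_path_):
            if utils.path_exists(p):
                utils.delete_dir(p, recursive=True)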
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine.
    """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err), err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            logging.error('{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'.format(
                type(err).__name__))
            logging.warning('Deleting directory recursively will delete all the recommendation data.')
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error('{}: An error occurred while deleting directories recursively.\n{}\nAborting...'.format(
                type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)
            logging.info('Done!')
        except HdfsError as err:
            logging.error('{}: An error occurred while creating some/all directories.\n{}\nAborting...'.format(
                type(err).__name__, str(err)))
            sys.exit(-1)
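# `init_dir` is designed to be driven by command-line flags. A minimal sketch of
# how it might be wired up as a click command (the command name and option
# spellings here are assumptions, not necessarily the project's actual CLI):
import click

@click.command(name='init_dir')
@click.option('--rm', is_flag=True, help='Delete existing recommendation directories.')
@click.option('--recursive', is_flag=True, help='Delete the directories recursively.')
@click.option('--create-dir', 'create_dir', is_flag=True, help='Create the directory structure.')
def init_dir_command(rm, recursive, create_dir):
    init_dir(rm, recursive, create_dir)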
def test_copy(self):
    # Create the source directory tree
    utils.create_dir(self.path_)
    utils.create_dir(os.path.join(self.path_, "a"))
    utils.create_dir(os.path.join(self.path_, "b"))

    # DataFrames to save as parquet
    df_a = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
    df_b = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
    df_c = utils.create_dataframe([Row(column1=5, column2=6)], schema=None)

    # Save the DataFrames in their respective directories
    utils.save_parquet(df_a, os.path.join(self.path_, "a", "df_a.parquet"))
    utils.save_parquet(df_b, os.path.join(self.path_, "b", "df_b.parquet"))
    utils.save_parquet(df_c, os.path.join(self.path_, "df_c.parquet"))

    utils.copy(self.path_, self.temp_path_, overwrite=True)

    # Read the copied DataFrames
    cp_df_a = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "a", "df_a.parquet"))
    cp_df_b = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "b", "df_b.parquet"))
    cp_df_c = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "df_c.parquet"))

    # Check that the original and copied DataFrames match
    self.assertListEqual(df_a.rdd.map(list).collect(), cp_df_a.rdd.map(list).collect())
    self.assertListEqual(df_b.rdd.map(list).collect(), cp_df_b.rdd.map(list).collect())
    self.assertListEqual(df_c.rdd.map(list).collect(), cp_df_c.rdd.map(list).collect())
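# `utils.copy` above performs a recursive copy of an HDFS directory tree. A
# minimal sketch of such a helper, assuming the `hdfs` package's client API
# (the endpoint URL and the module-level `client` are assumptions about the
# implementation, not the project's actual code):
import os
from hdfs import InsecureClient

client = InsecureClient('http://localhost:9870')  # hypothetical HDFS HTTP endpoint

def copy(src, dest, overwrite=False):
    """Recursively copy the HDFS directory tree at src to dest."""
    client.makedirs(dest)
    for name, status in client.list(src, status=True):
        src_path = os.path.join(src, name)
        dest_path = os.path.join(dest, name)
        if status['type'] == 'DIRECTORY':
            copy(src_path, dest_path, overwrite=overwrite)
        else:
            # Stream each file from the source path into the destination path.
            with client.read(src_path) as reader:
                client.write(dest_path, data=reader, overwrite=overwrite)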
def test_path_exists(self):
    utils.create_dir(self.path_)
    status = utils.path_exists(self.path_)
    self.assertTrue(status)
def test_delete_dir(self):
    utils.create_dir(self.path_)
    utils.delete_dir(self.path_)
    status = utils.path_exists(self.path_)
    self.assertFalse(status)
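# The helpers exercised by these tests are thin wrappers over an HDFS client.
# A minimal sketch, assuming the `hdfs` package (the client setup and wrapper
# bodies mirror the tests' usage but are assumptions about the implementation):
from hdfs import InsecureClient

client = InsecureClient('http://localhost:9870')  # hypothetical HDFS HTTP endpoint

def create_dir(path):
    """Create a directory (and any missing parents) in HDFS."""
    client.makedirs(path)

def path_exists(path):
    """Return True if path exists; strict=False returns None instead of raising."""
    return client.status(path, strict=False) is not None

def delete_dir(path, recursive=False):
    """Delete a directory; non-empty directories require recursive=True."""
    client.delete(path, recursive=recursive)

def rename(src, dest):
    """Move/rename a path within HDFS."""
    client.rename(src, dest)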
def upload_archive(self, tmp_dump_dir, tar, dest_path, schema, callback=None, overwrite=False):
    """ Upload data dump to HDFS.

        Args:
            tmp_dump_dir (str): Path to temporary directory to upload JSON.
            tar: Uncompressed tar object.
            dest_path (str): HDFS path to upload data dump.
            schema: Schema of parquet to be uploaded.
            callback: Function to process JSON files.
            overwrite: If True, deletes the directory at dest_path.
    """
    if callback is None:
        raise NotImplementedError('Callback to process JSON missing. Aborting...')

    # Delete TEMP_DIR_PATH if it exists
    if utils.path_exists(TEMP_DIR_PATH):
        utils.delete_dir(TEMP_DIR_PATH, recursive=True)

    # Copy data from dest_path to TEMP_DIR_PATH to be merged with the new data
    if not overwrite and utils.path_exists(dest_path):
        t0 = time.monotonic()
        logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
        utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
        logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

    logger.info("Uploading listens to temporary directory in HDFS...")
    total_files = 0
    total_time = 0.0
    for member in tar:
        if member.isfile() and self._is_json_file(member.name):
            logger.info("Uploading {}...".format(member.name))
            t0 = time.monotonic()

            try:
                tar.extract(member)
            except TarError as err:
                # Clean up partial state before aborting the import
                if utils.path_exists(TEMP_DIR_PATH):
                    utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                if utils.path_exists(tmp_dump_dir):
                    utils.delete_dir(tmp_dump_dir, recursive=True)
                raise DumpInvalidException("{} while extracting {}, aborting import".format(
                    type(err).__name__, member.name))

            tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
            utils.upload_to_HDFS(tmp_hdfs_path, member.name)
            callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path, not overwrite, schema)
            utils.delete_dir(tmp_hdfs_path, recursive=True)
            os.remove(member.name)

            time_taken = time.monotonic() - t0
            total_files += 1
            total_time += time_taken
            logger.info("Done! Current file processed in {:.2f} sec".format(time_taken))

    # Guard against ZeroDivisionError when the archive contained no JSON files
    if total_files:
        logger.info("Done! Total files processed {}. Average time taken: {:.2f}".format(
            total_files, total_time / total_files))

    # Delete dest_path if present
    if utils.path_exists(dest_path):
        logger.info('Removing {} from HDFS...'.format(dest_path))
        utils.delete_dir(dest_path, recursive=True)
        logger.info('Done!')

    logger.info("Moving the processed files to {}".format(dest_path))
    t0 = time.monotonic()

    # Create the parent directory of dest_path if it does not exist
    dest_path_parent = pathlib.Path(dest_path).parent
    if not utils.path_exists(dest_path_parent):
        utils.create_dir(dest_path_parent)

    utils.rename(TEMP_DIR_PATH, dest_path)
    logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

    # Cleanup
    utils.delete_dir(tmp_dump_dir, recursive=True)
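# `upload_archive` delegates per-file processing to `callback`, invoked as
# callback(file_name, dest_dir, tmp_hdfs_path, append, schema). A minimal
# sketch of such a callback, assuming a Spark session is available on
# `listenbrainz_spark.session` and that processed output lands as parquet
# under dest_dir (the function name and parquet layout are assumptions):
import os

def process_json(file_name, dest_dir, tmp_hdfs_path, append, schema):
    """Read the uploaded JSON from HDFS and write it out as parquet."""
    df = listenbrainz_spark.session.read.json(tmp_hdfs_path, schema=schema)
    dest = os.path.join(dest_dir, os.path.basename(file_name) + '.parquet')
    # Merge with the previously copied listens when append is True, i.e. when
    # the caller did not request an overwrite.
    df.write.mode('append' if append else 'overwrite').parquet(dest)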
def test_create_dir(self):
    path_ = '/tests/test'
    utils.create_dir(path_)
    status = utils.path_exists(path_)
    self.assertTrue(status)