def tearDown(self):
    """ Delete the stored parquet file to ensure that the tests are independent. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)
    return super().tearDown()
def delete_model():
    """ Delete the model.

    Note: At any point in time, only one model is stored in HDFS.
    """
    dir_exists = utils.path_exists(path.DATA_DIR)
    if dir_exists:
        utils.delete_dir(path.DATA_DIR, recursive=True)
def test_get_latest_full_dump_file_missing(self):
    """ Test that None is returned if the metadata file is missing. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)
    self.assertIsNone(import_utils.get_latest_full_dump())
def test_search_dump_file_missing(self):
    """ Test that False is returned if the metadata file is missing. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)
    self.assertFalse(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
def test_path_exists(self):
    path_ = '/tests/test'
    utils.create_dir(path_)
    status = utils.path_exists(path_)
    self.assertTrue(status)

    utils.delete_dir(path_)
    status = utils.path_exists(path_)
    self.assertFalse(status)
def test_insert_dump_data_file_missing(self):
    """ Test that the metadata file is created if it is missing. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)

    self.assertFalse(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
    import_utils.insert_dump_data(1, "full", datetime.fromtimestamp(1))
    self.assertTrue(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
def test_rename(self):
    utils.create_dir(self.path_)
    test_exists = utils.path_exists(self.path_)
    self.assertTrue(test_exists)

    utils.rename(self.path_, self.temp_path_)
    test_exists = utils.path_exists(self.path_)
    self.assertFalse(test_exists)

    temp_exists = utils.path_exists(self.temp_path_)
    self.assertTrue(temp_exists)
    utils.delete_dir(self.temp_path_)
def test_get_latest_full_dump_no_full(self):
    """ Test to ensure None is returned if no full import has been made. """
    # Remove full dump entries from the parquet file
    import_meta_df = read_files_from_HDFS(self.path_)
    result = import_meta_df.filter(import_meta_df.dump_type != "full")

    # We have to save the dataframe as a different file and move it,
    # as the df itself is read lazily from the file we want to replace.
    save_parquet(result, '/temp.parquet')
    delete_dir(self.path_, recursive=True)
    rename('/temp.parquet', self.path_)

    self.assertIsNone(import_utils.get_latest_full_dump())
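# A minimal sketch of the "write to temp, then rename" pattern used in the
# test above. Spark reads parquet lazily, so writing a dataframe back to the
# path it was read from would clobber the source mid-read. The helper names
# (save_parquet, path_exists, delete_dir, rename) come from the utils used in
# these tests; the default '/temp.parquet' staging path mirrors the tests and
# is otherwise an arbitrary choice.
def overwrite_parquet_safely(df, dest_path, temp_path='/temp.parquet'):
    # Materialize the transformed dataframe at a staging location first.
    save_parquet(df, temp_path)
    # Only after the write is complete, swap it into place.
    if path_exists(dest_path):
        delete_dir(dest_path, recursive=True)
    rename(temp_path, dest_path)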
def test_upload_archive(self):
    archive_path = self.create_test_tar()
    pxz = self.uploader.get_pxz_output(archive_path)
    tmp_dump_dir = tempfile.mkdtemp()
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        self.uploader.upload_archive(tmp_dump_dir, tar, '/artist_relations.parquet',
                                     schema.artist_relation_schema, self.uploader.process_json)

    df = utils.read_files_from_HDFS('/artist_relations.parquet')
    self.assertEqual(df.count(), 1)

    status = utils.path_exists(tmp_dump_dir)
    self.assertFalse(status)

    utils.delete_dir('/artist_relations.parquet', recursive=True)
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine. """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err), err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            logging.error('{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'
                          .format(type(err).__name__))
            logging.warning('Deleting a directory recursively will delete all the recommendation data.')
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error('{}: An error occurred while deleting directories recursively.\n{}\nAborting...'
                          .format(type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)

            print('Done!')
        except HdfsError as err:
            logging.error('{}: An error occurred while creating some/all directories.\n{}\nAborting...'
                          .format(type(err).__name__, str(err)))
            sys.exit(-1)
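# A hedged usage sketch for init_dir above: recursively wiping the
# recommendation and checkpoint directories and recreating the layout from
# scratch. How the three flags are wired to a command line is not shown in
# this snippet, so the direct call is illustrative only.
init_dir(rm=False, recursive=True, create_dir=True)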
def extract_and_upload_archive(self, archive, local_dir, hdfs_dir, cleanup_on_failure=True):
    """ Extract the archive and upload it to the given HDFS directory.

    Args:
        archive: path to the tar archive to be uploaded
        local_dir: path to the local dir to be used for extraction
        hdfs_dir: path to the hdfs dir where the contents of the tar should be uploaded
        cleanup_on_failure: whether to delete the local and hdfs directories
            if an error occurs during extraction
    """
    total_files = 0
    total_time = 0.0
    with tarfile.open(archive, mode='r') as tar:
        for member in tar:
            if member.isfile() and member.name.endswith(".parquet"):
                logger.info(f"Uploading {member.name}...")
                t0 = time.monotonic()

                try:
                    tar.extract(member, path=local_dir)
                except tarfile.TarError as err:
                    if cleanup_on_failure:
                        if utils.path_exists(hdfs_dir):
                            utils.delete_dir(hdfs_dir, recursive=True)
                        shutil.rmtree(local_dir, ignore_errors=True)
                    raise DumpInvalidException(
                        f"{type(err).__name__} while extracting {member.name}, aborting import"
                    )

                hdfs_path = os.path.join(hdfs_dir, member.name)
                local_path = os.path.join(local_dir, member.name)
                utils.upload_to_HDFS(hdfs_path, local_path)

                time_taken = time.monotonic() - t0
                total_files += 1
                total_time += time_taken
                logger.info(f"Done! Current file processed in {time_taken:.2f} sec")

    # Guard against division by zero when the tar contained no parquet files.
    if total_files:
        logger.info(f"Done! Total files processed {total_files}. Average time taken: {total_time / total_files:.2f}")
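# A hedged usage sketch for extract_and_upload_archive above. It assumes the
# method lives on an uploader class such as ListenbrainzDataUploader (seen in
# the tests in this section); the archive and HDFS paths are illustrative,
# not fixed project locations.
uploader = ListenbrainzDataUploader()
local_dir = tempfile.mkdtemp()
try:
    uploader.extract_and_upload_archive(
        '/dumps/listens-dump.tar',        # tar of .parquet files (assumed path)
        local_dir,                        # local scratch dir for extraction
        '/data/listenbrainz/listens',     # HDFS destination (assumed path)
    )
finally:
    # extract_and_upload_archive only removes local_dir on failure (when
    # cleanup_on_failure is True), so clean up the scratch dir here as well.
    shutil.rmtree(local_dir, ignore_errors=True)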
def test_upload_archive(self):
    archive_path = self.create_test_tar()
    pxz = ListenbrainzHDFSUploader().get_pxz_output(archive_path)
    tmp_dump_dir = tempfile.mkdtemp()
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        ListenbrainzHDFSUploader().upload_archive(tmp_dump_dir, tar, '/test', schema.listen_schema,
                                                  ListenbrainzDataUploader().process_json_listens)

    walk = utils.hdfs_walk('/test', depth=1)
    dirs = next(walk)[1]
    self.assertEqual(len(dirs), 1)

    df = utils.read_files_from_HDFS('/test/2020/1.parquet')
    self.assertEqual(df.count(), 1)

    status = utils.path_exists(tmp_dump_dir)
    self.assertFalse(status)

    utils.delete_dir('/test', recursive=True)
def upload_new_listens_incremental_dump(self, archive: str):
    """ Upload new format parquet listens of an incremental dump to HDFS.

    Args:
        archive: path to the parquet listens dump to be uploaded
    """
    # upload the parquet files to a temporary path so that we can
    # read them in spark in the next step
    hdfs_path = self.upload_archive_to_temp(archive)

    # read the parquet files from the temporary path and append
    # them to incremental.parquet for permanent storage
    read_files_from_HDFS(hdfs_path) \
        .repartition(1) \
        .write \
        .mode("append") \
        .parquet(INCREMENTAL_DUMPS_SAVE_PATH)

    # delete the parquet files from the temporary hdfs path
    utils.delete_dir(hdfs_path, recursive=True)
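# A minimal sketch of consuming the listens accumulated by the method above.
# It assumes read_files_from_HDFS and INCREMENTAL_DUMPS_SAVE_PATH from the
# surrounding module; the 'listened_at' column name is an assumption about
# the listens schema.
def load_incremental_listens_since(start):
    df = read_files_from_HDFS(INCREMENTAL_DUMPS_SAVE_PATH)
    # Incremental dumps are appended to one dataset, so select by timestamp
    # rather than by dump.
    return df.where(df.listened_at >= start)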
def insert_dump_data(dump_id: int, dump_type: str, imported_at: datetime):
    """ Insert information about the imported dump. """
    import_meta_df = None
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
    except PathNotFoundException:
        current_app.logger.info("Import metadata file not found, creating...")

    data = create_dataframe(Row(dump_id, dump_type, imported_at), schema=import_metadata_schema)
    if import_meta_df is not None:
        result = import_meta_df \
            .filter(f"dump_id != '{dump_id}' OR dump_type != '{dump_type}'") \
            .union(data)
    else:
        result = data

    # We have to save the dataframe as a different file and move it,
    # as the df itself is read lazily from the file we want to replace.
    save_parquet(result, "/temp.parquet")
    if path_exists(IMPORT_METADATA):
        delete_dir(IMPORT_METADATA, recursive=True)
    rename("/temp.parquet", IMPORT_METADATA)
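# A hedged usage example of insert_dump_data, mirroring the tests earlier in
# this section: insert a dump record, then confirm it with search_dump. The
# dump id and type are arbitrary illustrative values.
imported_at = datetime.utcnow()
insert_dump_data(17, "incremental", imported_at)
assert search_dump(17, "incremental", imported_at)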
def upload_archive(self, tmp_dump_dir, tar, dest_path, schema, callback=None, force=False):
    """ Upload data dump to HDFS.

    Args:
        tmp_dump_dir (str): Path to temporary directory to upload JSON.
        tar: Uncompressed tar object.
        dest_path (str): HDFS path to upload data dump.
        schema: Schema of parquet to be uploaded.
        callback: Function to process JSON files.
        force: If True, deletes the dir at dest_path.
    """
    if callback is None:
        raise NotImplementedError('Callback to process JSON missing. Aborting...')

    if force:
        current_app.logger.info('Removing {} from HDFS...'.format(dest_path))
        utils.delete_dir(dest_path, recursive=True)
        current_app.logger.info('Done!')

    file_count = 0
    total_time = 0.0
    for member in tar:
        if member.isfile() and self._is_json_file(member.name):
            current_app.logger.info('Loading {}...'.format(member.name))
            t = time.time()
            tar.extract(member)
            tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
            utils.upload_to_HDFS(tmp_hdfs_path, member.name)
            callback(member.name, dest_path, tmp_hdfs_path, schema)
            utils.delete_dir(tmp_hdfs_path, recursive=True)
            os.remove(member.name)
            file_count += 1
            time_taken = time.time() - t
            current_app.logger.info("Done! Processed {} files. Current file done in {:.2f} sec".format(
                file_count, time_taken))
            total_time += time_taken

    # Guard against division by zero when the tar contained no JSON files.
    average_time = total_time / file_count if file_count else 0.0
    current_app.logger.info("Total time: {:.2f}, average time: {:.2f}".format(total_time, average_time))

    # Remove the temporary directory from HDFS and from the local disk.
    utils.delete_dir(tmp_dump_dir, recursive=True)
    shutil.rmtree(tmp_dump_dir)
def main(ranks=None, lambdas=None, iterations=None, alpha=None):
    # A bare `raise` outside an except block is invalid, so raise an explicit
    # exception for each missing model parameter.
    if ranks is None:
        current_app.logger.critical('model param "ranks" missing')
        raise ValueError('model param "ranks" missing')
    if lambdas is None:
        current_app.logger.critical('model param "lambdas" missing')
        raise ValueError('model param "lambdas" missing')
    if iterations is None:
        current_app.logger.critical('model param "iterations" missing')
        raise ValueError('model param "iterations" missing')
    if alpha is None:
        current_app.logger.critical('model param "alpha" missing')
        raise ValueError('model param "alpha" missing')

    ti = time.monotonic()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI + path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
        dataframe_metadata_df = utils.read_files_from_HDFS(path.DATAFRAME_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error('{}\nConsider running create_dataframes.py'.format(str(err)), exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise
    time_['load_playcounts'] = '{:.2f}'.format((time.monotonic() - ti) / 60)

    t0 = time.monotonic()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    t0 = time.monotonic()
    best_model, model_metadata = get_best_model(training_data, validation_data, num_validation,
                                                ranks, lambdas, iterations, alpha)
    models_training_time = '{:.2f}'.format((time.monotonic() - t0) / 3600)

    best_model_metadata = get_best_model_metadata(best_model)
    current_app.logger.info("Calculating test RMSE for best model with model id: {}".format(best_model.model_id))
    best_model_metadata['test_rmse'] = compute_rmse(best_model.model, test_data, num_test, best_model.model_id)
    current_app.logger.info("Test RMSE calculated!")

    best_model_metadata['training_data_count'] = num_training
    best_model_metadata['validation_data_count'] = num_validation
    best_model_metadata['test_data_count'] = num_test
    best_model_metadata['dataframe_id'] = get_latest_dataframe_id(dataframe_metadata_df)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    t0 = time.monotonic()
    save_model(best_model.model_id, best_model.model)
    time_['save_model'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    save_model_metadata_to_hdfs(best_model_metadata)

    # Delete the checkpoint dir as saved lineages would eat up space; we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    if SAVE_TRAINING_HTML:
        current_app.logger.info('Saving HTML...')
        save_training_html(time_, num_training, num_validation, num_test, model_metadata,
                           best_model_metadata, ti, models_training_time)
        current_app.logger.info('Done!')

    message = [{
        'type': 'cf_recording_model',
        'model_upload_time': str(datetime.utcnow()),
        'total_time': '{:.2f}'.format(time.monotonic() - ti),
    }]

    return message
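# A hedged invocation sketch for main() above. The hyperparameter grids are
# illustrative values only; the project's tuned settings are not shown in
# this snippet.
message = main(ranks=[4, 8], lambdas=[0.1, 1.0], iterations=[10], alpha=3.0)
# main() returns a one-element list of dicts (type 'cf_recording_model')
# suitable for publishing as a result message.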
def tearDown(self):
    path_found = utils.path_exists(self.path_)
    if path_found:
        utils.delete_dir(self.path_, recursive=True)
def delete_uploaded_listens():
    if utils.path_exists(LISTENBRAINZ_NEW_DATA_DIRECTORY):
        utils.delete_dir(LISTENBRAINZ_NEW_DATA_DIRECTORY, recursive=True)
def delete_dir(cls):
    walk = utils.hdfs_walk('/', depth=1)
    # dirs in '/'
    dirs = next(walk)[1]
    for directory in dirs:
        utils.delete_dir(os.path.join('/', directory), recursive=True)
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI + path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    time_['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # RDDs used in the iterative model training process are cached to improve performance.
    # Caching large files may cause an Out of Memory exception.
    training_data.persist()
    validation_data.persist()

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    current_app.logger.info('Training models...')
    t0 = time()
    model, model_metadata, best_model_metadata = train(training_data, validation_data, num_validation,
                                                       config.RANKS, config.LAMBDAS, config.ITERATIONS)
    models_training_time = '{:.2f}'.format((time() - t0) / 3600)

    try:
        best_model_test_rmse = compute_rmse(model.model, test_data, num_test)
    except Py4JJavaError as err:
        current_app.logger.error('Root mean squared error for best model for test data not computed\n{}\nAborting...'
                                 .format(str(err.java_exception)), exc_info=True)
        sys.exit(-1)

    # Cached data must be cleared to avoid OOM.
    training_data.unpersist()
    validation_data.unpersist()

    current_app.logger.info('Saving model...')
    t0 = time()
    model_save_path = os.path.join(path.DATA_DIR, best_model_metadata['model_id'])
    save_model(model_save_path, best_model_metadata['model_id'], model)
    time_['save_model'] = '{:.2f}'.format((time() - t0) / 60)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)

    # Delete the checkpoint dir as saved lineages would eat up space; we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    if SAVE_TRAINING_HTML:
        save_training_html(time_, num_training, num_validation, num_test, model_metadata,
                           best_model_metadata, ti, models_training_time)

    # Save the best model id to a JSON file.
    metadata_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)

    recommendation_metadata['best_model_id'] = best_model_metadata['model_id']
    with open(metadata_file_path, 'w') as f:
        json.dump(recommendation_metadata, f)
def test_delete_dir(self):
    utils.create_dir(self.path_)
    utils.delete_dir(self.path_)
    status = utils.path_exists(self.path_)
    self.assertFalse(status)
def tearDown(self):
    if utils.path_exists(self.path_):
        utils.delete_dir(self.path_, recursive=True)

    if utils.path_exists(self.temp_path_):
        utils.delete_dir(self.temp_path_, recursive=True)
def upload_archive(self, tmp_dump_dir, tar, dest_path, schema, callback=None, overwrite=False):
    """ Upload data dump to HDFS.

    Args:
        tmp_dump_dir (str): Path to temporary directory to upload JSON.
        tar: Uncompressed tar object.
        dest_path (str): HDFS path to upload data dump.
        schema: Schema of parquet to be uploaded.
        callback: Function to process JSON files.
        overwrite: If True, deletes the dir at dest_path.
    """
    if callback is None:
        raise NotImplementedError('Callback to process JSON missing. Aborting...')

    # Delete TEMP_DIR_PATH if it exists
    if utils.path_exists(TEMP_DIR_PATH):
        utils.delete_dir(TEMP_DIR_PATH, recursive=True)

    # Copy data from dest_path to TEMP_DIR_PATH to be merged with new data
    if not overwrite and utils.path_exists(dest_path):
        t0 = time.monotonic()
        logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
        utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
        logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

    logger.info("Uploading listens to temporary directory in HDFS...")
    total_files = 0
    total_time = 0.0
    for member in tar:
        if member.isfile() and self._is_json_file(member.name):
            logger.info("Uploading {}...".format(member.name))
            t0 = time.monotonic()

            try:
                tar.extract(member)
            except TarError as err:
                # Cleanup
                if utils.path_exists(TEMP_DIR_PATH):
                    utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                if utils.path_exists(tmp_dump_dir):
                    utils.delete_dir(tmp_dump_dir, recursive=True)
                raise DumpInvalidException("{} while extracting {}, aborting import".format(
                    type(err).__name__, member.name))

            tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
            utils.upload_to_HDFS(tmp_hdfs_path, member.name)
            callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path, not overwrite, schema)
            utils.delete_dir(tmp_hdfs_path, recursive=True)
            os.remove(member.name)

            time_taken = time.monotonic() - t0
            total_files += 1
            total_time += time_taken
            logger.info("Done! Current file processed in {:.2f} sec".format(time_taken))

    # Guard against division by zero when the dump contained no JSON files.
    if total_files:
        logger.info("Done! Total files processed {}. Average time taken: {:.2f}".format(
            total_files, total_time / total_files))

    # Delete dest_path if present
    if utils.path_exists(dest_path):
        logger.info('Removing {} from HDFS...'.format(dest_path))
        utils.delete_dir(dest_path, recursive=True)
        logger.info('Done!')

    logger.info("Moving the processed files to {}".format(dest_path))
    t0 = time.monotonic()

    # Check if the parent directory exists; if not, create it
    dest_path_parent = pathlib.Path(dest_path).parent
    if not utils.path_exists(dest_path_parent):
        utils.create_dir(dest_path_parent)

    utils.rename(TEMP_DIR_PATH, dest_path)
    logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

    # Cleanup
    utils.delete_dir(tmp_dump_dir, recursive=True)