def import_newest_incremental_dump_handler():
    imported_dumps = []
    latest_full_dump = utils.get_latest_full_dump()
    if latest_full_dump is None:
        # If no prior full dump is present, just import the latest incremental dump
        imported_dumps.append(
            import_dump_to_hdfs('incremental', overwrite=False))
        current_app.logger.warning(
            "No previous full dump found, importing latest incremental dump",
            exc_info=True)
    else:
        # Import all dumps missing since the last full dump import
        dump_id = latest_full_dump["dump_id"] + 1
        imported_at = latest_full_dump["imported_at"]
        while True:
            if not utils.search_dump(dump_id, 'incremental', imported_at):
                try:
                    imported_dumps.append(
                        import_dump_to_hdfs('incremental', False, dump_id))
                except DumpNotFoundException:
                    # No dump with this ID exists yet, so we have caught up
                    break
                except Exception as e:
                    # Exit if any other error occurs during import
                    current_app.logger.error(
                        f"Error while importing incremental dump with ID {dump_id}: {e}",
                        exc_info=True)
                    break
            dump_id += 1
    return [{
        'type': 'import_incremental_dump',
        'imported_dump': imported_dumps,
        'time': str(datetime.utcnow()),
    }]
def test_get_latest_full_dump_present(self):
    """ Test to ensure the correct dump is returned if a full dump has been imported. """
    self.assertDictEqual(import_utils.get_latest_full_dump(), {
        "dump_id": 7,
        "dump_type": "full",
        "imported_at": datetime.fromtimestamp(7)
    })
def test_get_latest_full_dump_file_missing(self):
    """ Test to ensure 'None' is returned if the metadata file is missing. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)
    self.assertIsNone(import_utils.get_latest_full_dump())
def import_newest_incremental_dump_handler():
    errors = []
    imported_dumps = []
    latest_full_dump = utils.get_latest_full_dump()
    if latest_full_dump is None:
        # If no prior full dump is present, just import the latest incremental dump
        imported_dumps.append(import_incremental_dump_to_hdfs(dump_id=None))
        error_msg = "No previous full dump found, importing latest incremental dump"
        errors.append(error_msg)
        logger.warning(error_msg, exc_info=True)
    else:
        # Import all dumps missing since the last full dump import
        start_id = latest_full_dump["dump_id"] + 1
        imported_at = latest_full_dump["imported_at"]
        end_id = ListenbrainzDataDownloader().get_latest_dump_id(DumpType.INCREMENTAL) + 1
        for dump_id in range(start_id, end_id):
            if not utils.search_dump(dump_id, DumpType.INCREMENTAL, imported_at):
                try:
                    imported_dumps.append(import_incremental_dump_to_hdfs(dump_id))
                except Exception as e:
                    # Skip the current dump if any error occurs during import
                    error_msg = f"Error while importing incremental dump with ID {dump_id}: {e}"
                    errors.append(error_msg)
                    logger.error(error_msg, exc_info=True)
                    continue
            # Ping the request consumer so its connection stays alive
            # while long-running imports are in progress
            request_consumer.rc.ping()
    return [{
        'type': 'import_incremental_dump',
        'imported_dump': imported_dumps,
        'errors': errors,
        'time': str(datetime.utcnow()),
    }]
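# A minimal sketch, not the actual ListenBrainz implementation, of the two
# metadata helpers the handler above relies on. It assumes the import-metadata
# parquet has the columns exercised by the tests in this section
# (dump_id, dump_type, imported_at) and that `read_files_from_HDFS` and
# `IMPORT_METADATA_PATH` are available as in the surrounding code; the helper
# names and bodies here are illustrative assumptions only.

def get_latest_full_dump_sketch():
    """ Return the row of the most recently imported full dump as a dict, or None. """
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA_PATH)
    except Exception:
        # Metadata file missing: no dump has ever been imported
        return None
    full_dumps = import_meta_df.filter(import_meta_df.dump_type == "full")
    latest = full_dumps.orderBy(full_dumps.imported_at.desc()).take(1)
    return latest[0].asDict() if latest else None


def search_dump_sketch(dump_id, dump_type, imported_at):
    """ Return True if the given dump was imported at or after `imported_at`.

    `dump_type` is assumed here to be the plain string stored in the parquet.
    """
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA_PATH)
    except Exception:
        return False
    matches = import_meta_df.filter(
        (import_meta_df.dump_id == dump_id)
        & (import_meta_df.dump_type == dump_type)
        & (import_meta_df.imported_at >= imported_at)
    ).take(1)
    return bool(matches)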
def test_get_latest_full_dump_no_full(self):
    """ Test to ensure 'None' is returned if no full import has been made. """
    # Remove full dump entries from the parquet file
    import_meta_df = read_files_from_HDFS(self.path_)
    result = import_meta_df.filter(import_meta_df.dump_type != "full")

    # We have to save the dataframe as a different file and move it into place,
    # as the dataframe itself is read lazily from the file being overwritten
    save_parquet(result, '/temp.parquet')
    delete_dir(self.path_, recursive=True)
    rename('/temp.parquet', self.path_)

    self.assertIsNone(import_utils.get_latest_full_dump())
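# The tests above assume a fixture that seeds `self.path_` with a small
# import-metadata parquet before each test. A minimal sketch of such a setUp,
# assuming a SparkSession is exposed on the test base class as `self.spark`
# and that `save_parquet` is the helper used elsewhere in this section; the
# exact fixture in the real test class may differ.
from datetime import datetime

from pyspark.sql import Row


def setUp_sketch(self):
    rows = [
        Row(dump_id=6, dump_type="incremental", imported_at=datetime.fromtimestamp(6)),
        Row(dump_id=7, dump_type="full", imported_at=datetime.fromtimestamp(7)),
        Row(dump_id=8, dump_type="incremental", imported_at=datetime.fromtimestamp(8)),
    ]
    # Build the metadata dataframe from pyspark.sql.Row records and persist it
    import_meta_df = self.spark.createDataFrame(rows)
    save_parquet(import_meta_df, self.path_)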