def test_create_incremental_dump_with_id(self):
    # if the dump ID does not exist, it should exit with a -1
    result = self.runner.invoke(
        dump_manager.create_incremental,
        ['--location', self.tempdir, '--dump-id', 1000])
    self.assertEqual(result.exit_code, -1)

    # create a base dump entry
    t = int(time.time())
    db_dump.add_dump_entry(t)
    sleep(1)
    self.listenstore.insert(generate_data(1, self.user_name, 1500000000, 5))
    sleep(1)

    # create a new dump ID to recreate later
    dump_id = db_dump.add_dump_entry(int(time.time()))

    # now, create a dump with that specific dump id
    result = self.runner.invoke(
        dump_manager.create_incremental,
        ['--location', self.tempdir, '--dump-id', dump_id])
    self.assertEqual(len(os.listdir(self.tempdir)), 1)
    dump_name = os.listdir(self.tempdir)[0]
    created_dump_id = int(dump_name.split('-')[2])
    self.assertEqual(dump_id, created_dump_id)

    # dump should contain the listen and spark archive
    archive_count = 0
    for file_name in os.listdir(os.path.join(self.tempdir, dump_name)):
        if file_name.endswith('.tar.xz'):
            archive_count += 1
    self.assertEqual(archive_count, 2)
def test_copy_table(self):
    db_dump.add_dump_entry(datetime.today().strftime('%s'))
    with db.engine.connect() as connection:
        db_dump.copy_table(
            cursor=connection.connection.cursor(),
            location=self.tempdir,
            columns='id, created',
            table_name='data_dump',
        )
    dumps = db_dump.get_dump_entries()
    with open(os.path.join(self.tempdir, 'data_dump'), 'r') as f:
        file_contents = [line for line in f]
    self.assertEqual(len(dumps), len(file_contents))
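# A minimal sketch of what copy_table might do, assuming it streams the requested columns
# of a Postgres table into a flat file named after the table, using psycopg2's COPY support
# on the raw cursor that the test above passes in. The name copy_table_sketch is
# illustrative only; the real db_dump.copy_table may differ.
def copy_table_sketch(cursor, location, columns, table_name):
    with open(os.path.join(location, table_name), 'w') as f:
        cursor.copy_expert(
            'COPY (SELECT {columns} FROM {table}) TO STDOUT'.format(
                columns=columns, table=table_name),
            f,
        )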
def test_create_incremental(self, mock_notify):
    # create an incremental dump; this won't work because the incremental dump does
    # not have a previous dump
    result = self.runner.invoke(dump_manager.create_incremental, ['--location', self.tempdir])
    self.assertEqual(result.exit_code, -1)
    self.assertEqual(len(os.listdir(self.tempdir)), 0)

    base = int(time.time())
    dump_id = db_dump.add_dump_entry(base - 60)
    print("%d dump id" % dump_id)
    sleep(1)
    self.listenstore.insert(generate_data(1, self.user_name, base - 30, 5))
    result = self.runner.invoke(dump_manager.create_incremental, ['--location', self.tempdir])
    self.assertEqual(len(os.listdir(self.tempdir)), 1)
    dump_name = os.listdir(self.tempdir)[0]
    mock_notify.assert_called_with(dump_name, 'incremental')

    # created dump ID should be one greater than the previous dump's ID
    created_dump_id = int(dump_name.split('-')[2])
    print("%d created dump id" % created_dump_id)
    self.assertEqual(created_dump_id, dump_id + 1)

    # make sure that the dump contains a full listens and spark dump
    archive_count = 0
    for file_name in os.listdir(os.path.join(self.tempdir, dump_name)):
        if file_name.endswith('.tar.xz'):
            archive_count += 1
    self.assertEqual(archive_count, 2)
def test_create_full_dump_with_id(self):
    self.listenstore.insert(generate_data(1, self.user_name, 1500000000, 5))

    # if the dump ID does not exist, it should exit with a -1
    result = self.runner.invoke(
        dump_manager.create_full,
        ['--location', self.tempdir, '--dump-id', 1000])
    self.assertEqual(result.exit_code, -1)
    # make sure no directory was created either
    self.assertEqual(len(os.listdir(self.tempdir)), 0)

    # now, add a dump entry to the database and create a dump with that specific dump id
    dump_id = db_dump.add_dump_entry(int(time.time()))
    result = self.runner.invoke(
        dump_manager.create_full,
        ['--location', self.tempdir, '--dump-id', dump_id])
    self.assertEqual(len(os.listdir(self.tempdir)), 1)
    dump_name = os.listdir(self.tempdir)[0]
    created_dump_id = int(dump_name.split('-')[2])
    self.assertEqual(dump_id, created_dump_id)

    # dump should contain the 4 archives
    archive_count = 0
    for file_name in os.listdir(os.path.join(self.tempdir, dump_name)):
        if file_name.endswith('.tar.xz'):
            archive_count += 1
    self.assertEqual(archive_count, 4)
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:
            # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_path = os.path.join(location, 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time,
                        threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time,
                        threads=threads, spark_format=True)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
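# write_hashes is used by all of the create_* commands in this file. A minimal sketch of
# what it might do, assuming it writes a .md5 and a .sha256 checksum file alongside every
# file in the dump directory (consistent with the sanity checks further down, which expect
# three files per dump: the archive, its md5 and its sha256). The name write_hashes_sketch
# and the exact output format are illustrative, not the real implementation.
import hashlib

def write_hashes_sketch(location):
    for entry in os.listdir(location):
        path = os.path.join(location, entry)
        if not os.path.isfile(path):
            continue
        for algorithm in ('md5', 'sha256'):
            digest = hashlib.new(algorithm)
            with open(path, 'rb') as f:
                # hash the archive in 1 MB chunks to keep memory usage flat
                for chunk in iter(lambda: f.read(1024 * 1024), b''):
                    digest.update(chunk)
            with open('{path}.{ext}'.format(path=path, ext=algorithm), 'w') as f:
                # mimic the md5sum/sha256sum output format: "<digest>  <filename>"
                f.write('{digest}  {name}\n'.format(digest=digest.hexdigest(), name=entry))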
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics
        dump and a dump of the actual listens from InfluxDB.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compressing
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error("Cannot create full dump with last dump's ID, no dump exists!")
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        dump_path = os.path.join(
            location,
            'listenbrainz-dump-{dump_id}-{time}-full'.format(
                dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time,
                        threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time,
                        threads=threads, spark_format=True)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def test_dump_get_200(self):
    t0 = datetime.now()
    dump_id = db_dump.add_dump_entry(int(t0.strftime("%s")))
    r = self.client.get("/1/status/get-dump-info", query_string={"id": dump_id})
    self.assert200(r)
    self.assertDictEqual(r.json, {
        "id": dump_id,
        "timestamp": t0.strftime("%Y%m%d-%H%M%S"),
    })

    # should return the latest dump if no dump ID passed
    t1 = t0 + timedelta(seconds=1)
    dump_id_1 = db_dump.add_dump_entry(int(t1.strftime("%s")))
    r = self.client.get("/1/status/get-dump-info")
    self.assert200(r)
    self.assertDictEqual(r.json, {
        "id": dump_id_1,
        "timestamp": t1.strftime("%Y%m%d-%H%M%S"),
    })
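# A hedged sketch of the view behind /1/status/get-dump-info that the test above exercises,
# assuming it returns the requested dump (or the latest one when no id is passed) as JSON
# with the created timestamp formatted as %Y%m%d-%H%M%S. The function name, error payloads
# and ordering assumption are illustrative only; the real view may differ.
from flask import jsonify, request

def get_dump_info_sketch():
    dump_id = request.args.get('id')
    if dump_id is None:
        entries = db_dump.get_dump_entries()
        if not entries:
            return jsonify({'error': 'No dumps exist yet'}), 404
        # assumption: entries are ordered newest first, as create_full above implies
        dump = entries[0]
    else:
        dump = db_dump.get_dump_entry(int(dump_id))
        if dump is None:
            return jsonify({'error': 'No dump exists with ID: %s' % dump_id}), 404
    return jsonify({
        'id': dump['id'],
        'timestamp': dump['created'].strftime('%Y%m%d-%H%M%S'),
    })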
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:
            # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']

        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time,
                                            end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-incremental.tar.xz'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics
        dump and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compressing
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error("Cannot create full dump with last dump's ID, no dump exists!")
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        db_dump.dump_postgres_db(dump_path, end_time, threads)
        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=ts)
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 12):
                return sys.exit(-1)
        except OSError as e:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        sys.exit(0)
def create_full(location, threads, dump_id, do_listen_dump: bool,
                do_spark_dump: bool, do_db_dump: bool):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics
        dump and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compressing
            dump_id (int): the ID of the ListenBrainz data dump
            do_listen_dump: If True, make a listens dump
            do_spark_dump: If True, make a spark listens dump
            do_db_dump: If True, make a public/private postgres/timescale dump
    """
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        expected_num_dumps = 0
        if do_db_dump:
            db_dump.dump_postgres_db(dump_path, end_time, threads)
            expected_num_dumps += 4
        if do_listen_dump:
            ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
            expected_num_dumps += 1
        if do_spark_dump:
            ls.dump_listens_for_spark(dump_path, dump_id=dump_id, dump_type="full", end_time=end_time)
            expected_num_dumps += 1

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            # each dump produces three files: the archive, its md5 and its sha256
            expected_num_dump_files = expected_num_dumps * 3
            if not sanity_check_dumps(dump_path, expected_num_dump_files):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        sys.exit(0)
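# A minimal sketch of what sanity_check_dumps could look like, assuming it only verifies
# that the dump directory contains exactly the expected number of non-empty files
# (the archive plus its md5 and sha256 for each dump, matching expected_num_dump_files
# above). The name sanity_check_dumps_sketch is illustrative; the real check may be stricter.
def sanity_check_dumps_sketch(dump_path, expected_count):
    files = [name for name in os.listdir(dump_path)
             if os.path.isfile(os.path.join(dump_path, name))]
    if len(files) != expected_count:
        return False
    # every file must be non-empty, otherwise the dump is considered broken
    return all(os.path.getsize(os.path.join(dump_path, name)) > 0 for name in files)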
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:
            # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time,
                        end_time=end_time, threads=threads)
        ls.dump_listens_for_spark(dump_path, dump_id=dump_id, dump_type="incremental",
                                  start_time=start_time, end_time=end_time)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 6):
                return sys.exit(-1)
        except OSError as e:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s incremental\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
        sys.exit(0)
def test_add_dump_entry(self):
    prev_dumps = db_dump.get_dump_entries()
    db_dump.add_dump_entry(datetime.today().strftime('%s'))
    now_dumps = db_dump.get_dump_entries()
    self.assertEqual(len(now_dumps), len(prev_dumps) + 1)
def test_add_dump_entry(self):
    prev_dumps = db_dump.get_dump_entries()
    db_dump.add_dump_entry()
    now_dumps = db_dump.get_dump_entries()
    self.assertEqual(len(now_dumps), len(prev_dumps) + 1)
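# A hedged sketch of add_dump_entry, assuming it inserts a row into the data_dump table
# (the table copied in test_copy_table above) and returns the new dump ID. An optional
# timestamp argument would cover both variants tested above: the older call that passes
# an epoch timestamp and the newer no-argument call that defaults to NOW().
# Function name, SQL and SQLAlchemy usage are illustrative only; the real implementation may differ.
import sqlalchemy

def add_dump_entry_sketch(timestamp=None):
    with db.engine.connect() as connection:
        result = connection.execute(sqlalchemy.text("""
            INSERT INTO data_dump (created)
                 VALUES (COALESCE(to_timestamp(:ts), NOW()))
              RETURNING id
        """), {'ts': timestamp})
        return result.fetchone()[0]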