def test_dump_postgres_db_table_entries(self):
    """Dumping the postgres DB must record exactly one dump entry stamped with the dump time."""
    db_user.create('test_user')
    dump_time = datetime.today()
    dump_location = db_dump.dump_postgres_db(self.tempdir, dump_time=dump_time)
    entries = db_dump.get_dump_entries()
    self.assertEqual(1, len(entries))
    # Compare at second resolution: '%s' renders the epoch timestamp.
    self.assertEqual(dump_time.strftime('%s'), entries[0]['created'].strftime('%s'))
def test_dump_postgres_db_table_entries(self):
    """Dumping the postgres DB (inside an app context) must record exactly one dump entry."""
    with self.app.app_context():
        db_user.create(1, 'test_user')
        dump_time = datetime.today()
        dump_location = db_dump.dump_postgres_db(self.tempdir, dump_time=dump_time)
        entries = db_dump.get_dump_entries()
        self.assertEqual(1, len(entries))
        # '%s' gives the epoch timestamp, so this compares at second resolution.
        self.assertEqual(dump_time.strftime('%s'), entries[0]['created'].strftime('%s'))
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
    and a dump of the actual listens from InfluxDB

    Args:
        location (str): path to the directory where the dump should be made
        threads (int): the number of threads to be used while compression
        dump_id (int): the ID of the ListenBrainz data dump
        last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        # Local import inside the app context -- presumably the influx connection
        # is only initialised during app creation; confirm against webserver setup.
        from listenbrainz.webserver.influx_connection import _influx as ls
        if last_dump_id:
            # Reuse the newest existing dump entry instead of creating a new one.
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']
        if dump_id is None:
            # No dump ID given: register a new dump entry stamped "now"
            # ('%s' formats the datetime as an epoch timestamp).
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            # An explicit ID must match an existing dump entry.
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']
        # Dump directory name embeds both the dump ID and its timestamp.
        dump_path = os.path.join(
            location,
            'listenbrainz-dump-{dump_id}-{time}-full'.format(
                dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        # Postgres tables first, then the listens twice: once in the normal
        # format and once in the spark import format.
        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time,
                        threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time,
                        threads=threads, spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            # Hash files are required; bail out of the dump on failure.
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def test_copy_table(self):
    """copy_table must write one line per data_dump row into the target file."""
    db_dump.add_dump_entry(datetime.today().strftime('%s'))
    with db.engine.connect() as connection:
        db_dump.copy_table(
            cursor=connection.connection.cursor(),
            location=self.tempdir,
            columns='id, created',
            table_name='data_dump',
        )
    entries = db_dump.get_dump_entries()
    dump_file = os.path.join(self.tempdir, 'data_dump')
    with open(dump_file, 'r') as f:
        lines = f.readlines()
    self.assertEqual(len(lines), len(entries))
def test_copy_table(self):
    """The file written by copy_table must contain as many lines as there are dump entries."""
    db_dump.add_dump_entry(datetime.today().strftime('%s'))
    with db.engine.connect() as connection:
        db_dump.copy_table(
            cursor=connection.connection.cursor(),
            location=self.tempdir,
            columns='id, created',
            table_name='data_dump',
        )
    expected_count = len(db_dump.get_dump_entries())
    with open(os.path.join(self.tempdir, 'data_dump'), 'r') as f:
        actual_count = len(f.readlines())
    self.assertEqual(expected_count, actual_count)
def get_dump_info():
    """ Get information about ListenBrainz data dumps.
    You need to pass the `id` parameter in a GET request to get data about
    that particular dump.

    **Example response**:

    .. code-block:: json

        {
            "id": 1,
            "timestamp": "20190625-165900"
        }

    :query id: Integer specifying the ID of the dump, if not provided, the
        endpoint returns information about the latest data dump.
    :statuscode 200: You have data.
    :statuscode 400: You did not provide a valid dump ID. See error message for details.
    :statuscode 404: Dump with given ID does not exist.
    :resheader Content-Type: *application/json*
    """
    dump_id = request.args.get("id")
    if dump_id is not None:
        # An explicit ID was requested: validate it and look it up.
        try:
            dump_id = int(dump_id)
        except ValueError:
            raise APIBadRequest("The `id` parameter needs to be an integer.")
        dump = db_dump.get_dump_entry(dump_id)
        if dump is None:
            raise APINotFound("No dump exists with ID: %d" % dump_id)
    else:
        # No ID given: fall back to the most recent dump entry.
        try:
            dump = db_dump.get_dump_entries()[0]
        except IndexError:
            raise APINotFound("No dump entry exists.")
    return jsonify({
        "id": dump["id"],
        "timestamp": _convert_timestamp_to_string_dump_format(dump["created"]),
    })
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
    and a dump of the actual listens from the listenstore.

    Exits the process: 0 on success, -1 on any failure.

    Args:
        location (str): path to the directory where the dump should be made
        threads (int): the number of threads to be used while compression
        dump_id (int): the ID of the ListenBrainz data dump
        last_dump_id (bool): flag indicating whether to create a full dump from the last
            entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            # Reuse the newest existing dump entry instead of creating a new one.
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            # No dump ID given: register a new dump entry stamped "now"
            # ('%s' formats the datetime as an epoch timestamp).
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        db_dump.dump_postgres_db(dump_path, end_time, threads)

        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id,
                                            end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=ts)
        # FIX: dump_path already starts with `location`; the old
        # os.path.join(location, dump_path, ...) duplicated the prefix whenever
        # `location` was a relative path.
        spark_dump_path = os.path.join(dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path,
                                                      threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e), exc_info=True)
            sys.exit(-1)

        try:
            # 12 is the file count the sanity check expects for a full dump --
            # confirm against sanity_check_dumps if the dump contents change.
            if not sanity_check_dumps(dump_path, 12):
                # FIX: was `return sys.exit(-1)` -- the `return` was dead code
                # because sys.exit raises SystemExit.
                sys.exit(-1)
        except OSError as e:
            # FIX: log before exiting; the original swallowed the error silently.
            current_app.logger.error('Sanity check of the created dump failed: %s',
                                     str(e), exc_info=True)
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        current_app.logger.info('Dumps created and hashes written at %s', dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        sys.exit(0)
def test_add_dump_entry(self):
    """add_dump_entry must grow the list of dump entries by exactly one."""
    count_before = len(db_dump.get_dump_entries())
    db_dump.add_dump_entry(datetime.today().strftime('%s'))
    count_after = len(db_dump.get_dump_entries())
    self.assertEqual(count_after, count_before + 1)
def test_add_dump_entry(self):
    """Calling add_dump_entry with no timestamp must still add exactly one entry."""
    count_before = len(db_dump.get_dump_entries())
    db_dump.add_dump_entry()
    count_after = len(db_dump.get_dump_entries())
    self.assertEqual(count_after, count_before + 1)
def test_add_dump_entry(self):
    """Adding a dump entry must increase the entry count by one."""
    entries_before = db_dump.get_dump_entries()
    db_dump.add_dump_entry(datetime.today().strftime('%s'))
    entries_after = db_dump.get_dump_entries()
    self.assertEqual(len(entries_before) + 1, len(entries_after))