def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:  # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_path = os.path.join(location, 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id,
            time=end_time.strftime('%Y%m%d-%H%M%S'),
        ))
        create_path(dump_path)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time,
                        threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time,
                        threads=threads, spark_format=True)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compressing
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error("Cannot create full dump with last dump's ID, no dump exists!")
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        dump_path = os.path.join(location, 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id,
            time=end_time.strftime('%Y%m%d-%H%M%S'),
        ))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time,
                        threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time,
                        threads=threads, spark_format=True)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:  # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time,
                                            end_time=end_time, threads=threads)

        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-incremental.tar.xz'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def get_dump_info():
    """ Get information about ListenBrainz data dumps.
        Pass the `id` parameter in a GET request to get data about that particular dump.

        **Example response**:

        .. code-block:: json

            {
                "id": 1,
                "timestamp": "20190625-165900"
            }

        :query id: Integer specifying the ID of the dump. If not provided, the endpoint returns
            information about the latest data dump.
        :statuscode 200: You have data.
        :statuscode 400: You did not provide a valid dump ID. See error message for details.
        :statuscode 404: Dump with given ID does not exist.
        :resheader Content-Type: *application/json*
    """
    dump_id = request.args.get("id")
    if dump_id is None:
        try:
            dump = db_dump.get_dump_entries()[0]  # return the latest dump
        except IndexError:
            raise APINotFound("No dump entry exists.")
    else:
        try:
            dump_id = int(dump_id)
        except ValueError:
            raise APIBadRequest("The `id` parameter needs to be an integer.")
        dump = db_dump.get_dump_entry(dump_id)
        if dump is None:
            raise APINotFound("No dump exists with ID: %d" % dump_id)

    return jsonify({
        "id": dump["id"],
        "timestamp": _convert_timestamp_to_string_dump_format(dump["created"]),
    })
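# A minimal client-side sketch of how this endpoint could be queried. The base URL and the
# route path ("/1/status/get-dump-info") are illustrative assumptions; adjust them to wherever
# this view is actually registered.
import requests

def fetch_dump_info(base_url="https://api.listenbrainz.org", dump_id=None):
    # Omitting `id` returns information about the latest dump, per the docstring above.
    params = {"id": dump_id} if dump_id is not None else {}
    response = requests.get(base_url + "/1/status/get-dump-info", params=params)
    response.raise_for_status()
    return response.json()  # e.g. {"id": 1, "timestamp": "20190625-165900"}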
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compressing
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error("Cannot create full dump with last dump's ID, no dump exists!")
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)

        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=ts)
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 12):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        sys.exit(0)
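# The full and incremental dump functions above write a DUMP_ID.txt containing
# "<timestamp> <dump id> <dump type>" so that downstream FTP sync scripts can identify the dump.
# A hypothetical consumer-side sketch of parsing that file; the helper name is an assumption
# and not part of the project code.
def read_dump_id_file(path):
    with open(path) as f:
        timestamp, dump_id, dump_type = f.read().split()
    return timestamp, int(dump_id), dump_type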
def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump: bool, do_db_dump: bool):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compressing
            dump_id (int): the ID of the ListenBrainz data dump
            do_listen_dump: If True, make a listens dump
            do_spark_dump: If True, make a spark listens dump
            do_db_dump: If True, make a public/private postgres/timescale dump
    """
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        expected_num_dumps = 0
        if do_db_dump:
            db_dump.dump_postgres_db(dump_path, end_time, threads)
            expected_num_dumps += 4
        if do_listen_dump:
            ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
            expected_num_dumps += 1
        if do_spark_dump:
            ls.dump_listens_for_spark(dump_path, dump_id=dump_id, dump_type="full", end_time=end_time)
            expected_num_dumps += 1

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            # an archive plus md5 and sha256 files for each expected dump
            expected_num_dump_files = expected_num_dumps * 3
            if not sanity_check_dumps(dump_path, expected_num_dump_files):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        sys.exit(0)
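# A sketch of how this version of create_full might be wired up as a click CLI command.
# The command name, option names and defaults below are illustrative assumptions, not
# necessarily the flags the project actually exposes.
import click

@click.command(name="create_full")
@click.option("--location", default="/data/dumps", help="Directory to write the dump into")
@click.option("--threads", type=int, default=4, help="Threads to use while compressing")
@click.option("--dump-id", type=int, default=None, help="Reuse an existing dump table entry")
@click.option("--listen/--no-listen", "do_listen_dump", default=True)
@click.option("--spark/--no-spark", "do_spark_dump", default=True)
@click.option("--db/--no-db", "do_db_dump", default=True)
def full_dump_command(location, threads, dump_id, do_listen_dump, do_spark_dump, do_db_dump):
    # Delegates to create_full, which exits the process itself on completion or error.
    create_full(location, threads, dump_id, do_listen_dump, do_spark_dump, do_db_dump)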
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:  # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time,
                        end_time=end_time, threads=threads)
        ls.dump_listens_for_spark(dump_path, dump_id=dump_id, dump_type="incremental",
                                  start_time=start_time, end_time=end_time)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 6):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s incremental\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
        sys.exit(0)
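# Every dump flavour above calls write_hashes() before sanity_check_dumps(), but neither helper
# is shown in this section. The sketch below is only an assumption of what such a helper could
# do: write an .md5 and a .sha256 file next to each archive in the dump directory so mirrors can
# verify downloads. It is illustrative, not the project's actual implementation.
import hashlib
import os

def write_hashes(location):
    for entry in os.listdir(location):
        archive_path = os.path.join(location, entry)
        if not os.path.isfile(archive_path):
            continue
        for algorithm in ("md5", "sha256"):
            digest = hashlib.new(algorithm)
            with open(archive_path, "rb") as f:
                # hash the archive in 1 MB chunks to keep memory usage flat
                for chunk in iter(lambda: f.read(1024 * 1024), b""):
                    digest.update(chunk)
            with open("%s.%s" % (archive_path, algorithm), "w") as f:
                f.write("%s  %s\n" % (digest.hexdigest(), entry))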