def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump,
        a statistics dump and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use during compression
    """
    db.init_db_connection(config.SQLALCHEMY_DATABASE_URI)
    ls = init_influx_connection(log, {
        'REDIS_HOST': config.REDIS_HOST,
        'REDIS_PORT': config.REDIS_PORT,
        'REDIS_NAMESPACE': config.REDIS_NAMESPACE,
        'INFLUX_HOST': config.INFLUX_HOST,
        'INFLUX_PORT': config.INFLUX_PORT,
        'INFLUX_DB_NAME': config.INFLUX_DB_NAME,
    })
    time_now = datetime.today()
    dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(
        time=time_now.strftime('%Y%m%d-%H%M%S')))
    create_path(dump_path)
    db_dump.dump_postgres_db(dump_path, time_now, threads)
    ls.dump_listens(dump_path, time_now, threads)
    try:
        write_hashes(dump_path)
    except IOError as e:
        log.error('Unable to create hash files! Error: %s', str(e))
        return
    log.info('Dumps created and hashes written at %s' % dump_path)
def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump,
        a statistics dump and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use during compression
    """
    app = create_app()
    with app.app_context():
        ls = init_influx_connection(current_app.logger, {
            'REDIS_HOST': current_app.config['REDIS_HOST'],
            'REDIS_PORT': current_app.config['REDIS_PORT'],
            'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
            'INFLUX_HOST': current_app.config['INFLUX_HOST'],
            'INFLUX_PORT': current_app.config['INFLUX_PORT'],
            'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
        })
        time_now = datetime.today()
        dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(
            time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, time_now, threads)
        ls.dump_listens(dump_path, time_now, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump,
        a statistics dump and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use during compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error("Cannot create full dump with last dump's ID, no dump exists!")
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        dump_path = os.path.join(location, 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads, spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def test_import_postgres_db(self):
    # create a user
    db_user.create(1, 'test_user')
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 1)

    # do a db dump and reset the db
    private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
    self.reset_db()
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 0)

    # import the dump
    db_dump.import_postgres_dump(private_dump, public_dump)
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 1)

    # reset again, and use more threads to import
    self.reset_db()
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 0)

    db_dump.import_postgres_dump(private_dump, public_dump, threads=2)
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 1)
def test_import_postgres_db(self):
    # create a user
    with self.app.app_context():
        one_id = db_user.create(1, 'test_user')
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # do a db dump and reset the db
        private_dump, private_ts_dump, public_dump, public_ts_dump = db_dump.dump_postgres_db(self.tempdir)
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        # import the dump
        db_dump.import_postgres_dump(private_dump, None, public_dump, None)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # reset again, and use more threads to import
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        db_dump.import_postgres_dump(private_dump, None, public_dump, None, threads=2)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        two_id = db_user.create(2, 'vnskprk')
        self.assertGreater(two_id, one_id)
def test_dump_recording_feedback(self):
    # create a user
    with self.app.app_context():
        one_id = db_user.create(1, 'test_user')
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # insert a feedback record
        feedback = Feedback(
            user_id=one_id,
            recording_msid="d23f4719-9212-49f0-ad08-ddbfbfc50d6f",
            score=1,
        )
        db_feedback.insert(feedback)

        # do a db dump and reset the db
        private_dump, private_ts_dump, public_dump, public_ts_dump = db_dump.dump_postgres_db(self.tempdir)
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)
        self.assertEqual(db_feedback.get_feedback_count_for_user(user_id=one_id), 0)

        # import the dump and check the records are inserted
        db_dump.import_postgres_dump(private_dump, None, public_dump, None)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        dumped_feedback = db_feedback.get_feedback_for_user(user_id=one_id, limit=1, offset=0)
        self.assertEqual(len(dumped_feedback), 1)
        self.assertEqual(dumped_feedback[0].user_id, feedback.user_id)
        self.assertEqual(dumped_feedback[0].recording_msid, feedback.recording_msid)
        self.assertEqual(dumped_feedback[0].score, feedback.score)

        # reset again, and use more threads to import
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)
        dumped_feedback = []

        db_dump.import_postgres_dump(private_dump, None, public_dump, None, threads=2)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        dumped_feedback = db_feedback.get_feedback_for_user(user_id=one_id, limit=1, offset=0)
        self.assertEqual(len(dumped_feedback), 1)
        self.assertEqual(dumped_feedback[0].user_id, feedback.user_id)
        self.assertEqual(dumped_feedback[0].recording_msid, feedback.recording_msid)
        self.assertEqual(dumped_feedback[0].score, feedback.score)
def test_import_postgres_db(self):
    # create a user
    db_user.create('test_user')
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 1)

    # do a db dump and reset the db
    location = db_dump.dump_postgres_db(self.tempdir)
    self.reset_db()
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 0)

    # import the dump
    db_dump.import_postgres_dump(location)
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 1)

    # reset again, and use more threads to import
    self.reset_db()
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 0)

    db_dump.import_postgres_dump(location, threads=2)
    user_count = db_user.get_user_count()
    self.assertEqual(user_count, 1)
def test_dump_postgres_db_table_entries(self):
    db_user.create('test_user')
    timestamp = datetime.today()
    location = db_dump.dump_postgres_db(self.tempdir, dump_time=timestamp)
    dump_entries = db_dump.get_dump_entries()
    self.assertEqual(len(dump_entries), 1)
    self.assertEqual(dump_entries[0]['created'].strftime('%s'), timestamp.strftime('%s'))
def test_import_postgres_db(self):
    # create a user
    with self.app.app_context():
        one_id = db_user.create(1, 'test_user')
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # do a db dump and reset the db
        private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        # import the dump
        db_dump.import_postgres_dump(private_dump, public_dump)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # reset again, and use more threads to import
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        db_dump.import_postgres_dump(private_dump, public_dump, threads=2)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        two_id = db_user.create(2, 'vnskprk')
        self.assertGreater(two_id, one_id)
def test_dump_postgres_db_table_entries(self):
    with self.app.app_context():
        db_user.create(1, 'test_user')
        timestamp = datetime.today()
        location = db_dump.dump_postgres_db(self.tempdir, dump_time=timestamp)
        dump_entries = db_dump.get_dump_entries()
        self.assertEqual(len(dump_entries), 1)
        self.assertEqual(dump_entries[0]['created'].strftime('%s'), timestamp.strftime('%s'))
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump,
        a statistics dump and a dump of the actual listens from the listenstore

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use during compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error("Cannot create full dump with last dump's ID, no dump exists!")
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)

        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=ts)
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 12):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        sys.exit(0)
def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump: bool, do_db_dump: bool):
    """ Create a ListenBrainz data dump which includes a private dump,
        a statistics dump and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use during compression
            dump_id (int): the ID of the ListenBrainz data dump
            do_listen_dump: If True, make a listens dump
            do_spark_dump: If True, make a spark listens dump
            do_db_dump: If True, make a public/private postgres/timescale dump
    """
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        expected_num_dumps = 0
        if do_db_dump:
            db_dump.dump_postgres_db(dump_path, end_time, threads)
            expected_num_dumps += 4
        if do_listen_dump:
            ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
            expected_num_dumps += 1
        if do_spark_dump:
            ls.dump_listens_for_spark(dump_path, dump_id=dump_id, dump_type="full", end_time=end_time)
            expected_num_dumps += 1

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            sys.exit(-1)

        try:
            # an archive plus md5 and sha256 files for each expected dump
            expected_num_dump_files = expected_num_dumps * 3
            if not sanity_check_dumps(dump_path, expected_num_dump_files):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        sys.exit(0)
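Every variant above calls write_hashes(dump_path) before the IOError handler, and the final create_full expects expected_num_dumps * 3 files (the archive plus its .md5 and .sha256) when it runs sanity_check_dumps. The helper itself is not shown in this section; the following is a minimal sketch of what such a helper could look like, assuming it simply writes one .md5 and one .sha256 file next to each archive in the dump directory. The file layout and naming here are assumptions for illustration, not the actual ListenBrainz implementation.

import hashlib
import os


def write_hashes(location):
    """Hypothetical sketch: write <name>.md5 and <name>.sha256 for every
    file in `location`. The real helper in the ListenBrainz dump manager
    may differ in naming and error handling.
    """
    for entry in os.listdir(location):
        path = os.path.join(location, entry)
        # skip directories and previously written hash files
        if not os.path.isfile(path) or entry.endswith(('.md5', '.sha256')):
            continue
        for algorithm in ('md5', 'sha256'):
            hasher = hashlib.new(algorithm)
            # hash the archive in chunks to avoid loading it into memory
            with open(path, 'rb') as archive:
                for chunk in iter(lambda: archive.read(1024 * 1024), b''):
                    hasher.update(chunk)
            with open('{}.{}'.format(path, algorithm), 'w') as hash_file:
                hash_file.write('{}  {}\n'.format(hasher.hexdigest(), entry))

Under this assumption, one dumped archive yields three files on disk, which is consistent with the expected_num_dumps * 3 count that sanity_check_dumps receives in the final version of create_full.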