Example #1
    def test_create_incremental_dump_with_id(self):

        # if the dump ID does not exist, it should exit with a -1
        result = self.runner.invoke(
            dump_manager.create_incremental,
            ['--location', self.tempdir, '--dump-id', 1000])
        self.assertEqual(result.exit_code, -1)

        # create a base dump entry
        t = int(time.time())
        db_dump.add_dump_entry(t)
        sleep(1)
        self.listenstore.insert(generate_data(1, self.user_name, 1500000000,
                                              5))
        sleep(1)

        # create a new dump ID to recreate later
        dump_id = db_dump.add_dump_entry(int(time.time()))
        # now, create a dump with that specific dump id
        result = self.runner.invoke(
            dump_manager.create_incremental,
            ['--location', self.tempdir, '--dump-id', dump_id])
        self.assertEqual(len(os.listdir(self.tempdir)), 1)
        dump_name = os.listdir(self.tempdir)[0]
        created_dump_id = int(dump_name.split('-')[2])
        self.assertEqual(dump_id, created_dump_id)

        # dump should contain the listens archive and the spark archive
        archive_count = 0
        for file_name in os.listdir(os.path.join(self.tempdir, dump_name)):
            if file_name.endswith('.tar.xz'):
                archive_count += 1
        self.assertEqual(archive_count, 2)
Example #2
 def test_copy_table(self):
     db_dump.add_dump_entry(datetime.today().strftime('%s'))
     with db.engine.connect() as connection:
         db_dump.copy_table(
             cursor=connection.connection.cursor(),
             location=self.tempdir,
             columns='id, created',
             table_name='data_dump',
         )
     dumps = db_dump.get_dump_entries()
     with open(os.path.join(self.tempdir, 'data_dump'), 'r') as f:
         file_contents = [line for line in f]
     self.assertEqual(len(dumps), len(file_contents))
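The test above drives db_dump.copy_table with a raw psycopg2 cursor and expects one output line per dump entry in a file named after the table. As a rough illustration only (not the actual ListenBrainz implementation), a helper with that behaviour could be built on psycopg2's copy_expert:

# Hypothetical sketch of a copy_table-style helper; the real
# db_dump.copy_table may differ. It streams the selected columns of a
# table into a text file named after the table, one row per line.
import os


def copy_table(cursor, location, columns, table_name):
    with open(os.path.join(location, table_name), 'w') as f:
        cursor.copy_expert(
            'COPY (SELECT {columns} FROM {table}) TO STDOUT'.format(
                columns=columns, table=table_name), f)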
Example #4
    def test_create_incremental(self, mock_notify):
        # create an incremental dump; this won't work because the incremental dump does
        # not have a previous dump
        result = self.runner.invoke(dump_manager.create_incremental,
                                    ['--location', self.tempdir])
        self.assertEqual(result.exit_code, -1)
        self.assertEqual(len(os.listdir(self.tempdir)), 0)

        base = int(time.time())
        dump_id = db_dump.add_dump_entry(base - 60)
        print("%d dump id" % dump_id)
        sleep(1)
        self.listenstore.insert(generate_data(1, self.user_name, base - 30, 5))
        result = self.runner.invoke(dump_manager.create_incremental,
                                    ['--location', self.tempdir])
        self.assertEqual(len(os.listdir(self.tempdir)), 1)
        dump_name = os.listdir(self.tempdir)[0]
        mock_notify.assert_called_with(dump_name, 'incremental')

        # created dump ID should be one greater than previous dump's ID
        created_dump_id = int(dump_name.split('-')[2])
        print("%d created dump id" % created_dump_id)
        self.assertEqual(created_dump_id, dump_id + 1)

        # make sure that the dump contains a full listens and spark dump
        archive_count = 0
        for file_name in os.listdir(os.path.join(self.tempdir, dump_name)):
            if file_name.endswith('.tar.xz'):
                archive_count += 1
        self.assertEqual(archive_count, 2)
Example #5
    def test_create_full_dump_with_id(self):

        self.listenstore.insert(generate_data(1, self.user_name, 1500000000,
                                              5))
        # if the dump ID does not exist, it should exit with a -1
        result = self.runner.invoke(
            dump_manager.create_full,
            ['--location', self.tempdir, '--dump-id', 1000])
        self.assertEqual(result.exit_code, -1)
        # make sure no directory was created either
        self.assertEqual(len(os.listdir(self.tempdir)), 0)

        # now, add a dump entry to the database and create a dump with that specific dump id
        dump_id = db_dump.add_dump_entry(int(time.time()))
        result = self.runner.invoke(
            dump_manager.create_full,
            ['--location', self.tempdir, '--dump-id', dump_id])
        self.assertEqual(len(os.listdir(self.tempdir)), 1)
        dump_name = os.listdir(self.tempdir)[0]
        created_dump_id = int(dump_name.split('-')[2])
        self.assertEqual(dump_id, created_dump_id)

        # dump should contain the 4 archives
        archive_count = 0
        for file_name in os.listdir(os.path.join(self.tempdir, dump_name)):
            if file_name.endswith('.tar.xz'):
                archive_count += 1
        self.assertEqual(archive_count, 4)
Example #6
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None: # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)
        dump_path = os.path.join(location, 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads, spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
Example #7
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        dump_path = os.path.join(
            location, 'listenbrainz-dump-{dump_id}-{time}-full'.format(
                dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        end_time=end_time,
                        threads=threads,
                        spark_format=False)
        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        end_time=end_time,
                        threads=threads,
                        spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
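The tests in the earlier examples invoke dump_manager.create_incremental and dump_manager.create_full through click's CliRunner, so in the real module these functions carry click decorators that are not shown in these excerpts. Below is a minimal sketch of how such wiring might look; the option names are inferred from the test invocations and the function signatures, and the defaults are made up, so treat all of it as assumptions rather than the actual source.

# Hypothetical click wiring for the dump commands above; option names
# are inferred from the tests, defaults are invented for the sketch.
import os
import click

cli = click.Group(name="dump_manager")


@cli.command(name="create_full")
@click.option("--location", default=os.path.join(os.getcwd(), "listenbrainz-export"))
@click.option("--threads", type=int, default=4)
@click.option("--dump-id", type=int, default=None)
@click.option("--last-dump-id", is_flag=True)
def create_full_command(location, threads, dump_id, last_dump_id):
    # delegate to the create_full function defined in the example above
    create_full(location, threads, dump_id, last_dump_id)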
Example #8
    def test_dump_get_200(self):
        t0 = datetime.now()
        dump_id = db_dump.add_dump_entry(int(t0.strftime("%s")))
        r = self.client.get("/1/status/get-dump-info",
                            query_string={"id": dump_id})
        self.assert200(r)
        self.assertDictEqual(r.json, {
            "id": dump_id,
            "timestamp": t0.strftime("%Y%m%d-%H%M%S"),
        })

        # should return the latest dump if no dump ID passed
        t1 = t0 + timedelta(seconds=1)
        dump_id_1 = db_dump.add_dump_entry(int(t1.strftime("%s")))
        r = self.client.get("/1/status/get-dump-info")
        self.assert200(r)
        self.assertDictEqual(r.json, {
            "id": dump_id_1,
            "timestamp": t1.strftime("%Y%m%d-%H%M%S"),
        })
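The endpoint exercised here returns the requested dump entry, or the most recent one when no id parameter is given. A rough sketch of a handler with that behaviour follows; it is not the actual ListenBrainz view, and the blueprint name, import paths, error format, and the assumption that get_dump_entries returns the newest entry first are all taken on faith from the surrounding examples.

# Hypothetical get-dump-info handler consistent with the test above.
# Blueprint name, import paths and error format are assumptions.
from flask import Blueprint, jsonify, request

import listenbrainz.db.dump as db_dump  # import path assumed

status_api_bp = Blueprint('status_api_v1', __name__)


@status_api_bp.route('/get-dump-info')
def get_dump_info():
    dump_id = request.args.get('id')
    if dump_id is None:
        entries = db_dump.get_dump_entries()  # assumed to be ordered newest first
        dump = entries[0] if entries else None
    else:
        dump = db_dump.get_dump_entry(int(dump_id))
    if dump is None:
        return jsonify({'code': 404, 'error': 'No dump exists with the given ID'}), 404
    return jsonify({
        'id': dump['id'],
        'timestamp': dump['created'].strftime('%Y%m%d-%H%M%S'),
    })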
Example #9
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None: # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-incremental.tar.xz'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
Example #10
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)

        listens_dump_file = ls.dump_listens(dump_path,
                                            dump_id=dump_id,
                                            end_time=end_time,
                                            threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=ts)
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file,
                                                      spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 12):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        sys.exit(0)
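sanity_check_dumps is called here with an expected count of 12 files, presumably an archive plus its .md5 and .sha256 companions for each part of the dump. A minimal sketch of what such a check could look like, assuming it only has to count the files and reject empty ones (the real helper may do more):

# Hypothetical sanity_check_dumps-style helper: verify the dump
# directory holds the expected number of files and none are empty.
import os


def sanity_check_dumps(dump_path, expected_count):
    files = os.listdir(dump_path)
    if len(files) != expected_count:
        return False
    for file_name in files:
        if os.path.getsize(os.path.join(dump_path, file_name)) == 0:
            return False
    return True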
Example #11
def create_full(location, threads, dump_id, do_listen_dump: bool,
                do_spark_dump: bool, do_db_dump: bool):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used while compression
            dump_id (int): the ID of the ListenBrainz data dump
            do_listen_dump: If True, make a listens dump
            do_spark_dump: If True, make a spark listens dump
            do_db_dump: If True, make a public/private postgres/timescale dump
    """
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        expected_num_dumps = 0
        if do_db_dump:
            db_dump.dump_postgres_db(dump_path, end_time, threads)
            expected_num_dumps += 4
        if do_listen_dump:
            ls.dump_listens(dump_path,
                            dump_id=dump_id,
                            end_time=end_time,
                            threads=threads)
            expected_num_dumps += 1
        if do_spark_dump:
            ls.dump_listens_for_spark(dump_path,
                                      dump_id=dump_id,
                                      dump_type="full",
                                      end_time=end_time)
            expected_num_dumps += 1

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            # each dump part yields three files: the archive plus its md5 and sha256 hashes
            expected_num_dump_files = expected_num_dumps * 3
            if not sanity_check_dumps(dump_path, expected_num_dump_files):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        sys.exit(0)
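Both variants call write_hashes before the sanity check; that call is what produces the .md5 and .sha256 companions counted above. A rough sketch of such a helper, assuming it simply hashes every file already present in the dump directory (the actual implementation may stream files and differ in naming):

# Hypothetical write_hashes-style helper: write <file>.md5 and
# <file>.sha256 next to every file already present in the directory.
import hashlib
import os


def write_hashes(location):
    for entry in os.listdir(location):
        path = os.path.join(location, entry)
        if not os.path.isfile(path):
            continue
        with open(path, 'rb') as f:
            contents = f.read()
        for algorithm in ('md5', 'sha256'):
            digest = hashlib.new(algorithm, contents).hexdigest()
            with open('{}.{}'.format(path, algorithm), 'w') as hash_file:
                hash_file.write(digest)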
Example #12
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!",
                                         dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:  # incremental dumps must have a previous dump in the series
            current_app.logger.error(
                "Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time,
                                end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        start_time=start_time,
                        end_time=end_time,
                        threads=threads)
        ls.dump_listens_for_spark(dump_path,
                                  dump_id=dump_id,
                                  dump_type="incremental",
                                  start_time=start_time,
                                  end_time=end_time)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 6):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s incremental\n" %
                    (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
        sys.exit(0)
Example #13
 def test_add_dump_entry(self):
     prev_dumps = db_dump.get_dump_entries()
     db_dump.add_dump_entry(datetime.today().strftime('%s'))
     now_dumps = db_dump.get_dump_entries()
     self.assertEqual(len(now_dumps), len(prev_dumps) + 1)
Example #14
 def test_add_dump_entry(self):
     prev_dumps = db_dump.get_dump_entries()
     db_dump.add_dump_entry()
     now_dumps = db_dump.get_dump_entries()
     self.assertEqual(len(now_dumps), len(prev_dumps) + 1)
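These tests only assert that add_dump_entry grows the data_dump table by one row (the newer variant no longer takes a timestamp argument). For orientation, here is a sketch of what the older, timestamp-taking variant might look like with SQLAlchemy; the table and column names come from the copy_table test above, everything else is an assumption.

# Hypothetical add_dump_entry sketch (SQLAlchemy); the real function in
# the ListenBrainz db module may differ. Inserts a row into data_dump
# and returns its generated ID.
import sqlalchemy

from listenbrainz import db  # engine setup assumed to exist, as in the tests above


def add_dump_entry(timestamp):
    with db.engine.connect() as connection:
        result = connection.execute(sqlalchemy.text("""
            INSERT INTO data_dump (created)
                 VALUES (to_timestamp(:ts))
              RETURNING id
        """), {'ts': int(timestamp)})
        return result.fetchone()[0]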