Example #1
def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
    """
    app = create_app()
    with app.app_context():
        ls = init_influx_connection(
            current_app.logger, {
                'REDIS_HOST': current_app.config['REDIS_HOST'],
                'REDIS_PORT': current_app.config['REDIS_PORT'],
                'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
                'INFLUX_HOST': current_app.config['INFLUX_HOST'],
                'INFLUX_PORT': current_app.config['INFLUX_PORT'],
                'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
            })
        time_now = datetime.today()
        dump_path = os.path.join(
            location, 'listenbrainz-dump-{time}'.format(
                time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, time_now, threads)
        ls.dump_listens(dump_path, time_now, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
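
write_hashes is used by every dump command here but is not shown in these examples. A minimal sketch, assuming it writes an .md5 and .sha256 companion file for each file in the dump directory (naming and layout are assumptions):

# Hypothetical sketch of write_hashes, not the actual ListenBrainz helper.
import hashlib
import os

def write_hashes(location):
    # write <file>.md5 and <file>.sha256 next to every regular file in `location`
    for entry in os.listdir(location):
        path = os.path.join(location, entry)
        if not os.path.isfile(path) or entry.endswith(('.md5', '.sha256')):
            continue
        for algorithm in ('md5', 'sha256'):
            digest = hashlib.new(algorithm)
            with open(path, 'rb') as f:
                for chunk in iter(lambda: f.read(1024 * 1024), b''):
                    digest.update(chunk)
            with open('{}.{}'.format(path, algorithm), 'w') as out:
                out.write('{}  {}\n'.format(digest.hexdigest(), entry))
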
def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
    """
    db.init_db_connection(config.SQLALCHEMY_DATABASE_URI)
    ls = init_influx_connection(log,  {
        'REDIS_HOST': config.REDIS_HOST,
        'REDIS_PORT': config.REDIS_PORT,
        'REDIS_NAMESPACE': config.REDIS_NAMESPACE,
        'INFLUX_HOST': config.INFLUX_HOST,
        'INFLUX_PORT': config.INFLUX_PORT,
        'INFLUX_DB_NAME': config.INFLUX_DB_NAME,
    })
    time_now = datetime.today()
    dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(time=time_now.strftime('%Y%m%d-%H%M%S')))
    create_path(dump_path)
    db_dump.dump_postgres_db(dump_path, time_now, threads)
    ls.dump_listens(dump_path, time_now, threads)
    try:
        write_hashes(dump_path)
    except IOError as e:
        log.error('Unable to create hash files! Error: %s', str(e))
        return
    log.info('Dumps created and hashes written at %s' % dump_path)
def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
    """
    app = create_app()
    with app.app_context():
        ls = init_influx_connection(current_app.logger,  {
            'REDIS_HOST': current_app.config['REDIS_HOST'],
            'REDIS_PORT': current_app.config['REDIS_PORT'],
            'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
            'INFLUX_HOST': current_app.config['INFLUX_HOST'],
            'INFLUX_PORT': current_app.config['INFLUX_PORT'],
            'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
        })
        time_now = datetime.today()
        dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, time_now, threads)
        ls.dump_listens(dump_path, time_now, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
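
create_path is called in every example; what it presumably does is wrap os.makedirs (an assumption, the real helper may differ):

# Hypothetical sketch of create_path.
import os

def create_path(path):
    # create the directory and any missing parents; do nothing if it already exists
    os.makedirs(path, exist_ok=True)
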
Example #4
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None: # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)
        dump_path = os.path.join(location, 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads, spark_format=False)
        ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads, spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
Example #5
def create_spark_dump(location, threads):
    with create_app().app_context():
        ls = init_influx_connection(
            current_app.logger, {
                'REDIS_HOST': current_app.config['REDIS_HOST'],
                'REDIS_PORT': current_app.config['REDIS_PORT'],
                'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
                'INFLUX_HOST': current_app.config['INFLUX_HOST'],
                'INFLUX_PORT': current_app.config['INFLUX_PORT'],
                'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
            })
        time_now = datetime.today()
        dump_path = os.path.join(
            location, 'listenbrainz-spark-dump-{time}'.format(
                time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        ls.dump_listens(dump_path, time_now, threads, spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            return
        current_app.logger.info('Dump created and hash written at %s',
                                dump_path)
Example #6
    def write_listens_to_dump(self, listens_path, users, tar, archive_name,
                              start_time, end_time):
        """ Write listens into the ListenBrainz dump.

        Args:
            listens_path (str): the path where listens should be kept before adding to the archive
            users (List[dict]): a list of all users
            tar (TarFile obj): the tar obj to which listens should be added
            archive_name (str): the name of the archive
            start_time and end_time: the range of time for which listens are to be dumped
        """
        dump_complete = False
        next_user_id = 0
        index = {}
        while not dump_complete:
            file_uuid = str(uuid.uuid4())
            file_name = file_uuid + '.listens'
            # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
            file_directory = os.path.join(file_name[0], file_name[0:2])
            tmp_directory = os.path.join(listens_path, file_directory)
            create_path(tmp_directory)
            tmp_file_path = os.path.join(tmp_directory, file_name)
            archive_file_path = os.path.join(archive_name, 'listens',
                                             file_directory, file_name)
            with open(tmp_file_path, 'w') as f:
                file_done = False
                while next_user_id < len(users):
                    if f.tell() > DUMP_FILE_SIZE_LIMIT:
                        file_done = True
                        break

                    username = users[next_user_id]['musicbrainz_id']
                    offset = f.tell()
                    size = self.dump_user(username=username,
                                          fileobj=f,
                                          start_time=start_time,
                                          end_time=end_time)
                    index[username] = {
                        'file_name': file_uuid,
                        'offset': offset,
                        'size': size,
                    }
                    next_user_id += 1
                    self.log.info("%d users done. Total: %d", next_user_id,
                                  len(users))

            if file_done:
                tar.add(tmp_file_path, arcname=archive_file_path)
                os.remove(tmp_file_path)
                continue

            if next_user_id == len(users):
                if not file_done:  # if this was the last user and file hasn't been added, add it
                    tar.add(tmp_file_path, arcname=archive_file_path)
                    os.remove(tmp_file_path)
                dump_complete = True
                break

        return index
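
The returned index maps each user to a file UUID, a byte offset and a size, and is later written to index.json. A reader can use it to pull one user's listens out of an extracted dump without scanning every file; a minimal sketch assuming the layout written above (listens/<uuid[0]>/<uuid[0:2]>/<uuid>.listens with index.json at the dump root):

# Hypothetical reader for an extracted listens dump; layout assumed from the writer above.
import json
import os

def read_user_listens(dump_root, username):
    with open(os.path.join(dump_root, 'index.json')) as f:
        index = json.load(f)
    entry = index[username]
    file_name = entry['file_name'] + '.listens'
    path = os.path.join(dump_root, 'listens', file_name[0], file_name[0:2], file_name)
    with open(path) as f:
        # seek to the user's block and read exactly the recorded number of bytes
        f.seek(entry['offset'])
        return f.read(entry['size'])
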
    def dump_listens(self, location, dump_id, start_time=datetime.utcfromtimestamp(0), end_time=None,
                     threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Dumps all listens in the ListenStore into a .tar.xz archive.

        Files are created with UUIDs as names. Each file can contain listens for a number of users.
        An index.json file is used to save which file contains the listens of which users.

        This creates an incremental dump if start_time is specified (with range start_time to end_time),
        otherwise it creates a full dump with all listens.

        Args:
            location: the directory where the listens dump archive should be created
            dump_id (int): the ID of the dump in the dump sequence
            start_time and end_time (datetime): the time range for which listens should be dumped
                start_time defaults to utc 0 (meaning a full dump) and end_time defaults to the current time
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
        """

        if end_time is None:
            end_time = datetime.now()

        self.log.info('Beginning dump of listens from TimescaleDB...')
        full_dump = bool(start_time == datetime.utcfromtimestamp(0))
        archive_name = 'listenbrainz-listens-dump-{dump_id}-{time}'.format(dump_id=dump_id,
                                                                           time=end_time.strftime('%Y%m%d-%H%M%S'))
        if full_dump:
            archive_name = '{}-full'.format(archive_name)
        else:
            archive_name = '{}-incremental'.format(archive_name)
        archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))
        with open(archive_path, 'w') as archive:

            pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
            pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

            with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

                temp_dir = os.path.join(self.dump_temp_dir_root, str(uuid.uuid4()))
                create_path(temp_dir)
                self.write_dump_metadata(archive_name, start_time, end_time, temp_dir, tar, full_dump)

                listens_path = os.path.join(temp_dir, 'listens')
                self.write_listens(listens_path, tar, archive_name, start_time, end_time)

                # remove the temporary directory
                shutil.rmtree(temp_dir)

            pxz.stdin.close()

        pxz.wait()
        self.log.info('ListenBrainz listen dump done!')
        self.log.info('Dump present at %s!', archive_path)
        return archive_path
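
The result is an xz-compressed tar, so a finished archive can be inspected with the standard library, for example (the file name here is only illustrative of the naming scheme above):

# List the members of a finished listens dump archive.
import tarfile

with tarfile.open('listenbrainz-listens-dump-1-20200101-000000-full.tar.xz', 'r:xz') as tar:
    for member in tar.getmembers():
        print(member.name, member.size)
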
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        dump_path = os.path.join(
            location, 'listenbrainz-dump-{dump_id}-{time}-full'.format(
                dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        end_time=end_time,
                        threads=threads,
                        spark_format=False)
        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        end_time=end_time,
                        threads=threads,
                        spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
Example #9
def create_spark_dump(location, threads):
    with create_app().app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        time_now = datetime.today()
        dump_path = os.path.join(location, 'listenbrainz-spark-dump-{time}'.format(time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        ls.dump_listens(dump_path, time_now, threads, spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dump created and hash written at %s', dump_path)
    def test_cleanup_dumps(self):
        create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000001'))
        create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000002'))
        create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000003'))
        create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000004'))
        create_path(os.path.join(self.tempdir, 'not-a-dump'))

        dump_manager._cleanup_dumps(self.tempdir)

        newdirs = os.listdir(self.tempdir)
        self.assertNotIn('listenbrainz-dump-20180312-000001', newdirs)
        self.assertNotIn('listenbrainz-dump-20180312-000002', newdirs)
        self.assertIn('listenbrainz-dump-20180312-000003', newdirs)
        self.assertIn('listenbrainz-dump-20180312-000004', newdirs)
        self.assertIn('not-a-dump', newdirs)
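
_cleanup_dumps itself is not shown. A rough sketch, assuming it removes all but the newest few directories matching the dump naming pattern and leaves anything else alone; the retention count and regex here are guesses derived only from what this test checks:

# Hypothetical sketch of _cleanup_dumps; retention policy is an assumption.
import os
import re
import shutil

DUMP_DIR_RE = re.compile(r'^listenbrainz-dump-\d{8}-\d{6}$')
NUMBER_OF_DUMPS_TO_KEEP = 2  # assumed value

def _cleanup_dumps(location):
    # timestamps sort lexicographically, so the newest dumps are last
    dumps = sorted(d for d in os.listdir(location) if DUMP_DIR_RE.match(d))
    for old_dump in dumps[:-NUMBER_OF_DUMPS_TO_KEEP]:
        shutil.rmtree(os.path.join(location, old_dump))
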
    def write_spark_listens_to_disk(self, unwritten_listens, temp_dir):
        """ Write spark listens to disk in year/month directory format.

        Args:
            unwritten_listens: a dict of the form {year: {month: [listen, ...]}}
            temp_dir: the dir into which listens should be written
        """
        for year in unwritten_listens:
            for month in unwritten_listens[year]:
                if year < 2002:
                    directory = temp_dir
                    filename = os.path.join(directory, 'invalid.json')
                else:
                    directory = os.path.join(temp_dir, str(year))
                    filename = os.path.join(directory, '{}.json'.format(str(month)))
                create_path(directory)
                with open(filename, 'a') as f:
                    f.write('\n'.join([ujson.dumps(listen) for listen in unwritten_listens[year][month]]))
                    f.write('\n')
    def write_incremental_listens_to_disk(self, listens, temp_dir):
        """ Write all spark listens in year/month dir format to disk.

        Args:
            listens : the listens to be written into the disk
            temp_dir: the dir into which listens should be written
        """
        for year in listens:
            for month in listens[year]:
                if year < 2002:
                    directory = temp_dir
                    filename = os.path.join(directory, 'invalid.json')
                else:
                    directory = os.path.join(temp_dir, str(year))
                    filename = os.path.join(directory, '{}.json'.format(str(month)))
                create_path(directory)
                with open(filename, 'a') as f:
                    f.write('\n'.join([ujson.dumps(listen) for listen in listens[year][month]]))
                    f.write('\n')
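
Both writers above expect listens already grouped as {year: {month: [listen, ...]}}. A small sketch of building that structure from a flat list of listen dicts carrying a unix 'listened_at' timestamp (the field name is an assumption):

# Hypothetical grouping helper for the year/month writers above.
from collections import defaultdict
from datetime import datetime

def group_listens_by_year_and_month(listens):
    grouped = defaultdict(lambda: defaultdict(list))
    for listen in listens:
        dt = datetime.utcfromtimestamp(listen['listened_at'])
        grouped[dt.year][dt.month].append(listen)
    return grouped
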
    def test_cleanup_dumps(self):
        create_path(
            os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000001'))
        create_path(
            os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000002'))
        create_path(
            os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000003'))
        create_path(
            os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000004'))
        create_path(os.path.join(self.tempdir, 'not-a-dump'))

        dump_manager._cleanup_dumps(self.tempdir)

        newdirs = os.listdir(self.tempdir)
        self.assertNotIn('listenbrainz-dump-20180312-000001', newdirs)
        self.assertNotIn('listenbrainz-dump-20180312-000002', newdirs)
        self.assertIn('listenbrainz-dump-20180312-000003', newdirs)
        self.assertIn('listenbrainz-dump-20180312-000004', newdirs)
        self.assertIn('not-a-dump', newdirs)
Example #14
def create_feedback(location, threads):
    """ Create a spark formatted dump of user/recommendation feedback data.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
    """
    app = create_app()
    with app.app_context():

        end_time = datetime.now()
        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-feedback-{time}-full'.format(time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        db_dump.dump_feedback_for_spark(dump_path, end_time, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 3):
                sys.exit(-1)
        except OSError as e:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'feedback')

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s 0 feedback\n" % (end_time.strftime('%Y%m%d-%H%M%S')))

        current_app.logger.info(
            'Feedback dump created and hashes written at %s' % dump_path)

        sys.exit(0)
    def write_listens_full(self, listens_path, users, dump_time):
        dump_complete = False
        next_user_id = 0
        index = {}
        while not dump_complete:
            file_name = str(uuid.uuid4())
            # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
            directory = os.path.join(listens_path, file_name[0],
                                     file_name[0:2])
            create_path(directory)
            file_path = os.path.join(directory,
                                     '{uuid}.listens'.format(uuid=file_name))
            with open(file_path, 'w') as f:
                file_done = False
                while next_user_id < len(users):
                    if f.tell() > DUMP_FILE_SIZE_LIMIT:
                        file_done = True
                        break

                    username = users[next_user_id]['musicbrainz_id']
                    offset = f.tell()
                    size = self.dump_user(username=username,
                                          fileobj=f,
                                          dump_time=dump_time)
                    index[username] = {
                        'file_name': file_name,
                        'offset': offset,
                        'size': size,
                    }
                    next_user_id += 1
                    self.log.info("%d users done. Total: %d", next_user_id,
                                  len(users))

                if file_done:
                    continue

                if next_user_id == len(users):
                    dump_complete = True
                    break

        return index
Example #16
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None: # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-incremental.tar.xz'.format(dump_id=dump_id,
                           time=end_time.strftime('%Y%m%d-%H%M%S'))
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
Example #17
def _create_dump(location,
                 dump_type,
                 tables,
                 dump_time,
                 threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Creates a dump of the provided tables at the location passed

        Arguments:
            location: the path where the dump should be created
            dump_type: the type of data dump being made - private, public or feedback
            tables: a dict containing the names of the tables to be dumped as keys and the columns
                    to be dumped as values
            dump_time: the time at which the dump process was started
            threads: the maximum number of threads to use for compression

        Returns:
            the path to the archive file created
    """

    archive_name = 'listenbrainz-{dump_type}-dump-{time}'.format(
        dump_type=dump_type, time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(
        location, '{archive_name}.tar.xz'.format(archive_name=archive_name, ))

    with open(archive_path, 'w') as archive:

        pxz_command = [
            'pxz', '--compress', '-T{threads}'.format(threads=threads)
        ]
        pxz = subprocess.Popen(pxz_command,
                               stdin=subprocess.PIPE,
                               stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

            temp_dir = tempfile.mkdtemp()

            try:
                schema_seq_path = os.path.join(temp_dir, "SCHEMA_SEQUENCE")
                with open(schema_seq_path, "w") as f:
                    f.write(str(db.SCHEMA_VERSION))
                tar.add(schema_seq_path,
                        arcname=os.path.join(archive_name, "SCHEMA_SEQUENCE"))
                timestamp_path = os.path.join(temp_dir, "TIMESTAMP")
                with open(timestamp_path, "w") as f:
                    f.write(dump_time.isoformat(" "))
                tar.add(timestamp_path,
                        arcname=os.path.join(archive_name, "TIMESTAMP"))
                tar.add(DUMP_LICENSE_FILE_PATH,
                        arcname=os.path.join(archive_name, "COPYING"))
            except IOError as e:
                current_app.logger.error(
                    'IOError while adding dump metadata: %s',
                    str(e),
                    exc_info=True)
                raise
            except Exception as e:
                current_app.logger.error(
                    'Exception while adding dump metadata: %s',
                    str(e),
                    exc_info=True)
                raise

            archive_tables_dir = os.path.join(temp_dir, 'lbdump', 'lbdump')
            create_path(archive_tables_dir)

            with db.engine.connect() as connection:
                if dump_type == "feedback":
                    dump_user_feedback(connection, location=archive_tables_dir)
                else:
                    with connection.begin() as transaction:
                        cursor = connection.connection.cursor()
                        for table in tables:
                            try:
                                copy_table(
                                    cursor=cursor,
                                    location=archive_tables_dir,
                                    columns=','.join(tables[table]),
                                    table_name=table,
                                )
                            except IOError as e:
                                current_app.logger.error(
                                    'IOError while copying table %s',
                                    table,
                                    exc_info=True)
                                raise
                            except Exception as e:
                                current_app.logger.error(
                                    'Error while copying table %s: %s',
                                    table,
                                    str(e),
                                    exc_info=True)
                                raise
                        transaction.rollback()

            tar.add(archive_tables_dir,
                    arcname=os.path.join(archive_name, 'lbdump'))

            shutil.rmtree(temp_dir)

        pxz.stdin.close()

    pxz.wait()
    return archive_path
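
copy_table is invoked once per table with the raw psycopg2 cursor obtained above. A minimal sketch, assuming it streams the selected columns out with PostgreSQL COPY into one file per table (file naming is an assumption):

# Hypothetical sketch of copy_table using psycopg2's copy_expert.
import os

def copy_table(cursor, location, columns, table_name):
    with open(os.path.join(location, table_name), 'w') as f:
        cursor.copy_expert(
            'COPY (SELECT {columns} FROM {table}) TO STDOUT'.format(
                columns=columns, table=table_name),
            f,
        )
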
    def dump_listens(self, location, dump_time=datetime.today(), threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Dumps all listens in the ListenStore into a .tar.xz archive.

        Files are created with UUIDs as names. Each file can contain listens for a number of users.
        An index.json file is used to save which file contains the listens of which users.

        Args:
            location: the directory where the listens dump archive should be created
            dump_time (datetime): the time at which the data dump was started
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
        """

        self.log.info('Beginning dump of listens from InfluxDB...')

        self.log.info('Getting list of users whose listens are to be dumped...')
        users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
        self.log.info('Total number of users: %d', len(users))

        archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
        archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))
        with open(archive_path, 'w') as archive:

            pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
            pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

            with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

                temp_dir = tempfile.mkdtemp()

                try:
                    # add timestamp
                    timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                    with open(timestamp_path, 'w') as f:
                        f.write(dump_time.isoformat(' '))
                    tar.add(timestamp_path,
                            arcname=os.path.join(archive_name, 'TIMESTAMP'))

                    # add schema version
                    schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                    with open(schema_version_path, 'w') as f:
                        f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                    tar.add(schema_version_path,
                            arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                    # add copyright notice
                    tar.add(DUMP_LICENSE_FILE_PATH,
                            arcname=os.path.join(archive_name, 'COPYING'))

                except IOError as e:
                    self.log.critical('IOError while writing metadata dump files: %s', str(e), exc_info=True)
                    raise
                except Exception as e:
                    self.log.error('Exception while adding dump metadata: %s', str(e), exc_info=True)
                    raise

                listens_path = os.path.join(temp_dir, 'listens')

                dump_complete = False
                next_user_id = 0
                index = {}
                while not dump_complete:
                    file_name = str(uuid.uuid4())
                    # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                    directory = os.path.join(listens_path, file_name[0], file_name[0:2])
                    create_path(directory)
                    file_path = os.path.join(directory, '{uuid}.listens'.format(uuid=file_name))
                    with open(file_path, 'w') as f:
                        file_done = False
                        while next_user_id < len(users):
                            if f.tell() > DUMP_FILE_SIZE_LIMIT:
                                file_done = True
                                break

                            username = users[next_user_id]['musicbrainz_id']
                            offset = f.tell()
                            size = self.dump_user(username=username, fileobj=f, dump_time=dump_time)
                            index[username] = {
                                'file_name': file_name,
                                'offset': offset,
                                'size': size,
                            }
                            next_user_id += 1

                        if file_done:
                            continue

                        if next_user_id == len(users):
                            dump_complete = True
                            break


                # add the listens directory to the archive
                self.log.info('Got all listens, adding them to the archive...')
                tar.add(listens_path,
                        arcname=os.path.join(archive_name, 'listens'))

                # add index.json file to the archive
                try:
                    index_path = os.path.join(temp_dir, 'index.json')
                    with open(index_path, 'w') as f:
                        f.write(ujson.dumps(index))
                    tar.add(index_path,
                            arcname=os.path.join(archive_name, 'index.json'))
                except IOError as e:
                    self.log.critical('IOError while writing index.json to archive: %s', str(e), exc_info=True)
                    raise
                except Exception as e:
                    self.log.error('Exception while adding index file to archive: %s', str(e), exc_info=True)
                    raise

                # remove the temporary directory
                shutil.rmtree(temp_dir)

            pxz.stdin.close()

        pxz.wait()
        self.log.info('ListenBrainz listen dump done!')
        self.log.info('Dump present at %s!', archive_path)
        return archive_path
def create_full(location, threads, dump_id, do_listen_dump: bool,
                do_spark_dump: bool, do_db_dump: bool):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
            dump_id (int): the ID of the ListenBrainz data dump
            do_listen_dump: If True, make a listens dump
            do_spark_dump: If True, make a spark listens dump
            do_db_dump: If True, make a public/private postgres/timescale dump
    """
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        expected_num_dumps = 0
        if do_db_dump:
            db_dump.dump_postgres_db(dump_path, end_time, threads)
            expected_num_dumps += 4
        if do_listen_dump:
            ls.dump_listens(dump_path,
                            dump_id=dump_id,
                            end_time=end_time,
                            threads=threads)
            expected_num_dumps += 1
        if do_spark_dump:
            ls.dump_listens_for_spark(dump_path,
                                      dump_id=dump_id,
                                      dump_type="full",
                                      end_time=end_time)
            expected_num_dumps += 1

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            # 6 types of dumps, archive, md5, sha256 for each
            expected_num_dump_files = expected_num_dumps * 3
            if not sanity_check_dumps(dump_path, expected_num_dump_files):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        sys.exit(0)
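
sanity_check_dumps is called with expected_num_dumps * 3 because each dump produces an archive plus its .md5 and .sha256 files. A sketch of a check along those lines, assuming it only counts files and rejects empty ones (the real check may do more):

# Hypothetical sketch of sanity_check_dumps.
import os

def sanity_check_dumps(dump_path, expected_count):
    files = [f for f in os.listdir(dump_path)
             if os.path.isfile(os.path.join(dump_path, f))]
    for f in files:
        if os.path.getsize(os.path.join(dump_path, f)) == 0:
            return False
    return len(files) == expected_count
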
    def dump_listens(self,
                     location,
                     dump_time=datetime.today(),
                     threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Dumps all listens in the ListenStore into a .tar.xz archive.

        Files are created with UUIDs as names. Each file can contain listens for a number of users.
        An index.json file is used to save which file contains the listens of which users.

        Args:
            location: the directory where the listens dump archive should be created
            dump_time (datetime): the time at which the data dump was started
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
        """

        self.log.info('Beginning dump of listens from InfluxDB...')

        self.log.info(
            'Getting list of users whose listens are to be dumped...')
        users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
        self.log.info('Total number of users: %d', len(users))

        archive_name = 'listenbrainz-listens-dump-{time}'.format(
            time=dump_time.strftime('%Y%m%d-%H%M%S'))
        archive_path = os.path.join(
            location, '{filename}.tar.xz'.format(filename=archive_name))
        with open(archive_path, 'w') as archive:

            pxz_command = [
                'pxz', '--compress', '-T{threads}'.format(threads=threads)
            ]
            pxz = subprocess.Popen(pxz_command,
                                   stdin=subprocess.PIPE,
                                   stdout=archive)

            with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

                temp_dir = tempfile.mkdtemp()

                try:
                    # add timestamp
                    timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                    with open(timestamp_path, 'w') as f:
                        f.write(dump_time.isoformat(' '))
                    tar.add(timestamp_path,
                            arcname=os.path.join(archive_name, 'TIMESTAMP'))

                    # add schema version
                    schema_version_path = os.path.join(temp_dir,
                                                       'SCHEMA_SEQUENCE')
                    with open(schema_version_path, 'w') as f:
                        f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                    tar.add(schema_version_path,
                            arcname=os.path.join(archive_name,
                                                 'SCHEMA_SEQUENCE'))

                    # add copyright notice
                    tar.add(DUMP_LICENSE_FILE_PATH,
                            arcname=os.path.join(archive_name, 'COPYING'))

                except IOError as e:
                    log_ioerrors(self.log, e)
                    raise
                except Exception as e:
                    self.log.error('Exception while adding dump metadata: %s',
                                   str(e))
                    raise

                listens_path = os.path.join(temp_dir, 'listens')

                dump_complete = False
                next_user_id = 0
                index = {}
                while not dump_complete:
                    file_name = str(uuid.uuid4())
                    # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                    directory = os.path.join(listens_path, file_name[0],
                                             file_name[0:2])
                    create_path(directory)
                    file_path = os.path.join(
                        directory, '{uuid}.listens'.format(uuid=file_name))
                    with open(file_path, 'w') as f:
                        file_done = False
                        while next_user_id < len(users):
                            if f.tell() > DUMP_FILE_SIZE_LIMIT:
                                file_done = True
                                break

                            username = users[next_user_id]['musicbrainz_id']
                            offset = f.tell()
                            size = self.dump_user(username=username,
                                                  fileobj=f,
                                                  dump_time=dump_time)
                            index[username] = {
                                'file_name': file_name,
                                'offset': offset,
                                'size': size,
                            }
                            next_user_id += 1

                        if file_done:
                            continue

                        if next_user_id == len(users):
                            dump_complete = True
                            break

                # add the listens directory to the archive
                self.log.info('Got all listens, adding them to the archive...')
                tar.add(listens_path,
                        arcname=os.path.join(archive_name, 'listens'))

                # add index.json file to the archive
                try:
                    index_path = os.path.join(temp_dir, 'index.json')
                    with open(index_path, 'w') as f:
                        f.write(ujson.dumps(index))
                    tar.add(index_path,
                            arcname=os.path.join(archive_name, 'index.json'))
                except IOError as e:
                    log_ioerrors(self.log, e)
                    raise
                except Exception as e:
                    self.log.error(
                        'Exception while adding index file to archive: %s',
                        str(e))
                    raise

                # remove the temporary directory
                shutil.rmtree(temp_dir)

            pxz.stdin.close()

        self.log.info('ListenBrainz listen dump done!')
        self.log.info('Dump present at %s!', archive_path)
        return archive_path
def create_incremental(location, threads, dump_id):
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!",
                                         dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:  # incremental dumps must have a previous dump in the series
            current_app.logger.error(
                "Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time,
                                end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        start_time=start_time,
                        end_time=end_time,
                        threads=threads)
        ls.dump_listens_for_spark(dump_path,
                                  dump_id=dump_id,
                                  dump_type="incremental",
                                  start_time=start_time,
                                  end_time=end_time)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 6):
                return sys.exit(-1)
        except OSError as e:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s incremental\n" %
                    (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
        sys.exit(0)
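
DUMP_ID.txt holds a single line of the form '<timestamp> <dump_id> <type>'. A sync script could read it back like this (a sketch, not the actual FTP tooling):

# Hypothetical reader for the DUMP_ID.txt file written by the dump commands above.
import os

def read_dump_id_file(dump_path):
    with open(os.path.join(dump_path, 'DUMP_ID.txt')) as f:
        timestamp, dump_id, dump_type = f.read().strip().split(' ', 2)
    return timestamp, int(dump_id), dump_type
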
Example #22
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)

        listens_dump_file = ls.dump_listens(dump_path,
                                            dump_id=dump_id,
                                            end_time=end_time,
                                            threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=ts)
        spark_dump_path = os.path.join(location, dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file,
                                                      spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 12):
                return sys.exit(-1)
        except OSError as e:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        sys.exit(0)
Example #23
    def test_cleanup_dumps(self):
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-1-20180312-000001-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-2-20180312-000002-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-3-20180312-000003-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-4-20180312-000004-full'))

        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-1-20180312-000001-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-2-20180312-000002-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-3-20180312-000003-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-4-20180312-000004-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-5-20180312-000005-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-6-20180312-000006-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-7-20180312-000007-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-99-20200124-000007-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-100-20200124-000008-incremental'))

        create_path(os.path.join(self.tempdir, 'not-a-dump'))

        dump_manager._cleanup_dumps(self.tempdir)

        newdirs = os.listdir(self.tempdir)
        self.assertNotIn('listenbrainz-dump-1-20180312-000001-full', newdirs)
        self.assertNotIn('listenbrainz-dump-2-20180312-000002-full', newdirs)

        self.assertIn('listenbrainz-dump-3-20180312-000003-full', newdirs)
        self.assertIn('listenbrainz-dump-4-20180312-000004-full', newdirs)

        self.assertNotIn('listenbrainz-dump-1-20180312-000001-incremental',
                         newdirs)
        self.assertNotIn('listenbrainz-dump-2-20180312-000002-incremental',
                         newdirs)
        self.assertNotIn('listenbrainz-dump-3-20180312-000003-incremental',
                         newdirs)

        self.assertIn('listenbrainz-dump-4-20180312-000004-incremental',
                      newdirs)
        self.assertIn('listenbrainz-dump-5-20180312-000005-incremental',
                      newdirs)
        self.assertIn('listenbrainz-dump-6-20180312-000006-incremental',
                      newdirs)
        self.assertIn('listenbrainz-dump-7-20180312-000007-incremental',
                      newdirs)
        self.assertIn('listenbrainz-dump-99-20200124-000007-incremental',
                      newdirs)
        self.assertIn('listenbrainz-dump-100-20200124-000008-incremental',
                      newdirs)

        self.assertIn('not-a-dump', newdirs)
    def dump_listens_for_spark(
            self,
            location,
            dump_id: int,
            dump_type: str,
            start_time: datetime = datetime.utcfromtimestamp(
                DATA_START_YEAR_IN_SECONDS),
            end_time: datetime = None):
        """ Dumps all listens in the ListenStore into spark parquet files in a .tar archive.

        Listens are dumped into files ideally no larger than 128MB, sorted from oldest to newest. Files
        are named #####.parquet with monotonically increasing integers starting with 0.

        This creates an incremental dump if start_time is specified (with range start_time to end_time),
        otherwise it creates a full dump with all listens.

        Args:
            location: the directory where the listens dump archive should be created
            dump_id: the ID of the dump in the dump sequence
            dump_type: type of dump, full or incremental
            start_time: the start of the time range for which listens should be dumped. defaults to
                utc 0 (meaning a full dump)
            end_time: the end of time range for which listens should be dumped. defaults to the current time

        Returns:
            the path to the dump archive
        """

        if end_time is None:
            end_time = datetime.now()

        self.log.info('Beginning spark dump of listens from TimescaleDB...')
        full_dump = bool(start_time == datetime.utcfromtimestamp(
            DATA_START_YEAR_IN_SECONDS))
        archive_name = 'listenbrainz-spark-dump-{dump_id}-{time}'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        if full_dump:
            archive_name = '{}-full'.format(archive_name)
        else:
            archive_name = '{}-incremental'.format(archive_name)
        archive_path = os.path.join(
            location, '{filename}.tar'.format(filename=archive_name))

        parquet_index = 0
        with tarfile.open(archive_path, "w") as tar:

            temp_dir = os.path.join(self.dump_temp_dir_root, str(uuid.uuid4()))
            create_path(temp_dir)
            self.write_dump_metadata(archive_name, start_time, end_time,
                                     temp_dir, tar, full_dump)

            for year in range(start_time.year, end_time.year + 1):
                if year == start_time.year:
                    start = start_time
                else:
                    start = datetime(year=year, day=1, month=1)
                if year == end_time.year:
                    end = end_time
                else:
                    end = datetime(year=year + 1, day=1, month=1)

                self.log.info("dump %s to %s" %
                              (start.strftime("%Y-%m-%d %H:%M:%S"),
                               end.strftime("%Y-%m-%d %H:%M:%S")))

                # This try block is here in an effort to expose bugs that occur during testing
                # Without it, tests sometimes pass and sometimes give totally unrelated errors.
                # Keeping this block should help with future testing...
                try:
                    parquet_index = self.write_parquet_files(
                        archive_name, temp_dir, tar, dump_type, start, end,
                        parquet_index)
                except Exception as err:
                    self.log.info("likely test failure: " + str(err))
                    raise

            shutil.rmtree(temp_dir)

        self.log.info('ListenBrainz spark listen dump done!')
        self.log.info('Dump present at %s!', archive_path)
        return archive_path
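
The year-by-year chunking used above can be read in isolation. The following is a minimal sketch under the same assumptions (datetime start/end arguments, calendar-year boundaries); yearly_ranges is a hypothetical helper written for this illustration and is not part of the ListenBrainz codebase.

from datetime import datetime


def yearly_ranges(start_time, end_time):
    """ Yield (start, end) pairs covering start_time..end_time, split on calendar-year
        boundaries, mirroring the loop in dump_listens_for_spark above.
    """
    for year in range(start_time.year, end_time.year + 1):
        start = start_time if year == start_time.year else datetime(year=year, month=1, day=1)
        end = end_time if year == end_time.year else datetime(year=year + 1, month=1, day=1)
        yield start, end


# example: a dump covering mid-2019 to early 2021 is written in three chunks
for start, end in yearly_ranges(datetime(2019, 7, 1), datetime(2021, 2, 1)):
    print(start, '->', end)
# 2019-07-01 00:00:00 -> 2020-01-01 00:00:00
# 2020-01-01 00:00:00 -> 2021-01-01 00:00:00
# 2021-01-01 00:00:00 -> 2021-02-01 00:00:00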
Example #25
0
def _create_dump(location, dump_type, tables, dump_time, threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Creates a dump of the provided tables at the location passed

        Arguments:
            location: the path where the dump should be created
            dump_type: the type of data dump being made - private or public
            tables: a dict containing the names of the tables to be dumped as keys and the columns
                    to be dumped as values
            dump_time: the time at which the dump process was started
            threads: the maximum number of threads to use for compression

        Returns:
            the path to the archive file created
    """

    archive_name = 'listenbrainz-{dump_type}-dump-{time}'.format(
        dump_type=dump_type,
        time=dump_time.strftime('%Y%m%d-%H%M%S')
    )
    archive_path = os.path.join(location, '{archive_name}.tar.xz'.format(
        archive_name=archive_name,
    ))

    with open(archive_path, 'w') as archive:

        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

            temp_dir = tempfile.mkdtemp()

            try:
                schema_seq_path = os.path.join(temp_dir, "SCHEMA_SEQUENCE")
                with open(schema_seq_path, "w") as f:
                    f.write(str(db.SCHEMA_VERSION))
                tar.add(schema_seq_path,
                        arcname=os.path.join(archive_name, "SCHEMA_SEQUENCE"))
                timestamp_path = os.path.join(temp_dir, "TIMESTAMP")
                with open(timestamp_path, "w") as f:
                    f.write(dump_time.isoformat(" "))
                tar.add(timestamp_path,
                        arcname=os.path.join(archive_name, "TIMESTAMP"))
                tar.add(DUMP_LICENSE_FILE_PATH,
                        arcname=os.path.join(archive_name, "COPYING"))
            except IOError as e:
                current_app.logger.error('IOError while adding dump metadata: %s', str(e), exc_info=True)
                raise
            except Exception as e:
                current_app.logger.error('Exception while adding dump metadata: %s', str(e), exc_info=True)
                raise


            archive_tables_dir = os.path.join(temp_dir, 'lbdump', 'lbdump')
            create_path(archive_tables_dir)


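            # each table is exported with PostgreSQL COPY through the raw psycopg2 cursor;
            # the surrounding transaction is rolled back at the end since the dump only reads data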
            with db.engine.connect() as connection:
                with connection.begin() as transaction:
                    cursor = connection.connection.cursor()
                    for table in tables:
                        try:
                            copy_table(
                                cursor=cursor,
                                location=archive_tables_dir,
                                columns=','.join(tables[table]),
                                table_name=table,
                            )
                        except IOError as e:
                            current_app.logger.error('IOError while copying table %s', table, exc_info=True)
                            raise
                        except Exception as e:
                            current_app.logger.error('Error while copying table %s: %s', table, str(e), exc_info=True)
                            raise
                    transaction.rollback()


            tar.add(archive_tables_dir, arcname=os.path.join(archive_name, 'lbdump'))

            shutil.rmtree(temp_dir)

        pxz.stdin.close()

    pxz.wait()
    return archive_path
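
The compress-while-writing pattern above, a tar stream piped into an external xz-style compressor, can be sketched on its own. write_compressed_tar and its parameters are invented for this illustration; plain xz stands in for pxz, called with the same --compress and -T flags that _create_dump passes above.

import subprocess
import tarfile


def write_compressed_tar(archive_path, files, threads=4):
    """ Stream a tar archive through an external compressor instead of writing an
        uncompressed tar to disk first, following the pxz/tarfile pipeline above.

        files is an iterable of (path, arcname) tuples to add to the archive.
    """
    with open(archive_path, 'wb') as archive:
        xz = subprocess.Popen(['xz', '--compress', '-T{threads}'.format(threads=threads)],
                              stdin=subprocess.PIPE, stdout=archive)
        with tarfile.open(fileobj=xz.stdin, mode='w|') as tar:
            for path, arcname in files:
                tar.add(path, arcname=arcname)
        xz.stdin.close()
        xz.wait()
    return archive_path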
Example #26
0
    def test_cleanup_dumps(self):
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-1-20180312-000001-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-2-20180312-000002-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-3-20180312-000003-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-4-20180312-000004-full'))

        for i in range(1, 50):
            create_path(
                os.path.join(
                    self.tempdir,
                    'listenbrainz-dump-%d-20180312-%06d-incremental' % (i, i)))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-99-20200124-000007-incremental'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-dump-100-20200124-000008-incremental'))

        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-feedback-20180312-000001-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-feedback-20180312-000002-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-feedback-20180312-000003-full'))
        create_path(
            os.path.join(self.tempdir,
                         'listenbrainz-feedback-20180312-000004-full'))

        create_path(os.path.join(self.tempdir, 'not-a-dump'))

        dump_manager._cleanup_dumps(self.tempdir)

        newdirs = os.listdir(self.tempdir)
        self.assertNotIn('listenbrainz-dump-1-20180312-000001-full', newdirs)
        self.assertNotIn('listenbrainz-dump-2-20180312-000002-full', newdirs)

        self.assertIn('listenbrainz-dump-3-20180312-000003-full', newdirs)
        self.assertIn('listenbrainz-dump-4-20180312-000004-full', newdirs)

        self.assertNotIn('listenbrainz-dump-1-20180312-000001-incremental',
                         newdirs)
        self.assertNotIn('listenbrainz-dump-2-20180312-000002-incremental',
                         newdirs)
        self.assertNotIn('listenbrainz-dump-3-20180312-000003-incremental',
                         newdirs)
        self.assertNotIn('listenbrainz-dump-21-20180312-000003-incremental',
                         newdirs)

        for i in range(22, 50):
            self.assertIn(
                'listenbrainz-dump-%d-20180312-%06d-incremental' % (i, i),
                newdirs)

        self.assertIn('listenbrainz-dump-99-20200124-000007-incremental',
                      newdirs)
        self.assertIn('listenbrainz-dump-100-20200124-000008-incremental',
                      newdirs)

        self.assertNotIn('listenbrainz-feedback-20180312-000001-full', newdirs)
        self.assertNotIn('listenbrainz-feedback-20180312-000002-full', newdirs)

        self.assertIn('listenbrainz-feedback-20180312-000003-full', newdirs)
        self.assertIn('listenbrainz-feedback-20180312-000004-full', newdirs)

        self.assertIn('not-a-dump', newdirs)
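
The assertions above imply a retention policy of roughly: keep the newest two full dumps, keep the newest thirty incremental dumps, prune feedback dumps the same way as full dumps, and leave anything that does not look like a dump untouched. The sketch below only illustrates that policy; it is not the real dump_manager._cleanup_dumps, the constant names and values are assumptions drawn from the test, and feedback dumps are omitted for brevity.

import os
import re
import shutil

FULL_DUMPS_TO_KEEP = 2          # the test keeps full dumps 3 and 4 and removes 1 and 2
INCREMENTAL_DUMPS_TO_KEEP = 30  # the test keeps incrementals 22-49 plus 99 and 100


def cleanup_dumps_sketch(location):
    full, incremental = [], []
    for entry in os.listdir(location):
        if re.match(r'^listenbrainz-dump-\d+-\d+-\d+-full$', entry):
            full.append(entry)
        elif re.match(r'^listenbrainz-dump-\d+-\d+-\d+-incremental$', entry):
            incremental.append(entry)
        # anything else (e.g. 'not-a-dump') is left untouched

    def dump_id(name):
        return int(name.split('-')[2])

    # delete everything except the newest N of each kind, newest meaning highest dump id
    for name in sorted(full, key=dump_id)[:-FULL_DUMPS_TO_KEEP]:
        shutil.rmtree(os.path.join(location, name))
    for name in sorted(incremental, key=dump_id)[:-INCREMENTAL_DUMPS_TO_KEEP]:
        shutil.rmtree(os.path.join(location, name))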
Example #27
0
    def dump_listens(self, location, dump_time=datetime.today(), threads=None):
        """ Fetches listens of each user from her measurement and dumps them into a file.
            These files are compressed into an archive.

        Args:
            location: the directory where the listens dump archive should be created
            dump_time (datetime): the time at which the data dump was started
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
        """

        self.log.info('Beginning dump of listens from InfluxDB...')

        self.log.info(
            'Getting list of users whose listens are to be dumped...')
        users = db_user.get_all_users()
        self.log.info('Total number of users: %d', len(users))

        archive_name = 'listenbrainz-listens-dump-{time}'.format(
            time=dump_time.strftime('%Y%m%d-%H%M%S'))
        archive_path = os.path.join(
            location, '{filename}.tar.xz'.format(filename=archive_name))
        with open(archive_path, 'w') as archive:

            pxz_command = ['pxz', '--compress']
            if threads is not None:
                pxz_command.append('-T{threads}'.format(threads=threads))

            pxz = subprocess.Popen(pxz_command,
                                   stdin=subprocess.PIPE,
                                   stdout=archive)

            with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

                temp_dir = tempfile.mkdtemp()

                try:
                    # add timestamp
                    timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                    with open(timestamp_path, 'w') as f:
                        f.write(dump_time.isoformat(' '))
                    tar.add(timestamp_path,
                            arcname=os.path.join(archive_name, 'TIMESTAMP'))

                    # add schema version
                    schema_version_path = os.path.join(temp_dir,
                                                       'SCHEMA_SEQUENCE')
                    with open(schema_version_path, 'w') as f:
                        f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                    tar.add(schema_version_path,
                            arcname=os.path.join(archive_name,
                                                 'SCHEMA_SEQUENCE'))

                    # add copyright notice
                    tar.add(DUMP_LICENSE_FILE_PATH,
                            arcname=os.path.join(archive_name, 'COPYING'))

                except IOError as e:
                    log_ioerrors(self.log, e)
                    raise
                except Exception as e:
                    self.log.error('Exception while adding dump metadata: %s',
                                   str(e))
                    raise

                listens_path = os.path.join(temp_dir, 'listens')
                create_path(listens_path)

                # get listens from all measurements and write them to files in
                # a temporary dir before adding them to the archive
                for user in users:
                    username = user['musicbrainz_id']
                    offset = 0

                    user_listens_file = '{username}.listens'.format(
                        username=username)
                    user_listens_path = os.path.join(listens_path,
                                                     user_listens_file)

                    with open(user_listens_path, 'w') as f:
                        # Get this user's listens in chunks
                        while True:

                            # loop until we get this chunk of listens
                            while True:
                                try:
                                    result = self.influx.query("""
                                        SELECT *
                                          FROM {measurement}
                                         WHERE time <= {timestamp}
                                      ORDER BY time DESC
                                         LIMIT {limit}
                                        OFFSET {offset}
                                    """.format(
                                        measurement=
                                        get_escaped_measurement_name(username),
                                        timestamp=get_influx_query_timestamp(
                                            dump_time.strftime('%s')),
                                        limit=DUMP_CHUNK_SIZE,
                                        offset=offset,
                                    ))
                                    break
                                except Exception as e:
                                    self.log.error(
                                        'Error while getting listens for user %s',
                                        user['musicbrainz_id'])
                                    self.log.error(str(e))
                                    time.sleep(3)

                            rows = list(
                                result.get_points(
                                    get_measurement_name(username)))
                            if not rows:
                                break

                            for row in rows:
                                listen = Listen.from_influx(row).to_api()
                                try:
                                    f.write(ujson.dumps(listen))
                                    f.write('\n')
                                except IOError as e:
                                    log_ioerrors(self.log, e)
                                    raise
                                except Exception as e:
                                    self.log.error(
                                        'Exception while creating json for user: %s',
                                        user['musicbrainz_id'])
                                    self.log.error(str(e))
                                    raise

                            offset += DUMP_CHUNK_SIZE

                # add the listens directory to the archive
                self.log.info('Got all listens, adding them to the archive...')
                tar.add(listens_path,
                        arcname=os.path.join(archive_name, 'listens'))

                # remove the temporary directory
                shutil.rmtree(temp_dir)

            pxz.stdin.close()

        self.log.info('ListenBrainz listen dump done!')
        self.log.info('Dump present at %s!', archive_path)
        return archive_path
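
The per-user export above combines two loops: an outer loop that pages through listens with LIMIT/OFFSET, and an inner loop that retries a failed chunk after a short pause. Below is a minimal sketch of that pattern, with a generic query_chunk callable standing in for the InfluxDB query; both the helper and its arguments are illustrative only.

import time


def iter_chunks(query_chunk, chunk_size, retry_delay=3):
    """ Yield lists of rows by paging with limit/offset until a chunk comes back empty,
        retrying each chunk on failure, as dump_listens does for every user above.
    """
    offset = 0
    while True:
        # retry this chunk until the query succeeds
        while True:
            try:
                rows = query_chunk(limit=chunk_size, offset=offset)
                break
            except Exception:
                time.sleep(retry_delay)
        if not rows:
            return
        yield rows
        offset += chunk_size


# usage: iterate a fake data source in chunks of 3
data = list(range(8))
for chunk in iter_chunks(lambda limit, offset: data[offset:offset + limit], chunk_size=3):
    print(chunk)
# [0, 1, 2]
# [3, 4, 5]
# [6, 7]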