Exemplo n.º 1
0
def full_db(location, rotate=False):
    """Create complete dump of PostgreSQL database.

    Runs ``pg_dump`` (tar format) for the database named in the application's
    ``SQLALCHEMY_DATABASE_URI``, stores the output in *location* and compresses
    it with ``bzip2``. The resulting file is named
    ``cb-backup-%Y%m%d-%H%M%S.tar.bz2`` (timestamp taken with ``gmtime()``).

    Args:
        location: Directory that receives the dump; it is passed to
            ``create_path`` before anything is written.
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older backups.

    Raises:
        Exception: If ``pg_dump`` or ``bzip2`` exits with a non-zero status.
    """
    create_path(location)

    FILE_PREFIX = "cb-backup-"
    # Only the database name is used by this command; the remaining URI parts
    # are intentionally discarded.
    _, db_name, _, _ = explode_db_uri(current_app.config['SQLALCHEMY_DATABASE_URI'])

    print('Creating database dump in "%s"...' % location)

    # Executing pg_dump command
    # More info about it is available at http://www.postgresql.org/docs/9.3/static/app-pgdump.html
    dump_file = os.path.join(location, FILE_PREFIX + strftime("%Y%m%d-%H%M%S", gmtime()))
    tar_path = dump_file + ".tar"

    # Arguments are passed as a list (no shell), so quotes or shell
    # metacharacters in the path or database name cannot alter the command.
    with open(tar_path, "wb") as tar_out:
        exit_code = subprocess.call(['pg_dump', '-Ft', str(db_name)], stdout=tar_out)
    if exit_code != 0:
        raise Exception("Failed to create database dump!")

    # Compressing created dump (bzip2 replaces "<name>.tar" with "<name>.tar.bz2")
    if subprocess.call(['bzip2', tar_path]) != 0:
        raise Exception("Failed to compress database dump!")

    print('Created %s.tar.bz2' % dump_file)

    if rotate:
        print("Removing old backups (except two latest)...")
        remove_old_archives(location, "%s[0-9]+-[0-9]+.tar" % FILE_PREFIX,
                            is_dir=False, sort_key=os.path.getmtime)

    print("Done!")
Exemplo n.º 2
0
def create_base_archive(*, location, meta_files_dir=None):
    """Creates a dump of all license-independent information: (users, license).

    The archive ``cbdump.tar.bz2`` is written into *location*. It contains the
    sanitised user table, the license table, a ``COPYING`` file and the meta
    files.

    Args:
        location: Path of the directory where the archive needs to be created.
        meta_files_dir (optional): Path of the directory containing the meta files to be copied
            into the archive (TIMESTAMP and SCHEMA_VERSION). If not specified, the meta files are
            generated and added to the archive.
    Returns:
        A log-style string of the form ``" + <location>/cbdump.tar.bz2"``
        (note the leading ``" + "``; it is not a bare path).
    """
    with tarfile.open(os.path.join(location, "cbdump.tar.bz2"),
                      "w:bz2") as tar:
        temp_dir = tempfile.mkdtemp()
        # The scratch directory is removed even if dumping or archiving fails.
        try:
            base_archive_dir = os.path.join(temp_dir, 'cbdump')
            create_path(base_archive_dir)

            # Dumping tables
            base_archive_tables_dir = os.path.join(base_archive_dir, 'cbdump')
            create_path(base_archive_tables_dir)
            with db.engine.connect() as connection:
                with connection.begin() as transaction:
                    cursor = connection.connection.cursor()
                    try:
                        user_dump = os.path.join(base_archive_tables_dir,
                                                 'user_sanitised')
                        with open(user_dump, 'w') as f:
                            # Only non-private user columns are exported.
                            cursor.copy_to(f,
                                           '"user"',
                                           columns=('id', 'created',
                                                    'display_name',
                                                    'musicbrainz_id'))
                        license_dump = os.path.join(base_archive_tables_dir,
                                                    'license')
                        with open(license_dump, 'w') as f:
                            cursor.copy_to(f,
                                           'license',
                                           columns=_TABLES["license"])
                    except Exception:
                        # NOTE(review): the failure is only reported and rolled
                        # back; the archive is still produced from whatever was
                        # written so far - confirm this best-effort is intended.
                        print(
                            'Error while copying tables during creation of the base archive.'
                        )
                        transaction.rollback()
            tar.add(base_archive_tables_dir, arcname='cbdump')

            # Including additional information about this archive
            # Copying the most restrictive license there (CC BY-NC-SA 3.0)
            tar.add(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "licenses", "cc-by-nc-sa-30.txt"),
                    arcname='COPYING')
            # Copy meta files
            if not meta_files_dir:
                prepare_meta_files(temp_dir)
                meta_files_dir = temp_dir
            add_meta_files(tar, meta_files_dir)
        finally:
            shutil.rmtree(temp_dir)  # Cleanup
        return " + %s/cbdump.tar.bz2" % location
Exemplo n.º 3
0
def public(location, rotate=False):
    """Creates a set of archives with public data.

    1. Base archive with license-independent data (users, licenses).
    2. Archive with all reviews and revisions.
    3... Separate archives for each license (contain reviews and revisions associated with specific license).

    All archives are written into a new ``%Y%m%d-%H%M%S`` sub-directory of
    *location* and share one set of meta files prepared up front.

    Args:
        location: Directory under which the timestamped dump directory is created.
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older dump directories.
    """
    print("Creating public database dump...")
    time_now = datetime.today()

    # Creating a directory where all dumps will go
    dump_dir = os.path.join(location, time_now.strftime('%Y%m%d-%H%M%S'))
    create_path(dump_dir)

    meta_files_dir = tempfile.mkdtemp()
    # The temporary meta-file directory is removed even when a dump step fails.
    try:
        # Prepare meta files
        prepare_meta_files(meta_files_dir, time_now=time_now)

        with db.engine.begin() as connection:
            # BASE ARCHIVE
            # Contains all license independent data (licenses, users)
            base_archive_path = create_base_archive(
                connection,
                location=dump_dir,
                meta_files_dir=meta_files_dir,
            )
            print(base_archive_path)

            # 1. COMBINED
            # Archiving all reviews (any license)
            review_dump_path = create_reviews_archive(
                connection,
                location=dump_dir,
                meta_files_dir=meta_files_dir,
            )
            print(review_dump_path)

            # 2. SEPARATE
            # Creating separate archives for each license
            for license_row in db_license.get_licenses_list(connection):
                review_dump_path = create_reviews_archive(
                    connection,
                    location=dump_dir,
                    meta_files_dir=meta_files_dir,
                    license_id=license_row['id'],
                )
                print(review_dump_path)
    finally:
        shutil.rmtree(meta_files_dir)  # Cleanup

    if rotate:
        print("Removing old dumps (except two latest)...")
        remove_old_archives(location, "[0-9]+-[0-9]+", is_dir=True)

    print("Done!")
Exemplo n.º 4
0
def full_db(location, rotate=False):
    """Create complete dump of PostgreSQL database.

    Runs ``pg_dump`` (tar format) against the host, port, user and database
    taken from the application's ``SQLALCHEMY_DATABASE_URI``, stores the output
    in *location* and compresses it with ``bzip2``. The resulting file is named
    ``cb-backup-%Y%m%d-%H%M%S.tar.bz2`` (timestamp taken with ``gmtime()``).

    Args:
        location: Directory that receives the dump; it is passed to
            ``create_path`` before anything is written.
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older backups.

    Raises:
        Exception: If ``pg_dump`` or ``bzip2`` exits with a non-zero status.
    """
    create_path(location)

    FILE_PREFIX = "cb-backup-"
    # The last URI component is not passed to pg_dump and is discarded here.
    db_hostname, db_port, db_name, db_username, _ = \
        explode_db_uri(current_app.config['SQLALCHEMY_DATABASE_URI'])

    print('Creating database dump in "%s"...' % location)

    # Executing pg_dump command
    # More info about it is available at http://www.postgresql.org/docs/9.3/static/app-pgdump.html
    dump_file = os.path.join(location,
                             FILE_PREFIX + strftime("%Y%m%d-%H%M%S", gmtime()))
    tar_path = dump_file + ".tar"

    # Shell-style rendering of the command, logged for the operator only.
    print(
        'pg_dump -h "%s" -p "%s" -U "%s" -d "%s" -Ft > "%s.tar"' %
        (db_hostname, db_port, db_username, db_name, dump_file), )

    # The command itself is run without a shell: arguments go in as a list and
    # the redirect is done by handing pg_dump an open file, so quotes or shell
    # metacharacters in any value cannot alter what is executed.
    pg_dump_cmd = [
        'pg_dump',
        '-h', str(db_hostname),
        '-p', str(db_port),
        '-U', str(db_username),
        '-d', str(db_name),
        '-Ft',
    ]
    with open(tar_path, "wb") as tar_out:
        result = subprocess.call(pg_dump_cmd, stdout=tar_out)
    if result != 0:
        raise Exception("Failed to create database dump!")

    # Compressing created dump (bzip2 replaces "<name>.tar" with "<name>.tar.bz2")
    result = subprocess.call(['bzip2', tar_path])
    if result != 0:
        raise Exception("Failed to compress database dump!")

    print('Created %s.tar.bz2' % dump_file)

    if rotate:
        print("Removing old backups (except two latest)...")
        remove_old_archives(location,
                            "%s[0-9]+-[0-9]+.tar" % FILE_PREFIX,
                            is_dir=False,
                            sort_key=os.path.getmtime)

    print("Done!")
Exemplo n.º 5
0
def json(location=os.path.join(os.getcwd(), 'export', 'json'), rotate=False):
    """Create JSON dumps with all reviews.

    One ``critiquebrainz-<YYYYMMDD>-<license>-json.tar.bz2`` archive is created
    in *location* for every license returned by ``License.query.all()``. Inside
    an archive, reviews are grouped per entity under
    ``reviews/<1st char>/<first 2 chars>/<entity>.json`` and the license text is
    stored as ``COPYING``.

    Args:
        location: Directory that receives the archives (passed to
            ``create_path`` first).
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older archives.
    """
    create_path(location)

    current_app.json_encoder = DumpJSONEncoder

    print("Creating new archives...")
    for license in License.query.all():
        safe_name = slugify(license.id)
        # Computed once so the file created and the name logged always agree,
        # even if the date changes while the archive is being built.
        archive_name = "critiquebrainz-%s-%s-json.tar.bz2" % (
            datetime.today().strftime('%Y%m%d'), safe_name)
        with tarfile.open(os.path.join(location, archive_name),
                          "w:bz2") as tar:
            temp_dir = tempfile.mkdtemp()
            # The scratch directory is removed even if dumping fails midway.
            try:
                license_dir = os.path.join(temp_dir, safe_name)
                create_path(license_dir)

                # Finding release groups that have reviews with current license
                query = db.session.query(Review.entity_id).group_by(
                    Review.entity_id)
                for row in query.all():
                    entity = row[0]
                    # Creating directory structure and dumping reviews
                    dir_part = os.path.join(entity[0:1], entity[0:2])
                    reviews = Review.list(entity_id=entity,
                                          license_id=license.id)[0]
                    if reviews:
                        rg_dir = '%s/%s' % (license_dir, dir_part)
                        create_path(rg_dir)
                        # NOTE(review): `.data` of a Flask response is bytes on
                        # Python 3, which a text-mode file rejects - confirm
                        # the Python version this command targets.
                        with open('%s/%s.json' % (rg_dir, entity), 'w+') as f:
                            f.write(
                                jsonify(reviews=[r.to_dict()
                                                 for r in reviews]).data)

                tar.add(license_dir, arcname='reviews')

                # Copying legal text
                tar.add(os.path.join("critiquebrainz", "data", "licenses",
                                     safe_name + ".txt"),
                        arcname='COPYING')

                print(" + %s/%s" % (location, archive_name))
            finally:
                shutil.rmtree(temp_dir)  # Cleanup

    if rotate:
        print("Removing old sets of archives (except two latest)...")
        remove_old_archives(location,
                            r"critiquebrainz-[0-9]+-[-\w]+-json.tar.bz2",
                            is_dir=False,
                            sort_key=os.path.getmtime)

    print("Done!")
Exemplo n.º 6
0
def json(location, rotate=False):
    """Create JSON dumps with all reviews.

    One ``critiquebrainz-<YYYYMMDD>-<license>-json.tar.bz2`` archive is created
    in *location* for every license returned by
    ``db_license.get_licenses_list``. Inside an archive, reviews are grouped per
    entity under ``reviews/<1st char>/<first 2 chars>/<entity>.json`` and the
    license text is stored as ``COPYING``. All queries share one connection
    obtained from ``db.engine.begin()``.

    Args:
        location: Directory that receives the archives (passed to
            ``create_path`` first).
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older archives.
    """
    create_path(location)

    current_app.json_encoder = DumpJSONEncoder

    print("Creating new archives...")
    with db.engine.begin() as connection:
        for license in db_license.get_licenses_list(connection):
            safe_name = slugify(license["id"])
            # Computed once so the file created and the name logged always
            # agree, even if the date changes while the archive is being built.
            archive_name = "critiquebrainz-%s-%s-json.tar.bz2" % (
                datetime.today().strftime('%Y%m%d'), safe_name)
            with tarfile.open(os.path.join(location, archive_name), "w:bz2") as tar:
                temp_dir = tempfile.mkdtemp()
                # The scratch directory is removed even if dumping fails midway.
                try:
                    license_dir = os.path.join(temp_dir, safe_name)
                    create_path(license_dir)

                    # Finding entities that have reviews with current license
                    entities = db_review.get_distinct_entities(connection)
                    for entity in entities:
                        entity = str(entity)
                        # Creating directory structure and dumping reviews
                        dir_part = os.path.join(entity[0:1], entity[0:2])
                        reviews = db_review.get_reviews_list(
                            connection,
                            entity_id=entity,
                            license_id=license["id"],
                            limit=None,
                        )[0]
                        if reviews:
                            rg_dir = '%s/%s' % (license_dir, dir_part)
                            create_path(rg_dir)
                            payload = jsonify(
                                reviews=[db_review.to_dict(r, connection=connection) for r in reviews]
                            ).data.decode("utf-8")
                            with open('%s/%s.json' % (rg_dir, entity), 'w+') as f:
                                f.write(payload)

                    tar.add(license_dir, arcname='reviews')

                    # Copying legal text
                    tar.add(os.path.join(os.path.dirname(os.path.realpath(__file__)), "licenses", safe_name + ".txt"),
                            arcname='COPYING')

                    print(" + %s/%s" % (location, archive_name))
                finally:
                    shutil.rmtree(temp_dir)  # Cleanup

        if rotate:
            print("Removing old sets of archives (except two latest)...")
            remove_old_archives(location, r"critiquebrainz-[0-9]+-[-\w]+-json.tar.bz2",
                                is_dir=False, sort_key=os.path.getmtime)

        print("Done!")
Exemplo n.º 7
0
def json(location, rotate=False):
    """Create JSON dumps with all reviews.

    One ``critiquebrainz-<YYYYMMDD>-<license>-json.tar.bz2`` archive is created
    in *location* for every license returned by ``model.License.query.all()``.
    Inside an archive, reviews are grouped per entity under
    ``reviews/<1st char>/<first 2 chars>/<entity>.json`` and the license text is
    stored as ``COPYING``.

    Args:
        location: Directory that receives the archives (passed to
            ``create_path`` first).
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older archives.
    """
    create_path(location)

    current_app.json_encoder = DumpJSONEncoder

    print("Creating new archives...")
    for license in model.License.query.all():
        safe_name = slugify(license.id)
        # Computed once so the file created and the name logged always agree,
        # even if the date changes while the archive is being built.
        archive_name = "critiquebrainz-%s-%s-json.tar.bz2" % (
            datetime.today().strftime('%Y%m%d'), safe_name)
        with tarfile.open(os.path.join(location, archive_name), "w:bz2") as tar:
            temp_dir = tempfile.mkdtemp()
            # The scratch directory is removed even if dumping fails midway.
            try:
                license_dir = os.path.join(temp_dir, safe_name)
                create_path(license_dir)

                # Finding release groups that have reviews with current license
                query = db.session.query(model.Review.entity_id).group_by(model.Review.entity_id)
                for row in query.all():
                    entity = row[0]
                    # Creating directory structure and dumping reviews
                    dir_part = os.path.join(entity[0:1], entity[0:2])
                    reviews = model.Review.list(entity_id=entity, license_id=license.id)[0]
                    if reviews:
                        rg_dir = '%s/%s' % (license_dir, dir_part)
                        create_path(rg_dir)
                        # NOTE(review): `.data` of a Flask response is bytes on
                        # Python 3, which a text-mode file rejects - confirm
                        # the Python version this command targets.
                        with open('%s/%s.json' % (rg_dir, entity), 'w+') as f:
                            f.write(jsonify(reviews=[r.to_dict() for r in reviews]).data)

                tar.add(license_dir, arcname='reviews')

                # Copying legal text
                tar.add(os.path.join("critiquebrainz", "data", "licenses", safe_name + ".txt"), arcname='COPYING')

                print(" + %s/%s" % (location, archive_name))
            finally:
                shutil.rmtree(temp_dir)  # Cleanup

    if rotate:
        print("Removing old sets of archives (except two latest)...")
        remove_old_archives(location, r"critiquebrainz-[0-9]+-[-\w]+-json.tar.bz2",
                            is_dir=False, sort_key=os.path.getmtime)

    print("Done!")
Exemplo n.º 8
0
def public(location, rotate=False):
    """Creates a set of archives with public data.

    1. Base archive with license-independent data (users, licenses).
    2. Archive with all reviews and revisions.
    3... Separate archives for each license (contain reviews and revisions associated with specific license).

    All archives are written into a new ``%Y%m%d-%H%M%S`` sub-directory of
    *location*. Every archive contains the dumped tables under ``cbdump``, a
    ``COPYING`` file and the ``TIMESTAMP`` / ``SCHEMA_SEQUENCE`` meta files.
    Review and revision dumps only include rows whose review is neither hidden
    nor a draft.

    Args:
        location: Directory under which the timestamped dump directory is created.
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older dump directories.
    """
    print("Creating public database dump...")
    time_now = datetime.today()

    cursor = db.session.connection().connection.cursor()

    # Creating a directory where all dumps will go
    dump_dir = os.path.join(location, time_now.strftime('%Y%m%d-%H%M%S'))
    create_path(dump_dir)

    # License texts are looked up relative to the current working directory.
    licenses_dir = os.path.join('critiquebrainz', 'data', 'licenses')
    most_restrictive_license = os.path.join(licenses_dir, 'cc-by-nc-sa-30.txt')

    temp_dir = tempfile.mkdtemp()
    # The scratch directory is removed even if one of the dump steps fails.
    try:
        # Preparing meta files
        timestamp_file = os.path.join(temp_dir, 'TIMESTAMP')
        schema_file = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
        with open(timestamp_file, 'w') as f:
            f.write(time_now.isoformat(' '))
        with open(schema_file, 'w') as f:
            f.write(str(model.__version__))

        # BASE ARCHIVE
        # Archiving stuff that is independent from licenses (users, licenses)
        with tarfile.open(os.path.join(dump_dir, "cbdump.tar.bz2"), "w:bz2") as tar:
            base_archive_dir = os.path.join(temp_dir, 'cbdump')
            create_path(base_archive_dir)

            # Dumping tables
            base_archive_tables_dir = os.path.join(base_archive_dir, 'cbdump')
            create_path(base_archive_tables_dir)
            with open(os.path.join(base_archive_tables_dir, 'user_sanitised'), 'w') as f:
                # Only non-private user columns are exported.
                cursor.copy_to(f, '"user"', columns=('id', 'created', 'display_name', 'musicbrainz_id'))
            with open(os.path.join(base_archive_tables_dir, 'license'), 'w') as f:
                cursor.copy_to(f, 'license', columns=get_columns(model.License))
            tar.add(base_archive_tables_dir, arcname='cbdump')

            # Including additional information about this archive
            # Copying the most restrictive license there (CC BY-NC-SA 3.0)
            tar.add(most_restrictive_license, arcname='COPYING')
            tar.add(timestamp_file, arcname='TIMESTAMP')
            tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump.tar.bz2" % dump_dir)

        # REVIEWS
        # Archiving review tables (review, revision)

        # 1. COMBINED
        # Archiving all reviews (any license)
        REVISION_COMBINED_SQL = "SELECT %s FROM revision JOIN review " \
                                "ON review.id = revision.review_id " \
                                "WHERE review.is_hidden = false AND review.is_draft = false" \
                                % ', '.join(['revision.' + col for col in get_columns(model.Revision)])
        with tarfile.open(os.path.join(dump_dir, "cbdump-reviews-all.tar.bz2"), "w:bz2") as tar:
            # Dumping tables
            reviews_combined_tables_dir = os.path.join(temp_dir, 'cbdump-reviews-all')
            create_path(reviews_combined_tables_dir)
            with open(os.path.join(reviews_combined_tables_dir, 'review'), 'w') as f:
                # A parenthesised query is passed in place of a table name so
                # that only the filtered rows are copied.
                cursor.copy_to(f, "(SELECT %s FROM review WHERE is_hidden = false AND is_draft = false)" %
                               (', '.join(get_columns(model.Review))))
            with open(os.path.join(reviews_combined_tables_dir, 'revision'), 'w') as f:
                cursor.copy_to(f, "(%s)" % REVISION_COMBINED_SQL)
            tar.add(reviews_combined_tables_dir, arcname='cbdump')

            # Including additional information about this archive
            # Copying the most restrictive license there (CC BY-NC-SA 3.0)
            tar.add(most_restrictive_license, arcname='COPYING')
            tar.add(timestamp_file, arcname='TIMESTAMP')
            tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump-reviews-all.tar.bz2" % dump_dir)

        # 2. SEPARATE
        # Creating separate archives for each license
        REVISION_SEPARATE_SQL = REVISION_COMBINED_SQL + " AND review.license_id ='%s'"
        for license in model.License.query.all():
            safe_name = slugify(license.id)
            with tarfile.open(os.path.join(dump_dir, "cbdump-reviews-%s.tar.bz2" % safe_name), "w:bz2") as tar:
                # Dumping tables
                tables_dir = os.path.join(temp_dir, safe_name)
                create_path(tables_dir)
                with open(os.path.join(tables_dir, 'review'), 'w') as f:
                    cursor.copy_to(f, "(SELECT %s FROM review WHERE is_hidden = false AND is_draft = false "
                                      "AND license_id = '%s')" % (', '.join(get_columns(model.Review)), license.id))
                with open(os.path.join(tables_dir, 'revision'), 'w') as f:
                    cursor.copy_to(f, "(%s)" % (REVISION_SEPARATE_SQL % license.id))
                tar.add(tables_dir, arcname='cbdump')

                # Including additional information about this archive
                tar.add(os.path.join(licenses_dir, safe_name + ".txt"), arcname='COPYING')
                tar.add(timestamp_file, arcname='TIMESTAMP')
                tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump-reviews-%s.tar.bz2" % (dump_dir, safe_name))
    finally:
        shutil.rmtree(temp_dir)  # Cleanup

    if rotate:
        print("Removing old dumps (except two latest)...")
        remove_old_archives(location, "[0-9]+-[0-9]+", is_dir=True)

    print("Done!")
Exemplo n.º 9
0
def public(location, rotate=False):
    """Creates a set of archives with public data.

    1. Base archive with license-independent data (users, licenses).
    2. Archive with all reviews and revisions.
    3... Separate archives for each license (contain reviews and revisions associated with specific license).

    All archives are written into a new ``%Y%m%d-%H%M%S`` sub-directory of
    *location*. Every archive contains the dumped tables under ``cbdump``, a
    ``COPYING`` file and the ``TIMESTAMP`` / ``SCHEMA_SEQUENCE`` meta files.
    Review and revision dumps only include rows whose review is neither hidden
    nor a draft.

    Args:
        location: Directory under which the timestamped dump directory is created.
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older dump directories.
    """
    print("Creating public database dump...")
    time_now = datetime.today()

    connection = db.engine.raw_connection()
    temp_dir = None
    # Both the raw connection and the scratch directory are released even if
    # one of the dump steps fails.
    try:
        cursor = connection.cursor()

        # Creating a directory where all dumps will go
        dump_dir = os.path.join(location, time_now.strftime('%Y%m%d-%H%M%S'))
        create_path(dump_dir)

        # License texts are looked up relative to the current working directory.
        licenses_dir = os.path.join('critiquebrainz', 'data', 'licenses')
        most_restrictive_license = os.path.join(licenses_dir,
                                                'cc-by-nc-sa-30.txt')

        temp_dir = tempfile.mkdtemp()

        # Preparing meta files
        timestamp_file = os.path.join(temp_dir, 'TIMESTAMP')
        schema_file = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
        with open(timestamp_file, 'w') as f:
            f.write(time_now.isoformat(' '))
        with open(schema_file, 'w') as f:
            f.write(str(db.SCHEMA_VERSION))

        # BASE ARCHIVE
        # Archiving stuff that is independent from licenses (users, licenses)
        with tarfile.open(os.path.join(dump_dir, "cbdump.tar.bz2"),
                          "w:bz2") as tar:
            base_archive_dir = os.path.join(temp_dir, 'cbdump')
            create_path(base_archive_dir)

            # Dumping tables
            base_archive_tables_dir = os.path.join(base_archive_dir, 'cbdump')
            create_path(base_archive_tables_dir)
            with open(os.path.join(base_archive_tables_dir, 'user_sanitised'),
                      'w') as f:
                # Only non-private user columns are exported.
                cursor.copy_to(f,
                               '"user"',
                               columns=('id', 'created', 'display_name',
                                        'musicbrainz_id'))
            with open(os.path.join(base_archive_tables_dir, 'license'),
                      'w') as f:
                cursor.copy_to(f, 'license', columns=_TABLES["license"])
            tar.add(base_archive_tables_dir, arcname='cbdump')

            # Including additional information about this archive
            # Copying the most restrictive license there (CC BY-NC-SA 3.0)
            tar.add(most_restrictive_license, arcname='COPYING')
            tar.add(timestamp_file, arcname='TIMESTAMP')
            tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump.tar.bz2" % dump_dir)

        # REVIEWS
        # Archiving review tables (review, revision)

        # 1. COMBINED
        # Archiving all reviews (any license)
        REVISION_COMBINED_SQL = """
            SELECT {columns} FROM revision JOIN review
                ON review.id = revision.review_id
             WHERE review.is_hidden = false AND review.is_draft = false
        """.format(columns=', '.join(
            ['revision.' + col for col in _TABLES["revision"]]))
        with tarfile.open(os.path.join(dump_dir, "cbdump-reviews-all.tar.bz2"),
                          "w:bz2") as tar:
            # Dumping tables
            reviews_combined_tables_dir = os.path.join(temp_dir,
                                                       'cbdump-reviews-all')
            create_path(reviews_combined_tables_dir)
            with open(os.path.join(reviews_combined_tables_dir, 'review'),
                      'w') as f:
                # A parenthesised query is passed in place of a table name so
                # that only the filtered rows are copied.
                cursor.copy_to(
                    f,
                    "(SELECT {columns} FROM review WHERE is_hidden = false AND is_draft = false)"
                    .format(columns=', '.join(_TABLES["review"])))
            with open(os.path.join(reviews_combined_tables_dir, 'revision'),
                      'w') as f:
                cursor.copy_to(f, "({sql})".format(sql=REVISION_COMBINED_SQL))
            tar.add(reviews_combined_tables_dir, arcname='cbdump')

            # Including additional information about this archive
            # Copying the most restrictive license there (CC BY-NC-SA 3.0)
            tar.add(most_restrictive_license, arcname='COPYING')
            tar.add(timestamp_file, arcname='TIMESTAMP')
            tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump-reviews-all.tar.bz2" % dump_dir)

        # 2. SEPARATE
        # Creating separate archives for each license
        for license in db_license.list_licenses():
            safe_name = slugify(license["id"])
            with tarfile.open(
                    os.path.join(dump_dir,
                                 "cbdump-reviews-%s.tar.bz2" % safe_name),
                    "w:bz2") as tar:
                # Dumping tables
                tables_dir = os.path.join(temp_dir, safe_name)
                create_path(tables_dir)
                with open(os.path.join(tables_dir, 'review'), 'w') as f:
                    cursor.copy_to(
                        f, """(
                        SELECT {columns}
                          FROM review
                         WHERE is_hidden = false
                           AND is_draft = false
                           AND license_id = '{license_id}'
                    )""".format(columns=', '.join(_TABLES["review"]),
                                license_id=license["id"]))
                with open(os.path.join(tables_dir, 'revision'), 'w') as f:
                    cursor.copy_to(
                        f,
                        """({REVISION_COMBINED_SQL} AND review.license_id='{license_id}')"""
                        .format(REVISION_COMBINED_SQL=REVISION_COMBINED_SQL,
                                license_id=license["id"]))
                tar.add(tables_dir, arcname='cbdump')

                # Including additional information about this archive
                tar.add(os.path.join(licenses_dir, safe_name + ".txt"),
                        arcname='COPYING')
                tar.add(timestamp_file, arcname='TIMESTAMP')
                tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump-reviews-%s.tar.bz2" % (dump_dir, safe_name))
    finally:
        try:
            if temp_dir is not None:
                shutil.rmtree(temp_dir)  # Cleanup
        finally:
            connection.close()

    if rotate:
        print("Removing old dumps (except two latest)...")
        remove_old_archives(location, "[0-9]+-[0-9]+", is_dir=True)

    print("Done!")
Exemplo n.º 10
0
def public(
        location=os.path.join(os.getcwd(), 'export', 'public'), rotate=False):
    """Creates a set of archives with public data.

    1. Base archive with license-independent data (users, licenses).
    2. Archive with all reviews and revisions.
    3... Separate archives for each license (contain reviews and revisions associated with specific license).

    All archives are written into a new ``%Y%m%d-%H%M%S`` sub-directory of
    *location*. Every archive contains the dumped tables under ``cbdump``, a
    ``COPYING`` file and the ``TIMESTAMP`` / ``SCHEMA_SEQUENCE`` meta files.

    Args:
        location: Directory under which the timestamped dump directory is
            created. The default is computed once, when the function is defined.
        rotate: If True, ``remove_old_archives`` is called on *location*
            afterwards to prune older dump directories.
    """
    print("Creating public database dump...")
    time_now = datetime.today()

    cursor = db.session.connection().connection.cursor()

    # Creating a directory where all dumps will go
    dump_dir = os.path.join(location, time_now.strftime('%Y%m%d-%H%M%S'))
    create_path(dump_dir)

    # License texts are looked up relative to the current working directory.
    licenses_dir = os.path.join('critiquebrainz', 'data', 'licenses')
    most_restrictive_license = os.path.join(licenses_dir, 'cc-by-nc-sa-30.txt')

    temp_dir = tempfile.mkdtemp()
    # The scratch directory is removed even if one of the dump steps fails.
    try:
        # Preparing meta files
        timestamp_file = os.path.join(temp_dir, 'TIMESTAMP')
        schema_file = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
        with open(timestamp_file, 'w') as f:
            f.write(time_now.isoformat(' '))
        with open(schema_file, 'w') as f:
            f.write(str(model.__version__))

        # BASE ARCHIVE
        # Archiving stuff that is independent from licenses (users, licenses)
        with tarfile.open(os.path.join(dump_dir, "cbdump.tar.bz2"),
                          "w:bz2") as tar:
            base_archive_dir = os.path.join(temp_dir, 'cbdump')
            create_path(base_archive_dir)

            # Dumping tables
            base_archive_tables_dir = os.path.join(base_archive_dir, 'cbdump')
            create_path(base_archive_tables_dir)
            with open(os.path.join(base_archive_tables_dir, 'user_sanitised'),
                      'w') as f:
                # Only non-private user columns are exported.
                cursor.copy_to(f,
                               '"user"',
                               columns=('id', 'created', 'display_name',
                                        'musicbrainz_id'))
            with open(os.path.join(base_archive_tables_dir, 'license'),
                      'w') as f:
                cursor.copy_to(f,
                               'license',
                               columns=get_columns(model.License))
            tar.add(base_archive_tables_dir, arcname='cbdump')

            # Including additional information about this archive
            # Copying the most restrictive license there (CC BY-NC-SA 3.0)
            tar.add(most_restrictive_license, arcname='COPYING')
            tar.add(timestamp_file, arcname='TIMESTAMP')
            tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump.tar.bz2" % dump_dir)

        # REVIEWS
        # Archiving review tables (review, revision)
        # NOTE(review): no hidden/draft filtering is applied to the review and
        # revision dumps below - confirm that every row is meant to be public.

        # 1. COMBINED
        # Archiving all reviews (any license)
        with tarfile.open(os.path.join(dump_dir, "cbdump-reviews-all.tar.bz2"),
                          "w:bz2") as tar:
            # Dumping tables
            reviews_combined_tables_dir = os.path.join(temp_dir,
                                                       'cbdump-reviews-all')
            create_path(reviews_combined_tables_dir)
            with open(os.path.join(reviews_combined_tables_dir, 'review'),
                      'w') as f:
                cursor.copy_to(f, 'review', columns=get_columns(model.Review))
            with open(os.path.join(reviews_combined_tables_dir, 'revision'),
                      'w') as f:
                cursor.copy_to(f,
                               'revision',
                               columns=get_columns(model.Revision))
            tar.add(reviews_combined_tables_dir, arcname='cbdump')

            # Including additional information about this archive
            # Copying the most restrictive license there (CC BY-NC-SA 3.0)
            tar.add(most_restrictive_license, arcname='COPYING')
            tar.add(timestamp_file, arcname='TIMESTAMP')
            tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump-reviews-all.tar.bz2" % dump_dir)

        # 2. SEPARATE
        # Creating separate archives for each license
        for license in License.query.all():
            safe_name = slugify(license.id)
            with tarfile.open(
                    os.path.join(dump_dir,
                                 "cbdump-reviews-%s.tar.bz2" % safe_name),
                    "w:bz2") as tar:
                # Dumping tables
                tables_dir = os.path.join(temp_dir, safe_name)
                create_path(tables_dir)
                # The column lists are NOT wrapped in parentheses: in
                # PostgreSQL "SELECT (a, b)" is a row constructor yielding one
                # composite column, which would give these files a different
                # layout from the per-column dumps in the combined archive.
                with open(os.path.join(tables_dir, 'review'), 'w') as f:
                    cursor.copy_to(
                        f, "(SELECT %s FROM review WHERE license_id = '%s')" %
                        (', '.join(get_columns(model.Review)), license.id))
                with open(os.path.join(tables_dir, 'revision'), 'w') as f:
                    cursor.copy_to(
                        f,
                        "(SELECT revision.%s FROM revision JOIN review ON revision.review_id = review.id WHERE review.license_id = '%s')"
                        % (', revision.'.join(get_columns(
                            model.Revision)), license.id))
                tar.add(tables_dir, arcname='cbdump')

                # Including additional information about this archive
                tar.add(os.path.join(licenses_dir, safe_name + ".txt"),
                        arcname='COPYING')
                tar.add(timestamp_file, arcname='TIMESTAMP')
                tar.add(schema_file, arcname='SCHEMA_SEQUENCE')

            print(" + %s/cbdump-reviews-%s.tar.bz2" % (dump_dir, safe_name))
    finally:
        shutil.rmtree(temp_dir)  # Cleanup

    if rotate:
        print("Removing old dumps (except two latest)...")
        remove_old_archives(location, "[0-9]+-[0-9]+", is_dir=True)

    print("Done!")
Exemplo n.º 11
0
def create_reviews_archive(connection, *, location, meta_files_dir=None, license_id=None):
    """Create a bzip2-compressed tar archive of reviews, their revisions and average ratings.

    Only reviews that are neither hidden nor drafts are exported. When
    ``license_id`` is given, reviews (and the revisions joined to them) are
    additionally restricted to that license; the ``avg_rating`` table is
    always dumped unfiltered.

    Args:
        connection (sqlalchemy.engine.Connection): an sqlalchemy connection to the
            database; its underlying DB-API connection is used to run the COPY queries.
        location: Path of the directory where the archive needs to be created.
        meta_files_dir (optional): Path of the directory containing the meta files to be
            copied into the archive. If not specified, the meta files are generated
            (via ``prepare_meta_files``) in a temporary directory and added from there.
        license_id (optional): The ID of the license whose reviews (and related
            information) is to be added to the dump. All reviews are copied
            (irrespective of their license) if license_id is None (or otherwise falsy).

    Returns:
        A display string of the form ``" + <location>/<archive_name>"``. Note the
        leading ``" + "`` marker: this is a message suitable for printing, not a
        bare filesystem path.

    Raises:
        Exception: any error raised while copying the tables is reported on stdout
            and then re-raised unchanged.
    """
    if license_id:
        # NOTE(review): license_id is interpolated directly into the SQL text.
        # This is only safe while the value comes from a trusted source (e.g. the
        # license table itself) -- confirm against callers.
        license_where_clause = "AND license_id = '{}'".format(license_id)
        safe_name = slugify(license_id)
        archive_name = "cbdump-reviews-{}.tar.bz2".format(safe_name)
        license_file_name = safe_name + ".txt"
    else:
        license_where_clause = ''
        archive_name = "cbdump-reviews-all.tar.bz2"
        safe_name = 'cb-reviews-all'
        license_file_name = "cc-by-nc-sa-30.txt"

    review_sql = """(
        SELECT {columns}
          FROM review
         WHERE is_hidden = false
           AND is_draft = false
               {license_where_clause}
    )""".format(columns=', '.join(_TABLES["review"]), license_where_clause=license_where_clause)

    revision_sql = """(
        SELECT {columns}
          FROM revision
          JOIN review
            ON review.id = revision.review_id
         WHERE review.is_hidden = false
           AND review.is_draft = false
               {license_where_clause}
    )""".format(
        columns=', '.join(['revision.' + column for column in _TABLES['revision']]),
        license_where_clause=license_where_clause,
    )

    licenses_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "licenses")

    with tarfile.open(os.path.join(location, archive_name), "w:bz2") as tar:
        temp_dir = tempfile.mkdtemp()
        try:
            # Dumping tables
            reviews_tables_dir = os.path.join(temp_dir, safe_name)
            create_path(reviews_tables_dir)

            cursor = connection.connection.cursor()
            try:
                avg_rating_sql = "(SELECT {columns} FROM avg_rating)".format(
                    columns=", ".join(_TABLES["avg_rating"]))
                # One output file per table, named after the table it holds.
                table_queries = (
                    ('review', review_sql),
                    ('revision', revision_sql),
                    ('avg_rating', avg_rating_sql),
                )
                for table_name, query in table_queries:
                    with open(os.path.join(reviews_tables_dir, table_name), 'w') as f:
                        cursor.copy_to(f, query)
            except Exception as e:
                print("Error {} occurred while copying tables during the creation of the reviews archive!".format(e))
                raise
            finally:
                # Release the DB-API cursor whether or not the copy succeeded.
                cursor.close()
            tar.add(reviews_tables_dir, arcname='cbdump')

            # Including the text of the license that applies to this archive
            tar.add(os.path.join(licenses_dir, license_file_name), arcname='COPYING')

            if not meta_files_dir:
                prepare_meta_files(temp_dir)
                meta_files_dir = temp_dir
            add_meta_files(tar, meta_files_dir)
        finally:
            # mkdtemp() leaves deletion to the caller; clean up even when the
            # dump fails so aborted runs do not leave temporary directories behind.
            shutil.rmtree(temp_dir)

    return " + {location}/{archive_name}".format(location=location, archive_name=archive_name)