Example #1
File: __init__.py  Project: disqus/pgshovel
def drop_set(cluster, name, allow_forced_removal=False):
    # TODO: add dry run support

    (name, (configuration, stat)) = fetch_sets(cluster, (name,))[0]

    deletions = get_managed_databases(
        cluster,
        (configuration.database.dsn,),
        configure=False,
        skip_inaccessible=allow_forced_removal,
    )

    ztransaction = check_version(cluster)

    transactions = []

    # TODO: add help to inform the user of the possibility of retry
    for connection in deletions.values():
        transaction = Transaction(connection, 'drop-set:%s' % (name,))
        transactions.append(transaction)
        with connection.cursor() as cursor:
            unconfigure_set(cluster, cursor, name, configuration)

    ztransaction.delete(
        cluster.get_set_path(name),
        version=stat.version,
    )

    with managed(transactions):
        commit(ztransaction)
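
A minimal usage sketch for drop_set, assuming an already-constructed pgshovel cluster handle (its construction does not appear in these snippets); the set name is a placeholder:

# Hypothetical usage: drop the replication set 'users', skipping any of its
# databases that are no longer reachable rather than failing outright.
drop_set(cluster, 'users', allow_forced_removal=True)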
Example #2
File: __init__.py  Project: disqus/pgshovel
def initialize_cluster(cluster):
    """
    Initialize a pgshovel cluster in ZooKeeper.
    """
    logger.info('Creating a new cluster for %s...', cluster)

    configuration = ClusterConfiguration(version=__version__)
    ztransaction = cluster.zookeeper.transaction()
    ztransaction.create(cluster.path, BinaryCodec(ClusterConfiguration).encode(configuration))
    ztransaction.create(cluster.get_set_path())
    commit(ztransaction)
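
The two create calls above are submitted as a single ZooKeeper multi-operation. Here is a sketch of that primitive using kazoo directly, which the cluster.zookeeper handle appears to wrap; the host, paths, and payload are placeholders:

from kazoo.client import KazooClient

zookeeper = KazooClient(hosts='127.0.0.1:2181')
zookeeper.start()

transaction = zookeeper.transaction()
transaction.create('/example-cluster', b'<encoded ClusterConfiguration>')
transaction.create('/example-cluster/sets')
# Either both znodes are created or neither is, so other clients can never
# observe a half-initialized cluster.
results = transaction.commit()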
Example #3
File: __init__.py  Project: disqus/pgshovel
def upgrade_cluster(cluster, force=False):
    zookeeper = cluster.zookeeper

    codec = BinaryCodec(ClusterConfiguration)
    data, stat = zookeeper.get(cluster.path)
    configuration = codec.decode(data)

    # if the configuration is newer or equal, require manual intervention
    assert parse_version(__version__) > parse_version(configuration.version) or force, 'cannot downgrade %s to %s' % (configuration.version, __version__)

    logger.info('Upgrading cluster from %s to %s...', configuration.version, __version__)
    configuration.version = __version__

    ztransaction = zookeeper.transaction()
    ztransaction.set_data(cluster.path, codec.encode(configuration), version=stat.version)

    # collect databases
    databases = set()
    for s, (configuration, stat) in fetch_sets(cluster):
        databases.add(configuration.database.dsn)

        # TODO: not entirely sure that this is necessary, but can't hurt
        ztransaction.check(cluster.get_set_path(s), version=stat.version)

    transactions = []
    # get_managed_databases prevents duplicates, so this is safe to perform
    # without any advisory locking (although it will error if two sets refer
    # to the same database using different DSNs). get_managed_databases should
    # probably deduplicate DSNs itself to make this more convenient, but the
    # current behavior at least keeps this from breaking inadvertently.
    for connection in get_managed_databases(cluster, databases, configure=False, same_version=False).values():
        transaction = Transaction(connection, 'update-cluster')
        transactions.append(transaction)
        with connection.cursor() as cursor:
            setup_database(cluster, cursor)

    with managed(transactions):
        commit(ztransaction)
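
A quick illustration of the downgrade guard above, assuming parse_version is pkg_resources.parse_version (the import is not shown in these snippets):

from pkg_resources import parse_version

assert parse_version('0.2.0') > parse_version('0.1.9')      # upgrade: permitted
assert not parse_version('0.1.0') > parse_version('0.1.0')  # same version: requires force=True
assert not parse_version('0.1.0') > parse_version('0.2.0')  # downgrade: requires force=True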
Example #4
File: __init__.py  Project: disqus/pgshovel
def create_set(cluster, name, configuration):
    # TODO: add dry run support

    validate_set_configuration(configuration)

    databases = get_managed_databases(cluster, (configuration.database.dsn,))

    ztransaction = check_version(cluster)

    transactions = []
    for connection in databases.values():
        transaction = Transaction(connection, 'create-set:%s' % (name,))
        transactions.append(transaction)

        with connection.cursor() as cursor:
            configure_set(cluster, cursor, name, configuration)

    ztransaction.create(
        cluster.get_set_path(name),
        BinaryCodec(ReplicationSetConfiguration).encode(configuration),
    )

    with managed(transactions):
        commit(ztransaction)
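
A hypothetical usage sketch: only the database.dsn field of ReplicationSetConfiguration is exercised by these snippets, so any other fields are omitted here, and the cluster handle and DSN are placeholders:

configuration = ReplicationSetConfiguration()
configuration.database.dsn = 'postgresql://localhost/example'
# Configures the set on the target database and records it in ZooKeeper; the
# database transactions and the ZooKeeper transaction commit together via
# managed().
create_set(cluster, 'users', configuration)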
Example #5
File: __init__.py  Project: disqus/pgshovel
def update_set(cluster, name, updated_configuration, allow_forced_removal=False):
    # TODO: add dry run support

    validate_set_configuration(updated_configuration)

    (name, (current_configuration, stat)) = fetch_sets(cluster, (name,))[0]

    # TODO: It probably makes sense to normalize the database URIs here.
    current_databases = set((current_configuration.database.dsn,))
    updated_databases = set((updated_configuration.database.dsn,))

    additions = get_managed_databases(cluster, updated_databases - current_databases)
    mutations = get_managed_databases(cluster, updated_databases & current_databases)
    deletions = get_managed_databases(
        cluster,
        current_databases - updated_databases,
        skip_inaccessible=allow_forced_removal,
    )

    # Ensure no node shows up multiple times, since that causes incorrect
    # behavior.
    # TODO: this is a very naive approach to avoid shooting ourselves in the
    # foot and could be improved for valid cases (updating a DSN for an
    # existing set should be treated as a mutation, not an addition plus a
    # deletion), but that would require a more intelligent implementation.
    occurrences = collections.Counter()
    for nodes in map(operator.methodcaller('keys'), (additions, mutations, deletions)):
        occurrences.update(nodes)

    # most_common() sorts by descending count, so takewhile collects every
    # node that appears more than once.
    duplicates = list(itertools.takewhile(lambda item: item[1] > 1, occurrences.most_common()))
    assert not duplicates, 'found duplicates: %s' % (duplicates,)

    ztransaction = check_version(cluster)

    transactions = []

    for connection in additions.values():
        transaction = Transaction(connection, 'update-set:create:%s' % (name,))
        transactions.append(transaction)
        with connection.cursor() as cursor:
            configure_set(cluster, cursor, name, updated_configuration, None)

    for connection in mutations.values():
        transaction = Transaction(connection, 'update-set:update:%s' % (name,))
        transactions.append(transaction)
        with connection.cursor() as cursor:
            configure_set(cluster, cursor, name, updated_configuration, current_configuration)

    # TODO: add help to inform the user of the possibility of retry
    for connection in deletions.values():
        transaction = Transaction(connection, 'update-set:delete:%s' % (name,))
        transactions.append(transaction)
        with connection.cursor() as cursor:
            unconfigure_set(cluster, cursor, name, current_configuration)

    ztransaction.set_data(
        cluster.get_set_path(name),
        BinaryCodec(ReplicationSetConfiguration).encode(updated_configuration),
        version=stat.version,
    )

    with managed(transactions):
        commit(ztransaction)
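
The add/mutate/delete partitioning above is plain set arithmetic on DSNs. A small illustration with placeholder values:

current_databases = {'postgresql://old-host/app'}
updated_databases = {'postgresql://new-host/app'}

assert updated_databases - current_databases == {'postgresql://new-host/app'}  # additions
assert updated_databases & current_databases == set()                          # mutations
assert current_databases - updated_databases == {'postgresql://old-host/app'}  # deletions

Note that moving a set to a new DSN is treated as an addition plus a deletion, which is exactly the limitation the TODO above calls out.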
Example #6
File: __init__.py  Project: disqus/pgshovel
def get_managed_databases(cluster, dsns, configure=True, skip_inaccessible=False, same_version=True):
    """
    Returns a dictionary of managed databases keyed by their unique node IDs.
    If the same node is referenced multiple times (whether by the same DSN or
    by different DSNs), an error is raised.

    If a database has not already been configured for use with pgshovel, it
    will be configured implicitly, unless the ``configure`` argument is
    ``False``, in which case an error is raised. If an attempt is made to
    configure the same node multiple times (by providing the same DSN more
    than once, or different DSNs that point to the same database), an error is
    raised to prevent deadlocking during configuration.

    By default, all databases must be accessible. If partial results are
    acceptable (such as when some databases are expected to have permanently
    failed), the ``skip_inaccessible`` argument allows returning only those
    databases that can be connected to; an error is logged for each one that
    cannot.
    """
    if not dsns:
        return {}

    nodes = {}

    if same_version:
        ztransaction = check_version(cluster)
    else:
        ztransaction = cluster.zookeeper.transaction()

    lock_id = random.randint(-2**63, 2**63-1)  # bigint max/min
    logger.debug('Connecting to databases: %s', FormattedSequence(dsns))

    transactions = []

    for dsn in dsns:
        try:
            connection = psycopg2.connect(dsn)
        except Exception as error:
            if skip_inaccessible:
                logger.warning('%s is inaccessible due to error, skipping: %s', dsn, error)
                continue
            else:
                raise

        logger.debug('Checking if %s has been configured...', dsn)
        try:
            with connection.cursor() as cursor:
                node_id = get_node_identifier(cluster, cursor)
                assert node_id is not None
        except psycopg2.ProgrammingError:
            if not configure:
                raise

            # TODO: Check this more carefully to ensure it is the right type
            # of error (i.e., specific to the configuration table not being
            # present).
            logger.info('%s has not been configured for use, setting up now...', dsn)
            connection.rollback()  # start over

            transaction = Transaction(connection, 'setup-database')
            transactions.append(transaction)
            with connection.cursor() as cursor:
                # To ensure that we're not attempting to configure the same
                # database multiple times (which would result in a deadlock,
                # since the second transaction will block indefinitely, waiting
                # for the first transaction to be committed or rolled back) we
                # take out an advisory lock to check that we haven't already
                # prepared this database. (We can't simply check for the
                # existence of the configuration table at this point, since
                # that transaction has not been committed yet.)
                cursor.execute('SELECT pg_try_advisory_lock(%s) as acquired', (lock_id,))
                ((acquired,),) = cursor.fetchall()
                assert acquired, 'could not take out advisory lock on %s (possible deadlock?)' % (connection,)

                node_id = setup_database(cluster, cursor)
        else:
            # Check to ensure that the remote database is configured using the
            # same version as the local version. This is important since a
            # previously configured database that has not been used for some
            # time can still have an old version of the schema, log trigger,
            # etc. Adding it back to the cluster without upgrading it can cause
            # strange compatibility issues.
            # TODO: It would make sense here to provide an easy upgrade path --
            # right now, there is no direct path to upgrading a database that
            # has no groups associated with it!
            with connection.cursor() as cursor:
                version = str(get_configuration_value(cluster, cursor, 'version'))
                assert version == __version__, 'local and node versions do not match (local: %s, node: %s)' % (__version__, version)

            logger.debug('%s is already configured as %s (version %s).', dsn, node_id, version)
            connection.commit()  # don't leave idle in transaction

        assert node_id not in nodes, 'found duplicate node: %s and %s' % (connection, nodes[node_id])
        nodes[node_id] = connection

    if transactions:
        with managed(transactions):
            commit(ztransaction)

    return nodes
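
A standalone sketch of the pg_try_advisory_lock check that guards against configuring the same database twice; the DSN and lock key are placeholders:

import psycopg2

connection = psycopg2.connect('postgresql://localhost/example')
with connection.cursor() as cursor:
    cursor.execute('SELECT pg_try_advisory_lock(%s) AS acquired', (12345,))
    ((acquired,),) = cursor.fetchall()
    # The first session to ask gets True; any other session requesting the
    # same key while it is held gets False immediately instead of blocking,
    # which is the behavior the assertion in the function above relies on.
    assert acquired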