Exemplo n.º 1
0
def main(cli_args, get_steps, slice_name):
    """Run every sync step of a slice from the upstream to the downstream database.

    Args:
        cli_args: Parsed arguments exposing ``upstream`` and ``downstream``
            connection-parameter objects (each with ``host`` and ``database``).
        get_steps: Callable invoked as ``get_steps(db_pair, cli_args, printer=...)``
            that returns a mapping of table name to a zero-argument sync
            callable; a falsy value marks the table as explicitly skipped.
        slice_name: Label of the slice, used only in the opening message.
    """
    printer = Prindenter(indent=0)

    upstream_args = cli_args.upstream
    downstream_args = cli_args.downstream

    banner = 'Syncing a {} slice from {}.{} to {}.{}'.format(
        slice_name,
        upstream_args.host, upstream_args.database,
        downstream_args.host, downstream_args.database)
    printer(banner)

    # Echo both sets of connection parameters, upstream first.
    with Indent(printer):
        for heading, params in (('[Upstream connection parameters]', upstream_args),
                                ('[Downstream connection parameters]', downstream_args)):
            printer(heading)
            with Indent(printer):
                printer(params.__dict__)

    printer("[Database configuration check]")
    with Connection(upstream_args) as upstream_connection:
        with Indent(printer):

            # Database-level info is collected into one object; the downstream
            # connection and both cursors are only held while it is built.
            with Connection(downstream_args) as downstream_connection, \
                    downstream_connection.cursor() as downstream_cursor, \
                    upstream_connection.cursor() as upstream_cursor:
                db_pair = Db.Twin(upstream_cursor, downstream_cursor, printer=printer)

            db_pair.downstream.args = downstream_args
            db_pair.upstream.args = upstream_args
            db_pair.upstream.connection = upstream_connection

            printer("[Database sync]")
            with Indent(printer):
                # Run the sync step of each table in the slice, in mapping order.
                steps = get_steps(db_pair, cli_args, printer=printer)
                for table_name, sync_func in steps.items():
                    printer(f'[Table: {table_name}]')
                    with Indent(printer):
                        if not sync_func:
                            with Indent(printer):
                                printer("skipped explicitly by slice definition")
                                printer("")
                        else:
                            sync_func()
                            printer("")

    printer('Done')
    printer.print_summary()
Exemplo n.º 2
0
def md5_row_ranges(cursor, table, condition, granularity, printer=Prindenter()):
    """Fingerprint a table in fixed-size id ranges.

    Each row matching ``condition`` is hashed, rows are bucketed by
    ``FLOOR(id / granularity)``, and the row hashes of every bucket are hashed
    again into one fingerprint per bucket.

    Args:
        cursor: Database cursor the query is run on.
        table: Object exposing ``name``, ``id_col`` and ``columns``.
        condition: SQL boolean expression placed in the WHERE clause.
        granularity: Bucket width in ids; must be greater than 1.
        printer: Indenting printer used for progress output.

    Returns:
        dict mapping ``Interval(range_begin, range_end)`` to that range's fingerprint.

    Raises:
        ValueError: If ``granularity`` is 1 or less.
    """
    if granularity <= 1:
        raise ValueError("Variable granularity scanner called, but a trivial granule size was provided")

    column_exprs = ",".join(table.columns)
    condition_summary = pretty_shorten(condition)[:-1]

    printer(f"[ Fingerprinting {cursor.connection.db}.{table.name} in row-ranges of size {granularity}\n"
            f"  where {table.id_col} in {condition_summary} ]")

    query = f"""
                SELECT MD5(GROUP_CONCAT(row_fingerprint ORDER BY id)) AS range_fingerprint,
                       row_group * {granularity} as range_begin,
                       (row_group + 1) * {granularity} - 1 as range_end
                FROM
                    (SELECT MD5(CONCAT_WS('|', {column_exprs})) as row_fingerprint,
                            FLOOR({table.id_col}/{granularity}) as row_group,
                            {table.id_col} as id
                    FROM {table.name}
                    WHERE {condition}
                    ORDER BY {table.id_col}) as r
                GROUP BY row_group;
                """

    with Indent(printer):
        ranges = show_do_query(cursor, query, printer=printer)

    # Key each fingerprint by the id interval it covers.
    fingerprints = {}
    for row in ranges:
        fingerprints[Interval(row['range_begin'], row['range_end'])] = row['range_fingerprint']
    return fingerprints
Exemplo n.º 3
0
def composite_key_sync(table_name,
                       db_pair,
                       cli_args,
                       keys,
                       condition=None,
                       printer=Prindenter()):
    """Sync a table that is identified by several key columns.

    The table is checked with ``is_synced``; if it differs, one ``multikey``
    pass is run and the table is checked again.

    Args:
        table_name: Name of the table to sync.
        db_pair: Database twin; ``downstream.args`` and ``upstream.connection`` are used.
        cli_args: Parsed arguments, forwarded to ``multikey``.
        keys: Key column names; ``keys[0]`` is handed to ``Table.Twin`` as the id column.
        condition: Optional SQL condition forwarded to ``multikey`` (a warning
            is printed because this path is untested).
        printer: Indenting printer used for progress output.

    Returns:
        Whatever ``identical`` or ``has_changes_final`` returns for the table.
    """
    if condition:
        printer("WARNING, use of 'condition' here is untested")

    # Check to see if work needs to be done.
    with Connection(db_pair.downstream.args) as downstream_connection, \
            downstream_connection.cursor() as downstream_cursor, \
            db_pair.upstream.connection.cursor() as upstream_cursor:

        table = Table.Twin(table_name, downstream_cursor, upstream_cursor,
                           keys[0], printer=printer)
        table.try_sync_schema(upstream_cursor, downstream_cursor,
                              throw=False, printer=printer)

        # The first key is not necessarily a primary key, so drop the id_col
        # attributes rather than let later code rely on them.
        del table.id_col
        del table.upstream.id_col
        del table.downstream.id_col

        def in_sync():
            return table.is_synced(upstream_cursor, downstream_cursor, printer=printer)

        syncs_completed = []

        if in_sync():
            return identical(table, "not finding any changes", printer=printer)

        multikey(table, db_pair, cli_args, keys,
                 condition=condition, printer=printer)
        syncs_completed.append('multikey sync ')
        after = f"after {','.join(syncs_completed)}"

        if in_sync():
            return identical(table, after, printer=printer)

        return has_changes_final(
            table,
            after + ", because this function is not fully implemented",
            printer=printer)
Exemplo n.º 4
0
def pull_missing_ids(table,
                     db_pair,
                     cli_args,
                     batch_rows,
                     condition=None,
                     printer=Prindenter()):
    """Bring the downstream table's id range in line with upstream (one-way).

    * Equal max ids: nothing is done.
    * Downstream max id is higher: the extra downstream rows are deleted.
    * Upstream max id is higher: the missing rows are dumped from upstream
      in batches and loaded downstream.

    Args:
        table: Table twin exposing ``name``, ``id_col``, ``upstream.max_id``
            and ``downstream.max_id``.
        db_pair: Database twin; ``downstream.args`` is used to connect.
        cli_args: Parsed arguments; ``upstream`` / ``downstream`` are passed
            to the dump and load helpers.
        batch_rows: Batch size handed to ``mysqldump_data_batches``.
        condition: Optional SQL condition restricting the affected rows.
        printer: Indenting printer used for progress output.

    Returns:
        False when the max ids already match, True otherwise.
    """
    def make_space_downstream(printer):
        """Delete downstream rows whose id is above the upstream max id."""
        delete = f'delete from {table.name} where {table.id_col} > {table.upstream.max_id}'
        if condition:
            delete += f' and {condition}'
        delete += ';'

        with Connection(db_pair.downstream.args) as downstream_connection:
            with downstream_connection.cursor() as cursor:
                with Indent(printer):
                    show_do_query(cursor, delete, printer=printer)

    downstream_max = table.downstream.max_id

    if downstream_max == table.upstream.max_id:
        printer("Nothing to sync")
        return False

    # Downstream changes beyond upstream's max id are clobbered: this is a one-way sync.
    if downstream_max > table.upstream.max_id:
        printer("Downstream db has more rows, deleting them.")
        make_space_downstream(printer)
        return True

    printer("Upstream db has more rows, pulling them.")

    # Dump to a file.  An empty target gets everything; otherwise only the
    # rows whose ids are beyond what the target already holds.
    dump_options = dict(id_col=table.id_col, condition=condition, printer=printer)
    target_is_empty = downstream_max is None or downstream_max == 0
    if not target_is_empty:
        dump_options['min_id'] = downstream_max + 1
    mysqldump_data_batches(cli_args.upstream,
                           table.name,
                           batch_rows,
                           table.upstream.max_id,
                           **dump_options)

    printer("Making space downstream")
    make_space_downstream(printer)

    # Load from the dumped file.
    printer("Loading updated rows")
    mysqlload(cli_args.downstream, table.name, printer=printer)

    return True
Exemplo n.º 5
0
def main(args):
    """Pull the upstream schema into the downstream database.

    Args:
        args: Parsed arguments exposing ``upstream`` and ``downstream``
            connection-parameter objects (each with ``host``).
    """
    printer = Prindenter(indent=0)

    source_host = args.upstream.host
    target_host = args.downstream.host
    printer('Pulling schema from {} to {}'.format(source_host, target_host))

    # The upstream connection stays open for the duration of the pull.
    with Connection(args.upstream) as upstream_connection:
        with Indent(printer):
            pull_schema(args, upstream_connection, printer=printer)

    printer('Done')
Exemplo n.º 6
0
def get_last_touched_date(table, column, db, printer=Prindenter()):
    """Return the largest value stored in ``column`` of ``table``.

    Args:
        table: Table name, interpolated into the query.
        column: Column name, interpolated into the query.
        db: Object whose ``args`` holds the connection parameters
            (``host`` and ``database`` are shown in the progress message).
        printer: Indenting printer used for progress output.

    Returns:
        The ``max(column)`` value reported by the database.
    """
    printer(f"[ Finding most recent modification date from {db.args.host}.{db.args.database}.{table}.{column} ]")
    with Indent(printer):
        with Connection(db.args) as connection, connection.cursor() as cursor:
            result_key = f"max({column})"
            rows = show_do_query(cursor,
                                 f'select max({column}) from {table};',
                                 printer=printer)
            most_recent = rows[0][result_key]
        printer(f"Found: {most_recent}")
    return most_recent
Exemplo n.º 7
0
def pull_modifications_since(date,
                             table,
                             column,
                             db_pair,
                             cli_args,
                             condition=None,
                             printer=Prindenter()):
    """Re-pull every upstream row whose ``column`` value is newer than ``date``.

    The ids of the newer rows are looked up upstream, split into batches, and
    for each batch the rows are dumped from upstream, deleted downstream and
    loaded downstream again.

    Args:
        date: Lower bound (exclusive) compared against ``column``.
        table: Table twin exposing ``name`` and ``id_col``.
        column: Name of the modification-time column.
        db_pair: Database twin; ``upstream.connection`` and ``downstream.args`` are used.
        cli_args: Parsed arguments; ``upstream`` / ``downstream`` are passed to
            the dump and load helpers.
        condition: Optional extra SQL condition for the lookup query.
        printer: Indenting printer used for progress output.

    Returns:
        True if newer rows were found (and re-pulled), False otherwise.
    """
    printer("syncing rows from {}.{} with {} newer than {}".format(
        db_pair.upstream.args.database, table.name, column, date))
    if condition:
        printer("... where {}".format(condition))

    with Indent(printer):
        with db_pair.upstream.connection.cursor() as upstream_cursor:
            newer_than_sql = f"select {table.id_col} from {table.name} where {column} > '{date}'"
            if condition:
                newer_than_sql += f" and {condition}"
            newer_than_sql += ";"
            newer_than_result = show_do_query(upstream_cursor,
                                              newer_than_sql,
                                              printer=printer)

        if not newer_than_result:
            printer("No recent modifications found")
            return False

        ids_to_sync = [row[table.id_col] for row in newer_than_result]
        printer("Found {} such rows".format(len(ids_to_sync)))

        # One "id in (...)" condition per batch of ids.
        batch_conditions = [
            "{} in ({})".format(table.id_col, ",".join(map(str, id_batch)))
            for id_batch in Ids.partition(Constants.batch_conditions, ids_to_sync)
        ]

        with Indent(printer):
            printer("Proceeding in {} batches".format(len(batch_conditions)))
            for batch_condition in batch_conditions:

                # Dump upstream data.
                mysqldump_data(cli_args.upstream,
                               table.name,
                               batch_condition,
                               printer=printer)

                # Clear the old rows from downstream.
                delete = 'delete from {} where {};'.format(table.name, batch_condition)
                with Connection(db_pair.downstream.args) as downstream_connection, \
                        downstream_connection.cursor() as cursor:
                    show_do_query(cursor, delete, printer=printer)

                # Load the new rows into downstream.
                mysqlload(cli_args.downstream, table.name, printer=printer)

        return True
Exemplo n.º 8
0
    def is_synced_warn(self, upstream_cursor, downstream_cursor, message='', printer=Prindenter()):
        """Check whether the table is synced and record the verdict in the summary.

        Args:
            upstream_cursor: Cursor on the upstream database.
            downstream_cursor: Cursor on the downstream database.
            message: Extra text appended to the summary line.
            printer: Indenting printer; receives the summary line.

        Returns:
            The result of ``self.is_synced``.
        """
        equality_found = self.is_synced(upstream_cursor, downstream_cursor, printer=printer)

        if equality_found:
            summary = f"{self.name} : IDENTICAL {message}"
        else:
            # Open question from the original author: could a time/date change
            # that check_columns ignores also end up here?
            summary = f"{self.name} : DIFFERS {message} (Attempts exhausted, were changes made during sync?)"

        printer.append_summary(summary)
        return equality_found
Exemplo n.º 9
0
def show_create(cursor, table_name, printer=Prindenter()):
    """Return the ``SHOW CREATE TABLE`` statement text for a table.

    Args:
        cursor: Database cursor the query is run on.
        table_name: Name of the table, interpolated into the query.
        printer: Indenting printer used for progress output.

    Returns:
        The ``Create Table`` column of the first result row, stripped of
        surrounding whitespace.
    """
    printer(f"[Extracting creation SQL from {cursor.connection.db}.{table_name}]")

    query = f"""
                SHOW CREATE TABLE {table_name};
                """

    with Indent(printer):
        rows = show_do_query(cursor, query, printer=printer)
        creation_sql = rows[0]['Create Table']
        return creation_sql.strip()
Exemplo n.º 10
0
 def reup_maxes(self, downstream_cursor, upstream_cursor, printer=Prindenter()):
     """Re-apply the stored group_concat byte limits to two cursors.

     Args:
         downstream_cursor: Cursor that receives ``self.downstream.concat.bytes``.
         upstream_cursor: Cursor that receives ``self.upstream.concat.bytes``.
         printer: Indenting printer used for progress output.
     """
     heading = "[Setting group_concat_max_len to known-good value discovered in earlier session]"
     printer(heading)

     with Indent(printer):
         # Upstream is configured first, then downstream.
         set_group_concat(upstream_cursor, self.upstream.concat.bytes, printer=printer)
         set_group_concat(downstream_cursor, self.downstream.concat.bytes, printer=printer)
Exemplo n.º 11
0
    def is_synced(self, upstream_cursor, downstream_cursor, printer=Prindenter()):
        """Compare the table's checksum on both sides.

        Runs ``checksum table <name>;`` on each cursor and compares the
        ``Checksum`` column of the first result row.

        Args:
            upstream_cursor: Cursor on the upstream database.
            downstream_cursor: Cursor on the downstream database.
            printer: Indenting printer used for progress output.

        Returns:
            True when both checksums are equal, False otherwise.
        """
        with Indent(printer):
            get_checksum = f'checksum table {self.name};'

            result = show_do_query(upstream_cursor, get_checksum, printer=printer)
            upstream_checksum = result[0]['Checksum']

            result = show_do_query(downstream_cursor, get_checksum, printer=printer)
            downstream_checksum = result[0]['Checksum']

            if upstream_checksum != downstream_checksum:
                return False

            # Report before returning; this message used to sit after the
            # return statement and was never printed.
            printer(f"{self.name} is identical on either side")
            return True
Exemplo n.º 12
0
    def __init__(self, table_name, cursor, id_col, printer=Prindenter()):
        """Describe one side (upstream or downstream) of a table.

        Stores the table name and id column, the concatenation-friendly column
        expressions from ``examine_columns``, and the current maximum id.

        Args:
            table_name: Name of the table.
            cursor: Cursor on the database holding the table.
            id_col: Name of the column used as the row id.
            printer: Indenting printer used for progress output.
        """
        # Values that also exist on Twin and do not differ between the two sides.
        self.id_col = id_col
        self.name = table_name

        # Column descriptions, adjusted so that they can be concatenated.
        self.columns = examine_columns(cursor, table_name, printer=printer)

        # Largest id currently present; a falsy result (e.g. no rows) becomes 0.
        max_expr = f'max({self.id_col})'
        printer(f"[Finding {max_expr} for {cursor.connection.db}.{self.name}]")
        max_query = f'select {max_expr} from {self.name};'
        with Indent(printer):
            rows = show_do_query(cursor, max_query, printer=printer)
            self.max_id = rows[0][max_expr] or 0
Exemplo n.º 13
0
    def __init__(self, upstream_cursor, downstream_cursor, printer=Prindenter()):
        """Describe both databases and the group_concat limits they share.

        Args:
            upstream_cursor: Cursor on the upstream database.
            downstream_cursor: Cursor on the downstream database.
            printer: Indenting printer used for progress output.
        """
        printer("[Upstream db]")
        with Indent(printer):
            self.upstream = One(upstream_cursor, printer=printer)

        printer("[Downstream db]")
        with Indent(printer):
            self.downstream = One(downstream_cursor, printer=printer)

        # The pair as a whole is limited by the smaller value of each limit.
        down_limits = self.downstream.concat
        up_limits = self.upstream.concat
        shared_md5s = min(down_limits.md5s, up_limits.md5s)
        shared_bytes = min(down_limits.bytes, up_limits.bytes)
        self.concat = GroupConcat(shared_md5s, shared_bytes)
Exemplo n.º 14
0
def get_steps(db_pair, cli_args, printer=Prindenter()):
    """Build the ordered sync steps for this slice.

    Args:
        db_pair: Database twin handed to every sync function.
        cli_args: Parsed arguments handed to every sync function.
        printer: Indenting printer handed to every sync function.

    Returns:
        OrderedDict mapping table name to a zero-argument callable that syncs
        the table, or to None when the table is covered by another step.
    """
    def sync(table, zoom_levels):
        # Returns the result of Sync.general directly; a stray trailing comma
        # used to wrap it in a 1-tuple.
        return Sync.general(table,
                            zoom_levels,
                            db_pair,
                            cli_args,
                            printer=printer)

    steps = OrderedDict()

    steps['foo_tokens'] = lambda: pull_foo(db_pair, cli_args, printer=printer)
    steps['foo_ref'] = None  # this table is also handled by pull_foo
    steps['baz'] = lambda: sync('baz', [10])
    steps['bar'] = lambda: sync('bar', [100, 1])

    return steps
Exemplo n.º 15
0
def strip_fk(mysql_args, printer=Prindenter()):
    """Drop every foreign-key constraint in the target database.

    The constraints are listed from
    ``INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS`` for the schema named by
    ``mysql_args.database`` and dropped one by one.

    Args:
        mysql_args: Connection parameters; ``database`` names the schema.
        printer: Indenting printer used for progress output.
    """
    # Read the schema name from this function's own argument; the body used to
    # reference ``cli_args``, which is not defined in this function.
    target_db = mysql_args.database

    with Connection(mysql_args) as connection:
        with connection.cursor() as cursor:
            foreign_keys = '''
                           SELECT CONSTRAINT_NAME, TABLE_NAME
                           FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS
                           WHERE CONSTRAINT_SCHEMA = '{}'
                           '''.format(target_db)
            constraints = show_do_query(cursor, foreign_keys, printer=printer)

            for row in constraints:
                fk = row['CONSTRAINT_NAME']
                table = row['TABLE_NAME']
                drop_constraint = '''
                                  ALTER TABLE {} DROP FOREIGN KEY {};
                                  '''.format(table, fk)
                with Indent(printer):
                    show_do_query(cursor, drop_constraint, printer=printer)
Exemplo n.º 16
0
def strip_uk(target_args, printer=Prindenter()):
    """Drop every UNIQUE constraint's index in the target database.

    The constraints are listed from ``information_schema.table_constraints``
    for the schema named by ``target_args.database`` and dropped one by one.

    Args:
        target_args: Connection parameters; ``database`` names the schema.
        printer: Indenting printer used for progress output.
    """
    target_db = target_args.database

    with Connection(target_args) as connection, connection.cursor() as cursor:
        unique_keys = '''
                          SELECT DISTINCT constraint_name, table_name
                          FROM information_schema.table_constraints
                          WHERE constraint_type = 'UNIQUE'
                          AND table_schema = '{}';
                           '''.format(target_db)
        constraints = show_do_query(cursor, unique_keys, printer=printer)

        for entry in constraints:
            index_name = entry['constraint_name']
            owning_table = entry['table_name']
            drop_constraint = '''
                                  DROP INDEX {} ON {};
                                  '''.format(index_name, owning_table)
            with Indent(printer):
                show_do_query(cursor, drop_constraint, printer=printer)
Exemplo n.º 17
0
def get_group_concat(cursor, try_set=10000000 * 33, printer=Prindenter()):
    """Negotiate ``group_concat_max_len`` and work out how many md5s fit in it.

    The session variable is set to ``try_set``, read back (the server may
    grant less), and divided by the byte length of one md5 plus a comma.

    Args:
        cursor: Database cursor the queries are run on.
        try_set: Requested ``group_concat_max_len`` in bytes.  The default is
            33 bytes (32 for an md5 plus 1 for the comma) times 10,000,000.
            NOTE(review): the original comments speak of "a million rows";
            confirm which figure is intended.
        printer: Indenting printer used for progress output.

    Returns:
        ``GroupConcat(rows, max_group_concat_bytes)``.
    """
    # The request is capped on purpose: the original author prefers many small
    # queries over a few huge ones that tie up the server without gaps.
    host = cursor.connection.host

    printer("How many rows is {} willing to hash at a time?".format(host))
    with Indent(printer):

        # Ask for plenty of space ...
        printer("Asking for lots lof space...")
        show_do_query(cursor,
                      "set session group_concat_max_len = {};".format(try_set),
                      printer=printer)

        # ... but accept whatever the server actually grants ...
        printer("Taking what we can get...")
        granted = show_do_query(
            cursor,
            "show variables where Variable_name = 'group_concat_max_len';",
            printer=printer)
        max_group_concat_bytes = int(granted[0]['Value'])

        # ... and convert that into a number of rows.
        printer("How many of these will fit?")
        measured = show_do_query(
            cursor,
            "select length(concat(md5('foo'),',')) as md5_bytes;",
            printer=printer)
        md5_bytes = int(measured[0]['md5_bytes'])

        rows = floor(max_group_concat_bytes / md5_bytes)

    printer("{} is willing to hash {} rows at a time.".format(host, rows))
    return GroupConcat(rows, max_group_concat_bytes)
Exemplo n.º 18
0
def md5_rows(cursor, table, condition, granularity, printer=Prindenter()):
    """Fingerprint every row matching ``condition`` individually.

    Args:
        cursor: Database cursor the query is run on.
        table: Object exposing ``name``, ``id_col`` and ``columns``.
        condition: SQL boolean expression placed in the WHERE clause.
        granularity: Must be 1 or less; larger values belong to the
            row-range scanner.
        printer: Indenting printer used for progress output.

    Returns:
        dict mapping each row's id to the MD5 fingerprint of its columns.

    Raises:
        ValueError: If ``granularity`` is greater than 1.
    """
    if granularity > 1:
        raise ValueError("Individual row scanner called, but a nontrivial row-range size was provided")

    converted_columns_str = ",".join(table.columns)

    shortened_condition = pretty_shorten(condition)[:-1]
    printer(f"[ Fingerprinting each row in {cursor.connection.db}.{table.name}\n"
            f"  where {table.id_col} in {shortened_condition} ]")
    with Indent(printer):

        result = show_do_query(cursor,
                f"""
                    SELECT {table.id_col} as id, MD5(CONCAT_WS('|', {converted_columns_str})) as fingerprint
                    FROM {table.name}
                    WHERE {condition}
                    ORDER BY {table.id_col};
                """,
                printer=printer)

        # The query aliases the id column as ``id``, so read it back under
        # that alias; looking it up by ``table.id_col`` only works when the
        # column happens to be called "id".
        return {row['id']: row['fingerprint'] for row in result}
Exemplo n.º 19
0
    def fingerprint_groups(cursor,
                           table,
                           top_key,
                           sub_keys,
                           printer=Prindenter()):
        """Group the table by ``top_key`` and hash each group's contents.

        Args:
            cursor: Database cursor the query is run on.
            table: Table twin exposing ``name`` and ``upstream.columns``.
            top_key: Column the rows are grouped by.
            sub_keys: Columns that order the rows inside each group.
            printer: Indenting printer used for progress output.

        Returns:
            The result of ``show_do_query``: one row per group with the
            ``top_key`` column and a ``group_fingerprint`` column.
        """
        heading = f"[ Fingerprinting {cursor.connection.db}.{table.name} grouped by {top_key} ]"
        printer(heading)

        with Indent(printer):
            # The upstream column list is used no matter which cursor is passed.
            hashed_columns = ",".join(table.upstream.columns)
            ordering = ",".join(sub_keys)
            query = f"""
                                 SELECT {top_key},
                                     MD5(GROUP_CONCAT({hashed_columns}
                                         ORDER BY {ordering})) AS group_fingerprint
                                 FROM {table.name}
                                 GROUP BY {top_key};
                                 """
            return show_do_query(cursor, query, printer=printer)
Exemplo n.º 20
0
def examine_columns(cursor, table_name, printer=Prindenter()):
    """Build a concatenation-friendly SQL expression for every column of a table.

    Column metadata is read from ``information_schema.columns`` and each column
    name is wrapped as needed: NULLs are replaced by the string 'NULL',
    columns with a collation other than ``utf8_general_ci`` are cast to BINARY,
    and binary column types are hex-encoded.

    Args:
        cursor: Database cursor; ``cursor.connection.db`` names the schema.
        table_name: Name of the table to examine.
        printer: Indenting printer used for progress output.

    Returns:
        list of SQL expressions, one per column, in query-result order.
    """
    printer(f"[Examining Columns on {cursor.connection.db}.{table_name}]")

    query = f"""
                SELECT COLUMN_NAME, IS_NULLABLE, COLUMN_TYPE, COLLATION_NAME
                FROM information_schema.columns
                WHERE table_schema='{cursor.connection.db}'
                AND table_name='{table_name}';
                """

    with Indent(printer):
        described = show_do_query(cursor, query, printer=printer)

        expressions = []
        for info in described:
            nullable = info['IS_NULLABLE'] == 'YES'
            collation = info['COLLATION_NAME']
            is_binary_type = 'binary(' in info['COLUMN_TYPE']

            # Start from the quoted column name and wrap it step by step.
            expr = f"`{info['COLUMN_NAME']}`"
            if nullable:
                expr = f"IFNULL({expr}, 'NULL')"
            if collation and collation not in ['NULL', 'utf8_general_ci']:
                expr = f"BINARY {expr}"
            if is_binary_type:
                expr = f"hex({expr})"

            # Data that deviates in other ways would be handled here.

            with Indent(printer):
                printer(expr)
            expressions.append(expr)

        return expressions
Exemplo n.º 21
0
def pull_schema(args, upstream_connection, printer=Prindenter()):
    """Copy the upstream schema into the downstream database if it is empty.

    A downstream database that already has tables is left alone and a message
    is printed instead; dropping it is deliberately left to the operator.

    Args:
        args: Parsed arguments exposing ``upstream`` and ``downstream``
            connection-parameter objects.
        upstream_connection: Open upstream connection (not used by this function).
        printer: Indenting printer used for progress output.
    """
    target_db = args.downstream.database

    with Connection(args.downstream) as downstream_connection, \
            downstream_connection.cursor() as cursor:
        existing_tables = show_do_query(cursor, 'show tables;', printer=printer)
        table_ct = len(existing_tables)

    if table_ct > 0:
        refusal = ("{} is a nonempty downstream database. "
                   "If you want me to create a new database in its place, you'll have to drop and create it yourself.")
        printer(refusal.format(target_db))
        return

    tmp_file = 'schema_nofk.sql'

    # Dump the schema to a file, then load that file downstream.
    mysqldump_schema_nofk(args.upstream, tmp_file, printer=printer)
    mysqlload(args.downstream, tmp_file, printer=printer)
Exemplo n.º 22
0
def pull_foo(db_pair, cli_args, printer=Prindenter()):
    """Replace downstream ``foo_ref`` / ``foo_tokens`` with the relevant upstream rows.

    "Relevant" rows are the ``foo_ref`` rows whose name starts with
    ``relevant`` and the ``foo_tokens`` rows they reference.

    Args:
        db_pair: Database twin; ``upstream.connection`` and ``downstream.args`` are used.
        cli_args: Parsed arguments; ``upstream`` / ``downstream`` are passed to
            the dump and load helpers.
        printer: Indenting printer; also receives two summary lines.
    """
    # Find the ids of the rows that matter.
    with db_pair.upstream.connection.cursor() as upstream_cursor:
        get_ids = '''select id, foo_token_id from foo_ref where name like 'relevant%';'''
        relevant = show_do_query(upstream_cursor, get_ids, printer=printer)
        foo_token_ids = ', '.join(str(row['foo_token_id']) for row in relevant)
        foo_ref_ids = ', '.join(str(row['id']) for row in relevant)

    # Dump just those rows.
    for dumped_table, id_csv in (('foo_ref', foo_ref_ids),
                                 ('foo_tokens', foo_token_ids)):
        mysqldump_data(cli_args.upstream,
                       dumped_table,
                       'id in ({});'.format(id_csv),
                       printer=printer)

    # Clear the old rows.
    with Connection(db_pair.downstream.args) as downstream_connection, \
            downstream_connection.cursor() as cursor:
        for statement in ('truncate foo_ref;', 'truncate foo_tokens;'):
            show_do_query(cursor, statement, printer=printer)

    # Load the new rows.
    for loaded_table in ('foo_ref', 'foo_tokens'):
        mysqlload(cli_args.downstream, loaded_table, printer=printer)

    for summarized_table in ('foo_ref', 'foo_tokens'):
        printer.append_summary(f"{summarized_table} : UP TO DATE for select rows")

    printer("foo_tokens and foo_ref are up to date where it matters")
Exemplo n.º 23
0
    def try_sync_schema(self, upstream_cursor, downstream_cursor, throw=True, printer=Prindenter()):
        """Attempt to bring the downstream table schema in line with upstream.

        Nothing is attempted when ``self.successful_schema_sync`` is already
        set.  Otherwise ``sync_schema`` is run and the flag is set when
        ``Twin.report_if_schema_changed`` returns a falsy value.

        Args:
            upstream_cursor: Cursor on the upstream database.
            downstream_cursor: Cursor on the downstream database.
            throw: When True, errors raised during the sync propagate; when
                False they are printed (with traceback) and suppressed.
            printer: Indenting printer used for progress output.
        """
        printer(f"[Comparing upstream/downstream schemas for table: {self.name}]")
        with Indent(printer):

            if not self.successful_schema_sync:
                try:
                    schema_changes = sync_schema(upstream_cursor, downstream_cursor, self.name, printer=printer)
                    if not Twin.report_if_schema_changed(self, schema_changes, printer):
                        self.successful_schema_sync = True
                except Exception:
                    if throw:
                        raise
                    # Best effort: the caller asked for errors to be suppressed.
                    printer("Error occurred while syncing schema, but errors were suppressed")
                    printer("Will retry schema sync after data sync")
                    printer(traceback.format_exc())

            if self.successful_schema_sync:
                printer("...schemas are in sync")
            else:
                printer("...schemas are NOT in sync")
Exemplo n.º 24
0
def main(args):
    """Drop all unique keys from the database described by ``args``.

    Args:
        args: Connection parameters of the target database; ``database`` is
            shown in the opening message and the whole object is passed to
            ``strip_uk``.
    """
    printer = Prindenter(indent=0)

    # ``args`` is the target: the body used to read an undefined ``target_args``
    # and passed ``args`` twice, colliding with the ``printer`` keyword.
    printer('Dropping unique keys from database: {}'.format(args.database))
    strip_uk(args, printer=printer)
    printer('Done')
Exemplo n.º 25
0
def multikey(table,
             db_pair,
             cli_args,
             keycolumns,
             condition=None,
             printer=Prindenter()):
    """Sync a table by comparing groups of rows that share ``keycolumns[0]``.

    Groups are first compared by row count.  If the table still differs
    afterwards, groups are compared by an MD5 fingerprint of their contents.
    Every differing group is deleted downstream and re-loaded from an
    upstream dump; groups missing upstream are deleted downstream.

    Args:
        table: Table twin exposing ``name``, ``upstream.columns`` and ``is_synced``.
        db_pair: Database twin; ``upstream.connection`` and ``downstream.args`` are used.
        cli_args: Parsed arguments; ``upstream`` / ``downstream`` are passed to
            the dump and load helpers.
        keycolumns: Key column names; the first is the grouping key, the rest
            order the rows inside a group for fingerprinting.
        condition: Accepted for parity with the other sync functions; it is
            not used by this function.
        printer: Indenting printer used for progress output.

    Returns:
        True if any group was pulled down or deleted downstream, else False.
    """

    def group_sync(id_col,
                   check_col,
                   upstream,
                   downstream,
                   printer=Prindenter()):
        """Sync the groups whose ``check_col`` differs between two query results.

        Returns True when rows were dumped/loaded and/or deleted, so that the
        caller can record it (rebinding a flag argument here would not be
        visible to the caller).
        """
        values_by_group = {}

        def populate(name, query_result, key, store):
            for row in query_result:
                try:
                    store.setdefault(row[key], {})
                    store[row[key]][name] = row[check_col]
                except KeyError:
                    # NOTE(review): debugging hook kept from the original; it
                    # opens an interactive shell when a result row lacks an
                    # expected column.
                    IPython.embed()

        populate('up', upstream, id_col, values_by_group)
        populate('down', downstream, id_col, values_by_group)

        to_delete = []
        to_write = []

        for group_id, stream in values_by_group.items():
            if 'up' not in stream:
                # Only exists downstream: remove it.
                to_delete.append(group_id)
            elif 'down' not in stream:
                # Only exists upstream: pull it.
                to_write.append(group_id)
            elif stream['up'] != stream['down']:
                # Exists on both sides but differs: replace it.
                to_delete.append(group_id)
                to_write.append(group_id)

        # Test the lists themselves, not their elements, so that a falsy id
        # (such as 0) still counts as work to do.
        if to_write:
            write_condition = f"{id_col} in ({','.join(map(str, to_write))})"
        else:
            write_condition = None

        if to_delete:
            delete_condition = f"{id_col} in ({','.join(map(str, to_delete))})"
        else:
            delete_condition = None

        if write_condition:
            printer(
                f"Found {len(to_write)} groups to pull down from upstream"
            )
            mysqldump_data(cli_args.upstream,
                           table.name,
                           write_condition,
                           printer=printer)
        else:
            printer("Nothing to pull down from upstream")

        if delete_condition:
            printer("Making space downstream")
            with Indent(printer):
                with Connection(
                        db_pair.downstream.args) as downstream_connection:
                    with downstream_connection.cursor() as cursor:
                        show_do_query(
                            cursor,
                            f'delete from {table.name} where {delete_condition};',
                            printer=printer)
        else:
            printer("Downstream space is open for new data")

        if write_condition:
            # Load from the dumped file.
            printer("Loading rows")
            mysqlload(cli_args.downstream, table.name, printer=printer)

        return bool(to_write) or bool(to_delete)

    # Group the table by 'top_key' and hash the groups.
    def fingerprint_groups(cursor,
                           table,
                           top_key,
                           sub_keys,
                           printer=Prindenter()):

        printer(
            f"[ Fingerprinting {cursor.connection.db}.{table.name} grouped by {top_key} ]"
        )
        with Indent(printer):

            all_columns = ",".join(table.upstream.columns)
            subkey_columns = ",".join(sub_keys)
            return show_do_query(cursor,
                                 f"""
                                 SELECT {top_key},
                                     MD5(GROUP_CONCAT({all_columns}
                                         ORDER BY {subkey_columns})) AS group_fingerprint
                                 FROM {table.name}
                                 GROUP BY {top_key};
                                 """,
                                 printer=printer)

    made_changes = False

    # First pass: sync based on row cardinality per group.
    ids = f"SELECT {keycolumns[0]}, count(*) as group_size FROM {table.name} group by 1;"

    with db_pair.upstream.connection.cursor() as upstream_cursor:
        upstream = show_do_query(upstream_cursor, ids, printer=printer)

        with Connection(db_pair.downstream.args) as downstream_connection:
            with downstream_connection.cursor() as downstream_cursor:
                downstream = show_do_query(downstream_cursor,
                                           ids,
                                           printer=printer)

    printer(
        f"[ Using {keycolumns[0]} as a key to sync missing rows on table {table.name} ]"
    )
    with Indent(printer):
        if group_sync(keycolumns[0],
                      'group_size',
                      upstream,
                      downstream,
                      printer=printer):
            made_changes = True

    # Second pass: if differences persist, sync based on row contents.
    with db_pair.upstream.connection.cursor() as upstream_cursor:
        with Connection(db_pair.downstream.args) as downstream_connection:
            with downstream_connection.cursor() as downstream_cursor:
                if table.is_synced(upstream_cursor,
                                   downstream_cursor,
                                   printer=printer):
                    return made_changes

                upstream = fingerprint_groups(upstream_cursor,
                                              table,
                                              keycolumns[0],
                                              keycolumns[1:],
                                              printer=printer)
                downstream = fingerprint_groups(downstream_cursor,
                                                table,
                                                keycolumns[0],
                                                keycolumns[1:],
                                                printer=printer)

                printer(
                    f"[ Using {keycolumns[0]} as a key to find mismatched data on table {table.name} ]"
                )
                with Indent(printer):
                    if group_sync(keycolumns[0],
                                  'group_fingerprint',
                                  upstream,
                                  downstream,
                                  printer=printer):
                        made_changes = True

                return made_changes
Exemplo n.º 26
0
    def __init__(self, table_name, downstream_cursor, upstream_cursor, id_col, printer=Prindenter()):
        """Describe a table on both sides, creating the downstream twin if needed.

        Args:
            table_name: Name of the table.
            downstream_cursor: Cursor on the downstream database.
            upstream_cursor: Cursor on the upstream database.
            id_col: Name of the column used as the row id.
            printer: Indenting printer used for progress output.
        """
        self.name = table_name
        self.id_col = id_col

        def describe(heading, cursor):
            # Print a heading, then examine one side of the table beneath it.
            printer(heading)
            with Indent(printer):
                return One(table_name, cursor, id_col, printer=printer)

        self.upstream = describe(f"[Upstream {table_name}]", upstream_cursor)

        # The downstream side can only be examined once the table exists there.
        create_twin_if_not_exists(upstream_cursor, downstream_cursor, table_name, printer=printer)

        self.downstream = describe(f"[Downstream {table_name}]", downstream_cursor)

        # Becomes True once a schema sync completes.
        self.successful_schema_sync = False
Exemplo n.º 27
0
def pre_general(table_name,
                db_pair,
                cli_args,
                id_col,
                batch_rows,
                condition=None,
                printer=Prindenter()):
    """Run the cheap pre-sync passes for one table and report the outcome.

    Steps, in order:
      1. Build a ``Table.Twin`` for the table and attempt a schema sync
         (``throw=False``).
      2. If upstream has a ``modified_time`` column, read the downstream
         last-touched date before any rows are pulled.
      3. If upstream has the table's id column, call ``pull_missing_ids``.
      4. If upstream has a ``modified_time`` column, call
         ``pull_modifications_since`` with the date captured in step 2.
      5. Unless ``cli_args.lite`` is set, run ``table.is_synced`` to pick the
         report function.

    Args:
        table_name: name of the table to examine.
        db_pair: object exposing ``downstream.args``, ``downstream`` and
            ``upstream.connection``.
        cli_args: parsed arguments; ``lite`` is read here, the rest is passed on.
        id_col: id column name handed to ``Table.Twin``.
        batch_rows: passed through to ``pull_missing_ids``.
        condition: optional restriction passed through to the pull helpers.
        printer: callable used for progress output.

    Returns:
        Whatever the selected report function (``unknown``, ``identical`` or
        ``has_changes``) returns for ``(table, preposition)``.
    """
    modified_col = '`modified_time`'

    # labels of the presync passes that reported doing work
    performed = []

    # One downstream connection is opened for the check; the upstream
    # connection is the long-lived one stored on db_pair.
    with Connection(db_pair.downstream.args) as downstream_connection, \
            downstream_connection.cursor() as downstream_cursor, \
            db_pair.upstream.connection.cursor() as upstream_cursor:

        table = Table.Twin(table_name,
                           downstream_cursor,
                           upstream_cursor,
                           id_col,
                           printer=printer)

        table.try_sync_schema(upstream_cursor,
                              downstream_cursor,
                              throw=False,
                              printer=printer)

        # TODO : move modified_time / last_touched checks into Table.Twin
        # Capture the downstream last-modified time before the id pass
        # touches the table.
        if modified_col in table.upstream.columns:
            last_touched = Table.get_last_touched_date(table_name,
                                                       modified_col,
                                                       db_pair.downstream,
                                                       printer=printer)

        # pass 1: rows whose ids are missing downstream
        if f'`{table.id_col}`' in table.upstream.columns:
            printer(
                f"[syncing (on '{table.id_col}') table: {table.name}]")
            with Indent(printer):
                pulled_ids = pull_missing_ids(table,
                                              db_pair,
                                              cli_args,
                                              batch_rows,
                                              condition=condition,
                                              printer=printer)
                if pulled_ids:
                    performed.append("missing-id comparison")

        # pass 2: rows modified since the downstream was last touched
        if modified_col in table.upstream.columns:
            printer("[syncing (on 'modified_time') table: {}]".format(
                table_name))
            with Indent(printer):
                pulled_changes = pull_modifications_since(last_touched,
                                                          table,
                                                          'modified_time',
                                                          db_pair,
                                                          cli_args,
                                                          condition=condition,
                                                          printer=printer)
                if pulled_changes:
                    performed.append("modified_time comparison")

        # prepare report.  What was done and where does that leave us?
        if not performed:
            performed.append("not finding any changes")
        preposition = "after " + " & ".join(performed)

        if cli_args.lite:
            reportfunc = unknown
            printer("Skipped interim equality check due to lite mode")
        else:
            printer("[Interim equality check for table {}]".format(
                table_name))
            in_sync = table.is_synced(upstream_cursor,
                                      downstream_cursor,
                                      printer=printer)
            reportfunc = identical if in_sync else has_changes

        return reportfunc(table, preposition, printer=printer)
Exemplo n.º 28
0
def create_twin_if_not_exists(upstream_cursor, downstream_cursor, table_name, printer=Prindenter()):
    """Create ``table_name`` downstream unless information_schema already lists it.

    The downstream ``information_schema.tables`` is queried for the table in
    the downstream connection's database.  When nothing comes back, the SQL
    returned by ``show_create`` for the upstream table (presumably its CREATE
    statement -- confirm against ``show_create``) is executed downstream.

    Args:
        upstream_cursor: cursor used to obtain the table definition.
        downstream_cursor: cursor used for the existence check and creation.
        table_name: name of the table to look for / create.
        printer: callable used for progress output.
    """
    printer(f"[Checking for table existence: {downstream_cursor.connection.db}.{table_name}]")
    with Indent(printer):
        existence_query = f"""
                SELECT *
                FROM information_schema.tables
                WHERE table_schema = '{downstream_cursor.connection.db}'
                    AND table_name = '{table_name}'
                LIMIT 1;
                """
        matches = show_do_query(downstream_cursor, existence_query, printer=printer)

        if not any(matches):
            printer("It does not exist, creating it")
            create_sql = show_create(upstream_cursor, table_name, printer=printer)
            show_do_query(downstream_cursor, create_sql, printer=printer)
        else:
            printer("It exists, moving on")
Exemplo n.º 29
0
    def group_sync(id_col,
                   check_col,
                   upstream,
                   downstream,
                   made_changes,
                   printer=Prindenter()):
        """Bring downstream groups in line with upstream by comparing fingerprints.

        Rows from both query results are indexed by ``row[id_col]``; for each
        group the ``check_col`` value seen upstream ('up') and downstream
        ('down') is recorded.  Groups are then classified:

          * only downstream          -> delete downstream
          * only upstream            -> pull from upstream
          * both, values differ      -> delete downstream and pull from upstream

        Upstream rows are dumped first, stale downstream rows are deleted, and
        the dump is loaded last.  ``cli_args``, ``table`` and ``db_pair`` come
        from the enclosing scope.

        Args:
            id_col: column that identifies a group, also used in the SQL
                ``in (...)`` conditions.
            check_col: column holding the per-group fingerprint.
            upstream: iterable of rows (indexable by column name) from upstream.
            downstream: iterable of rows (indexable by column name) from downstream.
            made_changes: incoming "something changed" flag.
            printer: callable used for progress output.

        Returns:
            True if anything was pulled or deleted, otherwise the incoming
            ``made_changes`` value.  (Rebinding the parameter alone is not
            visible to the caller, so the flag is returned explicitly.)

        Note:
            Ids are rendered with ``str()`` and are not quoted, so group keys
            are assumed to be numeric -- TODO confirm for non-integer keys.
        """

        group_fingerprints_by_id = {}

        def populate(name, query_result, key, store):
            # Record row[check_col] under store[row[key]][name] for every row.
            for row in query_result:
                try:
                    store.setdefault(row[key], {})
                    store[row[key]][name] = row[check_col]
                except KeyError:
                    # NOTE(review): debugging hook -- drops into an interactive
                    # shell when a row lacks the key or check column.  Kept to
                    # preserve behaviour; consider raising with context instead.
                    IPython.embed()

        populate('up', upstream, id_col, group_fingerprints_by_id)
        populate('down', downstream, id_col, group_fingerprints_by_id)

        to_delete = []
        to_write = []

        for group_id, stream in group_fingerprints_by_id.items():
            if 'up' not in stream:
                to_delete.append(group_id)
            elif 'down' not in stream:
                to_write.append(group_id)
            elif stream['up'] != stream['down']:
                to_delete.append(group_id)
                to_write.append(group_id)

        # Test the lists themselves, not any(...): any([0]) is False, which
        # would silently skip a lone group whose id is 0.
        if to_write:
            write_condition = f"{id_col} in ({','.join(map(str, to_write))})"
        else:
            write_condition = None

        if to_delete:
            delete_condition = f"{id_col} in ({','.join(map(str, to_delete))})"
        else:
            delete_condition = None

        if write_condition:
            printer(
                f"Found {len(to_write)} groups to pull down from upstream"
            )
            made_changes = True
            # dump before deleting so the upstream rows are in hand first
            mysqldump_data(cli_args.upstream,
                           table.name,
                           write_condition,
                           printer=printer)
        else:
            printer("Nothing to pull down from upstream")

        if delete_condition:
            printer("Making space downstream")
            made_changes = True
            with Indent(printer):
                with Connection(
                        db_pair.downstream.args) as downstream_connection:
                    with downstream_connection.cursor() as cursor:
                        show_do_query(
                            cursor,
                            f'delete from {table.name} where {delete_condition};',
                            printer=printer)
        else:
            printer("Downstream space is open for new data")

        if write_condition:
            # load from a file
            printer("Loading rows")
            mysqlload(cli_args.downstream, table.name, printer=printer)

        return made_changes
Exemplo n.º 30
0
def general(table,
            zoom_levels,
            db_pair,
            cli_args,
            id_col='id',
            batch_rows=Constants.batch_rows,
            condition=None,
            printer=Prindenter()):
    """Sync one table by recursively narrowing down which id ranges differ.

    The function is entered from outside with a table *name* and a *list* of
    magnifications, and then calls itself with the prepared table object and
    a zoom-level map until the map is fully populated:

      1. If ``table`` is a string, ``pre_general`` is run on it.  When that
         fails with ``sh.ErrorReturnCode_1`` mentioning "Column count doesn't
         match", the upstream schema is dumped, the downstream table is
         dropped and recreated from the dump, and ``pre_general`` is retried.
      2. If ``zoom_levels`` is a list and ``table.needs_work`` is set, it is
         converted to a ``SortedDict`` mapping each magnification to ``None``
         plus an outermost entry keyed by ``table.upstream.max_id`` that
         holds the single interval ``0..max_id``.  If no work is needed the
         function returns.
      3. The map is scanned from the largest granularity downwards for the
         first unpopulated level.  If there is one, ``Db.find_diffs`` fills
         it from the next-larger level's scopes and the function recurses.
         If there is none, the smallest level's scopes are turned into SQL
         conditions and transferred in batches (dump upstream, delete
         downstream, load downstream).

    Args:
        table: table name (first call) or the object returned by
            ``pre_general`` (recursive calls).
        zoom_levels: list of magnifications (first call) or the zoom-level
            map (recursive calls).
        db_pair: object exposing ``downstream.args``, ``upstream.connection``
            and ``reup_maxes``.
        cli_args: parsed arguments; ``upstream`` / ``downstream`` are passed
            to the dump and load helpers.
        id_col: id column name handed to ``pre_general``.
        batch_rows: batch size handed to ``pre_general``.
        condition: optional restriction passed to ``pre_general`` and
            ``Db.find_diffs``.
        printer: callable used for progress output.

    Returns:
        None.

    Raises:
        sh.ErrorReturnCode_1: re-raised when ``pre_general`` fails for any
            reason other than a column-count mismatch.
        ValueError: when the final scopes are neither ints at size <= 1 nor
            ``Ids.Interval`` objects at size > 1.
    """

    # prepare for recursion if not already in it

    if isinstance(table, str):
        printer("[Examining table: {}]".format(table))
        with Indent(printer):
            try:
                table = pre_general(table,
                                    db_pair,
                                    cli_args,
                                    id_col,
                                    batch_rows,
                                    condition=condition,
                                    printer=printer)
            except sh.ErrorReturnCode_1 as err:

                # handle schema mismatches with a sledgehammer
                # TODO: allow user to provide path to migration scripts,
                # run outstanding ones if they show up in migration_tracker
                if "Column count doesn't match" in str(err):

                    printer("Upstream schema differs, pulling it down")

                    with Indent(printer):
                        # get upstream schema
                        filename = 'newschema_{}.sql'.format(table)
                        mysqldump_schema_nofk(cli_args.upstream,
                                              filename,
                                              restrict_to_table=table,
                                              printer=printer)

                        # drop downstream table
                        drop = 'drop table {};'.format(table)
                        with Connection(db_pair.downstream.args
                                        ) as downstream_connection:
                            with downstream_connection.cursor(
                            ) as downstream_cursor:
                                show_do_query(downstream_cursor,
                                              drop,
                                              printer=printer)

                        # recreate downstream table
                        mysqlload(cli_args.downstream,
                                  filename,
                                  printer=printer)

                    # try again; batch_rows is a required positional argument
                    # of pre_general and must be passed here as well
                    printer("[New schema loaded, downstream table is empty]")
                    table = pre_general(table,
                                        db_pair,
                                        cli_args,
                                        id_col,
                                        batch_rows,
                                        condition=condition,
                                        printer=printer)
                else:
                    raise

    if isinstance(zoom_levels, list):
        # set up for recursion
        if table.needs_work:
            printer(
                "Sync: 'general' received magnification list instead of zoom_level map, building zoom_level map...",
                end='')
            with Indent(printer):
                # prepare the zoom-level map
                zoom_levels = SortedDict({x: None for x in zoom_levels})

                # append the outermost zoom level (completed in general)
                zoom_levels[table.upstream.max_id] = [
                    Ids.Interval(0, table.upstream.max_id)
                ]
        else:
            printer("Sync: 'general' finished early: presync was sufficient")
            return

        printer("done\n")

        # begin recursion
        printer("[Sync: 'general' top-level recursion]")
        with Indent(printer):
            return general(table,
                           zoom_levels,
                           db_pair,
                           cli_args,
                           id_col=id_col,
                           batch_rows=batch_rows,
                           condition=condition,
                           printer=printer)

    # if control gets this far, recursion has begun

    granularity = None
    scopes = None
    # examine the scope map by decreasing magnification
    # find the transition from unknowns to knowns
    for ((smaller_granularity, smaller_scope), (_larger_granularity, larger_scope)) \
            in reversed(list(zip(zoom_levels.items(), zoom_levels.items()[1:]))):

        if not smaller_scope:
            scopes = larger_scope  # we'll be filling these out
            granularity = smaller_granularity  # by breaking them into pieces this big
            break

    if not scopes:
        printer(
            "Zoom-level map fully populated, no more 'general' recursions will follow"
        )

        conditions = []

        # the smallest granularity and the scopes found at that level
        final_size = zoom_levels.keys()[0]
        final_scopes = sorted(zoom_levels.values()[0])

        if final_size <= 1 and isinstance(final_scopes[0], int):
            printer("Scanned down to individual rows")
            row_lists = Ids.partition(Constants.batch_fingerprints,
                                      final_scopes)

            for rows in row_lists:
                conditions.append("{} in ({})".format(
                    table.id_col, ",".join([str(x) for x in rows])))

        elif final_size > 1 and isinstance(final_scopes[0], Ids.Interval):
            printer("Scanned down to row-ranges of size {}".format(final_size))
            interval_lists = Ids.partition(Constants.batch_fingerprints,
                                           final_scopes)

            for intervals in interval_lists:
                conditions.append(" OR ".join([
                    "{} BETWEEN {} AND {}".format(table.id_col, i.start, i.end)
                    for i in intervals
                ]))

        else:
            raise ValueError(
                "Can't decide whether to transfer rows, or row-ranges")

        printer("[Transfer proceeding in {} batches]".format(len(conditions)))
        with Indent(printer):

            # named batch_condition so the `condition` parameter is not shadowed
            for batch_condition in conditions:

                # dump upstream data
                mysqldump_data(cli_args.upstream,
                               table.name,
                               batch_condition,
                               printer=printer)

                # clear old rows from downstream
                delete = 'delete from {} where {};'.format(
                    table.name, batch_condition)
                with Connection(
                        db_pair.downstream.args) as downstream_connection:
                    with downstream_connection.cursor() as cursor:
                        show_do_query(cursor, delete, printer=printer)

                # load new rows into downstream
                mysqlload(cli_args.downstream, table.name, printer=printer)

        # final verification: warn if still different, then sync the schema
        # (this time with throw=True)
        with Connection(db_pair.downstream.args) as downstream_connection:
            with downstream_connection.cursor() as downstream_cursor:
                with db_pair.upstream.connection.cursor() as upstream_cursor:
                    table.is_synced_warn(upstream_cursor,
                                         downstream_cursor,
                                         message='(after general sync)',
                                         printer=printer)
                    table.try_sync_schema(upstream_cursor,
                                          downstream_cursor,
                                          throw=True,
                                          printer=printer)

    # if we found a row with unpopulated scopes, then we have more scanning to do
    else:
        printer(
            "[Given {} larger-granules, making smaller granules of size {} and fingerprinting them]"
            .format(len(scopes), granularity))
        next_scopes = []
        with Indent(printer):
            with Connection(db_pair.downstream.args) as downstream_connection:
                with downstream_connection.cursor() as downstream_cursor:
                    with db_pair.upstream.connection.cursor(
                    ) as upstream_cursor:

                        # new sessions, reset group_concat (default is oddly low)
                        db_pair.reup_maxes(downstream_cursor,
                                           upstream_cursor,
                                           printer=printer)

                        # All scopes are handed to Db.find_diffs in one call
                        # rather than making a round trip for each one.
                        next_scopes += list(
                            Db.find_diffs(upstream_cursor,
                                          downstream_cursor,
                                          table,
                                          scopes,
                                          granularity,
                                          condition=condition,
                                          printer=printer))
                        printer(
                            ''
                        )  # Db.find_diffs ends without a newline... add one

        # if no ranges were found to contain diffs
        # (check the length rather than any(): any([0]) is False even though
        # the list is non-empty, and that would make us ignore row 0)
        if len(next_scopes) == 0:
            message = textwrap.dedent("""
            Found no ranges with diffs.  Nothing to do.
            If the tables were truly identical, TABLE CHECKSUM would have
            prevented sync from gettin this far.
            Perhaps some columns were ignored during the scan?
            (e.g. timestamps, as an ugly hack to avoid thinking about time zones)
            """)
            printer(message)
            printer.append_summary(
                "{} : IDENTICAL? (TABLE CHECKSUM failed but a custom MD5 scan found no diffs)"
                .format(table.name))

        # some ranges contain diffs: record them at this level and zoom in
        else:
            zoom_levels[granularity] = next_scopes
            printer("[Another 'general' recursion]")
            with Indent(printer):
                return general(table,
                               zoom_levels,
                               db_pair,
                               cli_args,
                               id_col=id_col,
                               batch_rows=batch_rows,
                               condition=condition,
                               printer=printer)