Example #1
def fetch_current_scn(conn_config):
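   # Look up the database's current system change number (SCN) from V$DATABASE.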
   connection = orc_db.open_connection(conn_config)
   cur = connection.cursor()
   current_scn = cur.execute("SELECT current_scn FROM V$DATABASE").fetchall()[0][0]
   cur.close()
   connection.close()
   return current_scn
Example #2
def sync_tables(conn_config, streams, state, end_scn, scn_window_size=None):
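   # Drive LogMiner replication over the bookmarked-to-end SCN range, optionally
   # splitting it into fixed-size windows so each LogMiner pass stays small.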
   connection = orc_db.open_connection(conn_config)
   if not verify_db_supplemental_log_level(connection):
      for stream in streams:
         if not verify_table_supplemental_log_level(stream, connection):
            raise Exception("""
      Unable to replicate with logminer for stream({}) because supplemental_log_data is not set to 'ALL' for either the table or the database.
      Please run: ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
            """.format(stream.tap_stream_id))

   cur = connection.cursor()
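   # Pin the session to UTC and ISO 8601-style NLS formats so DATE/TIMESTAMP
   # values serialize consistently.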
   cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
   cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")

   start_scn_window = min([get_bookmark(state, s.tap_stream_id, 'scn') for s in streams])

   while start_scn_window < end_scn:
      stop_scn_window = end_scn
      if scn_window_size:
         stop_scn_window = start_scn_window + scn_window_size
         if stop_scn_window > end_scn:
            stop_scn_window = end_scn

      state = sync_tables_logminer(cur, streams, state, start_scn_window, stop_scn_window)

      start_scn_window = stop_scn_window

   cur.close()
   connection.close()
   return state
Example #3
def sync_view(conn_config, stream, state, desired_columns):
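    # Views are synced with a full SELECT under a freshly picked table version;
    # there is no bookmark to resume from.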
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None

    #pick a new table version
    nascent_stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.tap_stream_id, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        select_sql = 'SELECT {} FROM {}.{}'.format(','.join(escaped_columns),
                                                   escaped_schema,
                                                   escaped_table)

        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, nascent_stream_version, desired_columns,
                time_extracted)
            singer.write_message(record_message)
            counter.increment()

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)
    cur.close()
    connection.close()
    return state
Example #4
def do_discovery(conn_config, filter_schemas):
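    # Discover tables and views in the filtered schemas, then emit the resulting
    # catalog (including column metadata) to stdout.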
    LOGGER.info("starting discovery")
    connection = orc_db.open_connection(conn_config)
    cur = connection.cursor()

    row_counts = produce_row_counts(connection, filter_schemas)
    table_info = {}

    binds_sql = [":{}".format(b) for b in range(len(filter_schemas))]
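    # One positional bind placeholder (:0, :1, ...) per schema filter.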

    sql = filter_schemas_sql_clause(
        """
   SELECT owner, table_name
   FROM all_tables
   WHERE owner != 'SYS'""", binds_sql)

    LOGGER.info("fetching tables: %s %s", sql, filter_schemas)
    for row in cur.execute(sql, filter_schemas):
        schema = row[0]
        table = row[1]

        if schema not in table_info:
            table_info[schema] = {}

        is_view = False
        table_info[schema][table] = {
            'row_count': row_counts[table],
            'is_view': is_view
        }

    sql = filter_schemas_sql_clause(
        """
   SELECT owner, view_name
   FROM sys.all_views
   WHERE owner != 'SYS'""", binds_sql)

    LOGGER.info("fetching views")
    for row in cur.execute(sql, filter_schemas):
        view_name = row[1]
        schema = row[0]
        if schema not in table_info:
            table_info[schema] = {}

        table_info[schema][view_name] = {'is_view': True}

    # If table_info is empty, raise a SYMON error: no table was found with the current input.
    if not table_info:
        cur.close()
        connection.close()
        # Raise a DatabaseError because only database error messages are interpreted during discovery
        raise cx_Oracle.DatabaseError(
            'SYM-00001: No Table Found in the Database')

    catalog = discover_columns(connection, table_info, filter_schemas)
    json.dump(catalog, sys.stdout, indent=2)
    cur.close()
    connection.close()
    return catalog
Example #5
def do_discovery(conn_config, filter_schemas):
   LOGGER.info("starting discovery")
   connection = orc_db.open_connection(conn_config)
   cur = connection.cursor()

   row_counts = produce_row_counts(connection, filter_schemas)
   table_info = {}

   binds_sql = [":{}".format(b) for b in range(len(filter_schemas))]


   sql = filter_schemas_sql_clause("""
   SELECT owner, table_name
   FROM all_tables
   WHERE owner != 'SYS'""", binds_sql)

   LOGGER.info("fetching tables: %s %s", sql, filter_schemas)
   for row in cur.execute(sql, filter_schemas):
      schema = row[0]
      table = row[1]

      if schema not in table_info:
         table_info[schema] = {}

      is_view = False
      table_info[schema][table] = {
         'row_count': row_counts[table],
         'is_view': is_view
      }


   sql = filter_schemas_sql_clause("""
   SELECT owner, view_name
   FROM sys.all_views
   WHERE owner != 'SYS'""", binds_sql)

   LOGGER.info("fetching views")
   for row in cur.execute(sql, filter_schemas):
      view_name = row[1]
      schema = row[0]
      if schema not in table_info:
         table_info[schema] = {}

      table_info[schema][view_name] = {
         'is_view': True
      }

   catalog = discover_columns(connection, table_info, filter_schemas)
   dump_catalog(catalog)
   cur.close()
   connection.close()
   return catalog
Example #6
def sync_table(conn_config, stream, state, desired_columns):
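    # Incremental replication: order rows by the replication key and resume from
    # the bookmarked replication_key_value when one exists.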
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream.tap_stream_id,
                                         'version')
    # If there was no bookmark for stream_version, it is the first time
    # this table is being sync'd, so get a new version, write to
    # state
    if stream_version is None:
        stream_version = int(time.time() * 1000)
        state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                      stream_version)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=stream_version)
    singer.write_message(activate_version_message)

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    replication_key = md.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id,
                                                'replication_key_value')
    replication_key_sql_datatype = md.get(
        ('properties', replication_key)).get('sql-datatype')

    with metrics.record_counter(None) as counter:
        if replication_key_value:
            LOGGER.info("Resuming Incremental replication from %s = %s",
                        replication_key, replication_key_value)
            casted_where_clause_arg = common.prepare_where_clause_arg(
                replication_key_value, replication_key_sql_datatype)

            select_sql = """SELECT {}
                                FROM {}.{}
                               WHERE {} >= {}
                               ORDER BY {} ASC
                                """.format(','.join(escaped_columns),
                                           escaped_schema, escaped_table,
                                           replication_key,
                                           casted_where_clause_arg,
                                           replication_key)
        else:
            select_sql = """SELECT {}
                                FROM {}.{}
                               ORDER BY {} ASC
                               """.format(','.join(escaped_columns),
                                          escaped_schema, escaped_table,
                                          replication_key)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, stream_version, desired_columns, time_extracted)

            singer.write_message(record_message)
            rows_saved = rows_saved + 1

            #Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great
            #even worse would be allowing the NULL value to enter into the state
            if record_message.record[replication_key] is not None:
                state = singer.write_bookmark(
                    state, stream.tap_stream_id, 'replication_key_value',
                    record_message.record[replication_key])

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    cur.close()
    connection.close()
    return state
Example #7
def sync_table(conn_config, stream, state, desired_columns):
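   # Resumable full-table replication: ORA_ROWSCN is bookmarked as rows stream
   # out, so an interrupted sync can resume instead of starting over.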
   connection = orc_db.open_connection(conn_config)
   connection.outputtypehandler = common.OutputTypeHandler

   cur = connection.cursor()
   cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
   cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
   time_extracted = utils.now()

   #before writing the table version to state, check if we had one to begin with
   first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

   #pick a new table version IFF we do not have an ORA_ROWSCN in our state
   #the presence of an ORA_ROWSCN indicates that we were interrupted last time through
   if singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN') is None:
      nascent_stream_version = int(time.time() * 1000)
   else:
      nascent_stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')

   state = singer.write_bookmark(state,
                                 stream.tap_stream_id,
                                 'version',
                                 nascent_stream_version)
   singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

   md = metadata.to_map(stream.metadata)
   schema_name = md.get(()).get('schema-name')

   escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
   escaped_schema  = schema_name
   escaped_table   = stream.table
   activate_version_message = singer.ActivateVersionMessage(
      stream=stream.stream,
      version=nascent_stream_version)

   if first_run:
      singer.write_message(activate_version_message)

   with metrics.record_counter(None) as counter:
      ora_rowscn = singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN')
      if ora_rowscn:
         LOGGER.info("Resuming Full Table replication %s from ORA_ROWSCN %s", nascent_stream_version, ora_rowscn)
         select_sql      = """SELECT {}, ORA_ROWSCN
                                FROM {}.{}
                               WHERE ORA_ROWSCN >= {}
                               ORDER BY ORA_ROWSCN ASC
                                """.format(','.join(escaped_columns),
                                           escaped_schema,
                                           escaped_table,
                                           ora_rowscn)
      else:
         select_sql      = """SELECT {}, ORA_ROWSCN
                                FROM {}.{}
                               ORDER BY ORA_ROWSCN ASC""".format(','.join(escaped_columns),
                                                                    escaped_schema,
                                                                    escaped_table)

      rows_saved = 0
      LOGGER.info("select %s", select_sql)
      for row in cur.execute(select_sql):
         ora_rowscn = row[-1]
         row = row[:-1]
         record_message = common.row_to_singer_message(stream,
                                                       row,
                                                       nascent_stream_version,
                                                       desired_columns,
                                                       time_extracted)

         singer.write_message(record_message)
         state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', ora_rowscn)
         rows_saved = rows_saved + 1
         if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

         counter.increment()


   state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', None)
   #always send the activate version whether first run or subsequent
   singer.write_message(activate_version_message)
   cur.close()
   connection.close()
   return state
Example #8
def sync_tables(conn_config, streams, state, end_scn):
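   # LogMiner-based change data capture: mine INSERT/UPDATE/DELETE activity for
   # each stream over the given SCN range and emit it as Singer messages.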
   connection = orc_db.open_connection(conn_config)
   if not verify_db_supplemental_log_level(connection):
      for stream in streams:
         if not verify_table_supplemental_log_level(stream, connection):
            raise Exception("""
      Unable to replicate with logminer for stream({}) because supplemental_log_data is not set to 'ALL' for either the table or the database.
      Please run: ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
            """.format(stream.tap_stream_id))



   cur = connection.cursor()
   cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
   cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")

   start_scn = min([get_bookmark(state, s.tap_stream_id, 'scn') for s in streams])
   time_extracted = utils.now()

   start_logmnr_sql = """BEGIN
                         DBMS_LOGMNR.START_LOGMNR(
                                 startScn => {},
                                 endScn => {},
                                 OPTIONS => DBMS_LOGMNR.DICT_FROM_ONLINE_CATALOG +
                                            DBMS_LOGMNR.COMMITTED_DATA_ONLY +
                                            DBMS_LOGMNR.CONTINUOUS_MINE);
                         END;""".format(start_scn, end_scn)

   LOGGER.info("Starting LogMiner for %s: %s -> %s", list(map(lambda s: s.tap_stream_id, streams)), start_scn, end_scn)
   LOGGER.info("%s",start_logmnr_sql)
   cur.execute(start_logmnr_sql)

   #mine changes
   for stream in streams:
      md_map = metadata.to_map(stream.metadata)
      desired_columns = [c for c in stream.schema.properties.keys() if common.should_sync_column(md_map, c)]
      redo_value_sql_clause = ",\n ".join(["""DBMS_LOGMNR.MINE_VALUE(REDO_VALUE, :{})""".format(idx+1)
                                           for idx,c in enumerate(desired_columns)])
      undo_value_sql_clause = ",\n ".join(["""DBMS_LOGMNR.MINE_VALUE(UNDO_VALUE, :{})""".format(idx+1)
                                           for idx,c in enumerate(desired_columns)])

      schema_name = md_map.get(()).get('schema-name')
      stream_version = get_stream_version(stream.tap_stream_id, state)
      mine_sql = """
      SELECT OPERATION, SQL_REDO, SCN, CSCN, COMMIT_TIMESTAMP,  {}, {} from v$logmnr_contents where table_name = :table_name AND operation in ('INSERT', 'UPDATE', 'DELETE')
      """.format(redo_value_sql_clause, undo_value_sql_clause)
      binds = [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
              [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
              [stream.table]


      rows_saved = 0
      columns_for_record = desired_columns + ['scn', '_sdc_deleted_at']
      with metrics.record_counter(None) as counter:
         LOGGER.info("Examing log for table %s", stream.tap_stream_id)
         common.send_schema_message(stream, ['scn'])
         for op, redo, scn, cscn, commit_ts, *col_vals in cur.execute(mine_sql, binds):
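            # MINE_VALUE results arrive redo columns first, then undo columns.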
            redo_vals = col_vals[0:len(desired_columns)]
            undo_vals = col_vals[len(desired_columns):]
            if op == 'INSERT' or op == 'UPDATE':
               redo_vals += [cscn, None]
               record_message = row_to_singer_message(stream, redo_vals, stream_version, columns_for_record, time_extracted)
            elif op == 'DELETE':
               undo_vals += [cscn, singer.utils.strftime(commit_ts.replace(tzinfo=pytz.UTC))]
               record_message = row_to_singer_message(stream, undo_vals, stream_version, columns_for_record, time_extracted)
            else:
               raise Exception("unrecognized logminer operation: {}".format(op))

            singer.write_message(record_message)
            rows_saved = rows_saved + 1
            counter.increment()
            state = singer.write_bookmark(state,
                                          stream.tap_stream_id,
                                          'scn',
                                          int(cscn))


            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
               singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

   for s in streams:
      LOGGER.info("updating bookmark for stream %s to end_lsn %s", s.tap_stream_id, end_scn)
      state = singer.write_bookmark(state, s.tap_stream_id, 'scn', end_scn)
      singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

   cur.close()
   connection.close()
   return state