Example #1
    def listen(self):
        _logger.info("Start to listen to the binlog of %s" % self.database)

        section = 'mysql:' + self.database
        mysqlSetting = {
            "host": config().get(section, "host"),
            "port": int(config().get(section, 'port')),
            "user": config().get(section, "user"),
            "password": config().get(section, "password"),
        }

        watchedDatabases = [self.database]

        # load last binlog reader position
        logFile, logPos, resumeStream = self._loadLastBinlogPos()

        self._stream = BinLogStreamReader(
            connection_settings=mysqlSetting,
            server_id=int(config().get(section, "slaveid")),
            only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
            blocking=True,
            resume_stream=resumeStream,
            log_file=logFile,
            log_pos=logPos,
        )

        while True:
            refresh = False
            try:
                for binlogEvent in self._stream:
                    refresh = True
                    logFile, logPos = self._stream.log_file, self._stream.log_pos

                    # skip databases that are not being watched
                    if binlogEvent.schema not in watchedDatabases:
                        self._writeBinlogPos(logFile, logPos)
                        continue

                    binlog = {}
                    binlog['storage'] = 'mysql'
                    binlog['database'] = '%s' % binlogEvent.schema
                    binlog['table'] = '%s' % binlogEvent.table
                    binlog['timestamp'] = datetime.fromtimestamp(
                        binlogEvent.timestamp).strftime('%Y-%m-%d %H:%M:%S')

                    for row in binlogEvent.rows:
                        if isinstance(binlogEvent, DeleteRowsEvent):
                            binlog['values'] = row['values']
                            binlog['type'] = 'DELETE'
                        elif isinstance(binlogEvent, UpdateRowsEvent):
                            binlog['before'] = row['before_values']
                            binlog['values'] = row['after_values']
                            binlog['type'] = 'UPDATE'
                        elif isinstance(binlogEvent, WriteRowsEvent):
                            binlog['values'] = row['values']
                            binlog['type'] = 'INSERT'

                        binlogRow = json.dumps(binlog,
                                               default=timeutil.dateHandler)
                        self._pushToKafka(binlogRow, binlog['database'],
                                          binlog['table'])

                    # after pushing binlog to kafka, update the binlog position
                    self._writeBinlogPos(logFile, logPos)

                if not refresh:
                    _logger.info(
                        "NO new input binlog, current position: [%s:%d]",
                        logFile if logFile is not None else "",
                        logPos if logPos is not None else 0)
                    time.sleep(0.1)
            except Exception as e:
                print(e)
                sys.exit(1)
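
The position helpers referenced above (`_loadLastBinlogPos`, `_writeBinlogPos`) are not part of the snippet. A minimal file-based sketch of equivalent standalone helpers, with a hypothetical file name and layout, could look like this:

# Hypothetical position helpers for Example #1 (file name and JSON layout are assumptions).
import json
import os

POS_FILE = "binlog_position.json"

def load_last_binlog_pos():
    # Returns (log_file, log_pos, resume_stream); resume only when a position was saved before.
    if os.path.exists(POS_FILE):
        with open(POS_FILE) as fh:
            saved = json.load(fh)
        return saved.get("log_file"), saved.get("log_pos"), True
    return None, None, False

def write_binlog_pos(log_file, log_pos):
    # Persist the last processed position so the next run can resume from it.
    with open(POS_FILE, "w") as fh:
        json.dump({"log_file": log_file, "log_pos": log_pos}, fh)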
Example #2
    def run_by_rows(self):
        try:
            server_id = 6666666 + int(self.thread_id)
            stream = BinLogStreamReader(connection_settings=self.mysql_setting,
                                        server_id=server_id,
                                        only_events=[
                                            DeleteRowsEvent, WriteRowsEvent,
                                            UpdateRowsEvent, QueryEvent
                                        ],
                                        resume_stream=True,
                                        blocking=False,
                                        log_file=f'{self.binlog_file}',
                                        log_pos=self.start_pos,
                                        only_schemas=f'{self.only_schemas}',
                                        only_tables=f'{self.only_tables}')
            rows = []
            thread_id = query = None
            for binlogevent in stream:
                log_pos = binlogevent.packet.log_pos
                if log_pos >= self.end_pos:
                    # exit when the current binlog event position reaches the end position
                    stream.close()
                    break
                else:
                    if isinstance(binlogevent, QueryEvent):
                        thread_id = binlogevent.slave_proxy_id
                        query = binlogevent.query

                    if not isinstance(binlogevent, QueryEvent):
                        if self.thread_id == thread_id and query == 'BEGIN':
                            for row in binlogevent.rows:
                                columns = [{
                                    'column': x.name,
                                    'type': x.type
                                } for x in binlogevent.columns]
                                binlog = {
                                    'database': binlogevent.schema,
                                    'table': binlogevent.table,
                                    'primary_key': binlogevent.primary_key,
                                    'columns': columns
                                }
                                if isinstance(binlogevent, DeleteRowsEvent):
                                    binlog['values'] = row["values"]
                                    binlog['type'] = 'DELETE'
                                    rows.append(binlog)
                                if isinstance(binlogevent, UpdateRowsEvent):
                                    binlog["before"] = row["before_values"]
                                    binlog["after"] = row["after_values"]
                                    binlog['type'] = 'UPDATE'
                                    rows.append(binlog)
                                if isinstance(binlogevent, WriteRowsEvent):
                                    binlog['values'] = row["values"]
                                    binlog['type'] = 'INSERT'
                                    rows.append(binlog)

            stream.close()
            result = {
                'status': 'success',
                'data': self._generate_rollback_sql(rows)
            }
        except Exception as err:
            # print("Exception in user code:")
            # print('-' * 60)
            # traceback.print_exc(file=sys.stdout)
            # print('-' * 60)
            print(err)
            result = {'status': 'fail', 'msg': str(err)}
        return result
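
`_generate_rollback_sql` is referenced above but not shown. Assuming it simply inverts each captured row event (INSERT becomes DELETE, DELETE becomes INSERT, UPDATE swaps before/after values), a rough sketch could look like the following; proper quoting and NULL handling are omitted here:

    # Hypothetical sketch of _generate_rollback_sql for Example #2; %r is used for brevity,
    # real code should use parameterized queries and handle NULL/bytes properly.
    def _generate_rollback_sql(self, rows):
        sql_list = []
        for binlog in reversed(rows):  # roll back in reverse order
            table = '`%s`.`%s`' % (binlog['database'], binlog['table'])
            if binlog['type'] == 'INSERT':
                where = ' AND '.join('`%s`=%r' % (k, v) for k, v in binlog['values'].items())
                sql_list.append('DELETE FROM %s WHERE %s LIMIT 1;' % (table, where))
            elif binlog['type'] == 'DELETE':
                cols = ', '.join('`%s`' % k for k in binlog['values'])
                vals = ', '.join('%r' % v for v in binlog['values'].values())
                sql_list.append('INSERT INTO %s (%s) VALUES (%s);' % (table, cols, vals))
            elif binlog['type'] == 'UPDATE':
                sets = ', '.join('`%s`=%r' % (k, v) for k, v in binlog['before'].items())
                where = ' AND '.join('`%s`=%r' % (k, v) for k, v in binlog['after'].items())
                sql_list.append('UPDATE %s SET %s WHERE %s LIMIT 1;' % (table, sets, where))
        return sql_list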
Example #3
    def get_data_from_binlog_to_ch(self):
        logger.info(
            """'only_schemas': %s,'only_tables': %s,'ignored_schemas': %s,'ignored_tables': %s """
            % (self.only_schemas, self.only_tables, self.ignored_schemas,
               self.ignored_tables))
        if cnf['mysql_server']['gtid_mode'] == 1:
            stream = BinLogStreamReader(
                connection_settings=MYSQL_DB_INFO,
                server_id=cnf['mysql_server']['server_id'],
                blocking=True,
                only_events=[
                    DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent, GtidEvent
                ],
                auto_position=get_info_ins.get_executed_gtid_set(
                    name=self.name),
                only_schemas=self.only_schemas,
                only_tables=self.only_tables,
                ignored_schemas=self.ignored_schemas,
                ignored_tables=self.ignored_tables,
                slave_heartbeat=10,
                fail_on_table_metadata_unavailable=True,
                freeze_schema=True,
                resume_stream=True)
        else:
            stream = BinLogStreamReader(
                connection_settings=MYSQL_DB_INFO,
                server_id=cnf['mysql_server']['server_id'],
                blocking=True,
                only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
                log_file=get_info_ins.get_log_file(name=self.name),
                log_pos=int(get_info_ins.get_log_pos(name=self.name)),
                only_schemas=self.only_schemas,
                only_tables=self.only_tables,
                ignored_schemas=self.ignored_schemas,
                ignored_tables=self.ignored_tables,
                slave_heartbeat=10,
                fail_on_table_metadata_unavailable=True,
                freeze_schema=True,
                resume_stream=True)

        for binlogevent in stream:
            if binlogevent.event_type == GTID_LOG_EVENT:
                self.last_gtid = binlogevent.gtid
            else:
                self.log_file, self.log_pos = stream.log_file, stream.log_pos
                try:
                    self.binlog_from_event_handler(binlogevent)
                except Exception as e:
                    messages = '%s : unknown error: %s' % (
                        'binlog_from_event_handler', str(e))
                    logger.error(messages)
                    alarm(alarm_cnf=alarm_cnf,
                          title=self.alarm_title,
                          messages=messages)
                    exit(3)

            if self.sequence > int(
                    cnf['bulk_insert_control']['rows_to_target']) or int(
                        time.time()) - self.start_time > int(
                            cnf['bulk_insert_control']['interval']):
                messges = """self.sequence> %s or time > %s,do  delete data,insert data,write redis""" % (
                    int(cnf['bulk_insert_control']['rows_to_target']),
                    int(cnf['bulk_insert_control']['interval']))
                logger.info(messges)
                messges = 'begin delete data from clickhouse'
                logger.info(messges)
                try:
                    self.delete_data_in_ch()
                except Exception as e:
                    messages = "delete data from clickhouse failed: %s" % str(
                        e, )
                    logger.error(messages)
                    alarm(alarm_cnf=alarm_cnf,
                          title=self.alarm_title,
                          messages=messages)
                    exit(3)

                messages = 'begin insert data to clickhouse'
                logger.info(messages)
                try:
                    self.insert_data_to_ch()
                except Exception as e:
                    messages = "delete data from clickhouse failed: %s" % str(
                        e, )
                    logger.error(messages)
                    alarm(alarm_cnf=alarm_cnf,
                          title=self.alarm_title,
                          messages=messages)
                    exit(3)

                # generate the program's current state and write it to redis for later monitoring and high availability
                messages = 'begin writing information data to redis'
                logger.info(messages)
                try:
                    self.write_info_redis()
                except Exception as e:
                    messages = "write data to redis failed: %s" % str(e, )
                    logger.error(messages)
                    alarm(alarm_cnf=alarm_cnf,
                          title=self.alarm_title,
                          messages=messages)
                    exit(3)
Example #4
def handle_binlog_stream(config):
    cache = Cache(config.SLAVE_UUID)

    # this operation closes the old binlog connection
    stream_binlog = BinLogStreamReader(
        connection_settings=config.BINLOG_CONNECTION,
        server_id=config.SERVER_ID,
        blocking=False,
        resume_stream=True,
        slave_uuid=config.SLAVE_UUID
    )
    stream_binlog.fetchone()

    only_schemas = set()
    only_tables = set()
    event2jobs = defaultdict(list)
    for task in config.TASKS:
        only_schemas.add(task["stream"]["database"])
        only_tables.add(task["stream"]["table"])

        for job in task["jobs"]:
            for action in job["actions"]:
                event = "{host}_{schema}_{table}_{action}".format(host=config.BINLOG_CONNECTION["host"],
                                                                  schema=task["stream"]["database"], table=task["stream"]["table"], action=action)
                event2jobs[event].append(job)

    stream_binlog = BinLogStreamReader(
        connection_settings=config.BINLOG_CONNECTION,
        server_id=config.SERVER_ID,
        blocking=True,
        only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent, RotateEvent], only_schemas=only_schemas,
        only_tables=only_tables,
        freeze_schema=True,
        log_file=cache.get_log_file(),
        log_pos=cache.get_log_pos(),
        resume_stream=True,
        slave_uuid=config.SLAVE_UUID
    )

    for binlogevent in stream_binlog:
        if isinstance(binlogevent, RotateEvent):
            cache.set_log_file(binlogevent.next_binlog)
            cache.set_log_pos(binlogevent.position)
        else:
            print(binlogevent.packet.log_pos)
            for row in binlogevent.rows:
                event = {"host": binlogevent._ctl_connection.host, "schema": binlogevent.schema,
                         "table": binlogevent.table,
                         "timestamp": datetime.datetime.fromtimestamp(binlogevent.timestamp).strftime('%Y-%m-%d %H:%M:%S')}
                # assemble the event
                if isinstance(binlogevent, DeleteRowsEvent):
                    event["action"] = "delete"
                    event["values"] = dict(row["values"].items())
                elif isinstance(binlogevent, UpdateRowsEvent):
                    event["action"] = "update"
                    event["before_values"] = dict(row["before_values"].items())
                    event["values"] = dict(row["after_values"].items())
                elif isinstance(binlogevent, WriteRowsEvent):
                    event["action"] = "insert"
                    event["values"] = dict(row["values"].items())

                event_type = "{host}_{schema}_{table}_{action}".format(host=event["host"], schema=event["schema"],
                                                                       table=event["table"], action=event["action"])
                jobs = event2jobs[event_type]
                for job in jobs:
                    if event["action"] in job["actions"]:
                        pipeline = job["pipeline"]
                        rows = do_pipeline(pipeline, event["values"])
                        dest = job["dest"]
                        to_dest(dest, rows)

                cache.set_log_pos(binlogevent.packet.log_pos)
                logging.info(json.dumps(event, cls=DateEncoder))
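
The DateEncoder used with json.dumps above is not included in the snippet; a common shape for it (an assumption, not necessarily the original class) is a JSONEncoder subclass that stringifies datetime and Decimal values:

# Sketch of a DateEncoder-style helper (assumed implementation, also referenced in Example #17).
import datetime
import decimal
import json

class DateEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (datetime.datetime, datetime.date)):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        if isinstance(obj, decimal.Decimal):
            return float(obj)
        return super(DateEncoder, self).default(obj)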
Example #5
    def generate_sql(self):
        outfile = self.parser['outfile']
        database = self.parser['database']
        table = self.parser['table']
        conn_setting = {
            'host': self.parser['host'],
            'port': int(self.parser['port']),
            'user': self.parser['user'],
            'passwd': self.parser['password'],
            'charset': 'utf8'
        }

        log_file = self.parser['binlog']
        start_position = self.parser['start_position'] or 4

        dict_ = {'log_file': log_file, 'log_pos': int(start_position)}
        f = file_position()
        f.set(dict_)

        fpos = f.get()
        res_file = fpos['log_file']
        res_pos = fpos['log_pos']
        print('cache_res:', res_file, res_pos)

        stream = BinLogStreamReader(
            connection_settings=conn_setting,
            only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
            only_schemas=database,
            only_tables=table,
            log_file=res_file,
            log_pos=res_pos,
            server_id=30,
            blocking=True,
            resume_stream=True)

        for binlogevent in stream:
            if isinstance(binlogevent, WriteRowsEvent):
                for row in binlogevent.rows:
                    next_binlog = stream.log_file
                    postion = stream.log_pos
                    log_timestamp = datetime.datetime.fromtimestamp(
                        binlogevent.timestamp)
                    start = f.get()['log_pos']
                    log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                        next_binlog, str(start), str(postion), log_timestamp)

                    template = 'INSERT INTO `{0}`.`{1}`({2}) VALUES ({3});' \
                        .format(binlogevent.schema, binlogevent.table,
                                ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())),
                                ', '.join(['%s'] * len(row['values'])))
                    values = map(fix_object, row['values'].values())
                    self.db_save(template, values, log_content, outfile)

                    dict_ = {'log_file': next_binlog, 'log_pos': postion}
                    f = file_position()
                    f.set(dict_)
                    print('set:  ', str(dict_))

            elif isinstance(binlogevent, DeleteRowsEvent):
                if binlogevent.primary_key:
                    for row in binlogevent.rows:
                        next_binlog = stream.log_file
                        postion = stream.log_pos
                        log_timestamp = datetime.datetime.fromtimestamp(
                            binlogevent.timestamp)
                        start = f.get()['log_pos']
                        log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                            next_binlog, str(start), str(postion),
                            log_timestamp)

                        prikey = binlogevent.primary_key
                        beoreprikey_items = {
                            k: v
                            for k, v in row['values'].items() if k in prikey
                        }.items()
                        beoreprikey_values = [v for k, v in beoreprikey_items]

                        template = 'DELETE FROM `{0}`.`{1}` WHERE {2} LIMIT 1;'.format(
                            binlogevent.schema, binlogevent.table,
                            ' AND '.join(map(compare_items,
                                             beoreprikey_items)))
                        values = map(fix_object, beoreprikey_values)
                        self.db_save(template, values, log_content, outfile)

                        dict_ = {'log_file': next_binlog, 'log_pos': postion}
                        f = file_position()
                        f.set(dict_)
                        print('set:  ', str(dict_))

                else:
                    next_binlog = stream.log_file
                    postion = stream.log_pos
                    log_timestamp = datetime.datetime.fromtimestamp(
                        binlogevent.timestamp)
                    start = f.get()['log_pos']
                    log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                        next_binlog, str(start), str(postion), log_timestamp)

                    for row in binlogevent.rows:
                        print('del_nopri_row:    ', binlogevent.schema,
                              binlogevent.table, row, log_content)

                    dict_ = {'log_file': next_binlog, 'log_pos': postion}
                    f = file_position()
                    f.set(dict_)
                    print('set:  ', str(dict_))
                    print('DELETE: the table has no primary key or contains JSON columns and cannot be handled, exiting')
                    sys.exit()

            elif isinstance(binlogevent, UpdateRowsEvent):
                if binlogevent.primary_key:
                    for row in binlogevent.rows:
                        prikey = binlogevent.primary_key
                        beoreprikey_items = {
                            k: v
                            for k, v in row['before_values'].items()
                            if k in prikey
                        }.items()
                        beoreprikey_values = [v for k, v in beoreprikey_items]

                        next_binlog = stream.log_file
                        postion = stream.log_pos
                        log_timestamp = datetime.datetime.fromtimestamp(
                            binlogevent.timestamp)
                        start = f.get()['log_pos']
                        log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                            next_binlog, str(start), str(postion),
                            log_timestamp)

                        template = 'UPDATE `{0}`.`{1}` SET {2} WHERE {3} LIMIT 1;'.format(
                            binlogevent.schema, binlogevent.table, ', '.join([
                                '`%s`=%%s' % k
                                for k in row['after_values'].keys()
                            ]),
                            ' AND '.join(map(compare_items,
                                             beoreprikey_items)))
                        values = map(
                            fix_object,
                            list(row['after_values'].values()) +
                            list(beoreprikey_values))
                        self.db_save(template, values, log_content, outfile)

                        dict_ = {'log_file': next_binlog, 'log_pos': postion}
                        f = file_position()
                        f.set(dict_)
                        print('set:  ', str(dict_))

                else:
                    next_binlog = stream.log_file
                    postion = stream.log_pos
                    log_timestamp = datetime.datetime.fromtimestamp(
                        binlogevent.timestamp)
                    start = f.get()['log_pos']
                    log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                        next_binlog, str(start), str(postion), log_timestamp)

                    for row in binlogevent.rows:
                        print('update_nopri_row:    ', binlogevent.schema,
                              binlogevent.table, row, log_content)

                    dict_ = {'log_file': next_binlog, 'log_pos': postion}
                    f = file_position()
                    f.set(dict_)
                    print('set:  ', str(dict_))
                    print('UPDATE: the table has no primary key or contains JSON columns and cannot be handled, exiting')
                    sys.exit()
Example #6
    def process_binlog(self):
        stream = BinLogStreamReader(
            connection_settings=self.connectionSettings,
            server_id=self.serverId,
            log_file=self.startFile,
            log_pos=self.startPos,
            only_schemas=self.only_schemas,
            only_tables=self.only_tables,
            resume_stream=True)

        cur = self.connection.cursor()
        tmpFile = create_unique_file(
            '%s.%s' %
            (self.connectionSettings['host'], self.connectionSettings['port'])
        )  # to simplify code, we do not use file lock for tmpFile.
        ftmp = open(tmpFile, "w")
        flagLastEvent = False
        eStartPos, lastPos = stream.log_pos, stream.log_pos
        try:
            count = 0

            for binlogevent in stream:
                print(stream.log_file)
                print(datetime.datetime.fromtimestamp(
                    binlogevent.timestamp).strftime('%Y-%m-%d %H:%M:%S'))

                if count >= self.countnum:
                    break
                if not self.stopnever:
                    # if (stream.log_file == self.endFile and stream.log_pos == self.endPos) or (stream.log_file == self.eofFile and stream.log_pos == self.eofPos):
                    if (stream.log_file == self.endFile
                            and stream.log_file != self.startFile) or (
                                stream.log_file == self.eofFile
                                and stream.log_pos == self.eofPos):
                        flagLastEvent = True

                    elif datetime.datetime.fromtimestamp(
                            binlogevent.timestamp) < self.startTime:
                        if not (isinstance(binlogevent, RotateEvent)
                                or isinstance(binlogevent,
                                              FormatDescriptionEvent)):
                            lastPos = binlogevent.packet.log_pos
                        continue
                    elif (stream.log_file not in self.binlogList) or (
                            self.endPos and stream.log_file == self.endFile
                            and stream.log_pos > self.endPos
                    ) or (stream.log_file == self.eofFile and stream.log_pos >
                          self.eofPos) or (datetime.datetime.fromtimestamp(
                              binlogevent.timestamp) >= self.stopTime):
                        break
                    # else:
                    #     raise ValueError('unknown binlog file or position')

                if isinstance(binlogevent,
                              QueryEvent) and binlogevent.query == 'BEGIN':
                    eStartPos = lastPos

                if isinstance(binlogevent, QueryEvent):
                    sql = concat_sql_from_binlogevent(cursor=cur,
                                                      binlogevent=binlogevent,
                                                      flashback=self.flashback,
                                                      nopk=self.nopk)
                    if sql:
                        count = count + 1
                        self.sqllist.append(sql)
                        # pass
                        # print sql
                elif isinstance(binlogevent, WriteRowsEvent) or isinstance(
                        binlogevent, UpdateRowsEvent) or isinstance(
                            binlogevent, DeleteRowsEvent):
                    for row in binlogevent.rows:
                        sql = concat_sql_from_binlogevent(
                            cursor=cur,
                            binlogevent=binlogevent,
                            row=row,
                            flashback=self.flashback,
                            nopk=self.nopk,
                            eStartPos=eStartPos)
                        if self.flashback:
                            ftmp.write(sql + '\n')
                        else:
                            # print sql
                            self.sqllist.append(sql)
                        count = count + 1

                if not (isinstance(binlogevent, RotateEvent)
                        or isinstance(binlogevent, FormatDescriptionEvent)):
                    lastPos = binlogevent.packet.log_pos
                if flagLastEvent:
                    break
            ftmp.close()
            if self.flashback:
                with open(tmpFile) as ftmp:
                    for line in reversed_lines(ftmp):
                        self.sqllist.append(line.rstrip())
        finally:
            os.remove(tmpFile)
        cur.close()
        stream.close()
        return True
Example #7
def main():

# DynamoDB ----------------------------

#     Keyvalue = ""
#     Keyvalue_2 = ""

#     response = table.put_item(
#         Item={
#             'Keyname': Keyname,
#             'Keyvalue': Keyvalue,
#         }
#     )
#     response = table.put_item(
#         Item={
#             'Keyname': Keyname_2,
#             'Keyvalue': Keyvalue_2,
#         }
#     )

#  ------------------------------------------
    parser.add_argument("--log_pos", "-p", help="enter the starting position")
    parser.add_argument("--log_file", "-f", help="enter the log file")
    args = parser.parse_args()
    if args.log_pos:
        args.log_pos = int(args.log_pos)

    # Get Item For DynamoDB ------------------------------
    # args.log_pos = getItem(Keyname)
    # args.log_file = getItem(Keyname_2)
    #-----------------------------------------------------

    conn = pymysql.connect(**MYSQL_SETTINGS)  
    cursor = conn.cursor()
    stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                server_id=3,
                                blocking=True,
                                only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent, QueryEvent,RotateEvent],
                                log_file=args.log_file,
                                log_pos=args.log_pos,
                                resume_stream=True
                                )
    next_binlog = ''
    delimit = ';'
    TopicArn = 'arn:aws:sns:ap-southeast-1:XXXXXXXXXXXX:Openhack-Data-Stiching'
    for binlogevent in stream:
        e_start_pos, last_pos = stream.log_pos, stream.log_pos
        if type(binlogevent).__name__ == 'RotateEvent':
            next_binlog = binlogevent.next_binlog
        else:
            result = concat_sql_from_binlog_event(cursor=cursor, binlog_event=binlogevent, row=None, e_start_pos=e_start_pos)
            result['next_binlog'] = next_binlog
            if 'Query' in result:
                result['Query'] = result['Query'].partition(delimit)[2]
            for k, v in result.items():
                if k == 'Query' and "rds_heartbeat2" not in v:
                    if v:
                        client.publish(
                            TopicArn=TopicArn,
                            Message=v
                        )
                        # Update Item For DynamoDB ------------------------------
                        # updateItem(result['position'], result['next_binlog'])
                        #------------------------------------------------------

            print(json.dumps(result))


    stream.close()
Example #8
server_id = 6
blocking = True
only_schemas = ['statistics']
only_tables = ['StarUser1']
skip_to_timestamp = 1562501553
resume_stream = True
log_file = 'master.000001'
log_pos = 105709374

reader = BinLogStreamReader(
    connection_settings=MYSQL_SETTINGS,
    server_id=server_id,
    only_events=only_events,
    blocking=blocking,
    only_schemas=only_schemas,
    only_tables=only_tables,
    # skip_to_timestamp=skip_to_timestamp,
    # resume_stream=resume_stream,
    # log_file=log_file,
    # log_pos=log_pos,
)

for binlog_event in reader:
    if isinstance(binlog_event, RotateEvent):
        print(f'next_binlog={binlog_event.next_binlog}')
        print(f'position={binlog_event.position}')
    else:
        print(binlog_event.dump())
        print(binlog_event.timestamp)
        print(binlog_event.packet.log_pos)
Example #9
}

stream = BinLogStreamReader(
    connection_settings=mysql_settings,
    # only_events=[  # only capture the listed events
    #     DeleteRowsEvent,
    #     WriteRowsEvent,
    #     UpdateRowsEvent
    # ],
    # ignored_events=[],  # ignore the listed events
    # only_tables=['e_tester'],  # only capture the listed tables
    # ignored_tables=[],  # ignore the listed tables
    # only_schemas=[],  # only capture the listed databases
    # ignored_schemas=[],  # ignore the listed databases
    blocking=False,
    server_id=2,
    resume_stream=True,  # must be True for the log_file/log_pos position below to take effect
    log_file='binlog.000099',  # binlog file name; you have to track it yourself. RotateEvent reports it, remember to persist it.
    log_pos=4,  # position; you have to track it yourself. Every file starts at 4, later positions advance with the event sizes.
    # a GTID is made up of source_id:transaction_id
    # source_id is the UUID of the MySQL instance that originated the transaction
    # transaction_id is the transaction sequence number, increasing from 1
    # several transaction_id ranges can be separated by colons, a dash marks a contiguous range
    # c8671405-081c-11e9-a407-ec0d9a495964:1-5:11-18
    # auto_position automatically skips the given GTID set.
    # auto_position='c8671405-081c-11e9-a407-ec0d9a495964:3472692',

    # skip_to_timestamp=time.mktime(time.strptime('2020-10-01 16:17:18', '%Y-%m-%d %H:%M:%S')),  # skip events before this time
)

for binlogevent in stream:
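    # The loop body is not included in the original snippet; a minimal sketch just dumps
    # each event and prints the positions you would persist for the next run.
    binlogevent.dump()
    print(stream.log_file, stream.log_pos)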
Example #10
        topic = getTopic(database)
        logger.info(database + '------------->' + str(topic))
        if topic is None:
            raise RuntimeError('Topic is empty!')
        offset = getOffset(database)

        if offset is None:
            offset = int(time.time()) - 300
            setOffset(database, offset)

        logger.info(database + '------------->' + str(offset))
        blacklist = getBlackList(database)
        stream = BinLogStreamReader(
            connection_settings=mysql_settings,
            server_id=100,  # slave identifier, must be unique
            blocking=True,  # block and wait for subsequent events
            skip_to_timestamp=offset,  # start consuming from the offset timestamp
            ignored_tables=blacklist,  # tables to ignore
            # only watch write operations: insert, delete, update
            only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent])
        producer = KafkaProducer(bootstrap_servers=hosts_producer_arr)
        partition = producer.partitions_for(topic)
        numPartitions = len(partition)

        logger.info('***************** start sending data *****************')
        for binlogevent in stream:
            for row in binlogevent.rows:
                if len(str(row)) > 80960:
                    logger.error('Row length exceeds the limit: %s' % row)
                event = {
                    "schema": binlogevent.schema,
                    "table": binlogevent.table
Example #11
def connect():
    global EVENT_LAST_SEEN
    global LOG_FILE
    global LOG_POS
    try:
        with open('tracking.time', 'r') as r:
            EVENT_LAST_SEEN, LOG_POS, LOG_FILE = r.readline().split()
            if LOG_POS:
                LOG_POS = int(LOG_POS)
            if EVENT_LAST_SEEN:
                EVENT_LAST_SEEN = int(EVENT_LAST_SEEN)
    except Exception as e:
        print(e)
    print("start stream with EVENT_LAST_SEEN=" + str(EVENT_LAST_SEEN))
    sys.stdout.flush()
    try:
        stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                    slave_heartbeat=20,
                                    resume_stream=(EVENT_LAST_SEEN
                                                   is not None),
                                    log_file=LOG_FILE,
                                    log_pos=LOG_POS,
                                    blocking=True,
                                    server_id=3,
                                    skip_to_timestamp=EVENT_LAST_SEEN,
                                    only_schemas=ONLY_SCHEMAS,
                                    ignored_tables=IGNORED_TABLES,
                                    only_events=[
                                        DeleteRowsEvent, WriteRowsEvent,
                                        UpdateRowsEvent, RotateEvent
                                    ])
        # stream._BinLogStreamReader__connect_to_stream()
        # print stream.__dict__.get("_stream_connection")

        for binlogevent in stream:
            EVENT_LAST_SEEN = binlogevent.timestamp
            if binlogevent.event_type == ROTATE_EVENT:
                LOG_POS = binlogevent.position
                LOG_FILE = binlogevent.next_binlog
                continue
            elif stream.log_pos:
                LOG_POS = stream.log_pos
            for row in binlogevent.rows:

                event = {
                    "schema": binlogevent.schema,
                    "table":
                    binlogevent.schema.lower() + "__" + binlogevent.table
                }

                if isinstance(binlogevent, DeleteRowsEvent):
                    event["action"] = "delete"
                    event = dict(event.items() + row["values"].items())
                elif isinstance(binlogevent, UpdateRowsEvent):
                    event["action"] = "update"
                    event = dict(event.items() + row["after_values"].items())
                elif isinstance(binlogevent, WriteRowsEvent):
                    event["action"] = "index"
                    event = dict(event.items() + row["values"].items())

                for field in ARRAY_FIELDS:
                    if event is not None and event.get(field,
                                                       '___') is not None:
                        if not event.get(field, '___').startswith('___'):
                            tags = event[field].split(",")
                            arrTags = []
                            for tag in tags:
                                strTag = tag.strip()
                                if len(strTag) > 0:
                                    arrTags.append(strTag)
                            event[field] = arrTags

                jsonstr = json.dumps(event,
                                     ensure_ascii=False,
                                     encoding="utf-8").encode('utf-8')
                if DUMP_JSON:
                    dumpJson(binlogevent.schema.lower(), jsonstr)
                if DUMP_KAFKA:
                    dumpKafka(binlogevent.schema.lower(), jsonstr)
                if DUMP_REDIS:
                    dumpRedis(binlogevent.schema.lower(), jsonstr)
                # print (jsonstr + os.linesep)
                # sys.stdout.flush()
        stream.close()
        print("close stream")
    except Exception as e:
        print(e)
    sys.stdout.flush()
    return True
Example #12
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent

mysql_settings = {
    "host": "127.0.0.1",
    "port": 3307,
    "user": "******",
    "passwd": "warlock"
}

stream = BinLogStreamReader(
    connection_settings = mysql_settings,
    server_id = 1,
    blocking = True,
    only_schemas=['warlock'],
    only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent],
    resume_stream=True,
)


for event in stream:
    print(event)
    print(event.dump())

stream.close()
Example #13
    def process_binlog(self):
        stream = BinLogStreamReader(
            connection_settings=self.connectionSettings,
            server_id=self.serverId,
            log_file=self.startFile,
            log_pos=self.startPos,
            only_schemas=self.only_schemas,
            only_tables=self.only_tables,
            resume_stream=True)

        cur = self.connection.cursor()
        tmpFile = 'tmp.%s.%s.tmp' % (
            self.connectionSettings['host'], self.connectionSettings['port']
        )  # to simplify code, we do not use file lock for tmpFile.
        ftmp = open(tmpFile, "w")
        flagLastEvent = False
        eStartPos = stream.log_pos
        lastPos = stream.log_pos
        try:
            for binlogevent in stream:
                if not self.stopnever:
                    if (stream.log_file == self.endFile
                            and stream.log_pos == self.endPos) or (
                                stream.log_file == self.eofFile
                                and stream.log_pos == self.eofPos):
                        flagLastEvent = True
                    elif stream.log_file not in self.binlogList:
                        break
                    elif (self.endPos and stream.log_file == self.endFile
                          and stream.log_pos > self.endPos) or (
                              stream.log_file == self.eofFile
                              and stream.log_pos > self.eofPos):
                        break
                    # else:
                    #     raise ValueError('unknown binlog file or position')

                if isinstance(binlogevent,
                              QueryEvent) and binlogevent.query == 'BEGIN':
                    eStartPos = lastPos

                if isinstance(binlogevent, QueryEvent):
                    sql = concat_sql_from_binlogevent(cursor=cur,
                                                      binlogevent=binlogevent,
                                                      flashback=self.flashback,
                                                      popPk=self.popPk)
                    if sql:
                        print(sql)
                elif type(binlogevent) in (WriteRowsEvent, UpdateRowsEvent,
                                           DeleteRowsEvent):
                    for row in binlogevent.rows:
                        sql = concat_sql_from_binlogevent(
                            cursor=cur,
                            binlogevent=binlogevent,
                            row=row,
                            flashback=self.flashback,
                            popPk=self.popPk,
                            eStartPos=eStartPos)
                        if self.flashback:
                            ftmp.write(sql + '\n')
                        else:
                            print(sql)

                if type(binlogevent) not in (RotateEvent,
                                             FormatDescriptionEvent):
                    lastPos = binlogevent.packet.log_pos
                if flagLastEvent:
                    break
            ftmp.close()
            if self.flashback:
                # doesn't work if you can't fit the whole file in memory.
                # need to be optimized
                for line in reversed(open(tmpFile).readlines()):
                    print(line.rstrip())
        finally:
            os.remove(tmpFile)
        cur.close()
        stream.close()
        return True
Example #14
    def resetBinLog(self):
        self.execute("RESET MASTER")
        if self.stream is not None:
            self.stream.close()
        self.stream = BinLogStreamReader(connection_settings=self.database)
Example #15
def main():

	# read the binlog position
	# default position
	print 'start'
	#log_pos = 51487324
	#log_file="mysql-bin.000008"
	
	
	blposfile = 'binlogpos.meta'	

	if os.path.exists(blposfile):
		with open(blposfile) as f:
			log_message = f.readline()
			binlogmessage = json.loads(log_message)
			log_file = binlogmessage['file']
			log_pos = binlogmessage['pos']
	elif os.path.exists('syncer.meta'):
		smpos = open('syncer.meta','r').readlines()
		log_file = ((smpos[0].split('=')[1]).split('"')[1]).strip()
		log_pos = (smpos[1].split('=')[1]).strip()
	else:
		print 'binlog position file not found, exiting!'
		sys.exit()
	
	
	stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
								server_id=my_server_id,
								resume_stream=True,
								blocking=True,
								freeze_schema=True,
								only_schemas=only_schema,
								only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
								log_file=log_file,
								log_pos=log_pos
								)

	#flag=stream.log_pos	

	for binlogevent in stream:

		# map to the destination database
		bdb = DBR[binlogevent.schema]
		
		# filter tables
		if binlogevent.table in alltidbtables:
			pass
		else:
			continue
			
		for row in binlogevent.rows:
			
			if isinstance(binlogevent,WriteRowsEvent):
			
				# heterogeneous: inject extra fields
				for v in S:
					row['values'][v] = S[v]
			
			
				# remove extra fields
				tidblist = column_dict[ti_db][binlogevent.table].values()
				mylist = row['values'].keys()
				for m in mylist:
					if m not in tidblist:
						del row['values'][m]
				
				# check for quotes, to work around quoting issues
				for va in row['values']:
					if isinstance(row['values'][va],unicode):
						if row['values'][va].find("'") >0:
							if row['values'][va].find(r"\'") >0:
								pass
							else:
								row['values'][va] = row['values'][va].replace("'",r"\'")
								print row['values'][va]
				
				template = 'INSERT INTO `{0}`.`{1}`({2}) VALUES ({3});'.format(
					bdb,binlogevent.table,
					', '.join(map(lambda key: '`%s`' % key, row['values'].keys())),
					', '.join(map(lambda v: "'%s'" % v,row["values"].values()))
				)
				
				try:
					con.execute(template)
					db.commit()
					# record the binlog position
					savepos(stream.log_file,stream.log_pos)
					logger.info("source %s,route %s, %s ,当前读取binlog文件是%s, 读取位置是%s,执行的sql是 %s"%(binlogevent.schema,bdb,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
					stream.log_file,stream.log_pos,template))
				except:
					savepos(stream.log_file,stream.log_pos)
					logger.info("source %s,route %s, %s ,当前读取binlog文件是%s, 读取位置是%s,执行发生异常的sql是 %s"%(binlogevent.schema,bdb,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
					stream.log_file,stream.log_pos,template))
					sys.exit()
				
			
			elif isinstance(binlogevent, DeleteRowsEvent):
				print 'This is a DELETE operation'
				# heterogeneous: inject extra fields
				for v in S:
					row['values'][v] = S[v]
					
				# check for quotes, to work around quoting issues
				for va in row['values']:
					if isinstance(row['values'][va],unicode):

						s = row['values'][va]
						if s.find('"') >0:
							if s.find(r'\"') >0:
								pass
							else:
								row['values'][va] = s.replace('"',r'\"')
								print row['values'][va]
								
						if len(re.findall("\'",s)) == 1:
							row['values'][va] = s.replace("\'","")
							print row['values'][va]
						
				
				
				# remove extra fields
				tidblist = column_dict[ti_db][binlogevent.table].values()
				mylist = row['values'].keys()
				for m in mylist:
					if m not in tidblist:
						del row['values'][m]
				
				
				template = 'DELETE FROM `{0}`.`{1}` WHERE {2} ;'.format(
					bdb, binlogevent.table, ' AND '.join(map(compare_items, row['values'].items()))
				)
				
				# look up the primary key id and delete the row by it
				sq_sql = template.split('WHERE')[1]
				select_sql = 'SELECT `tidbid` from `%s` WHERE %s'%(binlogevent.table,sq_sql)

				SELECT_SQL = select_sql.replace('= NULL','IS NULL')
				#print SELECT_SQL
				con.execute(SELECT_SQL)				
				sql_result = con.fetchone()
				# record the binlog position
				#print sql_result
				if sql_result:
					tidbid = sql_result[0]
					del_sql = 'DELETE FROM `%s` where `tidbid`=%s;'%(binlogevent.table,tidbid)
					try:					
													
						con.execute(del_sql)
						db.commit()	
						# record the binlog position
						savepos(stream.log_file,stream.log_pos)
						logger.info("source %s,route %s, %s ,当前读取binlog文件是%s, 读取位置是%s,执行的sql是 %s"%(binlogevent.schema,bdb,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
						stream.log_file,stream.log_pos,template))
					except:
						savepos(stream.log_file,stream.log_pos)
						logger.info("source %s,route %s, %s ,当前读取binlog文件是%s, 读取位置是%s,执行发生异常的sql是 %s"%(binlogevent.schema,bdb,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
						stream.log_file,stream.log_pos,del_sql))
						sys.exit()
				
			elif isinstance(binlogevent, UpdateRowsEvent):
				print 'This is an UPDATE operation'
				# inject the added fields and values
				for v in S:
					row['before_values'][v] = S[v]
					
				# remove extra fields
				tidblist = column_dict[ti_db][binlogevent.table].values()
				mylist = row['before_values'].keys()
				for m in mylist:
					if m not in tidblist:
						del row['before_values'][m]
						del row['after_values'][m]
		
				# check for quotes, to work around quoting issues
				for v1 in row:
					for va in row[v1]:
						if isinstance(row[v1][va],unicode):
							
							s = row[v1][va]
							if s.find('"') >0:
								if s.find(r'\"') >0:
									pass
								else:
									row[v1][va] = s.replace('"',r'\"')
									print row[v1][va]
									
							if len(re.findall("\'",s)) == 1:
								row[v1][va] = s.replace("\'","")
								print row[v1][va]
								
				template='UPDATE `{0}`.`{1}` set {2} WHERE {3} ;'.format(
					bdb, binlogevent.table,','.join(map(compare_items,row["after_values"].items())),
					' AND '.join(map(compare_items,row["before_values"].items())),datetime.datetime.fromtimestamp(binlogevent.timestamp)
				)
				template = template.replace('""','"')
				mid_sql = template.split('WHERE')[1]
				tidbid_sql = 'select `tidbid` from %s where %s'%(binlogevent.table,mid_sql)
				TIDBID_SQL = tidbid_sql.replace('= NULL','IS NULL')	
				#print TIDBID_SQL
				con.execute(TIDBID_SQL)
				tidbid_result = con.fetchone()
				
				if tidbid_result:
					tidbid = tidbid_result[0]
					u_sql = 'update `%s` set %s WHERE `tidbid` = %s;'%(binlogevent.table,','.join(map(compare_items,row["after_values"].items())),tidbid)
					
					try:												
						con.execute(u_sql)
						db.commit()
						# record the binlog position
						savepos(stream.log_file,stream.log_pos)
						logger.info("source %s,route %s, %s ,当前读取binlog文件是%s, 读取位置是%s,执行的sql是 %s"%(binlogevent.schema,bdb,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
						stream.log_file,stream.log_pos,u_sql))
					except:
						savepos(stream.log_file,stream.log_pos)
						logger.info("source %s,route %s, %s ,当前读取binlog文件是%s, 读取位置是%s,执行发生异常的sql是 %s"%(binlogevent.schema,bdb,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),\
						stream.log_file,stream.log_pos,u_sql))
						sys.exit()
								
	stream.close()
Example #16
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    server_id = fetch_server_id(mysql_conn)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            elif catalog_entry:
                initial_binlog_complete = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id,
                    'initial_binlog_complete')

                if (initial_binlog_complete and reader.log_file == log_file
                        and reader.log_pos == log_pos):
                    LOGGER.info(
                        "Skipping event for stream(%s) log_file=%s and log_pos=%s as it was processed last sync",
                        catalog_entry.tap_stream_id, reader.log_file,
                        reader.log_pos)
                    continue

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        state = update_bookmarks(state, binlog_streams_map, reader.log_file,
                                 reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    state = update_initial_binlog_complete(binlog_streams_map, state)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
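
fetch_current_log_file_and_pos is referenced above but not shown; assuming mysql_conn behaves like a plain pymysql connection (the real tap may wrap connections differently), a sketch could read the master's current position from SHOW MASTER STATUS:

# Hypothetical sketch of fetch_current_log_file_and_pos for Example #16.
def fetch_current_log_file_and_pos(mysql_conn):
    with mysql_conn.cursor() as cur:
        cur.execute("SHOW MASTER STATUS")
        result = cur.fetchone()
        if result is None:
            raise Exception("MySQL binary logging is not enabled.")
        return result[0], result[1]  # (File, Position)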
Example #17
def main():
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=os.getenv('ENV_MQ_HOST'),
                                  port=int(os.getenv('ENV_MQ_PORT')),
                                  credentials=pika.PlainCredentials(
                                      os.getenv('ENV_MQ_USER'),
                                      os.getenv('ENV_MQ_PASSWD')),
                                  virtual_host='/'))
    channel = connection.channel()
    channel.queue_declare(queue='default_queue', durable=True)

    file_name = "file_pos.log"
    log_file = ''
    log_pos = 0
    if os.path.isfile(file_name):
        fo = open(file_name, "r")
        file_pos = fo.read()
        fo.close()
        if file_pos != '':
            fp_list = file_pos.split('|')
            log_file = fp_list[0]
            log_pos = int(fp_list[1])

    # server_id is your slave identifier, it should be unique.
    # set blocking to True if you want to block and wait for the next event at
    # the end of the stream
    stream = BinLogStreamReader(
        connection_settings=MYSQL_SETTINGS,
        server_id=3,
        blocking=True,
        resume_stream=True,
        log_file=log_file,
        log_pos=log_pos,
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
        only_tables=['system_message'])

    for binlogevent in stream:
        # binlogevent.dump()  # print all event information

        for row in binlogevent.rows:
            # record the database name and table name
            event = {
                "schema": binlogevent.schema,
                "table": binlogevent.table,
                # "log_pos": stream.log_pos,
                # "log_file": stream.log_file
            }

            if isinstance(binlogevent, DeleteRowsEvent):
                event["action"] = "delete"
                event["data"] = row["values"]

            elif isinstance(binlogevent, UpdateRowsEvent):
                event["action"] = "update"
                event["data"] = row["after_values"]  # 注意这里不是values

            elif isinstance(binlogevent, WriteRowsEvent):
                event["action"] = "insert"
                event["data"] = row["values"]

            print(json.dumps(event, cls=DateEncoder))

            message = {
                'class': '\\MZ\\Models\\user\\UserModel',
                'method': 'getUserById',
                'data': event
            }

            body = json.dumps(message, cls=DateEncoder)

            channel.basic_publish(exchange='default_ex',
                                  routing_key='default_route',
                                  body=body)

            fo = open(file_name, "w")
            fo.write(stream.log_file + '|' + str(stream.log_pos))
            fo.close()
Example #18
    def _binlog_reading(
        self,
        only_tables,
        only_schemas,
        log_file,
        log_pos,
        server_id,
        skip_dmls,
        skip_delete_tables,
        skip_update_tables,
    ) -> Generator:
        stream = BinLogStreamReader(
            connection_settings=dict(
                host=self.host,
                port=self.port,
                user=self.user,
                passwd=self.password,
            ),
            resume_stream=True,
            blocking=True,
            server_id=server_id,
            only_tables=only_tables,
            only_schemas=only_schemas,
            only_events=self.only_events,
            log_file=log_file,
            log_pos=log_pos,
            fail_on_table_metadata_unavailable=True,
            slave_heartbeat=10,
        )
        for binlog_event in stream:
            if isinstance(binlog_event, QueryEvent):
                schema = binlog_event.schema.decode()
                query = binlog_event.query.lower()
                if "alter" not in query:
                    continue
                table, convert_sql = SqlConvert.to_clickhouse(
                    schema, query, Settings.cluster_name())
                if not convert_sql:
                    continue
                event = {
                    "table": table,
                    "schema": schema,
                    "action": "query",
                    "values": {
                        "query": convert_sql
                    },
                    "event_unixtime": int(time.time() * 10**6),
                    "action_seq": 0,
                }
                yield schema, None, event, stream.log_file, stream.log_pos
            else:
                schema = binlog_event.schema
                table = binlog_event.table
                skip_dml_table_name = f"{schema}.{table}"
                for row in binlog_event.rows:
                    if isinstance(binlog_event, WriteRowsEvent):
                        event = {
                            "table": table,
                            "schema": schema,
                            "action": "insert",
                            "values": row["values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 2,
                        }

                    elif isinstance(binlog_event, UpdateRowsEvent):
                        if "update" in skip_dmls or skip_dml_table_name in skip_update_tables:
                            continue
                        delete_event = {
                            "table": table,
                            "schema": schema,
                            "action": "delete",
                            "values": row["before_values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 1,
                        }
                        yield binlog_event.schema, binlog_event.table, delete_event, stream.log_file, stream.log_pos
                        event = {
                            "table": table,
                            "schema": schema,
                            "action": "insert",
                            "values": row["after_values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 2,
                        }

                    elif isinstance(binlog_event, DeleteRowsEvent):
                        if "delete" in skip_dmls or skip_dml_table_name in skip_delete_tables:
                            continue
                        event = {
                            "table": table,
                            "schema": schema,
                            "action": "delete",
                            "values": row["values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 1,
                        }
                    else:
                        return
                    yield binlog_event.schema, binlog_event.table, event, stream.log_file, stream.log_pos
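
_binlog_reading above yields (schema, table, event, log_file, log_pos) tuples and turns every UPDATE into a delete/insert pair ordered by action_seq. A minimal consumer sketch (hypothetical; the print is a stand-in for a real sink such as a ClickHouse writer):

def consume(reader, **binlog_kwargs):
    # reader is an instance of the class above; the generator blocks waiting for new events
    for schema, table, event, log_file, log_pos in reader._binlog_reading(**binlog_kwargs):
        print(schema, table, event["action"], event["action_seq"])
        # a real consumer would batch events and persist log_file/log_pos after each flush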
예제 #19
0
def main():
    stream = BinLogStreamReader(
        connection_settings=mysql_settings,
        server_id=1,
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
        blocking=True,
        resume_stream=True)
    for binlogevent in stream:
        for row in binlogevent.rows:
            print "%s:%s:" % (binlogevent.schema, binlogevent.table), row

            if binlogevent.table in ignore_table:
                print "ignore %s" % binlogevent.table, row
                continue

            if isinstance(binlogevent, DeleteRowsEvent):
                json_data = json.dumps(
                    {
                        "table": binlogevent.table,
                        "action": "delete",
                        "id": row["values"]["id"]
                    },
                    default=date_handler)
                delete_rows_event.apply_async(
                    (binlogevent.table, row["values"]["id"]),
                    queue="q_task_deleterows",
                    exchange="qgswaf",
                    routing_key="key_deleterows")

            elif isinstance(binlogevent, UpdateRowsEvent):
                json_data = json.dumps(
                    {
                        "table": binlogevent.table,
                        "action": "update",
                        "id": row["after_values"]["id"],
                        "doc": row["after_values"]
                    },
                    default=date_handler)
                update_rows_event.apply_async(
                    (binlogevent.table, row["after_values"]["id"], row["after_values"]),
                    queue="q_task_updaterows",
                    exchange="qgswaf",
                    routing_key="key_updaterows")

            elif isinstance(binlogevent, WriteRowsEvent):
                json_data = json.dumps(
                    {
                        "table": binlogevent.table,
                        "action": "insert",
                        "id": row["values"]["id"],
                        "doc": row["values"]
                    },
                    default=date_handler)
                write_rows_event.apply_async(
                    (binlogevent.table, row["values"]["id"], row["values"]),
                    queue="q_task_writerows",
                    exchange="qgswaf",
                    routing_key="key_writerows")

                # aliases = qgswaf_v1
                # res = es.index(index=qgswaf_aliases, doc_type=binlogevent.table, id=row["values"]["id"], body=row["values"])
                # es.indices.refresh(index="qgswaf")
            print(json_data)
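
The commented-out lines above hint at indexing straight into Elasticsearch instead of dispatching Celery tasks. A rough sketch of that path (an assumption rather than this project's code; it presumes elasticsearch-py 7.x, where doc_type is no longer required, and the index name is a placeholder):

from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])

def index_row(index_name, row_id, doc):
    # upsert the row document under its primary-key id; the per-row refresh makes the
    # change searchable immediately but is expensive, so a real loop would batch it
    es.index(index=index_name, id=row_id, body=doc)
    es.indices.refresh(index=index_name)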
예제 #20
0
    def __init__(self, extraction_settings, commit_settings, queue_settings):
        self.resume_file = '/tmp/stream.loc'

        self.mysql_settings = {
            "host": extraction_settings["HOST"],
            "port": extraction_settings["PORT"],
            "user": extraction_settings["USER"],
            "passwd": extraction_settings["PASS"]
        }

        self.queue = Queues(queue_settings)

        if os.path.isfile(self.resume_file):
            with open(self.resume_file, 'r') as f:
                self.log_filename, self.log_filepos = f.read().split('~')
            self.log_filepos = int(self.log_filepos)
        else:
            self.log_filename, self.log_filepos = None, None

        try:
            self.stream = BinLogStreamReader(
                connection_settings=self.mysql_settings,
                server_id=1,
                blocking=True,
                resume_stream=True,
                only_events=[
                    DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent,
                    QueryEvent
                ],
                log_pos=self.log_filepos,
                log_file=self.log_filename)

            for binlogevent in self.stream:
                self.log_filename, self.log_pos = [
                    self.stream.log_file, self.stream.log_pos
                ]

                if isinstance(binlogevent, QueryEvent):
                    func_name = str(binlogevent.query).split(' ')[0].lower()
                    query = str(binlogevent.query)

                    if func_name in ['create'] or func_name in ['alter']:
                        self.queue.submit_job(func_name,
                                              [commit_settings, query])

                elif not isinstance(binlogevent, (RotateEvent, FormatDescriptionEvent,
                                                  TableMapEvent, XidEvent)):

                    for row in binlogevent.rows:
                        log_position = binlogevent.packet.log_pos
                        table_name = binlogevent.table
                        event_time = binlogevent.timestamp
                        schema_row = binlogevent.schema

                        if isinstance(binlogevent, DeleteRowsEvent):
                            self.queue.submit_job(
                                'delete',
                                [commit_settings, table_name, row["values"]])
                        elif isinstance(binlogevent, WriteRowsEvent):
                            self.queue.submit_job(
                                'insert',
                                [commit_settings, table_name, row["values"]])
                        elif isinstance(binlogevent, UpdateRowsEvent):
                            self.queue.submit_job('update', [
                                commit_settings, table_name,
                                row["before_values"], row["after_values"]
                            ])

        except Exception as e:
            self.kill(self.log_filename, self.log_filepos)
예제 #21
0
def create_binlog_stream_reader(config: Dict, log_file: Optional[str],
                                log_pos: Optional[int],
                                gtid_pos: Optional[str]) -> BinLogStreamReader:
    """
    Create an instance of BinlogStreamReader with the right config

    Args:
        config: dictionary of the content of tap config.json
        log_file: binlog file name to start replication from (Optional if using gtid)
        log_pos: binlog pos to start replication from (Optional if using gtid)
        gtid_pos: GTID pos to start replication from (Optional if using log_file & pos)

    Returns: Instance of BinlogStreamReader
    """
    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = random.randint(
            1, 2**32 - 1)  # generate a random 32-bit server id for this slave
        LOGGER.info("Using randomly generated server_id=%s", server_id)

    engine = config['engine']

    kwargs = {
        'connection_settings': {},
        'pymysql_wrapper': make_connection_wrapper(config),
        'is_mariadb': connection.MARIADB_ENGINE == engine,
        'server_id': server_id,  # slave server ID
        'report_slave': socket.gethostname() or
        'pipelinewise',  # this is so this slave appears in SHOW SLAVE HOSTS;
        'only_events': [WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
    }

    # only fetch events pertaining to the schemas in filter db.
    if config.get('filter_db'):
        kwargs['only_schemas'] = config['filter_db'].split(',')

    if config['use_gtid']:

        if not gtid_pos:
            raise ValueError(
                f'gtid_pos is empty "{gtid_pos}"! Cannot start logical replication from empty gtid.'
            )

        LOGGER.info(
            "Starting logical replication from GTID '%s' on engine '%s'",
            gtid_pos, engine)

        # When using GTID, we want to listen in for GTID events and start from given gtid pos
        kwargs['only_events'].extend([GtidEvent, MariadbGtidEvent])
        kwargs['auto_position'] = gtid_pos

    else:
        if not log_file or not log_pos or log_pos < 0:
            raise ValueError(
                f'log file or pos is empty ("{log_file}", "{log_pos}")! '
                f'Cannot start logical replication from invalid log file/pos.')

        LOGGER.info("Starting logical replication from binlog file ['%s', %d]",
                    log_file, log_pos)

        # When not using GTID, we want to listen in for rotate events, and start from given log position and file
        kwargs['only_events'].append(RotateEvent)
        kwargs['log_file'] = log_file
        kwargs['log_pos'] = log_pos
        kwargs['resume_stream'] = True

    return BinLogStreamReader(**kwargs)
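
A hypothetical call-site sketch for the function above. Only the keys it reads (engine, use_gtid, server_id, filter_db) are guaranteed by this snippet; the connection keys and the binlog coordinates are placeholder assumptions:

config = {
    'host': 'mysql.example.com',
    'port': 3306,
    'user': 'replication',
    'password': 'secret',
    'engine': 'mysql',
    'use_gtid': False,
    'server_id': 4321,
    'filter_db': 'shop,billing',
}

# resume file/pos based replication from a previously saved coordinate
reader = create_binlog_stream_reader(config,
                                     log_file='mysql-bin.000042',
                                     log_pos=4,
                                     gtid_pos=None)
for binlog_event in reader:
    print(type(binlog_event).__name__)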
예제 #22
0
    def binlog_reading(self, server_id, only_tables, only_schemas, log_file, log_pos, insert_nums,
                       interval):
        event_list = []
        sequence = 0
        logger.info('Data sync started at %s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
        logger.info(f'Current binlog position: {log_file}:{log_pos}')
        pk_dict = {}
        for schema in only_schemas:
            for table in only_tables:
                pk = self.get_primary_key(schema, table)
                name = '{0}.{1}'.format(schema, table)
                if pk:
                    pk_dict[name] = pk
                    logger.info(f'Start syncing: {name}')
                else:
                    if self.check_table_exists(schema, table):
                        logger.error(f'Table to sync: {name} has no primary key or unique key, exiting....')
                        exit(1)
        stream = BinLogStreamReader(connection_settings=dict(
            host=self.host, port=self.port, user=self.user, passwd=self.password
        ), resume_stream=True, blocking=True,
            server_id=int(server_id), only_tables=only_tables, only_schemas=only_schemas,
            only_events=self.only_events, log_file=log_file, log_pos=log_pos,
            fail_on_table_metadata_unavailable=True, slave_heartbeat=10)
        try:
            for binlog_event in stream:
                for row in binlog_event.rows:
                    sequence += 1
                    event = {'schema': binlog_event.schema, 'table': binlog_event.table, 'sequence_number': sequence}
                    if isinstance(binlog_event, WriteRowsEvent):
                        event['action'] = 'insert'
                        event['values'] = row['values']
                        event['event_unixtime'] = int(time.time())
                        event['action_core'] = '2'

                    elif isinstance(binlog_event, UpdateRowsEvent):
                        event['action'] = 'insert'
                        event['values'] = row['after_values']
                        event['event_unixtime'] = int(time.time())
                        event['action_core'] = '2'

                    elif isinstance(binlog_event, DeleteRowsEvent):
                        event['action'] = 'delete'
                        event['values'] = row['values']
                        event['event_unixtime'] = int(time.time())
                        event['action_core'] = '1'

                    event_list.append(event)

                    if len(event_list) == insert_nums or (
                            int(time.time()) - event_list[0]['event_unixtime'] >= interval > 0):
                        repl_status = self.slave_status()
                        log_file = stream.log_file
                        log_pos = stream.log_pos
                        if repl_status:
                            Config.pos_handler.set_log_pos_master(
                                repl_status['Master_Host'],
                                repl_status['Master_Port'],
                                repl_status['Relay_Master_Log_File'],
                                repl_status['Exec_Master_Log_Pos']
                            )

                        data_dict = {}
                        tmp_data = []
                        for items in event_list:
                            table = items['table']
                            schema = items['schema']
                            action = items['action']
                            action_core = items['action_core']
                            data_dict.setdefault(table + schema + action + action_core, []).append(items)
                        for k, v in data_dict.items():
                            tmp_data.append(v)
                        event_list = []
                        sequence = 0
                        yield tmp_data, pk_dict, log_file, log_pos
        except KeyboardInterrupt:
            log_file, log_pos = Config.pos_handler.get_log_pos()
            message = 'Sync process exited, current position {0}:{1}'.format(log_file, log_pos)
            logger.info(message)
예제 #23
0
    def __init__(self, mapper):
        self.mapper = mapper
        self.stream = BinLogStreamReader(connection_settings=mysql_settings,
                                         only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
                                         blocking=True, resume_stream=True)
예제 #24
0
def binlog_reading(only_events, conf, debug):
    mysql_conf = {}
    clickhouse_conf = {}
    event_list = []
    sequence = 0
    mysql_server_id = int(cnf['master_server']['server_id'])
    mysql_conf['host'] = cnf['master_server']['host']
    mysql_conf['port'] = int(cnf['master_server']['port'])
    mysql_conf['user'] = cnf['master_server']['user']
    mysql_conf['passwd'] = cnf['master_server']['passwd']

    clickhouse_conf['host'] = cnf['clickhouse_server']['host']
    clickhouse_conf['port'] = int(cnf['clickhouse_server']['port'])
    clickhouse_conf['passwd'] = cnf['clickhouse_server']['passwd']
    clickhouse_conf['user'] = cnf['clickhouse_server']['user']

    only_schemas = cnf['only_schemas']['schemas'].split(",")
    only_tables = cnf['only_tables']['tables'].split(",")

    alarm_mail = cnf['failure_alarm']['alarm_mail'].split(",")
    skip_dmls_all = cnf['skip_dmls_all']['skip_type'].split(",")

    skip_delete_tb_name = cnf['skip_dmls_sing']['skip_delete_tb_name'].split(
        ",")
    skip_update_tb_name = cnf['skip_dmls_sing']['skip_update_tb_name'].split(
        ",")

    insert_nums = int(cnf['bulk_insert_nums']['insert_nums'])
    interval = int(cnf['bulk_insert_nums']['interval'])
    logger.info('Data sync started at %s' %
                (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    if logtoredis:
        redis = my_redis()
        logger.info("Reading the binlog position checkpoint from Redis")
    else:
        logger.info("Reading the binlog position checkpoint from a file")
        redis = mark_log()

    db = my_db()
    log_file, log_pos = redis.get_log_pos()
    if not (log_file and log_pos):
        logger.error("Failed to read the binlog position, exiting....")
        exit(1)

    pk_dict = {}
    for schema in only_schemas:
        for table in only_tables:
            pk = db.get_pri(schema, table)
            if pk:
                name = "{0}.{1}".format(schema, table)
                pk_dict[name] = pk

            else:
                name = "{0}.{1}".format(schema, table)

                if db.check_table_exists(schema, table):
                    logger.error("要同步的表: %s 不存在主键或者唯一键,程序退出...." % (name))
                    exit(1)

    # collect the schema.table pairs that have a unique key
    unique_key_dict = {}
    for schema in only_schemas:
        for table in only_tables:
            unique = db.get_unique(schema, table)
            if unique:
                name = "{0}.{1}".format(schema, table)
                unique_key_dict[name] = unique

    message = "读取binlog: {0}:{1}".format(log_file, log_pos)
    ch_info = "同步到clickhouse server {0}:{1}".format(
        cnf['clickhouse_server']['host'], cnf['clickhouse_server']['port'])
    repl_info = "{0}:{1}".format(cnf['master_server']['host'],
                                 cnf['master_server']['port'])
    alarm_info = "{0} 库:{1} 表:{2} 同步数据到clickhouse服务器:{3}失败".format(
        repl_info, only_schemas, only_tables, socket.gethostname())
    logger.info('从服务器 %s 同步数据' % (repl_info))
    logger.info(message)
    logger.info(ch_info)
    logger.info('同步到clickhouse的数据库: %s' % (only_schemas))
    logger.info('同步到clickhouse的表: %s' % (only_tables))

    stream = BinLogStreamReader(connection_settings=mysql_conf, resume_stream=True, blocking=True, \
                                server_id=mysql_server_id, only_tables=only_tables, only_schemas=only_schemas, \
                                only_events=only_events, log_file=log_file, log_pos=int(log_pos),
                                fail_on_table_metadata_unavailable=True, slave_heartbeat=10, freeze_schema=True)

    try:
        for binlogevent in stream:
            for row in binlogevent.rows:
                sequence += 1
                new_event = False
                event = {
                    "schema": binlogevent.schema,
                    "table": binlogevent.table
                }
                event['sequence_number'] = sequence
                if isinstance(binlogevent, WriteRowsEvent):
                    event["action"] = "insert"
                    event["values"] = row["values"]
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '2'

                elif isinstance(binlogevent, UpdateRowsEvent):
                    event["action"] = "insert"
                    event["values"] = row["after_values"]
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '2'
                    db_table = "{0}.{1}".format(binlogevent.schema,
                                                binlogevent.table)
                    if db_table in unique_key_dict.keys():
                        if row["after_values"][pk_dict[db_table][0]] != row[
                                "before_values"][pk_dict[db_table][0]]:
                            new_event = {
                                "schema": binlogevent.schema,
                                "table": binlogevent.table
                            }
                            new_event['sequence_number'] = sequence
                            new_event["action"] = "delete"
                            new_event["values"] = row["before_values"]
                            new_event['event_unixtime'] = int(time.time())
                            new_event['action_core'] = '1'

                elif isinstance(binlogevent, DeleteRowsEvent):
                    event["action"] = "delete"
                    event["values"] = row["values"]
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '1'

                event_list.append(event)
                if new_event:
                    event_list.append(new_event)

                # Commit when the batch is full, or when the interval has elapsed even if it is
                # not full. A new row event is still needed to trigger this check, which is the
                # known gap: with insert_nums=100 and only 90 rows buffered, nothing is committed
                # until more data arrives. There is no clean fix short of separating producer and
                # consumer (a producer/consumer sketch follows this example).
                if len(event_list) >= insert_nums or (
                        int(time.time()) - event_list[0]['event_unixtime'] >=
                        interval and interval > 0):

                    repl_status = db.slave_status()
                    log_file = stream.log_file
                    log_pos = stream.log_pos
                    if repl_status:
                        redis.set_log_pos('master', repl_status['Master_Host'],
                                          repl_status['Master_Port'],
                                          repl_status['Relay_Master_Log_File'],
                                          repl_status['Exec_Master_Log_Pos'])

                    data_dict = {}
                    tmp_data = []
                    for items in event_list:
                        table = items['table']
                        schema = items['schema']
                        action = items['action']
                        action_core = items['action_core']
                        data_dict.setdefault(
                            table + schema + action + action_core,
                            []).append(items)
                    for k, v in data_dict.items():
                        tmp_data.append(v)
                    #print(tmp_data)
                    status = data_to_ck(tmp_data, alarm_info, alarm_mail,
                                        debug, skip_dmls_all,
                                        skip_delete_tb_name,
                                        skip_update_tb_name, pk_dict,
                                        only_schemas, **clickhouse_conf)
                    #print('status')
                    #print(status)
                    if status:
                        redis.set_log_pos('slave', log_file, log_pos)
                        del event_list
                        event_list = []
                        sequence = 0
                        gc.collect()
                    else:
                        log_file, log_pos = redis.get_log_pos()
                        message = "SQL执行错误,当前binlog位置 {0}:{1}".format(
                            log_file, log_pos)
                        logger.error(message)
                        exit(1)

    except KeyboardInterrupt:
        log_file, log_pos = redis.get_log_pos()
        message = "同步程序退出,当前同步位置 {0}:{1}".format(log_file, log_pos)
        logger.info(message)
    finally:
        stream.close()
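
As the comment inside the loop notes, the size/interval flush only runs when another row event arrives, so a half-filled batch can sit indefinitely on an idle source. One way around it, sketched here as a producer/consumer split rather than anything this tool implements, is to move batching onto a timer-driven worker thread:

import queue
import threading
import time

def start_flusher(event_queue, flush, insert_nums, interval):
    """Flush batches by size or by elapsed time, even while the binlog stream is idle."""
    def worker():
        batch = []
        deadline = time.time() + interval
        while True:
            try:
                # wait at most until the next deadline for a new event
                batch.append(event_queue.get(timeout=max(deadline - time.time(), 0.1)))
            except queue.Empty:
                pass
            if batch and (len(batch) >= insert_nums or time.time() >= deadline):
                flush(batch)
                batch = []
            if time.time() >= deadline:
                deadline = time.time() + interval

    threading.Thread(target=worker, daemon=True).start()

# The binlog loop would then only call event_queue.put(event); the worker owns batching
# and checkpointing, so a partly filled batch still gets flushed on time.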
예제 #25
0
def start_syncer(cfg):

    MYSQL_SETTINGS = {
        "host": cfg['db_mysql_ip'],
        "port": int(cfg['db_mysql_port']),
        "user": "******",
        "passwd": "canal@Hopson2018",
    }

    logging.info("MYSQL_SETTINGS=", MYSQL_SETTINGS)
    batch = {}
    row_event_count = 0

    for o in cfg['sync_table'].split(','):
        batch[o.split('$')[0]] = []

    try:
        stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                    only_events=(QueryEvent, DeleteRowsEvent,
                                                 UpdateRowsEvent,
                                                 WriteRowsEvent),
                                    server_id=9999,
                                    blocking=True,
                                    resume_stream=True,
                                    log_file=cfg['binlogfile'],
                                    log_pos=int(cfg['binlogpos']))

        print('\nSync Configuration:')
        print('-------------------------------------------------------------')
        print('batch_size=', cfg['batch_size'])
        print('batch_timeout=', cfg['batch_timeout'])
        print('batch_row_event=', cfg['batch_row_event'])
        print('apply_timeout=', cfg['apply_timeout'])
        print('sleep_time=', cfg['sleep_time'])
        print('')

        start_time = datetime.datetime.now()
        apply_time = datetime.datetime.now()

        for binlogevent in stream:

            if get_seconds(apply_time) >= cfg['apply_timeout']:
                cfg['db_mysql'].close()
                cfg['db_doris'].close()
                cfg = get_config_from_db(cfg['sync_tag'])
                apply_time = datetime.datetime.now()
                print("\033[0;31;40mapply config success\033[0m")

            for o in cfg['sync_table'].split(','):
                if batch.get(o.split('$')[0]) is None:
                    batch[o.split('$')[0]] = []
                    print(
                        "\033[0;31;40mbatch['{}'] init success!\033[0m".format(
                            o.split('$')[0]))

            if isinstance(binlogevent, RotateEvent):
                current_master_log_file = binlogevent.next_binlog
                print("Next binlog file: %s", current_master_log_file)
                cfg['binlogfile'] = current_master_log_file

            row_event_count = row_event_count + 1

            if isinstance(binlogevent, QueryEvent):
                cfg['binlogpos'] = binlogevent.packet.log_pos
                event = {
                    "schema": bytes.decode(binlogevent.schema),
                    "query": binlogevent.query.lower()
                }
                if 'create' in event['query'] or 'drop' in event[
                        'query'] or 'alter' in event[
                            'query'] or 'truncate' in event['query']:
                    ddl = gen_ddl_sql(event['query'])
                    event['table'] = get_obj_name(event['query']).lower()
                    if check_sync(cfg, event) and ddl is not None:
                        if check_doris_tab_exists(cfg, event) == 0:
                            create_doris_table(cfg, event)

            if isinstance(binlogevent, DeleteRowsEvent) or \
                    isinstance(binlogevent, UpdateRowsEvent) or \
                        isinstance(binlogevent, WriteRowsEvent):

                for row in binlogevent.rows:

                    cfg['binlogpos'] = binlogevent.packet.log_pos
                    event = {
                        "schema": binlogevent.schema.lower(),
                        "table": binlogevent.table.lower()
                    }
                    if check_sync(cfg, event):

                        if isinstance(binlogevent, DeleteRowsEvent):
                            event["action"] = "delete"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'delete',
                                      'sql': sql
                                  })

                        elif isinstance(binlogevent, UpdateRowsEvent):
                            event["action"] = "update"
                            event["after_values"] = row["after_values"]
                            event["before_values"] = row["before_values"]
                            sql = gen_sql(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'insert',
                                      'sql': sql
                                  })

                        elif isinstance(binlogevent, WriteRowsEvent):
                            event["action"] = "insert"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'insert',
                                      'sql': sql
                                  })

                        if check_batch_full_data(batch, cfg):
                            print("\033[0;31;40mexec full batch...\033[0m")
                            doris_exec_multi(cfg, batch, 'F')
                            for o in cfg['sync_table'].split(','):
                                if len(batch[o.split('$')
                                             [0]]) % cfg['batch_size'] == 0:
                                    batch[o.split('$')[0]] = []
                            start_time = datetime.datetime.now()
                            row_event_count = 0

            if get_seconds(start_time) >= cfg['batch_timeout']:
                if check_batch_exist_data(batch):
                    print(
                        "\033[0;31;40mtimoeout:{},start_time:{}\033[0m".format(
                            get_seconds(start_time), start_time))
                    doris_exec_multi(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                    start_time = datetime.datetime.now()
                    row_event_count = 0

            if row_event_count > 0 and row_event_count % cfg[
                    'batch_row_event'] == 0:
                if check_batch_exist_data(batch):
                    print("\033[0;31;40mrow_event_count={}\033[0m".format(
                        row_event_count))
                    doris_exec_multi(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                    start_time = datetime.datetime.now()
                    row_event_count = 0

    except Exception as e:
        traceback.print_exc()
        write_ckpt(cfg)
    finally:
        stream.close()
예제 #26
0
    def process_binlog(self):
        sqlList = []
        stream = BinLogStreamReader(connection_settings=self.conn_setting,
                                    server_id=self.server_id,
                                    log_file=self.start_file,
                                    log_pos=self.start_pos,
                                    only_schemas=self.only_schemas,
                                    only_tables=self.only_tables,
                                    resume_stream=True)

        flag_last_event = False
        e_start_pos, last_pos = stream.log_pos, stream.log_pos
        # to simplify code, we do not use flock for tmp_file.
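        # (an advisory-lock sketch follows this example.)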
        tmp_file = create_unique_file(
            '%s.%s' % (self.conn_setting['host'], self.conn_setting['port']))
        with temp_open(tmp_file, "w") as f_tmp, self.connection as cursor:
            for binlog_event in stream:
                if not self.stop_never:
                    try:
                        event_time = datetime.datetime.fromtimestamp(
                            binlog_event.timestamp)
                    except OSError:
                        event_time = datetime.datetime(1980, 1, 1, 0, 0)
                    if (stream.log_file == self.end_file and stream.log_pos == self.end_pos) or \
                            (stream.log_file == self.eof_file and stream.log_pos == self.eof_pos):
                        flag_last_event = True
                    elif event_time < self.start_time:
                        if not (isinstance(binlog_event, RotateEvent)
                                or isinstance(binlog_event,
                                              FormatDescriptionEvent)):
                            last_pos = binlog_event.packet.log_pos
                        continue
                    elif (stream.log_file not in self.binlogList) or \
                            (self.end_pos and stream.log_file == self.end_file and stream.log_pos > self.end_pos) or \
                            (stream.log_file == self.eof_file and stream.log_pos > self.eof_pos) or \
                            (event_time >= self.stop_time):
                        break
                    # else:
                    #     raise ValueError('unknown binlog file or position')

                if isinstance(binlog_event,
                              QueryEvent) and binlog_event.query == 'BEGIN':
                    e_start_pos = last_pos

                if isinstance(binlog_event, QueryEvent) and not self.only_dml:
                    sql = concat_sql_from_binlog_event(
                        cursor=cursor,
                        binlog_event=binlog_event,
                        flashback=self.flashback,
                        no_pk=self.no_pk)
                    if sql:
                        sqlList.append(sql)
                elif is_dml_event(binlog_event) and event_type(
                        binlog_event) in self.sql_type:
                    for row in binlog_event.rows:
                        sql = concat_sql_from_binlog_event(
                            cursor=cursor,
                            binlog_event=binlog_event,
                            no_pk=self.no_pk,
                            row=row,
                            flashback=self.flashback,
                            e_start_pos=e_start_pos)
                        if self.flashback:
                            f_tmp.write(sql + '\n')
                        else:
                            sqlList.append(sql)

                if not (isinstance(binlog_event, RotateEvent)
                        or isinstance(binlog_event, FormatDescriptionEvent)):
                    last_pos = binlog_event.packet.log_pos
                if flag_last_event:
                    break

            stream.close()
            f_tmp.close()
            if self.flashback:
                return self.get_rollback_sql(filename=tmp_file)
        return sqlList
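
The comment inside process_binlog notes that the temp file is written without flock, so two concurrent runs against the same host:port could interleave writes. A minimal illustration of adding an advisory lock (POSIX only; an assumption, not part of the original snippet):

import fcntl

def locked_append(path, line):
    # hold an exclusive advisory lock for the duration of the write
    with open(path, "a") as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        try:
            f.write(line + "\n")
        finally:
            fcntl.flock(f.fileno(), fcntl.LOCK_UN)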
예제 #27
0
    def __init__(
        self,
        connection_settings,
        server_id,
        log_file=None,
        log_pos=None,
        schemas=None,
        tables=None,
        tables_prefixes=None,
        blocking=None,
        resume_stream=None,
        nice_pause=None,
        binlog_position_file=None,
        callbacks={},
    ):
        super().__init__(callbacks=callbacks)

        self.connection_settings = connection_settings
        self.server_id = server_id
        self.log_file = log_file
        self.log_pos = log_pos
        self.schemas = TableProcessor.extract_dbs(
            schemas, Util.join_lists(tables, tables_prefixes)) or None
        self.tables = None if tables is None else TableProcessor.extract_tables(
            tables)
        self.tables_prefixes = None if tables_prefixes is None else TableProcessor.extract_tables(
            tables_prefixes)
        self.blocking = blocking
        self.resume_stream = resume_stream
        self.nice_pause = nice_pause
        self.binlog_position_file = binlog_position_file

        logging.info("raw dbs list. len()=%d",
                     0 if schemas is None else len(schemas))
        if schemas is not None:
            for schema in schemas:
                logging.info(schema)
        logging.info("normalised dbs list. len()=%d",
                     0 if self.schemas is None else len(self.schemas))
        if self.schemas is not None:
            for schema in self.schemas:
                logging.info(schema)

        logging.info("raw tables list. len()=%d",
                     0 if tables is None else len(tables))
        if tables is not None:
            for table in tables:
                logging.info(table)
        logging.info("normalised tables list. len()=%d",
                     0 if self.tables is None else len(self.tables))
        if self.tables is not None:
            for table in self.tables:
                logging.info(table)

        logging.info("raw tables-prefixes list. len()=%d",
                     0 if tables_prefixes is None else len(tables_prefixes))
        if tables_prefixes is not None:
            for table in tables_prefixes:
                logging.info(table)
        logging.info(
            "normalised tables-prefixes list. len()=%d",
            0 if self.tables_prefixes is None else len(self.tables_prefixes))
        if self.tables_prefixes is not None:
            for table in self.tables_prefixes:
                logging.info(table)

        if not isinstance(self.server_id, int):
            raise Exception(
                "Please specify server_id of src server as int. Ex.: --src-server-id=1"
            )

        self.binlog_stream = BinLogStreamReader(
            # MySQL server - data source
            connection_settings=self.connection_settings,
            server_id=self.server_id,
            # we are interested in reading CH-repeatable events only
            only_events=[
                # Possible events
                #BeginLoadQueryEvent,
                DeleteRowsEvent,
                #ExecuteLoadQueryEvent,
                #FormatDescriptionEvent,
                #GtidEvent,
                #HeartbeatLogEvent,
                #IntvarEvent
                #NotImplementedEvent,
                #QueryEvent,
                #RotateEvent,
                #StopEvent,
                #TableMapEvent,
                UpdateRowsEvent,
                WriteRowsEvent,
                #XidEvent,
            ],
            only_schemas=self.schemas,
            # in case we have any prefixes - this means we need to listen to all tables within specified schemas
            only_tables=self.tables if not self.tables_prefixes else None,
            log_file=self.log_file,
            log_pos=self.log_pos,
            freeze_schema=
            True,  # If true do not support ALTER TABLE. It's faster.
            blocking=False,
            resume_stream=self.resume_stream,
        )
예제 #28
0
File: sync_es.py  Project: yufyukky/nyaa
    def run(self):
        with open(SAVE_LOC) as f:
            pos = json.load(f)

        stream = BinLogStreamReader(
            # TODO parse out from config.py or something
            connection_settings={
                'host': MYSQL_HOST,
                'port': MYSQL_PORT,
                'user': MYSQL_USER,
                'passwd': MYSQL_PW
            },
            server_id=10,  # arbitrary
            # only care about this database currently
            only_schemas=[NT_DB],
            # these tables in the database
            only_tables=[
                "nyaa_torrents", "nyaa_statistics", "sukebei_torrents",
                "sukebei_statistics"
            ],
            # from our save file
            resume_stream=True,
            log_file=pos['log_file'],
            log_pos=pos['log_pos'],
            # skip the other stuff like table mapping
            only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
            # if we're at the head of the log, block until something happens
            # note it'd be nice to block async-style instead, but the mainline
            # binlogreader is synchronous. there is an (unmaintained?) fork
            # using aiomysql if anybody wants to revive that.
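            # (a thread-based workaround is sketched after the run() method.)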
            blocking=True)

        log.info(f"reading binlog from {stream.log_file}/{stream.log_pos}")

        for event in stream:
            # save the pos of the stream and timestamp with each message, so we
            # can commit in the other thread. and keep track of process latency
            pos = (stream.log_file, stream.log_pos, event.timestamp)
            with stats.pipeline() as s:
                s.incr('total_events')
                s.incr(f"event.{event.table}.{type(event).__name__}")
                s.incr('total_rows', len(event.rows))
                s.incr(f"rows.{event.table}.{type(event).__name__}",
                       len(event.rows))
                # XXX not a "timer", but we get a histogram out of it
                s.timing(
                    f"rows_per_event.{event.table}.{type(event).__name__}",
                    len(event.rows))

            if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
                if event.table == "nyaa_torrents":
                    index_name = "nyaa"
                else:
                    index_name = "sukebei"
                if type(event) is WriteRowsEvent:
                    for row in event.rows:
                        self.write_buf.put(
                            (pos, reindex_torrent(row['values'], index_name)),
                            block=True)
                elif type(event) is UpdateRowsEvent:
                    # UpdateRowsEvent includes the old values too, but we don't care
                    for row in event.rows:
                        self.write_buf.put(
                            (pos,
                             reindex_torrent(row['after_values'], index_name)),
                            block=True)
                elif type(event) is DeleteRowsEvent:
                    # ok, bye
                    for row in event.rows:
                        self.write_buf.put((pos, delet_this(row, index_name)),
                                           block=True)
                else:
                    raise Exception(f"unknown event {type(event)}")
            elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
                if event.table == "nyaa_statistics":
                    index_name = "nyaa"
                else:
                    index_name = "sukebei"
                if type(event) is WriteRowsEvent:
                    for row in event.rows:
                        self.write_buf.put(
                            (pos, reindex_stats(row['values'], index_name)),
                            block=True)
                elif type(event) is UpdateRowsEvent:
                    for row in event.rows:
                        self.write_buf.put(
                            (pos, reindex_stats(row['after_values'],
                                                index_name)),
                            block=True)
                elif type(event) is DeleteRowsEvent:
                    # uh ok. Assume that the torrent row will get deleted later,
                    # which will clean up the entire es "torrent" document
                    pass
                else:
                    raise Exception(f"unknown event {type(event)}")
            else:
                raise Exception(f"unknown table {event.table}")
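
As the comment in the BinLogStreamReader call above points out, the mainline reader is synchronous. A common workaround, shown here purely as a sketch and not something this sync job does, is to keep the blocking iterator on a worker thread and hand events to asyncio code through a queue:

import asyncio

async def stream_events(stream):
    """Async-iterate a blocking BinLogStreamReader by pumping it from a worker thread."""
    loop = asyncio.get_running_loop()
    events: asyncio.Queue = asyncio.Queue(maxsize=1000)

    def pump():
        for ev in stream:
            # hand each event back to the event loop; blocks this thread if the queue is full
            asyncio.run_coroutine_threadsafe(events.put(ev), loop).result()

    loop.run_in_executor(None, pump)
    while True:
        yield await events.get()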
    def start(self):
        # server_id is your slave identifier, it should be unique.
        # set blocking to True if you want to block and wait for the next event at
        # the end of the stream
        self.modules_manager.generate_modules_instances()
        if hasattr(self.transaction_manager, 'last_request_sent'):
            stream = BinLogStreamReader(
                connection_settings=self.MYSQL_SETTINGS,
                only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent],
                server_id=self.server_id,
                only_schemas=self.databases,
                only_tables=self.tables,
                blocking=True,
                resume_stream=True,
                log_pos=self.transaction_manager.last_request_sent)

        else:
            stream = BinLogStreamReader(
                connection_settings=self.MYSQL_SETTINGS,
                server_id=self.server_id,
                only_schemas=self.databases,
                only_tables=self.tables,
                only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent],
                blocking=True)

        self.logger.info(
            "Connected to the database at %s:%d with user %s" %
            (self.MYSQL_SETTINGS.get("host"), self.MYSQL_SETTINGS.get("port"),
             self.MYSQL_SETTINGS.get("user")))

        for binlogevent in stream:
            for row in binlogevent.rows:
                event = {
                    "schema": binlogevent.schema,
                    "table": binlogevent.table
                }
                if isinstance(binlogevent, DeleteRowsEvent):
                    self.logger.debug("Delete event detected.")
                    event["action"] = "delete"
                    document_id_to_remove = row["values"][self.indexes_label[
                        binlogevent.table]]
                    self.transaction_manager.write_last_request_log_pos(
                        stream, binlogevent)
                    self.modules_manager.remove_data_all_modules(
                        index=binlogevent.schema,
                        doc_type=binlogevent.table,
                        id=document_id_to_remove)
                    self.transaction_manager.number_of_delete_request += 1
                    self.transaction_manager.write_last_success_log_pos(
                        stream, binlogevent)
                    self.logger.info(
                        "Deleted document for id {0} in database {1}".format(
                            document_id_to_remove, binlogevent.table))

                elif isinstance(binlogevent, UpdateRowsEvent):
                    self.logger.debug("Update event detected.")
                    event["action"] = "update"
                    event = dict(
                        list(event.items()) +
                        list(row["after_values"].items()))
                    document_id_to_update = row["before_values"][
                        self.indexes_label[binlogevent.table]]
                    updated_body = row["after_values"]
                    if self.tables_fields[binlogevent.table] is not None:
                        new_body = {
                            field: updated_body[field]
                            for field in self.tables_fields[binlogevent.table]
                        }
                        updated_body = new_body
                    self.transaction_manager.write_last_request_log_pos(
                        stream, binlogevent)
                    self.modules_manager.update_data_all_modules(
                        index=binlogevent.schema,
                        doc_type=binlogevent.table,
                        id=document_id_to_update,
                        doc=updated_body)
                    self.transaction_manager.number_of_update_request += 1
                    self.transaction_manager.write_last_success_log_pos(
                        stream, binlogevent)
                    self.logger.info(
                        "Document for id {0} in database {2} updated to {1}".
                        format(document_id_to_update, row["after_values"],
                               binlogevent.table))

                elif isinstance(binlogevent, WriteRowsEvent):
                    self.logger.debug("Insert event detected.")
                    event["action"] = "insert"
                    event = dict(
                        list(event.items()) + list(row["values"].items()))
                    document_id_to_add = row["values"][self.indexes_label[
                        binlogevent.table]]
                    document_to_add = row["values"]
                    if self.tables_fields[binlogevent.table] is not None:
                        new_body = {
                            field: document_to_add[field]
                            for field in self.tables_fields[binlogevent.table]
                        }
                        document_to_add = new_body
                    self.transaction_manager.write_last_request_log_pos(
                        stream, binlogevent)
                    self.modules_manager.insert_data_all_modules(
                        index=binlogevent.schema,
                        doc_type=binlogevent.table,
                        doc=document_to_add,
                        id=document_id_to_add)
                    self.transaction_manager.write_last_success_log_pos(
                        stream, binlogevent)
                    self.transaction_manager.number_of_create_request += 1
                    self.logger.info(
                        "Adding in table {1} document {0} to the elastic search"
                        .format(row["values"], binlogevent.table))
                    #self.logger.info(json.dumps(event))

            sys.stdout.flush()
def start_incr_syncer(cfg):
    log("\033[0;36;40mstart incr sync...\033[0m")
    MYSQL_SETTINGS = {
        "host": cfg['db_mysql_ip'],
        "port": int(cfg['db_mysql_port']),
        "user": "******",
        "passwd": "canal@Hopson2018",
    }

    logging.info("MYSQL_SETTINGS=", MYSQL_SETTINGS)
    batch = {}
    types = {}
    pks = {}
    row_event_count = 0

    for o in cfg['sync_table'].split(','):
        evt = {
            'schema': o.split('$')[0].split('.')[0],
            'table': o.split('$')[0].split('.')[1]
        }
        if check_tab_exists_pk(cfg, evt) > 0:
            batch[o.split('$')[0]] = []
            types[o.split('$')[0]] = get_col_type(cfg, evt)
            pks[o.split('$')[0]] = True
        else:
            log("\033[0;31;40mTable:{}.{} not primary key,skip sync...\033[0m".
                format(evt['schema'], evt['table']))
            pks[o.split('$')[0]] = False

    try:
        stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                    only_events=(QueryEvent, DeleteRowsEvent,
                                                 UpdateRowsEvent,
                                                 WriteRowsEvent),
                                    server_id=9999,
                                    blocking=True,
                                    resume_stream=True,
                                    log_file=cfg['binlogfile'],
                                    log_pos=int(cfg['binlogpos']),
                                    auto_position=False)

        start_time = datetime.datetime.now()
        apply_time = datetime.datetime.now()

        for binlogevent in stream:

            if get_seconds(apply_time) >= cfg['apply_timeout']:
                cfg = get_config_from_db(cfg['sync_tag'])
                apply_time = datetime.datetime.now()
                log("\033[1;36;40mapply config success\033[0m")
                write_ckpt(cfg)
                batch = {
                    k: v
                    for k, v in batch.items() if k in [
                        tab.split('$')[0]
                        for tab in cfg['sync_table'].split(',')
                    ]
                }

            pks = {}
            for o in cfg['sync_table'].split(','):
                evt = {
                    'schema': o.split('$')[0].split('.')[0],
                    'table': o.split('$')[0].split('.')[1]
                }
                if batch.get(o.split('$')[0]) is None:
                    if check_tab_exists_pk(cfg, evt) > 0:
                        log("\033[0;36;40mfind table:{}.{} auto config sync...\033[0m"
                            .format(evt['schema'], evt['table']))
                        batch[o.split('$')[0]] = []
                        types[o.split('$')[0]] = get_col_type(cfg, evt)
                        pks[o.split('$')[0]] = True
                        if check_ck_tab_exists(cfg, evt) == 0:
                            create_ck_table(cfg, evt)
                            full_sync(cfg, evt)
                    else:
                        log("\033[0;36;40mTable:{}.{} not primary key,skip sync...\033[0m"
                            .format(evt['schema'], evt['table']))
                        pks[o.split('$')[0]] = False
                else:
                    if check_tab_exists_pk(cfg, evt) > 0:
                        pks[o.split('$')[0]] = True
                    else:
                        pks[o.split('$')[0]] = False

            if isinstance(binlogevent, RotateEvent):
                cfg['binlogfile'] = binlogevent.next_binlog

            row_event_count = row_event_count + 1

            if isinstance(binlogevent, QueryEvent):
                cfg['binlogpos'] = binlogevent.packet.log_pos
                event = {
                    "schema": bytes.decode(binlogevent.schema),
                    "query": binlogevent.query.lower()
                }
                if 'create' in event['query'] or 'drop' in event[
                        'query'] or 'alter' in event[
                            'query'] or 'truncate' in event['query']:
                    ddl = gen_ddl_sql(event['query'])
                    event['table'] = get_obj_name(event['query']).lower()

                    if check_sync(cfg, event, pks) and ddl is not None:
                        if check_ck_tab_exists(cfg, event) == 0:
                            create_ck_table(cfg, event)
                            full_sync(cfg, event)
                            batch[event['schema'] + '.' + event['table']] = []
                            types[event['schema'] + '.' +
                                  event['table']] = get_col_type(cfg, event)

            if isinstance(binlogevent, DeleteRowsEvent) or \
                    isinstance(binlogevent, UpdateRowsEvent) or \
                        isinstance(binlogevent, WriteRowsEvent):

                for row in binlogevent.rows:

                    cfg['binlogpos'] = binlogevent.packet.log_pos
                    event = {
                        "schema": binlogevent.schema.lower(),
                        "table": binlogevent.table.lower()
                    }

                    if check_sync(cfg, event, pks):

                        typ = types[event['schema'] + '.' + event['table']]

                        if check_ck_tab_exists(cfg, event) == 0:
                            create_ck_table(cfg, event)
                            full_sync(cfg, event)
                            batch[event['schema'] + '.' + event['table']] = []
                            types[event['schema'] + '.' +
                                  event['table']] = get_col_type(cfg, event)

                        if isinstance(binlogevent, DeleteRowsEvent):
                            event["action"] = "delete"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event, typ)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'delete',
                                      'sql': sql
                                  })

                        elif isinstance(binlogevent, UpdateRowsEvent):
                            event["action"] = "update"
                            event["after_values"] = row["after_values"]
                            event["before_values"] = row["before_values"]
                            sql = gen_sql(cfg, event, typ)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'update',
                                      'sql': sql
                                  })

                        elif isinstance(binlogevent, WriteRowsEvent):
                            event["action"] = "insert"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event, typ)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'insert',
                                      'sql': sql
                                  })

                        if check_batch_full_data(batch, cfg):
                            log("\033[0;31;40mprocess full batch...\033[0m")
                            ck_exec(cfg, batch, 'Full')
                            for o in cfg['sync_table'].split(','):
                                if len(batch[o.split('$')
                                             [0]]) % cfg['batch_size'] == 0:
                                    batch[o.split('$')[0]] = []
                            start_time = datetime.datetime.now()
                            row_event_count = 0
                            write_ckpt(cfg)

            if get_seconds(start_time) >= cfg['batch_timeout']:
                if check_batch_exist_data(batch):
                    log("\033[0;31;40mtimoeout:{},start_time:{}\033[0m".format(
                        get_seconds(start_time), start_time))
                    ck_exec(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                    start_time = datetime.datetime.now()
                    row_event_count = 0
                    write_ckpt(cfg)

            if row_event_count > 0 and row_event_count % cfg[
                    'batch_row_event'] == 0:
                if check_batch_exist_data(batch):
                    log("\033[0;31;40mrow_event_count={}\033[0m".format(
                        row_event_count))
                    ck_exec(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                    start_time = datetime.datetime.now()
                    row_event_count = 0
                    write_ckpt(cfg)

    except Exception as e:
        traceback.print_exc()
        write_ckpt(cfg)
    finally:
        stream.close()