def listen(self):
    _logger.info("Starting to listen to the binlog of %s" % self.database)
    section = 'mysql:' + self.database
    mysqlSetting = {
        "host": config().get(section, "host"),
        "port": int(config().get(section, "port")),
        "user": config().get(section, "user"),
        "password": config().get(section, "password"),
    }
    watchedDatabases = [self.database]
    # load the last saved binlog reader position
    logFile, logPos, resumeStream = self._loadLastBinlogPos()
    self._stream = BinLogStreamReader(
        connection_settings=mysqlSetting,
        server_id=int(config().get(section, "slaveid")),
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
        blocking=True,
        resume_stream=resumeStream,
        log_file=logFile,
        log_pos=logPos,
    )
    while True:
        refresh = False
        try:
            for binlogEvent in self._stream:
                refresh = True
                logFile, logPos = self._stream.log_file, self._stream.log_pos
                # skip events from databases we do not watch
                if binlogEvent.schema not in watchedDatabases:
                    self._writeBinlogPos(logFile, logPos)
                    continue
                binlog = {}
                binlog['storage'] = 'mysql'
                binlog['database'] = '%s' % binlogEvent.schema
                binlog['table'] = '%s' % binlogEvent.table
                binlog['timestamp'] = datetime.fromtimestamp(
                    binlogEvent.timestamp).strftime('%Y-%m-%d %H:%M:%S')
                for row in binlogEvent.rows:
                    if isinstance(binlogEvent, DeleteRowsEvent):
                        binlog['values'] = row['values']
                        binlog['type'] = 'DELETE'
                    elif isinstance(binlogEvent, UpdateRowsEvent):
                        binlog['before'] = row['before_values']
                        binlog['values'] = row['after_values']
                        binlog['type'] = 'UPDATE'
                    elif isinstance(binlogEvent, WriteRowsEvent):
                        binlog['values'] = row['values']
                        binlog['type'] = 'INSERT'
                    binlogRow = json.dumps(binlog, default=timeutil.dateHandler)
                    self._pushToKafka(binlogRow, binlog['database'],
                                      binlog['table'])
                # after pushing the binlog to Kafka, persist the binlog position
                self._writeBinlogPos(logFile, logPos)
            if not refresh:
                _logger.info(
                    "No new binlog input, current position: [%s:%d]",
                    logFile if logFile is not None else "",
                    logPos if logPos is not None else 0)
                time.sleep(0.1)
        except Exception as e:
            print(e)
            sys.exit(1)
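# The _loadLastBinlogPos/_writeBinlogPos helpers used above are not shown.
# A minimal sketch of what they might look like, assuming the position is
# persisted as JSON in a local file (the file name and layout are illustrative,
# not from the source):
import json
import os

POS_FILE = 'binlog.pos'  # hypothetical location


def _loadLastBinlogPos(self):
    # Returns (log_file, log_pos, resume_stream); with no saved position,
    # start from the master's current position and do not resume.
    if not os.path.exists(POS_FILE):
        return None, None, False
    with open(POS_FILE) as fh:
        pos = json.load(fh)
    return pos['log_file'], pos['log_pos'], True


def _writeBinlogPos(self, logFile, logPos):
    # Write to a temp file first, then rename, so a crash mid-write cannot
    # leave a truncated position file behind.
    tmp = POS_FILE + '.tmp'
    with open(tmp, 'w') as fh:
        json.dump({'log_file': logFile, 'log_pos': logPos}, fh)
    os.rename(tmp, POS_FILE)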
def run_by_rows(self):
    try:
        server_id = 6666666 + int(self.thread_id)
        stream = BinLogStreamReader(connection_settings=self.mysql_setting,
                                    server_id=server_id,
                                    only_events=[
                                        DeleteRowsEvent, WriteRowsEvent,
                                        UpdateRowsEvent, QueryEvent
                                    ],
                                    resume_stream=True,
                                    blocking=False,
                                    log_file=self.binlog_file,
                                    log_pos=self.start_pos,
                                    only_schemas=self.only_schemas,
                                    only_tables=self.only_tables)
        rows = []
        thread_id = query = None
        for binlogevent in stream:
            log_pos = binlogevent.packet.log_pos
            if log_pos >= self.end_pos:
                # Stop once the current event is past the requested end position.
                stream.close()
                break
            else:
                if isinstance(binlogevent, QueryEvent):
                    thread_id = binlogevent.slave_proxy_id
                    query = binlogevent.query
                if not isinstance(binlogevent, QueryEvent):
                    if self.thread_id == thread_id and query == 'BEGIN':
                        for row in binlogevent.rows:
                            columns = [{
                                'column': x.name,
                                'type': x.type
                            } for x in binlogevent.columns]
                            binlog = {
                                'database': binlogevent.schema,
                                'table': binlogevent.table,
                                'primary_key': binlogevent.primary_key,
                                'columns': columns
                            }
                            if isinstance(binlogevent, DeleteRowsEvent):
                                binlog['values'] = row["values"]
                                binlog['type'] = 'DELETE'
                                rows.append(binlog)
                            if isinstance(binlogevent, UpdateRowsEvent):
                                binlog["before"] = row["before_values"]
                                binlog["after"] = row["after_values"]
                                binlog['type'] = 'UPDATE'
                                rows.append(binlog)
                            if isinstance(binlogevent, WriteRowsEvent):
                                binlog['values'] = row["values"]
                                binlog['type'] = 'INSERT'
                                rows.append(binlog)
        stream.close()
        result = {
            'status': 'success',
            'data': self._generate_rollback_sql(rows)
        }
    except Exception as err:
        # print("Exception in user code:")
        # print('-' * 60)
        # traceback.print_exc(file=sys.stdout)
        # print('-' * 60)
        print(err)
        result = {'status': 'fail', 'msg': str(err)}
    return result
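# _generate_rollback_sql is not shown either. A sketch of an inverse-statement
# generator over the row dicts collected above, assuming values can be safely
# rendered with %r (a real implementation must escape per MySQL rules):
def _generate_rollback_sql(self, rows):
    sql = []
    for binlog in reversed(rows):  # newest first, so replaying undoes the work
        table = '`%s`.`%s`' % (binlog['database'], binlog['table'])
        pk = binlog['primary_key']
        pk = (pk,) if isinstance(pk, str) else (pk or ())
        if binlog['type'] == 'INSERT':
            # Undo an INSERT with a DELETE keyed on the primary key.
            where = ' AND '.join('`%s`=%r' % (k, v)
                                 for k, v in binlog['values'].items() if k in pk)
            sql.append('DELETE FROM %s WHERE %s LIMIT 1;' % (table, where))
        elif binlog['type'] == 'DELETE':
            # Undo a DELETE by re-inserting the deleted row image.
            cols = ', '.join('`%s`' % c for c in binlog['values'])
            vals = ', '.join('%r' % v for v in binlog['values'].values())
            sql.append('INSERT INTO %s (%s) VALUES (%s);' % (table, cols, vals))
        elif binlog['type'] == 'UPDATE':
            # Undo an UPDATE by restoring the before-image.
            sets = ', '.join('`%s`=%r' % (k, v)
                             for k, v in binlog['before'].items())
            where = ' AND '.join('`%s`=%r' % (k, v)
                                 for k, v in binlog['after'].items() if k in pk)
            sql.append('UPDATE %s SET %s WHERE %s LIMIT 1;' % (table, sets, where))
    return sql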
def get_data_from_binlog_to_ch(self):
    logger.info(
        "'only_schemas': %s, 'only_tables': %s, 'ignored_schemas': %s, 'ignored_tables': %s"
        % (self.only_schemas, self.only_tables, self.ignored_schemas,
           self.ignored_tables))
    if cnf['mysql_server']['gtid_mode'] == 1:
        stream = BinLogStreamReader(
            connection_settings=MYSQL_DB_INFO,
            server_id=cnf['mysql_server']['server_id'],
            blocking=True,
            only_events=[
                DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent, GtidEvent
            ],
            auto_position=get_info_ins.get_executed_gtid_set(name=self.name),
            only_schemas=self.only_schemas,
            only_tables=self.only_tables,
            ignored_schemas=self.ignored_schemas,
            ignored_tables=self.ignored_tables,
            slave_heartbeat=10,
            fail_on_table_metadata_unavailable=True,
            freeze_schema=True,
            resume_stream=True)
    else:
        stream = BinLogStreamReader(
            connection_settings=MYSQL_DB_INFO,
            server_id=cnf['mysql_server']['server_id'],
            blocking=True,
            only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
            log_file=get_info_ins.get_log_file(name=self.name),
            log_pos=int(get_info_ins.get_log_pos(name=self.name)),
            only_schemas=self.only_schemas,
            only_tables=self.only_tables,
            ignored_schemas=self.ignored_schemas,
            ignored_tables=self.ignored_tables,
            slave_heartbeat=10,
            fail_on_table_metadata_unavailable=True,
            freeze_schema=True,
            resume_stream=True)
    for binlogevent in stream:
        if binlogevent.event_type == GTID_LOG_EVENT:
            self.last_gtid = binlogevent.gtid
        else:
            self.log_file, self.log_pos = stream.log_file, stream.log_pos
            try:
                self.binlog_from_event_handler(binlogevent)
            except Exception as e:
                messages = '%s : unknown error: %s' % (
                    'binlog_from_event_handler', str(e))
                logger.error(messages)
                alarm(alarm_cnf=alarm_cnf, title=self.alarm_title,
                      messages=messages)
                exit(3)
        if self.sequence > int(
                cnf['bulk_insert_control']['rows_to_target']) or int(
                    time.time()) - self.start_time > int(
                        cnf['bulk_insert_control']['interval']):
            messages = 'self.sequence > %s or time > %s; deleting data, inserting data, writing to redis' % (
                int(cnf['bulk_insert_control']['rows_to_target']),
                int(cnf['bulk_insert_control']['interval']))
            logger.info(messages)
            logger.info('begin delete data from clickhouse')
            try:
                self.delete_data_in_ch()
            except Exception as e:
                messages = "delete data from clickhouse failed: %s" % str(e)
                logger.error(messages)
                alarm(alarm_cnf=alarm_cnf, title=self.alarm_title,
                      messages=messages)
                exit(3)
            logger.info('begin insert data to clickhouse')
            try:
                self.insert_data_to_ch()
            except Exception as e:
                messages = "insert data to clickhouse failed: %s" % str(e)
                logger.error(messages)
                alarm(alarm_cnf=alarm_cnf, title=self.alarm_title,
                      messages=messages)
                exit(3)
            # Record the program's current state and write it to redis, for
            # later monitoring and for high availability.
            logger.info('begin write information data to redis')
            try:
                self.write_info_redis()
            except Exception as e:
                messages = "write data to redis failed: %s" % str(e)
                logger.error(messages)
                alarm(alarm_cnf=alarm_cnf, title=self.alarm_title,
                      messages=messages)
                exit(3)
def handle_binlog_stream(config):
    cache = Cache(config.SLAVE_UUID)
    # Opening the stream and fetching once closes any stale binlog connection
    # that still holds this slave_uuid.
    stream_binlog = BinLogStreamReader(
        connection_settings=config.BINLOG_CONNECTION,
        server_id=config.SERVER_ID,
        blocking=False,
        resume_stream=True,
        slave_uuid=config.SLAVE_UUID
    )
    stream_binlog.fetchone()
    only_schemas = set()
    only_tables = set()
    event2jobs = defaultdict(list)
    for task in config.TASKS:
        only_schemas.add(task["stream"]["database"])
        only_tables.add(task["stream"]["table"])
        for job in task["jobs"]:
            for action in job["actions"]:
                event = "{host}_{schema}_{table}_{action}".format(
                    host=config.BINLOG_CONNECTION["host"],
                    schema=task["stream"]["database"],
                    table=task["stream"]["table"],
                    action=action)
                event2jobs[event].append(job)
    stream_binlog = BinLogStreamReader(
        connection_settings=config.BINLOG_CONNECTION,
        server_id=config.SERVER_ID,
        blocking=True,
        only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent,
                     RotateEvent],
        only_schemas=only_schemas,
        only_tables=only_tables,
        freeze_schema=True,
        log_file=cache.get_log_file(),
        log_pos=cache.get_log_pos(),
        resume_stream=True,
        slave_uuid=config.SLAVE_UUID
    )
    for binlogevent in stream_binlog:
        if isinstance(binlogevent, RotateEvent):
            cache.set_log_file(binlogevent.next_binlog)
            cache.set_log_pos(binlogevent.position)
        else:
            print(binlogevent.packet.log_pos)
            for row in binlogevent.rows:
                # Assemble the event.
                event = {"host": binlogevent._ctl_connection.host,
                         "schema": binlogevent.schema,
                         "table": binlogevent.table,
                         "timestamp": datetime.datetime.fromtimestamp(
                             binlogevent.timestamp).strftime('%Y-%m-%d %H:%M:%S')}
                if isinstance(binlogevent, DeleteRowsEvent):
                    event["action"] = "delete"
                    event["values"] = dict(row["values"].items())
                elif isinstance(binlogevent, UpdateRowsEvent):
                    event["action"] = "update"
                    event["before_values"] = dict(row["before_values"].items())
                    event["values"] = dict(row["after_values"].items())
                elif isinstance(binlogevent, WriteRowsEvent):
                    event["action"] = "insert"
                    event["values"] = dict(row["values"].items())
                event_type = "{host}_{schema}_{table}_{action}".format(
                    host=event["host"], schema=event["schema"],
                    table=event["table"], action=event["action"])
                jobs = event2jobs[event_type]
                for job in jobs:
                    if event["action"] in job["actions"]:
                        pipeline = job["pipeline"]
                        rows = do_pipeline(pipeline, event["values"])
                        dest = job["dest"]
                        to_dest(dest, rows)
                cache.set_log_pos(binlogevent.packet.log_pos)
                logging.info(json.dumps(event, cls=DateEncoder))
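# The Cache position store only needs four accessors here. A minimal sketch
# backed by Redis (key names and connection defaults are made up):
import redis


class Cache:
    def __init__(self, slave_uuid, host='127.0.0.1', port=6379):
        self._r = redis.Redis(host=host, port=port)
        self._prefix = 'binlog:%s:' % slave_uuid

    def get_log_file(self):
        val = self._r.get(self._prefix + 'log_file')
        return val.decode() if val else None

    def get_log_pos(self):
        val = self._r.get(self._prefix + 'log_pos')
        return int(val) if val else None

    def set_log_file(self, log_file):
        self._r.set(self._prefix + 'log_file', log_file)

    def set_log_pos(self, log_pos):
        self._r.set(self._prefix + 'log_pos', log_pos)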
def generate_sql(self):
    outfile = self.parser['outfile']
    database = self.parser['database']
    table = self.parser['table']
    conn_setting = {
        'host': self.parser['host'],
        'port': int(self.parser['port']),
        'user': self.parser['user'],
        'passwd': self.parser['password'],
        'charset': 'utf8'
    }
    log_file = self.parser['binlog']
    start_position = self.parser['start_position'] if self.parser[
        'start_position'] else 4
    dict_ = {'log_file': log_file, 'log_pos': int(start_position)}
    f = file_position()
    f.set(dict_)
    fpos = f.get()
    res_file = fpos['log_file']
    res_pos = fpos['log_pos']
    print('cache_res:', res_file, res_pos)
    stream = BinLogStreamReader(
        connection_settings=conn_setting,
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
        only_schemas=database,
        only_tables=table,
        log_file=res_file,
        log_pos=res_pos,
        server_id=30,
        blocking=True,
        resume_stream=True)
    for binlogevent in stream:
        if isinstance(binlogevent, WriteRowsEvent):
            for row in binlogevent.rows:
                next_binlog = stream.log_file
                position = stream.log_pos
                log_timestamp = datetime.datetime.fromtimestamp(
                    binlogevent.timestamp)
                start = f.get()['log_pos']
                log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                    next_binlog, str(start), str(position), log_timestamp)
                template = 'INSERT INTO `{0}`.`{1}`({2}) VALUES ({3});'.format(
                    binlogevent.schema, binlogevent.table,
                    ', '.join(map(lambda key: '`%s`' % key,
                                  row['values'].keys())),
                    ', '.join(['%s'] * len(row['values'])))
                values = map(fix_object, row['values'].values())
                self.db_save(template, values, log_content, outfile)
                dict_ = {'log_file': next_binlog, 'log_pos': position}
                f = file_position()
                f.set(dict_)
                print('set: ', str(dict_))
        elif isinstance(binlogevent, DeleteRowsEvent):
            if binlogevent.primary_key:
                for row in binlogevent.rows:
                    next_binlog = stream.log_file
                    position = stream.log_pos
                    log_timestamp = datetime.datetime.fromtimestamp(
                        binlogevent.timestamp)
                    start = f.get()['log_pos']
                    log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                        next_binlog, str(start), str(position), log_timestamp)
                    prikey = binlogevent.primary_key
                    before_prikey_items = {
                        k: v
                        for k, v in row['values'].items() if k in prikey
                    }.items()
                    before_prikey_values = [v for k, v in before_prikey_items]
                    template = 'DELETE FROM `{0}`.`{1}` WHERE {2} LIMIT 1;'.format(
                        binlogevent.schema, binlogevent.table,
                        ' AND '.join(map(compare_items, before_prikey_items)))
                    values = map(fix_object, before_prikey_values)
                    self.db_save(template, values, log_content, outfile)
                    dict_ = {'log_file': next_binlog, 'log_pos': position}
                    f = file_position()
                    f.set(dict_)
                    print('set: ', str(dict_))
            else:
                next_binlog = stream.log_file
                position = stream.log_pos
                log_timestamp = datetime.datetime.fromtimestamp(
                    binlogevent.timestamp)
                start = f.get()['log_pos']
                log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                    next_binlog, str(start), str(position), log_timestamp)
                for row in binlogevent.rows:
                    print('del_nopri_row: ', binlogevent.schema,
                          binlogevent.table, row, log_content)
                dict_ = {'log_file': next_binlog, 'log_pos': position}
                f = file_position()
                f.set(dict_)
                print('set: ', str(dict_))
                print('DELETE on a table without a primary key (or with JSON '
                      'columns) cannot be handled; exiting')
                sys.exit()
        elif isinstance(binlogevent, UpdateRowsEvent):
            if binlogevent.primary_key:
                for row in binlogevent.rows:
                    prikey = binlogevent.primary_key
                    before_prikey_items = {
                        k: v
                        for k, v in row['before_values'].items() if k in prikey
                    }.items()
                    before_prikey_values = [v for k, v in before_prikey_items]
                    next_binlog = stream.log_file
                    position = stream.log_pos
                    log_timestamp = datetime.datetime.fromtimestamp(
                        binlogevent.timestamp)
                    start = f.get()['log_pos']
                    log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                        next_binlog, str(start), str(position), log_timestamp)
                    template = 'UPDATE `{0}`.`{1}` SET {2} WHERE {3} LIMIT 1;'.format(
                        binlogevent.schema, binlogevent.table,
                        ', '.join(['`%s`=%%s' % k
                                   for k in row['after_values'].keys()]),
                        ' AND '.join(map(compare_items, before_prikey_items)))
                    values = map(
                        fix_object,
                        list(row['after_values'].values()) +
                        list(before_prikey_values))
                    self.db_save(template, values, log_content, outfile)
                    dict_ = {'log_file': next_binlog, 'log_pos': position}
                    f = file_position()
                    f.set(dict_)
                    print('set: ', str(dict_))
            else:
                next_binlog = stream.log_file
                position = stream.log_pos
                log_timestamp = datetime.datetime.fromtimestamp(
                    binlogevent.timestamp)
                start = f.get()['log_pos']
                log_content = ' # binlog: %s start:%s end:%s time: %s' % (
                    next_binlog, str(start), str(position), log_timestamp)
                for row in binlogevent.rows:
                    print('update_nopri_row: ', binlogevent.schema,
                          binlogevent.table, row, log_content)
                dict_ = {'log_file': next_binlog, 'log_pos': position}
                f = file_position()
                f.set(dict_)
                print('set: ', str(dict_))
                print('UPDATE on a table without a primary key (or with JSON '
                      'columns) cannot be handled; exiting')
                sys.exit()
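# file_position is the position cache generate_sql() reads and writes. A
# sketch matching the get()/set() calls above, persisting the dict as JSON
# (the file name is illustrative):
import json


class file_position:
    PATH = 'file_position.json'

    def set(self, dict_):
        with open(self.PATH, 'w') as fh:
            json.dump(dict_, fh)

    def get(self):
        with open(self.PATH) as fh:
            return json.load(fh)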
def process_binlog(self):
    stream = BinLogStreamReader(
        connection_settings=self.connectionSettings,
        server_id=self.serverId,
        log_file=self.startFile,
        log_pos=self.startPos,
        only_schemas=self.only_schemas,
        only_tables=self.only_tables,
        resume_stream=True)
    cur = self.connection.cursor()
    tmpFile = create_unique_file(
        '%s.%s' % (self.connectionSettings['host'],
                   self.connectionSettings['port'])
    )  # to simplify the code, we do not take a file lock on tmpFile
    ftmp = open(tmpFile, "w")
    flagLastEvent = False
    eStartPos, lastPos = stream.log_pos, stream.log_pos
    try:
        count = 0
        for binlogevent in stream:
            print(stream.log_file)
            print(datetime.datetime.fromtimestamp(
                binlogevent.timestamp).strftime('%Y-%m-%d %H:%M:%S'))
            if count >= self.countnum:
                break
            if not self.stopnever:
                if (stream.log_file == self.endFile and
                        stream.log_file != self.startFile) or (
                        stream.log_file == self.eofFile and
                        stream.log_pos == self.eofPos):
                    flagLastEvent = True
                elif datetime.datetime.fromtimestamp(
                        binlogevent.timestamp) < self.startTime:
                    if not (isinstance(binlogevent, RotateEvent) or
                            isinstance(binlogevent, FormatDescriptionEvent)):
                        lastPos = binlogevent.packet.log_pos
                    continue
                elif (stream.log_file not in self.binlogList) or (
                        self.endPos and stream.log_file == self.endFile and
                        stream.log_pos > self.endPos) or (
                        stream.log_file == self.eofFile and
                        stream.log_pos > self.eofPos) or (
                        datetime.datetime.fromtimestamp(
                            binlogevent.timestamp) >= self.stopTime):
                    break
            if isinstance(binlogevent, QueryEvent) and binlogevent.query == 'BEGIN':
                eStartPos = lastPos
            if isinstance(binlogevent, QueryEvent):
                sql = concat_sql_from_binlogevent(cursor=cur,
                                                  binlogevent=binlogevent,
                                                  flashback=self.flashback,
                                                  nopk=self.nopk)
                if sql:
                    count = count + 1
                    self.sqllist.append(sql)
            elif isinstance(binlogevent, (WriteRowsEvent, UpdateRowsEvent,
                                          DeleteRowsEvent)):
                for row in binlogevent.rows:
                    sql = concat_sql_from_binlogevent(
                        cursor=cur, binlogevent=binlogevent, row=row,
                        flashback=self.flashback, nopk=self.nopk,
                        eStartPos=eStartPos)
                    if self.flashback:
                        ftmp.write(sql + '\n')
                    else:
                        self.sqllist.append(sql)
                        count = count + 1
            if not (isinstance(binlogevent, RotateEvent) or
                    isinstance(binlogevent, FormatDescriptionEvent)):
                lastPos = binlogevent.packet.log_pos
            if flagLastEvent:
                break
        ftmp.close()
        if self.flashback:
            with open(tmpFile) as ftmp:
                for line in reversed_lines(ftmp):
                    self.sqllist.append(line.rstrip())
    finally:
        os.remove(tmpFile)
        cur.close()
        stream.close()
    return True
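# reversed_lines only needs to yield the temp file's lines last-to-first so
# that flashback SQL comes out in rollback order. A simple sketch that reads
# the whole file into memory (a production version would scan backwards in
# blocks):
def reversed_lines(fh):
    for line in reversed(fh.readlines()):
        yield line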
def main():
    # DynamoDB ----------------------------
    # Keyvalue = ""
    # Keyvalue_2 = ""
    # response = table.put_item(
    #     Item={
    #         'Keyname': Keyname,
    #         'Keyvalue': Keyvalue,
    #     }
    # )
    # response = table.put_item(
    #     Item={
    #         'Keyname': Keyname_2,
    #         'Keyvalue': Keyvalue_2,
    #     }
    # )
    # ------------------------------------------
    parser.add_argument("--log_pos", "-p", help="enter the starting position")
    parser.add_argument("--log_file", "-f", help="enter the log file")
    args = parser.parse_args()
    if args.log_pos:
        args.log_pos = int(args.log_pos)
    # Get Item For DynamoDB ------------------------------
    # args.log_pos = getItem(Keyname)
    # args.log_file = getItem(Keyname_2)
    # ----------------------------------------------------
    conn = pymysql.connect(**MYSQL_SETTINGS)
    cursor = conn.cursor()
    stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                server_id=3,
                                blocking=True,
                                only_events=[DeleteRowsEvent, WriteRowsEvent,
                                             UpdateRowsEvent, QueryEvent,
                                             RotateEvent],
                                log_file=args.log_file,
                                log_pos=args.log_pos,
                                resume_stream=True)
    next_binlog = ''
    delimit = ';'
    TopicArn = 'arn:aws:sns:ap-southeast-1:XXXXXXXXXXXX:Openhack-Data-Stiching'
    for binlogevent in stream:
        e_start_pos, last_pos = stream.log_pos, stream.log_pos
        if type(binlogevent).__name__ == 'RotateEvent':
            next_binlog = binlogevent.next_binlog
        else:
            result = concat_sql_from_binlog_event(cursor=cursor,
                                                  binlog_event=binlogevent,
                                                  row=None,
                                                  e_start_pos=e_start_pos)
            result['next_binlog'] = next_binlog
            if 'Query' in result:
                result['Query'] = result['Query'].partition(delimit)[2]
            for k, v in result.items():
                if k == 'Query' and "rds_heartbeat2" not in v:
                    if v:
                        client.publish(TopicArn=TopicArn, Message=v)
            # Update Item For DynamoDB ------------------------------
            # updateItem(result['position'], result['next_binlog'])
            # ------------------------------------------------------
            print(json.dumps(result))
    stream.close()
# Stream settings; the commented-out ones show how to resume from a fixed
# position or timestamp.
server_id = 6
blocking = True
only_events = [DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent]  # assumed; the original snippet defines this above
only_schemas = ['statistics']
only_tables = ['StarUser1']
skip_to_timestamp = 1562501553
resume_stream = True
log_file = 'master.000001'
log_pos = 105709374

reader = BinLogStreamReader(
    connection_settings=MYSQL_SETTINGS,
    server_id=server_id,
    only_events=only_events,
    blocking=blocking,
    only_schemas=only_schemas,
    only_tables=only_tables,
    # skip_to_timestamp=skip_to_timestamp,
    # resume_stream=resume_stream,
    # log_file=log_file,
    # log_pos=log_pos,
)
for binlog_event in reader:
    if isinstance(binlog_event, RotateEvent):
        print(f'next_binlog={binlog_event.next_binlog}')
        print(f'position={binlog_event.position}')
    else:
        binlog_event.dump()  # dump() prints the event itself and returns None
        print(binlog_event.timestamp)
        print(binlog_event.packet.log_pos)
mysql_settings = {
    # (connection settings elided in the original snippet)
}
stream = BinLogStreamReader(
    connection_settings=mysql_settings,
    # only_events=[              # watch only the listed events
    #     DeleteRowsEvent,
    #     WriteRowsEvent,
    #     UpdateRowsEvent
    # ],
    # ignored_events=[],         # ignore the listed events
    # only_tables=['e_tester'],  # watch only the listed tables
    # ignored_tables=[],         # ignore the listed tables
    # only_schemas=[],           # watch only the listed schemas
    # ignored_schemas=[],        # ignore the listed schemas
    blocking=False,
    server_id=2,
    resume_stream=True,  # must be True for log_file/log_pos positioning to apply
    log_file='binlog.000099',  # binlog file name; track it yourself. RotateEvent reports it, remember to persist it.
    log_pos=4,  # position; track it yourself. Every file starts at 4, and later positions advance with event sizes.
    # A GTID is source_id:transaction_id.
    # source_id is the UUID of the MySQL instance that originated the transaction.
    # transaction_id is the transaction sequence number, increasing from 1.
    # Several transaction_id ranges can be written colon-separated, and a dash
    # marks a contiguous range:
    # c8671405-081c-11e9-a407-ec0d9a495964:1-5:11-18
    # auto_position automatically skips the given GTIDs.
    # auto_position='c8671405-081c-11e9-a407-ec0d9a495964:3472692',
    # skip_to_timestamp=time.mktime(time.strptime('2020-10-01 16:17:18', '%Y-%m-%d %H:%M:%S')),  # skip events before the given time
)
for binlogevent in stream:
    ...  # (loop body not included in this snippet)
topic = getTopic(database)
logger.info(database + '------------->' + str(topic))
if topic is None:
    raise RuntimeError('Topic is empty!')
offset = getOffset(database)
if offset is None:
    offset = int(time.time()) - 300
    setOffset(database, offset)
logger.info(database + '------------->' + str(offset))
blacklist = getBlackList(database)
stream = BinLogStreamReader(
    connection_settings=mysql_settings,
    server_id=100,  # slave identifier, must be unique
    blocking=True,  # block and wait for subsequent events
    skip_to_timestamp=offset,  # start consuming from offset
    ignored_tables=blacklist,  # tables to ignore
    # watch write operations only: insert, delete, update
    only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent])
producer = KafkaProducer(bootstrap_servers=hosts_producer_arr)
partition = producer.partitions_for(topic)
numPartitions = len(partition)
logger.info('***************** start sending data *****************')
for binlogevent in stream:
    for row in binlogevent.rows:
        if len(row) > 80960:
            logger.error('row too long: ' + str(row))
        event = {
            "schema": binlogevent.schema,
            "table": binlogevent.table
            # (snippet truncated here)
        }
def connect():
    global EVENT_LAST_SEEN
    global LOG_FILE
    global LOG_POS
    try:
        with open('tracking.time', 'r') as r:
            EVENT_LAST_SEEN, LOG_POS, LOG_FILE = r.readline().split()
        if LOG_POS:
            LOG_POS = int(LOG_POS)
        if EVENT_LAST_SEEN:
            EVENT_LAST_SEEN = int(EVENT_LAST_SEEN)
    except Exception as e:
        print(e)
    print("start stream with EVENT_LAST_SEEN=" + str(EVENT_LAST_SEEN))
    sys.stdout.flush()
    try:
        stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                    slave_heartbeat=20,
                                    resume_stream=(EVENT_LAST_SEEN is not None),
                                    log_file=LOG_FILE,
                                    log_pos=LOG_POS,
                                    blocking=True,
                                    server_id=3,
                                    skip_to_timestamp=EVENT_LAST_SEEN,
                                    only_schemas=ONLY_SCHEMAS,
                                    ignored_tables=IGNORED_TABLES,
                                    only_events=[
                                        DeleteRowsEvent, WriteRowsEvent,
                                        UpdateRowsEvent, RotateEvent
                                    ])
        for binlogevent in stream:
            EVENT_LAST_SEEN = binlogevent.timestamp
            if binlogevent.event_type == ROTATE_EVENT:
                LOG_POS = binlogevent.position
                LOG_FILE = binlogevent.next_binlog
                continue
            elif stream.log_pos:
                LOG_POS = stream.log_pos
            for row in binlogevent.rows:
                event = {
                    "schema": binlogevent.schema,
                    "table": binlogevent.schema.lower() + "__" + binlogevent.table
                }
                if isinstance(binlogevent, DeleteRowsEvent):
                    event["action"] = "delete"
                    event.update(row["values"])
                elif isinstance(binlogevent, UpdateRowsEvent):
                    event["action"] = "update"
                    event.update(row["after_values"])
                elif isinstance(binlogevent, WriteRowsEvent):
                    event["action"] = "index"
                    event.update(row["values"])
                # Normalize comma-separated string fields into arrays.
                for field in ARRAY_FIELDS:
                    if event is not None and event.get(field, '___') is not None:
                        if not event.get(field, '___').startswith('___'):
                            tags = event[field].split(",")
                            arrTags = []
                            for tag in tags:
                                strTag = tag.strip()
                                if len(strTag) > 0:
                                    arrTags.append(strTag)
                            event[field] = arrTags
                jsonstr = json.dumps(event, ensure_ascii=False).encode('utf-8')
                if DUMP_JSON:
                    dumpJson(binlogevent.schema.lower(), jsonstr)
                if DUMP_KAFKA:
                    dumpKafka(binlogevent.schema.lower(), jsonstr)
                if DUMP_REDIS:
                    dumpRedis(binlogevent.schema.lower(), jsonstr)
        stream.close()
        print("close stream")
    except Exception as e:
        print(e)
        sys.stdout.flush()
    return True
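# The counterpart that writes tracking.time is not shown; a sketch matching
# the space-separated layout the reader above expects (the fallback binlog
# file name is a placeholder, not from the source):
def save_tracking():
    with open('tracking.time', 'w') as w:
        w.write('%d %d %s' % (EVENT_LAST_SEEN or 0, LOG_POS or 4,
                              LOG_FILE or 'mysql-bin.000001'))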
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent

mysql_settings = {
    "host": "127.0.0.1",
    "port": 3307,
    "user": "******",
    "passwd": "warlock"
}

stream = BinLogStreamReader(
    connection_settings=mysql_settings,
    server_id=1,
    blocking=True,
    only_schemas=['warlock'],
    only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent],
    resume_stream=True,
)

for event in stream:
    print(event)
    event.dump()  # dump() prints the event itself and returns None

stream.close()
def process_binlog(self):
    stream = BinLogStreamReader(
        connection_settings=self.connectionSettings,
        server_id=self.serverId,
        log_file=self.startFile,
        log_pos=self.startPos,
        only_schemas=self.only_schemas,
        only_tables=self.only_tables,
        resume_stream=True)
    cur = self.connection.cursor()
    tmpFile = 'tmp.%s.%s.tmp' % (
        self.connectionSettings['host'], self.connectionSettings['port']
    )  # to simplify the code, we do not take a file lock on tmpFile
    ftmp = open(tmpFile, "w")
    flagLastEvent = False
    eStartPos = stream.log_pos
    lastPos = stream.log_pos
    try:
        for binlogevent in stream:
            if not self.stopnever:
                if (stream.log_file == self.endFile and
                        stream.log_pos == self.endPos) or (
                        stream.log_file == self.eofFile and
                        stream.log_pos == self.eofPos):
                    flagLastEvent = True
                elif stream.log_file not in self.binlogList:
                    break
                elif (self.endPos and stream.log_file == self.endFile and
                      stream.log_pos > self.endPos) or (
                        stream.log_file == self.eofFile and
                        stream.log_pos > self.eofPos):
                    break
            if isinstance(binlogevent, QueryEvent) and binlogevent.query == 'BEGIN':
                eStartPos = lastPos
            if isinstance(binlogevent, QueryEvent):
                sql = concat_sql_from_binlogevent(cursor=cur,
                                                  binlogevent=binlogevent,
                                                  flashback=self.flashback,
                                                  popPk=self.popPk)
                if sql:
                    print(sql)
            elif type(binlogevent) in (WriteRowsEvent, UpdateRowsEvent,
                                       DeleteRowsEvent):
                for row in binlogevent.rows:
                    sql = concat_sql_from_binlogevent(
                        cursor=cur, binlogevent=binlogevent, row=row,
                        flashback=self.flashback, popPk=self.popPk,
                        eStartPos=eStartPos)
                    if self.flashback:
                        ftmp.write(sql + '\n')
                    else:
                        print(sql)
            if type(binlogevent) not in (RotateEvent, FormatDescriptionEvent):
                lastPos = binlogevent.packet.log_pos
            if flagLastEvent:
                break
        ftmp.close()
        if self.flashback:
            # This doesn't work if the whole file can't fit in memory;
            # it needs to be optimized for large flashback files.
            for line in reversed(open(tmpFile).readlines()):
                print(line.rstrip())
    finally:
        os.remove(tmpFile)
        cur.close()
        stream.close()
    return True
def resetBinLog(self):
    self.execute("RESET MASTER")
    if self.stream is not None:
        self.stream.close()
    self.stream = BinLogStreamReader(connection_settings=self.database)
def main():
    # Read the binlog position; fall back through the known metadata files.
    print('start')
    # log_pos = 51487324
    # log_file = "mysql-bin.000008"
    blposfile = 'binlogpos.meta'
    if os.path.exists(blposfile):
        with open(blposfile) as f:
            log_message = f.readline()
        binlogmessage = json.loads(log_message)
        log_file = binlogmessage['file']
        log_pos = binlogmessage['pos']
    elif os.path.exists('syncer.meta'):
        smpos = open('syncer.meta', 'r').readlines()
        log_file = ((smpos[0].split('=')[1]).split('"')[1]).strip()
        log_pos = (smpos[1].split('=')[1]).strip()
    else:
        print('binlog position file does not exist, exiting!')
        sys.exit()
    stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                server_id=my_server_id,
                                resume_stream=True,
                                blocking=True,
                                freeze_schema=True,
                                only_schemas=only_schema,
                                only_events=[DeleteRowsEvent, WriteRowsEvent,
                                             UpdateRowsEvent],
                                log_file=log_file,
                                log_pos=log_pos)
    for binlogevent in stream:
        # Map the source schema to the target database.
        bdb = DBR[binlogevent.schema]
        # Skip tables that are not in the sync list.
        if binlogevent.table not in alltidbtables:
            continue
        for row in binlogevent.rows:
            if isinstance(binlogevent, WriteRowsEvent):
                # Inject the extra (heterogeneous) columns.
                for v in S:
                    row['values'][v] = S[v]
                # Drop columns the target table does not have.
                tidblist = column_dict[ti_db][binlogevent.table].values()
                for m in list(row['values'].keys()):
                    if m not in tidblist:
                        del row['values'][m]
                # Escape single quotes to avoid broken SQL.
                for va in row['values']:
                    if isinstance(row['values'][va], str):
                        if row['values'][va].find("'") > 0:
                            if row['values'][va].find(r"\'") > 0:
                                pass
                            else:
                                row['values'][va] = row['values'][va].replace(
                                    "'", r"\'")
                                print(row['values'][va])
                template = 'INSERT INTO `{0}`.`{1}`({2}) VALUES ({3});'.format(
                    bdb, binlogevent.table,
                    ', '.join(map(lambda key: '`%s`' % key,
                                  row['values'].keys())),
                    ', '.join(map(lambda v: "'%s'" % v,
                                  row["values"].values())))
                try:
                    con.execute(template)
                    db.commit()
                    # Persist the binlog position.
                    savepos(stream.log_file, stream.log_pos)
                    logger.info("source %s, route %s, %s, current binlog file %s, position %s, executed sql: %s" % (
                        binlogevent.schema, bdb,
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        stream.log_file, stream.log_pos, template))
                except Exception:
                    savepos(stream.log_file, stream.log_pos)
                    logger.info("source %s, route %s, %s, current binlog file %s, position %s, failing sql: %s" % (
                        binlogevent.schema, bdb,
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        stream.log_file, stream.log_pos, template))
                    sys.exit()
            elif isinstance(binlogevent, DeleteRowsEvent):
                print('This is DELETE OPTIONS')
                # Inject the extra (heterogeneous) columns.
                for v in S:
                    row['values'][v] = S[v]
                # Escape quotes to avoid broken SQL.
                for va in row['values']:
                    if isinstance(row['values'][va], str):
                        s = row['values'][va]
                        if s.find('"') > 0:
                            if s.find(r'\"') > 0:
                                pass
                            else:
                                row['values'][va] = s.replace('"', r'\"')
                                print(row['values'][va])
                        if len(re.findall("\'", s)) == 1:
                            row['values'][va] = s.replace("\'", "")
                            print(row['values'][va])
                # Drop columns the target table does not have.
                tidblist = column_dict[ti_db][binlogevent.table].values()
                for m in list(row['values'].keys()):
                    if m not in tidblist:
                        del row['values'][m]
                template = 'DELETE FROM `{0}`.`{1}` WHERE {2} ;'.format(
                    bdb, binlogevent.table,
                    ' AND '.join(map(compare_items, row['values'].items())))
                # Look up the surrogate primary key and delete the row by it.
                sq_sql = template.split('WHERE')[1]
                select_sql = 'SELECT `tidbid` from `%s` WHERE %s' % (
                    binlogevent.table, sq_sql)
                SELECT_SQL = select_sql.replace('= NULL', 'IS NULL')
                con.execute(SELECT_SQL)
                sql_result = con.fetchone()
                if sql_result:
                    tidbid = sql_result[0]
                    del_sql = 'DELETE FROM `%s` where `tidbid`=%s;' % (
                        binlogevent.table, tidbid)
                    try:
                        con.execute(del_sql)
                        db.commit()
                        # Persist the binlog position.
                        savepos(stream.log_file, stream.log_pos)
                        logger.info("source %s, route %s, %s, current binlog file %s, position %s, executed sql: %s" % (
                            binlogevent.schema, bdb,
                            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            stream.log_file, stream.log_pos, del_sql))
                    except Exception:
                        savepos(stream.log_file, stream.log_pos)
                        logger.info("source %s, route %s, %s, current binlog file %s, position %s, failing sql: %s" % (
                            binlogevent.schema, bdb,
                            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            stream.log_file, stream.log_pos, del_sql))
                        sys.exit()
            elif isinstance(binlogevent, UpdateRowsEvent):
                print('This is UPDATE OPTIONS')
                # Inject the extra columns into the before-image.
                for v in S:
                    row['before_values'][v] = S[v]
                # Drop columns the target table does not have.
                tidblist = column_dict[ti_db][binlogevent.table].values()
                for m in list(row['before_values'].keys()):
                    if m not in tidblist:
                        del row['before_values'][m]
                        del row['after_values'][m]
                # Escape quotes to avoid broken SQL.
                for v1 in row:
                    for va in row[v1]:
                        if isinstance(row[v1][va], str):
                            s = row[v1][va]
                            if s.find('"') > 0:
                                if s.find(r'\"') > 0:
                                    pass
                                else:
                                    row[v1][va] = s.replace('"', r'\"')
                                    print(row[v1][va])
                            if len(re.findall("\'", s)) == 1:
                                row[v1][va] = s.replace("\'", "")
                                print(row[v1][va])
                template = 'UPDATE `{0}`.`{1}` set {2} WHERE {3} ;'.format(
                    bdb, binlogevent.table,
                    ','.join(map(compare_items, row["after_values"].items())),
                    ' AND '.join(map(compare_items,
                                     row["before_values"].items())))
                template = template.replace('""', '"')
                mid_sql = template.split('WHERE')[1]
                tidbid_sql = 'select `tidbid` from %s where %s' % (
                    binlogevent.table, mid_sql)
                TIDBID_SQL = tidbid_sql.replace('= NULL', 'IS NULL')
                con.execute(TIDBID_SQL)
                tidbid_result = con.fetchone()
                if tidbid_result:
                    tidbid = tidbid_result[0]
                    u_sql = 'update `%s` set %s WHERE `tidbid` = %s;' % (
                        binlogevent.table,
                        ','.join(map(compare_items,
                                     row["after_values"].items())),
                        tidbid)
                    try:
                        con.execute(u_sql)
                        db.commit()
                        # Persist the binlog position.
                        savepos(stream.log_file, stream.log_pos)
                        logger.info("source %s, route %s, %s, current binlog file %s, position %s, executed sql: %s" % (
                            binlogevent.schema, bdb,
                            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            stream.log_file, stream.log_pos, u_sql))
                    except Exception:
                        savepos(stream.log_file, stream.log_pos)
                        logger.info("source %s, route %s, %s, current binlog file %s, position %s, failing sql: %s" % (
                            binlogevent.schema, bdb,
                            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            stream.log_file, stream.log_pos, u_sql))
                        sys.exit()
    stream.close()
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)
    verify_log_file_exists(mysql_conn, log_file, log_pos)

    server_id = fetch_server_id(mysql_conn)
    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0
    events_skipped = 0
    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1
                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)
            elif catalog_entry:
                initial_binlog_complete = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id,
                    'initial_binlog_complete')
                if (initial_binlog_complete and reader.log_file == log_file
                        and reader.log_pos == log_pos):
                    LOGGER.info(
                        "Skipping event for stream(%s) log_file=%s and log_pos=%s as it was processed last sync",
                        catalog_entry.tap_stream_id, reader.log_file,
                        reader.log_pos)
                    continue

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        state = update_bookmarks(state, binlog_streams_map, reader.log_file,
                                 reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should
        # ultimately terminate upon receiving an EOF packet. There seem to be
        # some cases when a MySQL server will not send one, causing binlog
        # replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    state = update_initial_binlog_complete(binlog_streams_map, state)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def main():
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=os.getenv('ENV_MQ_HOST'),
                                  port=int(os.getenv('ENV_MQ_PORT')),
                                  credentials=pika.PlainCredentials(
                                      os.getenv('ENV_MQ_USER'),
                                      os.getenv('ENV_MQ_PASSWD')),
                                  virtual_host='/'))
    channel = connection.channel()
    channel.queue_declare(queue='default_queue', durable=True)
    file_name = "file_pos.log"
    log_file = ''
    log_pos = 0
    if os.path.isfile(file_name):
        fo = open(file_name, "r")
        file_pos = fo.read()
        fo.close()
        if file_pos != '':
            fp_list = file_pos.split('|')
            log_file = fp_list[0]
            log_pos = int(fp_list[1])
    # server_id is your slave identifier, it should be unique.
    # Set blocking to True if you want to block and wait for the next event
    # at the end of the stream.
    stream = BinLogStreamReader(
        connection_settings=MYSQL_SETTINGS,
        server_id=3,
        blocking=True,
        resume_stream=True,
        log_file=log_file,
        log_pos=log_pos,
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
        only_tables=['system_message'])
    for binlogevent in stream:
        # binlogevent.dump()  # print everything about the event
        for row in binlogevent.rows:
            # record the schema and table names
            event = {
                "schema": binlogevent.schema,
                "table": binlogevent.table,
                # "log_pos": stream.log_pos,
                # "log_file": stream.log_file
            }
            if isinstance(binlogevent, DeleteRowsEvent):
                event["action"] = "delete"
                event["data"] = row["values"]
            elif isinstance(binlogevent, UpdateRowsEvent):
                event["action"] = "update"
                event["data"] = row["after_values"]  # note: after_values, not values
            elif isinstance(binlogevent, WriteRowsEvent):
                event["action"] = "insert"
                event["data"] = row["values"]
            print(json.dumps(event, cls=DateEncoder))
            message = {
                'class': '\\MZ\\Models\\user\\UserModel',
                'method': 'getUserById',
                'data': event
            }
            body = json.dumps(message, cls=DateEncoder)
            channel.basic_publish(exchange='default_ex',
                                  routing_key='default_route',
                                  body=body)
            fo = open(file_name, "w")
            fo.write(stream.log_file + '|' + str(stream.log_pos))
            fo.close()
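# DateEncoder handles the datetime/date values that show up in row images and
# that json.dumps cannot serialize by default; a typical implementation:
import datetime
import json


class DateEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        if isinstance(obj, datetime.date):
            return obj.strftime('%Y-%m-%d')
        return super().default(obj)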
def _binlog_reading(
    self,
    only_tables,
    only_schemas,
    log_file,
    log_pos,
    server_id,
    skip_dmls,
    skip_delete_tables,
    skip_update_tables,
) -> Generator:
    stream = BinLogStreamReader(
        connection_settings=dict(
            host=self.host,
            port=self.port,
            user=self.user,
            passwd=self.password,
        ),
        resume_stream=True,
        blocking=True,
        server_id=server_id,
        only_tables=only_tables,
        only_schemas=only_schemas,
        only_events=self.only_events,
        log_file=log_file,
        log_pos=log_pos,
        fail_on_table_metadata_unavailable=True,
        slave_heartbeat=10,
    )
    for binlog_event in stream:
        if isinstance(binlog_event, QueryEvent):
            schema = binlog_event.schema.decode()
            query = binlog_event.query.lower()
            if "alter" not in query:
                continue
            table, convent_sql = SqlConvert.to_clickhouse(
                schema, query, Settings.cluster_name())
            if not convent_sql:
                continue
            event = {
                "table": table,
                "schema": schema,
                "action": "query",
                "values": {"query": convent_sql},
                "event_unixtime": int(time.time() * 10 ** 6),
                "action_seq": 0,
            }
            yield schema, None, event, stream.log_file, stream.log_pos
        else:
            schema = binlog_event.schema
            table = binlog_event.table
            skip_dml_table_name = f"{schema}.{table}"
            for row in binlog_event.rows:
                if isinstance(binlog_event, WriteRowsEvent):
                    event = {
                        "table": table,
                        "schema": schema,
                        "action": "insert",
                        "values": row["values"],
                        "event_unixtime": int(time.time() * 10 ** 6),
                        "action_seq": 2,
                    }
                elif isinstance(binlog_event, UpdateRowsEvent):
                    if "update" in skip_dmls or skip_dml_table_name in skip_update_tables:
                        continue
                    # An update is emitted as a delete of the before-image
                    # followed by an insert of the after-image.
                    delete_event = {
                        "table": table,
                        "schema": schema,
                        "action": "delete",
                        "values": row["before_values"],
                        "event_unixtime": int(time.time() * 10 ** 6),
                        "action_seq": 1,
                    }
                    yield binlog_event.schema, binlog_event.table, delete_event, stream.log_file, stream.log_pos
                    event = {
                        "table": table,
                        "schema": schema,
                        "action": "insert",
                        "values": row["after_values"],
                        "event_unixtime": int(time.time() * 10 ** 6),
                        "action_seq": 2,
                    }
                elif isinstance(binlog_event, DeleteRowsEvent):
                    if "delete" in skip_dmls or skip_dml_table_name in skip_delete_tables:
                        continue
                    event = {
                        "table": table,
                        "schema": schema,
                        "action": "delete",
                        "values": row["values"],
                        "event_unixtime": int(time.time() * 10 ** 6),
                        "action_seq": 1,
                    }
                else:
                    return
                yield binlog_event.schema, binlog_event.table, event, stream.log_file, stream.log_pos
def main():
    stream = BinLogStreamReader(
        connection_settings=mysql_settings,
        server_id=1,
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
        blocking=True,
        resume_stream=True)
    for binlogevent in stream:
        for row in binlogevent.rows:
            print("%s:%s:" % (binlogevent.schema, binlogevent.table), row)
            if binlogevent.table in ignore_table:
                print("ignore %s" % binlogevent.table, row)
                continue
            if isinstance(binlogevent, DeleteRowsEvent):
                json_data = json.dumps(
                    {
                        "table": binlogevent.table,
                        "action": "delete",
                        "id": row["values"]["id"]
                    },
                    default=date_handler)
                delete_rows_event.apply_async(
                    (binlogevent.table, row["values"]["id"]),
                    queue="q_task_deleterows",
                    exchange="qgswaf",
                    routing_key="key_deleterows")
            elif isinstance(binlogevent, UpdateRowsEvent):
                json_data = json.dumps(
                    {
                        "table": binlogevent.table,
                        "action": "update",
                        "id": row["after_values"]["id"],
                        "doc": row["after_values"]
                    },
                    default=date_handler)
                update_rows_event.apply_async(
                    (binlogevent.table, row["after_values"]["id"],
                     row["after_values"]),
                    queue="q_task_updaterows",
                    exchange="qgswaf",
                    routing_key="key_updaterows")
            elif isinstance(binlogevent, WriteRowsEvent):
                json_data = json.dumps(
                    {
                        "table": binlogevent.table,
                        "action": "insert",
                        "id": row["values"]["id"],
                        "doc": row["values"]
                    },
                    default=date_handler)
                write_rows_event.apply_async(
                    (binlogevent.table, row["values"]["id"], row["values"]),
                    queue="q_task_writerows",
                    exchange="qgswaf",
                    routing_key="key_writerows")
            # aliases = qgswaf_v1
            # res = es.index(index=qgswaf_aliases, doc_type=binlogevent.table,
            #                id=row["values"]["id"], body=row["values"])
            # es.indices.refresh(index="qgswaf")
            print(json_data)
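# date_handler plays the same role as a json default= hook: stringify the
# date/datetime values in row images. A one-line sketch:
def date_handler(obj):
    return obj.isoformat() if hasattr(obj, 'isoformat') else str(obj)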
def __init__(self, extraction_settings, commit_settings, queue_settings):
    self.resume_file = '/tmp/stream.loc'
    self.mysql_settings = {
        "host": extraction_settings["HOST"],
        "port": extraction_settings["PORT"],
        "user": extraction_settings["USER"],
        "passwd": extraction_settings["PASS"]
    }
    self.queue = Queues(queue_settings)
    if os.path.isfile(self.resume_file):
        self.log_filename, self.log_filepos = open(self.resume_file,
                                                   'r').read().split('~')
        self.log_filepos = int(self.log_filepos)
    else:
        self.log_filename, self.log_filepos = None, None
    try:
        self.stream = BinLogStreamReader(
            connection_settings=self.mysql_settings,
            server_id=1,
            blocking=True,
            resume_stream=True,
            only_events=[
                DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent, QueryEvent
            ],
            log_pos=self.log_filepos,
            log_file=self.log_filename)
        for binlogevent in self.stream:
            self.log_filename, self.log_filepos = (self.stream.log_file,
                                                   self.stream.log_pos)
            if isinstance(binlogevent, QueryEvent):
                func_name = str(binlogevent.query).split(' ')[0].lower()
                query = str(binlogevent.query)
                if func_name in ('create', 'alter'):
                    self.queue.submit_job(func_name, [commit_settings, query])
            elif not isinstance(binlogevent, (RotateEvent,
                                              FormatDescriptionEvent,
                                              TableMapEvent, XidEvent)):
                for row in binlogevent.rows:
                    log_position = binlogevent.packet.log_pos
                    table_name = binlogevent.table
                    event_time = binlogevent.timestamp
                    schema_row = binlogevent.schema
                    if isinstance(binlogevent, DeleteRowsEvent):
                        self.queue.submit_job(
                            'delete',
                            [commit_settings, table_name, row["values"]])
                    elif isinstance(binlogevent, WriteRowsEvent):
                        self.queue.submit_job(
                            'insert',
                            [commit_settings, table_name, row["values"]])
                    elif isinstance(binlogevent, UpdateRowsEvent):
                        self.queue.submit_job('update', [
                            commit_settings, table_name,
                            row["before_values"], row["after_values"]
                        ])
    except Exception:
        self.kill(self.log_filename, self.log_filepos)
def create_binlog_stream_reader(config: Dict, log_file: Optional[str],
                                log_pos: Optional[int],
                                gtid_pos: Optional[str]) -> BinLogStreamReader:
    """
    Create an instance of BinLogStreamReader with the right config

    Args:
        config: dictionary of the content of tap config.json
        log_file: binlog file name to start replication from (Optional if using gtid)
        log_pos: binlog pos to start replication from (Optional if using gtid)
        gtid_pos: GTID pos to start replication from (Optional if using log_file & pos)

    Returns:
        Instance of BinLogStreamReader
    """
    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        # generate a random server id for this slave
        server_id = random.randint(1, 2 ** 32)
        LOGGER.info("Using randomly generated server_id=%s", server_id)

    engine = config['engine']

    kwargs = {
        'connection_settings': {},
        'pymysql_wrapper': make_connection_wrapper(config),
        'is_mariadb': connection.MARIADB_ENGINE == engine,
        'server_id': server_id,  # slave server ID
        # this is so this slave appears in SHOW SLAVE HOSTS
        'report_slave': socket.gethostname() or 'pipelinewise',
        'only_events': [WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
    }

    # only fetch events pertaining to the schemas in filter db
    if config.get('filter_db'):
        kwargs['only_schemas'] = config['filter_db'].split(',')

    if config['use_gtid']:
        if not gtid_pos:
            raise ValueError(
                f'gtid_pos is empty "{gtid_pos}"! Cannot start logical replication from empty gtid.'
            )
        LOGGER.info("Starting logical replication from GTID '%s' on engine '%s'",
                    gtid_pos, engine)
        # When using GTID, we want to listen in for GTID events and start
        # from the given gtid pos.
        kwargs['only_events'].extend([GtidEvent, MariadbGtidEvent])
        kwargs['auto_position'] = gtid_pos
    else:
        if not log_file or not log_pos or log_pos < 0:
            raise ValueError(
                f'log file or pos is empty ("{log_file}", "{log_pos}")! '
                f'Cannot start logical replication from invalid log file/pos.')
        LOGGER.info("Starting logical replication from binlog file ['%s', %d]",
                    log_file, log_pos)
        # When not using GTID, we want to listen in for rotate events, and
        # start from the given log position and file.
        kwargs['only_events'].append(RotateEvent)
        kwargs['log_file'] = log_file
        kwargs['log_pos'] = log_pos
        kwargs['resume_stream'] = True

    return BinLogStreamReader(**kwargs)
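# Example calls (the coordinates and GTID value are illustrative only):
#
#   # file/pos based replication:
#   reader = create_binlog_stream_reader(config, 'mysql-bin.000042', 154, None)
#
#   # GTID based replication (config must have use_gtid set):
#   reader = create_binlog_stream_reader(
#       config, None, None, '3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5')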
def binlog_reading(self, server_id, only_tables, only_schemas, log_file,
                   log_pos, insert_nums, interval):
    event_list = []
    sequence = 0
    logger.info('data sync started at %s' %
                (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    logger.info(f'binlog position: {log_file}:{log_pos}')
    pk_dict = {}
    for schema in only_schemas:
        for table in only_tables:
            pk = self.get_primary_key(schema, table)
            name = '{0}.{1}'.format(schema, table)
            if pk:
                pk_dict[name] = pk
                logger.info(f'start syncing: {name}')
            else:
                if self.check_table_exists(schema, table):
                    logger.error(f'table to sync {name} has no primary key '
                                 f'or unique key, exiting....')
                    exit(1)
    stream = BinLogStreamReader(connection_settings=dict(
        host=self.host,
        port=self.port,
        user=self.user,
        passwd=self.password
    ),
        resume_stream=True,
        blocking=True,
        server_id=int(server_id),
        only_tables=only_tables,
        only_schemas=only_schemas,
        only_events=self.only_events,
        log_file=log_file,
        log_pos=log_pos,
        fail_on_table_metadata_unavailable=True,
        slave_heartbeat=10)
    try:
        for binlog_event in stream:
            for row in binlog_event.rows:
                sequence += 1
                event = {'schema': binlog_event.schema,
                         'table': binlog_event.table,
                         'sequence_number': sequence}
                if isinstance(binlog_event, WriteRowsEvent):
                    event['action'] = 'insert'
                    event['values'] = row['values']
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '2'
                elif isinstance(binlog_event, UpdateRowsEvent):
                    # Updates are replayed as inserts of the after-image.
                    event['action'] = 'insert'
                    event['values'] = row['after_values']
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '2'
                elif isinstance(binlog_event, DeleteRowsEvent):
                    event['action'] = 'delete'
                    event['values'] = row['values']
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '1'
                event_list.append(event)
                if len(event_list) == insert_nums or (
                        int(time.time()) - event_list[0]['event_unixtime'] >=
                        interval > 0):
                    repl_status = self.slave_status()
                    log_file = stream.log_file
                    log_pos = stream.log_pos
                    if repl_status:
                        Config.pos_handler.set_log_pos_master(
                            repl_status['Master_Host'],
                            repl_status['Master_Port'],
                            repl_status['Relay_Master_Log_File'],
                            repl_status['Exec_Master_Log_Pos']
                        )
                    data_dict = {}
                    tmp_data = []
                    for items in event_list:
                        table = items['table']
                        schema = items['schema']
                        action = items['action']
                        action_core = items['action_core']
                        data_dict.setdefault(
                            table + schema + action + action_core,
                            []).append(items)
                    for k, v in data_dict.items():
                        tmp_data.append(v)
                    event_list = []
                    sequence = 0
                    yield tmp_data, pk_dict, log_file, log_pos
    except KeyboardInterrupt:
        log_file, log_pos = Config.pos_handler.get_log_pos()
        message = 'sync program exited, current position {0}:{1}'.format(
            log_file, log_pos)
        logger.info(message)
def __init__(self, mapper):
    self.mapper = mapper
    self.stream = BinLogStreamReader(
        connection_settings=mysql_settings,
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent],
        blocking=True,
        resume_stream=True)
def binlog_reading(only_events, conf, debug):
    mysql_conf = {}
    clickhouse_conf = {}
    event_list = []
    sequence = 0
    mysql_server_id = int(cnf['master_server']['server_id'])
    mysql_conf['host'] = cnf['master_server']['host']
    mysql_conf['port'] = int(cnf['master_server']['port'])
    mysql_conf['user'] = cnf['master_server']['user']
    mysql_conf['passwd'] = cnf['master_server']['passwd']
    clickhouse_conf['host'] = cnf['clickhouse_server']['host']
    clickhouse_conf['port'] = int(cnf['clickhouse_server']['port'])
    clickhouse_conf['passwd'] = cnf['clickhouse_server']['passwd']
    clickhouse_conf['user'] = cnf['clickhouse_server']['user']
    only_schemas = cnf['only_schemas']['schemas'].split(",")
    only_tables = cnf['only_tables']['tables'].split(",")
    alarm_mail = cnf['failure_alarm']['alarm_mail'].split(",")
    skip_dmls_all = cnf['skip_dmls_all']['skip_type'].split(",")
    skip_delete_tb_name = cnf['skip_dmls_sing']['skip_delete_tb_name'].split(",")
    skip_update_tb_name = cnf['skip_dmls_sing']['skip_update_tb_name'].split(",")
    insert_nums = int(cnf['bulk_insert_nums']['insert_nums'])
    interval = int(cnf['bulk_insert_nums']['interval'])
    logger.info('data sync started at %s' %
                (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    if logtoredis:
        redis = my_redis()
        logger.info("reading the binlog position from Redis")
    else:
        logger.info("reading the binlog position from a file")
        redis = mark_log()
    db = my_db()
    log_file, log_pos = redis.get_log_pos()
    if not (log_file and log_pos):
        logger.error("failed to fetch the binlog position, exiting....")
        exit(1)
    pk_dict = {}
    for schema in only_schemas:
        for table in only_tables:
            pk = db.get_pri(schema, table)
            name = "{0}.{1}".format(schema, table)
            if pk:
                pk_dict[name] = pk
            else:
                if db.check_table_exists(schema, table):
                    logger.error("table to sync %s has no primary key or "
                                 "unique key, exiting...." % name)
                    exit(1)
    # Collect the tables that have a unique key.
    unique_key_dict = {}
    for schema in only_schemas:
        for table in only_tables:
            unique = db.get_unique(schema, table)
            if unique:
                name = "{0}.{1}".format(schema, table)
                unique_key_dict[name] = unique
    message = "reading binlog: {0}:{1}".format(log_file, log_pos)
    ch_info = "syncing to clickhouse server {0}:{1}".format(
        cnf['clickhouse_server']['host'], cnf['clickhouse_server']['port'])
    repl_info = "{0}:{1}".format(cnf['master_server']['host'],
                                 cnf['master_server']['port'])
    alarm_info = "{0} schemas: {1} tables: {2} sync to clickhouse server {3} failed".format(
        repl_info, only_schemas, only_tables, socket.gethostname())
    logger.info('syncing data from server %s' % repl_info)
    logger.info(message)
    logger.info(ch_info)
    logger.info('schemas synced to clickhouse: %s' % only_schemas)
    logger.info('tables synced to clickhouse: %s' % only_tables)
    stream = BinLogStreamReader(connection_settings=mysql_conf,
                                resume_stream=True,
                                blocking=True,
                                server_id=mysql_server_id,
                                only_tables=only_tables,
                                only_schemas=only_schemas,
                                only_events=only_events,
                                log_file=log_file,
                                log_pos=int(log_pos),
                                fail_on_table_metadata_unavailable=True,
                                slave_heartbeat=10,
                                freeze_schema=True)
    try:
        for binlogevent in stream:
            for row in binlogevent.rows:
                sequence += 1
                new_event = False
                event = {
                    "schema": binlogevent.schema,
                    "table": binlogevent.table
                }
                event['sequence_number'] = sequence
                if isinstance(binlogevent, WriteRowsEvent):
                    event["action"] = "insert"
                    event["values"] = row["values"]
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '2'
                elif isinstance(binlogevent, UpdateRowsEvent):
                    event["action"] = "insert"
                    event["values"] = row["after_values"]
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '2'
                    db_table = "{0}.{1}".format(binlogevent.schema,
                                                binlogevent.table)
                    if db_table in unique_key_dict.keys():
                        # If the key column changed, also emit a delete for
                        # the before-image.
                        if row["after_values"][pk_dict[db_table][0]] != row[
                                "before_values"][pk_dict[db_table][0]]:
                            new_event = {
                                "schema": binlogevent.schema,
                                "table": binlogevent.table
                            }
                            new_event['sequence_number'] = sequence
                            new_event["action"] = "delete"
                            new_event["values"] = row["before_values"]
                            new_event['event_unixtime'] = int(time.time())
                            new_event['action_core'] = '1'
                elif isinstance(binlogevent, DeleteRowsEvent):
                    event["action"] = "delete"
                    event["values"] = row["values"]
                    event['event_unixtime'] = int(time.time())
                    event['action_core'] = '1'
                event_list.append(event)
                if new_event:
                    event_list.append(new_event)
                # Commit when the batch is full, or when the first queued
                # event is older than the interval. Note that the timeout path
                # still needs one more incoming event to trigger: with a batch
                # size of 100, 90 queued events and no new traffic will never
                # be committed. There is no clean fix for this short of
                # separating producer and consumer.
                if len(event_list) >= insert_nums or (
                        int(time.time()) - event_list[0]['event_unixtime'] >=
                        interval and interval > 0):
                    repl_status = db.slave_status()
                    log_file = stream.log_file
                    log_pos = stream.log_pos
                    if repl_status:
                        redis.set_log_pos('master',
                                          repl_status['Master_Host'],
                                          repl_status['Master_Port'],
                                          repl_status['Relay_Master_Log_File'],
                                          repl_status['Exec_Master_Log_Pos'])
                    data_dict = {}
                    tmp_data = []
                    for items in event_list:
                        table = items['table']
                        schema = items['schema']
                        action = items['action']
                        action_core = items['action_core']
                        data_dict.setdefault(
                            table + schema + action + action_core,
                            []).append(items)
                    for k, v in data_dict.items():
                        tmp_data.append(v)
                    status = data_to_ck(tmp_data, alarm_info, alarm_mail,
                                        debug, skip_dmls_all,
                                        skip_delete_tb_name,
                                        skip_update_tb_name, pk_dict,
                                        only_schemas, **clickhouse_conf)
                    if status:
                        redis.set_log_pos('slave', log_file, log_pos)
                        del event_list
                        event_list = []
                        sequence = 0
                        gc.collect()
                    else:
                        log_file, log_pos = redis.get_log_pos()
                        message = "SQL execution failed, current binlog position {0}:{1}".format(
                            log_file, log_pos)
                        logger.error(message)
                        exit(1)
    except KeyboardInterrupt:
        log_file, log_pos = redis.get_log_pos()
        message = "sync program exited, current position {0}:{1}".format(
            log_file, log_pos)
        logger.info(message)
    finally:
        stream.close()
def start_syncer(cfg):
    MYSQL_SETTINGS = {
        "host": cfg['db_mysql_ip'],
        "port": int(cfg['db_mysql_port']),
        "user": "******",
        "passwd": "canal@Hopson2018",
    }
    logging.info("MYSQL_SETTINGS=%s", MYSQL_SETTINGS)
    batch = {}
    row_event_count = 0
    for o in cfg['sync_table'].split(','):
        batch[o.split('$')[0]] = []
    try:
        stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                    only_events=(QueryEvent, DeleteRowsEvent,
                                                 UpdateRowsEvent,
                                                 WriteRowsEvent),
                                    server_id=9999,
                                    blocking=True,
                                    resume_stream=True,
                                    log_file=cfg['binlogfile'],
                                    log_pos=int(cfg['binlogpos']))
        print('\nSync Configuration:')
        print('-------------------------------------------------------------')
        print('batch_size=', cfg['batch_size'])
        print('batch_timeout=', cfg['batch_timeout'])
        print('batch_row_event=', cfg['batch_row_event'])
        print('apply_timeout=', cfg['apply_timeout'])
        print('sleep_time=', cfg['sleep_time'])
        print('')
        start_time = datetime.datetime.now()
        apply_time = datetime.datetime.now()
        for binlogevent in stream:
            # Periodically re-read the sync configuration from the database
            # and register batches for any newly added tables.
            if get_seconds(apply_time) >= cfg['apply_timeout']:
                cfg['db_mysql'].close()
                cfg['db_doris'].close()
                cfg = get_config_from_db(cfg['sync_tag'])
                apply_time = datetime.datetime.now()
                print("\033[0;31;40mapply config success\033[0m")
                for o in cfg['sync_table'].split(','):
                    if batch.get(o.split('$')[0]) is None:
                        batch[o.split('$')[0]] = []
                        print("\033[0;31;40mbatch['{}'] init success!\033[0m".
                              format(o.split('$')[0]))
            if isinstance(binlogevent, RotateEvent):
                current_master_log_file = binlogevent.next_binlog
                print("Next binlog file: %s" % current_master_log_file)
                cfg['binlogfile'] = current_master_log_file
            row_event_count = row_event_count + 1
            if isinstance(binlogevent, QueryEvent):
                cfg['binlogpos'] = binlogevent.packet.log_pos
                event = {
                    "schema": bytes.decode(binlogevent.schema),
                    "query": binlogevent.query.lower()
                }
                # Replay DDL (create/drop/alter/truncate) on Doris.
                if ('create' in event['query'] or 'drop' in event['query']
                        or 'alter' in event['query']
                        or 'truncate' in event['query']):
                    ddl = gen_ddl_sql(event['query'])
                    event['table'] = get_obj_name(event['query']).lower()
                    if check_sync(cfg, event) and ddl is not None:
                        if check_doris_tab_exists(cfg, event) == 0:
                            create_doris_table(cfg, event)
            if isinstance(binlogevent, (DeleteRowsEvent, UpdateRowsEvent,
                                        WriteRowsEvent)):
                for row in binlogevent.rows:
                    cfg['binlogpos'] = binlogevent.packet.log_pos
                    event = {
                        "schema": binlogevent.schema.lower(),
                        "table": binlogevent.table.lower()
                    }
                    if check_sync(cfg, event):
                        if isinstance(binlogevent, DeleteRowsEvent):
                            event["action"] = "delete"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'delete',
                                      'sql': sql
                                  })
                        elif isinstance(binlogevent, UpdateRowsEvent):
                            event["action"] = "update"
                            event["after_values"] = row["after_values"]
                            event["before_values"] = row["before_values"]
                            sql = gen_sql(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'insert',
                                      'sql': sql
                                  })
                        elif isinstance(binlogevent, WriteRowsEvent):
                            event["action"] = "insert"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'insert',
                                      'sql': sql
                                  })
            # Flush when any per-table batch is full ...
            if check_batch_full_data(batch, cfg):
                print("\033[0;31;40mexec full batch...\033[0m")
                doris_exec_multi(cfg, batch, 'F')
                for o in cfg['sync_table'].split(','):
                    if len(batch[o.split('$')[0]]) % cfg['batch_size'] == 0:
                        batch[o.split('$')[0]] = []
                start_time = datetime.datetime.now()
                row_event_count = 0
            # ... or when the batch timeout expires ...
            if get_seconds(start_time) >= cfg['batch_timeout']:
                if check_batch_exist_data(batch):
                    print("\033[0;31;40mtimeout:{},start_time:{}\033[0m".
                          format(get_seconds(start_time), start_time))
                    doris_exec_multi(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                start_time = datetime.datetime.now()
                row_event_count = 0
            # ... or every batch_row_event row events.
            if (row_event_count > 0
                    and row_event_count % cfg['batch_row_event'] == 0):
                if check_batch_exist_data(batch):
                    print("\033[0;31;40mrow_event_count={}\033[0m".format(
                        row_event_count))
                    doris_exec_multi(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                start_time = datetime.datetime.now()
                row_event_count = 0
    except Exception as e:
        traceback.print_exc()
        write_ckpt(cfg)
    finally:
        stream.close()
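
# The syncer calls write_ckpt(cfg) when it fails, so a restart can resume
# from cfg['binlogfile'] / cfg['binlogpos']. A minimal sketch of such a
# checkpoint pair, assuming a local JSON file; the real write_ckpt is not
# shown here and may persist the position elsewhere.
import json


def write_ckpt(cfg, path='syncer.ckpt'):
    # Persist only the resume position.
    with open(path, 'w') as f:
        json.dump({'binlogfile': cfg['binlogfile'],
                   'binlogpos': cfg['binlogpos']}, f)


def read_ckpt(path='syncer.ckpt'):
    with open(path) as f:
        return json.load(f)
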
def process_binlog(self):
    sqlList = []
    stream = BinLogStreamReader(connection_settings=self.conn_setting,
                                server_id=self.server_id,
                                log_file=self.start_file,
                                log_pos=self.start_pos,
                                only_schemas=self.only_schemas,
                                only_tables=self.only_tables,
                                resume_stream=True)
    flag_last_event = False
    e_start_pos, last_pos = stream.log_pos, stream.log_pos
    # To simplify the code, we do not use flock for the tmp file.
    tmp_file = create_unique_file(
        '%s.%s' % (self.conn_setting['host'], self.conn_setting['port']))
    with temp_open(tmp_file, "w") as f_tmp, self.connection as cursor:
        for binlog_event in stream:
            if not self.stop_never:
                try:
                    event_time = datetime.datetime.fromtimestamp(
                        binlog_event.timestamp)
                except OSError:
                    event_time = datetime.datetime(1980, 1, 1, 0, 0)
                if (stream.log_file == self.end_file and stream.log_pos == self.end_pos) or \
                        (stream.log_file == self.eof_file and stream.log_pos == self.eof_pos):
                    flag_last_event = True
                elif event_time < self.start_time:
                    if not (isinstance(binlog_event, RotateEvent) or
                            isinstance(binlog_event, FormatDescriptionEvent)):
                        last_pos = binlog_event.packet.log_pos
                    continue
                elif (stream.log_file not in self.binlogList) or \
                        (self.end_pos and stream.log_file == self.end_file and stream.log_pos > self.end_pos) or \
                        (stream.log_file == self.eof_file and stream.log_pos > self.eof_pos) or \
                        (event_time >= self.stop_time):
                    break
                # else:
                #     raise ValueError('unknown binlog file or position')
            if isinstance(binlog_event,
                          QueryEvent) and binlog_event.query == 'BEGIN':
                e_start_pos = last_pos
            if isinstance(binlog_event, QueryEvent) and not self.only_dml:
                sql = concat_sql_from_binlog_event(cursor=cursor,
                                                   binlog_event=binlog_event,
                                                   flashback=self.flashback,
                                                   no_pk=self.no_pk)
                if sql:
                    sqlList.append(sql)
            elif is_dml_event(binlog_event) and event_type(
                    binlog_event) in self.sql_type:
                for row in binlog_event.rows:
                    sql = concat_sql_from_binlog_event(
                        cursor=cursor,
                        binlog_event=binlog_event,
                        no_pk=self.no_pk,
                        row=row,
                        flashback=self.flashback,
                        e_start_pos=e_start_pos)
                    if self.flashback:
                        f_tmp.write(sql + '\n')
                    else:
                        sqlList.append(sql)
            if not (isinstance(binlog_event, RotateEvent) or
                    isinstance(binlog_event, FormatDescriptionEvent)):
                last_pos = binlog_event.packet.log_pos
            if flag_last_event:
                break
        stream.close()
        # Flush the tmp file before reading it back; temp_open removes the
        # file when the with-block exits, so the rollback SQL has to be
        # collected here, inside the block.
        f_tmp.close()
        if self.flashback:
            return self.get_rollback_sql(filename=tmp_file)
    return sqlList
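
# In flashback mode the rollback statements are written to the tmp file one
# per line, in binlog order, but they must be emitted newest-first so that
# later changes are undone before earlier ones. A minimal sketch of the
# get_rollback_sql called above, assuming the file fits in memory (the
# original implementation is not shown and may stream the file in blocks):
def get_rollback_sql(self, filename):
    with open(filename) as f:
        lines = f.readlines()
    # Undo the most recent change first.
    return [line.rstrip('\n') for line in reversed(lines)]
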
def __init__(
        self,
        connection_settings,
        server_id,
        log_file=None,
        log_pos=None,
        schemas=None,
        tables=None,
        tables_prefixes=None,
        blocking=None,
        resume_stream=None,
        nice_pause=None,
        binlog_position_file=None,
        callbacks={},
):
    super().__init__(callbacks=callbacks)
    self.connection_settings = connection_settings
    self.server_id = server_id
    self.log_file = log_file
    self.log_pos = log_pos
    # Normalise the schema/table specs; an empty result is stored as None.
    dbs = TableProcessor.extract_dbs(schemas,
                                     Util.join_lists(tables, tables_prefixes))
    self.schemas = dbs if dbs else None
    self.tables = None if tables is None else TableProcessor.extract_tables(
        tables)
    self.tables_prefixes = None if tables_prefixes is None else \
        TableProcessor.extract_tables(tables_prefixes)
    self.blocking = blocking
    self.resume_stream = resume_stream
    self.nice_pause = nice_pause
    self.binlog_position_file = binlog_position_file

    def log_list(title, items):
        # Dump a raw/normalised spec list to the log.
        logging.info("%s len()=%d", title, 0 if items is None else len(items))
        if items is not None:
            for item in items:
                logging.info(item)

    log_list("raw dbs list.", schemas)
    log_list("normalised dbs list.", self.schemas)
    log_list("raw tables list.", tables)
    log_list("normalised tables list.", self.tables)
    log_list("raw tables-prefixes list.", tables_prefixes)
    log_list("normalised tables-prefixes list.", self.tables_prefixes)

    if not isinstance(self.server_id, int):
        raise Exception(
            "Please specify server_id of src server as int. Ex.: --src-server-id=1"
        )

    self.binlog_stream = BinLogStreamReader(
        # MySQL server - data source
        connection_settings=self.connection_settings,
        server_id=self.server_id,
        # we are interested in reading CH-repeatable events only; other
        # types (QueryEvent, GtidEvent, RotateEvent, TableMapEvent,
        # XidEvent, ...) are intentionally not subscribed to
        only_events=[
            DeleteRowsEvent,
            UpdateRowsEvent,
            WriteRowsEvent,
        ],
        only_schemas=self.schemas,
        # in case we have any prefixes - this means we need to listen to all
        # tables within the specified schemas and filter by prefix ourselves
        only_tables=self.tables if not self.tables_prefixes else None,
        log_file=self.log_file,
        log_pos=self.log_pos,
        # if True, ALTER TABLE is not supported, but reading is faster
        freeze_schema=True,
        blocking=False,
        resume_stream=self.resume_stream,
    )
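
# only_tables cannot express prefix matching, so whenever tables_prefixes is
# set the reader above subscribes to every table in the schemas, which means
# the prefix check has to happen per event. A minimal sketch of such a check;
# the method name _table_matches is hypothetical:
def _table_matches(self, table):
    if self.tables is not None and table in self.tables:
        return True
    if self.tables_prefixes is not None:
        # Accept the table if any configured prefix matches.
        return any(table.startswith(prefix) for prefix in self.tables_prefixes)
    # No explicit table list and no prefixes: accept everything.
    return self.tables is None
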
def run(self):
    with open(SAVE_LOC) as f:
        pos = json.load(f)

    stream = BinLogStreamReader(
        # TODO parse out from config.py or something
        connection_settings={
            'host': MYSQL_HOST,
            'port': MYSQL_PORT,
            'user': MYSQL_USER,
            'passwd': MYSQL_PW
        },
        server_id=10,  # arbitrary
        # only care about this database currently
        only_schemas=[NT_DB],
        # these tables in the database
        only_tables=[
            "nyaa_torrents", "nyaa_statistics", "sukebei_torrents",
            "sukebei_statistics"
        ],
        # from our save file
        resume_stream=True,
        log_file=pos['log_file'],
        log_pos=pos['log_pos'],
        # skip the other stuff like table mapping
        only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
        # if we're at the head of the log, block until something happens
        # note it'd be nice to block async-style instead, but the mainline
        # binlogreader is synchronous. there is an (unmaintained?) fork
        # using aiomysql if anybody wants to revive that.
        blocking=True)

    log.info(f"reading binlog from {stream.log_file}/{stream.log_pos}")

    for event in stream:
        # save the pos of the stream and timestamp with each message, so we
        # can commit in the other thread, and keep track of process latency
        pos = (stream.log_file, stream.log_pos, event.timestamp)

        with stats.pipeline() as s:
            s.incr('total_events')
            s.incr(f"event.{event.table}.{type(event).__name__}")
            s.incr('total_rows', len(event.rows))
            s.incr(f"rows.{event.table}.{type(event).__name__}",
                   len(event.rows))
            # XXX not a "timer", but we get a histogram out of it
            s.timing(f"rows_per_event.{event.table}.{type(event).__name__}",
                     len(event.rows))

        if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
            if event.table == "nyaa_torrents":
                index_name = "nyaa"
            else:
                index_name = "sukebei"
            if type(event) is WriteRowsEvent:
                for row in event.rows:
                    self.write_buf.put(
                        (pos, reindex_torrent(row['values'], index_name)),
                        block=True)
            elif type(event) is UpdateRowsEvent:
                # UpdateRowsEvent includes the old values too, but we don't care
                for row in event.rows:
                    self.write_buf.put(
                        (pos, reindex_torrent(row['after_values'],
                                              index_name)),
                        block=True)
            elif type(event) is DeleteRowsEvent:
                # ok, bye
                for row in event.rows:
                    self.write_buf.put((pos, delet_this(row, index_name)),
                                       block=True)
            else:
                raise Exception(f"unknown event {type(event)}")
        elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
            if event.table == "nyaa_statistics":
                index_name = "nyaa"
            else:
                index_name = "sukebei"
            if type(event) is WriteRowsEvent:
                for row in event.rows:
                    self.write_buf.put(
                        (pos, reindex_stats(row['values'], index_name)),
                        block=True)
            elif type(event) is UpdateRowsEvent:
                for row in event.rows:
                    self.write_buf.put(
                        (pos, reindex_stats(row['after_values'], index_name)),
                        block=True)
            elif type(event) is DeleteRowsEvent:
                # uh ok. Assume that the torrent row will get deleted later,
                # which will clean up the entire es "torrent" document
                pass
            else:
                raise Exception(f"unknown event {type(event)}")
        else:
            # note: event.table, not the stats pipeline `s`
            raise Exception(f"unknown table {event.table}")
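
# The queue above carries (pos, document) pairs so the consumer thread can
# persist the binlog position only after the corresponding document has
# actually been written to elasticsearch. A minimal sketch of that consumer
# side, reusing the SAVE_LOC json format read at startup; index_document and
# flush_every are hypothetical names, not part of the snippet above.
import json


def commit_loop(write_buf, flush_every=100):
    written = 0
    while True:
        (log_file, log_pos, timestamp), doc = write_buf.get(block=True)
        index_document(doc)  # hypothetical ES write
        written += 1
        if written % flush_every == 0:
            # Checkpoint only positions whose documents are already indexed.
            with open(SAVE_LOC, 'w') as f:
                json.dump({'log_file': log_file, 'log_pos': log_pos}, f)
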
def start(self):
    # server_id is your slave identifier, it should be unique.
    # set blocking to True if you want to block and wait for the next event
    # at the end of the stream
    self.modules_manager.generate_modules_instances()
    if hasattr(self.transaction_manager, 'last_request_sent'):
        stream = BinLogStreamReader(
            connection_settings=self.MYSQL_SETTINGS,
            only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent],
            server_id=self.server_id,
            only_schemas=self.databases,
            only_tables=self.tables,
            blocking=True,
            resume_stream=True,
            log_pos=self.transaction_manager.last_request_sent)
    else:
        stream = BinLogStreamReader(
            connection_settings=self.MYSQL_SETTINGS,
            server_id=self.server_id,
            only_schemas=self.databases,
            only_tables=self.tables,
            only_events=[DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent],
            blocking=True)
    self.logger.info("Connected to the database at %s:%d with user %s" %
                     (self.MYSQL_SETTINGS.get("host"),
                      self.MYSQL_SETTINGS.get("port"),
                      self.MYSQL_SETTINGS.get("user")))
    for binlogevent in stream:
        for row in binlogevent.rows:
            event = {
                "schema": binlogevent.schema,
                "table": binlogevent.table
            }
            if isinstance(binlogevent, DeleteRowsEvent):
                self.logger.debug("Delete event detected.")
                event["action"] = "delete"
                document_id_to_remove = row["values"][
                    self.indexes_label[binlogevent.table]]
                self.transaction_manager.write_last_request_log_pos(
                    stream, binlogevent)
                self.modules_manager.remove_data_all_modules(
                    index=binlogevent.schema,
                    doc_type=binlogevent.table,
                    id=document_id_to_remove)
                self.transaction_manager.number_of_delete_request += 1
                self.transaction_manager.write_last_success_log_pos(
                    stream, binlogevent)
                self.logger.info(
                    "Deleted document for id {0} in table {1}".format(
                        document_id_to_remove, binlogevent.table))
            elif isinstance(binlogevent, UpdateRowsEvent):
                self.logger.debug("Update event detected.")
                event["action"] = "update"
                event = dict(
                    list(event.items()) + list(row["after_values"].items()))
                document_id_to_update = row["before_values"][
                    self.indexes_label[binlogevent.table]]
                updated_body = row["after_values"]
                # Keep only the configured fields, if a field list is set.
                if self.tables_fields[binlogevent.table] is not None:
                    updated_body = {
                        field: updated_body[field]
                        for field in self.tables_fields[binlogevent.table]
                    }
                self.transaction_manager.write_last_request_log_pos(
                    stream, binlogevent)
                self.modules_manager.update_data_all_modules(
                    index=binlogevent.schema,
                    doc_type=binlogevent.table,
                    id=document_id_to_update,
                    doc=updated_body)
                self.transaction_manager.number_of_update_request += 1
                self.transaction_manager.write_last_success_log_pos(
                    stream, binlogevent)
                self.logger.info(
                    "Document for id {0} in table {2} updated to {1}".format(
                        document_id_to_update, row["after_values"],
                        binlogevent.table))
            elif isinstance(binlogevent, WriteRowsEvent):
                self.logger.debug("Insert event detected.")
                event["action"] = "insert"
                event = dict(
                    list(event.items()) + list(row["values"].items()))
                document_id_to_add = row["values"][
                    self.indexes_label[binlogevent.table]]
                document_to_add = row["values"]
                if self.tables_fields[binlogevent.table] is not None:
                    document_to_add = {
                        field: document_to_add[field]
                        for field in self.tables_fields[binlogevent.table]
                    }
                self.transaction_manager.write_last_request_log_pos(
                    stream, binlogevent)
                self.modules_manager.insert_data_all_modules(
                    index=binlogevent.schema,
                    doc_type=binlogevent.table,
                    doc=document_to_add,
                    id=document_id_to_add)
                self.transaction_manager.write_last_success_log_pos(
                    stream, binlogevent)
                self.transaction_manager.number_of_create_request += 1
                self.logger.info(
                    "Added document {0} from table {1} to elasticsearch".
                    format(row["values"], binlogevent.table))
            sys.stdout.flush()
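
# write_last_request_log_pos is called just before each elasticsearch call
# and write_last_success_log_pos just after it, so on restart the syncer can
# tell whether the last request was still in flight when it died. A minimal
# sketch of that bracketing, assuming positions live in a json file; the real
# TransactionManager is not shown here.
import json


class TransactionManager:

    def __init__(self, path='positions.json'):
        self.path = path

    def _write(self, key, stream, binlogevent):
        # stream is accepted to match the call sites above, even though this
        # sketch only needs the event's log position.
        try:
            with open(self.path) as f:
                state = json.load(f)
        except FileNotFoundError:
            state = {}
        state[key] = binlogevent.packet.log_pos
        with open(self.path, 'w') as f:
            json.dump(state, f)

    def write_last_request_log_pos(self, stream, binlogevent):
        self._write('last_request_sent', stream, binlogevent)

    def write_last_success_log_pos(self, stream, binlogevent):
        self._write('last_success', stream, binlogevent)
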
def start_incr_syncer(cfg):
    log("\033[0;36;40mstart incr sync...\033[0m")
    MYSQL_SETTINGS = {
        "host": cfg['db_mysql_ip'],
        "port": int(cfg['db_mysql_port']),
        "user": "******",
        "passwd": "canal@Hopson2018",
    }
    logging.info("MYSQL_SETTINGS=%s", MYSQL_SETTINGS)
    batch = {}
    types = {}
    pks = {}
    row_event_count = 0
    # Only tables with a primary key are synced.
    for o in cfg['sync_table'].split(','):
        evt = {
            'schema': o.split('$')[0].split('.')[0],
            'table': o.split('$')[0].split('.')[1]
        }
        if check_tab_exists_pk(cfg, evt) > 0:
            batch[o.split('$')[0]] = []
            types[o.split('$')[0]] = get_col_type(cfg, evt)
            pks[o.split('$')[0]] = True
        else:
            log("\033[0;31;40mTable:{}.{} has no primary key, skipping sync...\033[0m"
                .format(evt['schema'], evt['table']))
            pks[o.split('$')[0]] = False
    try:
        stream = BinLogStreamReader(connection_settings=MYSQL_SETTINGS,
                                    only_events=(QueryEvent, DeleteRowsEvent,
                                                 UpdateRowsEvent,
                                                 WriteRowsEvent),
                                    server_id=9999,
                                    blocking=True,
                                    resume_stream=True,
                                    log_file=cfg['binlogfile'],
                                    log_pos=int(cfg['binlogpos']),
                                    auto_position=False)
        start_time = datetime.datetime.now()
        apply_time = datetime.datetime.now()
        for binlogevent in stream:
            # Periodically re-read the sync configuration, drop batches for
            # removed tables and bootstrap any newly added ones.
            if get_seconds(apply_time) >= cfg['apply_timeout']:
                cfg = get_config_from_db(cfg['sync_tag'])
                apply_time = datetime.datetime.now()
                log("\033[1;36;40mapply config success\033[0m")
                write_ckpt(cfg)
                batch = {
                    k: v
                    for k, v in batch.items() if k in
                    [tab.split('$')[0] for tab in cfg['sync_table'].split(',')]
                }
                pks = {}
                for o in cfg['sync_table'].split(','):
                    evt = {
                        'schema': o.split('$')[0].split('.')[0],
                        'table': o.split('$')[0].split('.')[1]
                    }
                    if batch.get(o.split('$')[0]) is None:
                        if check_tab_exists_pk(cfg, evt) > 0:
                            log("\033[0;36;40mfind table:{}.{} auto config sync...\033[0m"
                                .format(evt['schema'], evt['table']))
                            batch[o.split('$')[0]] = []
                            types[o.split('$')[0]] = get_col_type(cfg, evt)
                            pks[o.split('$')[0]] = True
                            if check_ck_tab_exists(cfg, evt) == 0:
                                create_ck_table(cfg, evt)
                                full_sync(cfg, evt)
                        else:
                            log("\033[0;36;40mTable:{}.{} has no primary key, skipping sync...\033[0m"
                                .format(evt['schema'], evt['table']))
                            pks[o.split('$')[0]] = False
                    else:
                        pks[o.split('$')[0]] = check_tab_exists_pk(cfg,
                                                                   evt) > 0
            if isinstance(binlogevent, RotateEvent):
                cfg['binlogfile'] = binlogevent.next_binlog
            row_event_count = row_event_count + 1
            if isinstance(binlogevent, QueryEvent):
                cfg['binlogpos'] = binlogevent.packet.log_pos
                event = {
                    "schema": bytes.decode(binlogevent.schema),
                    "query": binlogevent.query.lower()
                }
                # Replay DDL and bootstrap the ClickHouse table if needed.
                if ('create' in event['query'] or 'drop' in event['query']
                        or 'alter' in event['query']
                        or 'truncate' in event['query']):
                    ddl = gen_ddl_sql(event['query'])
                    event['table'] = get_obj_name(event['query']).lower()
                    if check_sync(cfg, event, pks) and ddl is not None:
                        if check_ck_tab_exists(cfg, event) == 0:
                            create_ck_table(cfg, event)
                            full_sync(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']] = []
                            types[event['schema'] + '.' +
                                  event['table']] = get_col_type(cfg, event)
            if isinstance(binlogevent, (DeleteRowsEvent, UpdateRowsEvent,
                                        WriteRowsEvent)):
                for row in binlogevent.rows:
                    cfg['binlogpos'] = binlogevent.packet.log_pos
                    event = {
                        "schema": binlogevent.schema.lower(),
                        "table": binlogevent.table.lower()
                    }
                    if check_sync(cfg, event, pks):
                        typ = types[event['schema'] + '.' + event['table']]
                        if check_ck_tab_exists(cfg, event) == 0:
                            create_ck_table(cfg, event)
                            full_sync(cfg, event)
                            batch[event['schema'] + '.' +
                                  event['table']] = []
                            types[event['schema'] + '.' +
                                  event['table']] = get_col_type(cfg, event)
                        if isinstance(binlogevent, DeleteRowsEvent):
                            event["action"] = "delete"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event, typ)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'delete',
                                      'sql': sql
                                  })
                        elif isinstance(binlogevent, UpdateRowsEvent):
                            event["action"] = "update"
                            event["after_values"] = row["after_values"]
                            event["before_values"] = row["before_values"]
                            sql = gen_sql(cfg, event, typ)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'update',
                                      'sql': sql
                                  })
                        elif isinstance(binlogevent, WriteRowsEvent):
                            event["action"] = "insert"
                            event["data"] = row["values"]
                            sql = gen_sql(cfg, event, typ)
                            batch[event['schema'] + '.' +
                                  event['table']].append({
                                      'event': 'insert',
                                      'sql': sql
                                  })
            # Flush on a full batch, on timeout, or every batch_row_event
            # events; checkpoint after each flush.
            if check_batch_full_data(batch, cfg):
                log("\033[0;31;40mprocess full batch...\033[0m")
                ck_exec(cfg, batch, 'Full')
                for o in cfg['sync_table'].split(','):
                    if len(batch[o.split('$')[0]]) % cfg['batch_size'] == 0:
                        batch[o.split('$')[0]] = []
                start_time = datetime.datetime.now()
                row_event_count = 0
                write_ckpt(cfg)
            if get_seconds(start_time) >= cfg['batch_timeout']:
                if check_batch_exist_data(batch):
                    log("\033[0;31;40mtimeout:{},start_time:{}\033[0m".format(
                        get_seconds(start_time), start_time))
                    ck_exec(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                start_time = datetime.datetime.now()
                row_event_count = 0
                write_ckpt(cfg)
            if (row_event_count > 0
                    and row_event_count % cfg['batch_row_event'] == 0):
                if check_batch_exist_data(batch):
                    log("\033[0;31;40mrow_event_count={}\033[0m".format(
                        row_event_count))
                    ck_exec(cfg, batch)
                    for o in cfg['sync_table'].split(','):
                        batch[o.split('$')[0]] = []
                start_time = datetime.datetime.now()
                row_event_count = 0
                write_ckpt(cfg)
    except Exception as e:
        traceback.print_exc()
        write_ckpt(cfg)
    finally:
        stream.close()
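
# The syncers above measure batch age and config age with
# get_seconds(start_time). A minimal sketch of that helper, assuming it
# returns the elapsed wall-clock seconds since the given datetime (the
# original implementation is not shown):
import datetime


def get_seconds(start_time):
    # Elapsed seconds since start_time.
    return (datetime.datetime.now() - start_time).total_seconds()
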