def generate_cross_dict():
    """Build a lookup of qyer visit counters keyed by record id.

    Reads ``id``, ``beentocounts`` and ``plantocounts`` from
    ``detail_total_qyer_20171209a`` and maps every id to a
    ``(beentocounts, plantocounts)`` tuple of JSON strings.

    Returns:
        dict: ``{id: (beentocounts_json, plantocounts_json)}``. A counter that
        is falsy in the database, or that converts to 0, is stored as
        ``'{}'``; any other counter is stored as ``{"qyer": <int>}``.
    """

    def _count_to_json(raw):
        # Falsy database values are treated as "missing" (-1); both -1 and 0
        # collapse to an empty JSON object.
        count = int(raw) if raw else -1
        if count in (-1, 0):
            return '{}'
        return json.dumps({'qyer': count})

    cross_dict = {}
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute('''SELECT id, beentocounts, plantocounts FROM detail_total_qyer_20171209a;''')
            for line in cursor.fetchall():
                cross_dict[line[0]] = (_count_to_json(line[1]),
                                       _count_to_json(line[2]))
        finally:
            cursor.close()
    finally:
        # Hand the connection back to the pool even if the query fails.
        conn.close()
    print("[cross dict finished]")
    return cross_dict
def update_seek_table(table_name, update_time):
    """Store ``update_time`` as the seek value for ``table_name``.

    Executes a ``REPLACE INTO data_insert_seek`` with the two values, logs the
    pair at debug level, then commits and releases the cursor and connection.
    """
    seek_row = (table_name, update_time)
    connection = service_platform_pool.connection()
    db_cursor = connection.cursor()
    db_cursor.execute(
        '''REPLACE INTO data_insert_seek VALUES (%s, %s);''', seek_row)
    message = "[update seek table][table_name: {}][update_time: {}]".format(
        *seek_row)
    logger.debug(message)
    connection.commit()
    db_cursor.close()
    connection.close()
def insert_db(table_name, data):
    """Bulk ``REPLACE`` rows into ``table_name``.

    Each item of ``data`` supplies ``(source, source_id, hotel_url)``;
    ``city_id`` and ``country_id`` are always written as the literal string
    ``'NULL'``. The number of submitted rows and the driver's return value are
    logged at info level.
    """
    statement = '''REPLACE INTO {} (source, source_id, city_id, country_id, hotel_url) VALUES (%s, %s, 'NULL', 'NULL', %s)'''.format(table_name)
    connection = service_platform_pool.connection()
    db_cursor = connection.cursor()
    affected = db_cursor.executemany(statement, data)
    connection.commit()
    db_cursor.close()
    connection.close()
    summary = "[add task][table: {}][count: {}][insert: {}]".format(
        table_name, len(data), affected)
    logger.info(summary)
def insert_db(data):
    """Bulk ``INSERT IGNORE`` station rows into ``NewStation.station_src``.

    Each item of ``data`` must provide the ten column values in the order
    listed in the statement. The number of submitted rows and the driver's
    return value are logged at info level.
    """
    statement = '''INSERT IGNORE INTO NewStation.station_src (station, src_city, src_country, map_info, station_city_map_info, status, belong_city_id, station_code_from_europeRail, src_city_code, src_station_code) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
    connection = service_platform_pool.connection()
    db_cursor = connection.cursor()
    affected = db_cursor.executemany(statement, data)
    connection.commit()
    db_cursor.close()
    connection.close()
    summary = '[total: {}][insert: {}]'.format(len(data), affected)
    logger.info(summary)
def update_db(data):
    """Bulk-update ``city_id`` on ``view_data.attr_qyer_1216``.

    Args:
        data: sequence of ``(city_id, source_city_id)`` pairs; every row whose
            ``source_city_id`` matches gets the paired ``city_id``.

    The cursor and the pooled connection are always released, including when
    the update fails.
    """
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            # NOTE(review): `line` and `_count` are not defined in this
            # function; presumably they are module-level names set by the
            # calling script - confirm, otherwise these prints raise NameError.
            print('start', line)
            res = cursor.executemany(
                '''UPDATE view_data.attr_qyer_1216 SET city_id = %s WHERE source_city_id = %s;''',
                data)
            print('end', line, len(data), res, _count)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
def update_test():
    """Ad-hoc check: write a small JSON value into ``info`` for one image row.

    Updates the row of ``image_wanle_huantaoyou_20171023a`` whose ``file_name``
    is ``23498847e5190ef6849a5bfcf0e506d2.png`` and commits.
    """
    import json  # function-scope import keeps this scratch helper self-contained

    query_sql = '''UPDATE image_wanle_huantaoyou_20171023a SET info = %s WHERE file_name = %s;'''
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            # A dict cannot be bound as a single SQL parameter value; send the
            # JSON text instead. `info` is read with JSON path syntax by
            # `test()` in this file, so it is presumably a JSON column.
            cursor.execute(query_sql,
                           (json.dumps({"a": "1"}),
                            '23498847e5190ef6849a5bfcf0e506d2.png'))
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
def test():
    """Print the ``p_hash`` value of every row in the image table.

    Selects ``file_name``, ``source``, ``sid`` and ``info->'$.p_hash'`` from
    ``image_wanle_huantaoyou_20171023a`` and prints the last selected column
    of each row.
    """
    connection = service_platform_pool.connection()
    db_cursor = connection.cursor()
    db_cursor.execute(
        '''SELECT file_name, source, sid, info->'$.p_hash' as p_hash FROM image_wanle_huantaoyou_20171023a;''')
    rows = db_cursor.fetchall()
    for row in rows:
        p_hash = row[-1]
        print(str(p_hash))
    db_cursor.close()
    connection.close()
def get_seek(table_name):
    """Return the stored seek value for ``table_name``.

    Args:
        table_name: value matched against ``task_name`` in
            ``data_insert_id_seek``.

    Returns:
        The ``seek`` column of the matching row, or ``0`` when no row exists.
    """
    # NOTE(review): this reads `data_insert_id_seek`, whereas
    # `update_seek_table` in this file writes `data_insert_seek` - confirm the
    # two tables are meant to differ.
    local_conn = service_platform_pool.connection()
    try:
        local_cursor = local_conn.cursor()
        try:
            # Bind the name as a parameter instead of formatting it into the
            # statement, so quoting is left to the driver.
            local_cursor.execute(
                '''SELECT seek FROM data_insert_id_seek WHERE task_name=%s;''',
                (table_name,))
            _res = local_cursor.fetchone()
        finally:
            local_cursor.close()
    finally:
        local_conn.close()
    return _res[0] if _res is not None else 0
def update_sql(data):
    """Reset ``status`` to 0 for the given ``pic_detect_task`` ids.

    Args:
        data: iterable of ids. Each id is sent as its string form, matching
            the quoting the statement previously applied by hand.

    An empty ``data`` is a no-op: ``IN ()`` is not valid SQL, so no statement
    is issued and zero counts are logged.
    """
    ids = [str(each_id) for each_id in data]
    if not ids:
        logger.info("[total: {}][execute: {}]".format(0, 0))
        return

    # One placeholder per id; the driver handles quoting and escaping.
    sql = '''UPDATE pic_detect_task SET status = 0 WHERE id in ({});'''.format(
        ','.join(['%s'] * len(ids)))
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            _res = cursor.execute(sql, ids)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    logger.info("[total: {}][execute: {}]".format(len(ids), _res))
def get_task_info():
    """Run ``get_per_table_task_info`` for every ``list_total_qyer*`` table.

    Table names are read from ``information_schema.TABLES`` for the
    ``ServicePlatform`` schema and processed in ascending order of their tag,
    i.e. the text after the last underscore.
    """
    connection = service_platform_pool.connection()
    db_cursor = connection.cursor()
    db_cursor.execute(
        '''SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'ServicePlatform';''')

    # Tables must be handled in the order of their tags.
    qyer_tables = [
        row[0] for row in db_cursor.fetchall()
        if row[0].startswith('list_total_qyer')
    ]
    qyer_tables.sort(key=lambda name: name.rsplit('_', 1)[-1])

    db_cursor.close()
    connection.close()

    for table in qyer_tables:
        get_per_table_task_info(table)
def insert_task_data(data, _count):
    """Insert rows into ``pic_detect_task``, trying up to three times.

    Args:
        data: sequence of ``(city_id, city_grade, poi_id, pic_name)`` tuples.
        _count: running total supplied by the caller; only used in the log
            message.

    Failures are logged and retried, never raised. The cursor and the pooled
    connection of every attempt are released, including failed attempts.
    """
    insert_sql = '''INSERT IGNORE INTO pic_detect_task (city_id, city_grade, poi_id, pic_name) VALUES (%s, %s, %s, %s);'''

    max_retry_times = 3
    while max_retry_times:
        max_retry_times -= 1
        try:
            conn = service_platform_pool.connection()
            try:
                cursor = conn.cursor()
                try:
                    _insert_count = cursor.executemany(insert_sql, data)
                    conn.commit()
                finally:
                    cursor.close()
            finally:
                conn.close()
            logger.debug(
                "[insert data][now count: {}][insert data: {}][insert_ignore_count: {}]"
                .format(_count, len(data), _insert_count))
            break
        except Exception as exc:
            # Best effort: log and let the loop try again.
            logger.exception(msg="[run sql error]", exc_info=exc)
def insert_data(limit=1000):
    """Copy the next batch of rows from every ``view_final_*`` view.

    For each view in the ``ServicePlatform`` schema (processed in tag order),
    the stored seek value is read with ``get_seek``, up to ``limit`` rows whose
    time column (``time_key[view_type]``) is >= that value are ``REPLACE``-d
    into ``<final_database>.<type>_final_<tag>``, and the largest time value
    of the batch is saved with ``update_seek_table``.

    Args:
        limit: maximum number of rows copied per view in one call.

    Raises:
        TypeError: if a view name carries a type other than ``hotel``,
            ``attr``, ``rest`` or ``total``.
    """
    logger.debug("start insert data")
    logger.debug("get all view name")
    local_conn = service_platform_pool.connection()
    try:
        local_cursor = local_conn.cursor()
        try:
            local_cursor.execute(
                '''SELECT TABLE_NAME FROM information_schema.VIEWS WHERE TABLE_SCHEMA = 'ServicePlatform';''')
            view_names = [row[0] for row in local_cursor.fetchall()]
        finally:
            local_cursor.close()

        # Views must be processed in the order of their tags.
        view_list = sorted(
            (name for name in view_names if name.startswith('view_final_')),
            key=lambda name: name.split('_')[-1])

        for each_view_final in view_list:
            start = time.time()
            u_time = get_seek(each_view_final)
            try:
                _, _, view_type, _, view_tag = each_view_final.split('_')
            except ValueError:
                # Name does not have exactly five "_"-separated parts.
                logger.error('[Unknown View Final: {}]'.format(each_view_final))
                continue

            create_table(view_type, view_tag)
            to_table_name = "{}_final_{}".format(view_type, view_tag)
            if view_type not in ('hotel', 'attr', 'rest', 'total'):
                raise TypeError("Unknown Type: {}".format(view_type))

            # Find the newest time value in the batch; it becomes the next seek.
            local_cursor = local_conn.cursor()
            try:
                update_time_sql = '''SELECT {0} FROM {1} WHERE {0} >= '{2}' ORDER BY {0} LIMIT {3};'''.format(
                    time_key[view_type], each_view_final, u_time, limit)
                line_count = local_cursor.execute(update_time_sql)
                if line_count == 0:
                    continue
                final_update_time = max(
                    row[0] for row in local_cursor.fetchall())
            finally:
                # Also runs on `continue`, so the cursor is never leaked.
                local_cursor.close()

            # Replace the same batch into the final table.
            local_cursor = local_conn.cursor()
            try:
                query_sql = '''REPLACE INTO {1}.{2} SELECT * FROM {3} WHERE {0} >= '{4}' ORDER BY {0} LIMIT {5};'''.format(
                    time_key[view_type], final_database, to_table_name,
                    each_view_final, u_time, limit)
                try:
                    replace_count = local_cursor.execute(query_sql)
                except Exception as e:
                    logger.exception(
                        msg="[table_name: {}][error_sql: {}]".format(
                            each_view_final, query_sql),
                        exc_info=e)
                    continue
                local_conn.commit()
            finally:
                local_cursor.close()

            update_seek_table(each_view_final, final_update_time)
            logger.debug(
                "[insert data][to: {}][from: {}][update_time: {}][final_update_time: {}][limit: {}][line_count: {}]["
                "replace_count: {}][takes: {}]".format(
                    to_table_name, each_view_final, u_time, final_update_time,
                    limit, line_count, replace_count, time.time() - start))
    finally:
        # Return the connection to the pool even when a view fails.
        local_conn.close()
def _select_duplicate_address_rows(task_table, id_column, map_infos):
    """Fetch supplement rows for the legal map_info values of one chunk.

    Returns ``(row_count, rows)`` where ``rows`` are ready-to-insert
    ``(task_table, source, sid, other_info_json)`` tuples. When no value passes
    ``map_info_legal`` the database is not queried and ``(0, [])`` is returned.
    """
    legal_map_infos = [str(x) for x in map_infos if map_info_legal(x)]
    if not legal_map_infos:
        # `IN ()` is not valid SQL, so there is nothing to ask the database.
        return 0, []

    query_sql = '''SELECT source, {}, address FROM {} WHERE map_info IN ({});'''.format(
        id_column, task_table, ",".join(['%s'] * len(legal_map_infos)))
    _conn = service_platform_pool.connection()
    try:
        _cursor = _conn.cursor()
        try:
            _res = _cursor.execute(query_sql, legal_map_infos)
            fetched = _cursor.fetchall()
        finally:
            _cursor.close()
    finally:
        _conn.close()

    rows = [(task_table, line[0], line[1],
             json.dumps({'address': line[2]}))
            for line in fetched if is_legal(line[2])]
    return _res, rows


def _insert_supplement_rows(data):
    """Insert ``data`` into ``supplement_field`` and return the driver's count."""
    _conn = service_platform_pool.connection()
    try:
        _cursor = _conn.cursor()
        try:
            _res = _cursor.executemany(
                '''INSERT IGNORE INTO supplement_field (`table_name`, `type`, `source`, `sid`, `other_info`) VALUES (%s, 'map_info', %s, %s, %s)''',
                data)
            _conn.commit()
        finally:
            _cursor.close()
    finally:
        _conn.close()
    return _res


def insert_error_map_info_task(duplicate_map_info_set, task_table, task_type):
    """Queue ``map_info`` supplement tasks for rows sharing a duplicated map_info.

    The duplicated values are processed in chunks of 500. For each chunk the
    rows of ``task_table`` whose ``map_info`` is in the chunk are selected, rows
    whose address fails ``is_legal`` are dropped, and the rest are inserted
    into ``supplement_field`` with type ``'map_info'``. Both the select and the
    insert are attempted up to four times; failures are logged, not raised.

    Args:
        duplicate_map_info_set: collection of duplicated map_info values.
        task_table: table to look the rows up in.
        task_type: ``'hotel'`` (id column ``source_id``) or one of ``'attr'``,
            ``'shop'``, ``'rest'``, ``'total'`` (id column ``id``). Any other
            value is logged and ignored.
    """
    # TODO: the qyer table is currently small enough for a full scan; this
    # method needs to change once other tables are added.
    logger.debug("[total duplicate map_info set][len: {}]".format(
        len(duplicate_map_info_set)))

    if task_type == 'hotel':
        id_column = 'source_id'
    elif task_type in ('attr', 'shop', 'rest', 'total'):
        id_column = 'id'
    else:
        # Nothing can be selected for an unknown type; bail out before any
        # connection is opened.
        logger.error("[get duplicate map_info task][unknown task_type: {}]".format(
            task_type))
        return

    _count = 0
    for duplicate_map_info in chunks(list(duplicate_map_info_set), 500):
        _count += 500
        logger.debug("[duplicate map_info][now: {}]".format(_count))

        data = []
        retry_times = 4
        while retry_times:
            retry_times -= 1
            try:
                _res, data = _select_duplicate_address_rows(
                    task_table, id_column, duplicate_map_info)
                logger.debug(
                    "[get duplicate map_info task][len: {}]".format(_res))
                break
            except Exception as exc:
                logger.exception(
                    msg="[get duplicate map_info task error][retry_times: {}]".format(
                        retry_times),
                    exc_info=exc)

        if not data:
            # Nothing to insert for this chunk.
            continue

        insert_retry_times = 4
        while insert_retry_times:
            insert_retry_times -= 1
            try:
                _res = _insert_supplement_rows(data)
                logger.debug(
                    "[get duplicate map_info task][len: {}]".format(_res))
                break
            except Exception as exc:
                logger.exception(
                    msg="[insert supplement filed][retry_times: {}]".format(
                        insert_retry_times),
                    exc_info=exc)
def insert_data(limit=1000):
    """Copy the next batch of rows from every ``images_*`` table.

    For each table in the ``ServicePlatform`` schema (processed in tag order),
    the stored seek id is read with ``get_seek``, up to ``limit`` rows with a
    larger ``id`` are ``REPLACE``-d into
    ``<final_database>.<image_type>_images_final_<tag>``, and the largest id of
    the batch is saved with ``update_seek_table``.

    Args:
        limit: maximum number of rows copied per table in one call.

    Raises:
        TypeError: if a table name carries a type other than ``hotel``,
            ``attr``, ``rest`` or ``total``.
    """
    logger.debug("start insert data")
    logger.debug("get all table name")

    # Columns copied from the source table into the final table, per layout.
    poi_columns = ('file_name, source, sid, url, pic_size, bucket_name, '
                   'url_md5, pic_md5, `use`, part, date')
    hotel_columns = ('source, source_id, pic_url, pic_md5, part, hotel_id, '
                     'status, update_date, size, flag, file_md5')

    local_conn = service_platform_pool.connection()
    try:
        local_cursor = local_conn.cursor()
        try:
            local_cursor.execute(
                '''SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'ServicePlatform';''')
            table_names = [row[0] for row in local_cursor.fetchall()]
        finally:
            local_cursor.close()

        # Tables must be processed in the order of their tags.
        table_list = sorted(
            (name for name in table_names if name.startswith('images_')),
            key=lambda name: name.split('_')[-1])

        for each_table_final in table_list:
            start = time.time()
            seek = get_seek(each_table_final)
            try:
                _, task_type, _, task_tag = each_table_final.split('_')
            except ValueError:
                # Name does not have exactly four "_"-separated parts.
                logger.error('[Unknown Task Final: {}]'.format(each_table_final))
                continue

            create_table(task_type, task_tag)
            if task_type not in ('hotel', 'attr', 'rest', 'total'):
                raise TypeError("Unknown Type: {}".format(task_type))
            to_table_name = "{}_images_final_{}".format(
                image_type_dict[task_type], task_tag)

            # Find the largest id in the batch; it becomes the next seek.
            local_cursor = local_conn.cursor()
            try:
                get_id_sql = '''SELECT id FROM {0} WHERE id > '{1}' ORDER BY id LIMIT {2};'''.format(
                    each_table_final, seek, limit)
                line_count = local_cursor.execute(get_id_sql)
                if line_count == 0:
                    continue
                final_seek = max(row[0] for row in local_cursor.fetchall())
            finally:
                # Also runs on `continue`, so the cursor is never leaked.
                local_cursor.close()

            # Replace the same batch into the final table. The statement ends
            # with exactly one ';' (the attr/rest/total variant used to carry
            # a stray second one).
            columns = hotel_columns if task_type == 'hotel' else poi_columns
            query_sql = '''REPLACE INTO {0}.{1} ({5}) SELECT {5} FROM {2} WHERE id > {3} ORDER BY id LIMIT {4};'''.format(
                final_database, to_table_name, each_table_final, seek, limit,
                columns)
            local_cursor = local_conn.cursor()
            try:
                try:
                    replace_count = local_cursor.execute(query_sql)
                except Exception as e:
                    logger.exception(
                        msg="[table_name: {}][error_sql: {}]".format(
                            each_table_final, query_sql),
                        exc_info=e)
                    continue
                local_conn.commit()
            finally:
                local_cursor.close()

            update_seek_table(each_table_final, final_seek)
            logger.debug(
                "[insert data][to: {}][from: {}][seek: {}][final_seek: {}][limit: {}][line_count: {}]["
                "replace_count: {}][takes: {}]".format(
                    to_table_name, each_table_final, seek, final_seek, limit,
                    line_count, replace_count, time.time() - start))
    finally:
        # Return the connection to the pool even when a table fails.
        local_conn.close()
def _flush_hotel_batch(table_name, source, source_ids, docs, count):
    """Replace one batch of hotel documents in Mongo and log progress.

    Existing documents with the same ``source`` and a ``source_id`` in
    ``source_ids`` are deleted before ``docs`` is inserted. Errors are logged,
    not raised.
    """
    try:
        hotel_collections.delete_many({
            "source": source,
            "source_id": {
                "$in": source_ids
            }
        })
        hotel_collections.insert_many(docs)
    except BulkWriteError as bwe:
        logger.exception(
            msg="[bwe error][bwe details: {}]".format(bwe.details))
    except Exception as exc:
        logger.exception(msg="[insert data error]", exc_info=exc)
    logger.debug("[insert_data][table: {}][count: {}]".format(
        table_name, count))


def insert_hotel_data():
    """Export hotel rows from the ``detail_hotel_*`` tables into Mongo.

    Considers every ``ServicePlatform`` table whose name starts with
    ``detail_hotel_``, whose last "_" segment is not ``test`` and sorts after
    ``20170928d``. Each row gets a GeoJSON-style ``loc`` point built from its
    ``map_info`` ("lng,lat") and is written to ``hotel_collections`` in batches
    of 10000, replacing documents with the same source / source_id.

    Rows whose ``map_info`` cannot be parsed into two floats are logged and
    skipped instead of aborting the export.
    """
    logger.debug("start prepare mongo data")
    logger.debug("get all table name")
    local_conn = service_platform_pool.connection()
    try:
        local_cursor = local_conn.cursor()
        try:
            local_cursor.execute(
                '''SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'ServicePlatform';''')
            table_names = [row[0] for row in local_cursor.fetchall()]
        finally:
            local_cursor.close()
    finally:
        local_conn.close()

    def _wanted(name):
        tag = name.split('_')[-1]
        return (name.startswith('detail_hotel_') and tag != "test"
                and tag > "20170928d")

    # Tables must be processed in order of their last two name segments.
    table_list = sorted(
        (name for name in table_names if _wanted(name)),
        key=lambda name: name.split('_')[-2:])

    for each_table_name in table_list:
        _count = 0
        data = []
        delete_sid = []
        # NOTE(review): a batch is deleted using the source of its last row
        # only; this assumes one source per table - confirm.
        delete_source = ""
        sql = '''SELECT source, source_id, hotel_name, hotel_name_en, map_info, address, source_city_id FROM {};'''.format(each_table_name)
        for each_data in MysqlSource(data_db,
                                     table_or_query=sql,
                                     size=10000,
                                     is_table=False,
                                     is_dict_cursor=True):
            try:
                lng, lat = each_data['map_info'].split(',')
                coordinates = [float(lng), float(lat)]
            except (AttributeError, ValueError):
                # None, or not exactly two numeric parts: no point can be built.
                logger.warning(
                    "[skip invalid map_info][table: {}][source_id: {}][map_info: {}]".format(
                        each_table_name, each_data['source_id'],
                        each_data['map_info']))
                continue

            each_data["loc"] = {"type": "Point", "coordinates": coordinates}
            data.append(each_data)
            delete_source = each_data["source"]
            delete_sid.append(each_data["source_id"])
            _count += 1
            if _count % 10000 == 0:
                _flush_hotel_batch(each_table_name, delete_source, delete_sid,
                                   data, _count)
                data = []
                delete_sid = []

        if data:
            # Write whatever is left after the last full batch.
            _flush_hotel_batch(each_table_name, delete_source, delete_sid,
                               data, _count)