def add_open_time_filter(_v):
    if not is_legal(_v):
        return False
    try:
        _open_time = fix_daodao_open_time(_v)
        if is_legal(_open_time):
            return True
    except Exception:
        # save unrecognized open-time strings for later review
        insert_unknown_keywords('{}_opentime'.format(poi_type), _v)
        logger.debug("[unknown open time][data: {}]".format(_v))
    return False
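# Usage sketch: `add_open_time_filter` is built to serve as the `special_filter`
# callback of `get_key.get_key_by_priority_or_default`, as `poi_insert_data`
# does below; the candidate dict here is hypothetical.
def _demo_pick_open_time():
    opentime_by_source = {'daodao': '周一至周五 09:00-18:00', 'qyer': 'NULL'}
    return get_key.get_key_by_priority_or_default(
        opentime_by_source, 'opentime', special_filter=add_open_time_filter)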
def _update_per_uid_img(_uid, _poi_type, _old_img_list, _old_first_img, _official):
    global data
    # init source/sid set
    if not is_legal(_old_img_list):
        _old_img_list = ''
    if not is_legal(_old_first_img):
        _old_first_img = ''
    _s_sid_set = get_source_sid_set(_uid)
    _img_list, _first_img = get_img(s_sid_set=_s_sid_set,
                                    poi_type=_poi_type,
                                    old_img=_old_img_list,
                                    old_first_img=_old_first_img,
                                    is_official=(int(_official) == 1))
    logger.debug(
        "[get img info][uid: {}][img_list_len: {}][img_list: {}][first_img: {}]".format(
            _uid, len(_img_list), _img_list, _first_img))
    # rows are sorted by uid; once a uid is processed, queue its image update
    data.append((_first_img, _img_list, _uid))
def get_tagid_dict(_poi_type):
    _dict = {}
    if _poi_type == 'attr':
        sql = '''SELECT tag, tag_en, original_tag FROM chat_attraction_tagS ORDER BY id;'''
    elif _poi_type == 'rest':
        sql = '''SELECT tag, tag_en, original_tag FROM chat_restaurant_tagS ORDER BY id;'''
    elif _poi_type == 'shop':
        sql = '''SELECT tag, tag_en, original_tag FROM chat_shopping_tagS ORDER BY id;'''
    else:
        raise TypeError("Unknown Type: {}".format(_poi_type))
    conn = base_data_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    cursor.execute(sql)
    for line in cursor.fetchall():
        tag = line['tag']
        tag_en = line['tag_en']
        original_tag = line['original_tag']
        _tags_set = set()
        for each_tag in original_tag.split('|'):
            if is_legal(each_tag):
                _tags_set.add(each_tag)
        _dict[tuple(_tags_set)] = (tag, tag_en)
    cursor.close()
    conn.close()
    return _dict
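# Shape sketch (hypothetical entries): the returned dict maps a tuple of original
# tag spellings to the normalized (tag, tag_en) pair, e.g.
# {('博物馆', '博物院'): ('博物馆', 'Museum')}; `get_norm_tag` below matches a raw
# tag by membership in the tuple key:
def _demo_lookup_norm_tag(raw_tag, tagid_dict):
    for original_tags, (tag, tag_en) in tagid_dict.items():
        if raw_tag in original_tags:
            return tag, tag_en
    return None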
def to_data(table_name):
    global offset
    select_sql = '''SELECT source, source_id, others_info FROM detail_hotel_{0}'''.format(table_name)
    try:
        _data = []
        for result in MysqlSource(db_config=config,
                                  table_or_query=select_sql,
                                  size=10000,
                                  is_table=False,
                                  is_dict_cursor=True):
            offset += 1
            others_info = result['others_info']
            if not others_info:
                continue
            others_info = json.loads(others_info)
            if 'first_img' not in others_info:
                continue
            first_img_url = others_info['first_img']
            if not is_legal(first_img_url):
                continue
            md5_str = encode(first_img_url)
            source = result['source']
            source_id = result['source_id']
            _data.append((source, source_id, md5_str))
            # flush in batches of 1000
            if len(_data) == 1000:
                insert_db(table_name, _data)
                _data = []
        insert_db(table_name, _data)
    except Exception as exc:
        logger.exception(msg="[exception during insert]", exc_info=exc)
def check_latin(string):
    if not is_legal(string):
        return False
    # Latin if at least 90% of the characters are Latin letters or punctuation
    # (the right single quote counts as punctuation here)
    latin_chars = [
        x for x in string
        if toolbox.Common.is_latin_and_punctuation(x) or x == '’'
    ]
    return len(latin_chars) / len(string) >= 0.9
def belong_city_id_insert():
    """
    Match stations to cities via belong_city_id.
    :return:
    """
    sql = '''SELECT station_id,
                    {0}.map_info              AS station_map_info,
                    {0}.station_city_map_info AS station_city_map_info,
                    city.id                   AS city_id,
                    city.map_info             AS city_map_info
             FROM {0}
               JOIN base_data.city ON {0}.belong_city_id = base_data.city.id;'''.format(
        STATION_SRC_TABLE)
    _count = 0
    data = []
    for station_id, station_map_info, station_city_map_info, city_id, city_map_info in fetchall(
            new_station_pool, sql):
        _count += 1
        if is_legal(station_map_info):
            distance = get_distance(city_map_info, station_map_info)
            map_info = station_map_info
        elif is_legal(station_city_map_info):
            # fall back to the coordinates of the station's city
            distance = get_distance(city_map_info, station_city_map_info)
            map_info = station_city_map_info
        else:
            distance = None
            map_info = None
        # (station_id, map_info, city_id, city_map_info, distance, info)
        data.append((station_id, map_info, city_id, city_map_info, distance,
                     'matched via belong_city_id'))
        if len(data) == 1000:
            insert_db(data)
            data = []
    if data:
        insert_db(data)
def update_each_tag_id():
    tag_id = tag2id()
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT id, norm_tagid FROM {};'''.format(task_table))
    data = []
    _count = 0
    for _id, _tag_id in cursor.fetchall():
        if is_legal(_tag_id):
            tag_id_set = set()
            for each in _tag_id.split('|'):
                tag_id_set.add(tag_id.get(each))
            small_tag = '|'.join(filter(lambda x: is_legal(x), tag_id_set))
            big_tag = get_tag(small_tag)
            data.append((small_tag, big_tag, _id))
            _count += 1
            # flush in batches of 1000
            if len(data) == 1000:
                logger.debug("[mk data][poi_type: {}][len: {}]".format(
                    poi_type, _count))
                res = cursor.executemany(
                    'update base_data.{} set tag_id=%s, tagB=%s where id=%s'.format(
                        task_table), data)
                data = []
                logger.debug(
                    "[update tag id][table_name: {}][update count: {}]".format(
                        task_table, res))
    res = cursor.executemany(
        'update base_data.{} set tag_id=%s, tagB=%s where id=%s'.format(
            task_table), data)
    logger.debug("[update tag id][table_name: {}][update count: {}]".format(
        task_table, res))
    logger.debug("[mk data finished][poi_type: {}][len: {}]".format(
        poi_type, _count))
    conn.commit()
    cursor.close()
    conn.close()
def get_old_info_dict():
    sql = '''SELECT id, source, name, name_en, map_info, address, plantocounts,
                    beentocounts, ranking, grade, commentcounts, imgurl,
                    introduction, opentime
             FROM poi_merge.attr
             WHERE source='qyer';'''
    __dict = defaultdict(dict)
    _count = 0
    for line in MysqlSource(poi_ori_config,
                            table_or_query=sql,
                            size=5000,
                            is_table=False,
                            is_dict_cursor=True):
        _count += 1
        if _count % 3000 == 0:
            logger.info("[load old data info][count: {}]".format(_count))
        sid = line['id']
        for key_name, is_strict, num_check in check_name:
            if is_strict:
                # strict fields are stored verbatim for exact comparison
                __dict[sid][key_name] = line[key_name]
            else:
                # non-strict fields are reduced to a legality flag
                legal_res = is_legal(line[key_name])
                if not num_check:
                    check_res = legal_res
                else:
                    try:
                        # numeric fields: -1 and 0 count as "no data"
                        check_res = legal_res and int(line[key_name]) not in (-1, 0)
                    except Exception:
                        check_res = False
                __dict[sid][key_name] = check_res
    logger.info("[load old data info finished][count: {}]".format(_count))
    return __dict
def init_id2tag():
    conn = base_data_pool.connection()
    sql = '''SELECT id, Stag FROM {};'''.format(tag_b)
    cursor = conn.cursor()
    cursor.execute(sql)
    _dict = defaultdict(set)
    for _id, _l_s_tag in cursor.fetchall():
        for each in _l_s_tag.split('|'):
            if is_legal(each):
                _dict[_id].add(each)
    cursor.close()
    conn.close()
    return _dict
def city_pair(city_ids, config):
    country_dict, city_dict, map_dict = generate_dict(config)
    pair = set()
    seen_pairs = set()
    for c_id in city_ids:
        current_city = city_dict[c_id]
        all_city = country_dict[current_city['country_id']]
        for ac in all_city:
            if c_id == ac['id']:
                continue
            pf = '{0}-{1}'.format(c_id, ac['id'])
            if pf in seen_pairs:
                continue
            seen_pairs.add(pf)
            src_cid = c_id
            dst_cid = ac['id']
            src_map_info = map_dict.get(src_cid)
            dst_map_info = map_dict.get(dst_cid)
            if not is_legal(src_map_info) or not is_legal(dst_map_info):
                continue
            logger.info('%s: %s - %s: %s' % (src_cid, src_map_info,
                                             dst_cid, dst_map_info))
            # swap coordinate order for the Directions API
            src_map_info_list = src_map_info.split(',')
            src_map_info = ','.join([src_map_info_list[1], src_map_info_list[0]])
            dst_map_info_list = dst_map_info.split(',')
            dst_map_info = ','.join([dst_map_info_list[1], dst_map_info_list[0]])
            if not is_map_info_legal(src_map_info) or not is_map_info_legal(dst_map_info):
                logger.warning(
                    "[error map info][src_cid: {}][dst_cid: {}][src_m_info: {}][dst_m_info: {}]".format(
                        src_cid, dst_cid, src_map_info, dst_map_info))
                continue
            google_url = ('http://maps.google.cn/maps/api/directions/json'
                          '?origin={}&destination={}&mode=driving&region=es'
                          '&type=interCity&a1={}&a2={}').format(
                              src_map_info, dst_map_info, src_cid, dst_cid)
            logger.info("[new task][url: {}]".format(google_url))
            pair.add((src_cid, dst_cid, google_url))
    return pair
def get_norm_tag(tag_id, _poi_type):
    global tag_dict
    if _poi_type not in tag_dict:
        logger.debug("[init tagid][poi_type: {}]".format(_poi_type))
        tag_dict[_poi_type] = get_tagid_dict(_poi_type)
    norm_tags = []
    norm_tag_ens = []
    unknown = []
    # convert traditional to simplified Chinese before matching
    lines = tradition2simple(tag_id).decode()
    for raw_tag in split_pattern.split(lines):
        tag_ok = False
        tag = raw_tag.strip()
        for t_set, values in tag_dict[_poi_type].items():
            if tag in t_set:
                norm_tags.append(values[0])
                norm_tag_ens.append(values[1])
                tag_ok = True
                break
        if not tag_ok and is_legal(tag):
            unknown.append(tag)
    norm_tag = '|'.join(sorted(norm_tags))
    norm_tag_en = '|'.join(sorted(norm_tag_ens))
    return norm_tag, norm_tag_en, unknown
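# Usage sketch (hypothetical raw tag string): `get_norm_tag` returns the sorted,
# '|'-joined normalized tags, their English forms, and the leftovers that matched
# nothing in the tag dict; `poi_insert_data` feeds those leftovers to
# `insert_unknown_keywords`.
def _demo_get_norm_tag():
    norm_tag, norm_tag_en, unknown = get_norm_tag('博物馆|某个未知标签', 'attr')
    logger.debug("[demo][norm: {}][norm_en: {}][unknown: {}]".format(
        norm_tag, norm_tag_en, unknown))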
def check_chinese(string):
    if not is_legal(string):
        return False
    return toolbox.Common.has_any(string, check_func=toolbox.Common.is_chinese)
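# Usage sketch: `check_chinese` / `check_latin` act as `special_filter`
# callbacks when `poi_insert_data` picks `name` and `name_en` by source
# priority; the candidate dict is hypothetical.
def _demo_pick_names():
    names_by_source = {'daodao': '卢浮宫', 'qyer': 'Louvre Museum'}
    name = get_key.get_key_by_priority_or_default(
        names_by_source, 'name', '', special_filter=check_chinese)
    name_en = get_key.get_key_by_priority_or_default(
        names_by_source, 'name', '', special_filter=check_latin)
    return name, name_en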
def poi_merged_report(poi_type):
    cid2grade, grade_info = prepare_city_info()
    if poi_type == 'attr':
        query_sql = '''SELECT id, city_id, first_image, address, open, introduction,
                              data_source, status_online, utime
                       FROM chat_attraction;'''
    elif poi_type == 'shop':
        query_sql = '''SELECT id, city_id, first_image, address, open, introduction,
                              data_source, status_online, utime
                       FROM chat_shopping;'''
    else:
        query_sql = '''SELECT id, city_id, first_image, address, open, introduction,
                              data_source, status_online, utime
                       FROM chat_restaurant;'''
    poi_info = defaultdict(dict)
    for line in MysqlSource(db_config=data_process_config,
                            table_or_query=query_sql,
                            size=10000,
                            is_dict_cursor=True,
                            is_table=False):
        cid = line['city_id']
        # get grade
        grade = cid2grade.get(cid, None)
        if grade is None:
            # unknown cid
            continue
        # cities with at least one poi
        if 'has_poi' not in poi_info[grade]:
            poi_info[grade]['has_poi'] = set()
        poi_info[grade]['has_poi'].add(cid)
        # poi total
        if 'total' not in poi_info[grade]:
            poi_info[grade]['total'] = 0
        poi_info[grade]['total'] += 1
        # poi online
        if 'online' not in poi_info[grade]:
            poi_info[grade]['online'] = 0
        if line['status_online'] == 'Open':
            poi_info[grade]['online'] += 1
        # poi updated within the last 30 days
        if 'update' not in poi_info[grade]:
            poi_info[grade]['update'] = 0
        try:
            if line['utime'] > datetime.datetime.now() - datetime.timedelta(days=30):
                poi_info[grade]['update'] += 1
        except Exception as exc:
            logger.exception(msg="[unknown utime][utime: {}]".format(line['utime']),
                             exc_info=exc)
        # poi has img
        if 'img' not in poi_info[grade]:
            poi_info[grade]['img'] = 0
        if is_legal(line['first_image']):
            poi_info[grade]['img'] += 1
        # poi has address
        if 'address' not in poi_info[grade]:
            poi_info[grade]['address'] = 0
        if is_legal(line['address']):
            poi_info[grade]['address'] += 1
        # poi opentime
        if 'opentime' not in poi_info[grade]:
            poi_info[grade]['opentime'] = 0
        if is_legal(line['open']):
            poi_info[grade]['opentime'] += 1
        # poi introduction: counts when it is a JSON dict with a legal value
        if 'introduction' not in poi_info[grade]:
            poi_info[grade]['introduction'] = 0
        if is_legal(line['introduction']):
            try:
                _data = json.loads(line['introduction'])
                if isinstance(_data, dict) and _data.values():
                    if has_any(list(_data.values()), check_func=is_legal):
                        poi_info[grade]['introduction'] += 1
            except Exception as exc:
                logger.exception(
                    msg="[load introduction error][introduction: {}]".format(
                        line['introduction']),
                    exc_info=exc)
        # source coverage: qyer / daodao / both
        for source_key in ('qyer', 'daodao', 'multi'):
            if source_key not in poi_info[grade]:
                poi_info[grade][source_key] = 0
        if is_legal(line['data_source']):
            from_qyer = 'qyer' in line['data_source']
            from_daodao = ('daodao' in line['data_source']
                           or 'tripadvisor' in line['data_source'])
            if from_qyer:
                poi_info[grade]['qyer'] += 1
            if from_daodao:
                poi_info[grade]['daodao'] += 1
            if from_qyer and from_daodao:
                poi_info[grade]['multi'] += 1
    for each_grade in poi_info.keys():
        poi_info[each_grade]['has_poi'] = len(poi_info[each_grade]['has_poi'])
    db = dataset.connect('mysql+pymysql://mioji_admin:[email protected]/Report?charset=utf8')
    _table = db['base_data_report_summary']
    dt = datetime.datetime.now()
    for k, v in sorted(grade_info.items(), key=lambda x: x[0]):
        _tmp_grade_report = poi_info.get(k, defaultdict(int))
        no_poi = v - _tmp_grade_report['has_poi']
        data = {
            'type': poi_type,
            'grade': k,
            'citys': v,
            'no_poi': no_poi,
            'date': datetime.datetime.strftime(dt, '%Y%m%d'),
            'hour': datetime.datetime.strftime(dt, '%H'),
            'datetime': datetime.datetime.strftime(dt, '%Y%m%d%H00')
        }
        data.update(_tmp_grade_report)
        _table.upsert(data, keys=['type', 'grade', 'datetime'])
    db.commit()
def get_data(cid_or_geohash):
    """
    Yield candidate data in priority order for merging: online data > per-source
    data (no ordering within a source yet). Earlier-yielded ids take precedence.

        yield source, sid, keys
    """
    global poi_type
    global online_table_name
    global data_source_table
    # online data, ordered so that higher-priority rows are yielded first
    _t = time.time()
    if poi_type == 'attr':
        sql = '''SELECT id, name, name_en, alias, url
                 FROM {1}
                 WHERE city_id = '{0}'
                 ORDER BY status_online DESC, status_test DESC, official DESC, grade;'''.format(
            cid_or_geohash, online_table_name)
    elif poi_type in ('rest', 'shop'):
        sql = '''SELECT id, name, name_en, url
                 FROM {1}
                 WHERE city_id = '{0}'
                 ORDER BY status_online DESC, status_test DESC, official DESC, grade;'''.format(
            cid_or_geohash, online_table_name)
    else:
        raise TypeError("Unknown Type: {}".format(poi_type))
    for data in MysqlSource(onlinedb,
                            table_or_query=sql,
                            size=10000,
                            is_table=False,
                            is_dict_cursor=True):
        keys = set()
        if is_legal(data['name']):
            keys.add(data['name'])
        if is_legal(data['name_en']):
            keys.add(data['name_en'])
        _url = data['url']
        if _url:
            _j_url = json.loads(_url)
            if 'qyer' in _j_url:
                try:
                    _sid = re.findall(r'place.qyer.com/poi/([\s\S]+)/',
                                      _j_url['qyer'])[0]
                    keys.add(('qyer', _sid))
                except Exception:
                    pass
            if 'daodao' in _j_url:
                try:
                    _sid = re.findall(r'-d(\d+)', _j_url['daodao'])[0]
                    keys.add(('daodao', _sid))
                except Exception:
                    pass
        if poi_type == 'attr':
            for key in data['alias'].split('|'):
                if is_legal(key):
                    keys.add(key)
        logger.debug("[source: {}][sid: {}][keys: {}]".format(
            'online', data['id'], keys))
        yield 'online', data['id'], keys
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    # per-source data, no ordering rules yet
    _t = time.time()
    sql = '''SELECT id, source, name, name_en, url
             FROM {1}
             WHERE city_id = '{0}';'''.format(cid_or_geohash, data_source_table)
    for data in MysqlSource(data_db,
                            table_or_query=sql,
                            size=10000,
                            is_table=False,
                            is_dict_cursor=True):
        keys = set()
        if is_legal(data['name']):
            keys.add(data['name'])
        if is_legal(data['name_en']):
            keys.add(data['name_en'])
        if data['source'] == 'qyer':
            _sid = re.findall(r'place.qyer.com/poi/([\s\S]+)/', data['url'])[0]
            keys.add(('qyer', _sid))
        if data['source'] == 'daodao':
            _sid = re.findall(r'-d(\d+)', data['url'])[0]
            keys.add(('daodao', _sid))
        logger.debug("[source: {}][sid: {}][keys: {}]".format(
            data['source'], data['id'], keys))
        yield data['source'], data['id'], keys
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
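# Usage sketch: because online rows are yielded first, the online id wins when
# two candidates share a key during merging. `cid` is a hypothetical city id.
def _demo_iter_merge_candidates(cid='50016'):
    for source, sid, keys in get_data(cid):
        logger.debug("[candidate][source: {}][sid: {}][key_count: {}]".format(
            source, sid, len(keys)))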
def get_new_info_dict():
    # count of newly added rows
    new_data_count = 0
    # record new sids so we can tell how many old sids did not come back
    new_sid_set = set()
    # differences in strict fields
    diff_dict = defaultdict(set)
    # improvements in non-strict fields
    improve_dict = defaultdict(set)
    # regressions in non-strict fields
    worse_dict = defaultdict(set)
    # sids with every field correct
    all_right_sids = set()
    old_info_dict = get_old_info_dict()
    sql = '''SELECT id, source, name, name_en, map_info, address, plantocounts,
                    beentocounts, ranking, grade, commentcounts, imgurl,
                    introduction, opentime
             FROM qyer_whole_world;'''
    _count = 0
    for line in MysqlSource(service_platform_config,
                            table_or_query=sql,
                            size=5000,
                            is_table=False,
                            is_dict_cursor=True):
        _count += 1
        if _count % 3000 == 0:
            logger.info("[load new data info][count: {}]".format(_count))
        sid = line['id']
        new_sid_set.add(sid)
        all_right = True
        for key_name, is_strict, num_check in check_name:
            legal_res = is_legal(line[key_name])
            if not num_check:
                if not legal_res:
                    all_right = False
            else:
                try:
                    # numeric fields: -1 and 0 count as "no data"
                    if not legal_res or int(line[key_name]) in (-1, 0):
                        all_right = False
                except Exception:
                    all_right = False
        if all_right:
            all_right_sids.add(sid)
        if sid not in old_info_dict:
            new_data_count += 1
            continue
        old_info = old_info_dict[sid]
        for key_name, is_strict, num_check in check_name:
            if is_strict:
                # strict fields: any difference counts
                if line[key_name] != old_info[key_name]:
                    diff_dict[key_name].add(sid)
            else:
                # non-strict fields: compare legality flags only
                legal_res = is_legal(line[key_name])
                if not num_check:
                    check_res = legal_res
                else:
                    try:
                        check_res = legal_res and int(line[key_name]) not in (-1, 0)
                    except Exception:
                        check_res = False
                if check_res == old_info[key_name]:
                    continue
                elif check_res:
                    improve_dict[key_name].add(sid)
                else:
                    worse_dict[key_name].add(sid)
    logger.info("[load new data info finished][count: {}]".format(_count))
    return (new_data_count, set(old_info_dict.keys()) - new_sid_set, diff_dict,
            improve_dict, worse_dict, all_right_sids)
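# Usage sketch: the six return values unpack as below; `missing_sids` holds old
# sids that did not come back in the new crawl.
def _demo_diff_report():
    (new_count, missing_sids, diff_dict, improve_dict,
     worse_dict, all_right_sids) = get_new_info_dict()
    logger.info("[diff report][new: {}][missing: {}][all_right: {}]".format(
        new_count, len(missing_sids), len(all_right_sids)))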
def get_img(s_sid_set, poi_type, old_img='', old_first_img='', is_official=False):
    """
    Get img str by using source and sid set
    :param is_official: is official or not
    :param old_img: old img list, all img split with |
    :param old_first_img: old first img, use old sorting
    :param poi_type: poi type, Eg: attr rest shop
    :param s_sid_set: source and sid set
    :return: tuple (new_img str, new_first_img str)
    """
    if not s_sid_set or is_official:
        return old_img, old_first_img
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    query_sql = '''SELECT file_name, bucket_name, pic_size, pic_md5, `use`, info, url
                   FROM poi_images
                   WHERE (`source`, `sid`) IN ({});'''.format(
        ','.join(map(lambda x: "('{}', '{}')".format(x[0], x[1]), s_sid_set)))
    _res = cursor.execute(query_sql)
    if not _res:
        return old_img, old_first_img
    max_size = -1
    max_size_img = ''
    file2phash = dict()
    pic_total = set()
    p_hash_dict = defaultdict(list)
    for file_name, bucket_name, pic_size, pic_md5, use, info, url in cursor.fetchall():
        if poi_type == 'shop' and bucket_name not in ('attr_bucket', 'shop_bucket',
                                                      'mioji-attr', 'mioji-shop'):
            # shopping imgs are uploaded to mioji-attr or mioji-shop
            continue
        elif poi_type != 'shop' and poi_type not in bucket_name:
            # rest imgs are uploaded to mioji-rest, attr imgs to mioji-attr
            continue
        # record the full picture set, used below to tell whether a filtered
        # picture was added manually
        pic_total.add(file_name)
        # broken images must be filtered
        if r.get('error_img_{}'.format(file_name)) == '1':
            continue
        # pHash filter
        if url in ('', 'NULL', None):
            # pictures tagged by the product team are used as-is, never filtered
            file2phash[file_name] = 'USE'
            p_hash_dict['USE'].append((file_name, -1))
            continue
        elif not info:
            # crawled picture without a pHash: drop it
            continue
        else:
            p_hash = json.loads(info)['p_hash']
        # img can be used
        # an empty pic_size usually means a manually tagged picture
        if not is_legal(pic_size):
            if file_name not in old_img:
                continue
            elif str(use) != '1':
                continue
            else:
                # old, manually tagged picture: must not be filtered
                file2phash[file_name] = 'USE'
                p_hash_dict['USE'].append((file_name, -1))
                continue
        # track the largest picture
        h, w = literal_eval(pic_size)
        h = int(h)
        w = int(w)
        size = h * w
        if size > max_size:
            max_size = size
            max_size_img = file_name
        if str(use) == '1':
            # filter rules
            # pixel count
            if size < 200000:
                continue
            # aspect ratio: lower bound
            scale = w / h
            if scale < 0.9:
                if w < 500:
                    continue
            # aspect ratio: upper bound
            if scale > 2.5:
                continue
            p_hash_dict[p_hash].append((file_name, size))
    cursor.close()
    conn.close()
    if poi_type in ('attr', 'shop'):
        # fetch face-detection results
        _conn = poi_face_detect_pool.connection()
        _cursor = _conn.cursor()
        query_sql = '''SELECT pic_name
                       FROM PoiPictureInformation
                       WHERE is_available=0 AND poi_id IN ({});'''.format(
            ', '.join(
                map(lambda x: "'{}'".format('###'.join(x) if x[0] != 'online' else x[1]),
                    s_sid_set)))
        _cursor.execute(query_sql)
        face_detected = set([x[0].split('/')[-1] for x in _cursor.fetchall()])
        _cursor.close()
        _conn.close()
    else:
        face_detected = set()
    # file names of manually added pictures
    human_pic = set(fn for fn, _ in p_hash_dict['USE'])
    # machine pictures: keep the largest picture per pHash
    final_pic_dict = {}
    for k, v in p_hash_dict.items():
        pic_res = sorted(v, key=lambda x: x[1], reverse=True)
        if pic_res:
            final_pic_dict[pic_res[0][0]] = k
    old_img_list = old_img.split('|')
    new_img_list = []
    # keep the old picture order while appending, de-duplicated
    for _old_file_name in old_img_list:
        # manually added pictures have no md5, so they bypass the md5 filter
        if (_old_file_name not in pic_total) or (_old_file_name in human_pic):
            if is_legal(_old_file_name):
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        new_img_list.append(_old_file_name)
        elif _old_file_name in final_pic_dict:
            if is_legal(_old_file_name):
                # face-detection filter
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        new_img_list.append(_old_file_name)
    # append pictures that were not in the old list, in order
    for k, v in final_pic_dict.items():
        if is_legal(v):
            # face-detection filter
            if k not in face_detected:
                if k not in new_img_list:
                    new_img_list.append(k)
    if old_first_img and old_first_img in new_img_list:
        # the old first image is still available: keep using it
        new_first_img = old_first_img
        # move first_img to the head of the list
        new_img_list.remove(old_first_img)
        new_img_list.insert(0, old_first_img)
    elif new_img_list:
        # otherwise use the new first image
        new_first_img = new_img_list[0]
    else:
        new_first_img = ''
    if new_first_img == '':
        new_img = new_first_img = max_size_img
    else:
        # de-duplicate without changing the original order
        final_new_img_list = list(set(new_img_list))
        final_new_img_list.sort(key=new_img_list.index)
        new_img = '|'.join(filter(lambda x: is_legal(x), final_new_img_list))
    return new_img, new_first_img
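# Usage sketch mirroring `_update_per_uid_img` above: the (source, sid) pairs of
# one merged POI go in; a '|'-joined image list and a first image come out. The
# pairs and file names below are hypothetical.
def _demo_get_img():
    s_sid_set = {('daodao', '1234567'), ('qyer', '89abc')}
    img_list, first_img = get_img(s_sid_set=s_sid_set,
                                  poi_type='attr',
                                  old_img='a.jpg|b.jpg',
                                  old_first_img='a.jpg',
                                  is_official=False)
    return img_list, first_img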
def poi_insert_data(cid, _poi_type):
    init_global_name(_poi_type)
    '''
    Final destination tables (kept for reference):
    if _poi_type == 'attr':
        sql = 'replace into chat_attraction(`id`,`name`,`name_en`,`data_source`,`city_id`,' \
              '`map_info`,`address`,`star`,`plantocount`,`beentocount`,`real_ranking`,' \
              '`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,`phone`,`introduction`,' \
              '`open`, `open_desc`,`recommend_lv`,`prize`,`traveler_choice`, `alias`, ' \
              '`image`, `ori_grade`,`nearCity`, `ranking`,`rcmd_open`,`add_info`,`address_en`,`event_mark`) ' \
              'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,-1,"","","","")'
    elif _poi_type == 'rest':
        sql = 'replace into chat_restaurant(`id`,`name`,`name_en`,' \
              '`source`,`city_id`,`map_info`,`address`,`real_ranking`,' \
              '`grade`,`res_url`,`telphone`,`introduction`,`open_time`,`open_time_desc`,`prize`,' \
              '`traveler_choice`,`review_num`,`price`,`price_level`,`cuisines`, ' \
              '`image_urls`,`tagid`,`norm_tagid`,`norm_tagid_en`,`nearCity`) ' \
              'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    elif _poi_type == 'shop':
        sql = 'replace into ' \
              'chat_shopping(`id`,`name`,`name_en`,`data_source`,`city_id`,' \
              '`map_info`,`address`,`star`,`plantocount`,`beentocount`,' \
              '`real_ranking`,`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,' \
              '`phone`,`introduction`,`open`,`open_desc`,`recommend_lv`,`prize`,' \
              '`traveler_choice`,`image`,`nearCity`) ' \
              'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    else:
        raise TypeError("Unknown Type: {}".format(poi_type))
    '''
    conn = poi_ori_pool.connection()
    # for task_dict in get_task(cid):
    count = 0
    data = []
    # fetch the poi info needed for merging
    _info_dict, _online_official_data, _online_nonofficial_data = get_poi_dict(cid)
    _city_poi = get_poi_union_info(cid)
    # start merging
    for miaoji_id, city_id, union_info in _city_poi:
        # per-poi state
        data_dict = defaultdict(dict)
        # whether any non-online source contributed data
        other_source = False
        # whether online official / nonofficial rows exist
        has_official = False
        has_nonofficial = False
        # online data
        o_official_data = _online_official_data.get(miaoji_id, None)
        o_nonofficial_data = _online_nonofficial_data.get(miaoji_id, None)
        if o_official_data is not None:
            has_official = True
        if o_nonofficial_data is not None:
            has_nonofficial = True
        # initialize merge info
        for each_name in (json_name_list + norm_name_list + others_name_list):
            data_dict[each_name] = {}

            def get_data(src_dict, is_official=False):
                if each_name in online2source:
                    source_name = online2source[each_name]
                else:
                    source_name = each_name
                if each_name in json_name_list:
                    if source_name in src_dict:
                        try:
                            _res = json.loads(src_dict[source_name])
                            if isinstance(_res, dict):
                                data_dict[each_name] = {
                                    k: v
                                    for k, v in _res.items()
                                    if k in available_source
                                }
                        except Exception:
                            pass
                else:
                    data_dict[each_name][
                        'mioji_official' if is_official else
                        'mioji_nonofficial'] = src_dict.get(source_name, {})

            if o_official_data is not None:
                get_data(o_official_data, is_official=True)
            if o_nonofficial_data is not None:
                get_data(o_nonofficial_data, is_official=False)
        # iterate over all (source, id) pairs to merge and build the dict contents
        for s_sid in union_info.split('|_||_|'):
            source, source_id = s_sid.split('|')
            # todo handle online: load data first, then update
            # todo use the online base data to update the data-process fields
            # no usable merge id info
            if not source_id or not source:
                continue
            # skip unwanted sources
            if source not in available_source:
                logger.debug("[not available source: {}]".format(source))
                continue
            # no merged data for this pair
            poi_info = _info_dict[(source, source_id)]
            if poi_info == {}:
                continue
            other_source = True
            # add keys
            for each_name in (json_name_list + norm_name_list + others_name_list):
                if is_legal(poi_info[each_name]):
                    if isinstance(poi_info[each_name], str):
                        data_dict[each_name][source] = tradition2simple(
                            poi_info[each_name]).decode()
                    else:
                        data_dict[each_name][source] = poi_info[each_name]
        # fill in missing entries
        for each_name in (json_name_list + norm_name_list + others_name_list):
            if each_name not in data_dict:
                data_dict[each_name] = {}
        # two kinds of unmergeable content
        if not o_official_data and not o_nonofficial_data and not other_source:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id,
                                           "no data available to merge")
            logger.debug('[union_info: {}]'.format(union_info))
            continue
        new_data_dict = {}

        # pick a name by priority -- Chinese!
        def get_name_by_priority():
            # standard priority order
            name_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name'], norm_name, '', special_filter=check_chinese)
            # look for Chinese in the English field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'], norm_name, '',
                    special_filter=check_chinese)
            # look for Latin in the English field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'], norm_name, '',
                    special_filter=check_latin)
            # look for Latin in the Chinese field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name'], norm_name, '',
                    special_filter=check_latin)
            return name_tmp

        # pick a name by priority -- Latin characters!
        def get_name_en_by_priority():
            # from the merged English fields
            name_en_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name_en'], norm_name, '', special_filter=check_latin)
            # fall back to Latin candidates in the Chinese field
            if not name_en_tmp:
                name_en_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name'], norm_name, '',
                    special_filter=check_latin)
            return name_en_tmp

        for norm_name in norm_name_list:
            # name / name_en get special treatment
            if norm_name == 'name':
                if has_official:
                    # official == 1: never update name
                    new_data_dict['name'] = data_dict['name']['mioji_official']
                elif has_nonofficial:
                    # official == 0: keep the name if it is already Chinese
                    if any([
                            toolbox.Common.is_chinese(c)
                            for c in data_dict['name']['mioji_nonofficial']
                    ]):
                        new_data_dict['name'] = data_dict['name']['mioji_nonofficial']
                    else:
                        new_data_dict['name'] = get_name_by_priority()
                else:
                    # standard priority order
                    new_data_dict['name'] = get_name_by_priority()
            elif norm_name == 'name_en':
                # official == 1: never update name_en; otherwise pick by priority
                if has_official:
                    new_data_dict['name_en'] = data_dict['name_en']['mioji_official']
                else:
                    new_data_dict['name_en'] = get_name_en_by_priority()
            else:
                new_data_dict[norm_name] = get_key.get_key_by_priority_or_default(
                    data_dict[norm_name], norm_name, '')
        # daodao url fix
        if 'daodao' in data_dict['url']:
            data_dict['url']['daodao'] = data_dict['url']['daodao'].replace(
                'www.tripadvisor.com.hk', 'www.tripadvisor.cn')
        # restaurants derive tagid from cuisines
        if poi_type == 'rest':
            data_dict['tagid'] = copy.deepcopy(data_dict['cuisines'])
            new_data_dict['tagid'] = json.dumps({
                k: v
                for k, v in data_dict['tagid'].items() if k in final_source
            })
        for json_name in json_name_list:
            new_data_dict[json_name] = json.dumps({
                k: v
                for k, v in data_dict[json_name].items() if k in final_source
            })
        new_data_dict['phone'] = new_data_dict['phone'].replace('电话号码:', '').strip()
        # ori_grade: merge online grades with per-source grades
        tmp_ori_grade = {}
        if has_official:
            try:
                tmp_ori_grade.update(json.loads(o_official_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)
        if has_nonofficial:
            try:
                tmp_ori_grade.update(json.loads(o_nonofficial_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)
        tmp_ori_grade.update({k: v for k, v in data_dict['grade'].items()})
        new_data_dict['ori_grade'] = json.dumps(
            {k: v for k, v in tmp_ori_grade.items() if k in final_source})
        # add source
        source = '|'.join(
            map(lambda x: x.split('|')[0], union_info.split('|_||_|')))
        # add alias
        alias = '|'.join(
            filter(
                lambda x: x != new_data_dict['name'] and x != new_data_dict['name_en'],
                set(
                    list(data_dict['name'].values()) +
                    list(data_dict['name_en'].values()))))
        # add open time
        final_open_time_desc = get_key.get_key_by_priority_or_default(
            data_dict['opentime'], 'opentime',
            special_filter=add_open_time_filter)
        if final_open_time_desc:
            norm_open_time = fix_daodao_open_time(final_open_time_desc)
        else:
            norm_open_time = ''
        # add norm tag
        # todo change: make qyer and other sources usable
        unknown_tag = set()
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_tagid, daodao_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['daodao'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                daodao_tagid, daodao_tagid_en = '', ''
        else:
            daodao_tagid, daodao_tagid_en = '', ''
        # # rest tag
        # if 'daodao' in data_dict['tagid']:
        #     try:
        #         daodao_rest_tagid, daodao_rest_tagid_en, _ = get_norm_tag(
        #             data_dict['tagid']['daodao'], 'rest')
        #     except Exception:
        #         daodao_rest_tagid, daodao_rest_tagid_en = '', ''
        # else:
        #     daodao_rest_tagid, daodao_rest_tagid_en = '', ''
        # shop tag
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_shop_tagid, daodao_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['daodao'], 'shop')
            except Exception:
                daodao_shop_tagid, daodao_shop_tagid_en = '', ''
        else:
            daodao_shop_tagid, daodao_shop_tagid_en = '', ''
        if 'qyer' in data_dict['tagid']:
            try:
                qyer_tagid, qyer_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['qyer'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                qyer_tagid, qyer_tagid_en = '', ''
        else:
            qyer_tagid, qyer_tagid_en = '', ''
        # # rest tag
        # if 'qyer' in data_dict['tagid']:
        #     try:
        #         qyer_rest_tagid, qyer_rest_tagid_en, _ = get_norm_tag(
        #             data_dict['tagid']['qyer'], 'rest')
        #     except Exception:
        #         qyer_rest_tagid, qyer_rest_tagid_en = '', ''
        # else:
        #     qyer_rest_tagid, qyer_rest_tagid_en = '', ''
        # shop tag
        if 'qyer' in data_dict['tagid']:
            try:
                qyer_shop_tagid, qyer_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['qyer'], 'shop')
            except Exception:
                qyer_shop_tagid, qyer_shop_tagid_en = '', ''
        else:
            qyer_shop_tagid, qyer_shop_tagid_en = '', ''
        l_norm_tag = []
        l_norm_tag_en = []
        l_norm_tag.extend(daodao_tagid.split('|'))
        l_norm_tag_en.extend(daodao_tagid_en.split('|'))
        l_norm_tag.extend(qyer_tagid.split('|'))
        l_norm_tag_en.extend(qyer_tagid_en.split('|'))
        l_other_norm_tag = []
        l_other_norm_tag.extend(daodao_shop_tagid.split('|'))
        l_other_norm_tag.extend(qyer_shop_tagid.split('|'))
        # drop empty and duplicate tags
        norm_tag = '|'.join(filter(lambda x: is_legal(x), set(l_norm_tag)))
        norm_tag_en = '|'.join(filter(lambda x: is_legal(x), set(l_norm_tag_en)))
        other_tag = '|'.join(filter(lambda x: is_legal(x), set(l_other_norm_tag)))
        # prepare for insert: replace the old data_dict
        data_dict = new_data_dict
        # filter by name
        if data_dict['name'].lower() in ('', 'null', '0') and \
                data_dict['name_en'].lower() in ('', 'null', '0'):
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id,
                                           "both Chinese and English names empty")
            logger.debug("[filter by name][name: {}][name_en: {}]".format(
                data_dict['name'], data_dict['name_en']))
            continue
        if '停业' in data_dict['name'] or '停业' in data_dict['name_en']:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "closed POI")
            logger.debug(
                "[filter by name with close business][name: {}][name_en: {}]".format(
                    data_dict['name'], data_dict['name_en']))
            continue
        # this logic was too naive; removed :)
        # if data_dict['name'] != data_dict['name_en']:
        #     if data_dict['name_en'] in data_dict['name']:
        #         data_dict['name'] = data_dict['name'].replace(data_dict['name_en'], '')
        # phone cleanup
        if data_dict['phone'] in ('+ 新增電話號碼', '+ 新增电话号码'):
            data_dict['phone'] = ''
        # price_level is handled separately for restaurants
        if poi_type == 'rest':
            data_dict['price_level'] = W2N.get(
                data_dict.get('price_level', ''), '0')
        # add the nearCity field
        nearby_city = get_nearby_city(poi_city_id=city_id,
                                      poi_map_info=data_dict['map_info'])
        # cleanup and insert
        # drop rows whose coordinates do not parse
        try:
            lng, lat = data_dict['map_info'].split(',')
            lng = float(lng)
            lat = float(lat)
            data_dict['map_info'] = '{},{}'.format(lng, lat)
        except Exception as exc:
            logger.exception(msg="[map_info filter error][data: {}]".format(
                data_dict['map_info']), exc_info=exc)
            continue
        # strip clutter from names
        data_dict['name'] = data_dict['name'].replace('这是您的企业吗?', '').strip()
        if data_dict['name_en'] in data_dict['name'] and \
                data_dict['name_en'] != data_dict['name']:
            data_dict['name'] = data_dict['name'].replace(data_dict['name_en'], '')
        # field fixes
        # address
        if data_dict['address'].lower() in ('null', '0'):
            data_dict['address'] = ''
        # open time defaults
        if norm_open_time.lower() in ('', 'null', '0'):
            if poi_type in ('attr', 'rest'):
                norm_open_time = '<*><*><00:00-23:55><SURE>'
            else:
                norm_open_time = '<*><*><08:00-20:00><SURE>'
        # save unrecognized tags (unrecognized open times are saved in add_open_time_filter)
        if unknown_tag:
            insert_unknown_keywords('{}_tag'.format(poi_type), unknown_tag)
            logger.debug("[unknown tag][tags: {}]".format(unknown_tag))
        # drop pois that are too far from the city
        result = poi_is_too_far(city_id, poi_map_info=data_dict['map_info'])
        if not result:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id,
                                           "too far from the city center")
            logger.debug(
                "[poi filter by poi city distance][cid: {}][city_map: {}][poi_map_info: {}][distance: {}]".format(
                    city_id, result.city_map, data_dict['map_info'], result.dist))
            continue
        # blank out phones longer than 55 characters
        if len(data_dict['phone']) > 55:
            logger.debug(
                "[phone length too long][poi_id: {}][len: {}][phone: {}]".format(
                    miaoji_id, len(data_dict['phone']), data_dict['phone']))
            data_dict['phone'] = ''
        if poi_type == 'attr':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'alias': alias,
                'image': data_dict['imgurl'],
                'ori_grade': data_dict['ori_grade'],
                'nearCity': nearby_city
            }
            # fields not updated when official == 1;
            # updated for nonofficial and newly added data
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })
            if not has_official and not has_nonofficial:
                per_data.update({
                    # explicit update logic: only set on first merge
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'add_info': '',
                    'address_en': '',
                    'event_mark': '',
                    'grade': -1.0,
                    # explicit update logic: only set status on first merge
                    'status_online': 'Open',
                    'status_test': 'Open'
                })
            # clean up sightseeing ("游览") tags
            try:
                tagid_data = json.loads(data_dict['tagid'])
                if 'daodao' in tagid_data:
                    if is_legal(tagid_data['daodao']):
                        if '游览' in tagid_data['daodao']:
                            if 'online' in union_info:
                                # such rows would normally be dropped, but some online
                                # rows still need updates, so they stay in the merge;
                                # brand-new rows must never carry 游览
                                # filter_data_already_online(poi_type, miaoji_id, "filtered: tag contains 游览")
                                pass
                            else:
                                logger.debug("[tour filter][data: {}]".format(
                                    tagid_data['daodao']))
                                continue
            except Exception as exc:
                logger.exception(msg="[tour filter error]", exc_info=exc)
            if norm_tag == '' and other_tag != '':
                # attractions that only carry shopping tags are dropped
                if 'online' in union_info:
                    # same exception as above: online rows stay in the merge,
                    # but brand-new shopping rows must not be added here
                    # filter_data_already_online(poi_type, miaoji_id, "filtered: shopping data inside attractions")
                    pass
                else:
                    continue
            data.append(per_data)
        elif poi_type == 'rest':
            data.append(
                (miaoji_id, data_dict['name'], data_dict['name_en'], source,
                 city_id, data_dict['map_info'], data_dict['address'],
                 data_dict['ranking'], data_dict['grade'], data_dict['url'],
                 data_dict['phone'], data_dict['introduction'], norm_open_time,
                 data_dict['opentime'], data_dict['prize'],
                 data_dict['traveler_choice'], data_dict['commentcounts'],
                 data_dict['price'], data_dict['price_level'],
                 data_dict['cuisines'], data_dict['imgurl'],
                 data_dict['tagid'], norm_tag, norm_tag_en, nearby_city))
        elif poi_type == 'shop':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'image': data_dict['imgurl'],
                'nearCity': nearby_city
            }
            # fields not updated when official == 1;
            # updated for nonofficial and newly added data
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })
            if not has_official and not has_nonofficial:
                per_data.update({
                    # defaults required for insert
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'image_list': '',
                    'grade': -1.0,
                    # explicit update logic: only set status on first merge
                    'status_online': 'Open',
                    'status_test': 'Open'
                })
            shopping_tag = [
                '礼品与特产商店', '大型购物中心', '农贸市场', '跳蚤市场与街边市场', '古董店',
                '百货商场', '厂家直营店', '购物'
            ]
            important_shopping_tag = [
                '礼品与特产商店', '大型购物中心', '百货商场', '厂家直营店', '购物'
            ]
            # filter shopping data by tag
            tag_list = norm_tag.split('|')
            if not all([tag.strip() in shopping_tag for tag in tag_list]):
                if not any([
                        tag.strip() in important_shopping_tag
                        for tag in tag_list
                ]):
                    if 'online' in union_info:
                        filter_data_already_online(poi_type, miaoji_id,
                                                   "filtered: non-shopping data")
                    continue
            data.append(per_data)
        else:
            raise TypeError("Unknown Type: {}".format(poi_type))
        if count % 300 == 0:
            db = dataset.connect(
                "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8")
            table = db[data_process_table_name]
            _insert = 0
            logger.debug("Total: {}".format(count))
            _t = time.time()
            for d in data:
                _res = table.upsert(d, keys=['id'])
                if _res:
                    _insert += 1
            logger.debug(
                '[data upsert][count: {}][insert: {}][takes: {}]'.format(
                    count, _insert, time.time() - _t))
            logger.debug("[city_id: {}][insert_count_this_times: {}]".format(
                cid, _insert))
            db.commit()
            data = []
        count += 1
    logger.debug("[city_id: {}][total: {}]".format(cid, count))
    _insert = 0
    db = dataset.connect(
        "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8")
    table = db[data_process_table_name]
    for d in data:
        _res = table.upsert(d, keys=['id'])
        if _res:
            _insert += 1
    db.commit()
    logger.debug("Insert: {}".format(_insert))
    conn.close()
    update_already_merge_city("{}_data".format(poi_type), cid)
def shared_city_id_insert():
    city_list, station_list = get_city_station_list()
    data = []
    _count = 0
    near_count = 0
    far_count = 0
    no_count = 0
    no_set = set()
    no_map_info_count = 0
    no_map_info_set = set()
    for city_id, city_map_info in city_list:
        _count += 1
        if is_legal(city_map_info):
            near_station = []
            far_station = []
            for station_id, station_map_info, station_city_map_info in station_list:
                if is_legal(station_map_info):
                    distance = get_distance(city_map_info, station_map_info)
                    map_info = station_map_info
                elif is_legal(station_city_map_info):
                    # fall back to the coordinates of the station's city
                    distance = get_distance(city_map_info, station_city_map_info)
                    map_info = station_city_map_info
                else:
                    distance = None
                    map_info = None
                if distance is None:
                    continue
                elif distance < 20:
                    near_station.append((station_id, map_info, distance))
                elif distance < 50:
                    far_station.append((station_id, map_info, distance))
            near_station = sorted(near_station, key=lambda x: x[-1])
            far_station = sorted(far_station, key=lambda x: x[-1])
            # up to 3 stations within 20km, else the single closest within 50km
            new_near_station = near_station[:3]
            new_far_station = far_station[0] if far_station else []
            if new_near_station:
                for station_id, map_info, distance in new_near_station:
                    data.append((station_id, map_info, city_id, city_map_info,
                                 distance, 'within 20km, up to 3 matches'))
                near_count += 1
                logger.info(
                    "[20km match][count: {}][near: {}][city_id: {}][station: {}]".format(
                        _count, near_count, city_id, new_near_station))
            elif new_far_station:
                station_id, map_info, distance = new_far_station
                data.append((station_id, map_info, city_id, city_map_info,
                             distance, 'within 50km, 1 match'))
                far_count += 1
                logger.info(
                    "[50km match][count: {}][far: {}][city_id: {}][station: {}]".format(
                        _count, far_count, city_id, new_far_station))
            else:
                no_count += 1
                logger.info(
                    "[city with no station][count: {}][no: {}][city_id: {}]".format(
                        _count, no_count, city_id))
                no_set.add(city_id)
                continue
        else:
            no_map_info_count += 1
            logger.info(
                "[city with no map_info][count: {}][no_map_info: {}][city_id: {}]".format(
                    _count, no_map_info_count, city_id))
            no_map_info_set.add(city_id)
            continue
        if data:
            insert_db(data)
            data = []
    logger.info(
        "[near_count: {}][far_count: {}][no_count: {}][no_map_info_count: {}]".format(
            near_count, far_count, no_count, no_map_info_count))
    logger.info("[no_set: {}]".format(no_set))
    logger.info("[no_map_info_set: {}]".format(no_map_info_set))
def insert_poi_unid(merged_dict, cid_or_geohash):
    global white_list
    global online_table_name
    global data_source_table
    start = time.time()
    # get city name, country name, and map_info
    _dev_conn = base_data_pool.connection()
    _dev_cursor = _dev_conn.cursor()
    _dev_cursor.execute('''SELECT city.id       AS cid,
                                  city.name     AS city_name,
                                  country.name  AS country_name,
                                  city.map_info AS map_info
                           FROM city
                             JOIN country ON city.country_id = country.mid
                           WHERE city.id = {};'''.format(cid_or_geohash))
    cid, city_name, country, city_map_info = _dev_cursor.fetchone()
    _dev_cursor.close()
    _dev_conn.close()
    # Bulk ("total") queries replace the old per-row lookups: all of Paris loads
    # in 5.9s this way vs 6.9s row by row -- a modest but acceptable win.
    # get online data: name, name_en, map_info, grade, star, ranking, address, url
    total_data = {}
    _dev_conn = base_data_pool.connection()
    _dev_cursor = _dev_conn.cursor()
    try:
        _t = time.time()
        sql = '''SELECT id, name, name_en, map_info, grade, -1, ranking, address, ''
                 FROM {}
                 WHERE city_id='{}';'''.format(online_table_name, cid_or_geohash)
        _dev_cursor.execute(sql)
        logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    except Exception as exc:
        logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc)
    for line in _dev_cursor.fetchall():
        total_data[('online', line[0])] = line[1:]
    _dev_cursor.close()
    _dev_conn.close()
    # get per-source poi data
    _data_conn = poi_ori_pool.connection()
    _data_cursor = _data_conn.cursor()
    try:
        _t = time.time()
        sql = '''SELECT source, id, name, name_en, map_info, grade, star, ranking, address, url
                 FROM {}
                 WHERE city_id='{}';'''.format(data_source_table, cid_or_geohash)
        _data_cursor.execute(sql)
        logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    except Exception as exc:
        logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc)
    for line in _data_cursor.fetchall():
        total_data[(line[0], line[1])] = line[2:]
    _data_cursor.close()
    _data_conn.close()
    # fetch rows for the white list
    if white_list:
        _s_sid = []
        for _each in white_list:
            _s_sid.extend(_each)
        _ori_conn = poi_ori_pool.connection()
        _ori_cursor = _ori_conn.cursor()
        try:
            _t = time.time()
            query_sql = '''SELECT source, id, name, name_en, map_info, grade, star, ranking, address, url
                           FROM {}
                           WHERE (source, id) IN ({});'''.format(
                data_source_table,
                ','.join(map(lambda x: "('{}', '{}')".format(*x), _s_sid)))
            _ori_cursor.execute(query_sql)
            logger.debug('[query][sql: {}][takes: {}]'.format(
                query_sql, time.time() - _t))
        except Exception as exc:
            logger.exception("[sql exc][sql: {}]".format(query_sql), exc_info=exc)
        for line in _ori_cursor.fetchall():
            total_data[(line[0], line[1])] = line[2:]
        _ori_cursor.close()
        _ori_conn.close()
    data = []
    for uid, s_sid_set in merged_dict.items():
        for source, sid in s_sid_set:
            name, name_en, map_info, grade, star, ranking, address, url = total_data[
                (source, sid)]
            if not is_legal(name):
                name = ''
            if not is_legal(name_en):
                name_en = ''
            if not is_legal(grade):
                grade = -1.0
            if not is_legal(star):
                star = -1.0
            if not is_legal(ranking):
                ranking = -1.0
            if not is_legal(address):
                address = ''
            if not is_legal(url):
                url = ''
            data.append((uid, cid, city_name, country, city_map_info, source,
                         sid, name, name_en, map_info, grade, star, ranking,
                         address, url))
    _final_conn = poi_ori_pool.connection()
    _final_cursor = _final_conn.cursor()
    try:
        _t = time.time()
        sql = '''REPLACE INTO {}_unid
                   (id, city_id, city_name, country_name, city_map_info, source,
                    source_id, name, name_en, map_info, grade, star, ranking,
                    address, url)
                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''.format(
            poi_type)
        _final_cursor.executemany(sql, data)
        logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    except Exception as exc:
        logger.exception("[insert unid table error]", exc_info=exc)
    _final_conn.commit()
    _final_cursor.close()
    _final_conn.close()
    logger.info(
        "[finish prepare data][city: {}][line_count: {}][takes: {}]".format(
            cid_or_geohash, len(data), time.time() - start))
def get_data(self):
    for line in self.private_db.query(
            '''SELECT * FROM {0} WHERE ptid = "qyoa" AND disable = 0 AND city_id=50016;'''.format(
                self.src_table_name)):
        data = copy.deepcopy(self.default_val)
        for k in self.keys:
            # skip unused keys
            if k in self.skip_keys:
                continue
            # skip empty values
            if is_legal(line[k]):
                data[(self.key_map.get(k, None) or k)] = line[k]
        # take the id from refer when present
        if line['refer'] is not None and line['refer'] != '':
            data[self.id_key] = line['refer']
        if not self.need_new_data:
            if self.id_key not in data:
                self.errors.append(line)
                continue
            if not data[self.id_key]:
                self.errors.append(line)
                continue
        # when an id map is maintained, resolve the id through it
        if self.need_add_id_map:
            # look the id up in id_map
            if self.id_key not in data:
                id_map_miaoji_id = self.id_map.get(line[self.id_key], None)
                if id_map_miaoji_id:
                    data[self.id_key] = id_map_miaoji_id
        # otherwise mint a new id
        if self.id_key not in data:
            data[self.id_key] = self.get_new_id()
        # attractions, shopping, and restaurants update city_id
        if self.need_update_city_id:
            data['city_id'] = self.id_map.get(data['city_id'], None) or data['city_id']
        # remember every id seen in this run
        self.data_id_set.add(data[self.id_key])
        # store the id mapping if required
        if self.need_add_id_map:
            self.insert_id_map(line[self.id_key], data[self.id_key])
        if self.need_update_status:
            if self.need_update_city_id:
                # update status on the attraction / shopping / restaurant tables
                data['status_test'] = 'Open'
                data['status_online'] = 'Open'
                data['dept_status_online'] = 'Open'
                data['dept_status_test'] = 'Open'
            else:
                # update status on the city table
                data['status_test'] = 'Open'
                data['status_online'] = 'Open'
        yield data
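# Usage sketch (assumptions: `importer` is a configured instance of this class
# and `table` is a dataset table handle; neither is defined here):
def _demo_import(importer, table):
    for row in importer.get_data():
        table.upsert(row, keys=[importer.id_key])
    logger.info("[import done][error_rows: {}]".format(len(importer.errors)))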