예제 #1
0
def task():
    """Match rows of ``qyer_outlets_new`` against existing ``chat_shopping`` rows.

    Builds two lookup tables from chat_shopping (exact name, and English name
    normalized through get_similar_word), then for every qyer outlet row that
    matches either table, merges the candidate names from all sources and
    writes the winner back via update_outlets().
    """
    name_dict = {}  # name -> chat_shopping id
    en_dict = {}    # normalized English name -> chat_shopping id
    query_sql = '''SELECT
  id,
  name,
  name_en
FROM chat_shopping;'''
    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    cursor.execute(query_sql)

    for line in cursor.fetchall():
        miaoji_id = line['id']
        name_dict[line['name']] = miaoji_id
        en_dict[get_similar_word(line['name_en'])] = miaoji_id

    cursor.close()
    conn.close()

    def _merge_and_update(line, uid):
        # Merge name candidates from all sources and push the result back.
        # (This body used to be duplicated verbatim in both match branches.)
        u_name, u_name_en, o_name, o_name_en = get_shop_info(uid)
        final_name = get_name(
            [(line['name'], 'qyer'), (line['name_en'], 'qyer'),
             (u_name, 'daodao'), (u_name_en, 'daodao'),
             (o_name, 'online'), (o_name_en, 'online')])
        update_outlets(cid=line['city_id'], name=final_name, uid=uid)
        print(uid, final_name, line['city_id'])

    query_sql = '''SELECT *
FROM qyer_outlets_new
WHERE city_id IS NOT NULL AND city_id != 'NULL';'''
    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    cursor.execute(query_sql)
    for line in cursor.fetchall():
        name = line['name']
        name_en = get_similar_word(line['name_en'])
        # exact name match wins over normalized-English match
        if name in name_dict and line['name'] != '':
            _merge_and_update(line, name_dict[name])
        elif name_en in en_dict and line['name_en'] != '':
            _merge_and_update(line, en_dict[name_en])
    cursor.close()
    conn.close()
예제 #2
0
def get_shop_info(uid):
    """Return [dev_name, dev_name_en, online_name, online_name_en] for *uid*.

    The first pair comes from the poi_ori chat_shopping table, the second
    from the base_data copy restricted to tag_id = 9 (outlets).  Missing
    rows yield ('', '') so the caller always gets a 4-element list.
    """
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT name, name_en FROM chat_shopping WHERE id=%s;''', (uid,))
    _res_1 = cursor.fetchone()
    cursor.close()
    conn.close()
    if _res_1 is None:
        # BUG FIX: fetchone() returns None for an unknown id, which used to
        # crash the extend() below with a TypeError.
        _res_1 = ('', '')

    conn = base_data_pool.connection()
    cursor = conn.cursor()
    # execute() returns the matched row count; 0 means no online outlet row
    _res = cursor.execute('''SELECT
  name,
  name_en
FROM chat_shopping
WHERE tag_id = 9 AND id=%s;''', (uid,))
    if _res:
        _res_2 = cursor.fetchone()
    else:
        _res_2 = ('', '')
    cursor.close()
    conn.close()
    _res = list()
    _res.extend(_res_1)
    _res.extend(_res_2)
    return _res
예제 #3
0
def init_white_list():
    """Build and return {qyer_url_id: attr_id} for every qyer attraction row.

    The qyer poi id is parsed out of the stored detail-page URL; rows whose
    URL does not match the expected pattern are logged and skipped.
    """
    _dict = {}
    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    query_sql = '''SELECT
  id,
  url
FROM attr
WHERE source = 'qyer';'''
    cursor.execute(query_sql)
    # FIX: raw string -- '\s' in a plain literal is an invalid escape
    # sequence (SyntaxWarning on modern Python); also compiled once instead
    # of re-parsed on every row.
    _url_pattern = re.compile(r'place.qyer.com/poi/([\s\S]+)/')
    _count = 0
    for line in cursor.fetchall():
        _count += 1
        if _count % 2000 == 0:
            # progress heartbeat
            logger.debug("[init dict][count: {}]".format(_count))
        try:
            # IndexError on non-matching URLs is handled by the except below
            _url_id = _url_pattern.findall(line['url'])[0]
            _dict[_url_id] = line['id']
        except Exception as exc:
            logger.exception(msg="[init qyer id dict error]", exc_info=exc)
    cursor.close()
    conn.close()
    logger.debug("[init dict][count: {}]".format(_count))
    return _dict
예제 #4
0
def get_id_map_info(table_name):
    """Collect (new_map_info, unified_id) pairs for daodao-supplemented rows
    of *table_name* and push them to update_db() in chunks of 1000.
    """
    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    cursor.execute("""SELECT
  attr_unid.id,
  attr_unid.source,
  attr_unid.source_id,
  attr_unid.map_info                                    AS old_map_info,
  ServicePlatform.{0}.map_info AS new_map_info,
  ServicePlatform.{0}.address
FROM attr_unid
  JOIN ServicePlatform.{0}
    ON attr_unid.source = ServicePlatform.{0}.source AND
       attr_unid.source_id = ServicePlatform.{0}.id
WHERE ServicePlatform.{0}.id IN (SELECT DISTINCT sid
                                                          FROM ServicePlatform.supplement_field
                                                          WHERE SOURCE = 'daodao' AND
                                                                TABLE_NAME =
                                                                '{0}');""".
                   format(table_name))

    data = []
    for row in cursor.fetchall():
        data.append((row['new_map_info'], row['id']))
        # progress heartbeat every 1000 collected rows
        if len(data) % 1000 == 0:
            logger.debug("[now data][count: {}]".format(len(data)))
    logger.debug("[now data][count: {}]".format(len(data)))

    for batch in chunks(data, 1000):
        update_db(batch)
    cursor.close()
    conn.close()
예제 #5
0
def init_white_list_data():
    """Load the full source rows for every white-listed (source, id) pair
    into the global white_list_data cache, keyed by (source, source_id).
    """
    global white_list
    global white_list_data

    pairs = []
    for entry in white_list:
        pairs.extend(entry)

    if not pairs:
        return

    # get whole source data
    _t = time.time()
    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    sql = '''SELECT *
FROM {}
WHERE (source, id) IN ({});'''.format(
        poi_type, ','.join(map(lambda x: "('{}', '{}')".format(*x), pairs)))
    cursor.execute(sql)
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    for row in cursor.fetchall():
        white_list_data[(row['source'], row['id'])] = row
    cursor.close()
    conn.close()
예제 #6
0
def update_sql(data):
    """Batch-delete chat_attraction rows by id.

    :param data: iterable of (id,) tuples for executemany.

    BUG FIX: the DELETE ran without a commit, unlike every other writer in
    this module; with non-autocommit pooled connections the deletions were
    silently rolled back when the connection was returned to the pool.
    """
    sql = '''DELETE FROM chat_attraction WHERE id=%s;'''
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    _res = cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    logger.info("[total: {}][execute: {}]".format(len(data), _res))
예제 #7
0
def update_already_merge_city(poi_type, cid):
    """Record that city *cid* has been merged for *poi_type*.

    :return: affected row count from the REPLACE.
    """
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    affected = cursor.execute(
        '''REPLACE INTO already_merged_city VALUES (%s,%s)''', (poi_type, cid))
    conn.commit()
    cursor.close()
    conn.close()
    return affected
예제 #8
0
def init_already_merged_city(poi_type):
    """Return the list of city ids already merged for *poi_type*.

    FIX: the type filter is now a bound parameter instead of being
    interpolated into the SQL string (quoting / SQL-injection hazard).
    """
    _conn = poi_ori_pool.connection()
    _cursor = _conn.cursor()
    _cursor.execute(
        '''SELECT * FROM already_merged_city WHERE type=%s;''', (poi_type,))
    # column 1 is the city id
    res = [row[1] for row in _cursor.fetchall()]
    _cursor.close()
    _conn.close()
    return res
예제 #9
0
def insert_white_list(data):
    """REPLACE the given (type, md5, info) triples into the white_list table
    and log how many rows were written.
    """
    connection = poi_ori_pool.connection()
    cur = connection.cursor()
    inserted = cur.executemany(
        '''REPLACE INTO white_list (type, md5, info) VALUES (%s, %s, %s)''',
        data)
    connection.commit()
    logger.debug("[insert white list][total: {}][insert: {}]".format(
        len(data), inserted))
    cur.close()
    connection.close()
예제 #10
0
def update(data):
    """Batch-update beentocount/plantocount on chat_attraction.

    :param data: iterable of (beentocount, plantocount, id) tuples.
    """
    update_stmt = '''UPDATE chat_attraction
SET beentocount = %s, plantocount = %s
WHERE id = %s;'''
    connection = poi_ori_pool.connection()
    cur = connection.cursor()
    affected = cur.executemany(update_stmt, data)
    connection.commit()
    cur.close()
    connection.close()
    print("[total: {}][execute: {}]".format(len(data), affected))
예제 #11
0
def insert_fake_uid(uid, sid, city_id):
    """Insert one placeholder unified-id row for a qyer poi.

    :raises Exception: on any DB error (original cause chained).

    FIXES: the original ``raise Exception()`` discarded the underlying DB
    error, and the pooled connection/cursor leaked on failure.
    """
    sql = '''INSERT INTO attr_unid_1220 (`id`, `source`, `source_id`, `city_id`) VALUES (%s, %s, %s, %s);'''
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    try:
        _res = cursor.execute(sql, (uid, 'qyer', sid, city_id))
        conn.commit()
    except Exception as exc:
        print(sql)
        # chain the original error instead of silently discarding it
        raise Exception() from exc
    finally:
        # always release the pooled connection
        cursor.close()
        conn.close()
    logger.info("[total: {}][execute: {}]".format(1, _res))
예제 #12
0
def update_sql(data):
    """Update beentocount/plantocount/commentcount for one chat_attraction row.

    :param data: (beentocount, plantocount, commentcount, id) tuple.
    :raises Exception: on any DB error (original cause chained).

    FIXES: the original ``raise Exception()`` discarded the underlying DB
    error, and the pooled connection/cursor leaked on failure.
    """
    sql = '''UPDATE chat_attraction
SET beentocount = %s, plantocount = %s, commentcount = %s
WHERE id = %s;'''
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    try:
        _res = cursor.execute(sql, data)
        conn.commit()
    except Exception as exc:
        print(sql)
        # chain the original error instead of silently discarding it
        raise Exception() from exc
    finally:
        # always release the pooled connection
        cursor.close()
        conn.close()
    logger.info("[total: {}][execute: {}]".format(1, _res))
예제 #13
0
def update_outlets(uid, name, cid):
    """Write the merged outlet name/city back to both copies of chat_shopping
    (spider base-data copy and poi_ori copy).

    BUG FIX: both UPDATEs ran without a commit, unlike every other writer in
    this module; with non-autocommit pooled connections the changes were lost
    when the connection was returned to the pool.
    """
    conn = spider_data_base_data_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''UPDATE chat_shopping
SET name = %s, tag_id = 9, city_id = %s
WHERE id = %s;''', (name, cid, uid))
    conn.commit()
    cursor.close()
    conn.close()

    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''UPDATE chat_shopping
SET name = %s, norm_tagid = '奥特莱斯', norm_tagid_en = 'Outlet', city_id = %s
WHERE id = %s;''', (name, cid, uid))
    conn.commit()
    cursor.close()
    conn.close()
예제 #14
0
def insert_unknown_keywords(_type, _keyword_or_keywords):
    """Store one keyword or a collection of keywords into unknown_keywords.

    Accepts a single str or a list/set/tuple of them; anything else is only
    logged.  Each keyword is hashed via encode() for the key_hash column.
    """
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    sql = '''INSERT IGNORE INTO unknown_keywords (`type`, `key_hash`, `keywords`) VALUES (%s, %s, %s);'''
    # normalize the argument to a list so a single loop handles both shapes
    if isinstance(_keyword_or_keywords, str):
        keywords = [_keyword_or_keywords]
    elif isinstance(_keyword_or_keywords, (list, set, tuple)):
        keywords = list(_keyword_or_keywords)
    else:
        keywords = []
        logger.debug(
            "[unknown _keyword_or_keywords type: {}][_type: {}][_keyword_or_keywords: {}]"
            .format(type(_keyword_or_keywords), _type, _keyword_or_keywords))
    for keyword in keywords:
        cursor.execute(sql, (_type, encode(keyword), keyword))
    conn.commit()
    cursor.close()
    conn.close()
예제 #15
0
def get_qyer_grade_ranking():
    """Return {unified_id: (grade, ranking)} for qyer attractions.

    A falsy grade maps to -1.0 and a falsy ranking to -1.
    """
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    query_sql = '''SELECT
  attr_unid.id,
  attr.grade,
  attr.ranking
FROM attr
  JOIN attr_unid ON attr_unid.source = attr.source AND attr_unid.source_id = attr.id
WHERE attr.source = 'qyer';'''
    cursor.execute(query_sql)
    result = {
        uid: (grade if grade else -1.0, ranking if ranking else -1)
        for uid, grade, ranking in cursor.fetchall()
    }
    cursor.close()
    conn.close()
    return result
예제 #16
0
def get_id():
    """Backfill beentocount/plantocount on chat_attraction rows that still
    hold the literal two-character placeholder string '{}' for both counters.

    NOTE(review): the SQL below is never passed through str.format, so the
    WHERE clause really compares against the literal string '{}' --
    presumably a placeholder written at import time; confirm before changing.
    """
    cross_dict = generate_cross_dict()
    _e_sid_set = set()  # source ids with no entry in cross_dict
    _e_uid_set = set()  # uids that received at least one non-placeholder counter
    sql = '''SELECT
  chat_attraction.id,
  chat_attraction.name,
  chat_attraction.name_en,
  attr_unid.source,
  attr_unid.source_id
FROM chat_attraction, attr_unid
WHERE beentocount = '{}' AND plantocount = '{}' AND data_source LIKE '%qyer%' AND attr_unid.source = 'qyer' AND
      chat_attraction.id = attr_unid.id;'''
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    cursor.execute(sql)
    data = []
    for uid, name, name_en, source, sid in cursor.fetchall():
        _tmp_res = cross_dict.get(sid)
        if _tmp_res:
            new_beentocounts, new_plantocounts = _tmp_res
            if new_beentocounts == '{}' and new_plantocounts == '{}':
                # still placeholders -- written back unchanged below
                pass
            else:
                _e_uid_set.add(uid)
        else:
            # no crawl data for this source id: skip without appending
            print('[error sid: {}]'.format(sid))
            _e_sid_set.add(sid)
            continue
        data.append((new_beentocounts, new_plantocounts, uid))
        # flush to the DB in batches of 1000
        if len(data) == 1000:
            update(data=data)
            data = []
    if data:
        # flush the final partial batch
        update(data=data)
    cursor.close()
    conn.close()
    print(_e_sid_set)
    print(_e_uid_set)
예제 #17
0
def filter_data_already_online(_type, _mioji_id, error):
    """Record that an already-online poi was filtered out, together with its
    current online status.

    :param _type: poi kind, one of 'attr' / 'rest' / 'shop'.
    :param _mioji_id: unified poi id.
    :param error: reason string stored alongside the record.
    :return: False when the poi has no online row or no status; None otherwise.
    :raises TypeError: for an unknown *_type*.

    Both DB steps are deliberately best-effort: errors are logged, not raised.
    """
    if _type == 'attr':
        table_name = 'chat_attraction'
    elif _type == 'rest':
        table_name = 'chat_restaurant'
    elif _type == 'shop':
        table_name = 'chat_shopping'
    else:
        raise TypeError("Unknown Type: {}".format(_type))

    _status = None
    try:
        conn = base_data_pool.connection()
        cursor = conn.cursor()
        # FIX: bind the id instead of interpolating it into the SQL string
        # (quoting / SQL-injection hazard); the table name stays formatted
        # because identifiers cannot be bound.
        sql = '''SELECT status_test FROM {} WHERE id=%s;'''.format(table_name)
        _res = cursor.execute(sql, (_mioji_id,))
        if not _res:
            return False
        _status = cursor.fetchone()[0]
        logger.debug("[type: {}][status: {}]".format(_type, _status))
        cursor.close()
        conn.close()
    except Exception as exc:
        logger.exception(msg="[get online poi status error]", exc_info=exc)

    if not _status:
        return False

    try:
        conn = poi_ori_pool.connection()
        cursor = conn.cursor()
        sql = '''INSERT IGNORE INTO filter_data_already_online (type, mioji_id, error, status) VALUES (%s, %s, %s, %s);'''
        cursor.execute(sql, (_type, _mioji_id, error, _status))
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as exc:
        logger.exception(msg="[insert filter data already online error]", exc_info=exc)
예제 #18
0
def update_each_tag_id():
    """Recompute tag_id/tagB for every row of the task table.

    For each row with a legal norm_tagid, the '|'-separated raw tags are
    mapped through tag2id(), joined back into small_tag, grouped into
    big_tag via get_tag(), and written back to base_data in batches.
    """
    tag_id = tag2id()  # raw tag -> normalized tag id
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT id,norm_tagid
FROM {};'''.format(task_table))
    data = []
    _count = 0
    for _id, _tag_id in cursor.fetchall():
        if is_legal(_tag_id):
            tag_id_set = set()
            for each in _tag_id.split('|'):
                # tag_id.get may yield None for unknown tags; filtered below
                tag_id_set.add(tag_id.get(each))
            small_tag = ('|'.join(filter(lambda x: is_legal(x), tag_id_set)))
            big_tag = get_tag(small_tag)
            data.append((small_tag, big_tag, _id))
            _count += 1
            # flush every 1000 accumulated rows, reusing the same cursor
            if len(data) % 1000 == 0:
                logger.debug("[mk data][poi_type: {}][len: {}]".format(
                    poi_type, _count))
                res = cursor.executemany(
                    'update base_data.{} set tag_id=%s, tagB=%s where id=%s'.
                    format(task_table), data)
                data = []
                logger.debug(
                    "[update tag id][table_name: {}][update count: {}]".format(
                        task_table, res))

    # flush the final partial batch
    res = cursor.executemany(
        'update base_data.{} set tag_id=%s, tagB=%s where id=%s'.format(
            task_table), data)
    logger.debug("[update tag id][table_name: {}][update count: {}]".format(
        task_table, res))
    logger.debug("[mk data finished][poi_type: {}][len: {}]".format(
        poi_type, _count))
    conn.commit()
    cursor.close()
    conn.close()
예제 #19
0
파일: poi_merge.py 프로젝트: 20113261/p_m
def get_max_id():
    """Return the larger max(id) of the online table and the merged
    {poi_type}_unid table, so newly assigned ids never collide.

    :raises Exception: when neither table yields a max id.

    FIX: cursors are now closed before the pooled connections are released
    (they were previously leaked).
    """
    global poi_type
    global online_table_name
    global data_source_table
    conn = base_data_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT max(id) FROM {};'''.format(online_table_name))
    _id_online = cursor.fetchone()[0]
    cursor.close()
    conn.close()

    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT max(id) FROM {}_unid;'''.format(poi_type))
    _id_merged = cursor.fetchone()[0]
    cursor.close()
    conn.close()
    if _id_merged and _id_online:
        return max(_id_online, _id_merged)
    elif _id_merged:
        return _id_merged
    elif _id_online:
        return _id_online
    else:
        raise Exception("无法获取最大 id ,不能进行融合")
예제 #20
0
def init_white_list():
    """Populate the global white_list with one set of (source, source_id)
    pairs parsed from the white_list table for the current poi_type.

    Does nothing when white_list is already populated.
    """
    global white_list
    if white_list:
        return
    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    cursor.execute('''SELECT info
FROM white_list
WHERE type = %s;''', (poi_type, ))

    pairs = set()
    for row in cursor.fetchall():
        if not row:
            continue
        info = json.loads(row['info'])
        for src in ('qyer', 'daodao'):
            if src in info:
                pairs.add((src, str(info[src])))
    white_list.append(pairs)
    cursor.close()
    conn.close()
예제 #21
0
def get_poi_union_info(cid):
    """Return [(miaoji_id, city_id, union_info), ...] for every merged poi of
    the given city, where union_info concatenates the fused source|source_id
    pairs separated by '|_||_|'.
    """
    conn = poi_ori_pool.connection()
    # fetch every merged row of this city used for fusion

    logger.info("[get city task][cid: {}]".format(cid))
    cursor = conn.cursor(cursor=DictCursor)
    sql = '''SELECT
id,
city_id,
group_concat(concat(source, '|', source_id) SEPARATOR '|_||_|') AS union_info
FROM {}_unid WHERE city_id='{}'
GROUP BY id'''.format(poi_type, cid)
    _t = time.time()
    cursor.execute(sql)
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    city_poi = [(row['id'], row['city_id'], row['union_info'])
                for row in cursor.fetchall()]
    cursor.close()
    conn.close()
    return city_poi
예제 #22
0
def get_poi_dict(city_id):
    """Collect all poi info for one city from three databases.

    :param city_id: city mioji id
    :return: (_poi_dict keyed by (source, source_id),
              _online_official_data keyed by id,
              _online_nonofficial_data keyed by id)
    """
    global white_list_data
    _poi_dict = defaultdict(dict)
    _online_official_data = {}
    _online_nonofficial_data = {}
    # get whole data process
    conn = data_process_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    sql = '''SELECT *
FROM {}
WHERE city_id='{}';'''.format(data_process_table_name, city_id)
    _t = time.time()
    cursor.execute(sql)
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    for each in cursor.fetchall():
        # split rows by the 'official' flag
        if int(each["official"]) == 0:
            _online_nonofficial_data[each['id']] = each
        else:
            _online_official_data[each['id']] = each
    cursor.close()
    conn.close()

    # get whole base data, and update data process result
    # NOTE(review): this query reuses data_process_table_name against
    # base_data_pool -- confirm the same table name exists in both schemas.
    conn = base_data_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    sql = '''SELECT *
    FROM {}
    WHERE city_id='{}';'''.format(data_process_table_name, city_id)
    _t = time.time()
    cursor.execute(sql)
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    for each in cursor.fetchall():
        # drop tag_id so the data-process value is not overwritten
        each.pop('tag_id')
        # NOTE(review): raises KeyError if a base-data id was absent from the
        # first query's result -- presumably guaranteed upstream; confirm.
        if int(each["official"]) == 0:
            _online_nonofficial_data[each['id']].update(each)
        else:
            _online_official_data[each['id']].update(each)
    cursor.close()
    conn.close()

    # get whole source data
    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    sql = "select * from {} where city_id='{}'".format(poi_type, city_id)
    _t = time.time()
    cursor.execute(sql)
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
    for line in cursor.fetchall():
        source = line['source']
        source_id = line['id']
        _poi_dict[(source, source_id)] = line
    cursor.close()
    conn.close()

    # white-listed rows (possibly from other cities) override / extend
    for k, v in white_list_data.items():
        _poi_dict[k] = v
    return _poi_dict, _online_official_data, _online_nonofficial_data
예제 #23
0
파일: poi_merge.py 프로젝트: 20113261/p_m
def insert_poi_unid(merged_dict, cid_or_geohash):
    """REPLACE the merged poi rows for one city into the {poi_type}_unid table.

    :param merged_dict: {merged_uid: {(source, source_id), ...}} fusion
        result; every (source, source_id) pair becomes one output row.
    :param cid_or_geohash: city id used both to look up city/country display
        info and to scope the bulk source queries.

    FIX: the white-list query used to log (and report exceptions with) the
    stale ``sql`` variable from the previous query instead of the
    ``query_sql`` it actually executed.  Large blocks of dead commented-out
    code were also removed.
    """
    global white_list
    global online_table_name
    global data_source_table
    start = time.time()
    # city / country display columns repeated on every inserted row
    _dev_conn = base_data_pool.connection()
    _dev_cursor = _dev_conn.cursor()
    _dev_cursor.execute('''SELECT
  city.id       AS cid,
  city.name     AS city_name,
  country.name  AS country_name,
  city.map_info AS map_info
FROM city
  JOIN country ON city.country_id = country.mid
WHERE city.id = {};'''.format(cid_or_geohash))
    cid, city_name, country, city_map_info = _dev_cursor.fetchone()
    _dev_cursor.close()
    _dev_conn.close()

    # (source, id) -> (name, name_en, map_info, grade, star, ranking, address, url)
    total_data = {}

    # online rows: star and url are not stored online, hence the -1 / ''
    # literal columns keeping the tuple shape uniform
    _dev_conn = base_data_pool.connection()
    _dev_cursor = _dev_conn.cursor()
    try:
        _t = time.time()
        sql = '''SELECT
  id,
  name,
  name_en,
  map_info,
  grade,
  -1,
  ranking,
  address,
  ''
FROM {} WHERE city_id='{}';'''.format(online_table_name, cid_or_geohash)
        _dev_cursor.execute(sql)
        logger.debug('[query][sql: {}][takes: {}]'.format(
            sql,
            time.time() - _t))
    except Exception as exc:
        logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc)

    for line in _dev_cursor.fetchall():
        total_data[('online', line[0])] = line[1:]
    _dev_cursor.close()
    _dev_conn.close()

    # crawled source rows for the same city
    _data_conn = poi_ori_pool.connection()
    _data_cursor = _data_conn.cursor()
    try:
        _t = time.time()
        sql = '''SELECT
source,
id,
name,
name_en,
map_info,
grade,
star,
ranking,
address,
url
FROM {}
WHERE city_id='{}';'''.format(data_source_table, cid_or_geohash)
        _data_cursor.execute(sql)
        logger.debug('[query][sql: {}][takes: {}]'.format(
            sql,
            time.time() - _t))
    except Exception as exc:
        logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc)
    for line in _data_cursor.fetchall():
        total_data[(line[0], line[1])] = line[2:]
    _data_cursor.close()
    _data_conn.close()

    # white-listed (source, id) pairs may live outside this city -- fetch
    # them explicitly so the total_data lookup below cannot miss them
    if white_list:
        _s_sid = []
        for _each in white_list:
            _s_sid.extend(_each)

        _ori_conn = poi_ori_pool.connection()
        _ori_cursor = _ori_conn.cursor()
        try:
            _t = time.time()
            query_sql = '''SELECT
  source,
  id,
  name,
  name_en,
  map_info,
  grade,
  star,
  ranking,
  address,
  url
FROM {}
WHERE (source, id) IN ({});'''.format(
                data_source_table,
                ','.join(map(lambda x: "('{}', '{}')".format(*x), _s_sid)))
            _ori_cursor.execute(query_sql)
            # BUG FIX: previously logged the stale `sql` variable here
            logger.debug('[query][sql: {}][takes: {}]'.format(
                query_sql,
                time.time() - _t))
        except Exception as exc:
            # BUG FIX: previously reported the stale `sql` variable here
            logger.exception("[sql exc][sql: {}]".format(query_sql), exc_info=exc)
        for line in _ori_cursor.fetchall():
            total_data[(line[0], line[1])] = line[2:]
        _ori_cursor.close()
        _ori_conn.close()

    data = []
    for uid, s_sid_set in merged_dict.items():
        for source, sid in s_sid_set:
            # name name_en map_info grade star ranking address url
            name, name_en, map_info, grade, star, ranking, address, url = total_data[
                (source, sid)]
            # normalize illegal / placeholder values before insert
            if not is_legal(name):
                name = ''
            if not is_legal(name_en):
                name_en = ''
            if not is_legal(grade):
                grade = -1.0
            if not is_legal(star):
                star = -1.0
            if not is_legal(ranking):
                ranking = -1.0
            if not is_legal(address):
                address = ''
            if not is_legal(url):
                url = ''

            data.append(
                (uid, cid, city_name, country, city_map_info, source, sid,
                 name, name_en, map_info, grade, star, ranking, address, url))

    _final_conn = poi_ori_pool.connection()
    _final_cursor = _final_conn.cursor()
    try:
        _t = time.time()
        sql = '''REPLACE INTO {}_unid (id, city_id, city_name, country_name, city_map_info, source, source_id, name, name_en, map_info, grade, star, ranking, address, url)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''.format(
            poi_type)
        _final_cursor.executemany(sql, data)
        logger.debug('[query][sql: {}][takes: {}]'.format(
            sql,
            time.time() - _t))
    except Exception as exc:
        logger.exception("[insert unid table error]", exc_info=exc)
    _final_conn.commit()
    _final_cursor.close()
    _final_conn.close()

    logger.info(
        "[finish prepare data][city: {}][line_count: {}][takes: {}]".format(
            cid_or_geohash, len(data),
            time.time() - start))
0
def poi_merged_report(poi_type):
    """Produce a per-city data-quality report for merged poi rows.

    Validates every row of the merged table (name, city_id, map_info) per
    city, accumulates counts, and writes the report to both .xlsx and .csv.

    NOTE(review): img_succeed_count, first_img_null_fail_count and
    norm_tag_null_fail_count are initialized but never incremented, so the
    image-ratio column is always 0% -- looks like unfinished checks; confirm.
    """
    init_global_name(poi_type)

    conn = poi_ori_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)

    get_column_sql = "SELECT column_name FROM information_schema.columns WHERE table_schema = DATABASE() AND table_name='{}'".format(
        table_name)
    cursor.execute(get_column_sql)

    datas = cursor.fetchall()

    column_list = []

    filter_info = []
    error_data_id = []

    for data in datas:
        column_list.append(data['column_name'])

    report_data = pandas.DataFrame(columns=[
        'ID', '城市名', '国家', '城市 grade', '开发库中{}个数'.format(poi_name),
        'test 上线{}个数'.format(poi_name), '上线{}占比'.format(
            poi_name), '上线{}有图占比'.format(poi_name), '开发库中 daodao {}个数'.format(
                poi_name), '上线 daodao 个数', '上线 daodao {}占比'.format(poi_name)
    ])
    report_count = 0
    for cid, city_name, city_name_en, country, grade in init_city_dict():
        get_sql = "select * from {} where city_id = '{}'".format(
            table_name, cid)
        cursor.execute(get_sql)
        datas = cursor.fetchall()
        name_null_fail_count = 0
        city_null_fail_count = 0
        first_img_null_fail_count = 0
        norm_tag_null_fail_count = 0
        map_fail_count = 0

        img_succeed_count = 0
        daodao_count = 0
        daodao_succeed_count = 0
        success_count = 0
        for data in datas:
            # daodao statistics
            if 'daodao' in data['data_source']:
                daodao_count += 1

            # data validation checks

            # name: fail when both name and name_en are empty/placeholder
            if data['name'].lower() in (
                    '', 'null', '0') and data['name_en'] in ('', 'null', '0'):
                name_null_fail_count += 1
                error_data_id.append(data['id'])
                continue

            # city
            if data['city_id'] in ('', 'null', '0'):
                city_null_fail_count += 1
                error_data_id.append(data['id'])
                continue

            # map_info: float() is used purely as a validity check here
            try:
                lat = float(data['map_info'].strip().split(',')[0])
                lgt = float(data['map_info'].strip().split(',')[1])
            except Exception:
                map_fail_count += 1
                error_data_id.append(data['id'])
                continue

            if 'daodao' in data['data_source']:
                daodao_succeed_count += 1

            success_count += 1

        if success_count != len(datas):
            filter_info.append((cid, city_name, country))

        report_data.loc[report_count] = [
            cid, city_name, country, grade,
            len(datas), success_count,
            "{0:04f}%".format(100 * success_count /
                              float(len(datas))) if len(datas) != 0 else '无穷大',
            "{0:04f}%".format(100 * img_succeed_count / float(success_count))
            if success_count != 0 else '无穷大', daodao_count,
            daodao_succeed_count,
            "{0:04f}%".format(100 * daodao_succeed_count / float(daodao_count))
            if daodao_count != 0 else '无穷大'
        ]
        report_count += 1
        logger.debug(' '.join([cid, city_name, city_name_en, country, grade]) +
                     '\tall:' + str(len(datas)) + "\tget:" +
                     str(success_count) + '\tname_null:' +
                     str(name_null_fail_count) + '\tcity_null:' +
                     str(city_null_fail_count) + '\timg_null:' +
                     str(first_img_null_fail_count) + '\tmap:' +
                     str(map_fail_count) + '\tnorm_tag:' +
                     str(norm_tag_null_fail_count))

    for each in filter_info:
        logger.debug("[data_filter: {}]".format(each))

    logger.debug("[error data][id: {}]".format(error_data_id))

    pandas.DataFrame(report_data).to_excel(datetime.datetime.now().strftime(
        '{}_report_%Y_%m_%d.xlsx'.format(poi_name)))
    pandas.DataFrame(report_data).to_csv(datetime.datetime.now().strftime(
        '{}_report_%Y_%m_%d.csv'.format(poi_name)))
예제 #25
0
def poi_insert_data(cid, _poi_type):
    """Merge multi-source POI data for one city and upsert the result.

    Args:
        cid: city id whose POI records should be merged.
        _poi_type: one of 'attr' / 'rest' / 'shop'; selects the target
            schema (module globals are switched via ``init_global_name``).

    Side effects: writes merged rows into ``data_process_table_name``
    through ``dataset`` upserts (flushed every 300 rows) and finally marks
    the city as merged via ``update_already_merge_city``.
    """
    init_global_name(_poi_type)
    '''
    数据最终入库表
    if _poi_type == 'attr':
        sql = 'replace into chat_attraction(`id`,`name`,`name_en`,`data_source`,`city_id`,' \
              '`map_info`,`address`,`star`,`plantocount`,`beentocount`,`real_ranking`,' \
              '`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,`phone`,`introduction`,' \
              '`open`, `open_desc`,`recommend_lv`,`prize`,`traveler_choice`, `alias`, ' \
              '`image`, `ori_grade`,`nearCity`, `ranking`,`rcmd_open`,`add_info`,`address_en`,`event_mark`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,-1,"","","","")'
    elif _poi_type == 'rest':
        sql = 'replace into chat_restaurant(`id`,`name`,`name_en`,' \
              '`source`,`city_id`,`map_info`,`address`,`real_ranking`,' \
              '`grade`,`res_url`,`telphone`,`introduction`,`open_time`,`open_time_desc`,`prize`,' \
              '`traveler_choice`,`review_num`,`price`,`price_level`,`cuisines`, ' \
              '`image_urls`,`tagid`,`norm_tagid`,`norm_tagid_en`,`nearCity`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    elif _poi_type == 'shop':
        sql = 'replace into ' \
              'chat_shopping(`id`,`name`,`name_en`,`data_source`,`city_id`,' \
              '`map_info`,`address`,`star`,`plantocount`,`beentocount`,' \
              '`real_ranking`,`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,' \
              '`phone`,`introduction`,`open`,`open_desc`,`recommend_lv`,`prize`,' \
              '`traveler_choice`,`image`,`nearCity`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    else:
        raise TypeError("Unknown Type: {}".format(poi_type))
    '''

    conn = poi_ori_pool.connection()
    # for task_dict in get_task(cid):
    count = 0
    data = []

    # Fetch the POI information needed for merging
    _info_dict, _online_official_data, _online_nonofficial_data = get_poi_dict(
        cid)
    _city_poi = get_poi_union_info(cid)
    # Start merging data
    for miaoji_id, city_id, union_info in _city_poi:
        # Reset the per-POI merge state
        data_dict = defaultdict(dict)

        # Whether any data came from other (non-online) sources
        other_source = False

        # Flags marking the presence of online official / nonofficial data
        has_official = False
        has_nonofficial = False

        # Fetch the online-environment data for this POI
        o_official_data = _online_official_data.get(miaoji_id, None)
        o_nonofficial_data = _online_nonofficial_data.get(miaoji_id, None)

        # Update the official / nonofficial flags
        if o_official_data is not None:
            has_official = True
        if o_nonofficial_data is not None:
            has_nonofficial = True

        # Seed the merge dict from the online data
        for each_name in (json_name_list + norm_name_list + others_name_list):
            data_dict[each_name] = {}

            def get_data(src_dict, is_official=False):
                # Map the online column name back to its source column name.
                if each_name in online2source:
                    source_name = online2source[each_name]
                else:
                    source_name = each_name

                if each_name in json_name_list:
                    # JSON columns: keep only entries from available sources.
                    if source_name in src_dict:
                        try:
                            _res = json.loads(src_dict[source_name])
                            if isinstance(_res, dict):
                                data_dict[each_name] = {
                                    k: v
                                    for k, v in _res.items()
                                    if k in available_source
                                }
                            else:
                                pass
                        except Exception:
                            pass
                else:
                    # Plain columns: store under a pseudo-source key.
                    data_dict[each_name]['mioji_official' if is_official else
                                         'mioji_nonofficial'] = src_dict.get(
                                             source_name, {})

            if o_official_data is not None:
                get_data(o_official_data, is_official=True)
            if o_nonofficial_data is not None:
                get_data(o_nonofficial_data, is_official=False)

        # Walk every (source, id) pair to merge and build the dict contents
        for s_sid in union_info.split('|_||_|'):
            source, source_id = s_sid.split('|')

            # todo add online handling: load data first, then update
            # todo use online base data to update data-process fields

            # Union id info missing
            if not source_id or not source:
                continue

            # Filter out unwanted sources
            if source not in available_source:
                logger.debug("[not available source: {}]".format(source))
                continue

            # No merge data obtained for this pair
            poi_info = _info_dict[(source, source_id)]
            if poi_info == {}:
                continue

            other_source = True

            # Add the per-source values
            for each_name in (json_name_list + norm_name_list +
                              others_name_list):
                if is_legal(poi_info[each_name]):
                    if isinstance(poi_info[each_name], str):
                        data_dict[each_name][source] = tradition2simple(
                            poi_info[each_name]).decode()
                    else:
                        data_dict[each_name][source] = poi_info[each_name]

        # Fill in any fields that are still missing
        for each_name in (json_name_list + norm_name_list + others_name_list):
            if each_name not in data_dict:
                data_dict[each_name] = {}

        # Nothing available to merge (neither online nor other sources)
        if not o_official_data and not o_nonofficial_data and not other_source:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "没有可供融合的数据")
            logger.debug('[union_info: {}]'.format(union_info))
            continue

        new_data_dict = {}

        # Pick a name by priority — Chinese preferred!
        def get_name_by_priority():
            # Update the field by the standard priority order
            name_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name'], norm_name, '', special_filter=check_chinese)
            # Look for Chinese in the English field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'],
                    norm_name,
                    '',
                    special_filter=check_chinese)
            # Look for Latin in the English field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'],
                    norm_name,
                    '',
                    special_filter=check_latin)
            # Look for Latin in the Chinese field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name'],
                    norm_name,
                    '',
                    special_filter=check_latin)
            return name_tmp

        # Pick an English name by priority — Latin characters only!
        def get_name_en_by_priority():
            # Take it from the merged English field first
            name_en_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name_en'],
                norm_name,
                '',
                special_filter=check_latin)
            # BUGFIX: the fallback result was previously discarded, so the
            # Chinese-field fallback never took effect.
            if not name_en_tmp:
                name_en_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name'],
                    norm_name,
                    '',
                    special_filter=check_latin)
            return name_en_tmp

        for norm_name in norm_name_list:
            # name / name_en get special handling during field processing
            if norm_name == 'name':
                if has_official:
                    # official == 1 points: never update name
                    new_data_dict['name'] = data_dict['name']['mioji_official']
                elif has_nonofficial:
                    # official == 0 points: keep name if already Chinese
                    if any([
                            toolbox.Common.is_chinese(c)
                            for c in data_dict['name']['mioji_nonofficial']
                    ]):
                        new_data_dict['name'] = data_dict['name'][
                            'mioji_nonofficial']
                    else:
                        new_data_dict['name'] = get_name_by_priority()
                else:
                    # Update the field by the standard priority order
                    new_data_dict['name'] = get_name_by_priority()
            elif norm_name == 'name_en':
                # official == 1 keeps name_en; otherwise pick by priority
                if has_official:
                    new_data_dict['name_en'] = data_dict['name_en'][
                        'mioji_official']
                else:
                    new_data_dict['name_en'] = get_name_en_by_priority()

            else:
                new_data_dict[
                    norm_name] = get_key.get_key_by_priority_or_default(
                        data_dict[norm_name], norm_name, '')

        # daodao url handling: point at the .cn domain
        if 'daodao' in data_dict['url']:
            data_dict['url']['daodao'] = data_dict['url']['daodao'].replace(
                'www.tripadvisor.com.hk', 'www.tripadvisor.cn')

        # Restaurants derive tagid from cuisines
        if poi_type == 'rest':
            data_dict['tagid'] = copy.deepcopy(data_dict['cuisines'])
            new_data_dict['tagid'] = json.dumps({
                k: v
                for k, v in data_dict['tagid'].items() if k in final_source
            })

        for json_name in json_name_list:
            new_data_dict[json_name] = json.dumps({
                k: v
                for k, v in data_dict[json_name].items() if k in final_source
            })

        new_data_dict['phone'] = new_data_dict['phone'].replace('电话号码:',
                                                                '').strip()

        # Data manipulation section
        # ori_grade modify
        tmp_ori_grade = {}

        if has_official:
            try:
                tmp_ori_grade.update(json.loads(o_official_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)

        if has_nonofficial:
            try:
                tmp_ori_grade.update(
                    json.loads(o_nonofficial_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)

        tmp_ori_grade.update({k: v for k, v in data_dict['grade'].items()})
        new_data_dict['ori_grade'] = json.dumps(
            {k: v
             for k, v in tmp_ori_grade.items() if k in final_source})

        # Build the source field
        source = '|'.join(
            map(lambda x: x.split('|')[0], union_info.split('|_||_|')))

        # add alias
        alias = '|'.join(
            filter(
                lambda x: x != new_data_dict['name'] and x != new_data_dict[
                    'name_en'],
                set(
                    list(data_dict['name'].values()) +
                    list(data_dict['name_en'].values()))))

        # add open time
        final_open_time_desc = get_key.get_key_by_priority_or_default(
            data_dict['opentime'],
            'opentime',
            special_filter=add_open_time_filter)
        if final_open_time_desc:
            norm_open_time = fix_daodao_open_time(final_open_time_desc)
        else:
            norm_open_time = ''

        # add norm tag
        # todo change make qyer and other can be used
        unknown_tag = set()
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_tagid, daodao_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['daodao'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                daodao_tagid, daodao_tagid_en = '', ''
        else:
            daodao_tagid, daodao_tagid_en = '', ''

        # # rest tag
        # if 'daodao' in data_dict['tagid']:
        #     try:
        #         daodao_rest_tagid, daodao_rest_tagid_en, _ = get_norm_tag(data_dict['tagid']['daodao'],
        #                                                                   'rest')
        #     except Exception:
        #         daodao_rest_tagid, daodao_rest_tagid_en = '', ''
        # else:
        #     daodao_rest_tagid, daodao_rest_tagid_en = '', ''

        # shop tag
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_shop_tagid, daodao_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['daodao'], 'shop')
            except Exception:
                daodao_shop_tagid, daodao_shop_tagid_en = '', ''
        else:
            daodao_shop_tagid, daodao_shop_tagid_en = '', ''

        if 'qyer' in data_dict['tagid']:
            try:
                qyer_tagid, qyer_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['qyer'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                qyer_tagid, qyer_tagid_en = '', ''
        else:
            qyer_tagid, qyer_tagid_en = '', ''

        # # rest tag
        # if 'qyer' in data_dict['tagid']:
        #     try:
        #         qyer_rest_tagid, qyer_rest_tagid_en, _ = get_norm_tag(data_dict['tagid']['qyer'], 'rest')
        #     except Exception:
        #         qyer_rest_tagid, qyer_rest_tagid_en = '', ''
        # else:
        #     qyer_rest_tagid, qyer_rest_tagid_en = '', ''

        # shop tag
        if 'qyer' in data_dict['tagid']:
            try:
                qyer_shop_tagid, qyer_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['qyer'], 'shop')
            except Exception:
                qyer_shop_tagid, qyer_shop_tagid_en = '', ''
        else:
            qyer_shop_tagid, qyer_shop_tagid_en = '', ''

        l_norm_tag = []
        l_norm_tag_en = []
        l_norm_tag.extend(daodao_tagid.split('|'))
        l_norm_tag_en.extend(daodao_tagid_en.split('|'))
        l_norm_tag.extend(qyer_tagid.split('|'))
        l_norm_tag_en.extend(qyer_tagid_en.split('|'))

        l_other_norm_tag = []
        l_other_norm_tag.extend(daodao_shop_tagid.split('|'))
        l_other_norm_tag.extend(qyer_shop_tagid.split('|'))

        # Drop empty and duplicate tags
        norm_tag = '|'.join(filter(lambda x: is_legal(x), set(l_norm_tag)))
        norm_tag_en = '|'.join(
            filter(lambda x: is_legal(x), set(l_norm_tag_en)))
        other_tag = '|'.join(
            filter(lambda x: is_legal(x), set(l_other_norm_tag)))

        # Persistence section
        # Replace the old data_dict with the merged result
        data_dict = new_data_dict

        # Filter by name: both Chinese and English names empty-ish
        if data_dict['name'].lower() in (
                '', 'null',
                '0') and data_dict['name_en'].lower() in ('', 'null', '0'):
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "中英文名为空")
            logger.debug("[filter by name][name: {}][name_en: {}]".format(
                data_dict['name'], data_dict['name_en']))
            continue

        # '停业' means "closed down" — skip such POIs
        if '停业' in data_dict['name'] or '停业' in data_dict['name_en']:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "停业 POI")
            logger.debug(
                "[filter by name with close business][name: {}][name_en: {}]".
                format(data_dict['name'], data_dict['name_en']))
            continue

        # This logic was too silly, removed, 23333333

        # # name / name_en comparison
        # if data_dict['name'] != data_dict['name_en']:
        #     if data_dict['name_en'] in data_dict['name']:
        #         data_dict['name'] = data_dict['name'].replace(data_dict['name_en'], '')

        # phone handling: drop the "add a phone number" placeholder
        if data_dict['phone'] in ('+ 新增電話號碼', '+ 新增电话号码'):
            data_dict['phone'] = ''

        # Restaurants: price_level handled separately
        if poi_type == 'rest':
            data_dict['price_level'] = W2N.get(
                data_dict.get('price_level', ''), '0')

        # Add the nearCity field
        nearby_city = get_nearby_city(poi_city_id=city_id,
                                      poi_map_info=data_dict['map_info'])

        # Cleaning and persistence section
        # Drop rows whose map_info does not parse as "lng,lat"
        try:
            lng, lat = data_dict['map_info'].split(',')
            lng = float(lng)
            lat = float(lat)
            data_dict['map_info'] = '{},{}'.format(lng, lat)
        except Exception as exc:
            logger.exception(msg="[map_info filter error][data: {}]".format(
                data_dict['map_info']),
                             exc_info=exc)
            continue

        # Strip stray characters from the name
        data_dict['name'] = data_dict['name'].replace('这是您的企业吗?', '').strip()
        if data_dict['name_en'] in data_dict[
                'name'] and data_dict['name_en'] != data_dict['name']:
            # BUGFIX: str.replace returns a new string; the result was
            # previously discarded, so the English name was never removed.
            data_dict['name'] = data_dict['name'].replace(
                data_dict['name_en'], '')

        # Field fix-ups
        # address
        if data_dict['address'].lower() in ('null', '0'):
            data_dict['address'] = ''

        # open time defaults per poi type
        if norm_open_time.lower() in ('', 'null', '0'):
            if poi_type in ('attr', 'rest'):
                norm_open_time = '<*><*><00:00-23:55><SURE>'
            else:
                norm_open_time = '<*><*><08:00-20:00><SURE>'

        # Persist unrecognized tag keywords for later review
        if unknown_tag:
            insert_unknown_keywords('{}_tag'.format(poi_type), unknown_tag)
            logger.debug("[unknown tag][tags: {}]".format(unknown_tag))

        # Filter POIs that are too far from the city center
        result = poi_is_too_far(city_id, poi_map_info=data_dict['map_info'])
        if not result:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "距城市中心距离过远")
            logger.debug(
                "[poi filter by poi city distance][cid: {}][city_map: {}][poi_map_info: {}][distance: {}]"
                .format(city_id, result.city_map, data_dict['map_info'],
                        result.dist))
            continue

        # Phones longer than 55 characters are blanked
        if len(data_dict['phone']) > 55:
            logger.debug(
                "[phone length too long][poi_id: {}][len: {}][phone: {}]".
                format(miaoji_id, len(data_dict['phone']), data_dict['phone']))
            data_dict['phone'] = ''

        if poi_type == 'attr':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'alias': alias,
                'image': data_dict['imgurl'],
                'ori_grade': data_dict['ori_grade'],
                'nearCity': nearby_city
            }

            # Fields NOT updated when official == 1;
            # updated for nonofficial and newly-added data
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })

            if not has_official and not has_nonofficial:
                per_data.update({
                    # Explicit update rule: only set when never merged before
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'add_info': '',
                    'address_en': '',
                    'event_mark': '',
                    'grade': -1.0,

                    # Explicit update rule: only set status when never merged
                    'status_online': 'Open',
                    'status_test': 'Open'
                })

            # Attraction "tour" ('游览') tag clean-up
            try:
                tagid_data = json.loads(data_dict['tagid'])
                if 'daodao' in tagid_data:
                    if is_legal(tagid_data['daodao']):
                        if '游览' in tagid_data['daodao']:
                            if 'online' in union_info:
                                # This data would normally be dropped, but some
                                # online shopping rows must still be updated,
                                # so it is kept and merged as a special case.
                                # New rows must never gain the tour tag.
                                # filter_data_already_online(poi_type, miaoji_id, "tag 中包含游览被过滤")
                                pass
                            else:
                                logger.debug("[tour filter][data: {}]".format(
                                    tagid_data['daodao']))
                                continue
            except Exception as exc:
                logger.exception(msg="[tour filter error]", exc_info=exc)

            if norm_tag == '' and other_tag != '':
                # Attractions carrying only shopping tags are removed
                if 'online' in union_info:
                    # This data would normally be dropped, but some online
                    # shopping rows must still be updated, so it is kept and
                    # merged as a special case. New rows must never gain
                    # shopping tags here.
                    # filter_data_already_online(poi_type, miaoji_id, "景点类中存在购物数据被过滤")
                    pass
                else:
                    continue

            data.append(per_data)
        elif poi_type == 'rest':
            # BUGFIX: the fifth element was the undefined name `key`; per the
            # chat_restaurant column order (… `source`,`city_id`,`map_info` …)
            # it must be the city id.
            data.append(
                (miaoji_id, data_dict['name'], data_dict['name_en'], source,
                 city_id, data_dict['map_info'], data_dict['address'],
                 data_dict['ranking'], data_dict['grade'], data_dict['url'],
                 data_dict['phone'], data_dict['introduction'], norm_open_time,
                 data_dict['opentime'], data_dict['prize'],
                 data_dict['traveler_choice'], data_dict['commentcounts'],
                 data_dict['price'], data_dict['price_level'],
                 data_dict['cuisines'], data_dict['imgurl'],
                 data_dict['tagid'], norm_tag, norm_tag_en, nearby_city))
        elif poi_type == 'shop':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'image': data_dict['imgurl'],
                'nearCity': nearby_city
            }
            # Fields NOT updated when official == 1;
            # updated for nonofficial and newly-added data
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })

            if not has_official and not has_nonofficial:
                per_data.update({
                    # Defaults required for insertion
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'image_list': '',
                    'grade': -1.0,

                    # Explicit update rule: only set status when never merged
                    'status_online': 'Open',
                    'status_test': 'Open'
                })
            shopping_tag = [
                '礼品与特产商店', '大型购物中心', '农贸市场', '跳蚤市场与街边市场', '古董店', '百货商场',
                '厂家直营店', '购物'
            ]
            important_shopping_tag = [
                '礼品与特产商店', '大型购物中心', '百货商场', '厂家直营店', '购物'
            ]

            # Shopping data filtering via tags
            tag_list = norm_tag.split('|')
            if not all([tag.strip() in shopping_tag for tag in tag_list]):
                if not any([
                        tag.strip() in important_shopping_tag
                        for tag in tag_list
                ]):
                    if 'online' in union_info:
                        filter_data_already_online(poi_type, miaoji_id,
                                                   "非购物数据被过滤")
                    continue

            data.append(per_data)
        else:
            raise TypeError("Unknown Type: {}".format(poi_type))

        # Flush the accumulated batch every 300 rows
        if count % 300 == 0:
            db = dataset.connect(
                "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8"
            )
            table = db[data_process_table_name]
            _insert = 0
            logger.debug("Total: {}".format(count))
            _t = time.time()
            for d in data:
                _res = table.upsert(d, keys=['id'])
                if _res:
                    _insert += 1
            logger.debug(
                '[data upsert][count: {}][insert: {}][takes: {}]'.format(
                    count, _insert,
                    time.time() - _t))
            logger.debug("[city_id: {}][insert_count_this_times: {}]".format(
                cid, _insert))
            db.commit()
            data = []
        count += 1

    # Flush whatever is left after the loop
    logger.debug("[city_id: {}][total: {}]".format(cid, count))
    _insert = 0
    db = dataset.connect(
        "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8"
    )
    table = db[data_process_table_name]
    for d in data:
        _res = table.upsert(d, keys=['id'])
        if _res:
            _insert += 1
    logger.debug("Insert: {}".format(_insert))
    db.commit()
    logger.debug("Insert: {}".format(_insert))
    conn.close()
    update_already_merge_city("{}_data".format(poi_type), cid)
예제 #26
0
def mk_base_data_final(_poi_type):
    """Copy the merged POI table into its ``base_data`` counterpart.

    Args:
        _poi_type: one of 'attr' / 'rest' / 'shop'; selects both the source
            and destination table (``chat_attraction`` / ``chat_restaurant``
            / ``chat_shopping``).

    Raises:
        TypeError: if *_poi_type* is not a recognized POI type.
    """
    # Single validated dispatch; the original duplicated this check and
    # carried an unreachable second ``raise TypeError``.
    if _poi_type == 'attr':
        table_name = 'chat_attraction'
    elif _poi_type == 'rest':
        table_name = 'chat_restaurant'
    elif _poi_type == 'shop':
        table_name = 'chat_shopping'
    else:
        raise TypeError("Unknown Type: {}".format(_poi_type))
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    try:
        if _poi_type == 'attr':
            # attr is fully rebuilt: truncate first, then copy everything.
            cursor.execute('''TRUNCATE base_data.{};'''.format(table_name))
            conn.commit()
            sql = '''REPLACE INTO base_data.{0}
  SELECT
    id,
    name,
    alias,
    name_en,
    name_en_alias,
    map_info,
    city_id,
    grade,
    ranking,
    '',
    address,
    introduction,
    open,
    ticket,
    intensity,
    rcmd_intensity,
    first_image,
    image_list,
    level,
    hot_level,
    url,
    tagB,
    disable,
    utime,
    nearCity,
    official,
    status_online,
    status_test
  FROM {0};'''.format(table_name)
        elif _poi_type == 'shop':
            # shop is also fully rebuilt.
            cursor.execute('''TRUNCATE base_data.{};'''.format(table_name))
            conn.commit()
            sql = '''REPLACE INTO base_data.{0}
  SELECT
    id,
    name,
    name_en,
    map_info,
    city_id,
    address,
    grade,
    ranking,
    '',
    introduction,
    open,
    intensity,
    rcmd_intensity,
    level,
    first_image,
    image_list,
    hot_level,
    url,
    tagB,
    disable,
    utime,
    nearCity,
    0,
    'Open',
    'Open'
  FROM {0};'''.format(table_name)
        else:
            # 'rest' (already validated above): REPLACE without truncating.
            sql = '''REPLACE INTO base_data.{0}
  SELECT
    id,
    name,
    name_en,
    city_id,
    map_info,
    address,
    telphone,
    introduction,
    ranking,
    grade,
    max_price,
    min_price,
    price_level,
    open_time,
    '',
    0,
    michelin_star,
    level,
    first_image,
    image_list,
    hot_level,
    url,
    tagB,
    disable,
    utime,
    nearCity,
    0,
    status_online,
    status_test
  FROM {0};'''.format(table_name)
        res = cursor.execute(sql)
        # BUGFIX: the REPLACE was never committed (only the TRUNCATEs were);
        # with a pooled PyMySQL connection (autocommit off by default) the
        # copy could be rolled back when the connection was recycled.
        conn.commit()
    finally:
        # Always release the cursor/connection, even if execute() raises.
        cursor.close()
        conn.close()
    logger.debug(
        "[replace base data table][table: {}][replace_count: {}]".format(
            table_name, res))