Example #1
def add_open_time_filter(_v):
    if not is_legal(_v):
        return False
    try:
        _open_time = fix_daodao_open_time(_v)
        if is_legal(_open_time):
            return True
    except Exception:
        # save open time values that could not be recognized
        insert_unknown_keywords('{}_opentime'.format(poi_type), _v)
        logger.debug("[unknown open time][data: {}]".format(_v))
    return False
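
A minimal sketch of how a predicate like this is typically consumed as a special_filter (as in Example #17). pick_first_legal is a hypothetical stand-in for the project's get_key.get_key_by_priority_or_default, shown only to illustrate the calling convention:

def pick_first_legal(candidates, special_filter, default=''):
    # hypothetical helper: return the first candidate accepted by the filter, else the default
    for value in candidates:
        if special_filter(value):
            return value
    return default

# e.g. pick_first_legal(opentime_values, special_filter=add_open_time_filter)
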
Example #2
def _update_per_uid_img(_uid, _poi_type, _old_img_list, _old_first_img, _official):
    global data
    # init source sid set
    if not is_legal(_old_img_list):
        _old_img_list = ''
    if not is_legal(_old_first_img):
        _old_first_img = ''
    _s_sid_set = get_source_sid_set(_uid)
    _img_list, _first_img = get_img(s_sid_set=_s_sid_set, poi_type=_poi_type, old_img=_old_img_list,
                                    old_first_img=_old_first_img, is_official=(int(_official) == 1))
    logger.debug(
        "[get img info][uid: {}][img_list_len: {}][img_list: {}][first_img: {}]".format(_uid, len(_img_list), _img_list,
                                                                                        _first_img))
    # sorted by uid; whenever the uid advances, the image update command is executed
    data.append((_first_img, _img_list, _uid))
Example #3
def get_tagid_dict(_poi_type):
    _dict = {}
    if _poi_type == 'attr':
        sql = '''SELECT
  tag,
  tag_en,
  original_tag
FROM chat_attraction_tagS
ORDER BY id;'''
    elif _poi_type == 'rest':
        sql = 'select tag,tag_en,original_tag from chat_restaurant_tagS'
    elif _poi_type == 'shop':
        sql = '''SELECT
  tag,
  tag_en,
  original_tag
FROM chat_shopping_tagS
ORDER BY id;'''
    else:
        raise TypeError("Unknown Type: {}".format(_poi_type))

    conn = base_data_pool.connection()
    cursor = conn.cursor(cursor=DictCursor)
    cursor.execute(sql)
    for line in cursor.fetchall():
        tag = line['tag']
        tag_en = line['tag_en']
        original_tag = line['original_tag']
        _tags_set = set()
        for each_tag in original_tag.split('|'):
            if is_legal(each_tag):
                _tags_set.add(each_tag)
        _dict[tuple(_tags_set)] = (tag, tag_en)
    return _dict
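
A small, self-contained sketch of the mapping this function builds and how it is looked up later (the lookup loop mirrors get_norm_tag in Example #11; the sample rows are illustrative, not real table contents):

tag_dict = {
    ('博物馆', 'Museums', 'Museum'): ('博物馆', 'Museum'),        # original_tag variants -> (tag, tag_en)
    ('跳蚤市场', 'Flea Markets'): ('跳蚤市场与街边市场', 'Flea & Street Markets'),
}

def lookup(raw_tag, table):
    # return the normalized (tag, tag_en) pair whose original-tag variants contain raw_tag
    for variants, pair in table.items():
        if raw_tag in variants:
            return pair
    return None

print(lookup('Museums', tag_dict))   # ('博物馆', 'Museum')
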
Example #4
File: img_to_data.py Project: 20113261/p_m
def to_data(table_name):
    global offset
    select_sql = '''SELECT
  source,
  source_id,
  others_info
FROM detail_hotel_{0}'''.format(table_name)
    try:
        _data = []
        for result in MysqlSource(db_config=config,
                                  table_or_query=select_sql,
                                  size=10000,
                                  is_table=False,
                                  is_dict_cursor=True):
            offset += 1
            others_info = result['others_info']
            if not others_info:
                continue
            others_info = json.loads(others_info)
            if 'first_img' not in others_info:
                continue
            first_img_url = others_info['first_img']

            if not is_legal(first_img_url):
                continue
            md5_str = encode(first_img_url)
            source = result['source']
            source_id = result['source_id']
            _data.append((source, source_id, md5_str))
            if len(_data) % 1000 == 0:
                insert_db(table_name, _data)
                _data = []
        insert_db(table_name, _data)
    except Exception as exc:
        logger.exception(msg="[入库出现异常]", exc_info=exc)
Example #5
def check_latin(string):
    if not is_legal(string):
        return False
    latin_chars = [
        c for c in string
        if toolbox.Common.is_latin_and_punctuation(c) or c == '’'
    ]
    return (len(latin_chars) / len(string)) >= 0.9
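
A self-contained sketch of the same 90 % ratio check, with an ASCII-only stand-in for toolbox.Common.is_latin_and_punctuation (the real helper may also accept accented Latin letters):

import string as _string

def _is_latin_or_punct(ch):
    # stand-in: ASCII letters, digits, punctuation and spaces count as Latin
    return ch in _string.ascii_letters + _string.digits + _string.punctuation + ' '

def check_latin_demo(text):
    if not text:
        return False
    latin = [c for c in text if _is_latin_or_punct(c) or c == '’']
    return len(latin) / len(text) >= 0.9

print(check_latin_demo('Starbucks Coffee'))   # True
print(check_latin_demo('星巴克咖啡'))           # False
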
Example #6
def belong_city_id_insert():
    """
    Get city_id via belong_city_id
    :return:
    """
    sql = '''SELECT
  station_id,
  {0}.map_info              AS station_map_info,
  {0}.station_city_map_info AS station_city_map_info,
  city.id                               AS city_id,
  city.map_info                         AS city_map_info
FROM {0}
  JOIN base_data.city ON {0}.belong_city_id = base_data.city.id;'''.format(
        STATION_SRC_TABLE)
    _count = 0
    data = []
    for station_id, station_map_info, station_city_map_info, city_id, city_map_info in fetchall(
            new_station_pool, sql):
        _count += 1
        if is_legal(station_map_info):
            distance = get_distance(city_map_info, station_map_info)
            map_info = station_map_info
        elif is_legal(station_city_map_info):
            distance = get_distance(city_map_info, station_city_map_info)
            map_info = station_city_map_info
        else:
            distance = None
            map_info = None
        '''
        station_id, map_info, city_id, city_map_info, distance, info
        '''
        data.append((station_id, map_info, city_id, city_map_info, distance,
                     '通过 belong_city_id 进行匹配'))
        if len(data) == 1000:
            insert_db(data)
            data = []

    if data:
        insert_db(data)
Example #7
def update_each_tag_id():
    tag_id = tag2id()
    conn = poi_ori_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT id,norm_tagid
FROM {};'''.format(task_table))
    data = []
    _count = 0
    for _id, _tag_id in cursor.fetchall():
        if is_legal(_tag_id):
            tag_id_set = set()
            for each in _tag_id.split('|'):
                tag_id_set.add(tag_id.get(each))
            small_tag = ('|'.join(filter(lambda x: is_legal(x), tag_id_set)))
            big_tag = get_tag(small_tag)
            data.append((small_tag, big_tag, _id))
            _count += 1
            if len(data) % 1000 == 0:
                logger.debug("[mk data][poi_type: {}][len: {}]".format(
                    poi_type, _count))
                res = cursor.executemany(
                    'update base_data.{} set tag_id=%s, tagB=%s where id=%s'.
                    format(task_table), data)
                data = []
                logger.debug(
                    "[update tag id][table_name: {}][update count: {}]".format(
                        task_table, res))

    res = cursor.executemany(
        'update base_data.{} set tag_id=%s, tagB=%s where id=%s'.format(
            task_table), data)
    logger.debug("[update tag id][table_name: {}][update count: {}]".format(
        task_table, res))
    logger.debug("[mk data finished][poi_type: {}][len: {}]".format(
        poi_type, _count))
    conn.commit()
    cursor.close()
    conn.close()
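
A minimal, self-contained sketch of the same batched executemany update pattern, using sqlite3 so it runs anywhere (the demo table and the ? parameter style, instead of MySQL's %s, are illustrative only):

import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute('CREATE TABLE demo (id INTEGER PRIMARY KEY, tag_id TEXT, tagB TEXT)')
cur.executemany('INSERT INTO demo (id) VALUES (?)', [(1,), (2,)])

# tuples must follow the placeholder order of the UPDATE: (tag_id, tagB, id)
data = [('101|102', 'B1', 1), ('103', 'B2', 2)]
cur.executemany('UPDATE demo SET tag_id = ?, tagB = ? WHERE id = ?', data)
conn.commit()
print(cur.execute('SELECT * FROM demo').fetchall())
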
Example #8
def get_old_info_dict():
    sql = '''SELECT
  id,
  source,
  name,
  name_en,
  map_info,
  address,
  plantocounts,
  beentocounts,
  ranking,
  grade,
  commentcounts,
  imgurl,
  introduction,
  opentime
FROM poi_merge.attr 
WHERE source='qyer';'''
    __dict = defaultdict(dict)
    _count = 0
    for line in MysqlSource(poi_ori_config,
                            table_or_query=sql,
                            size=5000,
                            is_table=False,
                            is_dict_cursor=True):
        _count += 1
        if _count % 3000 == 0:
            logger.info("[load old data info][count: {}]".format(_count))
        sid = line['id']

        for key_name, is_strict, num_check in check_name:
            if is_strict:
                __dict[sid][key_name] = line[key_name]
            else:
                legal_res = is_legal(line[key_name])
                if not num_check:
                    check_res = legal_res
                else:
                    try:
                        if int(legal_res) in (-1, 0):
                            check_res = False
                        else:
                            check_res = True
                    except Exception:
                        check_res = False
                __dict[sid][key_name] = check_res
    logger.info("[load old data info finished][count: {}]".format(_count))
    return __dict
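
check_name is not defined in this example; given how it is unpacked above, a plausible shape (purely an assumption for illustration) is a list of (key_name, is_strict, num_check) tuples:

# hypothetical definition, matching the (key_name, is_strict, num_check) unpacking above
check_name = [
    ('name',          True,  False),  # strict: compared verbatim between old and new rows
    ('map_info',      True,  False),
    ('address',       False, False),  # non-strict: only checked with is_legal
    ('commentcounts', False, True),   # numeric: is_legal result cast to int, -1/0 counts as missing
]
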
Example #9
def init_id2tag():
    conn = base_data_pool.connection()
    sql = '''SELECT
  id,
  Stag
FROM {};'''.format(tag_b)
    cursor = conn.cursor()
    cursor.execute(sql)
    _dict = defaultdict(set)
    for _id, _l_s_tag in cursor.fetchall():
        for each in _l_s_tag.split('|'):
            if is_legal(each):
                _dict[_id].add(each)
    cursor.close()
    conn.close()
    return _dict
Example #10
def city_pair(city_ids, config):
    country_dict, city_dict, map_dict = generate_dict(config)
    pair = set([])
    pair_filter = []
    for c_id in city_ids:
        current_city = city_dict[c_id]
        all_city = country_dict[current_city['country_id']]
        for ac in all_city:
            if c_id == ac['id']:
                continue
            pf = '{0}-{1}'.format(c_id, ac['id'])
            if pf not in pair_filter:
                pair_filter.append(pf)
                src_cid = c_id
                dst_cid = ac['id']

                src_map_info = map_dict.get(src_cid)
                dst_map_info = map_dict.get(dst_cid)
                if not is_legal(src_map_info) or not is_legal(dst_map_info): continue
                logger.info('%s: %s  - %s: %s' %
                            (src_cid, src_map_info, dst_cid, dst_map_info))
                src_map_info_list = src_map_info.split(',')
                src_map_info = ','.join(
                    [src_map_info_list[1], src_map_info_list[0]])
                dst_map_info_list = dst_map_info.split(',')
                dst_map_info = ','.join(
                    [dst_map_info_list[1], dst_map_info_list[0]])
                if not is_map_info_legal(
                        src_map_info) or not is_map_info_legal(dst_map_info):
                    logger.warning(
                        "[error map info][src_cid: {}][dst_cid: {}][src_m_info: {}][dst_m_info: {}]"
                        .format(src_cid, dst_cid, src_map_info, dst_map_info))
                    continue

                google_url = 'http://maps.google.cn/maps/api/directions/json?origin={}&destination={}&mode=driving&region=es&mode=driving&type=interCity&a1={}&a2={}'.format(
                    src_map_info, dst_map_info, src_cid, dst_cid)

                logger.info("[new task][url: {}]".format(google_url))
                pair.add((src_cid, dst_cid, google_url))
    return pair
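
A short worked example of the coordinate swap above: map_info appears to be stored as "lng,lat", while the Google Directions origin/destination parameters expect "lat,lng" (the Paris coordinates are illustrative):

src_map_info = '2.3522,48.8566'      # stored as lng,lat (assumed)
lng, lat = src_map_info.split(',')
print(','.join([lat, lng]))           # '48.8566,2.3522' -> lat,lng for the URL
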
Example #11
def get_norm_tag(tag_id, _poi_type):
    global tag_dict
    if _poi_type not in tag_dict:
        logger.debug("[init tagid][poi_type: {}]".format(_poi_type))
        tag_dict[_poi_type] = get_tagid_dict(_poi_type)
    norm_tags = []
    norm_tag_ens = []
    unknown = []
    lines = tradition2simple(tag_id).decode()
    for raw_tag in split_pattern.split(lines):
        tag_ok = False
        tag = raw_tag.strip()
        for t_set, values in tag_dict[_poi_type].items():
            if tag in t_set:
                norm_tags.append(values[0])
                norm_tag_ens.append(values[1])
                tag_ok = True
                break
        if not tag_ok:
            if is_legal(tag):
                unknown.append(tag)
    norm_tag = '|'.join(sorted(norm_tags))
    norm_tag_en = '|'.join(sorted(norm_tag_ens))
    return norm_tag, norm_tag_en, unknown
Example #12
def check_chinese(string):
    if not is_legal(string):
        return False
    return toolbox.Common.has_any(string, check_func=toolbox.Common.is_chinese)
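
A self-contained sketch of an equivalent check, assuming toolbox.Common.is_chinese tests for CJK unified ideographs (the exact range the real helper uses is not shown in the example):

def _is_chinese(ch):
    # basic CJK Unified Ideographs block
    return '\u4e00' <= ch <= '\u9fff'

def check_chinese_demo(text):
    return bool(text) and any(_is_chinese(c) for c in text)

print(check_chinese_demo('Tour Eiffel'))   # False
print(check_chinese_demo('埃菲尔铁塔'))      # True
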
Example #13
def poi_merged_report(poi_type):
    cid2grade, grade_info = prepare_city_info()
    if poi_type == 'attr':
        query_sql = '''SELECT
      id,
      city_id,
      first_image,
      address,
      open,
      introduction,
      data_source,
      status_online,
      utime
    FROM chat_attraction;'''
    elif poi_type == 'shop':
        query_sql = '''SELECT
              id,
              city_id,
              first_image,
              address,
              open,
              introduction,
              data_source,
              status_online,
              utime
            FROM chat_shopping;'''
    else:
        query_sql = '''SELECT
              id,
              city_id,
              first_image,
              address,
              open,
              introduction,
              data_source,
              status_online,
              utime
            FROM chat_restaurant;'''

    poi_info = defaultdict(dict)
    for line in MysqlSource(db_config=data_process_config, table_or_query=query_sql, size=10000, is_dict_cursor=True,
                            is_table=False):
        cid = line['city_id']

        # get grade
        grade = cid2grade.get(cid, None)

        if grade is None:
            # not known cid
            continue

        # add cid
        if 'has_poi' not in poi_info[grade]:
            poi_info[grade]['has_poi'] = set()
        poi_info[grade]['has_poi'].add(line['city_id'])

        # poi total
        if 'total' not in poi_info[grade]:
            poi_info[grade]['total'] = 0
        poi_info[grade]['total'] += 1

        # poi online
        if 'online' not in poi_info[grade]:
            poi_info[grade]['online'] = 0
        if 'Open' == line['status_online']:
            poi_info[grade]['online'] += 1

        # poi update this time
        if 'update' not in poi_info[grade]:
            poi_info[grade]['update'] = 0
        try:
            if line['utime'] > datetime.datetime.now() - datetime.timedelta(days=30):
                poi_info[grade]['update'] += 1
        except Exception as exc:
            logger.exception(msg="[unknown utime][utime: {}]".format(line['utime']), exc_info=exc)

        # poi has img
        if 'img' not in poi_info[grade]:
            poi_info[grade]['img'] = 0
        if is_legal(line['first_image']):
            poi_info[grade]['img'] += 1

        # poi has address
        if 'address' not in poi_info[grade]:
            poi_info[grade]['address'] = 0
        if is_legal(line['address']):
            poi_info[grade]['address'] += 1

        # poi opentime
        if 'opentime' not in poi_info[grade]:
            poi_info[grade]['opentime'] = 0
        if is_legal(line['open']):
            poi_info[grade]['opentime'] += 1

        # poi introduction
        if 'introduction' not in poi_info[grade]:
            poi_info[grade]['introduction'] = 0
        if is_legal(line['introduction']):
            try:
                _data = json.loads(line['introduction'])
                if isinstance(_data, dict):
                    if _data.values():
                        if has_any(list(_data.values()), check_func=is_legal):
                            poi_info[grade]['introduction'] += 1
            except Exception as exc:
                logger.exception(msg="[load introduction error][introduction: {}]".format(line['introduction']),
                                 exc_info=exc)

        # qyer\daodao\multi in source
        if 'qyer' not in poi_info[grade]:
            poi_info[grade]['qyer'] = 0
        if 'daodao' not in poi_info[grade]:
            poi_info[grade]['daodao'] = 0
        if 'multi' not in poi_info[grade]:
            poi_info[grade]['multi'] = 0

        if is_legal(line['data_source']):
            if 'qyer' in line['data_source']:
                poi_info[grade]['qyer'] += 1
            if 'daodao' in line['data_source'] or 'tripadvisor' in line['data_source']:
                poi_info[grade]['daodao'] += 1
            if 'qyer' in line['data_source'] and (
                            'daodao' in line['data_source'] or 'tripadvisor' in line['data_source']):
                poi_info[grade]['multi'] += 1

    for each_grade in poi_info.keys():
        poi_info[each_grade]['has_poi'] = len(poi_info[each_grade]['has_poi'])

    db = dataset.connect('mysql+pymysql://mioji_admin:[email protected]/Report?charset=utf8')
    _table = db['base_data_report_summary']
    dt = datetime.datetime.now()
    for k, v in sorted(grade_info.items(), key=lambda x: x[0]):
        _tmp_grade_report = poi_info.get(k, defaultdict(int))
        no_poi = v - _tmp_grade_report['has_poi']
        data = {
            'type': poi_type,
            'grade': k,
            'citys': v,
            'no_poi': no_poi,
            'date': datetime.datetime.strftime(dt, '%Y%m%d'),
            'hour': datetime.datetime.strftime(dt, '%H'),
            'datetime': datetime.datetime.strftime(dt, '%Y%m%d%H00')
        }
        data.update(_tmp_grade_report)
        _table.upsert(data, keys=['type', 'grade', 'datetime'])
    db.commit()
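
A minimal, runnable sketch of the dataset upsert pattern used above, against an in-memory SQLite database instead of the Report MySQL instance (table name and row values are illustrative):

import dataset

db = dataset.connect('sqlite:///:memory:')
table = db['base_data_report_summary']

row = {'type': 'attr', 'grade': 'A', 'datetime': '201801010800', 'total': 10}
table.upsert(row, keys=['type', 'grade', 'datetime'])   # inserts the first time
row['total'] = 12
table.upsert(row, keys=['type', 'grade', 'datetime'])   # updates the same logical row
print(list(table.all()))                                # one row, with total = 12
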
Example #14
File: poi_merge.py Project: 20113261/p_m
def get_data(cid_or_geohash):
    global poi_type
    global online_table_name
    global data_source_table
    """
    Yield data at each priority level for use during merging. Extraction goes from front to back following the rule: online data > per-source data (no ordering within a source for now).
    source = ''
    sid = ''
    keys = set()
    yield source, sid, keys
    """

    # online data, sorted by priority; rows that come in first get first claim on the id
    _t = time.time()
    if poi_type == 'attr':
        sql = '''SELECT id, name, name_en, alias, url
    FROM {1}
    WHERE city_id = '{0}'
    ORDER BY status_online DESC, status_test DESC, official DESC, grade;'''.format(
            cid_or_geohash, online_table_name)
    elif poi_type == 'rest':
        sql = '''SELECT id, name, name_en, url
        FROM {1}
        WHERE city_id = '{0}'
        ORDER BY status_online DESC, status_test DESC, official DESC, grade;'''.format(
            cid_or_geohash, online_table_name)
    elif poi_type == 'shop':
        sql = '''SELECT id, name, name_en, url
        FROM {1}
        WHERE city_id = '{0}'
        ORDER BY status_online DESC, status_test DESC, official DESC, grade;'''.format(
            cid_or_geohash, online_table_name)
    else:
        raise TypeError("Unknown Type: {}".format(poi_type))
    for data in MysqlSource(onlinedb,
                            table_or_query=sql,
                            size=10000,
                            is_table=False,
                            is_dict_cursor=True):
        keys = set()
        if is_legal(data['name']):
            keys.add(data['name'])
        if is_legal(data['name_en']):
            keys.add(data['name_en'])
        _url = data['url']
        if _url:
            _j_url = json.loads(_url)
            if 'qyer' in _j_url:
                try:
                    _source = 'qyer'
                    _sid = re.findall('place.qyer.com/poi/([\s\S]+)/',
                                      _j_url['qyer'])[0]
                    keys.add((_source, _sid))
                except Exception:
                    pass
            if 'daodao' in _j_url:
                try:
                    _source = 'daodao'
                    _sid = re.findall('-d(\d+)', _j_url['daodao'])[0]
                    keys.add((_source, _sid))
                except Exception:
                    pass
        if poi_type == 'attr':
            for key in data['alias'].split('|'):
                if is_legal(key):
                    keys.add(key)
        logger.debug("[source: {}][sid: {}][keys: {}]".format(
            'online', data['id'], keys))
        yield 'online', data['id'], keys
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))

    # per-source data; no ordering rule added for now
    _t = time.time()
    sql = '''SELECT
  id,
  source,
  name,
  name_en,
  url
FROM {1}
WHERE city_id = '{0}';'''.format(cid_or_geohash, data_source_table)
    for data in MysqlSource(data_db,
                            table_or_query=sql,
                            size=10000,
                            is_table=False,
                            is_dict_cursor=True):
        keys = set()
        if is_legal(data['name']):
            keys.add(data['name'])
        if is_legal(data['name_en']):
            keys.add(data['name_en'])
        if data['source'] == 'qyer':
            _sid = re.findall('place.qyer.com/poi/([\s\S]+)/', data['url'])[0]
            keys.add(('qyer', _sid))
        if data['source'] == 'daodao':
            _sid = re.findall('-d(\d+)', data['url'])[0]
            keys.add(('daodao', _sid))
        logger.debug("[source: {}][sid: {}][keys: {}]".format(
            data['source'], data['id'], keys))
        yield data['source'], data['id'], keys
    logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t))
Example #15
def get_new_info_dict():
    # count of newly added records
    new_data_count = 0

    # record the new sids, to determine how many sids' data did not come back
    new_sid_set = set()

    # difference record for strict fields
    diff_dict = defaultdict(set)

    # improvement record for non-strict fields
    improve_dict = defaultdict(set)

    # regression record for non-strict fields
    worse_dict = defaultdict(set)

    # set of sids whose fields are all valid
    all_right_sids = set()

    old_info_dict = get_old_info_dict()
    sql = '''SELECT
      id,
      source,
      name,
      name_en,
      map_info,
      address,
      plantocounts,
      beentocounts,
      ranking,
      grade,
      commentcounts,
      imgurl,
      introduction,
      opentime
    FROM qyer_whole_world;'''
    _count = 0
    for line in MysqlSource(service_platform_config,
                            table_or_query=sql,
                            size=5000,
                            is_table=False,
                            is_dict_cursor=True):
        _count += 1
        if _count % 3000 == 0:
            logger.info("[load new data info][count: {}]".format(_count))
        sid = line['id']
        new_sid_set.add(sid)

        all_right = True
        for key_name, is_strict, num_check in check_name:
            legal_res = is_legal(line[key_name])
            if not num_check:
                if not legal_res:
                    all_right = False
            else:
                try:
                    if int(legal_res) in (-1, 0):
                        all_right = False
                except Exception:
                    all_right = False
        if all_right:
            all_right_sids.add(sid)

        if sid not in old_info_dict:
            new_data_count += 1
            continue
        else:
            old_info = old_info_dict[sid]

        for key_name, is_strict, num_check in check_name:
            if is_strict:
                if line[key_name] != old_info[key_name]:
                    # strict-field difference counting
                    diff_dict[key_name].add(sid)
            else:
                # non-strict field check
                legal_res = is_legal(line[key_name])
                if not num_check:
                    check_res = legal_res
                else:
                    try:
                        if int(legal_res) in (-1, 0):
                            check_res = False
                        else:
                            check_res = True
                    except Exception:
                        check_res = False
                if check_res == old_info[key_name]:
                    continue
                elif check_res:
                    improve_dict[key_name].add(sid)
                else:
                    worse_dict[key_name].add(sid)
    logger.info("[load new data info finished][count: {}]".format(_count))

    return new_data_count, set(old_info_dict.keys(
    )) - new_sid_set, diff_dict, improve_dict, worse_dict, all_right_sids
Example #16
def get_img(s_sid_set, poi_type, old_img='', old_first_img='', is_official=False):
    """
    Get img str by using source and sid set
    :param is_official: is official or not
    :param old_img: old img list, all img split with |
    :param old_first_img:  old first img, use old sorting
    :param poi_type: poi type, Eg: attr rest shop
    :param s_sid_set: source and sid set
    :return: tuple (new_img str, new_first_img str)
    """
    if not s_sid_set or is_official:
        return old_img, old_first_img

    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    query_sql = '''SELECT
  file_name,
  bucket_name,
  pic_size,
  pic_md5,
  `use`,
  info,
  url
FROM poi_images
WHERE (`source`, `sid`) IN ({});'''.format(','.join(map(lambda x: "('{}', '{}')".format(x[0], x[1]), s_sid_set)))
    _res = cursor.execute(query_sql)
    if not _res:
        return old_img, old_first_img

    max_size = -1
    max_size_img = ''
    file2phash = dict()
    pic_total = set()
    p_hash_dict = defaultdict(list)
    for file_name, bucket_name, pic_size, pic_md5, use, info, url in cursor.fetchall():
        if poi_type == 'shop' and bucket_name not in ('attr_bucket', 'shop_bucket', 'mioji-attr', 'mioji-shop'):
            # shopping img upload to mioji-attr or mioji-shop
            continue
        elif poi_type != 'shop' and poi_type not in bucket_name:
            # rest img upload to mioji-rest
            # attr img upload to mioji-attr
            continue

        # build pic_total, used to decide whether a filtered image was newly added by hand
        pic_total.add(file_name)

        # broken image, must be filtered out
        if r.get('error_img_{}'.format(file_name)) == '1':
            continue

        # pHash filter
        if url in ('', 'NULL', None):
            # image labelled by the product team: never filter, use directly
            file2phash[file_name] = 'USE'
            p_hash_dict["USE"].append((file_name, -1))
            continue
        elif not info:
            # crawled image without a pHash: filter it out
            continue
        else:
            p_hash = json.loads(info)['p_hash']

        # img can be used
        # an empty pic size usually means a manually labelled image
        if not is_legal(pic_size):
            if file_name not in old_img:
                continue
            elif str(use) != '1':
                continue
            else:
                # old, manually labelled image: must not be filtered
                file2phash[file_name] = 'USE'
                p_hash_dict["USE"].append((file_name, -1))
                continue

        # get max size
        h, w = literal_eval(pic_size)
        h = int(h)
        w = int(w)
        size = h * w
        if size > max_size:
            max_size = size
            max_size_img = file_name

        # use 1
        if str(use) == '1':
            # filtering rules
            # pixel
            if size < 200000:
                continue

            # scale
            # min scale
            scale = w / h
            if scale < 0.9:
                if w < 500:
                    continue

            # max scale
            if scale > 2.5:
                continue

            p_hash_dict[p_hash].append((file_name, size))

    cursor.close()
    conn.close()

    if poi_type in ('attr', 'shop'):
        # fetch face detection results
        _conn = poi_face_detect_pool.connection()
        _cursor = _conn.cursor()
        query_sql = '''SELECT pic_name
FROM PoiPictureInformation
WHERE is_available=0 AND poi_id IN ({});'''.format(
            ', '.join(
                map(
                    lambda x: "'{}'".format(
                        '###'.join(x) if x[0] != 'online' else x[1]),
                    s_sid_set
                )
            )
        )
        _cursor.execute(query_sql)
        face_detected = set([x[0].split('/')[-1] for x in _cursor.fetchall()])
        _cursor.close()
        _conn.close()
    else:
        face_detected = set()

    # manually added images
    human_pic = p_hash_dict["USE"]

    # machine images: keep the largest image for each pHash
    final_pic_dict = {}
    for k, v in p_hash_dict.items():
        pic_res = sorted(v, key=lambda x: x[1], reverse=True)
        if pic_res:
            final_pic_dict[pic_res[0][0]] = k

    old_img_list = old_img.split('|')

    new_img_list = []
    # add images following the old ordering, deduplicating as we go
    for _old_file_name in old_img_list:
        # manually added images are pushed without an md5 to filter on, so they skip the md5 filter rule
        if (_old_file_name not in pic_total) or (_old_file_name in human_pic):
            # if the value is legal
            if is_legal(_old_file_name):
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        # manually added images have no md5 to filter on, so they bypass every filter rule
                        new_img_list.append(_old_file_name)

        elif _old_file_name in final_pic_dict:
            if is_legal(_old_file_name):
                # face detection filter
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        new_img_list.append(_old_file_name)

    # when there are new images that did not exist before, append them in order
    for k, v in final_pic_dict.items():
        if is_legal(v):
            # face detection filter
            if k not in face_detected:
                if v not in new_img_list:
                    new_img_list.append(k)

    if old_first_img:
        if old_first_img in new_img_list:
            # the first image has not been taken down, so keep the original first image
            new_first_img = old_first_img
            # remove first_img from the new image list
            new_img_list.remove(old_first_img)
            # put first_img back at the head of the list
            new_img_list.insert(0, old_first_img)
        else:
            # otherwise use a new first image
            if new_img_list:
                new_first_img = new_img_list[0]
            else:
                new_first_img = ''
    else:
        if new_img_list:
            new_first_img = new_img_list[0]
        else:
            new_first_img = ''

    if new_first_img == '':
        new_img = new_first_img = max_size_img
    else:
        # deduplicate the image sequence without changing its order
        final_new_img_list = list(set(new_img_list))
        final_new_img_list.sort(key=new_img_list.index)

        new_img = '|'.join(filter(lambda x: is_legal(x), final_new_img_list))

    return new_img, new_first_img
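
A self-contained sketch of the "keep the largest image per pHash" selection performed above (file names and sizes are made up; 'USE' marks manually added images that bypass the size rule):

from collections import defaultdict

p_hash_dict = defaultdict(list)
p_hash_dict['a1b2'].extend([('small.jpg', 120000), ('big.jpg', 480000)])   # same pHash -> near-duplicates
p_hash_dict['USE'].append(('manual.jpg', -1))                              # manually labelled image

final_pic_dict = {}
for p_hash, candidates in p_hash_dict.items():
    # keep only the largest candidate in every pHash bucket
    best_file, _best_size = sorted(candidates, key=lambda x: x[1], reverse=True)[0]
    final_pic_dict[best_file] = p_hash

print(final_pic_dict)   # {'big.jpg': 'a1b2', 'manual.jpg': 'USE'}
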
Example #17
def poi_insert_data(cid, _poi_type):
    init_global_name(_poi_type)
    '''
    Final destination tables for the merged data (kept for reference):
    if _poi_type == 'attr':
        sql = 'replace into chat_attraction(`id`,`name`,`name_en`,`data_source`,`city_id`,' \
              '`map_info`,`address`,`star`,`plantocount`,`beentocount`,`real_ranking`,' \
              '`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,`phone`,`introduction`,' \
              '`open`, `open_desc`,`recommend_lv`,`prize`,`traveler_choice`, `alias`, ' \
              '`image`, `ori_grade`,`nearCity`, `ranking`,`rcmd_open`,`add_info`,`address_en`,`event_mark`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,-1,"","","","")'
    elif _poi_type == 'rest':
        sql = 'replace into chat_restaurant(`id`,`name`,`name_en`,' \
              '`source`,`city_id`,`map_info`,`address`,`real_ranking`,' \
              '`grade`,`res_url`,`telphone`,`introduction`,`open_time`,`open_time_desc`,`prize`,' \
              '`traveler_choice`,`review_num`,`price`,`price_level`,`cuisines`, ' \
              '`image_urls`,`tagid`,`norm_tagid`,`norm_tagid_en`,`nearCity`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    elif _poi_type == 'shop':
        sql = 'replace into ' \
              'chat_shopping(`id`,`name`,`name_en`,`data_source`,`city_id`,' \
              '`map_info`,`address`,`star`,`plantocount`,`beentocount`,' \
              '`real_ranking`,`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,' \
              '`phone`,`introduction`,`open`,`open_desc`,`recommend_lv`,`prize`,' \
              '`traveler_choice`,`image`,`nearCity`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \
              '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    else:
        raise TypeError("Unknown Type: {}".format(poi_type))
    '''

    conn = poi_ori_pool.connection()
    # for task_dict in get_task(cid):
    count = 0
    data = []

    # fetch the poi info needed for merging
    _info_dict, _online_official_data, _online_nonofficial_data = get_poi_dict(
        cid)
    _city_poi = get_poi_union_info(cid)
    # start merging the data
    for miaoji_id, city_id, union_info in _city_poi:
        # initialise the pre-merge variables
        data_dict = defaultdict(dict)

        # whether any data came from other sources
        other_source = False

        # used to decide whether online official and nonofficial data exists
        has_official = False
        has_nonofficial = False

        # fetch data from the online environment
        o_official_data = _online_official_data.get(miaoji_id, None)
        o_nonofficial_data = _online_nonofficial_data.get(miaoji_id, None)

        # update the official flags
        if o_official_data is not None:
            has_official = True
        if o_nonofficial_data is not None:
            has_nonofficial = True

        # initialise the merge info
        for each_name in (json_name_list + norm_name_list + others_name_list):
            data_dict[each_name] = {}

            def get_data(src_dict, is_official=False):
                if each_name in online2source:
                    source_name = online2source[each_name]
                else:
                    source_name = each_name

                if each_name in json_name_list:
                    if source_name in src_dict:
                        try:
                            _res = json.loads(src_dict[source_name])
                            if isinstance(_res, dict):
                                data_dict[each_name] = {
                                    k: v
                                    for k, v in _res.items()
                                    if k in available_source
                                }
                            else:
                                pass
                        except Exception:
                            pass
                else:
                    data_dict[each_name]['mioji_official' if is_official else
                                         'mioji_nonofficial'] = src_dict.get(
                                             source_name, {})

            if o_official_data is not None:
                get_data(o_official_data, is_official=True)
            if o_nonofficial_data is not None:
                get_data(o_nonofficial_data, is_official=False)

        # iterate over every source and id to be merged and build the dict-style merge content
        for s_sid in union_info.split('|_||_|'):
            source, source_id = s_sid.split('|')

            # todo: add handling of online data: load the data first, then update it
            # todo: use the online base data to update the data_process fields

            # no merge id info obtained
            if not source_id or not source:
                continue

            # filter out unneeded sources
            if source not in available_source:
                logger.debug("[not available source: {}]".format(source))
                continue

            # no merge data obtained
            poi_info = _info_dict[(source, source_id)]
            if poi_info == {}:
                continue

            other_source = True

            # add the keys
            for each_name in (json_name_list + norm_name_list +
                              others_name_list):
                if is_legal(poi_info[each_name]):
                    if isinstance(poi_info[each_name], str):
                        data_dict[each_name][source] = tradition2simple(
                            poi_info[each_name]).decode()
                    else:
                        data_dict[each_name][source] = poi_info[each_name]

        # fill in any missing entries
        for each_name in (json_name_list + norm_name_list + others_name_list):
            if each_name not in data_dict:
                data_dict[each_name] = {}

        # content that cannot be merged falls into two cases
        if not o_official_data and not o_nonofficial_data and not other_source:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "没有可供融合的数据")
            logger.debug('[union_info: {}]'.format(union_info))
            continue

        new_data_dict = {}

        # get the !Chinese! name by priority
        def get_name_by_priority():
            # update the field by the standard priority
            name_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name'], norm_name, '', special_filter=check_chinese)
            # look for Chinese in the name_en field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'],
                    norm_name,
                    '',
                    special_filter=check_chinese)
            # look for Latin in the name_en field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'],
                    norm_name,
                    '',
                    special_filter=check_latin)
            # look for Latin in the name field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name'],
                    norm_name,
                    '',
                    special_filter=check_latin)
            return name_tmp

        # get the !Latin! name by priority
        def get_name_en_by_priority():
            # take it from the name_en field of the merged data
            name_en_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name_en'],
                norm_name,
                '',
                special_filter=check_latin)
            if not name_en_tmp:
                name_en_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name'],
                    norm_name,
                    '',
                    special_filter=check_latin)
            return name_en_tmp

        for norm_name in norm_name_list:
            # while processing all fields, name / name_en get special handling
            if norm_name == 'name':
                if has_official:
                    # pois with official = 1 never get their name updated
                    new_data_dict['name'] = data_dict['name']['mioji_official']
                elif has_nonofficial:
                    # pois with official = 0: if the name is already Chinese, do not update it
                    if any([
                            toolbox.Common.is_chinese(c)
                            for c in data_dict['name']['mioji_nonofficial']
                    ]):
                        new_data_dict['name'] = data_dict['name'][
                            'mioji_nonofficial']
                    else:
                        new_data_dict['name'] = get_name_by_priority()
                else:
                    # update the field by the standard priority
                    new_data_dict['name'] = get_name_by_priority()
            elif norm_name == 'name_en':
                # official = 1: do not update name_en; otherwise update it by priority
                if has_official:
                    new_data_dict['name_en'] = data_dict['name_en'][
                        'mioji_official']
                else:
                    new_data_dict['name_en'] = get_name_en_by_priority()

            else:
                new_data_dict[
                    norm_name] = get_key.get_key_by_priority_or_default(
                        data_dict[norm_name], norm_name, '')

        # daodao url handling
        if 'daodao' in data_dict['url']:
            data_dict['url']['daodao'] = data_dict['url']['daodao'].replace(
                'www.tripadvisor.com.hk', 'www.tripadvisor.cn')

        # restaurants use cuisines to populate tagid
        if poi_type == 'rest':
            data_dict['tagid'] = copy.deepcopy(data_dict['cuisines'])
            new_data_dict['tagid'] = json.dumps({
                k: v
                for k, v in data_dict['tagid'].items() if k in final_source
            })

        for json_name in json_name_list:
            new_data_dict[json_name] = json.dumps({
                k: v
                for k, v in data_dict[json_name].items() if k in final_source
            })

        new_data_dict['phone'] = new_data_dict['phone'].replace('电话号码:',
                                                                '').strip()

        # data manipulation section
        # ori_grade modify
        tmp_ori_grade = {}

        if has_official:
            try:
                tmp_ori_grade.update(json.loads(o_official_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)

        if has_nonofficial:
            try:
                tmp_ori_grade.update(
                    json.loads(o_nonofficial_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)

        tmp_ori_grade.update({k: v for k, v in data_dict['grade'].items()})
        new_data_dict['ori_grade'] = json.dumps(
            {k: v
             for k, v in tmp_ori_grade.items() if k in final_source})

        # add source
        source = '|'.join(
            map(lambda x: x.split('|')[0], union_info.split('|_||_|')))

        # add alias
        alias = '|'.join(
            filter(
                lambda x: x != new_data_dict['name'] and x != new_data_dict[
                    'name_en'],
                set(
                    list(data_dict['name'].values()) +
                    list(data_dict['name_en'].values()))))

        # add open time
        final_open_time_desc = get_key.get_key_by_priority_or_default(
            data_dict['opentime'],
            'opentime',
            special_filter=add_open_time_filter)
        if final_open_time_desc:
            norm_open_time = fix_daodao_open_time(final_open_time_desc)
        else:
            norm_open_time = ''

        # add norm tag
        # todo: change so that qyer and other sources can also be used
        unknown_tag = set()
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_tagid, daodao_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['daodao'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                daodao_tagid, daodao_tagid_en = '', ''
        else:
            daodao_tagid, daodao_tagid_en = '', ''

        # # rest tag
        # if 'daodao' in data_dict['tagid']:
        #     try:
        #         daodao_rest_tagid, daodao_rest_tagid_en, _ = get_norm_tag(data_dict['tagid']['daodao'],
        #                                                                   'rest')
        #     except Exception:
        #         daodao_rest_tagid, daodao_rest_tagid_en = '', ''
        # else:
        #     daodao_rest_tagid, daodao_rest_tagid_en = '', ''

        # shop tag
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_shop_tagid, daodao_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['daodao'], 'shop')
            except Exception:
                daodao_shop_tagid, daodao_shop_tagid_en = '', ''
        else:
            daodao_shop_tagid, daodao_shop_tagid_en = '', ''

        if 'qyer' in data_dict['tagid']:
            try:
                qyer_tagid, qyer_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['qyer'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                qyer_tagid, qyer_tagid_en = '', ''
        else:
            qyer_tagid, qyer_tagid_en = '', ''

        # # rest tag
        # if 'qyer' in data_dict['tagid']:
        #     try:
        #         qyer_rest_tagid, qyer_rest_tagid_en, _ = get_norm_tag(data_dict['tagid']['qyer'], 'rest')
        #     except Exception:
        #         qyer_rest_tagid, qyer_rest_tagid_en = '', ''
        # else:
        #     qyer_rest_tagid, qyer_rest_tagid_en = '', ''

        # shop tag
        if 'qyer' in data_dict['tagid']:
            try:
                qyer_shop_tagid, qyer_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['qyer'], 'shop')
            except Exception:
                qyer_shop_tagid, qyer_shop_tagid_en = '', ''
        else:
            qyer_shop_tagid, qyer_shop_tagid_en = '', ''

        l_norm_tag = []
        l_norm_tag_en = []
        l_norm_tag.extend(daodao_tagid.split('|'))
        l_norm_tag_en.extend(daodao_tagid_en.split('|'))
        l_norm_tag.extend(qyer_tagid.split('|'))
        l_norm_tag_en.extend(qyer_tagid_en.split('|'))

        l_other_norm_tag = []
        l_other_norm_tag.extend(daodao_shop_tagid.split('|'))
        l_other_norm_tag.extend(qyer_shop_tagid.split('|'))

        # drop empty and duplicate tags
        norm_tag = '|'.join(filter(lambda x: is_legal(x), set(l_norm_tag)))
        norm_tag_en = '|'.join(
            filter(lambda x: is_legal(x), set(l_norm_tag_en)))
        other_tag = '|'.join(
            filter(lambda x: is_legal(x), set(l_other_norm_tag)))

        # data persistence section
        # replace the old data_dict
        data_dict = new_data_dict

        # filter by name
        if data_dict['name'].lower() in (
                '', 'null',
                '0') and data_dict['name_en'].lower() in ('', 'null', '0'):
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "中英文名为空")
            logger.debug("[filter by name][name: {}][name_en: {}]".format(
                data_dict['name'], data_dict['name_en']))
            continue

        if '停业' in data_dict['name'] or '停业' in data_dict['name_en']:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "停业 POI")
            logger.debug(
                "[filter by name with close business][name: {}][name_en: {}]".
                format(data_dict['name'], data_dict['name_en']))
            continue

        # this logic was too silly, so it was removed

        # # name / name_en check
        # if data_dict['name'] != data_dict['name_en']:
        #     if data_dict['name_en'] in data_dict['name']:
        #         data_dict['name'] = data_dict['name'].replace(data_dict['name_en'], '')

        # phone handling
        if data_dict['phone'] in ('+ 新增電話號碼', '+ 新增电话号码'):
            data_dict['phone'] = ''

        # handle the restaurant price_level separately
        if poi_type == 'rest':
            data_dict['price_level'] = W2N.get(
                data_dict.get('price_level', ''), '0')

        # add the nearCity field
        nearby_city = get_nearby_city(poi_city_id=city_id,
                                      poi_map_info=data_dict['map_info'])

        # data cleaning and persistence section
        # clean out any map_info that does not follow the lng,lat convention
        try:
            lng, lat = data_dict['map_info'].split(',')
            lng = float(lng)
            lat = float(lat)
            data_dict['map_info'] = '{},{}'.format(lng, lat)
        except Exception as exc:
            logger.exception(msg="[map_info filter error][data: {}]".format(
                data_dict['map_info']),
                             exc_info=exc)
            continue

        # strip redundant characters from the name
        data_dict['name'] = data_dict['name'].replace('这是您的企业吗?', '').strip()
        if data_dict['name_en'] in data_dict[
                'name'] and data_dict['name_en'] != data_dict['name']:
            data_dict['name'] = data_dict['name'].replace(data_dict['name_en'], '')

        # field fix-up section
        # address
        if data_dict['address'].lower() in ('null', '0'):
            data_dict['address'] = ''

        # open time
        if norm_open_time.lower() in ('', 'null', '0'):
            if poi_type in ('attr', 'rest'):
                norm_open_time = '<*><*><00:00-23:55><SURE>'
            else:
                norm_open_time = '<*><*><08:00-20:00><SURE>'

        # save tag and open time values that could not be recognized
        if unknown_tag:
            insert_unknown_keywords('{}_tag'.format(poi_type), unknown_tag)
            logger.debug("[unknown tag][tags: {}]".format(unknown_tag))

        # filter out pois that are too far away
        result = poi_is_too_far(city_id, poi_map_info=data_dict['map_info'])
        if not result:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "距城市中心距离过远")
            logger.debug(
                "[poi filter by poi city distance][cid: {}][city_map: {}][poi_map_info: {}][distance: {}]"
                .format(city_id, result.city_map, data_dict['map_info'],
                        result.dist))
            continue

        # blank out phone numbers longer than 55 characters
        if len(data_dict['phone']) > 55:
            logger.debug(
                "[phone length too long][poi_id: {}][len: {}][phone: {}]".
                format(miaoji_id, len(data_dict['phone']), data_dict['phone']))
            data_dict['phone'] = ''

        if poi_type == 'attr':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'alias': alias,
                'image': data_dict['imgurl'],
                'ori_grade': data_dict['ori_grade'],
                'nearCity': nearby_city
            }

            # fields that are not updated when official is 1
            # they are updated for nonofficial and newly added data
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })

            if not has_official and not has_nonofficial:
                per_data.update({
                    # explicit update logic: status is only updated when nothing was merged before
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'add_info': '',
                    'address_en': '',
                    'event_mark': '',
                    'grade': -1.0,

                    # explicit update logic: status is only updated when nothing was merged before
                    'status_online': 'Open',
                    'status_test': 'Open'
                })

            # clean up attractions tagged as "tours"
            try:
                tagid_data = json.loads(data_dict['tagid'])
                if 'daodao' in tagid_data:
                    if is_legal(tagid_data['daodao']):
                        if '游览' in tagid_data['daodao']:
                            if 'online' in union_info:
                                # this content should have been deleted, but some shopping data in the online environment still has to be updated, so it is kept
                                # and merged; treat it as a special case, and newly added data must never be given a tour tag
                                # filter_data_already_online(poi_type, miaoji_id, "tag 中包含游览被过滤")
                                pass
                            else:
                                logger.debug("[tour filter][data: {}]".format(
                                    tagid_data['daodao']))
                                continue
            except Exception as exc:
                logger.exception(msg="[tour filter error]", exc_info=exc)

            if norm_tag == '' and other_tag != '':
                # shopping data inside attractions gets cleared
                if 'online' in union_info:
                    # this content should have been deleted, but some shopping data in the online environment still has to be updated, so it is kept
                    # and merged; treat it as a special case, and newly added data must never have shopping added to it
                    # filter_data_already_online(poi_type, miaoji_id, "景点类中存在购物数据被过滤")
                    pass
                else:
                    continue

            data.append(per_data)
        elif poi_type == 'rest':
            data.append(
                (miaoji_id, data_dict['name'], data_dict['name_en'], source,
                 city_id, data_dict['map_info'], data_dict['address'],
                 data_dict['ranking'], data_dict['grade'], data_dict['url'],
                 data_dict['phone'], data_dict['introduction'], norm_open_time,
                 data_dict['opentime'], data_dict['prize'],
                 data_dict['traveler_choice'], data_dict['commentcounts'],
                 data_dict['price'], data_dict['price_level'],
                 data_dict['cuisines'], data_dict['imgurl'],
                 data_dict['tagid'], norm_tag, norm_tag_en, nearby_city))
        elif poi_type == 'shop':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'image': data_dict['imgurl'],
                'nearCity': nearby_city
            }
            # fields that are not updated when official is 1
            # they are updated for nonofficial and newly added data
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })

            if not has_official and not has_nonofficial:
                per_data.update({
                    # default values are required before the row can be written
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'image_list': '',
                    'grade': -1.0,

                    # explicit update logic: status is only updated when nothing was merged before
                    'status_online': 'Open',
                    'status_test': 'Open'
                })
            shopping_tag = [
                '礼品与特产商店', '大型购物中心', '农贸市场', '跳蚤市场与街边市场', '古董店', '百货商场',
                '厂家直营店', '购物'
            ]
            important_shopping_tag = [
                '礼品与特产商店', '大型购物中心', '百货商场', '厂家直营店', '购物'
            ]

            # filter shopping data by tag
            tag_list = norm_tag.split('|')
            if not all([tag.strip() in shopping_tag for tag in tag_list]):
                if not any([
                        tag.strip() in important_shopping_tag
                        for tag in tag_list
                ]):
                    if 'online' in union_info:
                        filter_data_already_online(poi_type, miaoji_id,
                                                   "非购物数据被过滤")
                    continue

            data.append(per_data)
        else:
            raise TypeError("Unknown Type: {}".format(poi_type))

        if count % 300 == 0:
            db = dataset.connect(
                "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8"
            )
            table = db[data_process_table_name]
            _insert = 0
            logger.debug("Total: {}".format(count))
            _t = time.time()
            for d in data:
                _res = table.upsert(d, keys=['id'])
                if _res:
                    _insert += 1
            logger.debug(
                '[data upsert][count: {}][insert: {}][takes: {}]'.format(
                    count, _insert,
                    time.time() - _t))
            logger.debug("[city_id: {}][insert_count_this_times: {}]".format(
                cid, _insert))
            db.commit()
            data = []
        count += 1

    logger.debug("[city_id: {}][total: {}]".format(cid, count))
    _insert = 0
    db = dataset.connect(
        "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8"
    )
    table = db[data_process_table_name]
    for d in data:
        _res = table.upsert(d, keys=['id'])
        if _res:
            _insert += 1
    logger.debug("Insert: {}".format(_insert))
    db.commit()
    logger.debug("Insert: {}".format(_insert))
    conn.close()
    update_already_merge_city("{}_data".format(poi_type), cid)
Example #18
def shared_city_id_insert():
    city_list, station_list = get_city_station_list()
    data = []
    _count = 0
    near_count = 0
    far_count = 0
    no_count = 0
    no_set = set()
    no_map_info_count = 0
    no_map_info_set = set()
    for city_id, city_map_info in city_list:
        _count += 1
        if is_legal(city_map_info):
            near_station = []
            far_station = []
            for station_id, station_map_info, station_city_map_info in station_list:
                if is_legal(station_map_info):
                    distance = get_distance(city_map_info, station_map_info)
                    map_info = station_map_info
                elif is_legal(station_city_map_info):
                    distance = get_distance(city_map_info, station_city_map_info)
                    map_info = station_city_map_info
                else:
                    distance = None
                    map_info = None
                if distance is None:
                    continue
                elif distance < 20:
                    near_station.append((station_id, map_info, distance))
                elif distance < 50:
                    far_station.append((station_id, map_info, distance))

            near_station = sorted(near_station, key=lambda x: x[-1])
            far_station = sorted(far_station, key=lambda x: x[-1])

            # candidates: up to 3 nearest stations within 20 km, plus the single nearest within 20-50 km as a fallback
            new_near_station = near_station[:3] if near_station else []
            new_far_station = far_station[0] if far_station else []

            if new_near_station:
                for station_id, map_info, distance in new_near_station:
                    data.append((station_id, map_info, city_id, city_map_info,
                                 distance, '20km 匹配 3 条'))
                near_count += 1
                logger.info(
                    "[20 匹配][count: {}][near: {}][city_id: {}][station: {}]".
                    format(_count, near_count, city_id, new_near_station))
            elif new_far_station:
                station_id, map_info, distance = new_far_station
                data.append((station_id, map_info, city_id, city_map_info,
                             distance, '50km 匹配 1 条'))
                far_count += 1
                logger.info(
                    "[50 匹配][count: {}][far: {}][city_id: {}][station: {}]".
                    format(_count, far_count, city_id, new_far_station))
            else:
                no_count += 1
                logger.info(
                    "[无 station 城市][count: {}][no: {}][city_id: {}]".format(
                        _count, no_count, city_id))
                no_set.add(city_id)
                continue
        else:
            # count cities that lack map_info
            no_map_info_count += 1
            logger.info(
                "[无 map_info 城市][count: {}][no_map_info: {}][city_id: {}]".
                format(_count, no_map_info_count, city_id))
            no_map_info_set.add(city_id)
            continue

        if len(data):
            insert_db(data)
        # if len(data) > 1:
        #     insert_db(data)
        # elif len(data) == 1:
        #     insert_db(data, False)
        data = []

    logger.info(
        "[near_count: {}][far_count: {}][no_count: {}][no_map_info_count: {}]".
        format(near_count, far_count, no_count, no_map_info_count))
    logger.info("[no_set: {}]".format(no_set))
    logger.info("[no_map_info_set: {}]".format(no_map_info_set))
Example #19
0
File: poi_merge.py Project: 20113261/p_m
def insert_poi_unid(merged_dict, cid_or_geohash):
    global white_list
    global online_table_name
    global data_source_table
    start = time.time()
    # get city country name map_info
    _dev_conn = base_data_pool.connection()
    _dev_cursor = _dev_conn.cursor()
    _dev_cursor.execute('''SELECT
  city.id       AS cid,
  city.name     AS city_name,
  country.name  AS country_name,
  city.map_info AS map_info
FROM city
  JOIN country ON city.country_id = country.mid
WHERE city.id = {};'''.format(cid_or_geohash))
    cid, city_name, country, city_map_info = _dev_cursor.fetchone()
    _dev_cursor.close()
    _dev_conn.close()

    # dropped the 'total' pre-filter approach: laborious with little speedup (5.9 s to fetch all of Paris with it vs 6.9 s without), an acceptable difference
    # init id list
    # online_ids = set()
    # data_ids = set()
    # for _, s_sid_set in merged_dict.items():
    #     for source, sid in s_sid_set:
    #         if source == 'online':
    #             online_ids.add(sid)
    #         else:
    #             data_ids.add((source, sid))

    # get data total
    # get online data name name_en map_info grade star ranking address url
    total_data = {}
    _dev_conn = base_data_pool.connection()
    _dev_cursor = _dev_conn.cursor()
    try:
        _t = time.time()
        sql = '''SELECT
  id,
  name,
  name_en,
  map_info,
  grade,
  -1,
  ranking,
  address,
  ''
FROM {} WHERE city_id='{}';'''.format(online_table_name, cid_or_geohash)
        _dev_cursor.execute(sql)
        logger.debug('[query][sql: {}][takes: {}]'.format(
            sql,
            time.time() - _t))
    except Exception as exc:
        logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc)

    for line in _dev_cursor.fetchall():
        total_data[('online', line[0])] = line[1:]
    _dev_cursor.close()
    _dev_conn.close()

    # get poi name name_en map_info grade star ranking address url
    _data_conn = poi_ori_pool.connection()
    _data_cursor = _data_conn.cursor()
    try:
        _t = time.time()
        sql = '''SELECT
source,
id,
name,
name_en,
map_info,
grade,
star,
ranking,
address,
url
FROM {}
WHERE city_id='{}';'''.format(data_source_table, cid_or_geohash)
        _data_cursor.execute(sql)
        logger.debug('[query][sql: {}][takes: {}]'.format(
            sql,
            time.time() - _t))
    except Exception as exc:
        logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc)
    for line in _data_cursor.fetchall():
        total_data[(line[0], line[1])] = line[2:]
    _data_cursor.close()
    _data_conn.close()

    # init white list total data
    if white_list:
        _s_sid = []
        for _each in white_list:
            _s_sid.extend(_each)

        _ori_conn = poi_ori_pool.connection()
        _ori_cursor = _ori_conn.cursor()
        try:
            _t = time.time()
            query_sql = '''SELECT
  source,
  id,
  name,
  name_en,
  map_info,
  grade,
  star,
  ranking,
  address,
  url
FROM {}
WHERE (source, id) IN ({});'''.format(
                data_source_table,
                ','.join(map(lambda x: "('{}', '{}')".format(*x), _s_sid)))
            _ori_cursor.execute(query_sql)
            logger.debug('[query][sql: {}][takes: {}]'.format(
                query_sql,
                time.time() - _t))
        except Exception as exc:
            logger.exception("[sql exc][sql: {}]".format(query_sql), exc_info=exc)
        for line in _ori_cursor.fetchall():
            total_data[(line[0], line[1])] = line[2:]
        _ori_cursor.close()
        _ori_conn.close()

    data = []
    for uid, s_sid_set in merged_dict.items():
        for source, sid in s_sid_set:
            # name name_en map_info grade star ranking address url
            name, name_en, map_info, grade, star, ranking, address, url = total_data[
                (source, sid)]
            if not is_legal(name):
                name = ''
            if not is_legal(name_en):
                name_en = ''
            if not is_legal(grade):
                grade = -1.0
            if not is_legal(star):
                star = -1.0
            if not is_legal(ranking):
                ranking = -1.0
            if not is_legal(address):
                address = ''
            if not is_legal(url):
                url = ''

            data.append(
                (uid, cid, city_name, country, city_map_info, source, sid,
                 name, name_en, map_info, grade, star, ranking, address, url))

            #     data = []
            #     _dev_conn = base_data_pool.connection()
            #     _dev_cursor = _dev_conn.cursor()
            #     _data_conn = poi_ori_pool.connection()
            #     _data_cursor = _data_conn.cursor()
            #     for uid, s_sid_set in merged_dict.items():
            #         for source, sid in s_sid_set:
            #             if source == 'online':
            #                 _dev_cursor.execute('''SELECT
            #   name,
            #   name_en,
            #   map_info,
            #   grade,
            #   -1,
            #   ranking,
            #   address,
            #   ''
            # FROM chat_attraction
            # WHERE id = '{}';'''.format(sid))
            #                 try:
            #                     name, name_en, map_info, grade, star, ranking, address, url = _dev_cursor.fetchone()
            #                 except Exception as exc:
            #                     logger.exception("[error sql query][source: {}][sid: {}]".format(source, sid), exc_info=exc)
            #                     continue
            #             else:
            #                 _data_cursor.execute('''SELECT
            #   CASE WHEN name NOT IN ('NULL', '', NULL)
            #     THEN name
            #   ELSE '' END,
            #   CASE WHEN name_en NOT IN ('NULL', '', NULL)
            #     THEN name_en
            #   ELSE '' END,
            #   map_info,
            #   CASE WHEN grade NOT IN ('NULL', '', NULL)
            #     THEN grade
            #   ELSE -1.0 END AS grade,
            #   CASE WHEN star NOT IN ('NULL', '', NULL)
            #     THEN star
            #   ELSE -1.0 END AS star,
            #   CASE WHEN ranking NOT IN ('NULL', '', NULL)
            #     THEN ranking
            #   ELSE -1.0 END AS ranking,
            #   CASE WHEN address NOT IN ('NULL', '', NULL)
            #     THEN address
            #   ELSE '' END,
            #   CASE WHEN url NOT IN ('null', '', NULL)
            #     THEN url
            #   ELSE '' END
            # FROM attr
            # WHERE source = '{}' AND id = '{}';'''.format(source, sid))
            #                 try:
            #                     name, name_en, map_info, grade, star, ranking, address, url = _data_cursor.fetchone()
            #                 except Exception as exc:
            #                     logger.exception("[error sql query][source: {}][sid: {}]".format(source, sid), exc_info=exc)
            #                     continue
            #
            #             data.append((uid, cid, city_name, country, city_map_info, source, sid, name, name_en, map_info, grade,
            #                          star, ranking, address, url))
            #     _dev_cursor.close()
            #     _data_cursor.close()
            #     _dev_conn.close()
            #     _data_conn.close()

    _final_conn = poi_ori_pool.connection()
    _final_cursor = _final_conn.cursor()
    # for d in data:
    try:
        _t = time.time()
        sql = '''REPLACE INTO {}_unid (id, city_id, city_name, country_name, city_map_info, source, source_id, name, name_en, map_info, grade, star, ranking, address, url)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''.format(
            poi_type)
        _final_cursor.executemany(sql, data)
        logger.debug('[query][sql: {}][takes: {}]'.format(
            sql,
            time.time() - _t))
    except Exception as exc:
        logger.exception("[insert unid table error]", exc_info=exc)
    _final_conn.commit()
    _final_cursor.close()
    _final_conn.close()

    logger.info(
        "[finish prepare data][city: {}][line_count: {}][takes: {}]".format(
            cid_or_geohash, len(data),
            time.time() - start))
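For reference, insert_poi_unid expects merged_dict to map each merged uid to the set of (source, source_id) pairs fused into it, which is how the double loop near the end consumes it. A hypothetical call (all ids below are made up):

# hypothetical input: uid -> {(source, source_id), ...}
merged_dict = {
    'v10001': {('online', 'a_123'), ('daodao', '456789')},
    'v10002': {('qyer', '987654')},
}
insert_poi_unid(merged_dict, cid_or_geohash='10001')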
Example #20
0
    def get_data(self):
        for line in self.private_db.query(
                '''SELECT * from {0} WHERE ptid = "qyoa" AND disable = 0 AND city_id=50016;'''
                .format(self.src_table_name)):
            data = copy.deepcopy(self.default_val)
            for k in self.keys:
                # drop values that are not used
                if k in self.skip_keys:
                    continue
                # skip empty values
                if is_legal(line[k]):
                    data[(self.key_map.get(k, None) or k)] = line[k]

            # take the id from the refer column when present
            if line['refer'] is not None:
                if line['refer'] != '':
                    data[self.id_key] = line['refer']

            if not self.need_new_data:
                if self.id_key not in data:
                    self.errors.append(line)
                    continue
                if not data[self.id_key]:
                    self.errors.append(line)
                    continue

            # if an id-map relation should be added, obtain the id as follows
            if self.need_add_id_map:
                # look the id up in id_map first
                if self.id_key not in data:
                    id_map_miaoji_id = self.id_map.get(line[self.id_key], None)
                    if id_map_miaoji_id:
                        data[self.id_key] = id_map_miaoji_id

                # otherwise generate a brand-new id
                if self.id_key not in data:
                    data[self.id_key] = self.get_new_id()

            # update city_id for attraction, shopping and restaurant rows
            if self.need_update_city_id:
                data['city_id'] = self.id_map.get(data['city_id'],
                                                  None) or data['city_id']

            # record every id produced in this run
            self.data_id_set.add(data[self.id_key])

            # if the id mapping should be kept
            if self.need_add_id_map:
                # persist the old-id -> new-id mapping
                self.insert_id_map(line[self.id_key], data[self.id_key])

            if self.need_update_status:
                # update status for the attraction / shopping / restaurant tables
                if self.need_update_city_id:
                    data['status_test'] = "Open"
                    data['status_online'] = "Open"
                    data['dept_status_online'] = "Open"
                    data['dept_status_test'] = "Open"
                # set status for the city table
                else:
                    data['status_test'] = "Open"
                    data['status_online'] = "Open"

            yield data
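Since get_data is a generator, callers drive it row by row; a minimal consumption sketch, assuming importer is an instance of the (unshown) class that defines this method along with the data_id_set and errors attributes used above:

rows = list(importer.get_data())
print("rows: {}, distinct ids: {}, errors: {}".format(
    len(rows), len(importer.data_id_set), len(importer.errors)))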