예제 #1
0
    def shuffle_for_area(self, re_data):
        """Fill in coordinate/region fields and the branch ``CODE_`` of a record.

        The record's ``ADDR_`` is passed to ``get_lat_lng``.  When the returned
        ``status`` is 0, the location is stored in ``LNG_``/``LAT_`` and passed
        to ``get_area`` to fill the district, city and province fields.
        Otherwise ``LNG_`` and ``LAT_`` are set to empty strings and the region
        fields are left untouched.

        Args:
            re_data: Mutable mapping holding at least ``ADDR_`` and
                ``BANK_CODE_``.  ``AREA_CODE_`` must already be present when
                the ``get_lat_lng`` lookup does not succeed, because ``CODE_``
                is built from it.

        Returns:
            The same ``re_data`` object, modified in place.
        """
        # Normalise the "STATUS_1" flag key into STATUS_ = "1".
        if "STATUS_1" in re_data:
            re_data.pop("STATUS_1")
            re_data["STATUS_"] = "1"

        geocoded = get_lat_lng(re_data["ADDR_"])
        if geocoded["status"] != 0:
            re_data["LNG_"] = ""
            re_data["LAT_"] = ""
        else:
            point = geocoded["result"]["location"]
            re_data["LNG_"] = str(point["lng"])
            re_data["LAT_"] = str(point["lat"])

            reverse = get_area(
                lat_lng=",".join((re_data["LAT_"], re_data["LNG_"])))
            # TODO: decide whether to use formatted_address
            component = reverse["result"]["addressComponent"]
            re_data["DISTRICT_NAME_"] = component["district"]
            adcode = component["adcode"]
            re_data["DISTRICT_CODE_"] = adcode
            re_data["AREA_CODE_"] = adcode
            re_data["CITY_"] = component["city"]
            # City and province codes are prefixes of adcode padded with "00".
            re_data["CITY_CODE_"] = adcode[:4] + "00"
            re_data["PROVINCE_NAME_"] = component["province"]
            re_data["PROVINCE_CODE_"] = adcode[:2] + "00"
            # TODO: decide whether to use formatted_location

        # Branch CODE_: <bank code>_<area code>_<md5 hex digest of the address>
        addr_digest = hashlib.md5(re_data["ADDR_"].encode("utf-8")).hexdigest()
        re_data["CODE_"] = "_".join(
            (re_data["BANK_CODE_"], re_data["AREA_CODE_"], addr_digest))
        return re_data
예제 #2
0
def data_shuffle(data, province_list, city_list, area_list):
    """Build a normalised CMBC self-service bank record from a scraped one.

    The province name is resolved from the first two digits of
    ``AREA_CODE_`` (or ``CITY_CODE_`` when no area code is present) and
    prepended to the address unless the address already contains it.  The
    address is then passed to ``get_lat_lng`` and, on success, the returned
    coordinates are passed to ``get_area`` to fill the area, city and
    province fields.

    Args:
        data: Source record.  Reads ``ENTITY_NAME_``, ``DATETIME_``,
            ``ADDR_``, ``NAME_``, ``ENTITY_CODE_`` and ``URL_``; optionally
            ``AREA_CODE_``, ``CITY_CODE_``, ``TEL_``, ``BUSINESS_HOURS_`` and
            ``SOURCE_TYPE_NAME_``.
        province_list: Dicts with ``CODE_`` and ``NAME_`` keys.
        city_list: Dicts with ``CODE_`` and ``NAME_`` keys.
        area_list: Unused; kept for signature compatibility.

    Returns:
        A new dict with the cleaned fields.

    Raises:
        KeyError: If a required key is missing from ``data``.
    """
    re_data = dict()

    # Province code = first two digits of the most specific region code
    # available, padded with "00".  The code goes into ``prov_c`` so the
    # lookup below can actually match; the original stored it in ``prov_n``
    # and ended up prefixing the address with digits.
    region_code = data.get("AREA_CODE_") or data.get("CITY_CODE_")
    prov_n = ""
    if region_code:
        prov_c = region_code[:2] + "00"
        for pro in province_list:
            if pro["CODE_"] == prov_c:
                prov_n = pro["NAME_"]
                break

    # "C"
    re_data["BANK_CODE_"] = "CMBC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-5]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    addr_ = data["ADDR_"]
    # Only prepend the province when it is known and not already present,
    # so the address never gets a duplicated province name.
    if prov_n and prov_n not in addr_:
        addr_ = prov_n + addr_
    re_data["ADDR_"] = addr_

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable location in the response: leave coordinates empty.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City/province codes are prefixes of the area code padded
            # with "00"; names come from the supplied lookup lists.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "CMBC" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        # The source value is replaced by a fixed round-the-clock range.
        re_data["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"

    return re_data
예제 #3
0
def _expand_region_name(addr, name, window, insert_at, search_whole=False):
    """Make sure ``addr`` carries the full region ``name`` near its start.

    Args:
        addr: Address string to normalise.
        name: Full region name (province or city).
        window: Length of the leading slice of ``addr`` in which abbreviated
            forms of ``name`` are searched and replaced.
        insert_at: Index at which ``name`` is inserted when neither the full
            name nor an abbreviation is found.
        search_whole: Look for the full ``name`` in the whole address instead
            of only in the leading ``window`` characters.

    Returns:
        ``addr`` unchanged when the full name is found; otherwise ``addr``
        with the first matching abbreviation in the window expanded, or with
        ``name`` inserted at ``insert_at``.
    """
    scope = addr if search_whole else addr[:window]
    if name in scope:
        return addr
    head, tail = addr[:window], addr[window:]
    # Abbreviations are tried from longest to shortest: the name without its
    # last character, then its first 4, 3 and 2 characters.
    for short in (name[:-1], name[:4], name[:3], name[:2]):
        if short in head:
            return head.replace(short, name) + tail
    return addr[:insert_at] + name + addr[insert_at:]


def data_shuffle(data, province_list, city_list, area_list):
    """Build a normalised PAB branch record from a scraped one.

    The province and city are resolved from ``data["CITY_NAME_"]`` using the
    lookup lists, the address is rewritten to start with the full province
    and city names, and the result is passed to ``get_lat_lng`` /
    ``get_area`` to fill coordinates and the area, city and province fields.

    When the province or city cannot be resolved, the corresponding name is
    treated as empty and that part of the address is left unchanged; the
    original raised ``TypeError`` here.  The district values that used to be
    matched locally were never written to the result, so that matching has
    been removed.

    Args:
        data: Source record.  Reads ``CITY_NAME_``, ``ADDR_``,
            ``ENTITY_NAME_``, ``DATETIME_``, ``NAME_``, ``ENTITY_CODE_`` and
            ``URL_``; optionally ``TEL_``, ``BUSINESS_HOURS_`` and
            ``SOURCE_TYPE_NAME_``.
        province_list: Dicts with ``CODE_`` and ``NAME_`` keys.
        city_list: Dicts with ``CODE_`` and ``NAME_`` keys.
        area_list: Dicts with ``CODE_`` and ``NAME_`` keys; used only to map
            a district-level ``CITY_NAME_`` back to its city.

    Returns:
        A new dict with the cleaned fields.

    Raises:
        KeyError: If a required key is missing from ``data``.
    """
    re_data = dict()
    prov_n = ""
    prov_c = None
    city_n = ""
    city_c = None
    raw_city = data["CITY_NAME_"]
    raw_addr = data["ADDR_"]

    # City-level cleaning: try an exact name match first.
    for city in city_list:
        if city["NAME_"] == raw_city:
            city_n = city["NAME_"]
            city_c = city["CODE_"]
            prov_c = city["CODE_"][:2] + "00"
            break
    if not city_n:
        # Then match the city name without its last character.  There is
        # deliberately no break: the last matching entry wins.
        for city in city_list:
            if city["NAME_"][:-1] == raw_city:
                city_n = city["NAME_"]
                city_c = city["CODE_"]
                prov_c = city["CODE_"][:2] + "00"
    if not city_n:
        # Finally treat CITY_NAME_ as a district name and derive the parent
        # city/province codes from the district code.
        for area in area_list:
            if area["NAME_"][:-1] == raw_city:
                city_c = area["CODE_"][:-2] + "00"
                prov_c = area["CODE_"][:2] + "00"
                break
        for city in city_list:
            if city["CODE_"] == city_c:
                city_n = city["NAME_"]

    # Province-level cleaning: a province whose first two characters equal
    # CITY_NAME_ doubles as the city; otherwise match by province code.
    for prov in province_list:
        if prov["NAME_"][:2] == raw_city:
            prov_n = prov["NAME_"]
            city_n = prov["NAME_"]
            break
        elif prov["CODE_"] == prov_c:
            prov_n = prov["NAME_"]
            break

    # Special cases: hard-coded city/province for these two names.
    if "个旧市" in raw_addr:
        city_n = "红河哈尼族彝族自治州"
        prov_n = "云南省"
    elif "辛集市" in raw_addr:
        city_n = "石家庄市"
        prov_n = "河北省"

    # Address cleaning: the province goes first, the city right after it.
    addr_ = _expand_region_name(
        raw_addr, prov_n, len(prov_n), 0, search_whole=True)
    addr_ = _expand_region_name(
        addr_, city_n, len(prov_n) + len(city_n), len(prov_n))

    # "C"
    re_data["BANK_CODE_"] = "PAB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = addr_
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable location in the response: leave coordinates empty.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City/province codes are prefixes of the area code padded
            # with "00"; names come from the supplied lookup lists.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "PAB" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"

    return re_data
예제 #4
0
def data_shuffle(data, province_list, city_list, area_list):
    """Build a normalised CBHB branch record from a scraped one.

    ``CITY_NAME_`` is first checked against the four municipality names
    (matched through ``province_list``) and only looked up in ``city_list``
    when no municipality was resolved.  The address is then passed to
    ``get_lat_lng`` and, on success, the coordinates are passed to
    ``get_area``; the area code returned there overrides the city and
    province codes derived from the name.

    Args:
        data: Source record.  Reads ``CITY_NAME_``, ``ENTITY_NAME_``,
            ``DATETIME_``, ``ADDR_``, ``NAME_``, ``ENTITY_CODE_`` and
            ``URL_``; optionally ``TEL_`` and ``SOURCE_TYPE_NAME_``.
        province_list: Dicts with ``CODE_`` and ``NAME_`` keys.
        city_list: Dicts with ``CODE_``, ``NAME_`` and ``PARENT_`` keys.
        area_list: Unused; kept for signature compatibility.

    Returns:
        A new dict with the cleaned fields.

    Raises:
        KeyError: If a required key is missing from ``data``.
    """
    re_data = dict()

    # The original used ``for ... else`` here, but its only ``break`` was in
    # the inner loop, so the ``else`` branch always ran and the city lookup
    # could overwrite a municipality match.  An explicit flag makes the
    # fallback conditional.
    municipality_found = False
    for each in ["北京市", "天津市", "上海市", "重庆市"]:
        if each not in data["CITY_NAME_"]:
            continue
        for pro in province_list:
            if pro["NAME_"] == each:
                # A municipality is both the province and the city.
                re_data["PROVINCE_NAME_"] = pro["NAME_"]
                re_data["PROVINCE_CODE_"] = pro["CODE_"]
                re_data["CITY_NAME_"] = pro["NAME_"]
                re_data["CITY_CODE_"] = pro["CODE_"][:3] + "100"
                municipality_found = True
                break

    if not municipality_found:
        for city in city_list:
            if city["NAME_"] in data["CITY_NAME_"]:
                re_data["CITY_NAME_"] = city["NAME_"]
                re_data["CITY_CODE_"] = city["CODE_"]
                re_data["PROVINCE_CODE_"] = city["PARENT_"]
                break
        if re_data.get("PROVINCE_CODE_"):
            for pro in province_list:
                if pro["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = pro["NAME_"]
                    break

    # "C"
    re_data["BANK_CODE_"] = "CBHB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = data["ADDR_"]
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable location in the response: leave coordinates empty.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City/province codes are prefixes of the area code padded
            # with "00"; names come from the supplied lookup lists.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "CBHB" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    # Business hours are always written as a fixed round-the-clock range.
    re_data["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
예제 #5
0
def _expand_region_name(addr, name, window, insert_at, search_whole=False):
    """Make sure ``addr`` carries the full region ``name`` near its start.

    Args:
        addr: Address string to normalise.
        name: Full region name (province or city).
        window: Length of the leading slice of ``addr`` in which abbreviated
            forms of ``name`` are searched and replaced.
        insert_at: Index at which ``name`` is inserted when neither the full
            name nor an abbreviation is found.
        search_whole: Look for the full ``name`` in the whole address instead
            of only in the leading ``window`` characters.

    Returns:
        ``addr`` unchanged when the full name is found; otherwise ``addr``
        with the first matching abbreviation in the window expanded, or with
        ``name`` inserted at ``insert_at``.
    """
    scope = addr if search_whole else addr[:window]
    if name in scope:
        return addr
    head, tail = addr[:window], addr[window:]
    # Abbreviations are tried from longest to shortest: the name without its
    # last character, then its first 4, 3 and 2 characters.
    for short in (name[:-1], name[:4], name[:3], name[:2]):
        if short in head:
            return head.replace(short, name) + tail
    return addr[:insert_at] + name + addr[insert_at:]


def data_shuffle(data, province_list, city_list, area_list):
    """Build a normalised CMB self-service bank record from a scraped one.

    The province and city are resolved by comparing the first two characters
    of ``data["CITY_NAME_"]`` with the lookup lists, the address is rewritten
    to start with the full province and city names, and the result is passed
    to ``get_lat_lng`` / ``get_area`` to fill coordinates and the area, city
    and province fields.

    When the province or city cannot be resolved, the corresponding name is
    treated as empty and that part of the address is left unchanged; the
    original raised ``TypeError`` here.  The district values that used to be
    matched locally were never written to the result, so that matching has
    been removed.

    Args:
        data: Source record.  Reads ``CITY_NAME_``, ``ADDR_``,
            ``ENTITY_NAME_``, ``DATETIME_``, ``NAME_``, ``ENTITY_CODE_`` and
            ``URL_``; optionally ``TEL_``, ``BUSINESS_HOURS_`` and
            ``SOURCE_TYPE_NAME_``.
        province_list: Dicts with ``CODE_`` and ``NAME_`` keys.
        city_list: Dicts with ``CODE_`` and ``NAME_`` keys.
        area_list: Dicts with ``CODE_`` and ``NAME_`` keys; used only to
            derive city/province codes from a district-level name.

    Returns:
        A new dict with the cleaned fields.

    Raises:
        KeyError: If a required key is missing from ``data``.
    """
    re_data = dict()
    prov_n = ""
    prov_c = None
    city_n = ""
    city_c = None
    name_key = data["CITY_NAME_"][:2]
    raw_addr = data["ADDR_"]

    # Province/city cleaning.  A district whose name starts with the same
    # two characters provides initial city and province codes.
    for area in area_list:
        if area["NAME_"][:2] == name_key:
            city_c = area["CODE_"][:-2] + "00"
            prov_c = area["CODE_"][:2] + "00"
            break
    # No break in the next two loops: later entries may override earlier
    # ones, and the code comparisons use the values updated so far.
    for city in city_list:
        if city["NAME_"][:2] == name_key:
            city_n = city["NAME_"]
            city_c = city["CODE_"]
            prov_c = city["CODE_"][:2] + "00"
        elif city_c == city["CODE_"]:
            city_n = city["NAME_"]
    for prov in province_list:
        if prov["NAME_"][:2] == name_key:
            # The province name doubles as the city name in this case.
            prov_n = prov["NAME_"]
            prov_c = prov["CODE_"]
            city_n = prov["NAME_"]
        elif prov_c == prov["CODE_"]:
            prov_n = prov["NAME_"]

    # Address cleaning: the province goes first, the city right after it.
    addr_ = _expand_region_name(
        raw_addr, prov_n, len(prov_n), 0, search_whole=True)
    addr_ = _expand_region_name(
        addr_, city_n, len(prov_n) + len(city_n), len(prov_n))

    # "C"
    re_data["BANK_CODE_"] = "CMB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-6]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = addr_
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable location in the response: leave coordinates empty.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City/province codes are prefixes of the area code padded
            # with "00"; names come from the supplied lookup lists.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "CMB" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]

    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"

    return re_data
예제 #6
0
def data_shuffle(data, province_list, city_list, area_list):
    for city in city_list:
        if city["NAME_"] == "县":
            city_list.remove(city)

    re_data = dict()
    addr_ = None
    area_c = None
    area_n = None
    city_c = None
    city_n = None
    prov_c = None
    prov_n = None

    # 西藏地区编码与数据库编码不符,单独清理
    if "西藏" in data["PROVINCE_NAME_"]:
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("西藏自治区", "")
        if "西藏" in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("西藏", "西藏自治区")
        else:
            data["ADDR_"] = "西藏自治区" + data["ADDR_"]

        for city in city_list:
            if city["CODE_"][:2] == "54":
                if data["CITY_NAME_"][:2] == city["NAME_"][:2]:
                    data["ADDR_"] = data["ADDR_"].replace(
                        data["CITY_NAME_"], city["NAME_"])
                    data["CITY_NAME_"] = city["NAME_"]
                    data["CITY_CODE_"] = city["CODE_"]
                    data["ADDR_"] = data["ADDR_"].replace(
                        data["CITY_NAME_"][:-1] + "地区", data["CITY_NAME_"])

                if data["CITY_NAME_"][:-1] not in data["ADDR_"]:
                    data["ADDR_"] = data["ADDR_"][:5] + data[
                        "CITY_NAME_"] + data["ADDR_"][5:]

    # 青海地区编码与数据库编码不符,单独清理
    if "青海" in data["PROVINCE_NAME_"]:
        data["PROVINCE_NAME_"] = "青海省"
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("青海", "")

        if "青海省" not in data["ADDR_"]:
            data["ADDR_"] = "青海省" + data["ADDR_"]

        for city in city_list:
            if city["CODE_"][:2] == "63":
                if city["NAME_"][:2] == data["CITY_NAME_"][:2]:
                    data["CITY_NAME_"] = city["NAME_"]
                    data["CITY_CODE_"] = city["CODE_"]

            if data["CITY_NAME_"][:-1] not in data["ADDR_"]:
                data["ADDR_"] = data["ADDR_"][:3] + data["CITY_NAME_"] + data[
                    "ADDR_"][3:]

    # 新疆地区编码与数据库编码不符,单独清理
    if "新疆" in data["PROVINCE_NAME_"]:
        data["PROVINCE_NAME_"] = "新疆维吾尔自治区"
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("新疆维吾尔自治区", "")
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("新疆", "")

        if ("新疆维吾尔自治区" not in data["ADDR_"]) and ("新疆" not in data["ADDR_"]):
            data["ADDR_"] = "新疆维吾尔自治区" + data["ADDR_"]
        elif ("新疆" in data["ADDR_"]) and ("新疆维吾尔自治区" not in data["ADDR_"]):
            data["ADDR_"] = "新疆维吾尔自治区" + data["ADDR_"][2:]

        for city in city_list:
            if city["CODE_"][:2] == "65":
                if city["NAME_"][:2] == data["CITY_NAME_"][:2]:
                    data["CITY_NAME_"] = city["NAME_"]
                    data["CITY_CODE_"] = city["CODE_"]

        # 哈密市只有一个伊州区,网点信息都是此区的
        if data["CITY_NAME_"] == "哈密市":
            data["AREA_NAME_"] = "伊州区"
            data["AREA_CODE_"] = "650502"
        for area in area_list:
            if area["CODE_"][:2] == "65":
                if area["NAME_"][:2] in data["AREA_NAME_"]:
                    data["AREA_NAME_"] = area["NAME_"]
                    data["AREA_CODE_"] = area["CODE_"]

    # 内蒙古, 广西, 宁夏 字段统一:
    if (("内蒙古" in data["ADDR_"]) or ("广西" in data["ADDR_"])
            or ("新疆" in data["ADDR_"]) or ("宁夏" in data["ADDR_"])):
        if data["PROVINCE_NAME_"] not in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("内蒙古", "内蒙古自治区")
            data["ADDR_"] = data["ADDR_"].replace("广西", "广西壮族自治区")
            data["ADDR_"] = data["ADDR_"].replace("宁夏", "宁夏回族自治区")
        if data["PROVINCE_NAME_"] in data["CITY_NAME_"]:
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"], "")
        if data["CITY_NAME_"][:-1] not in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"][:len(data["PROVINCE_NAME_"])] + data[
                "CITY_NAME_"] + data["ADDR_"][len(data["PROVINCE_NAME_"]):]
            data["ADDR_"] = re.sub(
                r"{}{}地?区?市?".format(data["CITY_NAME_"],
                                     data["CITY_NAME_"][:2]),
                data["CITY_NAME_"], data["ADDR_"])
        if "区区" in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("区区", "区")

    # 吉林省吉林市清洗
    if "吉林" in data["PROVINCE_NAME_"]:
        if "吉林市" not in data["CITY_NAME_"]:
            data["CITY_NAME_"] = data["CITY_NAME_"].replace("吉林省", "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace("吉林", "")
            data["CITY_CODE_"] = "220200"

    # 省级名称清洗
    for prov in province_list:
        if prov["CODE_"][:2] == data["PROVINCE_CODE_"][:2]:
            data["PROVINCE_CODE_"] = prov["CODE_"]
            data["PROVINCE_NAME_"] = prov["NAME_"]
            break

    # 市级清洗
    if data["PROVINCE_NAME_"][:2] in data["CITY_NAME_"]:
        if data["CITY_NAME_"] == '北京市' or data["CITY_NAME_"] == '天津市' or data[
                "CITY_NAME_"] == '上海市' or data["CITY_NAME_"] == '重庆市':
            pass
        else:
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"], "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"][:-1], "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"][:3], "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"][:2], "")
    for city in city_list:
        if city["NAME_"] == "市辖区":
            continue
        elif city["CODE_"][:2] == data["PROVINCE_CODE_"][:2]:
            if city["CODE_"] == data["CITY_CODE_"]:
                data["CITY_CODE_"] = city["CODE_"]
                data["CITY_NAME_"] = city["NAME_"]
                break
            elif (city["NAME_"][:2] == data["CITY_NAME_"][:2]) and (
                    city["CODE_"] != data["CITY_CODE_"]):
                data["CITY_CODE_"] = city["CODE_"]
                data["CITY_NAME_"] = city["NAME_"]
                break
            elif (city["NAME_"] in data["ADDR_"][:len(data["PROVINCE_NAME_"]) +
                                                 len(city["NAME_"])]) and (
                                                     not data["CITY_NAME_"]):
                data["CITY_CODE_"] = city["CODE_"]
                data["CITY_NAME_"] = city["NAME_"]
                break

    # 区县级清洗
    if data["PROVINCE_NAME_"][:2] in data["AREA_NAME_"]:
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:-1], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:4], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:3], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:2], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(data["CITY_NAME_"], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["CITY_NAME_"][:-1], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["CITY_NAME_"][:3], "")
        # data["AREA_NAME_"] = data["AREA_NAME_"].replace(data["CITY_NAME_"][:2], "")
        data["AREA_NAME_"] = data["AREA_NAME_"][:2].replace(
            "地区", "") + data["AREA_NAME_"][2:]
    for area in area_list:
        if area["CODE_"][:2] == data["PROVINCE_CODE_"][:2]:
            if area["CODE_"] == data["AREA_CODE_"]:
                data["AREA_NAME_"] = area["NAME_"]
                data["AREA_CODE_"] = area["CODE_"]
                break
            elif (area["NAME_"] == data["AREA_NAME_"]) and (
                    area["CODE_"] != data["AREA_CODE_"]):
                data["AREA_NAME_"] = area["NAME_"]
                data["AREA_CODE_"] = area["CODE_"]
                break
            elif (
                (area["NAME_"] in data["ADDR_"][:len(data["PROVINCE_NAME_"]) +
                                                len(data["CITY_NAME_"]) +
                                                len(area["NAME_"])])
                    and (not data["AREA_NAME_"])):
                data["CITY_CODE_"] = city["CODE_"]
                data["CITY_NAME_"] = city["NAME_"]

    # 地址清洗
    # 地址中有省级和市级
    if (data["PROVINCE_NAME_"] in data["ADDR_"]) and (data["CITY_NAME_"]
                                                      in data["ADDR_"]):
        addr_ = data["ADDR_"]

    # 地址中有省级没有市级
    elif (data["PROVINCE_NAME_"] in data["ADDR_"]) and (data["CITY_NAME_"]
                                                        not in data["ADDR_"]):
        if data["CITY_NAME_"][:-1] in data["ADDR_"][:len(data["PROVINCE_NAME_"]
                                                         ) +
                                                    len(data["CITY_NAME_"])]:
            addr_ = (data["ADDR_"][:len(data["PROVINCE_NAME_"])] +
                     data["ADDR_"]
                     [len(data["PROVINCE_NAME_"]):len(data["PROVINCE_NAME_"]) +
                      len(data["CITY_NAME_"])].replace(data["CITY_NAME_"][:-1],
                                                       data["CITY_NAME_"]) +
                     data["ADDR_"][len(data["PROVINCE_NAME_"]) +
                                   len(data["CITY_NAME_"]):])
        elif data["CITY_NAME_"][:3] in data[
                "ADDR_"][:len(data["PROVINCE_NAME_"]) +
                         len(data["CITY_NAME_"])]:
            addr_ = (data["ADDR_"][:len(data["PROVINCE_NAME_"])] +
                     data["ADDR_"]
                     [len(data["PROVINCE_NAME_"]):len(data["PROVINCE_NAME_"]) +
                      len(data["CITY_NAME_"])].replace(data["CITY_NAME_"][:3],
                                                       data["CITY_NAME_"]) +
                     data["ADDR_"][len(data["PROVINCE_NAME_"]) +
                                   len(data["CITY_NAME_"]):])
        elif data["CITY_NAME_"][:2] in data[
                "ADDR_"][:len(data["PROVINCE_NAME_"]) +
                         len(data["CITY_NAME_"])]:
            addr_ = (data["ADDR_"][:len(data["PROVINCE_NAME_"])] +
                     data["ADDR_"]
                     [len(data["PROVINCE_NAME_"]):len(data["PROVINCE_NAME_"]) +
                      len(data["CITY_NAME_"])].replace(data["CITY_NAME_"][:2],
                                                       data["CITY_NAME_"]) +
                     data["ADDR_"][len(data["PROVINCE_NAME_"]) +
                                   len(data["CITY_NAME_"]):])
        else:
            addr_ = (data["ADDR_"][:len(data["PROVINCE_NAME_"])] +
                     data["CITY_NAME_"] +
                     data["ADDR_"][len(data["PROVINCE_NAME_"]):])

    # 地址中没有省级有市级
    elif (data["PROVINCE_NAME_"] not in data["ADDR_"]) and (data["CITY_NAME_"]
                                                            in data["ADDR_"]):
        if data["PROVINCE_NAME_"][:-1] in data[
                "ADDR_"][:len(data["PROVINCE_NAME_"])]:
            if data["CITY_NAME_"] == "吉林市" and ("吉林省" not in data["ADDR_"]):
                addr_ = data["PROVINCE_NAME_"] + data["ADDR_"]
            else:
                addr_ = (data["ADDR_"][:len(data["PROVINCE_NAME_"])].replace(
                    data["PROVINCE_NAME_"][:-1], data["PROVINCE_NAME_"]) +
                         data["ADDR_"][len(data["PROVINCE_NAME_"]):])
        elif (data["PROVINCE_NAME_"][:3] in data["ADDR_"][:len(data["PROVINCE_NAME_"])]) and \
                (data["CITY_NAME_"] in data["ADDR_"]):
            addr_ = (data["ADDR_"][:len(data["PROVINCE_NAME_"])].replace(
                data["PROVINCE_NAME_"][:3], data["PROVINCE_NAME_"]) +
                     data["ADDR_"][len(data["PROVINCE_NAME_"]):])
        elif (data["PROVINCE_NAME_"][:2] in data["ADDR_"][:len(data["PROVINCE_NAME_"])]) and\
                (data["CITY_NAME_"] in data["ADDR_"]):
            addr_ = (data["ADDR_"][:len(data["PROVINCE_NAME_"])].replace(
                data["PROVINCE_NAME_"][:2], data["PROVINCE_NAME_"]) +
                     data["ADDR_"][len(data["PROVINCE_NAME_"]):])
        else:
            addr_ = data["PROVINCE_NAME_"] + data["ADDR_"]

    # 地址中没有省级没有市级
    elif (data["PROVINCE_NAME_"]
          not in data["ADDR_"]) and (data["CITY_NAME_"] not in data["ADDR_"]):
        if data["CITY_NAME_"][:-1] in data["ADDR_"][:len(data["CITY_NAME_"])]:
            addr_ = (data["PROVINCE_NAME_"] +
                     data["ADDR_"][:len(data["CITY_NAME_"])].replace(
                         data["CITY_NAME_"][:-1], data["CITY_NAME_"]) +
                     data["ADDR_"][len(data["CITY_NAME_"]):])
        elif data["CITY_NAME_"][:3] in data["ADDR_"][:len(data["CITY_NAME_"])]:
            addr_ = (data["PROVINCE_NAME_"] +
                     data["ADDR_"][:len(data["CITY_NAME_"])].replace(
                         data["CITY_NAME_"][:3], data["CITY_NAME_"]) +
                     data["ADDR_"][len(data["CITY_NAME_"]):])
        elif data["CITY_NAME_"][:2] in data["ADDR_"][:len(data["CITY_NAME_"])]:
            addr_ = (data["PROVINCE_NAME_"] +
                     data["ADDR_"][:len(data["CITY_NAME_"])].replace(
                         data["CITY_NAME_"][:2], data["CITY_NAME_"]) +
                     data["ADDR_"][len(data["CITY_NAME_"]):])
        else:
            addr_ = data["PROVINCE_NAME_"] + data["CITY_NAME_"] + data["ADDR_"]

    # 地址中有区县级
    if data["AREA_NAME_"] in addr_:
        pass
    # 直辖市
    elif data["CITY_CODE_"] == data["PROVINCE_CODE_"]:
        if data["AREA_NAME_"][:-1] in addr_[:len(data["PROVINCE_NAME_"]) +
                                            len(data["AREA_NAME_"])]:
            addr_ = (
                addr_[:len(data["PROVINCE_NAME_"]) +
                      len(data["AREA_NAME_"])].replace(data["AREA_NAME_"][:-1],
                                                       data["AREA_NAME_"]) +
                addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
        elif data["AREA_NAME_"][:4] in addr_[:len(data["PROVINCE_NAME_"]) +
                                             len(data["AREA_NAME_"])]:
            addr_ = (
                addr_[:len(data["PROVINCE_NAME_"])].replace(
                    data["AREA_NAME_"][:4], data["AREA_NAME_"]) +
                addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
        elif data["AREA_NAME_"][:3] in addr_[:len(data["PROVINCE_NAME_"]) +
                                             len(data["AREA_NAME_"])]:
            addr_ = (
                addr_[:len(data["PROVINCE_NAME_"]) +
                      len(data["AREA_NAME_"])].replace(data["AREA_NAME_"][:3],
                                                       data["AREA_NAME_"]) +
                addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
        elif data["AREA_NAME_"][:2] in addr_[:len(data["PROVINCE_NAME_"]) +
                                             len(data["AREA_NAME_"])]:
            addr_ = (
                addr_[:len(data["PROVINCE_NAME_"]) +
                      len(data["AREA_NAME_"])].replace(data["AREA_NAME_"][:2],
                                                       data["AREA_NAME_"]) +
                addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
        else:
            addr_ = (addr_[:len(data["PROVINCE_NAME_"])] + data["AREA_NAME_"] +
                     addr_[len(data["PROVINCE_NAME_"]):])
    # 非直辖市
    elif (data["AREA_NAME_"] == "城区") or (data["AREA_NAME_"] == "郊区"):
        addr_ = addr_.replace(data["AREA_NAME_"], "")
    elif (data["AREA_NAME_"][:-1]
          in addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                   len(data["AREA_NAME_"])]):
        addr_ = (addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                       len(data["AREA_NAME_"])].replace(
                           data["AREA_NAME_"][:-1], data["AREA_NAME_"]) +
                 addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
    elif (data["AREA_NAME_"][:4]
          in addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                   len(data["AREA_NAME_"])]):
        addr_ = (addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                       len(data["AREA_NAME_"])].replace(
                           data["AREA_NAME_"][:4], data["AREA_NAME_"]) +
                 addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
    elif (data["AREA_NAME_"][:3]
          in addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                   len(data["AREA_NAME_"])]):
        addr_ = (addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                       len(data["AREA_NAME_"])].replace(
                           data["AREA_NAME_"][:3], data["AREA_NAME_"]) +
                 addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
    elif (data["AREA_NAME_"][:2]
          in addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                   len(data["AREA_NAME_"])]):
        addr_ = (addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]) +
                       len(data["AREA_NAME_"])].replace(
                           data["AREA_NAME_"][:2], data["AREA_NAME_"]) +
                 addr_[len(data["PROVINCE_NAME_"]) + len(data["AREA_NAME_"]):])
    else:
        if len(data["AREA_NAME_"]) > 3:
            addr_ = (
                addr_[:len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"])] +
                data["AREA_NAME_"] +
                addr_[len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"]):])

    # 剩余数据在数据库中无区县级
    if not addr_:
        if data["PROVINCE_NAME_"] not in data[
                "ADDR_"][:len(data["PROVINCE_NAME_"])]:
            data["ADDR_"] = data["PROVINCE_NAME_"] + data["ADDR_"]
        if data["CITY_NAME_"] not in data["ADDR_"][:len(data["PROVINCE_NAME_"]
                                                        ) +
                                                   len(data["CITY_NAME_"])]:
            data["ADDR_"] = (data["ADDR_"][:len(data["PROVINCE_NAME_"])] +
                             data["CITY_NAME_"] +
                             data["ADDR_"][len(data["PROVINCE_NAME_"]):])
        addr_ = data["ADDR_"]
        # data["AREA_CODE_"] = data["CITY_CODE_"]

    if "直辖" in data["CITY_NAME_"]:
        addr_ = data["ADDR_"]

    # # 添加分行编码
    # branch_code = None
    # for i in range(1, 10000):
    #     branch_code = "ABC" + "_" + data["CITY_CODE_"] + "_" + "00000"
    #     branch_code = branch_code[:len(branch_code)-len(str(i))] + "{}".format(i)
    #     if branch_code in branch_code_list:
    #         continue
    #     else:
    #         branch_code_list.append(branch_code)
    #         break

    # re_data["_id"] = data["_id"]
    # "C"
    re_data["BANK_CODE_"] = "ABC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    # re_data["AREA_CODE_"] = data["AREA_CODE_"]
    # re_data["AREA_NAME_"] = data["AREA_NAME_"]
    # re_data["UNIT_CODE_"] = "ABC" + "_" + data["CITY_CODE_"]

    # "F"
    re_data["ADDR_"] = addr_
    # re_data["CITY_CODE_"] = data["CITY_CODE_"]
    # re_data["CITY_NAME_"] = data["CITY_NAME_"]
    # re_data["LAT_"] = data["LAT_"]
    # re_data["LNG_"] = data["LNG_"]

    re_data["NAME_"] = data["NAME_"]
    # re_data["PROVINCE_CODE_"] = data["PROVINCE_CODE_"][:4]
    # re_data["PROVINCE_NAME_"] = data["PROVINCE_NAME_"]
    # re_data["DISTRICT_CODE_"] = data["AREA_CODE_"]
    # re_data["DISTRICT_NAME_"] = data["AREA_NAME_"]
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "ABC" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["URL_"] = data["URL_"]
    re_data["TEL_"] = data.get("TEL_", "")

    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
        if re_data["SOURCE_TYPE_NAME_"] == "营业网点":
            re_data["TYPE_NAME_"] = "支行"
            re_data["TYPE_"] = "ZH"
        else:
            re_data["TYPE_NAME_"] = "自助银行"
            re_data["TYPE_"] = "ZZ"
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]

    return re_data
예제 #7
0
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned CGB (广发银行) outlet record for one crawled item.

    Province and city are first guessed from ``data["CITY_NAME_"]`` using the
    reference lists.  The address is then passed to ``get_lat_lng`` and
    ``get_area`` (both defined elsewhere in this file); when that lookup
    yields an ``adcode``, the codes and names derived from it replace the
    guess.

    Args:
        data: Crawled record.  ``CITY_NAME_``, ``ADDR_``, ``NAME_``,
            ``DATETIME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_`` and ``URL_``
            are read unconditionally; ``TEL_``, ``BUSINESS_HOURS_`` and
            ``SOURCE_TYPE_NAME_`` are copied only when present.
        province_list: Dicts providing ``NAME_`` and ``CODE_``.
        city_list: Dicts providing ``NAME_``, ``CODE_`` and ``PARENT_``
            (``PARENT_`` is stored as the province code).
        area_list: Unused; kept so the signature stays unchanged.

    Returns:
        dict: The cleaned record.

    Raises:
        KeyError: If one of the unconditionally read keys is missing.
    """
    re_data = dict()

    # Municipalities: the province entry doubles as the city.
    # An explicit flag is used instead of ``for ... else``: the only ``break``
    # belongs to the inner province loop, so an ``else`` on the outer loop
    # would run every time and let the generic city lookup below overwrite
    # the municipality match.
    municipality_found = False
    for each in ["北京市", "天津市", "上海市", "重庆市"]:
        if each not in data["CITY_NAME_"]:
            continue
        for pro in province_list:
            if pro["NAME_"] == each:
                re_data["PROVINCE_NAME_"] = pro["NAME_"]
                re_data["PROVINCE_CODE_"] = pro["CODE_"]
                re_data["CITY_NAME_"] = pro["NAME_"]
                re_data["CITY_CODE_"] = pro["CODE_"][:3] + "100"
                municipality_found = True
                break

    if not municipality_found:
        # Ordinary city: first city whose name occurs in the crawled value.
        for city in city_list:
            if city["NAME_"] in data["CITY_NAME_"]:
                re_data["CITY_NAME_"] = city["NAME_"]
                re_data["CITY_CODE_"] = city["CODE_"]
                re_data["PROVINCE_CODE_"] = city["PARENT_"]
                break
        if re_data.get("PROVINCE_CODE_"):
            for pro in province_list:
                if pro["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = pro["NAME_"]
                    break

    # "C"
    re_data["BANK_CODE_"] = "CGB"
    re_data["BANK_NAME_"] = "广发银行"
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    # Crawled addresses look like "地址:宝山区牡丹江路1211号"; drop the label.
    re_data["ADDR_"] = data["ADDR_"].replace("地址:", "")
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable coordinates: blank both and skip the area lookup.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # The adcode prefix overrides the list-based guess made above.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "CGB" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]

    if "TEL_" in data:
        # Crawled value looks like "电话:02168037370"; drop the label.
        re_data["TEL_"] = data["TEL_"].replace("电话:", "")
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]

    # Every record is classified as a self-service bank; SOURCE_TYPE_NAME_ is
    # copied through but does not influence the classification.
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"

    return re_data
예제 #8
0
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned SPDB (浦发银行) outlet record for one crawled item.

    The province and city are matched on the first two characters of their
    names, the address is rewritten so that it starts with the full province
    and city names, and the result is passed to ``get_lat_lng`` and
    ``get_area`` (both defined elsewhere in this file).  When that lookup
    yields an ``adcode``, the codes and names derived from it replace the
    list-based match.

    Args:
        data: Crawled record.  ``PROVINCE_NAME_``, ``CITY_NAME_``, ``ADDR_``,
            ``NAME_``, ``DATETIME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_`` and
            ``URL_`` are read unconditionally; ``TEL_`` and
            ``SOURCE_TYPE_NAME_`` are copied only when present.
        province_list: Dicts providing ``NAME_`` and ``CODE_``.
        city_list: Dicts providing ``NAME_`` and ``CODE_``.
        area_list: Unused; kept so the signature stays unchanged.

    Returns:
        dict: The cleaned record.

    Raises:
        KeyError: If a required key is missing from ``data``, or if no entry
            of ``province_list`` matches ``data["PROVINCE_NAME_"]``.
    """
    re_data = dict()

    # Province: first entry whose two-character prefix occurs in the crawled
    # province name.
    for prov in province_list:
        if prov["NAME_"][:2] in data["PROVINCE_NAME_"]:
            re_data["PROVINCE_NAME_"] = prov["NAME_"]
            re_data["PROVINCE_CODE_"] = prov["CODE_"]
            break

    # City: restricted to codes sharing the province's two-digit prefix.
    # CITY_NAME_ stays "" when nothing matches.
    re_data["CITY_NAME_"] = ""
    prov_prefix = re_data["PROVINCE_CODE_"][:2]  # KeyError if no province matched
    for city in city_list:
        if (city["CODE_"][:2] == prov_prefix
                and city["NAME_"][:2] in data["CITY_NAME_"]):
            re_data["CITY_NAME_"] = city["NAME_"]
            re_data["CITY_CODE_"] = city["CODE_"]
            break

    # Address cleanup
    prov_n = re_data["PROVINCE_NAME_"]
    city_n = re_data["CITY_NAME_"]
    prov_len = len(prov_n)
    addr_ = data["ADDR_"]

    # Province: if the full name is absent, expand the longest abbreviation
    # found in the leading len(prov_n) characters; otherwise prepend the name.
    if prov_n not in addr_:
        head, tail = addr_[:prov_len], addr_[prov_len:]
        for short in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if short in head:
                addr_ = head.replace(short, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    # City: same idea, looking only at the province+city sized prefix.  When
    # no abbreviation is present the city name is inserted after the province.
    city_end = prov_len + len(city_n)
    head, tail = addr_[:city_end], addr_[city_end:]
    if city_n not in head:
        for short in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if short in head:
                addr_ = head.replace(short, city_n) + tail
                break
        else:
            addr_ = addr_[:prov_len] + city_n + addr_[prov_len:]

    re_data["BANK_CODE_"] = "SPDB"
    re_data["BANK_NAME_"] = "浦发银行"
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = addr_
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable coordinates: blank both and skip the area lookup.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # The adcode prefix overrides the list-based match made above.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # When the crawler reported the same value for province and city, the
    # city name mirrors the resolved province name.
    if data["PROVINCE_NAME_"] == data["CITY_NAME_"]:
        re_data["CITY_NAME_"] = re_data["PROVINCE_NAME_"]

    re_data["UNIT_CODE_"] = "SPDB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]

    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    # Every record is classified as a sub-branch.
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"

    return re_data
예제 #9
0
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned BCM outlet record for one crawled item.

    The city (or, failing that, a province with the same name) is looked up
    by exact name, the address is rewritten so that it starts with the full
    province and city names, and the result is passed to ``get_lat_lng`` and
    ``get_area`` (both defined elsewhere in this file).  Area, city and
    province fields of the output come only from that lookup.

    Args:
        data: Crawled record.  ``CITY_NAME_``, ``ADDR_``, ``NAME_``,
            ``DATETIME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_`` and ``URL_``
            are read unconditionally; ``TEL_``, ``BUSINESS_HOURS_`` and
            ``SOURCE_TYPE_NAME_`` are copied only when present.
        province_list: Dicts providing ``NAME_`` and ``CODE_``.
        city_list: Dicts providing ``NAME_``, ``CODE_`` and ``PARENT_``
            (``PARENT_`` is compared against province codes).
        area_list: Unused; kept so the signature stays unchanged.

    Returns:
        dict: The cleaned record.

    Raises:
        KeyError: If one of the unconditionally read keys is missing.
    """
    re_data = dict()
    prov_c = ""
    prov_n = ""
    city_c = ""
    city_n = ""

    # Exact city-name match; no early exit, so the last matching entry wins.
    for city in city_list:
        if city["NAME_"] == data["CITY_NAME_"]:
            city_n = city["NAME_"]
            city_c = city["CODE_"]
            prov_c = city["PARENT_"]

    for prov in province_list:
        if prov["CODE_"] == prov_c:
            prov_n = prov["NAME_"]

    # No city code found: the crawled "city" may itself be a province-level
    # name, in which case it serves as both province and city.
    if not city_c:
        for prov in province_list:
            if prov["NAME_"] == data["CITY_NAME_"]:
                prov_n = prov["NAME_"]
                city_n = prov["NAME_"]

    # Address cleanup
    prov_len = len(prov_n)
    addr_ = data["ADDR_"]

    # Province: if the full name is absent, expand the longest abbreviation
    # found in the leading len(prov_n) characters; otherwise prepend the name.
    if prov_n not in addr_:
        head, tail = addr_[:prov_len], addr_[prov_len:]
        for short in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if short in head:
                addr_ = head.replace(short, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    # City: same idea, looking only at the province+city sized prefix.  When
    # no abbreviation is present the city name is inserted after the province.
    city_end = prov_len + len(city_n)
    head, tail = addr_[:city_end], addr_[city_end:]
    if city_n not in head:
        for short in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if short in head:
                addr_ = head.replace(short, city_n) + tail
                break
        else:
            addr_ = addr_[:prov_len] + city_n + addr_[prov_len:]

    # "C"
    re_data["BANK_CODE_"] = "BCM"
    # Bank name is the entity name without its last three characters.
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = addr_
    re_data["NAME_"] = data["NAME_"]
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable coordinates: blank both and skip the area lookup.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"]["district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"]["adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City and province codes are derived from the adcode prefix.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "BCM" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    # Every record is classified as a sub-branch.
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"

    return re_data
예제 #10
0
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned NRCB outlet record for one crawled item.

    The province is fixed to the entry matching "广东" and the city is matched
    on the first two characters of its name.  The crawled address is passed
    unchanged to ``get_lat_lng`` and ``get_area`` (both defined elsewhere in
    this file); when that lookup yields an ``adcode``, the codes and names
    derived from it replace the list-based match.

    ``data`` is only read; the hard-coded province is kept in a local
    variable rather than written into the caller's dict.

    Args:
        data: Crawled record.  ``CITY_NAME_`` (when a candidate city shares
            the province code prefix), ``ADDR_``, ``NAME_``, ``DATETIME_``,
            ``ENTITY_CODE_``, ``ENTITY_NAME_`` and ``URL_`` are read; ``TEL_``
            and ``SOURCE_TYPE_NAME_`` are copied only when present.
        province_list: Dicts providing ``NAME_`` and ``CODE_``.
        city_list: Dicts providing ``NAME_`` and ``CODE_``.
        area_list: Unused; kept so the signature stays unchanged.

    Returns:
        dict: The cleaned record.

    Raises:
        KeyError: If a required key is missing from ``data``, or if
            ``city_list`` is non-empty and no province entry matched.
    """
    re_data = dict()

    # Province: all records of this source are treated as 广东.
    province_hint = '广东'
    for prov in province_list:
        if prov["NAME_"][:2] in province_hint:
            re_data["PROVINCE_NAME_"] = prov["NAME_"]
            re_data["PROVINCE_CODE_"] = prov["CODE_"]
            break

    # City: restricted to codes sharing the province's two-digit prefix.
    # CITY_NAME_ stays "" when nothing matches.  PROVINCE_CODE_ is read
    # inside the loop so an empty city_list never touches it.
    re_data["CITY_NAME_"] = ''
    for city in city_list:
        if (city["CODE_"][:2] == re_data["PROVINCE_CODE_"][:2]
                and city["NAME_"][:2] in data["CITY_NAME_"]):
            re_data["CITY_NAME_"] = city["NAME_"]
            re_data["CITY_CODE_"] = city["CODE_"]
            break

    # "C"
    re_data["BANK_CODE_"] = "NRCB"
    # Bank name is the entity name without its last three characters.
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = data["ADDR_"]
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        # No usable coordinates: blank both and skip the area lookup.
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"]["district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"]["adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # The adcode prefix overrides the list-based match made above.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "NRCB" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    # Business hours are a fixed value, not taken from the crawled record.
    re_data["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    # Every record is classified as a sub-branch.
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
예제 #11
0
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned CEB branch record from one crawled document.

    The branch is geocoded by its name through ``get_lat_lng``. When a
    location comes back, it is reverse-geocoded through ``get_area`` to
    obtain the district name and ``adcode``; city and province codes are
    derived from the ``adcode`` prefix and their names are looked up in
    ``city_list`` / ``province_list``.

    :param data: crawled document; reads ``ENTITY_NAME_``, ``DATETIME_``,
        ``NAME_``, ``ENTITY_CODE_``, ``DEALTIME_``, ``URL_`` and ``TEL_``.
    :param province_list: iterable of dicts with ``CODE_`` and ``NAME_``.
    :param city_list: iterable of dicts with ``CODE_`` and ``NAME_``.
    :param area_list: not used by this function.
    :return: dict with the cleaned fields.
    """
    # NOTE(review): the rule-based province/city/address cleaning that used
    # to fill ADDR_ is disabled, so ADDR_ is always written as an empty
    # string here -- confirm this is intended.
    record = {
        # "C"
        "BANK_CODE_": "CEB",
        "BANK_NAME_": data["ENTITY_NAME_"][:-2],
        "CREATE_TIME_": data["DATETIME_"],
        # "F"
        "ADDR_": "",
        "NAME_": data["NAME_"],
    }

    # Forward geocoding, keyed on the branch name.
    geo = get_lat_lng(address=record["NAME_"])
    try:
        record["LAT_"] = str(geo["result"]["location"]["lat"])
        record["LNG_"] = str(geo["result"]["location"]["lng"])
        located = True
    except KeyError:
        record["LAT_"] = ""
        record["LNG_"] = ""
        located = False

    if located:
        # Reverse geocoding: "lat,lng" -> administrative components.
        area_info = get_area(",".join((record["LAT_"], record["LNG_"])))
        try:
            district = area_info["result"]["addressComponent"]["district"]
        except KeyError:
            district = ""
        record["AREA_NAME_"] = district

        try:
            adcode = area_info["result"]["addressComponent"]["adcode"]
        except KeyError:
            record["AREA_CODE_"] = ""
        else:
            record["AREA_CODE_"] = adcode
            # City / province codes are the adcode prefix padded with "00".
            city_code = adcode[:4] + "00"
            province_code = adcode[:2] + "00"
            record["CITY_CODE_"] = city_code
            record["PROVINCE_CODE_"] = province_code

            city_hit = next(
                (c for c in city_list if c["CODE_"] == city_code), None)
            if city_hit is not None:
                record["CITY_NAME_"] = city_hit["NAME_"]

            prov_hit = next(
                (p for p in province_list if p["CODE_"] == province_code),
                None)
            if prov_hit is not None:
                record["PROVINCE_NAME_"] = prov_hit["NAME_"]

    # Falls back to "CEB_" when no city code could be derived.
    record["UNIT_CODE_"] = "CEB" + "_" + record.get("CITY_CODE_", "")

    record["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    record["DEALTIME_"] = data["DEALTIME_"]
    record["URL_"] = data["URL_"]
    record["TEL_"] = data["TEL_"]
    record["BUSINESS_HOURS_"] = ""

    return record
예제 #12
0
    def generic_shuffle(self, data):
        """
        Clean one crawled hospital document into a single table row.

        Coordinates are looked up from ``ADDR_`` first and, if that lookup
        hits a ``KeyError``, from ``CITY_NAME_`` + ``NAME_``. When a
        longitude is available the point is reverse-geocoded to fill the
        province / city / area fields. Optional source fields are copied
        over only when present.

        :param data: crawled document (dict)
        :return: one-element list ``[{"TABLE_NAME_": ..., "DATA_": row}]``
        """
        row = {"ID_": req_for_serial_number(code="WD_SS_YY")}
        # Time dimension: date part of DATETIME_ without the dashes.
        row["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags: written as an empty string whenever the source has the key.
        if "TAGS_" in data:
            row["TAGS_"] = ""
        # Source: scheme and host taken from the crawled URL.
        row["SOURCE_"] = re.findall(r"(https?://.*?)/", data["URL_"])[0]
        # Source name: part of the entity name before the first "-".
        row["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source classification: fixed slice of the entity code.
        row["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # Latitude / longitude: address first, then city name + name.
        try:
            spot = get_lat_lng(address=data["ADDR_"])["result"]["location"]
            row["LAT_"] = spot["lat"]
            row["LNG_"] = spot["lng"]
        except KeyError:
            try:
                fallback_address = data["CITY_NAME_"] + data["NAME_"]
                spot = get_lat_lng(
                    address=fallback_address)["result"]["location"]
                row["LAT_"] = spot["lat"]
                row["LNG_"] = spot["lng"]
            except KeyError:
                row["LAT_"] = row["LNG_"] = None
            except Exception as err:
                row["LAT_"] = row["LNG_"] = None
                self.logger.info("获取经纬度失败错误为{}".format(err))
        except Exception as err:
            row["LAT_"] = row["LNG_"] = None
            self.logger.info("获取经纬度失败错误为{}".format(err))

        # Province / city / area, only when a longitude was found.
        if row["LNG_"]:
            try:
                area_result = get_area(
                    ",".join(map(str, (row["LAT_"], row["LNG_"]))))
            except Exception as err:
                self.logger.info("获取地址信息失败错误为{}".format(err))
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    row["PROVINCE_NAME_"] = component["province"]
                    row["CITY_NAME_"] = component["city"]
                    row["AREA_NAME_"] = component["district"]
                    row["AREA_CODE_"] = component["adcode"]
                    row["CITY_CODE_"] = row["AREA_CODE_"][:4] + "00"
                    row["PROVINCE_CODE_"] = row["AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Fields assigned before the missing key are kept.
                    pass

        # Optional fields copied verbatim when the source provides them:
        # device, hospital grade, speciality, phone, hospital id, name,
        # address, beds, hospital type, website, outpatient volume.
        optional_keys = (
            "DEVICE_", "GRADE_", "SPECIAL_", "TEL_", "HOSPITAL_ID_",
            "NAME_", "ADDR_", "BEDS_", "TYPE_", "WEBSITE_", "VOLNUM_",
        )
        for key in optional_keys:
            if key in data:
                row[key] = data[key]

        row = super(Branchssyy, self).generic_shuffle(
            data=data, re_data=row, field=None)
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": row}]
예제 #13
0
def _fill_region_prefix(addr_, prov_n, city_n):
    """Return ``addr_`` with the full province and city names at its start.

    An abbreviated province/city name found in the leading part of the
    address is expanded to the full name; a name that is missing altogether
    is inserted (province at the front, city right after the province).
    """
    prov_len = len(prov_n)
    if prov_n not in addr_:
        head, tail = addr_[:prov_len], addr_[prov_len:]
        for short in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if short in head:
                addr_ = head.replace(short, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    span = prov_len + len(city_n)
    head, tail = addr_[:span], addr_[span:]
    if city_n not in head:
        for short in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if short in head:
                addr_ = head.replace(short, city_n) + tail
                break
        else:
            addr_ = addr_[:prov_len] + city_n + addr_[prov_len:]
    return addr_


def data_shuffle(data, province_list, city_list, area_list):
    """Build cleaned CZB branch records from one crawled document.

    The province and city are resolved from ``data["CITY_NAME_"]``, the
    branch list is downloaded from ``data["URL_"]`` and split into
    "|"-separated records of ","-separated fields. Each record's address is
    completed with the province/city names and geocoded through
    ``get_lat_lng`` / ``get_area``.

    :param data: crawled document; reads ``CITY_NAME_``, ``URL_``,
        ``ENTITY_NAME_``, ``DATETIME_``, ``ENTITY_CODE_`` and, when present,
        ``SOURCE_TYPE_NAME_``.
    :param province_list: list of dicts with ``CODE_`` and ``NAME_``.
    :param city_list: list of dicts with ``CODE_``, ``NAME_`` and
        ``PARENT_``. Entries named "县" are removed from this list in place.
    :param area_list: not used by this function.
    :return: list of dicts, one per branch record.
    """
    data_list = list()
    # Remove the placeholder "县" entries. The list is rebuilt in one step
    # because calling remove() while iterating the same list skips the
    # element that follows each removed one.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    prov_c = ""
    prov_n = ""
    city_c = ""
    city_n = ""

    # Province / city level. The four municipalities are matched by
    # keyword; everything else is looked up in city_list.
    municipalities = (
        ("北京", "北京市", "1100", "110100"),
        ("天津", "天津市", "1200", "120100"),
        ("上海", "上海市", "3100", "310100"),
        ("重庆", "重庆市", "5000", "500100"),
    )
    for keyword, full_name, p_code, c_code in municipalities:
        if keyword in data["CITY_NAME_"]:
            prov_n = full_name
            prov_c = p_code
            city_n = full_name
            city_c = c_code
            break
    else:
        for city in city_list:
            if city["NAME_"][:-1] in data["CITY_NAME_"]:
                city_n = city["NAME_"]
                city_c = city["CODE_"]
                prov_c = city["PARENT_"]
                break
        if prov_c:
            for prov in province_list:
                if prov["CODE_"] == prov_c:
                    prov_n = prov["NAME_"]
                    break

    response = req_for_something(data["URL_"])

    # Keep only word characters plus the "|" and "," separators.
    payload = re.sub(r"[^\w|,]+", "", response.content.decode("utf-8"))

    for each in payload.split("|"):
        re_data = dict()
        message = each.split(",")
        # Fields 2..6 are read below; skip fragments that are too short
        # instead of failing with IndexError on a truncated record.
        if len(message) < 7:
            continue
        name = message[2]
        addr_ = message[3]
        tel = message[4]
        business_time = message[5] + message[6]

        # Address cleaning
        addr_ = _fill_region_prefix(addr_, prov_n, city_n)

        # "C"
        re_data["BANK_CODE_"] = "CZB"
        re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
        re_data["SPIDER_TIME_"] = data["DATETIME_"]

        # "F"
        re_data["ADDR_"] = addr_
        re_data["PROVINCE_NAME_"] = prov_n
        re_data["PROVINCE_CODE_"] = prov_c
        re_data["CITY_CODE_"] = city_c
        re_data["CITY_NAME_"] = city_n
        re_data["NAME_"] = name

        # Geocode the completed address; a successful reverse lookup
        # overrides the province / city values resolved above.
        result = get_lat_lng(address=re_data["ADDR_"])
        try:
            re_data["LAT_"] = str(result["result"]["location"]["lat"])
            re_data["LNG_"] = str(result["result"]["location"]["lng"])
        except KeyError:
            re_data["LAT_"] = ""
            re_data["LNG_"] = ""
        else:
            dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
            try:
                re_data["AREA_NAME_"] = dis_result["result"][
                    "addressComponent"]["district"]
            except KeyError:
                re_data["AREA_NAME_"] = ""
            try:
                re_data["AREA_CODE_"] = dis_result["result"][
                    "addressComponent"]["adcode"]
            except KeyError:
                re_data["AREA_CODE_"] = ""
            else:
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                for city in city_list:
                    if city["CODE_"] == re_data["CITY_CODE_"]:
                        re_data["CITY_NAME_"] = city["NAME_"]
                        break
                for prov in province_list:
                    if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break
        re_data["UNIT_CODE_"] = "CZB" + "_" + re_data.get("CITY_CODE_", "")

        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]
        re_data["TEL_"] = tel
        re_data["BUSINESS_HOURS_"] = business_time
        if "SOURCE_TYPE_NAME_" in data:
            re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
        re_data["TYPE_NAME_"] = "支行"
        re_data["TYPE_"] = "ZH"

        data_list.append(re_data)

    return data_list
예제 #14
0
    def main(self):
        """
        Clean the selected MongoDB documents and upsert them into HBase.

        For every document: ``PHONE_`` and ``UPDATETIME_`` are normalised,
        ``ADDRESS_`` is geocoded through ``get_lat_lng``, the document is
        passed through ``self.__check_lat`` and the result is upserted via
        Phoenix. A document that fails a step is logged and skipped; a
        document with an empty ``ADDRESS_`` is skipped without logging.
        The cursor is closed even when iteration aborts unexpectedly.
        """
        # Table creation (kept for reference, not executed):
        # table_sql = (f'create table "{self.p_client.table_name}" ("ID_" varchar primary key,'
        #              '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,'
        #              '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,'
        #              '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar'
        #              ') IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # Fetch the documents to process.
        # mongo_data_list = self.m_client.all_from_mongodb(collection=self.collection)
        mongo_data_list = self.m_client.search_from_mongodb(
            collection=self.collection,
            field_name="DEALTIME_",
            field_value={"$gt": "1555136656.0579224"},
            data_id="5cb65fac9bb3df61a09c6625")

        count = 0
        try:
            while True:
                # Take one document.
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError:
                    # Retry once after a short pause. The retry can also hit
                    # the end of the cursor, which must end the loop rather
                    # than escape as StopIteration.
                    time.sleep(3)
                    try:
                        data = next(mongo_data_list)
                    except StopIteration:
                        break

                # Cleaning
                try:
                    data["PHONE_"] = data["PHONE_"].replace("无,", "")
                    u_time_list = re.findall(r"(\d{4}年\d{1,2}月\d{1,2})日",
                                             data["UPDATETIME_"])
                    if u_time_list:
                        # "2019年4月3" -> "2019-04-03"
                        parts = u_time_list[0].replace("年", "-").replace(
                            "月", "-").split("-")
                        parts[1] = parts[1].zfill(2)
                        parts[2] = parts[2].zfill(2)
                        data["UPDATETIME_"] = "-".join(parts)
                except Exception as e:
                    self.logger.exception(
                        f"数据清洗出错, _id: {data['_id']}, error {e}")
                    continue

                # Latitude / longitude
                try:
                    if not data["ADDRESS_"]:
                        continue
                    # Drop the first "|"-separated segment and join the rest.
                    data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
                    location_result = get_lat_lng(address=data["ADDRESS_"])
                    if location_result["status"] == 0:
                        data["LNG_"] = str(
                            location_result["result"]["location"]["lng"])
                        data["LAT_"] = str(
                            location_result["result"]["location"]["lat"])
                    else:
                        self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
                except Exception as e:
                    self.logger.exception(
                        f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                    continue

                # Upsert to HBase
                try:
                    re_data = self.__check_lat(data=data)
                    # Insert one row into HBase.
                    self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                    count += 1
                    if count % 100 == 0:
                        self.logger.info(
                            f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
                except Exception as e:
                    self.logger.exception(
                        f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                    continue
        finally:
            # Close the MongoDB cursor on every exit path.
            mongo_data_list.close()

        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )
예제 #15
0
파일: __init__.py 프로젝트: ILKKAI/dataETL
    def generic_shuffle(self, data):
        """
        Clean one crawled Mapbar POI document.

        The address is geocoded through ``get_lat_lng`` and reverse-geocoded
        through ``get_area`` to fill the area / city / province fields; when
        no city name results, the city is guessed from ``data["TYPE_"]``.
        A facility row is always produced; documents whose ``TYPE_``
        contains "道路" additionally produce a main-route row.

        :param data: crawled document; reads ``NAME_``, ``ADDRESS_``,
            ``URL_``, ``TYPE_``, ``BTYPE_``, ``PHONE_``, ``BUS_``,
            ``BUSSTOP_`` and ``_id`` (for log messages).
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts.
        :raises Exception: when ``BTYPE_`` is not a key of
            ``self.type1_dict``.
        """
        re_data = dict()
        serial_number = req_for_serial_number(code="MAPBAR")
        re_data["ID_"] = serial_number
        re_data["NAME_"] = data["NAME_"]
        re_data["ADDRESS_"] = data["ADDRESS_"].replace("|", "")
        re_data["ADDRESS_"] = re_data["ADDRESS_"].replace("地址:", "")
        re_data["HOT_"] = 0
        # Source URL: scheme and host of the crawled page.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        # Source site name
        re_data["SOURCE_NAME_"] = "图吧"
        re_data["SOURCE_TYPE_"] = "图吧"

        # Latitude / longitude. Both keys are initialised up front so that
        # the check below cannot hit a missing key when the geocoder call
        # itself raises.
        re_data["LNG_"] = ""
        re_data["LAT_"] = ""
        try:
            if re_data["ADDRESS_"]:
                location_result = get_lat_lng(address=re_data["ADDRESS_"])
                if location_result["status"] == 0:
                    re_data["LNG_"] = str(
                        location_result["result"]["location"]["lng"])
                    re_data["LAT_"] = str(
                        location_result["result"]["location"]["lat"])
                else:
                    self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
        except Exception as e:
            # Do not keep a half-populated coordinate pair.
            re_data["LNG_"] = ""
            re_data["LAT_"] = ""
            self.logger.exception(f"_id: {data['_id']} 获取经纬度失败, error: {e}")

        if re_data["LAT_"]:
            try:
                area_result = get_area(",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.exception(f"_id: {data['_id']} 获取地址失败, error: {e}")
            else:
                try:
                    re_data["AREA_NAME_"] = area_result["result"][
                        "addressComponent"]["district"]
                except KeyError:
                    re_data["AREA_NAME_"] = ""
                try:
                    re_data["AREA_CODE_"] = area_result["result"][
                        "addressComponent"]["adcode"]
                except KeyError:
                    re_data["AREA_CODE_"] = ""
                else:
                    # City / province codes are the adcode prefix + "00".
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data[
                        "PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                    for city in self.city_list:
                        if city["CODE_"] == re_data["CITY_CODE_"]:
                            re_data["CITY_NAME_"] = city["NAME_"]
                            break
                    for prov in self.province_list:
                        if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                            re_data["PROVINCE_NAME_"] = prov["NAME_"]
                            break

        # Fallback: match the first two characters of a city name against
        # TYPE_ when geocoding produced no city name.
        if not re_data.get("CITY_NAME_", ""):
            for city in self.city_list:
                if city["NAME_"][:2] in data["TYPE_"]:
                    re_data["CITY_CODE_"] = city["CODE_"]
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            if re_data.get("CITY_NAME_", ""):
                for prov in self.province_list:
                    if prov["CODE_"][:2] == re_data["CITY_CODE_"][:2]:
                        re_data["PROVINCE_CODE_"] = prov["CODE_"]
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break

        # CHA_BRANCH_MAIN_ROUTE (main roads). None means "no road row".
        road_shuffle_data = None
        if "道路" in data["TYPE_"]:
            road_data = dict()
            road_data.update(re_data)
            road_data["ID_"] = req_for_serial_number(code="WD_GD")
            road_data["ADDR_"] = road_data["ADDRESS_"]
            del road_data["ADDRESS_"]
            road_shuffle_data = super(MapbarScript,
                                      self).generic_shuffle(data=data,
                                                            re_data=road_data,
                                                            field=None)

        # CHA_BRANCH_FACILITY (Mapbar facility row)
        re_data["TYPE1_"] = data["BTYPE_"]
        try:
            re_data["TYPE1_CODE_"] = self.type1_dict[re_data["TYPE1_"]]
        except KeyError:
            raise Exception("暂不需要清洗的数据")
        # Sub-category cleaning: some source categories are merged.
        sub_type = data["TYPE_"][2:]
        if sub_type in ["户外运动俱乐部", "赛马场及马术俱乐部", "室内运动健身俱乐部"]:
            re_data["TYPE2_"] = "俱乐部"
            re_data["TYPE2_CODE_"] = "JLB"
        elif sub_type in ["连锁店", "便利店"]:
            re_data["TYPE2_"] = "便利店"
            re_data["TYPE2_CODE_"] = "BLD"
        elif sub_type in ["电子商城", "电器商城"]:
            re_data["TYPE2_"] = "家电数码"
            re_data["TYPE2_CODE_"] = "JDSM"
        elif sub_type in ["诊所/卫生所", "门诊/急诊部"]:
            re_data["TYPE2_"] = "门诊/卫生所"
            re_data["TYPE2_CODE_"] = "MZWSS"
        else:
            re_data["TYPE2_"] = sub_type
            re_data["TYPE2_CODE_"] = self.type2_dict.get(re_data["TYPE2_"])
        re_data["SOURCE_TYPE1_"] = data["BTYPE_"]
        re_data["SOURCE_TYPE1_CODE_"] = self.type1_dict.get(
            re_data["SOURCE_TYPE1_"])
        re_data["SOURCE_TYPE2_"] = sub_type
        re_data["SOURCE_TYPE2_CODE_"] = self.source_type2_dict.get(
            re_data["SOURCE_TYPE2_"])
        re_data["PHONE_"] = data["PHONE_"].replace("无,", "")
        re_data["BUS_"] = data["BUS_"]
        re_data["BUSSTOP_"] = data["BUSSTOP_"]

        shuffle_data = super(MapbarScript,
                             self).generic_shuffle(data=data,
                                                   re_data=re_data,
                                                   field=None)

        return_list = list()
        return_list.append({
            "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FACILITY"),
            "DATA_": shuffle_data
        })
        # The road row is appended whenever the road branch above ran.
        if "道路" in data["TYPE_"]:
            return_list.append({
                "TABLE_NAME_":
                TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"),
                "DATA_":
                road_shuffle_data
            })
        return return_list
예제 #16
0
    def generic_shuffle(self, data):
        """
        Expand one crawled page into one row per business district.

        ``data["CONTENT_HTML_"]`` is parsed for ``<dl class="list">``
        blocks; each block's ``<dt>`` link text is the area name and its
        ``<li>`` link texts are the business-district names. Every district
        is geocoded and, when a latitude is found, reverse-geocoded to fill
        the address and province / city / area fields.

        :param data: crawled document (dict)
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": row}`` dicts
        """
        rows = list()

        # Map "area name" -> list of business-district names.
        soup = BeautifulSoup(data["CONTENT_HTML_"], "html.parser")
        districts_by_area = dict()
        for block in soup.find_all('dl', {"class": "list"}):
            heading = block.dt.a.string
            districts_by_area[heading] = [
                entry.a.string for entry in block.find_all('li')
            ]

        for area_name, shopping_list in districts_by_area.items():
            for shopping_name in shopping_list:
                row = dict()

                # Time dimension: date part of DATETIME_ without dashes.
                row["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
                # Tags: empty string whenever the source has the key.
                if "TAGS_" in data:
                    row["TAGS_"] = ""
                # Source: scheme and host of the crawled URL.
                row["SOURCE_"] = re.findall(r"(https?://.*?)/",
                                            data["URL_"])[0]
                # Source name: entity name up to the first "-".
                row["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
                # Source classification: fixed slice of the entity code.
                row["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]
                # ID
                row["ID_"] = req_for_serial_number(code="WD_SS_SQ")

                # Latitude / longitude of "<city>市<area><district>".
                try:
                    query = data["CITY_"] + "市" + area_name + shopping_name
                    spot = get_lat_lng(address=query)["result"]["location"]
                    row["LAT_"] = spot["lat"]
                    row["LNG_"] = spot["lng"]
                except KeyError:
                    row["LAT_"] = row["LNG_"] = None
                except Exception as e:
                    row["LAT_"] = row["LNG_"] = None
                    self.logger.info("获取经纬度失败错误信息为{}".format(e))

                if row["LAT_"]:
                    try:
                        area_result = get_area(",".join(
                            map(str, (row["LAT_"], row["LNG_"]))))
                    except Exception as e:
                        self.logger.info(f"获取地址失败, ERROR: {e}")
                    else:
                        try:
                            geo = area_result["result"]
                            row["ADDR_"] = geo["formatted_address"]
                            component = geo["addressComponent"]
                            row["PROVINCE_NAME_"] = component["province"]
                            row["CITY_NAME_"] = component["city"]
                            row["AREA_NAME_"] = component["district"]
                            row["AREA_CODE_"] = component["adcode"]
                            row["CITY_CODE_"] = row["AREA_CODE_"][:4] + "00"
                            row["PROVINCE_CODE_"] = (
                                row["AREA_CODE_"][:2] + "00")
                        except KeyError:
                            # Incomplete reverse-geocoding answer: fall back
                            # to the district name and the crawled city.
                            row.update({
                                "ADDR_": shopping_name,
                                "PROVINCE_NAME_": None,
                                "CITY_NAME_": data["CITY_"] + "市",
                                "AREA_NAME_": None,
                                "AREA_CODE_": None,
                                "CITY_CODE_": None,
                                "PROVINCE_CODE_": None,
                            })

                row["NAME_"] = shopping_name
                row = super(Branchsssq, self).generic_shuffle(
                    data=data, re_data=row, field=None)
                rows.append({
                    "TABLE_NAME_": self.p_client.table_name,
                    "DATA_": row,
                })
        return rows
예제 #17
0
파일: __init__.py 프로젝트: ILKKAI/dataETL
    def generic_shuffle(self, data):
        """
        Clean one raw record and wrap it as a row for the target table.

        A fresh ``re_data`` dict is filled with the serial ID, the period
        code, the source fields, the geocoded position (plus province / city /
        area when the reverse lookup succeeds), the optional school
        attributes, the telephone numbers and the address.  It is then passed
        through the parent class's ``generic_shuffle``.

        :param data: raw record. ``DATETIME_``, ``URL_``, ``ENTITY_NAME_`` and
            ``ENTITY_CODE_`` are read unconditionally; the other keys are
            copied only when present.
        :return re_data: one-element list of
            ``{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}``.
        """

        re_data = dict()
        serial_number = req_for_serial_number(code="WD_SS_XX")
        re_data["ID_"] = serial_number
        # Time dimension: first ten characters of DATETIME_ without dashes.
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags
        if "TAGS_" in data:
            re_data["TAGS_"] = ""
        # SOURCE: scheme and host part of the URL (up to the first "/" after
        # the host).  Raises IndexError when the URL has no such slash.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        # Name of the data source
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source category
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # Latitude / longitude, then province / city / area completion.
        try:
            lat_result = get_lat_lng(address=data["ADDR_"])
            re_data["LAT_"] = lat_result["result"]["location"]["lat"]
            re_data["LNG_"] = lat_result["result"]["location"]["lng"]
        except KeyError:
            re_data["LAT_"] = None
            re_data["LNG_"] = None
        except Exception as e:
            # Reset the coordinates here as well: without this the lookup of
            # re_data["LAT_"] just below raised KeyError whenever the
            # geocoding call failed with anything other than a KeyError.
            re_data["LAT_"] = None
            re_data["LNG_"] = None
            self.logger.info("获取经纬度失败信息为{}".format(e))
        if re_data["LAT_"]:
            try:
                area_result = get_area(",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.info(f"获取地址失败, ERROR: {e}")
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    re_data["PROVINCE_NAME_"] = component["province"]
                    re_data["CITY_NAME_"] = component["city"]
                    re_data["AREA_NAME_"] = component["district"]
                    re_data["AREA_CODE_"] = component["adcode"]
                    # City / province codes are derived from the area code.
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data[
                        "PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Best effort: keep whatever fields were filled so far.
                    pass

        # School name
        if "NAME_" in data:
            re_data["NAME_"] = data["NAME_"]
        # Level (city key / district key / national key school)
        if "LEVEL_" in data:
            re_data["LEVEL_"] = data["LEVEL_"]
        # Image: downloaded and stored base64-encoded.
        if data.get("IMAGES_"):
            response = req_for_something(url=data["IMAGES_"])
            if response:
                t = base64.b64encode(response.content)
                re_data["IMAGES_"] = t.decode("utf-8")
        # School type
        if "SCHOOL_TYPE_" in data:
            re_data["SCHOOL_TYPE_"] = data["SCHOOL_TYPE_"]
        # School nature
        if "SCHOOL_NATURE_" in data:
            re_data["SCHOOL_NATURE_"] = data["SCHOOL_NATURE_"]
        # Telephone: split numbers that were glued together with two spaces.
        if "TEL_" in data:
            two_parts = r"\1  \2"
            three_parts = r"\1  \2  \3"
            # re.match only anchors at the start, so a shorter pattern also
            # matches every string a longer pattern with the same prefix
            # matches.  Patterns sharing the "area-8digits" prefix are
            # therefore tried longest first; in the previous order the
            # "area-8digits + 8 digits" rule shadowed the 11-digit, 8+8 and
            # 11+11 rules, which could never be reached.
            tel_rules = (
                (r"(\d{3,4}-\d{8})(\d{3,4}-\d{8})", two_parts),
                (r"(\d{3,4}-\d{8})(\d{11})(\d{11})", three_parts),
                (r"(\d{3,4}-\d{8})(\d{8})(\d{8})", three_parts),
                (r"(\d{3,4}-\d{8})(\d{11})", two_parts),
                (r"(\d{3,4}-\d{8})(\d{8})", two_parts),
                (r"(\d{8})(\d{11})", two_parts),
                (r"(\d{8})(\d{8})", two_parts),
                (r"(\d{3,4}-\d{7})(\d{3,4}-\d{7})", two_parts),
                (r"(\d{3,4}-\d{7})(\d{7})", two_parts),
            )
            # Unrecognised formats are stored unchanged.
            phone_number = data["TEL_"]
            for pattern, replacement in tel_rules:
                if re.match(pattern, data["TEL_"]):
                    phone_number = re.sub(pattern, replacement, data["TEL_"])
                    break
            re_data["TEL_"] = phone_number
        # Address
        if "ADDR_" in data:
            re_data["ADDR_"] = data["ADDR_"]
        re_data = super(Branchssxx, self).generic_shuffle(data=data,
                                                          re_data=re_data,
                                                          field=None)
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
예제 #18
0
def data_shuffle(data, province_list, city_list, area_list):
    """
    Clean one raw branch record for the bank coded "BOC".

    Steps:
      1. resolve province / city / area from ``data["AREA_NAME_"]`` (and
         ``data["ADDR_"]``) against the lookup lists;
      2. rewrite the address so that it carries the full province and city
         names;
      3. geocode the address with ``get_lat_lng`` and reverse-geocode it with
         ``get_area``; codes from that lookup are used first and the values
         resolved in step 1 fill whatever is still missing.

    Side effects: ``data["ADDR_"]`` / ``data["AREA_NAME_"]`` may be rewritten
    and entries named "县" are removed from ``city_list`` in place.

    :param data: raw record; reads ``ADDR_``, ``AREA_NAME_``, ``NAME_``,
        ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``DATETIME_``, ``URL_`` and the
        optional ``TEL_``, ``BUSINESS_HOURS_``, ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``NAME_`` and ``CODE_``.
    :param city_list: dicts with ``NAME_`` and ``CODE_``.
    :param area_list: dicts with ``NAME_``, ``CODE_`` and ``PARENT_``.
    :return: the cleaned record as a dict.
    """
    # Drop the "县" entries.  The list is rebuilt in place (callers still see
    # the cleaned list): calling remove() while iterating the same list skips
    # the element after every removal, so adjacent "县" entries used to
    # survive.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    re_data = dict()
    prov_n = ""
    prov_c = ""
    city_n = ""
    city_c = ""
    # NOTE(review): area_n / area_c are resolved below but are not written to
    # the result (AREA_* comes from the reverse geocoding only) - confirm
    # whether that is intended.
    area_n = ""
    area_c = ""

    # Unify the naming of Inner Mongolia, Guangxi, Xinjiang, Ningxia, Tibet.
    if ("内蒙古" in data["ADDR_"][:3] or "广西" in data["ADDR_"][:2]
            or "新疆" in data["ADDR_"][:2] or "宁夏" in data["ADDR_"][:2]
            or "西藏" in data["ADDR_"][:2]):
        if "自治区" not in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("内蒙古", "内蒙古自治区")
            data["ADDR_"] = data["ADDR_"].replace("广西", "广西壮族自治区")
            data["ADDR_"] = data["ADDR_"].replace("新疆", "新疆维吾尔自治区")
            data["ADDR_"] = data["ADDR_"].replace("宁夏", "宁夏回族自治区")
            data["ADDR_"] = data["ADDR_"].replace("西藏", "西藏自治区")

    elif "京山县" in data["AREA_NAME_"]:
        data["AREA_NAME_"] = data["AREA_NAME_"].replace("荆州", "荆门")

    # City: first city whose name occurs in AREA_NAME_.
    for city in city_list:
        if city["NAME_"] in data["AREA_NAME_"]:
            city_n = city["NAME_"]
            city_c = city["CODE_"]
            prov_c = city["CODE_"][:2] + "00"
            break
    # Area: restricted to the resolved city when there is one; otherwise any
    # area name found in AREA_NAME_ also yields the city / province codes.
    for area in area_list:
        if city_c:
            if area["PARENT_"] == city_c:
                if area["NAME_"] in data["AREA_NAME_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]
                    break
        else:
            # Two-character names ending in "区" are skipped in this branch.
            if (area["NAME_"][-1] == "区") and (len(area["NAME_"]) == 2):
                continue
            if area["NAME_"] in data["AREA_NAME_"]:
                area_n = area["NAME_"]
                area_c = area["CODE_"]
                city_c = area["CODE_"][:-2] + "00"
                prov_c = area["CODE_"][:2] + "00"
                break
    # Province: by code when known, otherwise by (possibly shortened) name.
    for prov in province_list:
        if prov_c:
            if prov["CODE_"] == prov_c:
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]
                break
        else:
            if (prov["NAME_"] in data["AREA_NAME_"]
                    or prov["NAME_"][:-1] in data["AREA_NAME_"]):
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]
                break

    # Hard-coded special cases.
    if data["AREA_NAME_"] == "洋浦经济开发区":
        prov_n = "海南省"
        prov_c = "4600"
        city_n = "儋州市"
        city_c = "460400"
        area_n = "洋浦经济开发区"
        area_c = ""
    elif ("北京" in data["AREA_NAME_"][:3] or "天津" in data["AREA_NAME_"][:3]
          or "上海" in data["AREA_NAME_"][:3] or "重庆" in data["AREA_NAME_"][:3]):
        # For these four the city fields mirror the province fields.
        city_n = prov_n
        city_c = prov_c

    # Area fallbacks: first by the leading two characters of the area name
    # against the tail of AREA_NAME_, then by the full name inside ADDR_.
    if not area_c:
        for area in area_list:
            if area["PARENT_"] == city_c:
                if area["NAME_"][:2] in data["AREA_NAME_"][-len(area["NAME_"]
                                                                ):]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]
                    break
    if not area_c:
        for area in area_list:
            if area["PARENT_"] == city_c:
                if area["NAME_"] in data["ADDR_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]

    # Address cleaning, province part: if the full province name is absent,
    # expand a shortened form found in the leading len(prov_n) characters,
    # else prepend the province name.
    raw_addr = data["ADDR_"]
    prov_len = len(prov_n)
    if prov_n in raw_addr:
        addr_ = raw_addr
    else:
        head, tail = raw_addr[:prov_len], raw_addr[prov_len:]
        for short in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if short in head:
                addr_ = head.replace(short, prov_n) + tail
                break
        else:
            addr_ = prov_n + raw_addr

    # Address cleaning, city part: same idea inside the window that should
    # hold province + city; if nothing matches the city name is inserted
    # right after the province.
    city_end = prov_len + len(city_n)
    head, tail = addr_[:city_end], addr_[city_end:]
    if city_n not in head:
        for short in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if short in head:
                addr_ = head.replace(short, city_n) + tail
                break
        else:
            addr_ = addr_[:prov_len] + city_n + addr_[prov_len:]

    # Strip a bracketed 5-6 digit number (postcode) and empty brackets.
    # NOTE(review): each character class lists the same parenthesis twice;
    # presumably one of them was meant to be the full-width form - confirm.
    addr_ = re.sub(r"[((][0-9]{5,6}[))]|[((][))]", "", addr_)

    # "C"
    re_data["BANK_CODE_"] = "BOC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]

    # "F"
    re_data["ADDR_"] = addr_
    # Query string for the geocoder: "#" becomes "号"; addresses longer than
    # 50 characters are cut after the first "号", or to 50 characters.
    address = re_data["ADDR_"].replace("#", "号")
    if len(address) > 50:
        address_re = re.findall(r".*?号", address)
        if address_re:
            address = address_re[0]
        else:
            address = address[:50]

    result = get_lat_lng(address=address)
    if result["status"] == 2:
        # Retry once without the last five characters of the address.
        result = get_lat_lng(address=address[:-5])

    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the area code and the
            # names are looked up in the reference lists.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    # Fill whatever the reverse geocoding did not provide with the values
    # resolved from the lookup lists above.
    if not re_data.get("PROVINCE_CODE_"):
        re_data["PROVINCE_CODE_"] = prov_c
    if not re_data.get("PROVINCE_NAME_"):
        re_data["PROVINCE_NAME_"] = prov_n
    if not re_data.get("CITY_CODE_"):
        re_data["CITY_CODE_"] = city_c
    if not re_data.get("CITY_NAME_"):
        re_data["CITY_NAME_"] = city_n
    re_data["UNIT_CODE_"] = "BOC" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"

    return re_data
예제 #19
0
def data_shuffle(data, province_list, city_list, area_list):
    """
    Clean one raw branch record for the bank coded "SPDB".

    The coordinates already present in ``data`` are reverse-geocoded with
    ``get_area`` to find district, city and province; when no district comes
    back, province / city / area are matched by name against the lookup
    lists.  The address is then rewritten to carry the full province and
    city names, geocoded again with ``get_lat_lng`` and reverse-geocoded to
    fill the location fields of the result.

    Side effect: ``data`` is updated in place (``AREA_*``, ``CITY_*``,
    ``PROVINCE_*`` and possibly ``ADDR_``).

    :param data: raw record; reads ``LAT_``, ``LNG_``, ``ADDR_``, ``NAME_``,
        ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``DATETIME_``, ``URL_`` and the
        optional ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``NAME_`` and ``CODE_``.
    :param city_list: dicts with ``NAME_`` and ``CODE_``.
    :param area_list: dicts with ``NAME_``, ``CODE_`` and ``PARENT_``.
    :return: the cleaned record as a dict.
    """
    re_data = dict()
    dis_result = get_area(",".join([data["LAT_"], data["LNG_"]]))
    try:
        data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
            "district"]
    except KeyError:
        data["AREA_NAME_"] = ""
    try:
        data["AREA_CODE_"] = dis_result["result"]["addressComponent"]["adcode"]
    except KeyError:
        data["AREA_CODE_"] = ""
    else:
        # City / province codes are derived from the area code and the names
        # are looked up in the reference lists.
        data["CITY_CODE_"] = data["AREA_CODE_"][:4] + "00"
        data["PROVINCE_CODE_"] = data["AREA_CODE_"][:2] + "00"
        for city in city_list:
            if city["CODE_"] == data["CITY_CODE_"]:
                data["CITY_NAME_"] = city["NAME_"]
                break
        for prov in province_list:
            if prov["CODE_"] == data["PROVINCE_CODE_"]:
                data["PROVINCE_NAME_"] = prov["NAME_"]
                break

    if not data["AREA_NAME_"]:
        # No district from the reverse lookup: match by name instead.  A name
        # matches in full, without its last character, or by its first two
        # characters, looked for in ADDR_ and in NAME_.
        for prov in province_list:
            name = prov["NAME_"]
            if any(short in data["ADDR_"] or short in data["NAME_"]
                   for short in (name, name[:-1], name[:2])):
                data["PROVINCE_NAME_"] = name
                data["PROVINCE_CODE_"] = prov["CODE_"]
                break

        for city in city_list:
            name = city["NAME_"]
            if any(short in data["ADDR_"] or short in data["NAME_"]
                   for short in (name, name[:-1], name[:2])):
                data["CITY_NAME_"] = name
                data["CITY_CODE_"] = city["CODE_"]
                break

        # .get() is required here: neither key exists when the reverse lookup
        # returned no adcode and no name matched above, and plain indexing
        # raised KeyError in that case.
        if data.get("CITY_CODE_") and not data.get("PROVINCE_CODE_"):
            data["PROVINCE_CODE_"] = data["CITY_CODE_"][:2] + "00"
            for prov in province_list:
                if prov["CODE_"] == data["PROVINCE_CODE_"]:
                    data["PROVINCE_NAME_"] = prov["NAME_"]

        if data.get("CITY_CODE_"):
            # Areas are matched against ADDR_ only, within the resolved city.
            for area in area_list:
                if area["PARENT_"] != data["CITY_CODE_"]:
                    continue
                name = area["NAME_"]
                if any(short in data["ADDR_"]
                       for short in (name, name[:-1], name[:2])):
                    data["AREA_NAME_"] = name
                    data["AREA_CODE_"] = area["CODE_"]
                    break

    # For these four the city fields mirror the province fields.
    if data.get("PROVINCE_NAME_", "no value") in ["北京市", "天津市", "上海市", "重庆市"]:
        data["CITY_NAME_"] = data["PROVINCE_NAME_"]
        data["CITY_CODE_"] = data["PROVINCE_CODE_"]

    prov_n = data.get("PROVINCE_NAME_", "")
    city_n = data.get("CITY_NAME_", "")

    # Address cleaning
    if ("广西自治区" in data["ADDR_"]) or ("新疆自治区" in data["ADDR_"]):
        data["ADDR_"] = data["ADDR_"].replace("广西自治区", "广西壮族自治区")
        data["ADDR_"] = data["ADDR_"].replace("新疆自治区", "新疆维吾尔自治区")

    # Province part: if the full province name is absent, expand a shortened
    # form found in the leading len(prov_n) characters, else prepend it.
    raw_addr = data["ADDR_"]
    prov_len = len(prov_n)
    if prov_n in raw_addr:
        addr_ = raw_addr
    else:
        head, tail = raw_addr[:prov_len], raw_addr[prov_len:]
        for short in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if short in head:
                addr_ = head.replace(short, prov_n) + tail
                break
        else:
            addr_ = prov_n + raw_addr

    # City part: same idea inside the window that should hold province +
    # city; if nothing matches the city name is inserted after the province.
    city_end = prov_len + len(city_n)
    head, tail = addr_[:city_end], addr_[city_end:]
    if city_n not in head:
        for short in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if short in head:
                addr_ = head.replace(short, city_n) + tail
                break
        else:
            addr_ = addr_[:prov_len] + city_n + addr_[prov_len:]

    # "C"
    re_data["BANK_CODE_"] = "SPDB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    re_data["ADDR_"] = addr_

    # Geocode the cleaned address; the result fields come from this lookup,
    # not from the coordinates that were in ``data``.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "SPDB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
예제 #20
0
파일: __init__.py 프로젝트: ILKKAI/dataETL
    def generic_shuffle(self, data):
        """
        Clean one raw subway-station record into one row per subway line.

        Fills a ``re_data`` dict with the serial ID, period code, source
        fields, the station position (refined through ``get_periphery`` when
        a nearby result has exactly the station name), province / city / area
        from ``get_area``, the station name and the matched route names, then
        passes it through the parent class's ``generic_shuffle``.

        :param data: raw record; reads ``DATETIME_``, ``URL_``,
            ``ENTITY_NAME_``, ``ENTITY_CODE_``, ``SUBWAY_NAME_`` and
            ``STATION_NAME_``; ``TAGS_`` is optional.
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": re_data}`` dicts, one
            per comma-separated entry of ``SUBWAY_NAME_`` (each with its own
            serial ID).
        """
        # different shuffle rule
        re_data = dict()
        # ID
        serial_number = req_for_serial_number(code="WD_JT_DT")
        re_data["ID_"] = serial_number
        # Time dimension: first ten characters of DATETIME_ without dashes.
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags
        if "TAGS_" in data:
            re_data["TAGS_"] = ""
        # SOURCE: scheme and host part of the URL.  Raises IndexError when
        # the URL has no "/" after the host.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        # Name of the data source
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source category
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # Geocoding query: SUBWAY_NAME_ up to its first "|" + station name.
        # partition() keeps the whole value when there is no "|"; the former
        # find()-based slice used index -1 in that case and silently dropped
        # the last character.
        name_prefix = data["SUBWAY_NAME_"].partition("|")[0]
        temp_location = name_prefix + data["STATION_NAME_"] + "地铁站"
        try:
            lat_result = get_lat_lng(address=temp_location)
            re_data["LAT_"] = lat_result["result"]["location"]["lat"]
            re_data["LNG_"] = lat_result["result"]["location"]["lng"]
        except KeyError:
            re_data["LAT_"] = None
            re_data["LNG_"] = None
        except Exception as e:
            re_data["LAT_"] = None
            re_data["LNG_"] = None
            self.logger.info("获取经纬度失败错误信息为{}".format(e))
        if re_data["LAT_"]:
            # "lat,lng" of the exact station hit; stays empty if none found.
            lat_handle = ""
            try:
                lat_origin = ",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])])
                page_num = 0
                find_tag = False
                # Page through the nearby results until one carries exactly
                # the station name; a page holding other than 20 results is
                # treated as the last one.
                while True:
                    s3 = get_periphery(classify="地铁站",
                                       tag="交通设施",
                                       lat_lng=lat_origin,
                                       radius=3000,
                                       page_num=page_num)
                    for nearby in s3["results"]:
                        if nearby["name"] == data["STATION_NAME_"]:
                            find_tag = True
                            lat = str(nearby["location"]["lat"])
                            lng = str(nearby["location"]["lng"])
                            re_data["LAT_"] = lat
                            re_data["LNG_"] = lng
                            lat_handle = lat + "," + lng
                            break
                    if find_tag:
                        break
                    page_num += 1
                    if len(s3["results"]) != 20:
                        break
            except Exception as e:
                self.logger.info(f"获取精确经纬度失败, ERROR: {e}")
            # Complete the address fields from the refined position when
            # there is one, otherwise from the originally geocoded position.
            try:
                area_result = get_area(lat_handle or ",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.info(f"获取地址失败, ERROR: {e}")
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    re_data["PROVINCE_NAME_"] = component["province"]
                    re_data["CITY_NAME_"] = component["city"]
                    re_data["AREA_NAME_"] = component["district"]
                    re_data["AREA_CODE_"] = component["adcode"]
                    # City / province codes are derived from the area code.
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data["PROVINCE_CODE_"] = re_data[
                        "AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Best effort: keep whatever fields were filled so far.
                    pass
        # Station name
        if "STATION_NAME_" in data:
            re_data["STATION_NAME_"] = data["STATION_NAME_"]
        # Routes passing through ("地铁<N>号线"), comma-joined; empty string
        # when none is found.
        temp_subway = data["SUBWAY_NAME_"].replace("|", "-")
        re_data["AROUND_ROUTE_"] = ",".join(
            re.findall(r"地铁\d+号线", temp_subway))

        # Subway name
        if "SUBWAY_NAME_" in data:
            SUBWAY_NAME_ = data["SUBWAY_NAME_"].replace("|", "-")
            if "," in SUBWAY_NAME_:
                re_data_list = list()
                for subway in SUBWAY_NAME_.split(","):
                    # Every split-out subway name gets its own serial number.
                    serial_number = req_for_serial_number(code="WD_JT_DT")
                    re_data["ID_"] = serial_number
                    re_data["SUBWAY_NAME_"] = subway + "-" + re_data[
                        "STATION_NAME_"]
                    re_data = super(Branchjtdt,
                                    self).generic_shuffle(data=data,
                                                          re_data=re_data,
                                                          field=None)
                    # Deep copy so later iterations cannot alter this row.
                    temp_dict = deepcopy({
                        "TABLE_NAME_": self.p_client.table_name,
                        "DATA_": re_data
                    })
                    re_data_list.append(temp_dict)
                return re_data_list
            else:
                re_data["SUBWAY_NAME_"] = SUBWAY_NAME_ + "-" + re_data[
                    "STATION_NAME_"]
                re_data = super(Branchjtdt,
                                self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
                return [{
                    "TABLE_NAME_": self.p_client.table_name,
                    "DATA_": re_data
                }]
예제 #21
0
    def generic_shuffle(self, data, field="CONTENT_"):
        """
        Clean one raw news record and enrich it with model and geocoding results.

        Put the cleaning rules here; do not inherit when the generic cleaning
        rules are not needed.

        ``data`` is modified in place: ``PUBLISH_TIME_`` is normalised, ``HTML_``
        is moved to ``CONTENT_HTML_`` and ``IMAGE_`` may be replaced by the
        base64 text of the downloaded image.

        :param data: raw record; ``URL_``, ``ENTITY_NAME_``, ``ENTITY_CODE_``,
            ``TITLE_``, ``CONTENT_`` and ``HTML_`` are read unconditionally
        :param field: not used by this implementation; the parent class is
            always called with ``field="CONTENT_"``
        :return: ``None`` when ``data`` has no ``PUBLISH_TIME_`` key, otherwise a
            one-element list ``[{"TABLE_NAME_": ..., "DATA_": re_data}]``
        """

        def log_model_error(model_name, error):
            # One message layout for every model request; ``data.get`` keeps a
            # missing ``_id`` from raising inside the exception handler.
            self.logger.exception(f"2.2--err: 请求模型 {model_name} 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data.get('_id')};"
                                  f" error: {error}.")

        # different shuffle rule
        re_data = dict()

        if "PUBLISH_TIME_" not in data:
            return None

        # Time dimension: bring PUBLISH_TIME_ to a dash-separated date.
        publish_time = data["PUBLISH_TIME_"]
        if re.findall(r"\d{4}-\d{1,2}-\d{1,2}", publish_time):
            pass
        elif re.findall(r"\d{4}年\d{1,2}月\d{1,2}日", publish_time):
            publish_time = publish_time.replace("年", "-").replace("月", "-").replace("日", "")
        elif ("年" in publish_time) and ("月" in publish_time) and ("二" in publish_time):
            # Date spelled with Chinese numerals: translate the first ten
            # characters one by one through self.number_dict.
            publish_time = "".join(self.number_dict[ch] for ch in publish_time[:10])
        else:
            # No other known format: look for a date between "|" separators
            # inside the content.
            find_time = re.findall(r"\|(\w{4}[-年]\w{1,2}[-月]\w{1,2})日?\W?\|", data["CONTENT_"])
            if not find_time:
                publish_time = ""
            elif "二" in find_time[0]:
                publish_time = "".join(self.number_dict[ch] for ch in find_time[0])
            else:
                publish_time = find_time[0].replace("年", "-").replace("月", "-").replace("日", "")

        if publish_time:
            shuffle_list = publish_time.split("-")
            shuffle_list[0] = shuffle_list[0][:4]
            # Month and day: pad one character to two, cut anything longer
            # (e.g. a trailing clock time) down to two.
            for idx in (1, 2):
                part = shuffle_list[idx]
                if len(part) == 1:
                    shuffle_list[idx] = "0" + part
                elif len(part) > 2:
                    shuffle_list[idx] = part[:2]
            publish_time = "-".join(shuffle_list)

        data["PUBLISH_TIME_"] = publish_time
        re_data["PERIOD_CODE_"] = publish_time.replace("-", "")

        # re_data["REMARK_"] = ""

        # Tags: reset here, possibly filled by the location model further down.
        if "TAGS_" in data:
            re_data["TAGS_"] = ""

        # Data source URL (scheme + host). A URL without any "/" after the
        # host does not match the pattern; fall back to the URL itself
        # instead of raising IndexError.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0] if source else data["URL_"]
        # Data source site name
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]

        # Data source code: ENTITY_CODE_ up to its last underscore
        s_index = data["ENTITY_CODE_"].rfind("_")
        re_data["SOURCE_CODE_"] = data["ENTITY_CODE_"][:s_index]

        # News source category
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:7]

        re_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        re_data["TITLE_"] = data["TITLE_"]

        # Author: keep only the name following "编辑:" when that pattern is
        # present, otherwise keep the raw value.
        if "AUTHOR_" in data:
            author = data["AUTHOR_"]
            editors = re.findall(r"编辑[::](\w+)", author) if "编辑" in author else []
            re_data["AUTHOR_"] = editors[0] if editors else author

        re_data["IMPORTANCE_"] = "N"
        # Read count
        re_data["READS_"] = data.get("READ_", 0)
        # Like count
        re_data["LIKES_"] = data.get("LIKES_", 0)
        # Comment count
        if "COMMENTS_" in data:
            re_data["COMMENTS_"] = data["COMMENTS_"]
        elif "COMMENT_" in data:
            re_data["COMMENTS_"] = data["COMMENT_"]
        else:
            re_data["COMMENTS_"] = 0
        # Participation count
        if "JOINS_" in data:
            re_data["JOINS_"] = data["JOINS_"]
        elif "JOIN_" in data:
            re_data["JOINS_"] = data["JOIN_"]
        else:
            re_data["JOINS_"] = 0

        # Content: strip inline "var ...;|" script fragments
        re_data["CONTENT_"] = re.sub(r"(var.*?;\|)(?![a-zA-Z])", "", data["CONTENT_"])

        # HTML: keep the markup but neutralise every link target
        data["CONTENT_HTML_"] = data["HTML_"]
        re_data["CONTENT_HTML_"] = re.sub(r"href=\".*?\"", "href=\"javaScript:void(0);\"", data["HTML_"])

        if '28857' in re_data['CONTENT_HTML_'] or '您的IP' in re_data['CONTENT_HTML_']:
            try:
                # NOTE(review): no parser is passed, so bs4 picks whichever one
                # is installed; left unchanged to keep the output identical.
                soup = BeautifulSoup(re_data['CONTENT_HTML_'])
                soup.find('div', attrs={'class': 'online-desc-con'}).decompose()
                soup.find_all('script')[0].decompose()
                re_data['CONTENT_HTML_'] = soup.prettify()
            except Exception:
                self.logger.exception('IP检测内容清除出错')

        # TODO del data["HTML_] is wrong
        del data["HTML_"]
        re_data["CONTENT_"] = re_data["CONTENT_"].replace("|", "")
        re_data["TITLE_"] = re_data["TITLE_"].replace("|", "")
        # Marketing activity flag
        re_data["ACT_"] = "N"

        # Version
        re_data["VERSION_"] = "0"

        # Image: best effort, a failed download leaves IMAGE_ untouched.
        if "IMAGE_" in data:
            try:
                response = req_for_something(url=data["IMAGE_"])
                if response:
                    try:
                        data["IMAGE_"] = base64.b64encode(response.content).decode("utf-8")
                    finally:
                        response.close()
            except Exception as e:
                self.logger.info(f"获取图片失败, ERROR: {e}")

        # Model calls
        # Summary
        try:
            brief = req_for_ts(re_data["CONTENT_"][0:1000])
        except Exception as e:
            log_model_error("req_for_ts", e)
        else:
            re_data["BRIEF_"] = brief["summary"] if brief else '暂无摘要'

        # Sentiment analysis
        try:
            sentiment = req_for_senti(re_data["TITLE_"])
        except Exception as e:
            log_model_error("req_for_senti", e)
        else:
            if sentiment:
                if sentiment["sentiment"] == "中性":
                    re_data["EMOTION_"] = "NORMAL"
                elif sentiment["sentiment"] == "正面":
                    re_data["EMOTION_"] = "POSITIVE"
                elif sentiment["sentiment"] == "敏感":
                    # NOTE(review): "NAGETIVE" is kept as-is because stored
                    # data may already rely on this spelling.
                    re_data["EMOTION_"] = "NAGETIVE"

        # Sensitive content
        try:
            censor = req_for_censor(re_data["CONTENT_"])
        except Exception as e:
            log_model_error("req_for_censor", e)
        else:
            if censor:
                if censor["censor"] == "N":
                    re_data["SENSITIVE_"] = "N"
                else:
                    re_data["SENSITIVE_"] = "Y"
                    re_data["SENSITIVE_WORD_"] = censor["words"]

        # Popularity
        try:
            hot = req_for_news_hot(title=re_data["TITLE_"], content=re_data["CONTENT_"][0:1000])
        except Exception as e:
            log_model_error("req_for_news_hot", e)
        else:
            if hot:
                re_data["HOT_"] = hot["level"]

        # Location analysis
        try:
            res = req_for_textLoc(text=re_data["CONTENT_"])
        except Exception as e:
            log_model_error("req_for_textLoc", e)
        else:
            if "error" not in res:
                if res["tagsId"] != "None" and res["tagsId"] is not None:
                    re_data["TAGS_"] = res["tagsId"]
                address = res["full"] if res["flag"] == 1 else res["addr"]
                try:
                    lat_result = get_lat_lng(address=address)
                    re_data["LAT_"] = lat_result["result"]["location"]["lat"]
                    re_data["LNG_"] = lat_result["result"]["location"]["lng"]
                except KeyError:
                    re_data["LAT_"] = None
                    re_data["LNG_"] = None
                except Exception as e:
                    self.logger.info(f"获取经纬度失败, ERROR: {e}")
                    re_data["LAT_"] = None
                    re_data["LNG_"] = None
                if re_data["LAT_"]:
                    try:
                        area_result = get_area(",".join([str(re_data["LAT_"]), str(re_data["LNG_"])]))
                    except Exception as e:
                        self.logger.info(f"获取地址失败, ERROR: {e}")
                    else:
                        try:
                            re_data["AREA_NAME_"] = area_result["result"]["addressComponent"]["district"]
                        except KeyError:
                            re_data["AREA_NAME_"] = ""
                        try:
                            re_data["AREA_CODE_"] = area_result["result"]["addressComponent"]["adcode"]
                        except KeyError:
                            re_data["AREA_CODE_"] = ""
                        else:
                            # City / province codes are derived from the
                            # leading digits of the district adcode.
                            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                            for city in self.city_list:
                                if city["CODE_"] == re_data["CITY_CODE_"]:
                                    re_data["CITY_NAME_"] = city["NAME_"]
                                    break
                            for prov in self.province_list:
                                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                                    break

        # Credit-card relevance
        try:
            res = req_for_credit_relative(text=re_data["CONTENT_"])
        except Exception as e:
            log_model_error("req_for_credit_relative", e)
        else:
            # A response without the key (e.g. an error payload) is treated
            # as "not related" instead of raising KeyError.
            if res and res.get("creditrelative"):
                re_data["MODULE_TYPE_"] = "CREDITCARD"

        # Bank name and code
        if "BANK_NAME_" in data:
            re_data["BANK_NAME_"] = data["BANK_NAME_"]
        if "BANK_CODE_" in data:
            re_data["BANK_CODE_"] = data["BANK_CODE_"]

        re_data = super(BranchNews, self).generic_shuffle(data=data, re_data=re_data, field="CONTENT_")

        # Records go to review before publication
        re_data['DATA_STATUS_'] = 'CHECK'
        # Published only when a publish time could be determined
        if not re_data.get("PUBLISH_TIME_"):
            re_data["PUBLISH_STATUS_"] = "N"
        else:
            re_data["PUBLISH_STATUS_"] = "Y"

        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
예제 #22
0
def data_shuffle(data, province_list, city_list, area_list):
    """
    Build the cleaned self-service bank record for one raw "PAB" record.

    The address is completed with the full province and city names, geocoded
    through ``get_lat_lng`` and, when coordinates are found, reverse-geocoded
    through ``get_area`` to fill the district, city and province fields.

    :param data: raw record; ``ADDR_``, ``CITY_NAME_``, ``ENTITY_NAME_``,
        ``ENTITY_CODE_``, ``DATETIME_``, ``NAME_`` and ``URL_`` are read
    :param province_list: dicts with ``CODE_`` and ``NAME_``
    :param city_list: dicts with ``CODE_``, ``NAME_`` and ``PARENT_``;
        entries named "县" are removed from this list in place
    :param area_list: not used by this function
    :return: the cleaned record as a dict
    """
    re_data = dict()

    # Drop the placeholder "县" entries. Their name minus the last character
    # is "", which would match every city name below. The list is rebuilt in
    # place: calling remove() while iterating skips the element that follows
    # each removal and can leave such entries behind.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    prov_c = ""
    prov_n = ""
    city_n = ""
    addr_ = data["ADDR_"]

    # Province / city information. The four municipalities are their own
    # province; everything else is looked up in city_list.
    for keyword, municipality in (("北京", "北京市"), ("天津", "天津市"),
                                  ("上海", "上海市"), ("重庆", "重庆市")):
        if keyword in data["CITY_NAME_"]:
            prov_n = city_n = municipality
            break
    else:
        for city in city_list:
            if city["NAME_"][:-1] in data["CITY_NAME_"]:
                city_n = city["NAME_"]
                prov_c = city["PARENT_"]
                break
        if prov_c:
            for prov in province_list:
                if prov["CODE_"] == prov_c:
                    prov_n = prov["NAME_"]
                    break

    # Address cleaning: if the address starts with an abbreviated province
    # name, expand it to the full name; if there is none, prepend it.
    if prov_n not in addr_:
        head, tail = addr_[:len(prov_n)], addr_[len(prov_n):]
        for prefix in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if prefix in head:
                addr_ = head.replace(prefix, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    # Same for the city name, which is expected right after the province.
    window = len(prov_n) + len(city_n)
    if city_n not in addr_[:window]:
        head, tail = addr_[:window], addr_[window:]
        for prefix in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if prefix in head:
                addr_ = head.replace(prefix, city_n) + tail
                break
        else:
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

    # "C"
    re_data["BANK_CODE_"] = "PAB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-4]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = addr_
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the leading digits of
            # the district adcode.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "PAB" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"

    return re_data
예제 #23
0
def data_shuffle(data, province_list, city_list, area_list):
    """
    Build the cleaned branch record for one raw record of a supported bank.

    Only records whose ``BANK_NAME_`` is one of the banks in ``bank_dict`` are
    processed. The address is completed with the full province and city
    names, geocoded through ``get_lat_lng`` and, when coordinates are found,
    reverse-geocoded through ``get_area`` to fill the district, city and
    province fields.

    :param data: raw record; ``PROVINCE_NAME_``, ``CITY_NAME_``, ``ADDR_``,
        ``DATETIME_``, ``NAME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_`` and
        ``URL_`` are read
    :param province_list: dicts with ``CODE_`` and ``NAME_``
    :param city_list: dicts with ``CODE_`` and ``NAME_``
    :param area_list: not used by this function
    :return: ``None`` for an unsupported bank, otherwise the cleaned record
    """
    bank_dict = {'邮储银行': 'PSBC', '光大银行': 'CEB', '农商银行': 'RCB'}
    bank_name = data.get('BANK_NAME_')
    if bank_name not in bank_dict:
        return None

    re_data = dict()
    # Province: match on the first two characters of the official name.
    for prov in province_list:
        if prov["NAME_"][:2] in data["PROVINCE_NAME_"]:
            re_data["PROVINCE_NAME_"] = prov["NAME_"]
            re_data["PROVINCE_CODE_"] = prov["CODE_"]
            break
    # City: only cities of the matched province are considered. Without a
    # province match there is nothing to constrain the search, so the city is
    # left unset instead of raising KeyError.
    prov_c = re_data.get("PROVINCE_CODE_", "")
    if prov_c:
        for city in city_list:
            if city["CODE_"][:2] == prov_c[:2]:
                if city["NAME_"][:2] in data["CITY_NAME_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    re_data["CITY_CODE_"] = city["CODE_"]
                    break

    # Address cleaning. An empty name (lookup failed) is contained in every
    # string, so both steps below leave the address untouched in that case.
    prov_n = re_data.get("PROVINCE_NAME_", "")
    city_n = re_data.get("CITY_NAME_", "")
    addr_ = data["ADDR_"]

    # If the address starts with an abbreviated province name, expand it to
    # the full name; if there is none, prepend it.
    if prov_n not in addr_:
        head, tail = addr_[:len(prov_n)], addr_[len(prov_n):]
        for prefix in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if prefix in head:
                addr_ = head.replace(prefix, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    # Same for the city name, which is expected right after the province.
    window = len(prov_n) + len(city_n)
    if city_n not in addr_[:window]:
        head, tail = addr_[:window], addr_[window:]
        for prefix in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if prefix in head:
                addr_ = head.replace(prefix, city_n) + tail
                break
        else:
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

    # "C"
    re_data["BANK_CODE_"] = bank_dict[bank_name]
    re_data["BANK_NAME_"] = bank_name
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = addr_
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the leading digits of
            # the district adcode and override the name-based lookup above.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    # When the source gives the same value for province and city, the city
    # name is taken from the province name (if one was resolved).
    if data["PROVINCE_NAME_"] == data["CITY_NAME_"] and "PROVINCE_NAME_" in re_data:
        re_data["CITY_NAME_"] = re_data["PROVINCE_NAME_"]

    re_data["UNIT_CODE_"] = re_data["BANK_CODE_"] + "_" + re_data.get(
        "CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]

    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"

    return re_data
예제 #24
0
def data_shuffle(data, province_list, city_list, area_list):
    """
    Build the cleaned self-service bank record for one raw "BOC" record.

    The address is the concatenation of the raw province, city and address
    fields. It is geocoded through ``get_lat_lng`` and, when coordinates are
    found, reverse-geocoded through ``get_area`` to fill the district, city
    and province fields.

    :param data: raw record; ``PROVINCE_NAME_``, ``CITY_NAME_``, ``ADDR_``,
        ``NAME_``, ``ENTITY_NAME_``, ``ENTITY_CODE_``, ``DATETIME_`` and
        ``URL_`` are read
    :param province_list: dicts with ``CODE_`` and ``NAME_``
    :param city_list: dicts with ``CODE_`` and ``NAME_``; entries named "县"
        are removed from this list in place
    :param area_list: not used by this function
    :return: the cleaned record as a dict
    """
    # Drop the placeholder "县" entries. The list is rebuilt in place: calling
    # remove() while iterating skips the element that follows each removal
    # and can leave such entries behind.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    re_data = dict()

    # "C"
    re_data["BANK_CODE_"] = "BOC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-7]

    # "F"
    re_data[
        "ADDR_"] = data["PROVINCE_NAME_"] + data["CITY_NAME_"] + data['ADDR_']
    # The stored name is the full address followed by the raw name.
    re_data["NAME_"] = re_data["ADDR_"] + data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the leading digits of
            # the district adcode.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "BOC" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"

    return re_data
예제 #25
0
파일: __init__.py 프로젝트: ILKKAI/dataETL
    def generic_shuffle(self, data):
        """
        Clean one crawled housing record into rows for the data table and,
        when the property is not yet known, the base table.

        The cleaned name is passed through ``value_replace`` and looked up
        with ``self.if_exists`` (using a hard-coded city name). If an id is
        returned, only a data-table row pointing at that id is produced.
        Otherwise a new base id is requested and both a data-table row and a
        base-table row (with address analysis and geocoding) are produced.

        :param data: raw crawled record (dict). Read unconditionally:
            ``URL_``, ``DATETIME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_``,
            ``NAME_``, ``PRICE_``; in the new-base branch also ``ADDR_``.
            Read when present: ``TITLE_``, ``PUBLISH_TIME_``, ``YEAR_``.
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts; one
            entry when the base record exists, two entries otherwise.
        """
        # print(data)
        re_data = dict()
        # Common fields
        # ID_: id of the history (data-table) record
        serial_number = req_for_serial_number(code="WD_JZ_FJ_DATA")
        re_data["ID_"] = serial_number
        re_data["URL_"] = data["URL_"]
        # Time dimension: date part of DATETIME_ with the dashes removed
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Entity code and name
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        # Creation time and operator
        time_array = time.localtime()
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
        re_data["CREATE_TIME_"] = create_time
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME
        # Crawl time
        # NOTE(review): data["DATETIME_"] is already read unconditionally for
        # PERIOD_CODE_ above, so a record without DATETIME_ raises KeyError
        # before the DEALTIME_ fallback below can run.
        if "DATETIME_" in data:
            re_data["SPIDER_TIME_"] = data["DATETIME_"]
        elif ("DATETIME_" not in data) and ("DEALTIME_" in data):
            d_time = arrow.get(data["DEALTIME_"])
            date_time = d_time.format("YYYY-MM-DD")
            re_data["SPIDER_TIME_"] = date_time
        # Status
        # NOTE(review): re_data was created empty in this method and neither
        # key is set earlier, so both conditions are always true here.
        if "DATA_STATUS_" not in re_data:
            re_data["DATA_STATUS_"] = "UNCHECK"
        if "PUBLISH_STATUS_" not in re_data:
            re_data["PUBLISH_STATUS_"] = "N"
        # Name, with "|" characters removed
        re_data["NAME_"] = data["NAME_"].replace("|", "")
        # Type: residential (ZZ), office building (XZL), shop (SP).
        # TYPE_ is left unset when ENTITY_CODE_ contains none of the markers.
        if "LISP" in data["ENTITY_CODE_"]:
            re_data["TYPE_"] = "SP"
        elif "LIXQ" in data["ENTITY_CODE_"] or "LJXQ" in data["ENTITY_CODE_"]:
            re_data["TYPE_"] = "ZZ"
        elif "LIXZL" in data["ENTITY_CODE_"]:
            re_data["TYPE_"] = "XZL"

        # Check whether the name already exists in the base table
        verify_name = value_replace(re_data["NAME_"])
        house_id = self.if_exists(name=verify_name, city_name="厦门市")

        # Base record exists: only a DATA-table row is produced
        if house_id:
            re_data["P_ID_"] = house_id
            if "TITLE_" in data:
                re_data["TITLE_"] = data["TITLE_"].replace("|", "")
            # Fall back to the crawl date when no publish time was crawled
            if "PUBLISH_TIME_" in data:
                re_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
            else:
                re_data["PUBLISH_TIME_"] = data["DATETIME_"][:10]
            # First run of digits/dots in PRICE_ (kept as a string), else 0
            price = re.findall(r"[\d.]+", data["PRICE_"])
            if price:
                re_data["PRICE_"] = price[0]
            else:
                re_data["PRICE_"] = 0
            # Rent vs. sale is decided from the entity name
            if "租赁" in data["ENTITY_NAME_"]:
                re_data["USE_TYPE_"] = "RENT"
            else:
                re_data["USE_TYPE_"] = "SALE"

            return [{"TABLE_NAME_": self.data_table_name, "DATA_": re_data}]
        else:
            # ID_ for the new base-info record
            base_id = req_for_serial_number(code="WD_JZ_FJ_BASE")
            # DATA-table row: a copy of the common fields plus the
            # same title/publish-time/price/use-type handling as above
            data_dict = dict()
            data_dict.update(re_data)
            data_dict["P_ID_"] = base_id
            if "TITLE_" in data:
                data_dict["TITLE_"] = data["TITLE_"].replace("|", "")
            if "PUBLISH_TIME_" in data:
                data_dict["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
            else:
                data_dict["PUBLISH_TIME_"] = data["DATETIME_"][:10]
            price = re.findall(r"[\d.]+", data["PRICE_"])
            if price:
                data_dict["PRICE_"] = price[0]
            else:
                data_dict["PRICE_"] = 0
            if "租赁" in data["ENTITY_NAME_"]:
                data_dict["USE_TYPE_"] = "RENT"
            else:
                data_dict["USE_TYPE_"] = "SALE"
            # Base-info table row: a second copy of the common fields,
            # re-keyed with the new base id and hard-coded region values
            basic_dict = dict()
            basic_dict.update(re_data)
            basic_dict["ID_"] = base_id
            basic_dict["URL_"] = data["URL_"]
            basic_dict["PROVINCE_CODE_"] = "3500"
            basic_dict["PROVINCE_NAME_"] = "福建省"
            basic_dict["CITY_CODE_"] = "350200"
            basic_dict["CITY_NAME_"] = "厦门市"
            basic_dict["SALE_PRICE_"] = 0
            basic_dict["RENT_PRICE_"] = 0
            if "YEAR_" in data:
                # Keep only the first run of digits from YEAR_
                year = re.findall(r"\d+", data["YEAR_"])
                if year:
                    basic_dict["YEAR_"] = year[0]

            # Address analysis
            try:
                # NOTE(review): with the hard-coded province and city names
                # above, the two values differ, so the else branch is taken.
                if basic_dict["PROVINCE_NAME_"] == basic_dict["CITY_NAME_"]:
                    basic_dict["ADDR_"] = basic_dict[
                        "PROVINCE_NAME_"] + basic_dict["NAME_"]
                else:
                    basic_dict[
                        "ADDR_"] = basic_dict["PROVINCE_NAME_"] + basic_dict[
                            "CITY_NAME_"] + basic_dict["NAME_"]
                # print(basic_dict["ADDR_"])
                res = req_for_textLoc(text=basic_dict["ADDR_"])
                # print(res)
            except Exception as e:
                # The failure is logged and processing continues; the
                # geocoding in the else branch below is skipped.
                self.logger.exception(
                    f"2.2--err: 请求模型 req_for_textLoc 错误."
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {self.entity_code};"
                    f" 原始数据 _id = {data['_id']};"
                    f" error: {e}.")
            else:
                if "error" not in res:
                    # tagsId may be the string "None" as well as None
                    if res["tagsId"] == "None" or res["tagsId"] is None:
                        pass
                    else:
                        basic_dict["TAGS_"] = res["tagsId"]
                    # flag == 1: use the address returned by the model,
                    # otherwise the crawled address
                    if res["flag"] == 1:
                        basic_dict["ADDR_"] = res["full"]
                    else:
                        basic_dict["ADDR_"] = data["ADDR_"]
                    # Geocode the address; any failure leaves LAT_/LNG_ None
                    try:
                        lat_result = get_lat_lng(address=basic_dict["ADDR_"])
                        basic_dict["LAT_"] = lat_result["result"]["location"][
                            "lat"]
                        basic_dict["LNG_"] = lat_result["result"]["location"][
                            "lng"]
                    except KeyError:
                        basic_dict["LAT_"] = None
                        basic_dict["LNG_"] = None
                    except Exception as e:
                        self.logger.info(f"获取经纬度失败, ERROR: {e}")
                        basic_dict["LAT_"] = None
                        basic_dict["LNG_"] = None
                    # Reverse-geocode "lat,lng" to get district name/code and
                    # a formatted address; missing keys are silently skipped
                    if basic_dict["LAT_"]:
                        try:
                            area_result = get_area(",".join([
                                str(basic_dict["LAT_"]),
                                str(basic_dict["LNG_"])
                            ]))
                        except Exception as e:
                            self.logger.info(f"获取地址失败, ERROR: {e}")
                        else:
                            try:
                                basic_dict["AREA_NAME_"] = area_result[
                                    "result"]["addressComponent"]["district"]
                                basic_dict["AREA_CODE_"] = area_result[
                                    "result"]["addressComponent"]["adcode"]
                            except KeyError:
                                pass
                            try:
                                basic_dict["ADDR_"] = area_result["result"][
                                    "formatted_address"]
                            except KeyError:
                                pass
            # basic_dict["AREA_CODE_"] = data[""]
            # basic_dict["AREA_NAME_"] = data[""]
            # basic_dict["LAT_"] = data[""]
            # basic_dict["LNG_"] = data[""]
            # basic_dict["BANK_CODE_"] = data[""]
            # basic_dict["BANK_NAME_"] = data[""]
            # basic_dict["REMARK_"] = data[""]
            basic_dict["M_STATUS_"] = "N"
            basic_dict["DELETE_STATUS_"] = "N"
            # basic_dict["TAGS_"] = data[""]
            # Data source URL (scheme and host part of URL_)
            # NOTE(review): SOURCE_ and SOURCE_NAME_ are written to re_data
            # after data_dict and basic_dict were copied from it, and re_data
            # is not returned in this branch, so neither value reaches the
            # returned rows. source[0] raises IndexError when the pattern
            # does not match URL_.
            source = re.findall(r"(https?://.*?)/", data["URL_"])
            re_data["SOURCE_"] = source[0]
            # Data source website name (text before the first "-")
            re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
            basic_dict["SOURCE_TYPE_"] = "链家"
            # basic_dict["PRICE_TYPE_"] = data[""]
            # NOTE(review): this unconditionally replaces whatever ADDR_ the
            # address analysis above produced with the crawled address.
            basic_dict["ADDR_"] = data["ADDR_"]

            return [{
                "TABLE_NAME_": self.data_table_name,
                "DATA_": data_dict
            }, {
                "TABLE_NAME_": self.base_table_name,
                "DATA_": basic_dict
            }]
예제 #26
0
    def generic_shuffle(self, data):
        """
        Clean one crawled bus-stop record into a single output row.

        The row receives a fresh serial id, source fields derived from
        ``URL_``, ``ENTITY_NAME_`` and ``ENTITY_CODE_``, geocoded coordinates,
        region names/codes resolved from those coordinates, and the stop's
        description, nearby stations and routes. The row is finally passed
        through the parent class's ``generic_shuffle``.

        :param data: raw crawled record (dict)
        :return: one-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``
        """
        row = dict()
        row["ID_"] = req_for_serial_number(code="WD_JT_GJ")

        # Time dimension: date part of DATETIME_ with the dashes removed
        row["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags: written as an empty string whenever the raw record has the key
        if "TAGS_" in data:
            row["TAGS_"] = ""
        # Source: scheme and host part of the URL
        row["SOURCE_"] = re.findall(r"(https?://.*?)/", data["URL_"])[0]
        # Source name: text before the first "-" of the entity name
        row["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source category: characters 3..7 of the entity code
        row["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # First pass: geocode "<text after last '-' of entity name><stop name>公交车站"
        entity_suffix = data["ENTITY_NAME_"][data["ENTITY_NAME_"].rfind("-") + 1:]
        geocode_query = entity_suffix + data["NAME_"] + "公交车站"
        try:
            geocoded = get_lat_lng(address=geocode_query)
            row["LAT_"] = geocoded["result"]["location"]["lat"]
            row["LNG_"] = geocoded["result"]["location"]["lng"]
        except KeyError:
            row["LAT_"] = row["LNG_"] = None
        except Exception as e:
            row["LAT_"] = row["LNG_"] = None
            self.logger.info("获取经纬度失败错误信息为{}".format(e))

        if row.get("LAT_"):
            # The coarse coordinates are the fallback for the region lookup
            coarse_point = ",".join([str(row["LAT_"]), str(row["LNG_"])])
            area_query = coarse_point
            # Second pass: page through nearby bus stops and prefer the
            # coordinates of the first one whose name contains the stop name
            try:
                page = 0
                while True:
                    found = get_periphery(classify="公交车站",
                                          tag="交通设施",
                                          lat_lng=coarse_point,
                                          radius=3000,
                                          page_num=page)
                    hit = next((stop for stop in found["results"]
                                if data["NAME_"] in stop["name"]), None)
                    if hit is not None:
                        precise_lat = str(hit["location"]["lat"])
                        precise_lng = str(hit["location"]["lng"])
                        row["LAT_"] = precise_lat
                        row["LNG_"] = precise_lng
                        area_query = precise_lat + "," + precise_lng
                        break
                    page += 1
                    # A page whose length is not 20 is treated as the last one
                    if len(found["results"]) != 20:
                        break
            except Exception as e:
                self.logger.info(f"获取精确经纬度失败, ERROR: {e}")

            # Resolve province/city/district from the precise coordinates
            # when a stop was found, otherwise from the coarse ones
            try:
                area_result = get_area(area_query)
            except Exception as e:
                self.logger.info(f"获取地址失败, ERROR: {e}")
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    row["PROVINCE_NAME_"] = component["province"]
                    row["CITY_NAME_"] = component["city"]
                    row["AREA_NAME_"] = component["district"]
                    row["AREA_CODE_"] = component["adcode"]
                    # City/province codes are prefixes of the district code
                    # padded with "00"
                    row["CITY_CODE_"] = row["AREA_CODE_"][:4] + "00"
                    row["PROVINCE_CODE_"] = row["AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Fields assigned before the missing key are kept
                    pass

        # Stop description
        row["DESCRIBE_"] = data["DESCRIBE_"]
        # Nearby stations, "|" separators turned into ","
        nearby_stations = self.handle_special_text(data["AROUND_STATIONS_"])
        row["AROUND_STATIONS_"] = nearby_stations.replace("|", ",")
        # Routes through the stop, "|" separators turned into ","
        routes = self.handle_special_text(data["AROUND_ROUTE_"]).replace(
            "|", ",")
        row["AROUND_ROUTE_"] = routes
        if routes:
            row["AROUND_ROUTE_"] = routes.replace("公交线路", "")
        # Stop name
        row["NAME_"] = data["NAME_"]
        row = super(Branchjtgj, self).generic_shuffle(data=data,
                                                      re_data=row,
                                                      field=None)

        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": row}]
예제 #27
0
def _expand_abbreviation(head, full_name):
    """Replace an abbreviated region name inside *head* with *full_name*.

    Tries, in order, the name without its last character and then its first
    four, three and two characters. The first candidate that occurs in *head*
    is replaced (every occurrence). Returns ``None`` when no candidate occurs.
    """
    for short in (full_name[:-1], full_name[:4], full_name[:3],
                  full_name[:2]):
        if short in head:
            return head.replace(short, full_name)
    return None


def data_shuffle(data, province_list, city_list, area_list):
    """
    Clean one crawled Ping An Bank (PAB) branch record.

    The city is the first ``city_list`` entry whose two-character name prefix
    occurs in the raw ``CITY_NAME_``; the province is the ``province_list``
    entry whose ``CODE_`` equals that city's ``PARENT_``. The raw address is
    then completed with the full province and city names, geocoded, and, when
    geocoding succeeds, the region fields are refreshed from the returned
    district code.

    A record whose city cannot be matched no longer raises: province and city
    are treated as empty strings and the raw address is geocoded as is.

    :param data: raw record (dict). Required keys: ``ADDR_``, ``DATETIME_``,
        ``NAME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``URL_``. Optional:
        ``CITY_NAME_``, ``TEL_``, ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``NAME_`` and ``CODE_``
    :param city_list: dicts with ``NAME_``, ``CODE_`` and ``PARENT_``
    :param area_list: dicts with ``NAME_`` and ``CODE_``
    :return: cleaned record (dict)
    """
    import re

    re_data = dict()

    ##############
    # Province / city / district and coordinates first
    ##############
    branch_name = data.get('CITY_NAME_') or ''
    parent = None
    for city in city_list:
        if city["NAME_"][:2] in branch_name:
            re_data["CITY_NAME_"] = city["NAME_"]
            re_data["CITY_CODE_"] = city["CODE_"]
            parent = city['PARENT_']
            break

    # Province: position of the city's parent code among all province codes.
    # jsonpath returns False when nothing matches, hence AttributeError.
    province_name = ''
    province_code = ''
    if parent is not None:
        try:
            codes = jsonpath.jsonpath(province_list, '$.[*].CODE_')
            province = province_list[codes.index(parent)]
            province_name = province["NAME_"]
            province_code = province["CODE_"]
        except (AttributeError, ValueError, LookupError, TypeError):
            # Best effort: an unresolvable province is left empty
            province_name = ''
            province_code = ''
    re_data["PROVINCE_NAME_"] = province_name
    re_data["PROVINCE_CODE_"] = province_code

    # District name: text between the first "市" and the last of 区/镇/县
    try:
        area_name = re.findall('市(.*[区镇县])', data.get('ADDR_'))[0]
    except (TypeError, IndexError):
        area_name = ''

    # District lookup within the matched province; a name equal up to its
    # last character also counts, and the last matching entry wins
    area_n = ''
    area_c = ''
    if area_name:
        province_prefix = re_data["PROVINCE_CODE_"][:2]
        for area in area_list:
            if area["CODE_"][:2] != province_prefix:
                continue
            if (area["NAME_"] == area_name
                    or area["NAME_"][:-1] == area_name[:-1]):
                area_n = area["NAME_"]
                area_c = area["CODE_"]

    # Address cleaning: make the address start with the full province name
    prov_n = re_data["PROVINCE_NAME_"]
    city_n = re_data.get("CITY_NAME_", "")
    raw_addr = data["ADDR_"]
    if prov_n in raw_addr:
        addr_ = raw_addr
    else:
        expanded = _expand_abbreviation(raw_addr[:len(prov_n)], prov_n)
        if expanded is None:
            addr_ = prov_n + raw_addr
        else:
            addr_ = expanded + raw_addr[len(prov_n):]

    # Then make sure the full city name follows the province name
    window = len(prov_n) + len(city_n)
    if city_n not in addr_[:window]:
        expanded = _expand_abbreviation(addr_[:window], city_n)
        if expanded is None:
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]
        else:
            addr_ = expanded + addr_[window:]

    # "C"
    re_data["BANK_CODE_"] = "PAB"
    re_data["BANK_NAME_"] = "平安银行"
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["AREA_CODE_"] = area_c
    re_data["AREA_NAME_"] = area_n

    # "F"
    re_data["ADDR_"] = addr_
    result = get_lat_lng(address=re_data["ADDR_"])  # geocode the address
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        # Reverse-geocode; its district name/code replace the list-based ones
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City/province codes are prefixes of the district code padded
            # with "00"; names are refreshed only when the code is listed
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    ##############
    # Remaining fields
    ##############
    # Bank abbreviation joined with CITY_CODE_
    re_data["UNIT_CODE_"] = "PAB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]

    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"

    return re_data
예제 #28
0
def data_shuffle(data, province_list, city_list, area_list):
    """
    Clean one crawled Bohai Bank (BHB) branch record.

    Province and city are matched by two-character name prefix against the
    reference lists. The raw ``ADDR_`` field, which carries both an address
    and a phone number, is split into address and phone. The address is
    geocoded and, when that succeeds, the region fields are refreshed from
    the returned district code.

    A record whose province or city cannot be matched no longer raises; the
    corresponding fields are simply left out unless geocoding supplies them.

    Side effect: ``data["ADDR_"]`` and ``data["TEL_"]`` are overwritten with
    the extracted address and phone number (empty string when not found).

    :param data: raw record (dict). Required keys: ``PROVINCE_NAME_``,
        ``CITY_NAME_``, ``ADDR_``, ``ENTITY_NAME_``, ``DATETIME_``, ``NAME_``,
        ``ENTITY_CODE_``, ``URL_``. Optional: ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``NAME_`` and ``CODE_``
    :param city_list: dicts with ``NAME_`` and ``CODE_``
    :param area_list: not used by this function
    :return: cleaned record (dict)
    """
    import re

    re_data = dict()

    # Province: first entry whose two-character name prefix occurs in the
    # raw province name
    for prov in province_list:
        if prov["NAME_"][:2] in data["PROVINCE_NAME_"]:
            re_data["PROVINCE_NAME_"] = prov["NAME_"]
            re_data["PROVINCE_CODE_"] = prov["CODE_"]
            break

    # City: only searched inside the matched province
    province_prefix = re_data.get("PROVINCE_CODE_", "")[:2]
    if province_prefix:
        for city in city_list:
            if (city["CODE_"][:2] == province_prefix
                    and city["NAME_"][:2] in data["CITY_NAME_"]):
                re_data["CITY_NAME_"] = city["NAME_"]
                re_data["CITY_CODE_"] = city["CODE_"]
                break

    # The raw ADDR_ holds "地址:<address>,电话:<phone>"; split it once and
    # write both parts back onto the input record
    raw_addr = data["ADDR_"]
    addr_found = re.findall(r'地址:([\w\S]+),', raw_addr)
    tel_found = re.findall(r'电话:([\w\d\-]*)', raw_addr)
    data["ADDR_"] = addr_found[0] if addr_found else ''
    data["TEL_"] = tel_found[0] if tel_found else ''

    # NOTE(review): the original also built a province/city-completed address
    # here but never used it; ADDR_ below is the extracted address as before.

    # "C"
    re_data["BANK_CODE_"] = "BHB"
    # Entity name without its last three characters
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F"
    re_data["ADDR_"] = data["ADDR_"]
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        # Reverse-geocode the coordinates for district name and code
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City/province codes are prefixes of the district code padded
            # with "00"; names are refreshed only when the code is listed
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # Bank abbreviation joined with CITY_CODE_
    re_data["UNIT_CODE_"] = "BHB" + "_" + re_data.get("CITY_CODE_", "")

    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    # TEL_ was just written onto data above, so it is always present
    re_data["TEL_"] = data["TEL_"]
    re_data["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data