예제 #1
0
# -*- coding: utf-8 -*-
"""  GDSZ_SZS_TZJG_XMGS """
from database._mongodb import MongoClient
from tools.req_for_wordExcelZip import find_type


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print the
    # first two records after they pass through data_shuffle.
    client = MongoClient(entity_code="GDSZ_SZS_TZJG_XMGS",
                         mongo_collection="GOV_ZX_GDS")
    records = client.main()
    for record in records[:2]:
        print(data_shuffle(record))
예제 #2
0
# -*- coding: utf-8 -*-
"""  GDSZ_HYS_FGW_XMXX"""
from database._mongodb import MongoClient
from tools.req_for_wordExcelZip import find_type


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="GDSZ_HYS_FGW_XMXX", mongo_collection="GOV_ZX_GDS")
    for record in client.main():
        print(data_shuffle(record))
예제 #3
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="ZX_CJXW_ZYCJ_ZYW_HG",
                         mongo_collection="ZX_CJXW_ZYCJ")
    for record in client.main():
        print(data_shuffle(record))
예제 #4
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="ZX_CJXW_HY_SDW_ZZXW",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
예제 #5
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="JRCP_LCCP_ZGGSYH_APP_LCLB", mongo_collection="JRCP_LCCP")
    for record in client.main():
        print(data_shuffle(record))
예제 #6
0
def run():
    """Load the ZTB_JSDFNCSYYH data from CommonBidding and print each record
    after it has gone through ``data_shuffle``."""
    client = MongoClient(entity_code="ZTB_JSDFNCSYYH", mongo_collection="CommonBidding")
    for record in client.main():
        print(data_shuffle(record))
예제 #7
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="ZX_CJXW_HY_ZGNYE_ZJGD",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
예제 #8
0
    # if not any([num in item for num in ['供应链', '产业链', '应收账款', '应收款']]):
    if not any([num in item for num in ['项目', '公告']]):
        return False
    else:
        # if not any([num in item for num in ['微信', '营销', 'APP', 'app', 'App']]):
        #     return False
        # else:
        #     return True
        return True


if __name__ == '__main__':

    import pandas as pd

    main_mongo = MongoClient(entity_code="", mongo_collection="CommonBidding")
    db, collection_list = main_mongo.client_to_mongodb()
    collection = main_mongo.get_check_collection(
        db=db, collection_list=collection_list)
    # mon_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    # mon_list = ['12', '01']
    # mon_list = ['08', '09', '10', '11', '12']

    try:
        data_list = collection.find(
            {
                '$and': [
                    {
                        'NOTICE_TIME_': {
                            '$gte': '2019-01-01'
                        }
예제 #9
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="ZX_CJXW_HY_ZGHGW_NHHYDT",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
예제 #10
0
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="WD_JT_GJ_GJWZD_NB",
                         mongo_collection="WD_JT_GJ")
    for record in client.main():
        print(data_shuffle(record))
예제 #11
0
    def run(self):
        """Clean ORGANIZE_FINASSIST documents from MongoDB and upsert them into Phoenix/HBase.

        For every entity code in ``self.code_list`` the module of the same
        name is imported, the entity's documents are fetched through
        ``self.get_data_from_mongo`` and each document is passed to that
        module's ``data_shuffle`` together with the province/city/area lists.
        The cleaned result (a dict or a list of dicts) goes through
        ``self.shuffle_for_area`` and is upserted one row at a time with
        ``p_client.upsert_to_phoenix_by_one``. Failures on a single document
        are logged and skipped. Both connections are closed and the counters
        are logged at the end.

        NOTE(review): ``status`` and ``dir_area_list`` are assigned but only
        referenced by disabled code; ``self.find_count`` is overwritten for
        each entity, so the final "fetched" log line reflects only the last
        entity that returned data.
        """
        # Create the Phoenix client object
        p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST")
        p_client.verify_list = self.verify_list
        # Connect to Phoenix
        connection = p_client.connect_to_phoenix()
        # Create the MongoDB client used for querying
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # (disabled) A second MongoClient pointed at a local "spider_data_old"
        # database used to be created here for archiving processed documents.

        # Load the area code lists
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None,
            entity_type="ORGANIZE_FINASSIST").area_from_mysql())

        # # Drop the table
        # p_client.drop_table_phoenix(connection=connection)

        # # Create the table
        # # CREATE TABLE statement for the branch/outlet table
        # sql = ('create table "ORGANIZE_FINASSIST" ("ID_" varchar primary key, "C"."BANK_NAME_" varchar,'
        #        '"C"."BANK_CODE_" varchar, "C"."NAME_" varchar,'
        #        '"C"."CODE_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."UNIT_CODE_" varchar, "C"."ADDR_" varchar,'
        #        '"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar, "C"."CITY_" varchar,'
        #        '"C"."CITY_CODE_" varchar, "C"."DISTRICT_NAME_" varchar, "C". "DISTRICT_CODE_" varchar,'
        #        '"C"."LAT_" varchar, "C"."LNG_" varchar, "C"."CREATE_TIME_" varchar, "C"."DEALTIME_" varchar,'
        #        '"C"."URL_" varchar, "C"."TEL_" varchar, "C"."BUSINESS_HOURS_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."IMPORTANCE_" varchar) IMMUTABLE_ROWS = true')
        #
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Iterate over the ENTITY_CODE_ list
        # self.code_list = ["ABCORGANIZE"]
        for entity_code in self.code_list:
            status = False
            # The cleaning module is looked up by name; it must be importable
            # and expose data_shuffle(data, province_list, city_list, area_list).
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            # find_id = ""
            # ECITICORGANIZE is queried with a hard-coded document id; every
            # other entity passes None.
            # NOTE(review): presumably a resume point - confirm against
            # get_data_from_mongo.
            if entity_code == "ECITICORGANIZE":
                find_id = "5c3f48479bb3df1d97d762e1"
            else:
                find_id = None
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    data_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                # Retry exactly once after a short pause; a second timeout
                # propagates to the caller.
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    data_id=find_id)

            # Clean the data and insert it into HBase
            if mongo_data_list:
                once_count = 0
                # Overwritten (not accumulated) for every entity.
                self.find_count = mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    try:
                        # The Mongo _id is dropped before the document is cleaned.
                        del data["_id"]
                        re_data = module_name.data_shuffle(
                            data, province_list, city_list, area_list)
                        if not re_data:
                            # A falsy cleaning result counts as a bad record.
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        # except jpype._jexception.SQLExceptionPyRaisable as e:
                        # except org.apache.phoenix.exception.BatchUpdateExecution as e:
                        self.logger.exception("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    print(data_id)

                    if isinstance(re_data, list):
                        # The cleaning module returned several rows for one document.
                        for list_data in re_data:
                            area_data = ""
                            try:
                                area_data = self.shuffle_for_area(list_data)
                            except Exception as e:
                                self.logger.exception(
                                    "_id:{} 获取经纬度失败, {}".format(data_id, e))
                                continue
                            # Upsert the row through Phoenix into HBase
                            if area_data:
                                try:
                                    success_count = p_client.upsert_to_phoenix_by_one(
                                        connection=connection, data=area_data)
                                    once_count += success_count
                                    self.success_count += success_count
                                except Exception as e:
                                    self.logger.exception(
                                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue
                    elif isinstance(re_data, dict):
                        area_data = ""
                        try:
                            area_data = self.shuffle_for_area(re_data)
                        except urllib3.exceptions.NewConnectionError as e:
                            # No `continue` here: area_data is still "", so the
                            # upsert below is skipped and control falls through
                            # to the progress log.
                            self.logger.exception("_id: {}获取经纬度失败, {}".format(
                                data_id, e))
                        except Exception as e:
                            self.logger.exception("_id: {}获取经纬度失败, {}".format(
                                data_id, e))
                            continue
                        # Upsert the row through Phoenix into HBase
                        if area_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=area_data)
                                once_count += success_count
                                self.success_count += success_count
                            except Exception as e:
                                self.logger.exception(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue
                    # Progress log. This is evaluated once per document, so it
                    # repeats while the running total sits on a multiple of
                    # 100 (including 0).
                    if self.success_count % 100 == 0:
                        self.logger.info("HBase 插入成功, 成功条数 {} 条".format(
                            self.success_count))
                    # (disabled) Every 50 successes the processed ids used to be
                    # flagged in MongoDB with {"d": 1} via m_client.update_to_mongodb.

                mongo_data_list.close()

                # (disabled) Flag any remaining processed ids in MongoDB with {"d": 1}.
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue
            # (disabled) When status was True the processed documents used to be
            # deleted from MongoDB (self.delete_data_from_mongo) and copied into
            # the "spider_data_old" database, retrying once on
            # ServerSelectionTimeoutError.

        # Close the connections
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #12
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="ZX_HYBG_ITJZ_YHBG",
                         mongo_collection="ZX_HYBG")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass-through cleaning hook: hand the record back unchanged."""
    return data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print every
    # record after it passes through data_shuffle.
    client = MongoClient(entity_code="ZX_CJXW_GJJRJG_ZHRMGHGSWJ_ZSHYFX",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
예제 #14
0
from crm_scripts import GenericScript
from database._mongodb import MongoClient
from tools.web_api_of_baidu import get_lat_lng, get_area


def data_shuffle(data, ):
    """Map a raw activity record onto the target field names.

    Every value is read with ``dict.get``, so a missing source field yields
    ``None`` in the result instead of raising.

    Args:
        data: Mapping holding the raw fields (``TITLE_``, ``PUBLISH_TIME_``,
            ``HTML_``, ``CONTENT_``, ``SOURCE_NAME_``, ``URL_``,
            ``READ_NUM_``, ``BANK_NAME_``).

    Returns:
        A new dict keyed by the target field names. ``ACTIVE_DESC_TEXT_`` is
        ``CONTENT_`` cut to its first 501 characters, or ``None`` when
        ``CONTENT_`` is absent or ``None``.
    """
    # Slicing None would raise TypeError, so only truncate real content.
    content = data.get('CONTENT_')
    desc_text = content[:501] if content is not None else None

    re_data = dict()
    # NOTE(review): 'ACTIME_NAME_' looks like a typo for 'ACTIVE_NAME_'; it is
    # kept as-is because consumers of this dict are not visible here.
    re_data['ACTIME_NAME_'] = data.get('TITLE_')
    re_data['RELEASE_DATE_'] = data.get('PUBLISH_TIME_')
    re_data['ACTIVE_DESC_HTML_'] = data.get('HTML_')
    re_data['ACTIVE_DESC_TEXT_'] = desc_text
    re_data['DATA_SOURCE_NAME_'] = data.get('SOURCE_NAME_')
    re_data['DATA_SOURCE_URL_'] = data.get('URL_')
    re_data['AMOUNT_OF_READING_'] = data.get('READ_NUM_')
    # No source field is wired up for these two yet: they look up the empty
    # key, which is normally absent and therefore yields None.
    re_data['ACTIVE_KEYWORDS_'] = data.get('')
    re_data['ACTIVE_OBJECT_'] = data.get('')

    re_data['BANK_NAME_'] = data.get('BANK_NAME_')

    return re_data


if __name__ == '__main__':
    # Ad-hoc run: load the data for this entity/collection and print the
    # first two records after they pass through data_shuffle.
    client = MongoClient(entity_code="CRMJPFX_YXHD_SHYH",
                         mongo_collection="CRMJPFX_YXHD")
    records = client.main()
    for record in records[:2]:
        print(data_shuffle(data=record))
예제 #15
0
class Entrust(object):
    """Move trust-product documents from MongoDB into the Phoenix ``ENTRUST`` table.

    ``run`` reads documents from the ``JSENTRUST_CCBDATA`` collection, maps
    each one onto the target columns with ``data_shuffle`` and upserts the
    result through Phoenix one row at a time, keeping counters that are
    logged when the run finishes.
    """

    def __init__(self):
        # MongoDB client and the source collection
        self.m_client = MongoClient(mongo_collection="JSENTRUST_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and an open connection to it
        self.p_client = PhoenixHbase(table_name="ENTRUST")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run counters, reported at the end of run()
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the document currently being processed
        self.data_id = ""

    @staticmethod
    def _ms_field_to_date(raw):
        """Return ``YYYY-MM-DD`` for a stored ``{"time": <epoch ms>, ...}`` literal."""
        # NOTE(review): eval() executes whatever text the crawler stored in
        # this field. It is kept to preserve behaviour; consider
        # ast.literal_eval or json.loads once the stored format is confirmed.
        parsed = eval(raw)
        # Drop the last three digits: milliseconds -> seconds.
        seconds = str(parsed["time"])[:-3]
        return str(arrow.get(int(seconds)))[:10]

    def data_shuffle(self, data):
        """Map one raw MongoDB document onto the ``ENTRUST`` columns.

        The common columns are always filled. Source-specific columns are
        added for the entity codes ``CHINATRC``, ``TRUSTHEXUN``, ``YANGLEE``
        and ``TRUSTONE``; any other entity code yields the common columns
        only.

        Args:
            data: The raw document (dict).

        Returns:
            A dict of column name to value, including the row key ``ID_``
            built as ``"<ENTITY_CODE_>_<md5 hex digest of NAME_>"``.

        Raises:
            KeyError: If a field required for the document's entity code is
                missing.
        """
        re_data = dict()
        entity_code = data["ENTITY_CODE_"]

        # HBase row key
        name_digest = hashlib.md5(data["NAME_"].encode("utf-8")).hexdigest()
        re_data["ID_"] = str(entity_code) + "_" + str(name_digest)
        re_data["ENTITY_CODE_"] = entity_code
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["STATUS_"] = "1"
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["URL_"] = data["URL_"]

        if entity_code == "CHINATRC":
            # This source stores its dates as epoch-millisecond literals.
            publish_date = self._ms_field_to_date(data["PUB_DATE_"])
            re_data["PERIOD_CODE_"] = publish_date.replace("-", "")

            re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["FUNCTION_"] = data["FUNCTION_"]

            re_data["PRO_START_"] = self._ms_field_to_date(data["PRO_START_"])
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = publish_date
        elif entity_code == "TRUSTHEXUN":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            # Strip the "至月" suffix this source appends to the period.
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace(
                "至月", "")
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            re_data["CURRENCY_"] = data["CURRENCY_"]
            re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif entity_code == "YANGLEE":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # The source's own STATUS_ goes to ENTRUST_STATUS_; the STATUS_
            # column itself is always "1" (set above).
            re_data["ENTRUST_STATUS_"] = data["STATUS_"]
            re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif entity_code == "TRUSTONE":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"]
            re_data["INVEST_AREA_"] = data["INVEST_AREA_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            re_data["INVEST_DIRECTION_"] = data["INVEST_DIRECTION_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            # Remove simple attribute-less tags such as <p> / </p>.
            re_data["RESERVE_INFO_"] = re.sub(r"</?\w*>", "",
                                              data["RESERVE_INFO_"])

        return re_data

    def run(self):
        """Fetch, clean and upsert every document, then log the run counters.

        A Phoenix ``DatabaseError`` on one row is logged and that row is
        skipped. Exceptions raised by ``data_shuffle`` are not caught and
        abort the run; the MongoDB cursor is closed in either case.
        """
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        #
        # # create table sql
        # table_sql = ('create table "ENTRUST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."CREATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."FUNCTION_" varchar,'
        #              '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar,'
        #              '"C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."ISSUER_" varchar, "C"."PRO_START_" varchar,'
        #              '"C"."INVEST_PERIOD_" varchar,"C"."RUN_MODE_" varchar, "C"."INDUSTRY_" varchar,'
        #              '"C"."PUB_DATE_" varchar, "C"."SCALE_" varchar, "C"."MONTH_" varchar, "C"."YIELD_RATE_" varchar,'
        #              '"C"."START_FUNDS_" varchar, "C"."PURPOSE_" varchar, "C"."ESTAB_ANNOUNCEMENT_" varchar,'
        #              '"C"."ENTRUST_STATUS_" varchar, "C"."DISTRIBU_MODE_" varchar, "C"."INVEST_AREA_" varchar,'
        #              '"C"."TERM_TYPE_" varchar, "C"."INVEST_DIRECTION_" varchar, "C"."INVEST_MODE_" varchar,'
        #              '"C"."CURRENCY_" varchar, "C"."MANAGE_TYPE_" varchar, "C"."SALE_TARGET_" varchar,'
        #              '"C"."PROFIT_TYPE_" varchar, "C"."ISSUER_AREA_" varchar, "C"."RESERVE_INFO_" varchar,'
        #              '"C"."TRUSTEESHIP_BANK_" varchar, "C"."OTHER_INFO_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # NOTE(review): the hard-coded data_id looks like a resume point from
        # an earlier run - confirm against all_from_mongodb before reuse.
        cursor = self.m_client.all_from_mongodb(
            collection=self.collection, data_id="5c67307d9bb3df76b4229f79")

        try:
            # The cursor is advanced by hand so a server timeout can be
            # retried; the range only bounds the number of iterations.
            for _ in range(cursor.count() + 100):
                try:
                    data = next(cursor)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    self.logger.info(
                        "MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                    time.sleep(3)
                    try:
                        data = next(cursor)
                    except StopIteration:
                        # The cursor was exhausted while we waited.
                        break

                # Count every fetched document so the final summary reports
                # the real number instead of the initial 0.
                self.find_count += 1

                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])
                # todo remove and upsert data from mongo

                # Cleaning errors are deliberately left uncaught here.
                re_data = self.data_shuffle(data=data)

                if not re_data:
                    self.bad_count += 1
                    continue

                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                if success_count > 0:
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
        finally:
            # Release the server-side cursor even when a record aborts the run.
            cursor.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #16
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return ``data`` exactly as received; no cleaning rule is applied."""
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_GJJRJG_ZGZXQYXXW_XXJL",
        mongo_collection="ZX_CJXW_HY",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #17
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Pass the record through untouched (placeholder cleaning step)."""
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_HY_QCYJW_HGJJ",
        mongo_collection="ZX_CJXW_HY",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #18
0
# -*- coding: utf-8 -*-
"""21世纪经济网-商业  ZX_CJXW_ZYCJ_21SJJJW_SHY"""
import re
from database._mongodb import MongoClient


def data_shuffle(data):
    """Strip the copyright paragraph and the "返回21经济首页" link from ``HTML_``.

    Args:
        data: record dict; ``data["HTML_"]`` must be a string.

    Returns:
        The same dict object, with ``HTML_`` cleaned in place.

    Raises:
        KeyError: if ``HTML_`` is missing from ``data``.
    """
    html = re.sub(r"<p class=\"copyright\".*?</p>", "", data["HTML_"])
    # The match has to stay inside a single anchor element.  A plain
    # ``<a.*?...`` starts at the first "<a" on the line, so it would also
    # delete every earlier link together with the text in between.  The
    # tempered dot refuses to cross another "<a" or a "</a>", and ``\b``
    # keeps tags such as <abbr>/<article> from being treated as anchors.
    html = re.sub(r"<a\b(?:(?!<a\b|</a>).)*?返回21经济首页.*?</a>", "", html)
    data["HTML_"] = html
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_ZYCJ_21SJJJW_SHY",
        mongo_collection="ZX_CJXW_ZYCJ",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #19
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Identity cleaning step: hand back the record that was passed in."""
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_GJJRJG_ZGYJH_YJH",
        mongo_collection="ZX_CJXW_HY",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #20
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return the incoming record without modifying it."""
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_HY_ZGDZQYXH_HYTJ",
        mongo_collection="ZX_CJXW_HY",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #21
0
import os
from database._mongodb import MongoClient


def data_shuffle(data):
    """Derive the insurer name and local PDF location from ``PRO_NAME``.

    Returns:
        The same dict with ``PRO_NAME_``, ``LOCAL_PDF_PATH_`` and
        ``LOCAL_PDF_NAME_`` filled in (plus ``COM_NAME_`` when the first two
        characters of the product name match a known insurer), or ``None``
        when ``PRO_NAME`` is missing or empty.
    """
    product_name = data.get("PRO_NAME")
    if not product_name:
        return None

    data["PRO_NAME_"] = product_name

    # An insurer is recognised by the first two characters of its name.
    insurers = ("光大永明人寿保险公司", "太平人寿保险公司", "中信保诚人寿保险公司")
    insurer_by_prefix = {insurer[:2]: insurer for insurer in insurers}
    insurer = insurer_by_prefix.get(product_name[:2])
    if insurer is not None:
        data["COM_NAME_"] = insurer

    # The PDF path is built relative to the directory holding this script.
    script_dir = os.path.abspath(os.path.dirname(__file__))
    pdf_stem = product_name.replace("产品计划", "")
    data["LOCAL_PDF_PATH_"] = script_dir + "/渤海代理保险合同条款/" + pdf_stem + ".pdf"
    data["LOCAL_PDF_NAME_"] = product_name + "条款"

    # A non-empty PDF_ value is removed from the record.
    if data.get("PDF_"):
        data.pop("PDF_")

    return data


if __name__ == '__main__':
    # Manual check: run every item returned by MongoClient.main() through
    # data_shuffle (printing of the result is disabled).
    client = MongoClient(
        entity_code="JRCP_BX_BHYH_GW_ALL",
        mongo_collection="JRCP_BX",
    )
    for record in client.main():
        data_shuffle(record)
예제 #22
0
"""中国民生银行 理财产品 JRCP_LCCP_ZGMSYH_GW_ALL"""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Normalise the risk level and starting funds of a wealth product.

    Args:
        data: record dict scraped for one product.

    Returns:
        The same dict, mutated in place, or ``None`` when the record has no
        ``PRO_NAME_``.
    """
    if "PRO_NAME_" not in data:
        return None

    # Risk level: source values "1".."5" map to "R1".."R5".
    # The guard key (SOURCE_RISK_LEVEL_CODE_) differs from the key that is
    # read (SOURCE_RISK_LEVEL_); ``get`` keeps a record that carries only the
    # guard key from raising KeyError.
    # NOTE(review): confirm which of the two keys the crawler really emits.
    if "SOURCE_RISK_LEVEL_CODE_" in data:
        level = data.get("SOURCE_RISK_LEVEL_")
        if level in ("1", "2", "3", "4", "5"):
            data["RISK_LEVEL_CODE_"] = "R" + level

    # Starting funds: drop thousands separators, e.g. "50,000" -> "50000".
    if "START_FUNDS_" in data:
        data["START_FUNDS_"] = data["START_FUNDS_"].replace(",", "")

    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="JRCP_LCCP_ZGMSYH_GW_ALL",
        mongo_collection="JRCP_LCCP",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #23
0
class JsInsuranceCcbData(object):
    def __init__(self):
        """Open the MongoDB, MySQL and Phoenix clients and reset the counters."""
        # MongoDB client and the collection the documents are read from.
        self.m_client = MongoClient(mongo_collection="JSINSURANCE_CCBDATA")
        mongo_db, mongo_collections = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=mongo_db, collection_list=mongo_collections)

        # MySQL client: used only to load the TYPE dictionary rows, then closed.
        dict_client = MysqlClient(
            host=MYSQL_HOST_25,
            port=MYSQL_PORT_25,
            database=MYSQL_DATABASE_25,
            user=MYSQL_USER_25,
            password=MYSQL_PASSWORD_25,
            table=MYSQL_TABLE_25,
        )
        dict_connection = dict_client.client_to_mysql()
        self.type = dict_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'TYPE\'",
            connection=dict_connection)
        dict_client.close_client(connection=dict_connection)

        # Phoenix client and its connection for the "INSURANCE" table.
        self.p_client = PhoenixHbase(table_name="INSURANCE")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run statistics, reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the document currently being processed.
        self.data_id = ""
        self.a = []

    def data_shuffle(self, data):
        if data["ENTITY_CODE_"] == "PAINSURANCE":
            return None
        elif data["ENTITY_CODE_"] == "BJBINSURANCE":
            data["CONTET_"] = data["CONTET_"].replace("|主险2:", "主险2:")
            first_shuffle = data["CONTET_"].split("|")
            data_list = list()
            company_dict = dict()
            index_list = list()
            for first in first_shuffle:
                if first[-2:] == "公司":
                    company_index = first_shuffle.index(first)
                    company_dict[company_index] = first
                    index_list.append(company_index)
                else:
                    continue

            for key in index_list:
                # print(index_list)
                j = key + 1
                for i in range(100):
                    if index_list.index(key) == len(index_list) - 1:
                        if j == len(first_shuffle) - 1:
                            break
                    else:
                        if j == index_list[index_list.index(key) + 1]:
                            break

                    data_dict = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(first_shuffle[j].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    data_dict["ID_"] = row_key
                    data_dict["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    data_dict["ENTITY_NAME_"] = data["ENTITY_NAME_"].replace(
                        "模板", "产品")
                    data_dict["BANK_CODE_"] = "BJB"
                    data_dict["BANK_NAME_"] = "北京银行"
                    data_dict["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    data_dict["URL_"] = data["URL_"]
                    data_dict["PRODUCT_NAME_"] = first_shuffle[j]
                    j += 1
                    # data_dict["TYPE_"] = first_shuffle[j]
                    data_dict["TYPE_"] = ""
                    data_dict["TYPE_CODE_"] = ""
                    for i in self.type:
                        if i["ITEM_LABEL_"][:-1] in first_shuffle[j]:
                            data_dict["TYPE_"] = data_dict["TYPE_"] + i[
                                "ITEM_LABEL_"] + "|"
                            data_dict["TYPE_CODE_"] = data_dict[
                                "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                    data_dict["TYPE_"] = data_dict["TYPE_"][:-1]
                    data_dict["TYPE_CODE_"] = data_dict["TYPE_CODE_"][:-1]
                    j += 1
                    # data_dict["RISK_LEVEL_"] = first_shuffle[j]
                    j += 1
                    data_dict["PAY_METHOD_"] = first_shuffle[j]
                    j += 1
                    # data_dict["INSURANCE_DATE_"] = first_shuffle[j]
                    j += 1
                    # data_dict["TOUZIZHE_TYPE_"] = first_shuffle[j]
                    j += 1
                    data_dict["COM_NAME_"] = company_dict[key]
                    # data_dict["CONSIGNMENT_"] = "代销"
                    # if "CONTENT_" in data:
                    #     data_dict["CONTENT_"] = data["CONTENT_"]
                    data_dict["DEALTIME_"] = data["DEALTIME_"]
                    data_dict["CREATE_TIME_"] = data["DATETIME_"]
                    data_dict["STATUS_"] = "1"
                    # print(data_dict)
                    data_list.append(data_dict)

            return data_list

        elif data["ENTITY_CODE_"] == "CIBINSURANCE":
            data_list = list()
            insurance_name = re.findall(r".*?计划", data["PRODUCT_NAME_"])
            for name in insurance_name:
                re_data = dict()
                # HBase row_key
                hash_m = hashlib.md5()
                hash_m.update(name.encode("utf-8"))
                hash_title = hash_m.hexdigest()
                row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                # "C"
                re_data["ID_"] = row_key
                re_data["PRODUCT_NAME_"] = name
                re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                re_data["BANK_CODE_"] = "CIB"
                re_data["BANK_NAME_"] = "兴业银行"
                re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                    "-", "")
                re_data["URL_"] = data["URL_"]
                re_data["DEALTIME_"] = data["DEALTIME_"]
                re_data["CREATE_TIME_"] = data["DATETIME_"]
                re_data["STATUS_"] = "1"

                re_data["TYPE_"] = ""
                re_data["TYPE_CODE_"] = ""
                for i in self.type:
                    if i["ITEM_LABEL_"][:-1] in name:
                        re_data["TYPE_"] = re_data["TYPE_"] + i[
                            "ITEM_LABEL_"] + "|"
                        re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"] + i[
                            "ITEM_VALUE_"] + "|"
                re_data["TYPE_"] = re_data["TYPE_"][:-1]
                re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                data_list.append(re_data)

            return data_list

        else:
            if "INSURANCE_NAME_" not in data and ("PRODUCT_NAME_" not in data):
                return None
            else:
                if "INSURANCE_NAME_" in data:
                    # # 承保年龄
                    # if ("INSURANCE_AGE_" not in data) or (not data["INSURANCE_AGE_"]):
                    #     age = re.findall(r"(\d*)周岁", data["INSURANCE_NAME_"])
                    #     if age:
                    #         data["INSURANCE_AGE_"] = age[0]

                    # 保障期限
                    # if ("INSURANCE_DATE_" not in data) or (not data["INSURANCE_DATE_"]):
                    #     limit = re.findall(r"保(终身)|保(\d*年)|(\d*年)期", data["INSURANCE_NAME_"])
                    #     if limit:
                    #         for l in limit[0]:
                    #             if l:
                    #                 data["INSURANCE_DATE_"] = l
                    #                 break

                    re_data = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(data["INSURANCE_NAME_"].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    re_data["ID_"] = row_key
                    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                    re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
                        "INSURANCE", "")
                    re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace(
                        "保险产品", "")
                    if "INSURANCE_NAME_" in data:
                        re_data["PRODUCT_NAME_"] = data["INSURANCE_NAME_"]
                    if ("INSURANCE_AGE_" in data) or ("AGE_" in data):
                        re_data["AGE_"] = data["INSURANCE_AGE_"]
                    if "TYPE_" in data:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        if data["TYPE_"] == "财险":
                            re_data["TYPE_"] = "财产险"
                            re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                        else:
                            for i in self.type:
                                if i["ITEM_LABEL_"][:-1] in data["TYPE_"]:
                                    re_data["TYPE_"] = re_data["TYPE_"] + i[
                                        "ITEM_LABEL_"] + "|"
                                    re_data["TYPE_CODE_"] = re_data[
                                        "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                            re_data["TYPE_"] = re_data["TYPE_"][:-1]
                            re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    else:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        for i in self.type:
                            if i["ITEM_LABEL_"][:-1] in data["ENTITY_NAME_"]:
                                re_data["TYPE_"] = re_data["TYPE_"] + i[
                                    "ITEM_LABEL_"] + "|"
                                re_data["TYPE_CODE_"] = re_data[
                                    "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                        re_data["TYPE_"] = re_data["TYPE_"][:-1]
                        re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    # if "INSURANCE_DATE_" in data:
                    #     re_data["INSURANCE_DATE_"] = data["INSURANCE_DATE_"]
                    if "INSURANCE_DETAIL_" in data:
                        re_data["PRODUCT_DETAIL_"] = data["INSURANCE_DETAIL_"]
                    if "COMPANY_NAME_" in data:
                        re_data["COM_NAME_"] = data["COMPANY_NAME_"]
                    if "LIMIT_NUMBER_" in data:
                        re_data["BUY_LIMIT_"] = data["LIMIT_NUMBER_"]
                    # re_data["AREA_CODE_"]
                    # re_data["UNIT_CODE_"]
                    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    if "CONTENT_" in data:
                        re_data["CONTENT_"] = data["CONTENT_"]
                    # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
                    re_data["STATUS_"] = "1"
                    # re_data["REMARK_"] = ""
                    re_data["CREATE_TIME_"] = data["DATETIME_"]
                    # re_data["UPDATE_TIME_"]
                    # re_data["TITLE_"] = data["TITLE_"]
                    re_data["URL_"] = data["URL_"]
                    re_data["DEALTIME_"] = data["DEALTIME_"]
                    # re_data["DATETIME_"] = data["DATETIME_"]

                    return re_data

                elif "PRODUCT_NAME_" in data:
                    re_data = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(data["PRODUCT_NAME_"].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    re_data["ID_"] = row_key
                    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                    # re_data["BANK_CODE_"] = data["ENTITY_CODE_"]
                    # re_data["BANK_NAME_"] = data["ENTITY_NAME_"]
                    if "PRODUCT_NAME_" in data:
                        re_data["PRODUCT_NAME_"] = data["PRODUCT_NAME_"]
                    if "FEATURE_NAME_" in data:
                        re_data["FEATURE_NAME_"] = data["FEATURE_NAME_"]
                    if "TYPE_" in data:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        if data["TYPE_"] == "财险":
                            re_data["TYPE_"] = "财产险"
                            re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                        elif data["TYPE_"] == "100种疾病保障":
                            re_data["TYPE_"] = "健康险"
                            re_data["TYPE_CODE_"] = "HEALTH_INSURANCE"
                        else:
                            for i in self.type:
                                if i["ITEM_LABEL_"][:-1] in data["TYPE_"]:
                                    re_data["TYPE_"] = re_data["TYPE_"] + i[
                                        "ITEM_LABEL_"] + "|"
                                    re_data["TYPE_CODE_"] = re_data[
                                        "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                            re_data["TYPE_"] = re_data["TYPE_"][:-1]
                            re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    if "POLICY_DUTY_" in data:
                        re_data["POLICY_DUTY_"] = data["POLICY_DUTY_"]
                    if "PRODUCT_CASE_" in data:
                        re_data["PRODUCT_CASE_"] = data["PRODUCT_CASE_"]
                    if "BUY_LIMIT_" in data:
                        re_data["BUY_LIMIT_"] = data["BUY_LIMIT_"]
                    if "ENSURE_PRICE_" in data:
                        re_data["ENSURE_PRICE_"] = data["ENSURE_PRICE_"]
                    # re_data["AREA_CODE_"]
                    # re_data["UNIT_CODE_"]
                    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    if "PRODUCT_PRICE_" in data:
                        re_data["PRODUCT_PRICE_"] = data["PRODUCT_PRICE_"]
                    if "PRODUCT_ID_" in data:
                        re_data["PRODUCT_ID_"] = data["PRODUCT_ID_"]
                    if "PRODUCT_CLAUSE_" in data:
                        re_data["PRODUCT_CLAUSE_"] = data["PRODUCT_CLAUSE_"]
                    if "GENDER_" in data:
                        re_data["GENDER_"] = data["GENDER_"]
                    if "AGE_" in data:
                        re_data["AGE_"] = data["AGE_"]
                    if "COM_NAME_" in data:
                        re_data["COM_NAME_"] = data["COM_NAME_"]
                    if "PAY_METHOD_" in data:
                        re_data["PAY_METHOD_"] = data["PAY_METHOD_"]
                    if "PROBLEM_" in data:
                        re_data["PROBLEM_"] = data["PROBLEM_"]
                    if "CLAIM_" in data:
                        re_data["CLAIM_"] = data["CLAIM_"]
                    if "COMMENT_" in data:
                        re_data["COMMENT_"] = data["COMMENT_"]
                    if "ENSURE_CONTENT_" in data:
                        re_data["ENSURE_CONTENT_"] = data["ENSURE_CONTENT_"]
                    if "INSURE_INFO_" in data:
                        re_data["INSURE_INFO_"] = data["INSURE_INFO_"]
                    if "RATE_INFO_" in data:
                        re_data["RATE_INFO_"] = data["RATE_INFO_"]
                    if "SALE_SERVICE_" in data:
                        re_data["SALE_SERVICE_"] = data["SALE_SERVICE_"]

                    # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
                    re_data["STATUS_"] = "1"
                    # re_data["REMARK_"] = ""
                    re_data["CREATE_TIME_"] = data["DATETIME_"]
                    # re_data["UPDATE_TIME_"]
                    # re_data["TITLE_"] = data["TITLE_"]
                    re_data["URL_"] = data["URL_"]
                    re_data["DEALTIME_"] = data["DEALTIME_"]
                    # re_data["DATETIME_"] = data["DATETIME_"]

                    return re_data

    def run(self):
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "INSURANCE" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,'
        #              ' "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, '
        #              '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."TYPE_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C".PRODUCT_CLAUSE_ varchar,'
        #              '"C"."SOURCE_" varchar, "C"."PRODUCT_NAME_" varchar, "C"."FEATURE_NAME_" varchar,'
        #              '"C"."POLICY_DUTY_" varchar, "C"."PRODUCT_CASE_" varchar, "C"."BUY_LIMIT_" varchar,'
        #              '"C"."ENSURE_PRICE_" varchar, "C"."PRODUCT_PRICE_" varchar, "C"."PRODUCT_ID_" varchar,'
        #              '"C"."GENDER_" varchar, "C"."AGE_" varchar, "C"."COM_NAME_" varchar, "C"."TYPE_CODE_" varchar,'
        #              '"C"."PAY_METHOD_" varchar, "C"."PRODUCT_DETAIL_" varchar, "C"."PROBLEM_" varchar,'
        #              '"C"."CLAIM_" varchar, "C"."COMMENT_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."ENSURE_CONTENT_" varchar, "C"."INSURE_INFO_" varchar, "C"."RATE_INFO_" varchar,'
        #              '"C"."SALE_SERVICE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            # print(data["_id"])
            # todo remove and upsert data from mongo

            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if re_data:
                if isinstance(re_data, dict):
                    # upsert data to HBase
                    try:
                        success_count = self.p_client.upsert_to_phoenix_by_one(
                            connection=self.connection, data=re_data)
                    except jaydebeapi.DatabaseError as e:
                        self.logger.info("错误 id: {}, 错误信息 {}".format(
                            self.data_id, e))
                        continue

                elif isinstance(re_data, list):
                    for r_d in re_data:
                        # upsert data to HBase
                        try:
                            success_count = self.p_client.upsert_to_phoenix_by_one(
                                connection=self.connection, data=r_d)
                        except jaydebeapi.DatabaseError as e:
                            self.logger.info("错误 id: {}, 错误信息 {}".format(
                                self.data_id, e))
                            continue

            #     # add {d:1}
            #     try:
            #         self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id,
            #                                         data_dict={"d": 1})
            #         self.remove_count += 1
            #         if self.remove_count % 10 == 0:
            #             self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count))
            #     except Exception as e:
            #         self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e))
            #         continue
                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #24
0
def data_shuffle(data):
    """Extract the residence name from a rental listing title.

    ``NAME_`` is stripped of the "整租·" / "独栋·" prefixes and cut just before
    the character preceding the first "室".  The cleaned text is stored in
    ``TITLE_``; ``NAME_`` is replaced by the first regex match when there is
    one and otherwise keeps the cleaned text.

    Args:
        data: record dict; ``data["NAME_"]`` must be a string.

    Returns:
        The same dict, mutated in place.

    Raises:
        KeyError: if ``NAME_`` is missing.
    """
    # Residence name
    name = data['NAME_'].replace('整租·', '').replace('独栋·', '')

    # str.find replaces the former bare ``except:`` around str.index, which
    # also swallowed unrelated errors such as KeyboardInterrupt.
    # NOTE(review): when "室" is the very first character the slice end is -1,
    # which drops the last character -- behaviour kept as it was; confirm intent.
    room_at = name.find('室')
    if room_at != -1:
        name = name[:room_at - 1]

    # NOTE(review): both operands of this ``or`` look identical in the source;
    # one of them may originally have been a different dot-like character.
    if "·" in name or "·" in name:
        pattern = r"[\u4e00-\u9fa5]{2}[^\w]([\w()\-().,,]+)\|"
    else:
        pattern = r"\|([\w()\-().,,]+)\|"
    house_name = re.findall(pattern, name)

    data["TITLE_"] = name
    data["NAME_"] = house_name[0] if house_name else name
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="WD_JZ_FJ_LIXQZL_FS",
        mongo_collection="WD_JZ_FJ_FS",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #25
0
# -*- coding: utf-8 -*-
"""南海农商银行 官网动态 ZX_GWDT_NHYH_NHXW"""
import re
from database._mongodb import MongoClient


def data_shuffle(data):
    """Stamp the record with the bank name and bank code; return it."""
    # Disabled CONTENT_/HTML_ clean-up, kept for reference:
    # if data["CONTENT_"]:
    #     data["CONTENT_"] = re.sub(r"/\*[^\u4e00-\u9fa5]+", "", data["CONTENT_"], count=1)
    # if data["HTML_"]:
    #     data["HTML_"] = re.sub("<p[^>]+align=\"center\">.*?</p>", "", data["HTML_"])
    data.update(BANK_NAME_="南海农商银行", BANK_CODE_="NRCB")
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_GWDT_NHYH_NHXW",
        mongo_collection="ZX_GWDT",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #26
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """No-op cleaning step: the record is returned as-is."""
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_GJJRJG_QGZXQYGZXT_LWJTGGS",
        mongo_collection="ZX_CJXW_HY",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #27
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Give back the record that was passed in, without changes."""
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_HY_DYSYW_SYDC",
        mongo_collection="ZX_CJXW_HY",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #28
0
# -*- coding: utf-8 -*-
"""
无 CONTENT_
"""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Ensure ``CONTENT_`` and ``HTML_`` exist, defaulting each to ``""``."""
    for required_key in ("CONTENT_", "HTML_"):
        # Existing values (even empty or None) are left untouched.
        data.setdefault(required_key, "")
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_ZCGG_SJS_ZJHL",
        mongo_collection="ZX_ZCGG",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #29
0
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return the record unchanged; nothing is cleaned here."""
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_CJXW_HY_ZGJCXXZW_YDYL",
        mongo_collection="ZX_CJXW_HY",
    )
    for record in client.main():
        print(data_shuffle(record))
예제 #30
0
# -*- coding: utf-8 -*-
""""ZX_GWDT_GSYH_GHKX": "中国工商银行|ICBC","""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Stamp the record with the bank name and bank code; return it."""
    data.update(BANK_NAME_="中国工商银行", BANK_CODE_="ICBC")
    return data


if __name__ == '__main__':
    # Manual check: print every item returned by MongoClient.main() after
    # it has gone through data_shuffle.
    client = MongoClient(
        entity_code="ZX_GWDT_GSYH_GHKX",
        mongo_collection="ZX_GWDT",
    )
    for record in client.main():
        print(data_shuffle(record))