예제 #1
0
    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(db=db, collection_list=collection_list)
        # 创建 MongoDB spider_data_old 数据库对象
        # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["FINPRODUCT_FINASSIST"]

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # # 表创建语句
        # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, '
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar,'
        #        '"C"."YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar, "C"."START_FUNDS_" varchar,'
        #        '"C"."INVEST_PERIOD_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,'
        #        '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,'
        #        '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #        '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,'
        #        '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,'
        #        '"C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar, "C"."YIELD_STATMENT_" varchar,'
        #        '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,'
        #        '"C"."PURCHASE_" varchar, "T"."CONTENT_" varchar, "C"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 增加列
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
            # for entity in self.entity_list:
            status = False
            module_name = __import__(entity)
            self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            # find_id = "5c3f118f8d7fee068da6ef53"
            find_id = None
            try:
                if entity == "JSFIN_CCBDATA":
                    m_client.mongo_collection = "JSFIN_CCBDATA"
                    collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                    mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client,
                                                                                collection=collection,
                                                                                data_id=None)
                else:
                    m_client.mongo_collection = "FINPRODUCT_FINASSIST"
                    collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                    mongo_data_list = self.get_data_from_mongo(m_client=m_client,
                                                               collection=collection, entity_code=entity,
                                                               data_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                if entity == "JSFIN_CCBDATA":
                    m_client.mongo_collection = "JSFIN_CCBDATA"
                    mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client,
                                                                                collection=collection,
                                                                                data_id=None)
                else:
                    m_client.mongo_collection = "FINPRODUCT_FINASSIST"
                    collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                    mongo_data_list = self.get_data_from_mongo(m_client=m_client,
                                                               collection=collection, entity_code=entity,
                                                               data_id=find_id)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    copy_data = {}
                    self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        copy_data = deepcopy(data)
                        self.copy_mongo_data_list.append(copy_data)
                        if entity == "CHINANETFINANCIAL":
                            re_data = module_name.data_shuffle(data=data, sales_status=self.sales_status,
                                                               produc_category=self.produc_category,
                                                               revenue_type=self.revenue_type,
                                                               operaton_pattern=self.operaton_pattern,
                                                               purchase_amount=self.purchase_amount,
                                                               duration_type=self.duration_type)
                        elif entity == "JSFIN_CCBDATA":
                            re_data = module_name.ScriptCCB.data_shuffle(self=self, data=data)
                        else:
                            re_data = module_name.data_shuffle(data)

                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e))
                        continue

                    print(data_id)

                    # phoenix_HBase 插入数据
                    if isinstance(re_data, dict):
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data)
                            once_count += success_count
                            self.success_count += success_count
                            # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                        except Exception as e:
                            self.remove_id_list.remove(data_id)
                            self.copy_mongo_data_list.remove(data)
                            self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
                            continue
                    elif isinstance(re_data, list):
                        for r_data in re_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(connection=connection,
                                                                                  data=r_data)
                                once_count += success_count
                                self.success_count += success_count
                                # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                            except Exception as e:
                                self.remove_id_list.remove(data_id)
                                self.copy_mongo_data_list.remove(data)
                                self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
                                continue
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue
            break
            # # 删除数据
            # if status:
            #     delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
            #                                                entity_code=entity,
            #                                                remove_id_list=self.remove_id_list)
            #     self.remove_count += delete_count
            #     # self.logger.info("MongoDB 删除成功")
            # else:
            #     self.logger.info("HBase 插入成功条数0条, 不执行删除")
            #
            # 将数据插入 spider_data_old 中
            # if status:
            #     try:
            #         old_client.mongo_db = "spider_data_old"
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except pymongo.errors.ServerSelectionTimeoutError as e:
            #         sleep(1)
            #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except Exception as e:
            #         self.logger.info(e)

            # 关闭连接
        m_client.client_close()
        # p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()