Пример #1
0
def run():
    script = GenericScript(entity_code="86ZGRM", entity_type="CommonBidding")

    # 从 MongoDB 获取数据
    mongo_data_list = script.data_from_mongo()
    data_list = data_shuffle(mongo_data_list)
    # 创建 Phoenix 对象
    p_client = PhoenixHbase(table_name="CommonBidding")
    # 连接 Phoenix
    connection = p_client.connect_to_phoenix()
    # 插入数据
    p_client.upsert_to_phoenix(connection=connection, data_list=data_list)
    # 关闭连接
    p_client.close_client_phoenix(connection=connection)
Пример #2
0
    def run(self):
        count = 0
        # # 创建 Phoenix 对象-注意表格名字
        p_client = PhoenixHbase(table_name="FUND")
        p_client.verify_list = self.verify_list
        # # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        #查询省市区的编码列表
        # script = GenericScript(entity_code="ICBCFUND", entity_type="JSFUND_CCBDATA")
        # province_list, city_list, area_list, dir_area_list = script.area_from_mysql()
        list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
        list_TYPE = self.dict_from_mysql("FUND_TYPE")

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # # 基金表创建语句
        # sql = ('create table "FUND" ("ID_" varchar primary key,"C"."ENTITY_CODE_" varchar,"C"."AREA_CODE_" varchar,'
        #     '"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,"C"."UNIT_CODE_" varchar,"C"."PERIOD_CODE_" varchar,"C"."REMARK_" varchar,'
        #     '"C"."CREATE_TIME_" varchar,"C"."UPDATE_TIME_" varchar,"C"."STATUS_" varchar,"C"."CODE_" varchar,"C"."NAME_" varchar,'
        #     '"C"."FUND_OLD_VALUE_" varchar,"C"."TOTAL_OLD_VALUE_" varchar,"C"."FUND_NEW_VALUE_" varchar,"C"."TOTAL_NEW_VALUE_" varchar,'
        #     '"C"."INVEST_PERIOD_" varchar,"C"."DAILY_RATE_" varchar,"C"."YEAR_REWARD_" varchar,"C"."SUBS_STATUS_" varchar,'
        #     '"C"."ATONEM_STATUS_" varchar,"C"."TYPE_" varchar,"C"."NEWEST_VALUE_" varchar,"C"."TOTAL_VALUE_" varchar,'
        #     '"C"."POPULARITY_" varchar,"C"."RATING_" varchar,"C"."ENTITY_NAME_" varchar,"C"."OLD_VALUE_" varchar,'
        #     '"C"."UNIT_VALUE_" varchar,"C"."SCALE_" varchar,"C"."ESTABLISH_DATE_" varchar,"C"."RISK_LEVEL_" varchar,'
        #     '"C"."BASE_INFO_" varchar,"C"."YIELD_" varchar,"C"."INVEST_" varchar,"C"."MONTH_RATE_" varchar,'
        #     '"C"."QUARTER_RATE_" varchar,"C"."HALF_YEAR_RATE_" varchar,"C"."URL_" varchar,"C"."HISTORY_RATE_" varchar,'
        #     '"C"."FUND_STATUS_" varchar,"C"."COMPANY_" varchar,"C"."SUBS_STATUS_CODE_" varchar,"C"."TYPE_CODE_" varchar)IMMUTABLE_ROWS = true')
        #
        # # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 遍历 ENTITY_CODE_ 列表
        # self.code_list = ["ABCORGANIZE"]
        for entity_code in self.code_list:
            status = False
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count = mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    copy_data = {}
                    self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        copy_data = deepcopy(data)
                        self.copy_mongo_data_list.append(copy_data)
                        # re_data = module_name.data_shuffle(data_list,province_list, city_list, area_list,list_SUBS_STATUS,list_TYPE)
                        re_data = module_name.data_shuffle(
                            data, list_SUBS_STATUS, list_TYPE)
                        # re_data = module_name.data_shuffle(data_list)

                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        # except jpype._jexception.SQLExceptionPyRaisable as e:
                        # except org.apache.phoenix.exception.BatchUpdateExecution as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            # try:
                            #     area_data = self.shuffle_for_area(list_data)
                            # except Exception as e:
                            #     self.remove_id_list.remove(data_id)
                            #     self.copy_mongo_data_list.remove(copy_data)
                            #     self.logger.warning("_id:{} 获取经纬度失败, {}".format(data_id, e))
                            #     continue
                            # except ValueError:
                            #     pass
                            # phoenix_HBase 插入数据
                            if list_data:
                                try:
                                    count += 1
                                    print(count)
                                    # print(list_data)
                                    success_count = p_client.upsert_to_phoenix_by_one(
                                        connection=connection, data=list_data)
                                    # 导出csv
                                    # pd.DataFrame(area_data).to_csv("E:\\NEWS_CLEAN_\\" + module_name+ ".csv")
                                    once_count += success_count
                                    self.success_count += success_count
                                    self.logger.info(
                                        "HBase 插入成功, 成功条数 {} 条".format(
                                            success_count))
                                    if self.success_count % 50 == 0:
                                        update_count = m_client.update_to_mongodb(
                                            collection=collection,
                                            data_id=self.remove_id_list,
                                            data_dict={"d": 1})

                                        self.remove_count += update_count
                                        self.logger.info("MongoDB 更新成功")
                                except Exception as e:
                                    self.remove_id_list.remove(data_id)
                                    self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.warning(
                                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue
                    elif isinstance(re_data, dict):
                        # try:
                        # area_data = self.shuffle_for_area(re_data)
                        # except Exception as e:
                        #     self.remove_id_list.remove(data_id)
                        #     self.copy_mongo_data_list.remove(copy_data)
                        #     self.logger.warning("_id: {}获取经纬度失败, {}".format(data_id, e))
                        #     continue
                        # phoenix_HBase 插入数据
                        if re_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=re_data)
                                once_count += success_count
                                self.success_count += success_count
                                if self.success_count % 100 == 0:
                                    self.logger.info(
                                        "HBase 插入成功, 成功条数 {} 条".format(
                                            self.success_count))
                                # 添加 {d:1}
                                if self.success_count % 50 == 0:
                                    update_count = m_client.update_to_mongodb(
                                        collection=collection,
                                        data_id=self.remove_id_list,
                                        data_dict={"d": 1})

                                    self.remove_count += update_count
                                    self.logger.info("MongoDB 更新成功")
                            except Exception as e:
                                self.remove_id_list.remove(data_id)
                                self.copy_mongo_data_list.remove(copy_data)
                                self.logger.warning(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #3
0
    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="CommonBidding")
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="CommonBidding")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # 创建 MongoDB spider_data_old 数据库对象
        old_client = MongoClient(mongo_collection="CommonBidding")
        # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        old_client.mongo_db = "spider_data_old"
        db_old, collection_list_old = old_client.client_to_mongodb()
        collection_old = db_old["CommonBidding"]

        # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # 招投标表创建语句
        # sql = ('create table "CommonBidding" ("ID_" varchar primary key, "F"."CONTENT_" varchar,'
        #        '"F"."NOTICE_TIME_" varchar,"F"."TITLE_" varchar,"F"."PROJECT_NAME_" varchar,'
        #        '"F"."BID_CONTENT_" varchar, "F"."SIGN_START_TIME_" varchar, "F"."SIGN_END_TIME_" varchar,'
        #        '"F"."OPEN_BID_TIME_" varchar, "F"."OPEN_BID_PLACE_" varchar, "F"."BID_AGENCY_" varchar,'
        #        '"F"."APPLY_CONDITION_" varchar, "F"."SIGN_QUALIFICATION_" varchar, "F"."PROJECT_ID_" varchar,'
        #        '"F"."WIN_CANDIDATE_" varchar, "F"."CANDIDATE_RANK_" varchar, "F"."BID_" varchar,"F"."URL_" varchar,'
        #        '"F"."DEALTIME_" varchar, "F"."ENTITY_NAME_" varchar, "F"."ENTITY_CODE_" varchar,'
        #        '"F"."ENTITY_STATUS_" varchar, "F"."SIGN_MATERIAL_" varchar, "F"."BID_TYPE_" varchar,'
        #        '"F"."DATETIME_" varchar, "F"."BUDGET_PRICE_" varchar, "F"."PASS_REASON_" varchar,'
        #        '"F"."PRESALE_CONTENT_" varchar, "F"."PRESALE_WAY_" varchar,"F"."PRESALE_START_TIME_" varchar,'
        #        '"F"."PRESALE_END_TIME_" varchar,"F"."PRESALE_ADDR_" varchar,"F"."PRESALE_PREPARE_" varchar,'
        #        '"F"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 增加列
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        # 遍历 ENTITY_CODE_ 列表
        # self.file_list = ["CommonBidding_86JCW"]
        for f in self.file_list:
            status = False
            entity_code = f.replace(".py", "")
            module_name = __import__(entity_code)
            entity_code_mongo = entity_code.replace("CommonBidding_", "")
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code_mongo))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                try:
                    self.find_count += mongo_data_list.count()
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(1)
                    self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    self.remove_id_list.append(data_id)
                    del data["_id"]
                    # 深拷贝源数据,用于插入 spider_data 库中
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    # 数据清洗
                    try:
                        re_data = module_name.data_shuffle(data)
                        final_data = self.shuffle_data(re_data)
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue
                    # phoenix_HBase 插入数据
                    if final_data:
                        try:
                            p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=final_data)
                            once_count += 1
                        except Exception as e:
                            self.remove_id_list.remove(data_id)
                            self.copy_mongo_data_list.remove(copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

                # 删除数据
                if status:
                    delete_count = self.delete_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code_mongo,
                        remove_id_list=self.remove_id_list)
                    self.remove_count += delete_count
                else:
                    self.logger.info("HBase 插入成功条数0条, 不执行删除")

                # 将数据插入 spider_data_old 中
                if status:
                    try:
                        old_client.mongo_db = "spider_data_old"
                        insert_count = old_client.all_to_mongodb(
                            collection=collection_old,
                            insert_list=self.copy_mongo_data_list)
                        self.old_count += insert_count
                        # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
                    except pymongo.errors.ServerSelectionTimeoutError as e:
                        time.sleep(1)
                        self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
                        insert_count = old_client.all_to_mongodb(
                            collection=collection_old,
                            insert_list=self.copy_mongo_data_list)
                        self.old_count += insert_count
                        # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
                    except Exception as e:
                        self.logger.info(e)

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
Пример #4
0
    def run(self):
        # # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
        p_client.verify_list = self.verify_list
        # # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)
        #
        # # 表创建语句
        # sql = ('create table "NEWS_FINASSIST" ("ID_" varchar primary key, "T"."CONTENT_" varchar, '
        #        '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."TITLE_" varchar, "C"."BRIEF_" varchar, '
        #        '"C"."PUBLISH_TIME_" varchar, "C"."KEYWORDS_" varchar, "C"."URL_" varchar, "C"."DATA_SOURCE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,'
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "F"."STATUS_" varchar)'
        #        'IMMUTABLE_ROWS = true')

        # # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 遍历 ENTITY_CODE_ 列表
        for entity_code in self.code_list:
            status = False
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            # self.remove_id_list = []
            # self.copy_mongo_data_list = []
            # self.branch_code_list = []
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""
            # find_id = ""
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count = mongo_data_list.count()
                for i in range(1000000):
                    try:
                        data = mongo_data_list.__next__()
                    except pymongo.errors.ServerSelectionTimeoutError:
                        continue
                    except StopIteration:
                        break

                    # for data in mongo_data_list:
                    data_id = data["_id"]
                    if self.success_count % 100 == 0:
                        self.logger.info(
                            "running on data_id: {}".format(data_id))
                    # print(data_id)
                    # copy_data = {}
                    # self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        # copy_data = deepcopy(data)
                        # self.copy_mongo_data_list.append(copy_data)
                        data_list = [data]
                        re_data = module_name.data_shuffle(data_list)

                        if not re_data:
                            self.bad_count += 1
                            # self.remove_id_list.remove(data_id)
                            continue
                    except Exception as e:
                        # self.remove_id_list.remove(data_id)
                        # self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            # phoenix_HBase 插入数据
                            if list_data:
                                try:
                                    if entity_code != "CNINFONEWS":
                                        ai_data = self.get_brief_from_ai(
                                            data=list_data)
                                    else:
                                        ai_data = list_data
                                    # print(ai_data["CONTENT_"])
                                except Exception as e:
                                    self.logger.info("AI 调取失败, 错误信息", e)
                                    ai_data = re_data
                                try:
                                    success_count = p_client.upsert_to_phoenix_by_one(
                                        connection=connection, data=ai_data)
                                    once_count += success_count
                                    self.success_count += success_count
                                    if self.success_count % 10 == 0:
                                        self.logger.info(
                                            "HBase 插入成功, 成功条数{}条".format(
                                                once_count))
                                except Exception as e:
                                    # self.remove_id_list.remove(data_id)
                                    # self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.warning(
                                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue
                                try:
                                    # 添加 {d:1}
                                    update_count = m_client.update_to_mongodb(
                                        collection=collection,
                                        data_id=data_id,
                                        data_dict={"d": 1})
                                    self.remove_count += update_count
                                    # self.logger.info("MongoDB 更新成功")
                                    if self.remove_count % 10 == 0:
                                        self.logger.info(
                                            "MongoDB 更新成功, 成功条数 {} 条".format(
                                                "10"))
                                except Exception as e:
                                    # self.remove_id_list.remove(data_id)
                                    # self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.warning(
                                        "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue

                    elif isinstance(re_data, dict):
                        # phoenix_HBase 插入数据
                        if re_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=re_data)
                                once_count += success_count
                                self.success_count += success_count
                                self.logger.info(
                                    "HBase 插入成功, 成功条数 {} 条".format(
                                        success_count))
                            except Exception as e:
                                # self.remove_id_list.remove(data_id)
                                # self.copy_mongo_data_list.remove(copy_data)
                                self.logger.warning(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue

                if once_count > 0:
                    status = True
                    self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                        entity_code, once_count))
                mongo_data_list.close()
            else:
                continue

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #5
0
    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="WEIBOBASICINFO")
        p_client.verify_list = self.verify_list
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # # 创建 MongoDB spider_data_old 数据库对象
        # old_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        # # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60,
        #                                         connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["ORGANIZE_FINASSIST"]

        # 获取地区编码
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None, entity_type=None).area_from_mysql())

        # 删除表
        p_client.drop_table_phoenix(connection=connection)
        # quit()

        # 创建表
        sql = (
            'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,'
            '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,'
            '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,'
            '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,'
            '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,'
            '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
            '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,'
            '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 增加列
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        # 遍历 ENTITY_CODE_ 列表
        status = False
        self.logger.info("开始进行 WEIBOBASICINFO")

        try:
            mongo_data_list = m_client.all_from_mongodb(collection=collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection=collection)

        # 清洗数据并插入 HBase
        if mongo_data_list:
            self.find_count = mongo_data_list.count()
            for data in mongo_data_list:
                re_data = ""
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = self.data_shuffle(data=data,
                                                province_list=province_list)
                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))

                # phoenix_HBase 插入数据
                try:
                    success_count = p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=re_data)
                    self.success_count += success_count
                    # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(
                        data_id, e))
                    continue
            if self.success_count > 0:
                status = True
                self.logger.info("HBase 插入成功, 成功条数 {}".format(
                    self.success_count))
        else:
            quit()

        # # 删除数据
        # if status:
        #     delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
        #                                                remove_id_list=self.remove_id_list)
        #     self.remove_count += delete_count
        # else:
        #     self.logger.info("HBase 插入成功条数0条, 不执行删除")
        #
        # # 将数据插入 spider_data_old 中
        # if status:
        #     try:
        #         old_client.mongo_db = "spider_data_old"
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #     except pymongo.errors.ServerSelectionTimeoutError as e:
        #         time.sleep(1)
        #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
        #     except Exception as e:
        #         self.logger.info(e)

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #6
0
    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST")
        p_client.verify_list = self.verify_list
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # # 创建 MongoDB spider_data_old 数据库对象
        # old_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        # # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60,
        #                                         connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["ORGANIZE_FINASSIST"]

        # 获取地区编码
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None,
            entity_type="ORGANIZE_FINASSIST").area_from_mysql())

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # # 创建表
        # # 网点表创建语句
        # sql = ('create table "ORGANIZE_FINASSIST" ("ID_" varchar primary key, "C"."BANK_NAME_" varchar,'
        #        '"C"."BANK_CODE_" varchar, "C"."NAME_" varchar,'
        #        '"C"."CODE_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."UNIT_CODE_" varchar, "C"."ADDR_" varchar,'
        #        '"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar, "C"."CITY_" varchar,'
        #        '"C"."CITY_CODE_" varchar, "C"."DISTRICT_NAME_" varchar, "C". "DISTRICT_CODE_" varchar,'
        #        '"C"."LAT_" varchar, "C"."LNG_" varchar, "C"."CREATE_TIME_" varchar, "C"."DEALTIME_" varchar,'
        #        '"C"."URL_" varchar, "C"."TEL_" varchar, "C"."BUSINESS_HOURS_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."IMPORTANCE_" varchar) IMMUTABLE_ROWS = true')
        #
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 遍历 ENTITY_CODE_ 列表
        # self.code_list = ["ABCORGANIZE"]
        for entity_code in self.code_list:
            status = False
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            # find_id = ""
            if entity_code == "ECITICORGANIZE":
                find_id = "5c3f48479bb3df1d97d762e1"
            else:
                find_id = None
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    data_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    data_id=find_id)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count = mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    # copy_data = {}
                    # self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        # copy_data = deepcopy(data)
                        # self.copy_mongo_data_list.append(copy_data)
                        re_data = module_name.data_shuffle(
                            data, province_list, city_list, area_list)
                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        # except jpype._jexception.SQLExceptionPyRaisable as e:
                        # except org.apache.phoenix.exception.BatchUpdateExecution as e:
                        # self.remove_id_list.remove(data_id)
                        # self.copy_mongo_data_list.remove(copy_data)
                        self.logger.exception("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    print(data_id)

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            area_data = ""
                            try:
                                # self.logger.info("_id {}".format(data_id))
                                area_data = self.shuffle_for_area(list_data)
                            except Exception as e:
                                # self.remove_id_list.remove(data_id)
                                # self.copy_mongo_data_list.remove(copy_data)
                                self.logger.exception(
                                    "_id:{} 获取经纬度失败, {}".format(data_id, e))
                                continue
                            # except ValueError:
                            #     pass
                            # phoenix_HBase 插入数据
                            if area_data:
                                try:
                                    # print(area_data)
                                    success_count = p_client.upsert_to_phoenix_by_one(
                                        connection=connection, data=area_data)
                                    once_count += success_count
                                    self.success_count += success_count
                                    # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                                except Exception as e:
                                    # self.remove_id_list.remove(data_id)
                                    # self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.exception(
                                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue
                    elif isinstance(re_data, dict):
                        area_data = ""
                        try:
                            area_data = self.shuffle_for_area(re_data)
                        except urllib3.exceptions.NewConnectionError as e:
                            # self.remove_id_list.remove(data_id)
                            # self.copy_mongo_data_list.remove(copy_data)
                            self.logger.exception("_id: {}获取经纬度失败, {}".format(
                                data_id, e))
                        except Exception as e:
                            # self.remove_id_list.remove(data_id)
                            # self.copy_mongo_data_list.remove(copy_data)
                            self.logger.exception("_id: {}获取经纬度失败, {}".format(
                                data_id, e))
                            continue
                        # phoenix_HBase 插入数据
                        if area_data:
                            try:
                                # print(area_data)
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=area_data)
                                once_count += success_count
                                self.success_count += success_count
                                # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                            except Exception as e:
                                # self.remove_id_list.remove(data_id)
                                # self.copy_mongo_data_list.remove(copy_data)
                                self.logger.exception(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue
                    if self.success_count % 100 == 0:
                        self.logger.info("HBase 插入成功, 成功条数 {} 条".format(
                            self.success_count))
                    # 添加 {d:1}
                    # if self.success_count % 50 == 0:
                    #     update_count = m_client.update_to_mongodb(collection=collection,
                    #                                               data_id=self.remove_id_list,
                    #                                               data_dict={"d": 1})
                    #     self.remove_id_list = []
                    #     self.remove_count += update_count
                    #     self.logger.info("MongoDB 更新成功")

                mongo_data_list.close()

                # 添加 {d:1}
                # if self.remove_id_list:
                #     update_count = m_client.update_to_mongodb(collection=collection,
                #                                               data_id=self.remove_id_list,
                #                                               data_dict={"d": 1})
                #     self.remove_id_list = []
                #     self.remove_count += update_count
                #     self.logger.info("MongoDB 更新成功")
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue
            # 删除数据
            # if status:
            # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
            #                                            entity_code=entity_code,
            #                                            remove_id_list=self.remove_id_list)
            # self.remove_count += delete_count
            # self.logger.info("MongoDB 删除成功")
            # else:
            #     self.logger.info("HBase 插入成功条数0条, 不执行删除")

            # # 将数据插入 spider_data_old 中
            # if status:
            #     try:
            #         old_client.mongo_db = "spider_data_old"
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except pymongo.errors.ServerSelectionTimeoutError as e:
            #         sleep(1)
            #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except Exception as e:
            #         self.logger.info(e)

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()