Пример #1
0
 def client_to_mongodb(self):
     """Check the MongoDB connection by listing the database's collections.

     The first attempt only handles ``ServerSelectionTimeoutError``.  After
     such a timeout the listing is retried for attempts 2 to 5, and any
     exception raised by a retry is logged and swallowed.  When every
     attempt has failed, the first timeout is logged as an error and
     ``self.client_close()`` is called.

     :return: the value of ``self.db.collection_names()``, or ``None``
         (implicitly) when all attempts failed
     """
     log = Logger().logger
     host, port = self.mongo_host, self.mongo_port
     log.info("开始连接MongoDB({}:{}),database={}".format(
         host, port, self.mongo_database))
     try:
         names = self.db.collection_names()
         log.info("MongoDB({}:{})连接成功".format(host, port))
         return names
     except pymongo.errors.ServerSelectionTimeoutError as first_error:
         log.warning("MongoDB({}:{})连接失败".format(host, port))
         for attempt in range(2, 6):
             try:
                 names = self.db.collection_names()
                 log.info("MongoDB({}:{})连接成功".format(host, port))
                 return names
             except Exception:
                 log.warning("MongoDB({}:{})第{}次连接失败".format(
                     host, port, attempt))
         # Reached only when none of the retries returned a result.
         log.error(
             "MongoDB连接失败,错误信息为: {}, 请检查各项参数是否正确host={}, port={},database={}"
             .format(first_error, host, port, self.mongo_database))
         self.client_close()
Пример #2
0
 def client_to_mysql(self):
     """Open a MySQL connection, retrying after an operational error.

     The first attempt only handles ``pymysql.err.OperationalError``.  It is
     followed by retries numbered 2 to 6, each of which swallows any
     exception; the error of the last retry is logged at error level.

     :return: the connection returned by ``pymysql.connect``, or ``None``
         (implicitly) when every attempt failed
     """
     log = Logger().logger
     final_retry = 6
     try:
         log.info("正在连接MySQL({}@{}:{})".format(
             self.mysql_user, self.mysql_host, self.mysql_port))
         conn = pymysql.connect(**self.mysql_config)
         log.info("Mysql连接成功({}@{}:{})".format(
             self.mysql_user, self.mysql_host, self.mysql_port))
         return conn
     except pymysql.err.OperationalError:
         for attempt in range(2, final_retry + 1):
             try:
                 log.warning(
                     "MySQL连接失败,正在重试第{}次连接".format(attempt))
                 conn = pymysql.connect(**self.mysql_config)
                 log.info("Mysql连接成功")
                 return conn
             except Exception as error:
                 log.warning("第{}次连接MySQL失败".format(attempt))
                 if attempt == final_retry:
                     log.error("MySQL连接失败,错误信息为{}".format(error))
Пример #3
0
class FundScript(object):
    def __init__(self):
        """Set up the entity codes, logger, run counters and column list."""
        # Entity codes processed by run(); each one is imported there by name.
        self.code_list = [
            "STCNFUND",
            "ABCFUND",
            "CCBFUND",
            "CITICFUND",
            "ICBCFUND",
        ]
        self.logger = Logger().logger

        # Counters reported in the summary logged at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0

        # Working lists; run() resets all three for every entity code.
        self.copy_mongo_data_list = []
        self.remove_id_list = []
        self.branch_code_list = []

        # Fund columns; run() hands this list to the Phoenix client.
        self.verify_list = [
            "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_",
            "STATUS_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_",
            "CODE_", "NAME_", "FUND_NEW_VALUE_", "TOTAL_NEW_VALUE_",
            "FUND_OLD_VALUE_", "TOTAL_OLD_VALUE_", "DAILY_RATE_",
            "YEAR_REWARD_", "SUBS_STATUS_", "ATONEM_STATUS_", "TYPE_",
            "ID_", "NEWEST_VALUE_", "TOTAL_VALUE_", "POPULARITY_",
            "RATING_", "OLD_VALUE_", "UNIT_VALUE_", "SCALE_",
            "ESTABLISH_DATE_", "RISK_LEVEL_", "BASE_INFO_", "YIELD_",
            "INVEST_", "MONTH_RATE_", "QUARTER_RATE_", "HALF_YEAR_RATE_",
            "HISTORY_RATE_", "FUND_STATUS_", "COMPANY_",
            "SUBS_STATUS_CODE_", "TYPE_CODE_",
        ]

    # 从 MongoDB 获取数据
    def get_data_from_mongo(self, m_client, collection, entity_code):
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code

        try:
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    # 从 MongoDB 删除数据
    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        m_client.mongo_entity_code = entity_code

        try:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            mongo_data_list = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

        # 网点 CODE_
        hash_m = hashlib.md5()
        hash_m.update(re_data["ADDR_"].encode("utf-8"))
        hash_addr_ = hash_m.hexdigest()
        re_data["CODE_"] = re_data["BANK_CODE_"] + "_" + re_data[
            "AREA_CODE_"] + "_" + hash_addr_
        # for i in range(1, 10000):
        #         branch_code = "ABC" + "_" + re_data["AREA_CODE_"] + "_" + "00000"
        #         branch_code = branch_code[:len(branch_code)-len(str(i))] + "{}".format(i)
        #         if branch_code in branch_code_list:
        #             continue
        #         else:
        #             branch_code_list.append(branch_code)
        #             break
        return re_data

    def gaode_get_lat_lng(self, address):
        """Look up the location of an address via ``URL_FOR_LAT_LNG``.

        :param address: address text sent as the ``address`` query parameter
        :return: the ``location`` value of the first entry in ``geocodes``
        :raises KeyError: when the response has no ``geocodes``/``location``
        :raises IndexError: when ``geocodes`` is empty
        """
        # Let requests build the query string so characters such as "&", "#"
        # or spaces inside the address are percent-encoded instead of
        # corrupting the URL.
        response = requests.get(
            URL_FOR_LAT_LNG, params={"key": AK, "address": address})
        try:
            payload = json.loads(response.content)
        finally:
            # Release the connection even when the body is not valid JSON.
            response.close()
        return payload['geocodes'][0]['location']

    def dict_from_mysql(self, dict_code):
        """Load the ``sys_dict_item`` rows of one dictionary code from MySQL.

        :param dict_code: value matched against the ``DICT_CODE_`` column
        :return: the result of ``MysqlClient.search_area_code`` for the query
        """
        # NOTE(review): connection settings are hard-coded here; consider
        # moving them to shared configuration.
        mysql_config = {
            "host": "172.22.67.25",
            "port": 3306,
            "database": "chabei",
            "user": "******",
            "password": "******",
            "table": "sys_dict_item"
        }

        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()
        try:
            # NOTE(review): dict_code is interpolated directly into the SQL
            # text, so only trusted values may be passed in.
            return mysql_client.search_area_code(
                sql=
                "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'{}\'"
                .format(dict_code),
                connection=mysql_connection)
        finally:
            # Close the connection even when the query raises.
            mysql_client.close_client(connection=mysql_connection)

    # 主函数
    def run(self):
        """Clean the MongoDB records of every entity code and upsert them.

        For each code in ``self.code_list`` the module of the same name is
        imported, its ``data_shuffle`` is applied to every fetched document
        and each resulting row is written with
        ``upsert_to_phoenix_by_one``.  Whenever the running success total is
        a multiple of 50, the ids collected so far for the current entity
        are updated in MongoDB with ``{"d": 1}``.  Both connections are
        closed and the counters are logged at the end.
        """
        count = 0
        # Phoenix client for the target table.
        p_client = PhoenixHbase(table_name="FUND")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to read the source records.
        m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Dictionary rows handed to every data_shuffle call.
        list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
        list_TYPE = self.dict_from_mysql("FUND_TYPE")

        for entity_code in self.code_list:
            # Each entity code names an importable module with data_shuffle().
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

            if not mongo_data_list:
                continue

            once_count = 0
            # Accumulate so the end-of-run summary covers every entity rather
            # than only the last one processed.
            self.find_count += mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = module_name.data_shuffle(
                        data, list_SUBS_STATUS, list_TYPE)

                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self._forget_record(data_id, copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue
                        try:
                            count += 1
                            print(count)
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=list_data)
                            once_count += success_count
                            self.success_count += success_count
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    success_count))
                            if self.success_count % 50 == 0:
                                update_count = m_client.update_to_mongodb(
                                    collection=collection,
                                    data_id=self.remove_id_list,
                                    data_dict={"d": 1})

                                self.remove_count += update_count
                                self.logger.info("MongoDB 更新成功")
                        except Exception as e:
                            # Several rows of one document may fail, so the
                            # record must be removable more than once.
                            self._forget_record(data_id, copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                elif isinstance(re_data, dict):
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        if self.success_count % 100 == 0:
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    self.success_count))
                        # Update the collected ids with {d: 1}.
                        if self.success_count % 50 == 0:
                            update_count = m_client.update_to_mongodb(
                                collection=collection,
                                data_id=self.remove_id_list,
                                data_dict={"d": 1})

                            self.remove_count += update_count
                            self.logger.info("MongoDB 更新成功")
                    except Exception as e:
                        self._forget_record(data_id, copy_data)
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
            if once_count > 0:
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

        # Close the connections and report the totals.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()

    def _forget_record(self, data_id, copy_data):
        """Stop tracking a record whose cleaning or insert failed.

        Safe to call repeatedly for the same record: entries that are no
        longer present are skipped instead of raising ``ValueError``.
        """
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        if copy_data in self.copy_mongo_data_list:
            self.copy_mongo_data_list.remove(copy_data)
Пример #4
0
class FinProductScript(object):
    def __init__(self):
        """Load the dictionary tables from MySQL and initialise run state.

        Six ``sys_dict_item`` dictionaries are read through one MySQL
        connection and stored on the instance.  After that the logger, the
        working lists, the entity list, the counters and the column list
        are set up.
        """
        mysql_client = MysqlClient(
            host=MYSQL_HOST_25,
            port=MYSQL_PORT_25,
            database=MYSQL_DATABASE_25,
            user=MYSQL_USER_25,
            password=MYSQL_PASSWORD_25,
            table=MYSQL_TABLE_25)
        mysql_connection = mysql_client.client_to_mysql()

        sql_template = (
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item "
            "where DICT_CODE_='{}'")
        # Attribute name -> DICT_CODE_ value.  The attribute spellings
        # ("produc", "operaton") are kept because run() reads them.
        dictionaries = (
            ("sales_status", "SALES_STATUS"),
            ("produc_category", "PRODUC_CATEGORY"),
            ("revenue_type", "REVENUE_TYPE"),
            ("operaton_pattern", "OPERATION_PATTERN"),
            ("purchase_amount", "PURCHASE_AMOUNT"),
            ("duration_type", "DURATION_TYPE"),
        )
        try:
            for attribute, dict_code in dictionaries:
                setattr(self, attribute, mysql_client.search_area_code(
                    sql=sql_template.format(dict_code),
                    connection=mysql_connection))
        finally:
            # Close the connection even when one of the queries raises.
            mysql_client.close_client(connection=mysql_connection)

        self.logger = Logger().logger
        self.remove_id_list = []
        self.copy_mongo_data_list = []
        # NOTE: "CZBFinancial", "PABFinancial" and "PSBCFinancial" are not
        # part of this list.
        self.entity_list = ["ABCFinancial", "BOCFinancial", "BOCOMFinancial", "CBHBFinancial", "CCBFinancial",
                            "CEBFinancial", "CGBFinancial", "CIBFinancial", "CMBCFinancial", "CMBFinancial",
                            "EBCLFinancial", "ECITICFinancial", "HXBFinancial", "ICBCFinancial",
                            "SPDBFinancial", "CHINANETFINANCIAL", "JSFIN_CCBDATA"]

        # Counters reported in the summary logged at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # Columns; run() hands this list to the Phoenix client.
        self.verify_list = ["ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_", "UNIT_CODE_",
                            "PERIOD_CODE_", "CONTENT_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_",
                            "TIME_LIMIT_", "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_", "SALE_START_",
                            "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_", "PRIVATE_BANK_", "URL_", "DEALTIME_",
                            "DATETIME_", "ENTITY_NAME_", "STATUS_", "SALE_DISTRICT_"]

    def get_data_from_mongo(self, m_client, collection, entity_code, data_id):
        """
        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :param entity_code:
        :return: data from MongoDB
        """
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code

        try:
            mongo_data_list = m_client.search_from_mongodb(collection, data_id=data_id)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(collection, data_id=data_id)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code, remove_id_list):
        """
        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :param entity_code:
        :param remove_id_list: id list to remove
        :return: delete count
        """
        m_client.mongo_entity_code = entity_code

        try:
            remove_count = m_client.remove_from_mongo(collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            mongo_data_list = m_client.remove_from_mongo(collection=collection, remove_id_list=remove_id_list)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def run(self):
        """Clean financial-product records from MongoDB and upsert them.

        For each processed entity the module of the same name is imported,
        its records are fetched, every document is passed through the
        entity's ``data_shuffle`` and each resulting row is written with
        ``upsert_to_phoenix_by_one``.  Documents whose cleaning or insert
        fails are dropped from ``remove_id_list`` and
        ``copy_mongo_data_list``.  The MongoDB client is closed and the
        counters are logged at the end.
        """
        # Phoenix client for the target table.
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to read the source records.
        m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        m_client.get_check_collection(db=db, collection_list=collection_list)

        # NOTE(review): only these two entities are iterated;
        # self.entity_list is not used here.
        for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
            module_name = __import__(entity)
            self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            # Presumably None means "no single-document filter" - confirm in
            # search_from_mongodb.
            find_id = None
            try:
                mongo_data_list = self._load_entity_records(
                    module_name, m_client, db, collection_list, entity, find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self._load_entity_records(
                    module_name, m_client, db, collection_list, entity, find_id)

            if not mongo_data_list:
                continue

            once_count = 0
            self.find_count += mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    if entity == "CHINANETFINANCIAL":
                        re_data = module_name.data_shuffle(data=data, sales_status=self.sales_status,
                                                           produc_category=self.produc_category,
                                                           revenue_type=self.revenue_type,
                                                           operaton_pattern=self.operaton_pattern,
                                                           purchase_amount=self.purchase_amount,
                                                           duration_type=self.duration_type)
                    elif entity == "JSFIN_CCBDATA":
                        re_data = module_name.ScriptCCB.data_shuffle(self=self, data=data)
                    else:
                        re_data = module_name.data_shuffle(data)

                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self._forget_record(data_id, copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e))
                    continue

                print(data_id)

                # A cleaned document is either one row (dict) or several
                # rows (list); any other type is not inserted.
                if isinstance(re_data, dict):
                    rows = [re_data]
                elif isinstance(re_data, list):
                    rows = re_data
                else:
                    rows = []

                for row in rows:
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=row)
                        once_count += success_count
                        self.success_count += success_count
                    except Exception as e:
                        # The tracked entry is the copy taken before
                        # cleaning, and several rows of one document may
                        # fail, so removal must tolerate repeats.
                        self._forget_record(data_id, copy_data)
                        self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))

            if once_count > 0:
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            # NOTE(review): the loop stops after the first entity that
            # returned data - confirm this is intended.
            break

        # Close the MongoDB connection and report the totals.
        # NOTE(review): the Phoenix connection is not closed here.
        m_client.client_close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()

    def _load_entity_records(self, module_name, m_client, db, collection_list, entity, find_id):
        """Select the entity's collection on ``m_client`` and fetch its records."""
        if entity == "JSFIN_CCBDATA":
            m_client.mongo_collection = "JSFIN_CCBDATA"
            collection = m_client.get_check_collection(db=db, collection_list=collection_list)
            return module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client,
                                                             collection=collection,
                                                             data_id=None)
        m_client.mongo_collection = "FINPRODUCT_FINASSIST"
        collection = m_client.get_check_collection(db=db, collection_list=collection_list)
        return self.get_data_from_mongo(m_client=m_client,
                                        collection=collection, entity_code=entity,
                                        data_id=find_id)

    def _forget_record(self, data_id, copy_data):
        """Stop tracking a record whose cleaning or insert failed.

        Safe to call repeatedly for the same record: entries that are no
        longer present are skipped instead of raising ``ValueError``.
        """
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        if copy_data in self.copy_mongo_data_list:
            self.copy_mongo_data_list.remove(copy_data)
Пример #5
0
class ScriptCCB(object):
    def __init__(self):
        """Initialise logging, run counters and the MySQL dictionary lookups.

        Six dictionaries are read from ``sys_dict_item`` (one query per
        ``DICT_CODE_``) and stored on the instance exactly as
        ``MysqlClient.search_area_code`` returns them.  The MySQL connection
        is closed even when one of the queries raises.
        """
        self.logger = Logger().logger
        # Mongo ``_id`` values of processed documents and pristine copies of
        # those documents (bookkeeping used by ``run``).
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()

        # MySQL connection settings
        mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }

        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()

        dict_sql = ("select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ "
                    "from sys_dict_item where DICT_CODE_='{}'")

        def lookup(dict_code):
            # The codes are fixed literals below, never external input.
            return mysql_client.search_area_code(
                sql=dict_sql.format(dict_code), connection=mysql_connection)

        try:
            self.sales_status = lookup("SALES_STATUS")
            # NOTE: the misspelt attribute names ("produc", "operaton") are
            # part of the public surface and are kept as they were.
            self.produc_category = lookup("PRODUC_CATEGORY")
            self.revenue_type = lookup("REVENUE_TYPE")
            self.operaton_pattern = lookup("OPERATION_PATTERN")
            self.purchase_amount = lookup("PURCHASE_AMOUNT")
            self.duration_type = lookup("DURATION_TYPE")
        finally:
            # Previously skipped whenever a query raised, leaking the connection.
            mysql_client.close_client(connection=mysql_connection)

        # Run statistics
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # Column names handed to the Phoenix client as ``verify_list`` in ``run``.
        self.verify_list = [
            "ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_",
            "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "REMARK_",
            "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_", "TIME_LIMIT_",
            "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_",
            "SALE_START_", "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_",
            "PRIVATE_BANK_", "URL_", "DEALTIME_", "DATETIME_", "ENTITY_NAME_",
            "STATUS_", "SALE_DISTRICT_", "CURRENCY_TYPE_", "INCREASE_UNIT_",
            "YIELD_START_DATE_", "YIELD_END_DATE_", "YIELD_TYPE_", "TARGET_",
            "PRODUCT_TYPE_", "YIELD_STATMENT_", "INVEST_RANGE_", "PRE_STOP_",
            "RASE_PLAN_", "PURCHASE_"
        ]

    @staticmethod
    def get_data_from_mongo(self, m_client, collection, data_id):
        """
        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :return: data from MongoDB
        """
        m_client.mongo_db = "spider_data"

        try:
            mongo_data_list = m_client.all_from_mongodb(collection,
                                                        data_id=data_id)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection,
                                                        data_id=data_id)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    @staticmethod
    def data_shuffle(self, data):
        if "上海银行" in data["BANK_NAME"]:
            bank_code = "BankOfShanghai"
        elif "天津银行" in data["BANK_NAME"]:
            bank_code = "TJBANK"
        elif "北京银行" in data["BANK_NAME"]:
            bank_code = "BOB"
        elif "中国光大银行股份有限公司" in data["BANK_NAME"]:
            bank_code = "CEB"
        else:
            print(data)

        re_data = dict()
        hash_m = hashlib.md5()
        hash_m.update(data["NAME_"].encode("utf-8"))
        hash_id = hash_m.hexdigest()
        re_data["ID_"] = bank_code + "_" + hash_id + "_" + data["SALE_START_"]
        re_data["ENTITY_CODE_"] = "RONG360FINANCIAL"
        # re_data["AREA_CODE_"]
        re_data["BANK_CODE_"] = bank_code
        re_data["BANK_NAME_"] = data["BANK_NAME"].replace("股份有限公司", "")
        # re_data["UNIT_CODE_"]
        re_data["PERIOD_CODE_"] = data["SALE_START_"].replace("-", "")
        # re_data["CONTENT_"]
        re_data["STATUS_"] = ""
        # re_data["REMARK_"]
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        # re_data["UPDATE_TIME_"]

        re_data["NAME_"] = data["NAME_"]
        # 售卖时间范围
        re_data["TIME_LIMIT_"] = ""
        # 收益率
        re_data["LOW_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "")
        re_data["HIGH_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "")
        # 售卖区域
        re_data["SALE_DISTRICT_"] = data["SALE_AREA_"]
        # 起购金额
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("亿", "00000000")
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("万", "0000")
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("千", "000")
        if data["START_FUNDS_"]:
            if int(data["START_FUNDS_"]) < 50000:
                match_funds = "5万以下"
            elif 50000 <= int(data["START_FUNDS_"]) < 100000:
                match_funds = "5万-10万"
            elif 100000 <= int(data["START_FUNDS_"]) < 200000:
                match_funds = "10万-20万"
            elif 20000 <= int(data["START_FUNDS_"]) < 500000:
                match_funds = "20万-50万"
            elif 500000 <= int(data["START_FUNDS_"]):
                match_funds = "50万以上"
        else:
            match_funds = "不限"
        for i in self.purchase_amount:
            if match_funds in i["ITEM_LABEL_"]:
                re_data["START_FUNDS_"] = data["START_FUNDS_"]
                re_data["START_FUNDS_CODE_"] = i["ITEM_VALUE_"]

        # 期限
        data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace("天", "")

        if data["INVEST_PERIOD_"]:
            if int(data["INVEST_PERIOD_"]) <= 30:
                match_str = "1个月内"
            elif 30 < int(data["INVEST_PERIOD_"]) <= 90:
                match_str = "1-3个月(含)"
            elif 90 < int(data["INVEST_PERIOD_"]) <= 180:
                match_str = "3-6个月(含)"
            elif 180 < int(data["INVEST_PERIOD_"]) <= 365:
                match_str = "6-12个月(含)"
            elif 365 < int(data["INVEST_PERIOD_"]):
                match_str = "12个月以上"

        else:
            match_str = "不限"

        for i in self.duration_type:
            if match_str in i["ITEM_LABEL_"]:
                re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
                re_data["INVEST_PERIOD_CODE_"] = i["ITEM_VALUE_"]

        # 开始售卖时间
        re_data["SALE_START_"] = data["SALE_START_"]
        # 结束售卖时间
        re_data["SALE_END_"] = data["SALE_END_"]
        # 风险等级
        # re_data["RISK_LEVEL_"] = data["RISK_LEVEL_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["DATETIME_"] = data["DATETIME_"]
        re_data["ENTITY_NAME_"] = "融360理财产品"
        # NEW
        # 认购货币(类型)
        re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
        # 递增单位
        re_data["INCREASE_UNIT_"] = re.sub(r"元.*", "", data["INCREASE_UNIT_"])
        # 收益起记(日期)
        re_data["YIELD_START_DATE_"] = data["YIELD_START_DATE_"][:10]
        # 收益结束(日期)
        re_data["YIELD_END_DATE_"] = data["YIELD_START_DATE_"][-10:]
        # 收益获取方式
        for i in self.revenue_type:
            if i["ITEM_LABEL_"] == data["YIELD_TYPE_"]:
                re_data["YIELD_TYPE_"] = data["YIELD_TYPE_"]
                re_data["YIELD_TYPE_CODE_"] = i["ITEM_VALUE_"]
                break
        # 对象(目标人群)
        re_data["TARGET_"] = data["TARGET_"]
        # 产品类型
        re_data["PRODUCT_TYPE_"] = data["PRODUCT_TYPE_"]

        # 收益率说明
        re_data["YIELD_STATMENT_"] = data["YIELD_STATMENT_"]

        # 投资范围
        re_data["INVEST_RANGE_"] = data["INVEST_RANGE_"]
        # 提前终止条件
        re_data["PRE_STOP_"] = data["PRE_STOP_"]
        # 募集规划条件
        re_data["RASE_PLAN_"] = data["RASE_PLAN_"]
        # 申购条件
        re_data["PURCHASE_"] = data["PURCHASE_"]
        # 无
        # re_data["CODE_"] = data["CODE_"]
        # 是否保本
        # re_data["BREAKEVEN_"] = data["BREAKEVEN_"]
        # # 可否赎回
        # re_data["REDEMING_MODE_"]
        # # 私人银行
        # re_data["PRIVATE_BANK_"]

        return re_data

    def run(self):
        """Copy RONG360 wealth-product documents from MongoDB into Phoenix/HBase.

        Reads the ``JSFIN_CCBDATA`` collection, cleans every document with
        ``data_shuffle`` and upserts the result into the
        ``FINPRODUCT_FINASSIST`` table.  Documents that fail cleaning or
        insertion are logged and dropped from ``remove_id_list`` /
        ``copy_mongo_data_list``.  Exits the interpreter via ``quit()`` when
        the query yields nothing.
        """
        # Phoenix client and connection
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()

        # MongoDB source
        m_client = MongoClient(mongo_collection="JSFIN_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # NOTE: one-off maintenance steps (drop/create the Phoenix table, add
        # columns) and the post-processing steps (flag or delete the source
        # documents, archive them into ``spider_data_old``) were disabled in
        # this script as commented-out code; see version control if needed.

        self.logger.info("开始进行 ENTITY_CODE_: RONG360FINANCIAL")
        self.remove_id_list = []
        self.copy_mongo_data_list = []
        find_id = None
        try:
            mongo_data_list = self.get_data_from_mongo(self=self,
                                                       m_client=m_client,
                                                       collection=collection,
                                                       data_id=find_id)
        except pymongo.errors.ServerSelectionTimeoutError:
            sleep(1)
            mongo_data_list = self.get_data_from_mongo(self=self,
                                                       m_client=m_client,
                                                       collection=collection,
                                                       data_id=find_id)

        if not mongo_data_list:
            quit()

        # Clean the data and insert it into HBase
        once_count = 0
        self.find_count += mongo_data_list.count()
        for data in mongo_data_list:
            data_id = data["_id"]
            copy_data = None
            self.remove_id_list.append(data_id)
            try:
                del data["_id"]
                copy_data = deepcopy(data)
                self.copy_mongo_data_list.append(copy_data)
                re_data = self.data_shuffle(self=self, data=data)
            except Exception as e:
                self._untrack_document(data_id, copy_data)
                self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                    data_id, e))
                continue

            if not re_data:
                # Empty result: counted as bad but still tracked, as before.
                self.bad_count += 1
                continue

            print(data_id)

            # A cleaner may return a single row or a list of rows.
            if isinstance(re_data, dict):
                rows = [re_data]
            elif isinstance(re_data, list):
                rows = re_data
            else:
                rows = []

            for row in rows:
                try:
                    success_count = p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=row)
                    once_count += success_count
                    self.success_count += success_count
                except Exception as e:
                    # Idempotent, so several failing rows of the same
                    # document no longer raise ValueError on the second one.
                    self._untrack_document(data_id, copy_data)
                    self.logger.warning(
                        "HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))

        if once_count > 0:
            self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

        # Close the MongoDB connection.
        # NOTE(review): the Phoenix connection is not closed here (the call
        # was commented out in the original) — confirm whether intentional.
        m_client.client_close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()

    def _untrack_document(self, data_id, copy_data):
        """Drop a failed document from the bookkeeping lists; safe to repeat."""
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        # Match by identity: the cleaned source dict may have been modified
        # and need not compare equal to the stored copy any more.
        for index, kept in enumerate(self.copy_mongo_data_list):
            if kept is copy_data:
                del self.copy_mongo_data_list[index]
                break
Пример #6
0
class AllToPhoenix(object):
    def __init__(self):
        """Collect the cleaner file names and reset counters and bookkeeping."""
        self.file_list = []
        # Fills ``file_list`` with the file names found in ``curPath``.
        self.get_code_list()
        self.logger = Logger().logger

        # Run statistics
        self.find_count = self.success_count = 0
        self.remove_count = self.old_count = 0

        # Pristine document copies and the ``_id`` values of handled documents
        self.copy_mongo_data_list = []
        self.remove_id_list = []

        # Field whitelist used by ``shuffle_data``
        self.verify_list = [
            "ID_",
            "CONTENT_",
            "NOTICE_TIME_",
            "TITLE_",
            "PROJECT_NAME_",
            "BID_CONTENT_",
            "SIGN_START_TIME_",
            "SIGN_END_TIME_",
            "OPEN_BID_TIME_",
            "OPEN_BID_PLACE_",
            "BID_AGENCY_",
            "APPLY_CONDITION_",
            "SIGN_QUALIFICATION_",
            "PROJECT_ID_",
            "WIN_CANDIDATE_",
            "CANDIDATE_RANK_",
            "BID_",
            "URL_",
            "DEALTIME_",
            "CREATE_TIME_",
            "ENTITY_NAME_",
            "ENTITY_CODE_",
            "ENTITY_STATUS_",
            "SIGN_MATERIAL_",
            "BID_TYPE_",
            "DATETIME_",
            "BUDGET_PRICE_",
            "PASS_REASON_",
            "PRESALE_CONTENT_",
            "PRESALE_WAY_",
            "PRESALE_START_TIME_",
            "PRESALE_END_TIME_",
            "PRESALE_ADDR_",
            "PRESALE_PREPARE_",
            "IMAGE_",
        ]

    def get_code_list(self):
        """
        Store the names of the files directly inside ``curPath`` in
        ``self.file_list``, minus ``"__init_____.py"``.

        Per the original note, a file name with "CommonBidding_" removed is
        the ENTITY_CODE_.

        :return: None
        """
        # Only the first entry produced by os.walk (the top directory) is used;
        # it is a (path, sub-directories, files) triple.
        top_level = next(os.walk(curPath), None)
        if top_level is None:
            return
        self.file_list = top_level[2]
        self.file_list.remove("__init_____.py")

    def get_data_from_mongo(self, m_client, collection, entity_code):
        """
        Query ``spider_data`` for one entity's documents that carry a title,
        retrying once after a server-selection timeout.

        :param m_client: MongoDB client wrapper; its ``mongo_db`` is set here
        :param collection: MongoDB collection
        :param entity_code: passed to ``get_data_from_mongodb`` as ``entity_code``
        :return: the result of ``m_client.get_data_from_mongodb``, or ``None``
            when a non-timeout error was logged
        """
        m_client.mongo_db = "spider_data"
        # Only documents that have a ``TITLE_`` or a ``title`` field.
        other_query = {
            "$or": [{
                "TITLE_": {
                    "$exists": True
                }
            }, {
                "title": {
                    "$exists": True
                }
            }]
        }

        def fetch():
            return m_client.get_data_from_mongodb(
                collection=collection,
                entity_code=entity_code,
                exclude_code=None,
                limit_number=None,
                other_query=other_query)

        try:
            return fetch()
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            time.sleep(1)
            # A second timeout propagates to the caller.
            return fetch()
        except Exception as e:
            # Also covers KeyError; the dedicated KeyError handler that used
            # to follow this clause was unreachable.
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """
        Delete the given documents from MongoDB, retrying once on timeout.

        :param m_client: MongoDB client wrapper; its ``mongo_entity_code`` is set here
        :param collection: MongoDB collection
        :param entity_code: entity code stored on the client before deleting
        :param remove_id_list: ``_id`` values of the documents to delete
        :return: the result of ``m_client.remove_from_mongo`` (the delete
            count), or ``0`` when a non-timeout error was logged
        """
        m_client.mongo_entity_code = entity_code

        try:
            return m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
        except pymongo.errors.ServerSelectionTimeoutError:
            # Immediate single retry; a second timeout propagates.
            return m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
        except Exception as e:
            # Also covers KeyError (its separate handler was unreachable).
            self.logger.info(e)
            # 0 rather than None: the caller adds this value to a counter.
            return 0

    def shuffle_data(self, data):
        """
        Generic cleaning step: build the row key, keep whitelisted fields,
        default the entity status and strip ``|`` from the kept values.

        The row key is ``<entity code>_<md5 of the title>``, using
        ``TITLE_`` / ``ENTITY_CODE_`` when ``TITLE_`` is present and
        ``title`` / ``entity_code`` otherwise.

        :param data: document produced by the entity-specific cleaner
        :return: cleaned row dict, or ``None`` when the document has no
            usable title
        :raises KeyError: when the entity-code field matching the title
            field is missing
        :raises AttributeError: when a kept, truthy value is not a string
        """
        if "TITLE_" in data:
            if not data["TITLE_"]:
                return None
            title_bytes = str(data["TITLE_"]).encode("utf-8")
            entity_code = data["ENTITY_CODE_"]
        elif "title" in data:
            if not data["title"]:
                return None
            title_bytes = data["title"].encode("utf-8")
            entity_code = data["entity_code"]
        else:
            return None

        re_data = dict()
        re_data["ID_"] = (str(entity_code) + "_" +
                          hashlib.md5(title_bytes).hexdigest())

        for key, value in data.items():
            # Field whitelist
            if key in self.verify_list:
                re_data[key] = value
            elif key == "entityStatus" or key == "ENTITY_STATUS_":
                re_data["ENTITY_STATUS_"] = "DRAFT"
        if "ENTITY_STATUS_" not in re_data:
            re_data["ENTITY_STATUS_"] = "DRAFT"

        # Strip "|" from the values that are actually returned.  The earlier
        # code rewrote the input dict after the values had been copied, so
        # the returned row was never sanitised.  The row key, content and
        # URL are left untouched.
        for s_key in self.verify_list:
            if s_key in ("ID_", "CONTENT_", "URL_"):
                continue
            if re_data.get(s_key, ""):
                re_data[s_key] = re_data[s_key].replace("|", "")

        return re_data

    def run(self):
        """Move CommonBidding documents from MongoDB into Phoenix/HBase.

        For every cleaner module listed in ``file_list``: fetch that
        entity's documents, clean them (entity-specific ``data_shuffle``
        followed by ``shuffle_data``) and upsert them into the
        ``CommonBidding`` table.  When at least one row was inserted, the
        tracked documents are deleted from the source collection and their
        copies are inserted into ``spider_data_old``.
        """
        # Phoenix client and connection
        p_client = PhoenixHbase(table_name="CommonBidding")
        connection = p_client.connect_to_phoenix()
        # MongoDB source client
        m_client = MongoClient(mongo_collection="CommonBidding")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # MongoDB archive client (spider_data_old)
        old_client = MongoClient(mongo_collection="CommonBidding")
        old_client.mongo_db = "spider_data_old"
        db_old, _ = old_client.client_to_mongodb()
        collection_old = db_old["CommonBidding"]

        # NOTE: one-off maintenance steps (drop/create the Phoenix table, add
        # the IMAGE_ column) were kept here as commented-out code; see
        # version control if they are needed again.

        # Iterate over the cleaner modules, one per ENTITY_CODE_
        for f in self.file_list:
            entity_code = f.replace(".py", "")
            module_name = __import__(entity_code)
            entity_code_mongo = entity_code.replace("CommonBidding_", "")
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code_mongo))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)

            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase
            once_count = 0
            try:
                self.find_count += mongo_data_list.count()
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(1)
                self.find_count += mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                self.remove_id_list.append(data_id)
                del data["_id"]
                # Deep copy of the source document, later archived into
                # spider_data_old.
                copy_data = deepcopy(data)
                self.copy_mongo_data_list.append(copy_data)
                try:
                    re_data = module_name.data_shuffle(data)
                    final_data = self.shuffle_data(re_data)
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue
                # Documents cleaned to nothing stay tracked but are not inserted.
                if not final_data:
                    continue
                try:
                    p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=final_data)
                    once_count += 1
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning(
                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                            data_id, e))

            if once_count == 0:
                self.logger.info("HBase 插入成功条数0条, 不执行删除")
                continue
            self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

            # Delete the handled documents from the source collection.
            # NOTE(review): the delete runs before the archive insert, so a
            # failed archive loses the raw documents — consider swapping.
            delete_count = self.delete_data_from_mongo(
                m_client=m_client,
                collection=collection,
                entity_code=entity_code_mongo,
                remove_id_list=self.remove_id_list)
            # The helper may return None after a logged failure.
            self.remove_count += delete_count or 0

            # Archive the untouched copies into spider_data_old
            try:
                old_client.mongo_db = "spider_data_old"
                insert_count = old_client.all_to_mongodb(
                    collection=collection_old,
                    insert_list=self.copy_mongo_data_list)
                self.old_count += insert_count
            except pymongo.errors.ServerSelectionTimeoutError as e:
                time.sleep(1)
                self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
                insert_count = old_client.all_to_mongodb(
                    collection=collection_old,
                    insert_list=self.copy_mongo_data_list)
                self.old_count += insert_count
            except Exception as e:
                self.logger.info(e)

        # Close connections (the archive client used to be left open)
        m_client.client_close()
        old_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
Пример #7
0
class AllToPhoenix(object):
    """Clean news documents from MongoDB and upsert them into Phoenix.

    For every entity code in ``code_list`` the matching documents are read
    from the ``NEWS_FINASSIST`` collection, cleaned by the module named after
    the entity code (its ``data_shuffle`` function), optionally enriched with
    an AI-generated brief, written to the Phoenix table ``NEWS_FINASSIST`` and
    finally flagged in MongoDB with ``{"d": 1}``.
    """

    def __init__(self):
        # NOTE: "CNINFONEWS" PDFs are too long; run() skips the AI brief
        # step for that entity code.
        self.code_list = [
            "CAIJINGNEWS", "CNINFONEWS", "CSFINACIAL", "CSFINACIALNEWS",
            "CSNEWS", "CSNOTICE", "FINAQQNEWS", "XLCJYHMKNEWS", "XLCJNEWS",
            "XLCJGSNEWS", "WYCJNEWS", "WYCJGSNEWS", "NEWS163DOM",
            "NEWS10JQKA2", "NEWS10JQKA", "HOUSEQQNEWS"
        ]

        self.logger = Logger().logger
        # Run-wide counters reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0

        # Handed to the Phoenix client as its ``verify_list`` in run().
        self.verify_list = [
            "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_", "STATUS_",
            "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "BANK_NAME_",
            "BANK_CODE_", "CONTENT_", "DATA_SOURCE_", "KEYWORDS_",
            "ENTITY_NAME_", "ID_"
        ]

    def get_data_from_mongo(self, m_client, collection, entity_code, find_id):
        """Query the ``spider_data`` database for one entity code.

        :param m_client: project MongoDB client wrapper; its ``mongo_db`` and
            ``mongo_entity_code`` attributes are set before searching.
        :param collection: collection object passed through to the client.
        :param entity_code: value stored in ``m_client.mongo_entity_code``.
        :param find_id: passed to ``search_from_mongodb`` as ``data_id``.
        :return: whatever ``search_from_mongodb`` returns, or ``None`` when a
            non-timeout error occurred (the error is logged).
        :raises pymongo.errors.ServerSelectionTimeoutError: if the single
            retry after a timeout fails as well.
        """
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code
        try:
            mongo_data_list = m_client.search_from_mongodb(
                collection=collection, data_id=find_id)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second failure propagates.
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(
                collection=collection, data_id=find_id)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """Remove the given ids from MongoDB for one entity code.

        :param m_client: project MongoDB client wrapper.
        :param collection: collection object passed through to the client.
        :param entity_code: value stored in ``m_client.mongo_entity_code``.
        :param remove_id_list: ids handed to ``remove_from_mongo``.
        :return: the result of ``remove_from_mongo``, or ``None`` when a
            non-timeout error occurred (the error is logged).
        :raises pymongo.errors.ServerSelectionTimeoutError: if the single
            retry after a timeout fails as well.
        """
        m_client.mongo_entity_code = entity_code

        try:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            # Immediate single retry.
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except Exception as e:
            self.logger.info(e)
            return None

    def get_brief_from_ai(self, data):
        """Normalise one cleaned record and attach an AI-generated brief.

        The record is modified in place: ``|`` is stripped from ``CONTENT_``,
        ``PUBLISH_TIME_`` is cut to its first 10 characters, ``ID_`` becomes
        ``<ENTITY_CODE_>_<md5 of URL_>`` and ``BRIEF_`` receives the standard
        output of ``python3 AI_PATH <content> 1``.

        :param data: dict with at least ``CONTENT_``, ``PUBLISH_TIME_``,
            ``URL_`` and ``ENTITY_CODE_``.
        :return: the same dict, updated.
        :raises KeyError: if one of the required keys is missing.
        :raises OSError: if the ``python3`` executable cannot be started.
        """
        # Function-scope import: the file's import header is not part of
        # this snippet.
        import subprocess

        data["CONTENT_"] = data["CONTENT_"].replace("|", "")
        if data["PUBLISH_TIME_"]:
            data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"][:10]

        # Row key: entity code plus the md5 of the article URL.
        url_digest = hashlib.md5(data["URL_"].encode("utf-8")).hexdigest()
        data["ID_"] = data["ENTITY_CODE_"] + "_" + url_digest

        # The article text is crawled (untrusted) content, so it is passed as
        # a single argv entry instead of being spliced into a shell command:
        # quotes, backticks or "$(...)" in the text can neither break the
        # command nor be executed. The exit status is deliberately ignored,
        # as it was with the previous os.popen() call.
        completed = subprocess.run(
            ["python3", AI_PATH, data["CONTENT_"], "1"],
            stdout=subprocess.PIPE,
            universal_newlines=True)
        data["BRIEF_"] = completed.stdout

        return data

    def run(self):
        """Migrate every entity code in ``code_list`` and log run totals.

        Connects to Phoenix and MongoDB, processes the entity codes one after
        another, then closes both connections and clears the logger handlers.
        Per-record failures (cleaning, AI, HBase upsert, MongoDB update) are
        logged and skipped; they do not abort the run.
        """
        # Phoenix client and connection.
        p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used for querying and flagging documents.
        m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Table (re)creation is disabled; statement kept for reference:
        # sql = ('create table "NEWS_FINASSIST" ("ID_" varchar primary key, "T"."CONTENT_" varchar, '
        #        '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."TITLE_" varchar, "C"."BRIEF_" varchar, '
        #        '"C"."PUBLISH_TIME_" varchar, "C"."KEYWORDS_" varchar, "C"."URL_" varchar, "C"."DATA_SOURCE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,'
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "F"."STATUS_" varchar)'
        #        'IMMUTABLE_ROWS = true')
        # p_client.drop_table_phoenix(connection=connection)
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        for entity_code in self.code_list:
            # The cleaning module is named after the entity code and must
            # expose data_shuffle().
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            # NOTE(review): CAIJINGNEWS starts from a hard-coded _id,
            # presumably a resume point from an earlier run - confirm.
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)

            if not mongo_data_list:
                continue

            once_count = 0
            # Accumulate so the final summary covers the whole run, not just
            # the last entity code.
            self.find_count += mongo_data_list.count()
            # The cursor is advanced manually so that a server timeout on one
            # fetch can be retried; the range bounds both the number of
            # records per entity code and the number of retries.
            for _ in range(1000000):
                try:
                    data = next(mongo_data_list)
                except pymongo.errors.ServerSelectionTimeoutError:
                    continue
                except StopIteration:
                    break

                data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "running on data_id: {}".format(data_id))

                # Clean the record with the entity-specific module.
                try:
                    del data["_id"]
                    re_data = module_name.data_shuffle([data])
                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue

                        # AI brief (skipped for CNINFONEWS).
                        try:
                            if entity_code != "CNINFONEWS":
                                ai_data = self.get_brief_from_ai(
                                    data=list_data)
                            else:
                                ai_data = list_data
                        except Exception as e:
                            self.logger.info(
                                "AI 调取失败, 错误信息 {}".format(e))
                            # Fall back to the current record without a
                            # brief (not the whole re_data list).
                            ai_data = list_data

                        # Upsert into Phoenix/HBase.
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=ai_data)
                            once_count += success_count
                            self.success_count += success_count
                            if self.success_count % 10 == 0:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数{}条".format(
                                        once_count))
                        except Exception as e:
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            # Do not flag the source document when the
                            # upsert failed.
                            continue

                        # Flag the source document with {"d": 1}.
                        try:
                            update_count = m_client.update_to_mongodb(
                                collection=collection,
                                data_id=data_id,
                                data_dict={"d": 1})
                            self.remove_count += update_count
                            if self.remove_count % 10 == 0:
                                self.logger.info(
                                    "MongoDB 更新成功, 成功条数 {} 条".format(
                                        "10"))
                        except Exception as e:
                            self.logger.warning(
                                "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))

                elif isinstance(re_data, dict):
                    # A single cleaned record: upsert only (no AI brief and
                    # no MongoDB flag on this path).
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {} 条".format(
                                success_count))
                    except Exception as e:
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))

            if once_count > 0:
                self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                    entity_code, once_count))
            mongo_data_list.close()

        # Close connections and report totals.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #8
0
class AllToPhoenix(object):
    """Report duplicate ORGANIZE_FINASSIST documents per bank entity code.

    Two documents count as duplicates when they share the same ``ADDR_`` text
    or, for documents without ``ADDR_``, the same ``CONTENT_`` text.
    """

    def __init__(self):
        self.code_list = [
            "ABCORGANIZE", "BOCOMORGANIZE", "BOCORGANIZE", "CBHBORGANIZE",
            "CCBORGANIZE", "CEBORGANIZE", "CGBORGANIZE", "CIBORGANIZE",
            "CMBCORGANIZE", "CMBORGANIZE", "CZBORGANIZE", "EBCLORGANIZE",
            "ECITICORGANIZE", "HXBORGANIZE", "ICBCORGANIZE", "PABORGANIZE",
            "PSBCORGANIZE", "SPDBORGANIZE"
        ]
        self.logger = Logger().logger
        # Duplicates found for the entity code processed most recently.
        self.count = 0

    def get_data_from_mongo(self, m_client, collection, entity_code):
        """Query MongoDB for all documents of one entity code.

        :param m_client: project MongoDB client wrapper; its
            ``mongo_entity_code`` attribute is set before searching.
        :param collection: collection object passed through to the client.
        :param entity_code: entity code to search for.
        :return: whatever ``search_from_mongodb`` returns, or ``None`` when a
            non-timeout error occurred (the error is logged).
        :raises pymongo.errors.ServerSelectionTimeoutError: if the single
            retry after a timeout fails as well.
        """
        m_client.mongo_entity_code = entity_code
        try:
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second failure propagates.
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except Exception as e:
            # KeyError is a subclass of Exception, so it is handled here too.
            self.logger.info(e)
            return None

    @staticmethod
    def _count_duplicates(records):
        """Return how many records repeat an earlier ADDR_/CONTENT_ text.

        ``ADDR_`` takes precedence over ``CONTENT_``; records with neither key
        are ignored. Both kinds of text share one "seen" set, and only the
        md5 digest of each text is retained.

        :param records: iterable of dict-like documents.
        :return: number of records whose text was already seen.
        """
        seen_digests = set()
        duplicates = 0
        for data in records:
            if "ADDR_" in data:
                text = data["ADDR_"]
            elif "CONTENT_" in data:
                text = data["CONTENT_"]
            else:
                continue
            digest = hashlib.md5(text.encode("utf-8")).hexdigest()
            if digest in seen_digests:
                duplicates += 1
            else:
                seen_digests.add(digest)
        return duplicates

    def run(self):
        """Log, for every entity code, how many duplicate documents exist."""
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        for entity_code in self.code_list:
            self.count = 0
            mongo_data_list = self.get_data_from_mongo(m_client=m_client,
                                                       collection=collection,
                                                       entity_code=entity_code)
            if mongo_data_list:
                self.logger.warning("{} 查取成功".format(entity_code))
                self.logger.warning("当前共有{}条".format(mongo_data_list.count()))
                self.count = self._count_duplicates(mongo_data_list)
            else:
                self.logger.warning("{} 无数据".format(entity_code))
            self.logger.warning("重复数据{}条".format(self.count))

        # Close the connection and detach the log handlers.
        m_client.client_close()
        self.logger.handlers.clear()
Пример #9
0
    def search_title_from_mongodb(self, collection, data_id=None):
        """Find this entity's documents that carry a ``TITLE_`` or ``title`` field.

        A probe ``find_one`` locates a first matching document (restricted to
        ``_id >= ObjectId(data_id)`` when ``data_id`` is given); the returned
        cursor then covers every matching document whose ``_id`` is greater
        than or equal to that document's ``_id``.

        :param collection: collection object queried with ``find_one``/``find``.
        :param data_id: optional id string used as lower ``_id`` bound for the
            probe query.
        :return: a cursor opened with ``no_cursor_timeout=True``, or ``None``
            when nothing matched or a ``TypeError`` was logged.

        ``self.client_close()`` is called on every exit path.
        """
        mon_logger = Logger().logger

        def build_filter(*extra_clauses):
            # Entity + "has a title" clauses, plus any additional clauses.
            clauses = [
                {"ENTITY_CODE_": self.mongo_entity_code},
                {"$or": [
                    {"TITLE_": {"$exists": True}},
                    {"title": {"$exists": True}},
                ]},
            ]
            clauses.extend(extra_clauses)
            return {"$and": clauses}

        try:
            mon_logger.info("开始查取数据")

            # Probe for the first matching document.
            if data_id:
                probe_filter = build_filter(
                    {"_id": {"$gte": ObjectId(data_id)}})
            else:
                probe_filter = build_filter()
            first_doc = collection.find_one(probe_filter)

            if first_doc is None:
                mon_logger.info(
                    "MongoDB 查取数据为空,请检查 ENTITY_CODE_ 是否正确:{}".format(
                        self.mongo_entity_code))
                return None

            # Full query starting at the probed document's _id.
            cursor = collection.find(
                build_filter({"_id": {"$gte": first_doc["_id"]}}),
                no_cursor_timeout=True)
            mon_logger.info("ENTITY: {} 数据查取成功共 {}条".format(
                self.mongo_entity_code, cursor.count()))
            return cursor

        except TypeError as e:
            mon_logger.warning(
                "MongoDB数据查取失败,错误信息为{}, 请检查 ENTITY_CODE_ 是否正确:{}".format(
                    e, self.mongo_entity_code))

        finally:
            self.client_close()
Пример #10
0
class WeiboBasicInfoScript(object):
    """Rebuild the Phoenix table ``WEIBOBASICINFO`` from MongoDB.

    ``run()`` drops and recreates the Phoenix table, reshapes every document
    of the ``WEIBOBASICINFO`` collection with ``data_shuffle`` and upserts the
    result.
    """

    # Bank short code -> bank name, used for BANK_NAME_ and VIRIFIED_.
    BANK_NAME_DICT = {
        "ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行",
        "CCB": "中国建设银行", "BOCOM": "交通银行", "PSBC": "中国邮政储蓄银行",
        "CZB": "浙商银行", "CBHB": "渤海银行", "ECITIC": "中信银行",
        "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行",
        "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行",
        "PAB": "平安银行", "SPDB": "浦发银行", "EBCL": "恒丰银行",
    }

    def __init__(self, entity_type="WEIBOBASICINFO"):
        self.entity_type = entity_type
        self.logger = Logger().logger
        # Handed to the Phoenix client as its ``verify_list`` in run().
        # NOTE(review): "PERIOD_TIME_" and "STATUS_1" do not match the
        # "PERIOD_CODE_" / "STATUS_" keys produced by data_shuffle() and
        # declared in the table - confirm whether this is intentional.
        self.verify_list = [
            "ID_", "BANK_CODE_", "BANK_NAME_", "PERIOD_TIME_", "AREA_CODE_",
            "CREATE_TIME_", "WEIBO_CODE_", "MAIN_URL_", "NAME_", "FOCUS_",
            "FANS_", "COMPANY_URL_", "COMPANY_", "DETAILED_URL_", "VIRIFIED_",
            "BIREF_", "ENTITY_NAME_", "ENTITY_CODE_", "DEALTIME_",
            "PROVINCE_NAME_", "PROVINCE_CODE_", "STATUS_1"
        ]
        # _ids and copies of the source documents that were migrated; they
        # feed the (currently disabled) delete/archive step.
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        self.branch_code_list = list()
        # Run-wide counters reported at the end of run().
        self.find_count = 0
        self.bad_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0

    def match_weibo_code(self, match):
        """Look up ``WEIBO_CODE_`` values in MongoDB for the given match.

        :param match: passed to ``match_from_mongo`` as ``match``.
        :return: the result of ``match_from_mongo``.
        """
        mongo_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = mongo_client.client_to_mongodb()
        collection = mongo_client.get_check_collection(db, collection_list)
        result = mongo_client.match_from_mongo(collection=collection,
                                               match=match,
                                               output="WEIBO_CODE_")
        return result

    def data_shuffle(self, data, province_list):
        """Reshape one MongoDB document into a Phoenix row.

        :param data: source document; must contain ``BANK_CODE_``,
            ``DEALTIME_`` (epoch seconds), ``LOCATION_`` and the Weibo fields
            copied below.
        :param province_list: iterable of dicts with ``NAME_`` and ``CODE_``;
            a province matches when the first two characters of its name
            occur in ``data["LOCATION_"]`` (the last match wins).
        :return: dict keyed by the Phoenix column names.
        :raises KeyError: if a required field is missing or the bank code is
            not in ``BANK_NAME_DICT``.
        """
        re_data = dict()
        prov_n = ""
        prov_c = ""

        # The bank short code is BANK_CODE_ without its last 9 characters.
        bank_code = data["BANK_CODE_"][:-9]
        # The name table used to be referenced without being defined (it only
        # existed as a comment), which failed every record; the caller's
        # province_list is used as passed instead of being replaced by a
        # MySQL query per record.
        bank_name = self.BANK_NAME_DICT[bank_code]

        time_array = time.localtime(int(data["DEALTIME_"]))
        period_time = time.strftime("%Y%m%d", time_array)

        for prov in province_list:
            if prov["NAME_"][:2] in data["LOCATION_"]:
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]

        # Column family "C"
        # TODO row_key: timestamp or year-month-day?
        re_data["ID_"] = data["BANK_CODE_"] + "_" + period_time
        re_data["BANK_CODE_"] = bank_code
        re_data["BANK_NAME_"] = bank_name
        re_data["PERIOD_CODE_"] = period_time
        re_data["AREA_CODE_"] = prov_c
        re_data["CREATE_TIME_"] = period_time

        re_data["WEIBO_CODE_"] = data["WEIBO_CODE_"]
        re_data["MAIN_URL_"] = data["MAIN_URL_"]
        re_data["NAME_"] = data["NAME_"]
        re_data["FOCUS_"] = data["FOCUS_"]
        re_data["FANS_"] = data["FANS_"]
        re_data["COMPANY_URL_"] = data["COMPANY_URL_"]
        # Documents without COMPANY_ fall back to their VIRIFIED_ text.
        if "COMPANY_" not in data:
            re_data["COMPANY_"] = data["VIRIFIED_"]
        else:
            re_data["COMPANY_"] = data["COMPANY_"]
        re_data["DETAILED_URL_"] = data["DETAILED_URL_"]
        re_data["VIRIFIED_"] = bank_name + "股份有限公司"
        re_data["BIREF_"] = data["BIREF_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["ENTITY_CODE_"] = data["BANK_CODE_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["PROVINCE_NAME_"] = prov_n
        re_data["PROVINCE_CODE_"] = prov_c

        re_data["STATUS_"] = ""

        return re_data

    def delete_data_from_mongo(self, m_client, collection, remove_id_list):
        """Remove the given ids from MongoDB.

        :param m_client: project MongoDB client wrapper; its
            ``mongo_entity_code`` is reset to ``None`` first.
        :param collection: collection object passed through to the client.
        :param remove_id_list: ids handed to ``remove_from_mongo``.
        :return: the result of ``remove_from_mongo``, or ``None`` when a
            non-timeout error occurred (the error is logged).
        :raises pymongo.errors.ServerSelectionTimeoutError: if the single
            retry after a timeout fails as well.
        """
        m_client.mongo_entity_code = None

        try:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            # Immediate single retry.
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except Exception as e:
            # KeyError is a subclass of Exception, so it is handled here too.
            self.logger.info(e)
            return None

    def run(self):
        """Drop, recreate and refill the Phoenix table ``WEIBOBASICINFO``.

        WARNING: the existing Phoenix table is dropped before any data is
        read. Records that fail cleaning or the HBase upsert are logged and
        skipped. When the MongoDB query yields nothing, both connections are
        closed and the process exits via ``SystemExit``.
        """
        # Phoenix client and connection.
        p_client = PhoenixHbase(table_name="WEIBOBASICINFO")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used for querying.
        m_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Region codes from MySQL; only the province list is used here.
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None, entity_type=None).area_from_mysql())

        # Drop and recreate the table.
        p_client.drop_table_phoenix(connection=connection)
        sql = (
            'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,'
            '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,'
            '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,'
            '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,'
            '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,'
            '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
            '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,'
            '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        p_client.create_new_table_phoenix(connection=connection, sql=sql)

        self.logger.info("开始进行 WEIBOBASICINFO")

        try:
            mongo_data_list = m_client.all_from_mongodb(collection=collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection=collection)

        if not mongo_data_list:
            # Nothing to migrate: release both connections before exiting
            # instead of leaving them open.
            m_client.client_close()
            p_client.close_client_phoenix(connection=connection)
            raise SystemExit

        # Clean every document and upsert it into HBase.
        self.find_count = mongo_data_list.count()
        for data in mongo_data_list:
            data_id = data["_id"]
            try:
                del data["_id"]
                copy_data = deepcopy(data)
                re_data = self.data_shuffle(data=data,
                                            province_list=province_list)
            except Exception as e:
                self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                    data_id, e))
                # Skip the record; without this the upsert below would run
                # on a record that was never cleaned.
                continue

            # Track the document for the delete/archive step. It is appended
            # last, so an upsert failure can undo it with a cheap pop().
            self.remove_id_list.append(data_id)
            self.copy_mongo_data_list.append(copy_data)

            if not re_data:
                self.bad_count += 1
                continue

            try:
                success_count = p_client.upsert_to_phoenix_by_one(
                    connection=connection, data=re_data)
                self.success_count += success_count
            except Exception as e:
                self.remove_id_list.pop()
                self.copy_mongo_data_list.pop()
                self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(
                    data_id, e))
                continue

        if self.success_count > 0:
            self.logger.info("HBase 插入成功, 成功条数 {}".format(
                self.success_count))

        # NOTE: deleting the migrated documents from MongoDB and archiving
        # them into "spider_data_old" is currently disabled, so remove_count
        # and old_count stay at 0.

        # Close connections and report totals.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #11
0
class MapBarTransfer(object):
    """Clean MapBar records read from MongoDB and upsert them into HBase via Phoenix.

    ``main`` reads documents from the configured Mongo collection, normalises
    the ``PHONE_`` / ``UPDATETIME_`` / ``ADDRESS_`` fields, geocodes the
    address with ``get_lat_lng`` and upserts each record into the Phoenix
    table.  ``check_lat`` copies every row of the ``FANSILE`` Phoenix table
    into ``CHA_BRANCH_MAPBAR``.
    """

    # Bounding box used to sanity-check geocoded coordinates (Beijing).
    # The Shanghai box previously used here was
    # lat 30.7083860773..31.8739003864, lng 120.8778122800..122.1248433443.
    _LAT_RANGE = (39.4498800000, 41.1684980000)
    _LNG_RANGE = (115.4534230000, 117.5461160000)

    def __init__(self,
                 table_name="CHA_BRANCH_MAPBAR",
                 collection_name="mapbar",
                 mongo_host="172.22.69.35",
                 mongo_port=20000):
        """Open the Phoenix and MongoDB connections used by the transfer.

        :param table_name: Phoenix table handed to ``PhoenixHbase``.
        :param collection_name: Mongo collection handed to ``MongoClient``.
        :param mongo_host: MongoDB host (defaults to the previously
            hard-coded address).
        :param mongo_port: MongoDB port (defaults to the previously
            hard-coded port).
        """
        # Phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()
        # MongoDB connection
        self.m_client = MongoClient(mongo_collection=collection_name,
                                    entity_code="MAPBAR_DEATAIL_BJ")
        self.m_client.mongo_host = mongo_host
        self.m_client.mongo_port = mongo_port
        # NOTE(review): pymongo interprets both timeouts in milliseconds, so
        # 60 means 60 ms - confirm this was not meant to be 60 seconds.
        self.m_client.client = pymongo.MongoClient(host=mongo_host,
                                                   port=mongo_port,
                                                   serverSelectionTimeoutMS=60,
                                                   connectTimeoutMS=60,
                                                   connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)
        # Log
        self.logger = Logger().logger
        # Number of rows copied by check_lat()
        self.count = 0

    @staticmethod
    def _normalize_update_time(raw):
        """Return the first ``YYYY年M月D日`` date in *raw* as ``YYYY-MM-DD``.

        Month and day are zero-padded to two digits.  When *raw* contains no
        such date it is returned unchanged.
        """
        match = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日", raw)
        if match is None:
            return raw
        year, month, day = match.groups()
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    def main(self):
        """Clean, geocode and upsert the selected Mongo documents into HBase.

        Records that fail cleaning, geocoding or the upsert are logged and
        skipped; records with an empty ``ADDRESS_`` are skipped silently.
        The Mongo cursor is always closed, even when an error propagates.
        """
        # One-time setup: create the target table.
        # table_sql = (f'create table "{self.p_client.table_name}" ("ID_" varchar primary key,'
        #              '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,'
        #              '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,'
        #              '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar'
        #              ') IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # Fetch the data to process.
        mongo_data_list = self.m_client.search_from_mongodb(
            collection=self.collection,
            field_name="DEALTIME_",
            field_value={"$gt": "1555136656.0579224"},
            data_id="5cb65fac9bb3df61a09c6625")

        count = 0
        try:
            while True:
                # Take one record; retry once after a server-selection timeout.
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(3)
                    try:
                        data = next(mongo_data_list)
                    except StopIteration:
                        # The cursor may simply be exhausted on the retry;
                        # that ends the loop instead of escaping as an error.
                        break

                # Clean the phone number and the update date.
                try:
                    data["PHONE_"] = data["PHONE_"].replace("无,", "")
                    data["UPDATETIME_"] = self._normalize_update_time(
                        data["UPDATETIME_"])
                except Exception as e:
                    self.logger.exception(f"数据清洗出错, _id: {data['_id']}, error {e}")
                    continue

                # Resolve latitude / longitude from the address.
                try:
                    if not data["ADDRESS_"]:
                        continue
                    # Drop the first "|"-separated segment, join the rest.
                    data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
                    location_result = get_lat_lng(address=data["ADDRESS_"])
                    if location_result["status"] == 0:
                        location = location_result["result"]["location"]
                        data["LNG_"] = str(location["lng"])
                        data["LAT_"] = str(location["lat"])
                    else:
                        self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
                except Exception as e:
                    self.logger.exception(
                        f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                    continue

                # Upsert one record into HBase.
                try:
                    re_data = self.__check_lat(data=data)
                    self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                    count += 1
                    if count % 100 == 0:
                        self.logger.info(
                            f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
                except Exception as e:
                    self.logger.exception(
                        f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                    continue
        finally:
            # Close the MongoDB cursor on every exit path.
            mongo_data_list.close()

        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )

    def check_lat(self):
        """Copy every row of the ``FANSILE`` table into ``CHA_BRANCH_MAPBAR``.

        ``self.count`` is incremented per copied row and progress is logged
        every 100 rows.
        """
        # One-time setup: drop and recreate the target table.
        # self.p_client.drop_table_phoenix(connection=self.connection, table_name="CHA_BRANCH_MAPBAR")
        #
        # table_sql = (f'create table "CHA_BRANCH_MAPBAR" ("ID_" varchar primary key,'
        #              '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,'
        #              '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,'
        #              '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar,'
        #              '"C"."CHECK_LAT_" varchar, "C"."CHECK_LNG_" varchar'
        #              ') IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # Read from FANSILE, then point the client at the target table for
        # the upserts below.
        self.p_client.table_name = "FANSILE"
        data_cursor = self.p_client.search_all_from_phoenix(
            connection=self.connection, dict_status=True)
        self.p_client.table_name = "CHA_BRANCH_MAPBAR"

        while True:
            try:
                data = next(data_cursor)
            except StopIteration:
                break

            self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=data)
            self.count += 1
            if self.count % 100 == 0:
                # Phoenix rows are keyed by "ID_"; fall back to it so the
                # progress log cannot abort the copy with a KeyError.
                row_id = data["_id"] if "_id" in data else data.get("ID_")
                self.logger.info(
                    f"HBase 插入成功, _id: {row_id}, 成功条数 {self.count} 条")

    def __check_lat(self, data):
        """Blank out coordinates that fall outside the expected bounding box.

        :param data: record dict; mutated in place and returned.
        :return: *data*.  When ``LAT_`` is missing or empty the record is
            returned untouched.  When either coordinate lies outside
            ``_LAT_RANGE`` / ``_LNG_RANGE`` the original values are moved to
            ``CHECK_LAT_`` / ``CHECK_LNG_`` and ``LAT_`` / ``LNG_`` are set
            to ``""``.
        :raises ValueError: if a non-empty coordinate is not numeric.
        """
        # Nothing to validate without a latitude (missing or empty).
        if "LAT_" not in data or not data["LAT_"]:
            return data

        lat_low, lat_high = self._LAT_RANGE
        lng_low, lng_high = self._LNG_RANGE
        # Latitude is tested first; longitude is only parsed when it passes.
        if (lat_low < float(data["LAT_"]) < lat_high
                and lng_low < float(data["LNG_"]) < lng_high):
            return data

        self.logger.warning(
            f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
        # Keep the suspicious values for review and clear the real columns.
        data["CHECK_LAT_"] = data["LAT_"]
        data["CHECK_LNG_"] = data["LNG_"]
        data["LAT_"] = ""
        data["LNG_"] = ""
        return data