Пример #1
0
    def insert_to_mysql(self, connection, data):
        """
        插入新数据
        :param connection:
        :param data: type => tuple List or dict
        :return:
        """
        mysql_logger = Logger().logger

        if isinstance(data, dict):
            k_list = [key for key in data.keys()]
            v_list = tuple([value for value in data.values()])
            v_sql = str(v_list)
            if v_sql[-2] == ",":
                v_sql = v_sql[:-2] + ")"
            sql = f"INSERT INTO {self.mysql_table} ({','.join(k_list)}) VALUES{v_sql}"
        elif isinstance(data, (list, tuple)):
            k_list = [key for key in data[0].keys()]
            value_list = list()
            for each in data:
                v_list = str(tuple([value for value in each.values()]))
                if v_list[-2] == ",":
                    v_list = v_list[:-2] + ")"
                value_list.append(v_list)
            sql = f"INSERT INTO {self.mysql_table} ({','.join(k_list)}) VALUES"
            sql = sql + ",".join(value_list)
        else:
            raise Exception("not format type of data")
        try:
            mysql_logger.info(f"网络声量sql==>{sql}")
            count = self.cs_commit(connection=connection, sql=sql)
            mysql_logger.info(f"MySQL 插入成功 {count} 条")
        except Exception as e:
            mysql_logger.exception(f"网络声量  插入失败,ERROR: {e}")
Пример #2
0
    def all_to_mongodb(self, collection, insert_list):
        """
        插入数据
        :param collection:
        :param insert_list: list
        :return:
        """
        mon_logger = Logger().logger
        try:
            result = collection.insert_many(insert_list)
            mon_logger.info("MongoDB 数据插入成功, 成功条数 {} 条".format(
                len(result.inserted_ids)))
            return len(result.inserted_ids)

        except TypeError as e:
            mon_logger.info("MongoDB 数据插入失败,错误信息为 {}".format(e))
        except pymongo.errors.ServerSelectionTimeoutError as e:
            mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e))
            result = collection.insert_many(insert_list)
            mon_logger.info("数据插入成功, 成功条数 {} 条".format(len(
                result.inserted_ids)))
            return len(result.inserted_ids)
        except Exception as e:
            mon_logger.info("MongoDB 数据插入失败,错误信息为 {}".format(e))
            return 0
        finally:
            self.client_close()
Пример #3
0
    def match_from_mongo(self, collection, match, output):
        """
        查询所有数据, 返回游标对象(聚合)
        :param collection:
        :param match: match condition like dict {"ENTITY_CODE_": "XXXXXXXXX"}
        :param output: output field like list or dict
        :return:
        """
        mon_logger = Logger().logger
        if isinstance(output, str):
            output = [output]
        try:
            mon_logger.info("MongoDB 开始查取数据")
            output_condition = dict()
            for o in output:
                output_condition[o] = 1
            result = collection.aggregate([{
                "$match": match
            }, {
                "$project": output_condition
            }])
            mon_logger.info("MongoDB 数据查取成功")
            return result

        except TypeError as e:
            mon_logger.error(
                "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确:{}".format(e, match))
            # raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e))

        finally:
            self.client_close()
Пример #4
0
    def match_from_mongo(self, collection, match, output):
        mon_logger = Logger().logger
        try:
            mon_logger.info("开始查取数据")
            result = collection.aggregate([{
                "$match": match
            }, {
                "$project": {
                    "budgetPrice": 1,
                    "_id": 0,
                    output: 1
                }
            }])
            for i in result:
                if i is not None:
                    mon_logger.info("数据查取成功")
                    return i[output]
                else:
                    mon_logger.error("WEIBO_CODE_ 查取数据为空")
                    # raise Exception("WEIBO_CODE_ 查取失败")

        except TypeError as e:
            mon_logger.error(
                "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确:{}".format(e, match))
            raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e))

        finally:
            self.client_close()
Пример #5
0
    def search_from_mysql(self,
                          connection,
                          output=None,
                          where_condition=None,
                          limit_num=None,
                          offset_num=None):
        """
        查询
        :param connection:
        :param output: 输出字段
        :param where_condition: where 条件
        :param limit_num: 输出数量
        :param offset_num: 跳过数量
        :return:
        """
        mysql_logger = Logger().logger
        if output:
            if isinstance(output, str):
                sql = f"SELECT {output} FROM {self.mysql_table}"
            elif isinstance(output, (tuple, list)):
                sql = f"SELECT {','.join(output)} FROM {self.mysql_table}"
            else:
                raise Exception("not format type of \"output\"")
        else:
            sql = f"SELECT * FROM {self.mysql_table}"

        if where_condition:
            if "where" in where_condition or "WHERE" in where_condition:
                sql = sql + " " + where_condition
            else:
                sql = sql + f" WHERE {where_condition}"

        sql = sql + f" LIMIT {limit_num}" if limit_num else sql

        sql = sql + f" OFFSET {offset_num}" if offset_num else sql

        try:
            cs = connection.cursor(pymysql.cursors.DictCursor)
            count = cs.execute(sql)
            result = cs.fetchall()
            if count:
                mysql_logger.info(f"Mysql 查取成功 {count} 条")
                return result
            else:
                mysql_logger.info("数据库查取数为0")
        except TypeError:
            mysql_logger.error("MySQL查取失败,请检查")
        finally:
            cs.close()
Пример #6
0
    def get_mongo_column_dict(self, collection, column1, column2):
        mon_logger = Logger().logger
        try:
            mon_logger.info("开始查取数据")
            result = collection.aggregate([{
                "$project": {
                    "_id": 0,
                    column1: 1,
                    column2: 1
                }
            }])
            return result
        except TypeError as e:
            mon_logger.error(
                "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确".format(e))
            raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e))

        finally:
            self.m_client.client.close()
Пример #7
0
    def delete_from_mysql(self, connection, where_condition):
        """
        删除
        :param connection:
        :param where_condition: where 条件
        :return:
        """
        mysql_logger = Logger().logger

        if "where" in where_condition or "WHERE" in where_condition:
            sql = f"DELETE FROM {self.mysql_table} {where_condition}"
        else:
            sql = f"DELETE FROM {self.mysql_table} WHERE {where_condition}"

        try:
            count = self.cs_commit(connection=connection, sql=sql)
            mysql_logger.info(f"MySQL 删除成功 {count} 条")
        except Exception as e:
            mysql_logger.exception(f"MySQL 删除失败,ERROR: {e}")
Пример #8
0
 def client_to_mysql(self):
     '''
     :return: 返回mysql连接
     '''
     mysql_logger = Logger().logger
     try:
         mysql_logger.info("正在连接MySQL({}@{}:{})".format(
             self.mysql_user, self.mysql_host, self.mysql_port))
         connection = pymysql.connect(**self.mysql_config)
         mysql_logger.info("Mysql连接成功({}@{}:{})".format(
             self.mysql_user, self.mysql_host, self.mysql_port))
         return connection
     except pymysql.err.OperationalError as e:
         for retry_count in range(2, 7):
             try:
                 mysql_logger.warning(
                     "MySQL连接失败,正在重试第{}次连接".format(retry_count))
                 connection = pymysql.connect(**self.mysql_config)
                 mysql_logger.info("Mysql连接成功")
                 return connection
             except Exception as e:
                 mysql_logger.warning("第{}次连接MySQL失败".format(retry_count))
                 # print(retry_count)
                 if retry_count == 6:
                     mysql_logger.error("MySQL连接失败,错误信息为{}".format(e))
Пример #9
0
 def client_to_mongodb(self):
     mon_logger = Logger().logger
     mon_logger.info("开始连接MongoDB({}:{}),database={}".format(
         self.mongo_host, self.mongo_port, self.mongo_database))
     try:
         collection_list = self.db.collection_names()
         mon_logger.info("MongoDB({}:{})连接成功".format(
             self.mongo_host, self.mongo_port))
         return collection_list
     except pymongo.errors.ServerSelectionTimeoutError as e:
         mon_logger.warning("MongoDB({}:{})连接失败".format(
             self.mongo_host, self.mongo_port))
         for i in range(2, 6):
             try:
                 collection_list = self.db.collection_names()
                 mon_logger.info("MongoDB({}:{})连接成功".format(
                     self.mongo_host, self.mongo_port))
                 return collection_list
             except Exception:
                 mon_logger.warning("MongoDB({}:{})第{}次连接失败".format(
                     self.mongo_host, self.mongo_port, i))
             if i == 5:
                 mon_logger.error(
                     "MongoDB连接失败,错误信息为: {}, 请检查各项参数是否正确host={}, port={},database={}"
                     .format(e, self.mongo_host, self.mongo_port,
                             self.mongo_database))
                 self.client_close()
Пример #10
0
    def search_by_status(self, collection, data_id=None):
        mon_logger = Logger().logger
        try:
            mon_logger.info("开始查取数据")
            if data_id:
                find_id = ObjectId(data_id)
                result_one = collection.find_one({
                    "$and": [{
                        "ENTITY_CODE_": self.mongo_entity_code
                    }, {
                        "_id": {
                            "$gte": find_id
                        }
                    }, {
                        "d": {
                            "$exists": False
                        }
                    }]
                })
            else:
                result_one = collection.find_one({
                    "$and": [{
                        "ENTITY_CODE_": self.mongo_entity_code
                    }, {
                        "d": {
                            "$exists": False
                        }
                    }]
                })
            if result_one is not None:
                result = collection.find(
                    {
                        "$and": [{
                            "ENTITY_CODE_": self.mongo_entity_code
                        }, {
                            "_id": {
                                "$gte": result_one["_id"]
                            }
                        }, {
                            "d": {
                                "$exists": False
                            }
                        }]
                    },
                    no_cursor_timeout=True)

                mon_logger.info("ENTITY: {} 数据查取成功共 {}条".format(
                    result.count()))
                return result
            else:
                mon_logger.info("ENTITY: {} 数据查取为空".format(
                    self.mongo_entity_code))
                return None
        except TypeError as e:
            mon_logger.error(
                "MongoDB数据查取失败,错误信息为{}, 请检查 ENTITY_CODE_ 是否正确:{}".format(
                    e, self.mongo_entity_code))
        finally:
            self.client_close()
Пример #11
0
    def update_to_mysql(self, connection, data, where_condition):
        """
        更新数据
        :param connection:
        :param data:
        :param where_condition: where 条件
        :return:
        """
        mysql_logger = Logger().logger

        set_list = list()
        for key, value in data.items():
            set_list.append(f"{key} = \'{value}\'")

        if "where" in where_condition or "WHERE" in where_condition:
            sql = f"UPDATE {self.mysql_table} SET {','.join(set_list)} {where_condition}"
        else:
            sql = f"UPDATE {self.mysql_table} SET {','.join(set_list)} WHERE {where_condition}"

        try:
            count = self.cs_commit(connection=connection, sql=sql)
            mysql_logger.info(f"MySQL 更新成功 {count} 条")
        except Exception as e:
            mysql_logger.exception(f"MySQL 更新失败,ERROR: {e}")
Пример #12
0
class FundScript(object):
    def __init__(self):
        self.code_list = [
            "STCNFUND", "ABCFUND", "CCBFUND", "CITICFUND", "ICBCFUND"
        ]
        self.logger = Logger().logger
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.copy_mongo_data_list = list()
        self.remove_id_list = list()
        self.branch_code_list = list()

        # 基金
        self.verify_list = [
            "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_", "STATUS_",
            "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_",
            "FUND_NEW_VALUE_", "TOTAL_NEW_VALUE_", "FUND_OLD_VALUE_",
            "TOTAL_OLD_VALUE_", "DAILY_RATE_", "YEAR_REWARD_", "SUBS_STATUS_",
            "ATONEM_STATUS_", "TYPE_", "ID_", "NEWEST_VALUE_", "TOTAL_VALUE_",
            "POPULARITY_", "RATING_", "OLD_VALUE_", "UNIT_VALUE_", "SCALE_",
            "ESTABLISH_DATE_", "RISK_LEVEL_", "BASE_INFO_", "YIELD_",
            "INVEST_", "MONTH_RATE_", "QUARTER_RATE_", "HALF_YEAR_RATE_",
            "HISTORY_RATE_", "FUND_STATUS_", "COMPANY_", "SUBS_STATUS_CODE_",
            "TYPE_CODE_"
        ]

    # 从 MongoDB 获取数据
    def get_data_from_mongo(self, m_client, collection, entity_code):
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code

        try:
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    # 从 MongoDB 删除数据
    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        m_client.mongo_entity_code = entity_code

        try:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            mongo_data_list = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

        # 网点 CODE_
        hash_m = hashlib.md5()
        hash_m.update(re_data["ADDR_"].encode("utf-8"))
        hash_addr_ = hash_m.hexdigest()
        re_data["CODE_"] = re_data["BANK_CODE_"] + "_" + re_data[
            "AREA_CODE_"] + "_" + hash_addr_
        # for i in range(1, 10000):
        #         branch_code = "ABC" + "_" + re_data["AREA_CODE_"] + "_" + "00000"
        #         branch_code = branch_code[:len(branch_code)-len(str(i))] + "{}".format(i)
        #         if branch_code in branch_code_list:
        #             continue
        #         else:
        #             branch_code_list.append(branch_code)
        #             break
        return re_data

    def gaode_get_lat_lng(self, address):
        url = URL_FOR_LAT_LNG + "?" + "key=" + AK + "&address=" + address
        # url = url + "?location={}&output=json&pois=1&ak={}".format(address, ak)
        response = requests.get(url)
        temp = json.loads(response.content)
        response.close()
        return temp['geocodes'][0]['location']

    def dict_from_mysql(self, dict_code):
        # 创建 MySQL 对象
        mysql_config = {
            "host": "172.22.67.25",
            "port": 3306,
            "database": "chabei",
            "user": "******",
            "password": "******",
            "table": "sys_dict_item"
        }

        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()
        result = mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'{}\'"
            .format(dict_code),
            connection=mysql_connection)

        mysql_client.close_client(connection=mysql_connection)
        return result

    # 主函数
    def run(self):
        count = 0
        # # 创建 Phoenix 对象-注意表格名字
        p_client = PhoenixHbase(table_name="FUND")
        p_client.verify_list = self.verify_list
        # # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        #查询省市区的编码列表
        # script = GenericScript(entity_code="ICBCFUND", entity_type="JSFUND_CCBDATA")
        # province_list, city_list, area_list, dir_area_list = script.area_from_mysql()
        list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
        list_TYPE = self.dict_from_mysql("FUND_TYPE")

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # # 基金表创建语句
        # sql = ('create table "FUND" ("ID_" varchar primary key,"C"."ENTITY_CODE_" varchar,"C"."AREA_CODE_" varchar,'
        #     '"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,"C"."UNIT_CODE_" varchar,"C"."PERIOD_CODE_" varchar,"C"."REMARK_" varchar,'
        #     '"C"."CREATE_TIME_" varchar,"C"."UPDATE_TIME_" varchar,"C"."STATUS_" varchar,"C"."CODE_" varchar,"C"."NAME_" varchar,'
        #     '"C"."FUND_OLD_VALUE_" varchar,"C"."TOTAL_OLD_VALUE_" varchar,"C"."FUND_NEW_VALUE_" varchar,"C"."TOTAL_NEW_VALUE_" varchar,'
        #     '"C"."INVEST_PERIOD_" varchar,"C"."DAILY_RATE_" varchar,"C"."YEAR_REWARD_" varchar,"C"."SUBS_STATUS_" varchar,'
        #     '"C"."ATONEM_STATUS_" varchar,"C"."TYPE_" varchar,"C"."NEWEST_VALUE_" varchar,"C"."TOTAL_VALUE_" varchar,'
        #     '"C"."POPULARITY_" varchar,"C"."RATING_" varchar,"C"."ENTITY_NAME_" varchar,"C"."OLD_VALUE_" varchar,'
        #     '"C"."UNIT_VALUE_" varchar,"C"."SCALE_" varchar,"C"."ESTABLISH_DATE_" varchar,"C"."RISK_LEVEL_" varchar,'
        #     '"C"."BASE_INFO_" varchar,"C"."YIELD_" varchar,"C"."INVEST_" varchar,"C"."MONTH_RATE_" varchar,'
        #     '"C"."QUARTER_RATE_" varchar,"C"."HALF_YEAR_RATE_" varchar,"C"."URL_" varchar,"C"."HISTORY_RATE_" varchar,'
        #     '"C"."FUND_STATUS_" varchar,"C"."COMPANY_" varchar,"C"."SUBS_STATUS_CODE_" varchar,"C"."TYPE_CODE_" varchar)IMMUTABLE_ROWS = true')
        #
        # # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 遍历 ENTITY_CODE_ 列表
        # self.code_list = ["ABCORGANIZE"]
        for entity_code in self.code_list:
            status = False
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count = mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    copy_data = {}
                    self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        copy_data = deepcopy(data)
                        self.copy_mongo_data_list.append(copy_data)
                        # re_data = module_name.data_shuffle(data_list,province_list, city_list, area_list,list_SUBS_STATUS,list_TYPE)
                        re_data = module_name.data_shuffle(
                            data, list_SUBS_STATUS, list_TYPE)
                        # re_data = module_name.data_shuffle(data_list)

                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        # except jpype._jexception.SQLExceptionPyRaisable as e:
                        # except org.apache.phoenix.exception.BatchUpdateExecution as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            # try:
                            #     area_data = self.shuffle_for_area(list_data)
                            # except Exception as e:
                            #     self.remove_id_list.remove(data_id)
                            #     self.copy_mongo_data_list.remove(copy_data)
                            #     self.logger.warning("_id:{} 获取经纬度失败, {}".format(data_id, e))
                            #     continue
                            # except ValueError:
                            #     pass
                            # phoenix_HBase 插入数据
                            if list_data:
                                try:
                                    count += 1
                                    print(count)
                                    # print(list_data)
                                    success_count = p_client.upsert_to_phoenix_by_one(
                                        connection=connection, data=list_data)
                                    # 导出csv
                                    # pd.DataFrame(area_data).to_csv("E:\\NEWS_CLEAN_\\" + module_name+ ".csv")
                                    once_count += success_count
                                    self.success_count += success_count
                                    self.logger.info(
                                        "HBase 插入成功, 成功条数 {} 条".format(
                                            success_count))
                                    if self.success_count % 50 == 0:
                                        update_count = m_client.update_to_mongodb(
                                            collection=collection,
                                            data_id=self.remove_id_list,
                                            data_dict={"d": 1})

                                        self.remove_count += update_count
                                        self.logger.info("MongoDB 更新成功")
                                except Exception as e:
                                    self.remove_id_list.remove(data_id)
                                    self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.warning(
                                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue
                    elif isinstance(re_data, dict):
                        # try:
                        # area_data = self.shuffle_for_area(re_data)
                        # except Exception as e:
                        #     self.remove_id_list.remove(data_id)
                        #     self.copy_mongo_data_list.remove(copy_data)
                        #     self.logger.warning("_id: {}获取经纬度失败, {}".format(data_id, e))
                        #     continue
                        # phoenix_HBase 插入数据
                        if re_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=re_data)
                                once_count += success_count
                                self.success_count += success_count
                                if self.success_count % 100 == 0:
                                    self.logger.info(
                                        "HBase 插入成功, 成功条数 {} 条".format(
                                            self.success_count))
                                # 添加 {d:1}
                                if self.success_count % 50 == 0:
                                    update_count = m_client.update_to_mongodb(
                                        collection=collection,
                                        data_id=self.remove_id_list,
                                        data_dict={"d": 1})

                                    self.remove_count += update_count
                                    self.logger.info("MongoDB 更新成功")
                            except Exception as e:
                                self.remove_id_list.remove(data_id)
                                self.copy_mongo_data_list.remove(copy_data)
                                self.logger.warning(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #13
0
class WechatScript(object):
    def __init__(self, entity_type="WECHAT"):
        """
        初始化参数
        :param entity_type: WECHAT
        """
        self.entity_type = entity_type
        self.logger = Logger().logger

        # 创建 Phoenix 对象
        self.p_client = PhoenixHbase(table_name=self.entity_type)
        # 连接 Phoenix
        self.connection = self.p_client.connect_to_phoenix()
        # 创建 MongoDB 对象
        self.m_client = MongoClient(mongo_collection="WECHAT")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # # 创建 MongoDB spider_data_old 数据库对象
        # self.old_client = MongoClient(mongo_collection="WECHAT")
        # # 本地测试
        # self.old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60,
        #                                              connectTimeoutMS=60, connect=False)
        # self.old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = self.old_client.client_to_mongodb()
        # self.collection_old = db_old["WECHAT"]

        # 创建 MySQL 对象
        self.mysql_client = GenericScript(entity_code=None, entity_type=None)

        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        self.verify_list = [
            "ID_", "ENTITY_CODE_", "URL_", "AREA_CODE_", "BANK_CODE_",
            "BANK_NAME_", "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_",
            "CONTENT_TYPE_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_",
            "TITLE_", "ENTITY_NAME_", "DEALTIME_", "DATETIME_", "STATUS_",
            "WECHAT_NAME_", "WECHAT_ID_"
        ]

        # BANK_NAME_ 字典
        self.name_dict = {
            "ICBC": "中国工商银行",
            "ABC": "中国农业银行",
            "BOC": "中国银行",
            "CCB": "中国建设银行",
            "BOCOM": "交通银行",
            "PSBC": "中国邮政储蓄银行",
            "CZB": "浙商银行",
            "CBHB": "渤海银行",
            "ECITIC": "中信银行",
            "CEB": "中国光大银行",
            "HXB": "华夏银行",
            "CMBC": "中国民生银行",
            "CMB": "招商银行",
            "CIB": "兴业银行",
            "CGB": "广发银行",
            "PAB": "平安银行",
            "SPDB": "浦发银行",
            "EBCL": "恒丰银行"
        }

        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Host":
            "weixin.sogou.com",
            "Referer":
            "http://weixin.sogou.com/"
        }

        self.url = "http://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_="

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def check_name(self, wechat_id):
        check_dict = dict()
        with open("wechat_id_name.txt", "r", encoding="utf-8") as rf:
            read_data = rf.read()
            if read_data:
                read_data = read_data.replace("\'", "\"")
                read_data = read_data.replace(": None", ": \"None\"")
                # print(read_data)
                check_dict = json.loads(read_data)
            else:
                wechat_name = self.req_for_name(wechat_id)
                check_dict[wechat_id] = wechat_name
                with open("wechat_id_name.txt", "w", encoding="utf-8") as wf:
                    wf.write(str(check_dict))
                return check_dict[wechat_id]
        if wechat_id in check_dict:
            return check_dict[wechat_id]
        else:
            wechat_name = self.req_for_name(wechat_id)
            check_dict[wechat_id] = wechat_name
            with open("wechat_id_name.txt", "w", encoding="utf-8") as wf:
                wf.write(str(check_dict))
            return check_dict[wechat_id]

    def req_for_name(self, wechat_id):
        url = self.url.format(wechat_id)
        # response = WanDou().http_client(url=url, param=self.headers)
        resp1 = requests.get(
            url=
            r"http://h.wandouip.com/get/ip-list?pack=853&num=1&xy=1&type=2&lb=\r\n&mr=1&"
        )
        resp2 = resp1.json()["data"][0]
        # print(resp2)
        # resp1.close()
        time.sleep(2)
        try:
            response = requests.get(
                url=url,
                headers=self.headers,
                proxies={"http": "{}:{}".format(resp2["ip"], resp2["port"])})
        except Exception as e:
            print(1, e)
            self.logger.info("error ip: {}".format(resp2))
            time.sleep(5)
            return self.req_for_name(wechat_id)

        html = HTML(response.content.decode())
        # response.close()
        name = html.xpath('//p[@class="tit"]/a/text()')
        if name:
            # print(name)
            self.error_count = 0
            return name[0]
        else:
            self.error_count += 1
            if self.error_count == 5:
                self.logger.info("wetchat id error: \"{}\"".format(wechat_id))
                return "None"
            else:
                time.sleep(2)
                self.req_for_name(wechat_id)

        # if response is None:
        #     self.logger.info("ip_prox error")
        #     return self.req_for_name(wechat_id)

        # if isinstance(response, str):
        #     html = HTML(response)
        #     name = html.xpath('//p[@class="tit"]/a/text()')
        #     if name:
        #         print(name)
        #         return name[0]
        #     else:
        #         self.logger.info("ip_prox error2")
        #         return self.req_for_name(wechat_id)
        # else:
        #     self.logger.info("ip_prox error change")
        #     return self.req_for_name(wechat_id)

    def data_shuffle(self, data, province_list, city_list, area_list):
        """
        数据清洗
        :param data:
        :param province_list:
        :param city_list:
        :param area_list:
        :return: re_data or None
        """
        # BANK_CODE_正则匹配规则
        pattern = re.compile(
            r'ICBC|ABC|BOCOM|CCB|BOC|PSBC|CZB|CBHB|ECITIC|CEB|HXB|CMBC|CMB|CIB|CGB|PAB|SPDB|EBCL'
        )

        re_data = dict()

        if data["TITLE_"]:
            # HBase row_key
            hash_m = hashlib.md5()
            hash_m.update(data["TITLE_"].encode("utf-8"))
            hash_title = hash_m.hexdigest()
            row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

            # "C" 通用列族字段
            re_data["ID_"] = row_key
            re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
            # re_data["URL_"] = ""

            prov_c = None
            prov_n = None
            city_c = None
            city_n = None
            area_c = None
            area_n = None
            bank_n = None

            bank_c = pattern.match(data["ENTITY_CODE_"])
            if bank_c:
                re_data["BANK_CODE_"] = bank_c.group()
            else:
                return None

            # 正则去除银行名称,方便匹配地区编码
            bank_n = re.sub(
                r"{}银?行?|微信|[总分支]行".format(
                    self.name_dict[re_data["BANK_CODE_"]][:-2]), "",
                data["ENTITY_NAME_"])

            re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]]
            re_data["PERIOD_CODE_"] = data["PERIOD_CODE_"].replace("-", "")
            re_data["NOTICE_TIME_"] = data["PERIOD_CODE_"]
            re_data["STATUS_"] = "1"
            re_data["CONTENT_"] = data["CONTENT_"]
            re_data["REMARK_"] = ""

            for area in area_list:
                if area["NAME_"] in bank_n:
                    area_c = area["CODE_"]
                    area_n = area["NAME_"]
            if area_c:
                pass
            else:
                for prov in province_list:
                    if prov["NAME_"] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n, "")
                        break
                    elif prov["NAME_"][:-1] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:-1], "")
                        break
                    elif prov["NAME_"][:4] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:4], "")
                        break
                    elif prov["NAME_"][:3] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:3], "")
                        break
                    elif prov["NAME_"][:2] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:2], "")
                        break

                for city in city_list:
                    if len(city["NAME_"]) == 1:
                        continue
                    if prov_c:
                        if city["CODE_"][:2] == prov_c[:2]:
                            if city["NAME_"] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n, "")
                                break
                            elif city["NAME_"][:-1] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:-1], "")
                                break
                            elif city["NAME_"][:4] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:4], "")
                                break
                            elif city["NAME_"][:3] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:3], "")
                                break
                            elif city["NAME_"][:2] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:2], "")
                                break
                    else:
                        if city["NAME_"] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n, "")
                            break
                        elif city["NAME_"][:-1] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:-1], "")
                            break
                        elif city["NAME_"][:4] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:4], "")
                            break
                        elif city["NAME_"][:3] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:3], "")
                            break
                        elif city["NAME_"][:2] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:2], "")
                            break

                for area in area_list:
                    if city_c:
                        if area["CODE_"][:2] == city_c[:2]:
                            if area["NAME_"] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:-1] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:4] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:3] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:2] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                    elif prov_c:
                        if area["CODE_"][:2] == prov_c[:2]:
                            if area["NAME_"] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:-1] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:4] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:3] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:2] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                    else:
                        if area["NAME_"][:-1] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break
                        elif area["NAME_"][:4] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break
                        elif area["NAME_"][:3] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break
                        elif area["NAME_"][:2] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break

                # 特殊情况 星子县现为庐山市 喻家山位于武汉洪山区
                if "星子县" in data["ENTITY_NAME_"]:
                    area_c = "360483"
                    area_n = "庐山市"
                elif "喻家山" in data["ENTITY_NAME_"]:
                    area_c = "420111"
                    area_n = "洪山区"
                elif "江南西" in data["ENTITY_NAME_"]:
                    area_c = "440105"
                    area_n = "海珠区"
                elif "两路口" in data["ENTITY_NAME_"]:
                    area_c = "500103"
                    area_n = "渝中区"
                elif "大兴安岭" in data["ENTITY_NAME_"]:
                    area_c = "232700"
                    area_n = "大兴安岭地区"
                elif "张家港" in data["ENTITY_NAME_"]:
                    area_c = "320582"
                    area_n = "张家港市"
                elif "兴业银行新阳支行" in data["ENTITY_NAME_"]:
                    area_c = "230102"
                    area_n = "道里区"

                if area_c:
                    pass
                elif (not area_c) and city_c:
                    area_c = city_c
                    area_n = city_n
                elif (not area_c) and (not city_c) and prov_c:
                    area_c = prov_c
                    area_n = prov_n
                # 总行地区处理
                elif (not area_c) and (not city_c) and (not prov_c):
                    if re_data["BANK_CODE_"] == "ICBC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "ABC":
                        area_c = "110101"
                        area_n = "东城区"
                    elif re_data["BANK_CODE_"] == "BOCOM":
                        area_c = "310115"
                        area_n = "浦东新区"
                    elif re_data["BANK_CODE_"] == "CCB":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "BOC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "PSBC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "CZB":
                        area_c = "330103"
                        area_n = "下城区"
                    elif re_data["BANK_CODE_"] == "CBHB":
                        area_c = "120103"
                        area_n = "河西区"
                    elif re_data["BANK_CODE_"] == "ECITIC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "CEB":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "HXB":
                        area_c = "110101"
                        area_n = "东城区"
                    elif re_data["BANK_CODE_"] == "CMBC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "CMB":
                        area_c = "440304"
                        area_n = "福田区"
                    elif re_data["BANK_CODE_"] == "CIB":
                        area_c = "350102"
                        area_n = "鼓楼区"
                    elif re_data["BANK_CODE_"] == "CGB":
                        area_c = "440104"
                        area_n = "越秀区"
                    elif re_data["BANK_CODE_"] == "PAB":
                        area_c = "440303"
                        area_n = "罗湖区"
                    elif re_data["BANK_CODE_"] == "SPDB":
                        area_c = "310101"
                        area_n = "黄浦区"
                    elif re_data["BANK_CODE_"] == "EBCL":
                        area_c = "370602"
                        area_n = "芝罘区"
            re_data["AREA_CODE_"] = area_c

            if area_c:
                re_data["UNIT_CODE_"] = re_data[
                    "BANK_CODE_"] + "_" + area_c[:4] + "00"
            if ("b" in data["BANK_NAME_"]) or ("B" in data["BANK_NAME_"]):
                return None

            if "DATETIME_" not in data:
                time_array = time.localtime(int(float(data["DEALTIME_"])))
                value_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
                re_data["CREATE_TIME_"] = value_time
            else:
                re_data["CREATE_TIME_"] = data["DATETIME_"]

            # data["UPDATE_TIME_"] = ""

            re_data["TITLE_"] = data["TITLE_"]
            re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"]
            re_data["WECHAT_ID_"] = data["WECHAT_"].replace(" ", "")
            re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
            re_data["DEALTIME_"] = str(data["DEALTIME_"])

            # print(area_c, area_n, data["ENTITY_NAME_"])
            return re_data
        else:
            return None

    def delete_data_from_mongo(self):
        """
        从 MongoDB 删除数据
        :return: delete_count
        """
        try:
            remove_count = self.m_client.remove_from_mongo(
                collection=self.collection, remove_id_list=self.remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            mongo_data_list = self.m_client.remove_from_mongo(
                collection=self.collection, remove_id_list=self.remove_id_list)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def upsert_and_delete(self, mongo_data_list, province_list, city_list,
                          area_list):
        """
        插入和删除
        :param mongo_data_list:
        :param province_list:
        :param city_list:
        :param area_list:
        :return:
        """
        for i in range(1000000):
            status = False
            self.data_id = ""
            success_count = 0
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            # print(data["_id"])
            # self.remove_id_list.append(self.data_id)
            # del data["_id"]
            # copy_data = deepcopy(data)
            # self.copy_mongo_data_list.append(copy_data)

            # 清洗数据
            try:
                re_data = self.data_shuffle(data=data,
                                            province_list=province_list,
                                            city_list=city_list,
                                            area_list=area_list)
            except Exception as e:
                # self.remove_id_list.remove(self.data_id)
                # self.copy_mongo_data_list.remove(copy_data)
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if re_data:
                # 获取公众号名称
                # try:
                # print(re_data["WECHAT_ID_"])
                re_data["WECHAT_NAME_"] = self.check_name(
                    re_data["WECHAT_ID_"])

                # re_data["WECHAT_NAME_"] = self.req_for_name(re_data["WECHAT_ID_"])
                # print(re_data["WECHAT_ID_"])
                # print(re_data["WECHAT_NAME_"])
                # except Exception as e:
                # 向 HBase 插入数据
                try:
                    count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                    success_count += count
                except jaydebeapi.DatabaseError as e:
                    # self.logger.info("error: {}".format(e))
                    # self.remove_id_list.remove(self.data_id)
                    # self.copy_mongo_data_list.remove(copy_data)
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                    # # Phoenix 连接关闭
                    # p_client.close_client_phoenix(connection=connection)
                    # time.sleep(10)
                    # # 连接 Phoenix
                    # connection = p_client.connect_to_phoenix()
                    # # 向 HBase 插入数据
                    # count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data)
                    # success_count += count

                # try:
                #     # 添加 {d:1}
                #     update_count = self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id,
                #                                                    data_dict={"d": 1})
                #     self.remove_count += update_count
                #     # self.logger.info("MongoDB 更新成功")
                #     if self.remove_count % 10 == 0:
                #         self.logger.info("MongoDB 更新成功, 成功条数 {} 条".format("10"))
                # except Exception as e:
                #     # self.remove_id_list.remove(data_id)
                #     # self.copy_mongo_data_list.remove(copy_data)
                #     self.logger.warning("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e))
                #     continue

                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                # self.remove_id_list.remove(self.data_id)
                # self.copy_mongo_data_list.remove(copy_data)
                continue

            # # 删除数据
            # if status:
            #     delete_count = self.delete_data_from_mongo()
            #     self.remove_count += delete_count
            #     self.logger.info("MongoDB 删除成功")
            # else:
            #     self.logger.info("HBase 插入成功条数0条, 不执行删除")
            #
            # # 将数据插入 spider_data_old 中
            # if status:
            #     try:
            #         self.old_client.mongo_db = "spider_data_old"
            #         insert_count = self.old_client.all_to_mongodb(collection=self.collection_old,
            #                                                       insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except pymongo.errors.ServerSelectionTimeoutError as e:
            #         time.sleep(1)
            #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
            #         insert_count = self.old_client.all_to_mongodb(collection=self.collection_old,
            #                                                       insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except Exception as e:
            #         self.logger.info(e)

    def main(self):
        """

        :return:
        """
        # # 删除表
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()

        # # 建表语句
        # table_sql = ('create table "WECHAT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar,'
        #              '"C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar,'
        #              '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, '
        #              '"T"."CONTENT_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."TITLE_" varchar,'
        #              '"C"."WECHAT_ID_" varchar, "C"."WECHAT_NAME_" varchar, "C"."ENTITY_NAME_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."PRAISES_" varchar,'
        #              '"C"."READ_NUM_" varchar, "C"."REPLIES_" varchar, "C"."RELAYS_" varchar,'
        #              '"C"."NOTICE_TIME_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # 创建表
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # f_id = "5c1267258d7fee59f7d089f8"  # gte 10M
        # f_id = "5c1271a28d7fee66df0fdd83"  # gte 10M
        # f_id = "5c127e7b9bb3df7412b53b04"  # gte 10M
        # f_id = "5c1330d28d7fee4d9c87d6e1"  # gte 10M
        # f_id = "5c1330ed9bb3df2de33bb746"  # gte 10M
        # f_id = "5c13490a8d7fee79f1d9e87f"  # gte 10M
        # f_id = "5c1350ee8d7fee2d29b601ef"  # gte 10M
        # f_id = "5c1351c79bb3df0e23ee68c1"  # gte 10M
        # f_id = "5c13547d9bb3df06d41997d5"  # gte 10M
        # f_id = "5c1354849bb3df202508ee3e"  # gte 10M
        # f_id = "5c1354bd8d7fee44b881b11a"  # gte 10M
        # f_id = "5c1354e89bb3df1b2a6ef59c"  # gte 10M
        # f_id = "5c1355139bb3df197beb11c0"  # gte 10M
        # f_id = "5c1355328d7fee2f0997a3ac"  # gte 10M
        # f_id = "5c13558e8d7fee50ea04bd0a"  # gte 10M
        # f_id = "5c135a5f8d7fee5bf7db91b8"  # gte 10M
        # f_id = "5c135b0c8d7fee697fa5bd80"  # gte 10M
        # f_id = "5c135bd59bb3df4d7aa66cad"  # gte 10M
        # f_id = "5c135bdb9bb3df454c0157a3"  # gte 10M
        # f_id = "5c135bfc8d7fee73c8f84567"  # gte 10M
        # f_id = "5c135c119bb3df48aeb8fe63"  # gte 10M
        # f_id = "5c135dfe9bb3df4d7aa66cc2"  # gte 10M
        # f_id = "5c13602d8d7fee7f7a48c485"  # gte 10M
        # f_id = "5c1361858d7fee223825f805"  # gte 10M
        # f_id = "5c1361d68d7fee561806fc4d"  # gte 10M
        # f_id = "5c1362068d7fee223825f808"  # gte 10M
        # f_id = "5c1362159bb3df26bba60a05"  # gte 10M
        # f_id = "5c1366248d7fee6741adb5be"  # gte 10M
        # f_id = "5c1366418d7fee673f6c95cb"  # gte 10M
        # f_id = "5c1367099bb3df5a0e013c4d"  # gte 10M
        # f_id = "5c13686d8d7fee76ac78735b"  # gte 10M
        # f_id = "5c1368788d7fee6fcb24daa3"  # gte 10M
        # f_id = "5c1369438d7fee63412b04ff"  # gte 10M
        # f_id = "5c13697b9bb3df60429b5d31"  # gte 10M
        # f_id = "5c1389468d7fee6a94c413c3"  # gte 10M
        # f_id = "5c1389c29bb3df75adc8861a"  # gte 10M
        # f_id = "5c138b039bb3df75adc88620"  # gte 10M
        # f_id = "5c138e3d9bb3df074c4ec0b3"  # gte 10M
        # f_id = "5c138e4d8d7fee06a4f8fd59"  # gte 10M
        # f_id = "5c1391318d7fee168749a96e"  # gte 10M
        # f_id = "5c25a4f19bb3df51eba386b8"  # gte 10M
        # f_id = "5c2601ef9bb3df7d42fe2084"  # gte 10M
        # f_id = "5c2608099bb3df24f5db4527"  # gte 10M
        # f_id = "5c2608be9bb3df2d58d08e32"  # gte 10M
        # f_id = "5c260d2b9bb3df3c084d2a83"  # gte 10M
        # f_id = "5c2615868d7fee2771bb3914"  # gte 10M
        # f_id = "5c261d528d7fee3c1383db85"  # gte 10M
        # f_id = "5c26340e8d7fee66d784fe8a"  # gte 10M
        # f_id = "5c263b818d7fee630f0d3ac4"  # gte 10M
        # f_id = "5c263ee28d7fee04ddc62e31"  # gte 10M
        # f_id = "5c263f269bb3df0d29d1e1e5"  # gte 10M
        # f_id = "5c2766718d7fee2aa36fa166"  # gte 10M
        # f_id = "5c2b79ef8d7fee3025e02575"  # gte 10M
        # f_id = "5c2b854a9bb3df27dc669d5a"  # gte 10M
        # f_id = "5c2e00078d7fee1b60443cf3"  # gte 10M
        # f_id = "5c2f69028d7fee62d31a72db"  # gte 10M
        # f_id = "5c36a7948d7fee18d9333327"  # gte 10M
        # f_id = "5c36b9ff9bb3df332dfebe39"  # gte 10M
        # f_id = "5c3754579bb3df02b680150b"  # gte 10M
        # f_id = "5c375c969bb3df6afd18e22d"  # gte 10M
        # f_id = "5c38a1e59bb3df6b2ff2f269"  # gte 10M
        # f_id = "5c394e058d7fee6a2582d1d3"  # gte 10M
        # f_id = "5c3c983e9bb3df21ddf94a92"  # gte 10M
        # f_id = "5c3ca38a9bb3df60bca07833"  # gte 10M

        f_id = "5c3c983e9bb3df21ddf94a92"
        # f_id = ""
        self.data_id = f_id

        province_list, city_list, area_list, dir_area_list = self.mysql_client.area_from_mysql(
        )
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection, data_id=self.data_id)
        self.find_count += mongo_data_list.count()

        try:
            self.upsert_and_delete(mongo_data_list=mongo_data_list,
                                   province_list=province_list,
                                   city_list=city_list,
                                   area_list=area_list)
        except jaydebeapi.DatabaseError:
            self.logger.info("error id is: {}".format(self.data_id))
            mongo_data_list = self.m_client.all_from_mongodb(
                collection=self.collection, data_id=self.data_id)
            self.upsert_and_delete(mongo_data_list=mongo_data_list,
                                   province_list=province_list,
                                   city_list=city_list,
                                   area_list=area_list)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #14
0
class FinProductScript(object):
    def __init__(self):
        # 创建 MySQL 对象
        __mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }

        __mysql_client = MysqlClient(**__mysql_config)
        __mysql_connection = __mysql_client.client_to_mysql()

        self.sales_status = __mysql_client.search_area_code(
            sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'SALES_STATUS\'",
            connection=__mysql_connection)
        self.produc_category = __mysql_client.search_area_code(
            sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PRODUC_CATEGORY\'",
            connection=__mysql_connection)
        self.revenue_type = __mysql_client.search_area_code(
            sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'REVENUE_TYPE\'",
            connection=__mysql_connection)
        self.operaton_pattern = __mysql_client.search_area_code(
            sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'OPERATION_PATTERN\'",
            connection=__mysql_connection)
        self.purchase_amount = __mysql_client.search_area_code(
            sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PURCHASE_AMOUNT\'",
            connection=__mysql_connection)
        self.duration_type = __mysql_client.search_area_code(
            sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'DURATION_TYPE\'",
            connection=__mysql_connection)
        __mysql_client.close_client(connection=__mysql_connection)

        self.logger = Logger().logger
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        # "CZBFinancial", "PABFinancial", "PSBCFinancial",
        #
        self.entity_list = ["ABCFinancial", "BOCFinancial", "BOCOMFinancial", "CBHBFinancial", "CCBFinancial",
                            "CEBFinancial", "CGBFinancial", "CIBFinancial", "CMBCFinancial", "CMBFinancial",
                            "EBCLFinancial", "ECITICFinancial", "HXBFinancial", "ICBCFinancial",
                            "SPDBFinancial", "CHINANETFINANCIAL", "JSFIN_CCBDATA"]

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.verify_list = ["ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_", "UNIT_CODE_",
                            "PERIOD_CODE_", "CONTENT_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_",
                            "TIME_LIMIT_", "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_", "SALE_START_",
                            "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_", "PRIVATE_BANK_", "URL_", "DEALTIME_",
                            "DATETIME_", "ENTITY_NAME_", "STATUS_", "SALE_DISTRICT_"]

    def get_data_from_mongo(self, m_client, collection, entity_code, data_id):
        """
        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :param entity_code:
        :return: data from MongoDB
        """
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code

        try:
            mongo_data_list = m_client.search_from_mongodb(collection, data_id=data_id)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(collection, data_id=data_id)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code, remove_id_list):
        """
        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :param entity_code:
        :param remove_id_list: id list to remove
        :return: delete count
        """
        m_client.mongo_entity_code = entity_code

        try:
            remove_count = m_client.remove_from_mongo(collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            mongo_data_list = m_client.remove_from_mongo(collection=collection, remove_id_list=remove_id_list)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(db=db, collection_list=collection_list)
        # 创建 MongoDB spider_data_old 数据库对象
        # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["FINPRODUCT_FINASSIST"]

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # # 表创建语句
        # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, '
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar,'
        #        '"C"."YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar, "C"."START_FUNDS_" varchar,'
        #        '"C"."INVEST_PERIOD_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,'
        #        '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,'
        #        '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #        '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,'
        #        '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,'
        #        '"C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar, "C"."YIELD_STATMENT_" varchar,'
        #        '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,'
        #        '"C"."PURCHASE_" varchar, "T"."CONTENT_" varchar, "C"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 增加列
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
            # for entity in self.entity_list:
            status = False
            module_name = __import__(entity)
            self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            # find_id = "5c3f118f8d7fee068da6ef53"
            find_id = None
            try:
                if entity == "JSFIN_CCBDATA":
                    m_client.mongo_collection = "JSFIN_CCBDATA"
                    collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                    mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client,
                                                                                collection=collection,
                                                                                data_id=None)
                else:
                    m_client.mongo_collection = "FINPRODUCT_FINASSIST"
                    collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                    mongo_data_list = self.get_data_from_mongo(m_client=m_client,
                                                               collection=collection, entity_code=entity,
                                                               data_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                if entity == "JSFIN_CCBDATA":
                    m_client.mongo_collection = "JSFIN_CCBDATA"
                    mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client,
                                                                                collection=collection,
                                                                                data_id=None)
                else:
                    m_client.mongo_collection = "FINPRODUCT_FINASSIST"
                    collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                    mongo_data_list = self.get_data_from_mongo(m_client=m_client,
                                                               collection=collection, entity_code=entity,
                                                               data_id=find_id)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    copy_data = {}
                    self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        copy_data = deepcopy(data)
                        self.copy_mongo_data_list.append(copy_data)
                        if entity == "CHINANETFINANCIAL":
                            re_data = module_name.data_shuffle(data=data, sales_status=self.sales_status,
                                                               produc_category=self.produc_category,
                                                               revenue_type=self.revenue_type,
                                                               operaton_pattern=self.operaton_pattern,
                                                               purchase_amount=self.purchase_amount,
                                                               duration_type=self.duration_type)
                        elif entity == "JSFIN_CCBDATA":
                            re_data = module_name.ScriptCCB.data_shuffle(self=self, data=data)
                        else:
                            re_data = module_name.data_shuffle(data)

                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e))
                        continue

                    print(data_id)

                    # phoenix_HBase 插入数据
                    if isinstance(re_data, dict):
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data)
                            once_count += success_count
                            self.success_count += success_count
                            # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                        except Exception as e:
                            self.remove_id_list.remove(data_id)
                            self.copy_mongo_data_list.remove(data)
                            self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
                            continue
                    elif isinstance(re_data, list):
                        for r_data in re_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(connection=connection,
                                                                                  data=r_data)
                                once_count += success_count
                                self.success_count += success_count
                                # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                            except Exception as e:
                                self.remove_id_list.remove(data_id)
                                self.copy_mongo_data_list.remove(data)
                                self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
                                continue
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue
            break
            # # 删除数据
            # if status:
            #     delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
            #                                                entity_code=entity,
            #                                                remove_id_list=self.remove_id_list)
            #     self.remove_count += delete_count
            #     # self.logger.info("MongoDB 删除成功")
            # else:
            #     self.logger.info("HBase 插入成功条数0条, 不执行删除")
            #
            # 将数据插入 spider_data_old 中
            # if status:
            #     try:
            #         old_client.mongo_db = "spider_data_old"
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except pymongo.errors.ServerSelectionTimeoutError as e:
            #         sleep(1)
            #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except Exception as e:
            #         self.logger.info(e)

            # 关闭连接
        m_client.client_close()
        # p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
Пример #15
0
class ScriptCCB(object):
    def __init__(self):
        self.logger = Logger().logger
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        # 创建 MySQL 对象
        __mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }

        __mysql_client = MysqlClient(**__mysql_config)
        __mysql_connection = __mysql_client.client_to_mysql()

        self.sales_status = __mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'SALES_STATUS\'",
            connection=__mysql_connection)
        self.produc_category = __mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PRODUC_CATEGORY\'",
            connection=__mysql_connection)
        self.revenue_type = __mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'REVENUE_TYPE\'",
            connection=__mysql_connection)
        self.operaton_pattern = __mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'OPERATION_PATTERN\'",
            connection=__mysql_connection)
        self.purchase_amount = __mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PURCHASE_AMOUNT\'",
            connection=__mysql_connection)
        self.duration_type = __mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'DURATION_TYPE\'",
            connection=__mysql_connection)
        __mysql_client.close_client(connection=__mysql_connection)

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.verify_list = [
            "ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_",
            "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "REMARK_",
            "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_", "TIME_LIMIT_",
            "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_",
            "SALE_START_", "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_",
            "PRIVATE_BANK_", "URL_", "DEALTIME_", "DATETIME_", "ENTITY_NAME_",
            "STATUS_", "SALE_DISTRICT_", "CURRENCY_TYPE_", "INCREASE_UNIT_",
            "YIELD_START_DATE_", "YIELD_END_DATE_", "YIELD_TYPE_", "TARGET_",
            "PRODUCT_TYPE_", "YIELD_STATMENT_", "INVEST_RANGE_", "PRE_STOP_",
            "RASE_PLAN_", "PURCHASE_"
        ]

    @staticmethod
    def get_data_from_mongo(self, m_client, collection, data_id):
        """
        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :return: data from MongoDB
        """
        m_client.mongo_db = "spider_data"

        try:
            mongo_data_list = m_client.all_from_mongodb(collection,
                                                        data_id=data_id)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection,
                                                        data_id=data_id)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    @staticmethod
    def data_shuffle(self, data):
        if "上海银行" in data["BANK_NAME"]:
            bank_code = "BankOfShanghai"
        elif "天津银行" in data["BANK_NAME"]:
            bank_code = "TJBANK"
        elif "北京银行" in data["BANK_NAME"]:
            bank_code = "BOB"
        elif "中国光大银行股份有限公司" in data["BANK_NAME"]:
            bank_code = "CEB"
        else:
            print(data)

        re_data = dict()
        hash_m = hashlib.md5()
        hash_m.update(data["NAME_"].encode("utf-8"))
        hash_id = hash_m.hexdigest()
        re_data["ID_"] = bank_code + "_" + hash_id + "_" + data["SALE_START_"]
        re_data["ENTITY_CODE_"] = "RONG360FINANCIAL"
        # re_data["AREA_CODE_"]
        re_data["BANK_CODE_"] = bank_code
        re_data["BANK_NAME_"] = data["BANK_NAME"].replace("股份有限公司", "")
        # re_data["UNIT_CODE_"]
        re_data["PERIOD_CODE_"] = data["SALE_START_"].replace("-", "")
        # re_data["CONTENT_"]
        re_data["STATUS_"] = ""
        # re_data["REMARK_"]
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        # re_data["UPDATE_TIME_"]

        re_data["NAME_"] = data["NAME_"]
        # 售卖时间范围
        re_data["TIME_LIMIT_"] = ""
        # 收益率
        re_data["LOW_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "")
        re_data["HIGH_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "")
        # 售卖区域
        re_data["SALE_DISTRICT_"] = data["SALE_AREA_"]
        # 起购金额
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("亿", "00000000")
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("万", "0000")
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("千", "000")
        if data["START_FUNDS_"]:
            if int(data["START_FUNDS_"]) < 50000:
                match_funds = "5万以下"
            elif 50000 <= int(data["START_FUNDS_"]) < 100000:
                match_funds = "5万-10万"
            elif 100000 <= int(data["START_FUNDS_"]) < 200000:
                match_funds = "10万-20万"
            elif 20000 <= int(data["START_FUNDS_"]) < 500000:
                match_funds = "20万-50万"
            elif 500000 <= int(data["START_FUNDS_"]):
                match_funds = "50万以上"
        else:
            match_funds = "不限"
        for i in self.purchase_amount:
            if match_funds in i["ITEM_LABEL_"]:
                re_data["START_FUNDS_"] = data["START_FUNDS_"]
                re_data["START_FUNDS_CODE_"] = i["ITEM_VALUE_"]

        # 期限
        data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace("天", "")

        if data["INVEST_PERIOD_"]:
            if int(data["INVEST_PERIOD_"]) <= 30:
                match_str = "1个月内"
            elif 30 < int(data["INVEST_PERIOD_"]) <= 90:
                match_str = "1-3个月(含)"
            elif 90 < int(data["INVEST_PERIOD_"]) <= 180:
                match_str = "3-6个月(含)"
            elif 180 < int(data["INVEST_PERIOD_"]) <= 365:
                match_str = "6-12个月(含)"
            elif 365 < int(data["INVEST_PERIOD_"]):
                match_str = "12个月以上"

        else:
            match_str = "不限"

        for i in self.duration_type:
            if match_str in i["ITEM_LABEL_"]:
                re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
                re_data["INVEST_PERIOD_CODE_"] = i["ITEM_VALUE_"]

        # 开始售卖时间
        re_data["SALE_START_"] = data["SALE_START_"]
        # 结束售卖时间
        re_data["SALE_END_"] = data["SALE_END_"]
        # 风险等级
        # re_data["RISK_LEVEL_"] = data["RISK_LEVEL_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["DATETIME_"] = data["DATETIME_"]
        re_data["ENTITY_NAME_"] = "融360理财产品"
        # NEW
        # 认购货币(类型)
        re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
        # 递增单位
        re_data["INCREASE_UNIT_"] = re.sub(r"元.*", "", data["INCREASE_UNIT_"])
        # 收益起记(日期)
        re_data["YIELD_START_DATE_"] = data["YIELD_START_DATE_"][:10]
        # 收益结束(日期)
        re_data["YIELD_END_DATE_"] = data["YIELD_START_DATE_"][-10:]
        # 收益获取方式
        for i in self.revenue_type:
            if i["ITEM_LABEL_"] == data["YIELD_TYPE_"]:
                re_data["YIELD_TYPE_"] = data["YIELD_TYPE_"]
                re_data["YIELD_TYPE_CODE_"] = i["ITEM_VALUE_"]
                break
        # 对象(目标人群)
        re_data["TARGET_"] = data["TARGET_"]
        # 产品类型
        re_data["PRODUCT_TYPE_"] = data["PRODUCT_TYPE_"]

        # 收益率说明
        re_data["YIELD_STATMENT_"] = data["YIELD_STATMENT_"]

        # 投资范围
        re_data["INVEST_RANGE_"] = data["INVEST_RANGE_"]
        # 提前终止条件
        re_data["PRE_STOP_"] = data["PRE_STOP_"]
        # 募集规划条件
        re_data["RASE_PLAN_"] = data["RASE_PLAN_"]
        # 申购条件
        re_data["PURCHASE_"] = data["PURCHASE_"]
        # 无
        # re_data["CODE_"] = data["CODE_"]
        # 是否保本
        # re_data["BREAKEVEN_"] = data["BREAKEVEN_"]
        # # 可否赎回
        # re_data["REDEMING_MODE_"]
        # # 私人银行
        # re_data["PRIVATE_BANK_"]

        return re_data

    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()

        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="JSFIN_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # 创建 MongoDB spider_data_old 数据库对象
        # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["FINPRODUCT_FINASSIST"]

        # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # # # 表创建语句
        # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."IMPORTANCE_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."SALE_STATUS_CODE_" varchar,'
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar,'
        #        '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar, "C"."SALE_STATUS_" varchar,'
        #        '"C"."LOW_YIELD_RATE_" varchar, "C"."HIGH_YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar,'
        #        '"C"."START_FUNDS_" varchar, "C"."START_FUNDS_CODE_" varchar, "C"."INVEST_PERIOD_" varchar,'
        #        '"C"."INVEST_PERIOD_CODE_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,'
        #        '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,'
        #        '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #        '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,'
        #        '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,'
        #        '"C"."YIELD_TYPE_CODE_" varchar, "C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar,'
        #        '"C"."INVESTOR_TYPE_" varchar, "C". "INVESTOR_TYPE_CODE_" varchar, "C"."YIELD_STATMENT_" varchar,'
        #        '"C"."OPERA_MODEL_CODE_" varchar, "C"."OPERA_MODEL_" varchar,'
        #        '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,'
        #        '"C"."PURCHASE_" varchar, "C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        #
        # # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 增加列
        # colum_list = ["CURRENCY_TYPE_", "INCREASE_UNIT_", "YIELD_START_DATE_", "YIELD_END_DATE_", "YIELD_TYPE_",
        #               "TARGET_", "PRODUCT_TYPE_", "YIELD_STATMENT_", "INVEST_RANGE_", "PRE_STOP_", "RASE_PLAN_",
        #               "PURCHASE_"]
        # p_client.add_column_phoenix(connection=connection, column=colum_list)

        status = False
        self.logger.info("开始进行 ENTITY_CODE_: RONG360FINANCIAL")
        self.remove_id_list = []
        self.copy_mongo_data_list = []
        find_id = None
        try:
            mongo_data_list = self.get_data_from_mongo(self=self,
                                                       m_client=m_client,
                                                       collection=collection,
                                                       data_id=find_id)
        except pymongo.errors.ServerSelectionTimeoutError:
            sleep(1)
            mongo_data_list = self.get_data_from_mongo(self=self,
                                                       m_client=m_client,
                                                       collection=collection,
                                                       data_id=find_id)

        # 清洗数据并插入 HBase
        if mongo_data_list:
            once_count = 0
            self.find_count += mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = self.data_shuffle(self=self, data=data)

                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue

                print(data_id)

                # phoenix_HBase 插入数据
                if isinstance(re_data, dict):
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(data)
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
                        continue
                elif isinstance(re_data, list):
                    for r_data in re_data:
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=r_data)
                            once_count += success_count
                            self.success_count += success_count
                            # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                        except Exception as e:
                            self.remove_id_list.remove(data_id)
                            self.copy_mongo_data_list.remove(data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
            if once_count > 0:
                status = True
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
        else:
            quit()
        # # 添加 {d:1}
        # if status:
        #     update_count = m_client.update_to_mongodb(collection=collection, data_id=self.remove_id_list,
        #                                               data_dict={"d": 1})
        #     self.remove_count += update_count
        #     self.logger.info("MongoDB 更新成功")
        # # 删除数据
        # if status:
        #     delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
        #                                                entity_code=entity,
        #                                                remove_id_list=self.remove_id_list)
        #     self.remove_count += delete_count
        #     # self.logger.info("MongoDB 删除成功")
        # else:
        #     self.logger.info("HBase 插入成功条数0条, 不执行删除")
        #
        # 将数据插入 spider_data_old 中
        # if status:
        #     try:
        #         old_client.mongo_db = "spider_data_old"
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
        #     except pymongo.errors.ServerSelectionTimeoutError as e:
        #         sleep(1)
        #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
        #     except Exception as e:
        #         self.logger.info(e)

        # 关闭连接
        m_client.client_close()
        # p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
Пример #16
0
class WeiboBasicInfoScript(object):
    # 初始化参数
    def __init__(self, entity_type="WEIBOBASICINFO"):
        self.entity_type = entity_type
        self.logger = Logger().logger
        self.verify_list = [
            "ID_", "BANK_CODE_", "BANK_NAME_", "PERIOD_TIME_", "AREA_CODE_",
            "CREATE_TIME_", "WEIBO_CODE_", "MAIN_URL_", "NAME_", "FOCUS_",
            "FANS_", "COMPANY_URL_", "COMPANY_", "DETAILED_URL_", "VIRIFIED_",
            "BIREF_", "ENTITY_NAME_", "ENTITY_CODE_", "DEALTIME_",
            "PROVINCE_NAME_", "PROVINCE_CODE_", "STATUS_1"
        ]
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        self.branch_code_list = list()
        self.find_count = 0
        self.bad_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0

    def match_weibo_code(self, match):
        mongo_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = mongo_client.client_to_mongodb()
        collection = mongo_client.get_check_collection(db, collection_list)
        result = mongo_client.match_from_mongo(collection=collection,
                                               match=match,
                                               output="WEIBO_CODE_")
        return result

    def data_shuffle(self, data, province_list):
        re_data = dict()
        prov_n = ""
        prov_c = ""

        # # BANK_NAME_ 字典
        # name_dict = {"ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行",
        #              "BOCOM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行",
        #              "ECITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行",
        #              "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行",
        #              "SPDB": "浦发银行", "EBCL": "恒丰银行"}
        province_list, city_list, area_list, dir_area_list, bank_list = GenericScript.data_from_mysql(
        )

        bank_code = data["BANK_CODE_"][:-9]

        bank_name = name_dict[bank_code]

        time_array = time.localtime(int(data["DEALTIME_"]))
        period_time = time.strftime("%Y%m%d", time_array)

        for prov in province_list:
            if prov["NAME_"][:2] in data["LOCATION_"]:
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]

        # "C"
        # TODO row_key 时间戳还是年-月-日
        re_data["ID_"] = data["BANK_CODE_"] + "_" + period_time
        re_data["BANK_CODE_"] = bank_code
        re_data["BANK_NAME_"] = bank_name
        re_data["PERIOD_CODE_"] = period_time
        re_data["AREA_CODE_"] = prov_c
        re_data["CREATE_TIME_"] = period_time

        re_data["WEIBO_CODE_"] = data["WEIBO_CODE_"]
        re_data["MAIN_URL_"] = data["MAIN_URL_"]
        re_data["NAME_"] = data["NAME_"]
        re_data["FOCUS_"] = data["FOCUS_"]
        re_data["FANS_"] = data["FANS_"]
        re_data["COMPANY_URL_"] = data["COMPANY_URL_"]
        if "COMPANY_" not in data:
            re_data["COMPANY_"] = data["VIRIFIED_"]
        else:
            re_data["COMPANY_"] = data["COMPANY_"]
        re_data["DETAILED_URL_"] = data["DETAILED_URL_"]
        re_data["VIRIFIED_"] = bank_name + "股份有限公司"
        re_data["BIREF_"] = data["BIREF_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["ENTITY_CODE_"] = data["BANK_CODE_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["PROVINCE_NAME_"] = prov_n
        re_data["PROVINCE_CODE_"] = prov_c

        re_data["STATUS_"] = ""

        return re_data

    # 从 MongoDB 删除数据
    def delete_data_from_mongo(self, m_client, collection, remove_id_list):
        m_client.mongo_entity_code = None

        try:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            mongo_data_list = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="WEIBOBASICINFO")
        p_client.verify_list = self.verify_list
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # # 创建 MongoDB spider_data_old 数据库对象
        # old_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        # # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60,
        #                                         connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["ORGANIZE_FINASSIST"]

        # 获取地区编码
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None, entity_type=None).area_from_mysql())

        # 删除表
        p_client.drop_table_phoenix(connection=connection)
        # quit()

        # 创建表
        sql = (
            'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,'
            '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,'
            '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,'
            '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,'
            '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,'
            '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
            '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,'
            '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 增加列
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        # 遍历 ENTITY_CODE_ 列表
        status = False
        self.logger.info("开始进行 WEIBOBASICINFO")

        try:
            mongo_data_list = m_client.all_from_mongodb(collection=collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection=collection)

        # 清洗数据并插入 HBase
        if mongo_data_list:
            self.find_count = mongo_data_list.count()
            for data in mongo_data_list:
                re_data = ""
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = self.data_shuffle(data=data,
                                                province_list=province_list)
                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))

                # phoenix_HBase 插入数据
                try:
                    success_count = p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=re_data)
                    self.success_count += success_count
                    # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(
                        data_id, e))
                    continue
            if self.success_count > 0:
                status = True
                self.logger.info("HBase 插入成功, 成功条数 {}".format(
                    self.success_count))
        else:
            quit()

        # # 删除数据
        # if status:
        #     delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
        #                                                remove_id_list=self.remove_id_list)
        #     self.remove_count += delete_count
        # else:
        #     self.logger.info("HBase 插入成功条数0条, 不执行删除")
        #
        # # 将数据插入 spider_data_old 中
        # if status:
        #     try:
        #         old_client.mongo_db = "spider_data_old"
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #     except pymongo.errors.ServerSelectionTimeoutError as e:
        #         time.sleep(1)
        #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
        #     except Exception as e:
        #         self.logger.info(e)

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #17
0
class AllToPhoenix(object):
    def __init__(self):
        self.file_list = list()
        self.get_code_list()
        self.logger = Logger().logger
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.copy_mongo_data_list = list()
        self.remove_id_list = list()
        # 字段验证列表
        self.verify_list = [
            "ID_", "CONTENT_", "NOTICE_TIME_", "TITLE_", "PROJECT_NAME_",
            "BID_CONTENT_", "SIGN_START_TIME_", "SIGN_END_TIME_",
            "OPEN_BID_TIME_", "OPEN_BID_PLACE_", "BID_AGENCY_",
            "APPLY_CONDITION_", "SIGN_QUALIFICATION_", "PROJECT_ID_",
            "WIN_CANDIDATE_", "CANDIDATE_RANK_", "BID_", "URL_", "DEALTIME_",
            "CREATE_TIME_", "ENTITY_NAME_", "ENTITY_CODE_", "ENTITY_STATUS_",
            "SIGN_MATERIAL_", "BID_TYPE_", "DATETIME_", "BUDGET_PRICE_",
            "PASS_REASON_", "PRESALE_CONTENT_", "PRESALE_WAY_",
            "PRESALE_START_TIME_", "PRESALE_END_TIME_", "PRESALE_ADDR_",
            "PRESALE_PREPARE_", "IMAGE_"
        ]

    def get_code_list(self):
        """
        获取当前目录下文件名(去除 "CommonBidding_" 后就是 ENTITY_CODE_ )
        :return:
        """
        for root, dirs, files in os.walk(curPath):
            # print(root)  # 当前目录路径
            # print(dirs)  # 当前路径下所有子目录
            # print(files)  # 当前路径下所有非目录子文件
            self.file_list = files
            self.file_list.remove("__init_____.py")
            break

    def get_data_from_mongo(self, m_client, collection, entity_code):
        """

        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :param entity_code:
        :return: all from MongoDB where ENTITY_CODE_ = entity_code
        """
        m_client.mongo_db = "spider_data"
        other_query = {
            "$or": [{
                "TITLE_": {
                    "$exists": True
                }
            }, {
                "title": {
                    "$exists": True
                }
            }]
        }
        try:
            mongo_data_list = m_client.get_data_from_mongodb(
                collection=collection,
                entity_code=entity_code,
                exclude_code=None,
                limit_number=None,
                other_query=other_query)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            time.sleep(1)
            mongo_data_list = m_client.get_data_from_mongodb(
                collection=collection,
                entity_code=entity_code,
                exclude_code=None,
                limit_number=None,
                other_query=other_query)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """

        :param m_client: MongoDB client
        :param collection: MongoDB collection
        :param entity_code:
        :param remove_id_list: delete data id's list
        :return: delete count
        """
        m_client.mongo_entity_code = entity_code

        try:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            mongo_data_list = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            return None

    def shuffle_data(self, data):
        """
        通用清洗
        :param data:
        :return:
        """
        re_data = dict()
        if "TITLE_" in data:
            if not data["TITLE_"]:
                return
            hash_m = hashlib.md5()
            hash_m.update(str(data["TITLE_"]).encode("utf-8"))
            hash_title = hash_m.hexdigest()
            row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)
        elif "title" in data:
            if not data["title"]:
                return
            hash_m = hashlib.md5()
            hash_m.update(data["title"].encode("utf-8"))
            hash_title = hash_m.hexdigest()
            row_key = str(data["entity_code"]) + "_" + str(hash_title)
        else:
            return
        re_data["ID_"] = row_key

        for key, value in data.items():
            # 字段验证
            if key in self.verify_list:
                re_data[key] = value
            elif key == "entityStatus" or key == "ENTITY_STATUS_":
                key = "ENTITY_STATUS_"
                value = "DRAFT"
                re_data[key] = value
        if "ENTITY_STATUS_" not in re_data:
            re_data["ENTITY_STATUS_"] = "DRAFT"

        for s_key in self.verify_list:
            if s_key == "CONTENT_" or s_key == "URL_" or s_key == "url":
                continue
            if data.get(s_key, ""):
                data[s_key] = data[s_key].replace("|", "")

        return re_data

    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="CommonBidding")
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="CommonBidding")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # 创建 MongoDB spider_data_old 数据库对象
        old_client = MongoClient(mongo_collection="CommonBidding")
        # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        old_client.mongo_db = "spider_data_old"
        db_old, collection_list_old = old_client.client_to_mongodb()
        collection_old = db_old["CommonBidding"]

        # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # 招投标表创建语句
        # sql = ('create table "CommonBidding" ("ID_" varchar primary key, "F"."CONTENT_" varchar,'
        #        '"F"."NOTICE_TIME_" varchar,"F"."TITLE_" varchar,"F"."PROJECT_NAME_" varchar,'
        #        '"F"."BID_CONTENT_" varchar, "F"."SIGN_START_TIME_" varchar, "F"."SIGN_END_TIME_" varchar,'
        #        '"F"."OPEN_BID_TIME_" varchar, "F"."OPEN_BID_PLACE_" varchar, "F"."BID_AGENCY_" varchar,'
        #        '"F"."APPLY_CONDITION_" varchar, "F"."SIGN_QUALIFICATION_" varchar, "F"."PROJECT_ID_" varchar,'
        #        '"F"."WIN_CANDIDATE_" varchar, "F"."CANDIDATE_RANK_" varchar, "F"."BID_" varchar,"F"."URL_" varchar,'
        #        '"F"."DEALTIME_" varchar, "F"."ENTITY_NAME_" varchar, "F"."ENTITY_CODE_" varchar,'
        #        '"F"."ENTITY_STATUS_" varchar, "F"."SIGN_MATERIAL_" varchar, "F"."BID_TYPE_" varchar,'
        #        '"F"."DATETIME_" varchar, "F"."BUDGET_PRICE_" varchar, "F"."PASS_REASON_" varchar,'
        #        '"F"."PRESALE_CONTENT_" varchar, "F"."PRESALE_WAY_" varchar,"F"."PRESALE_START_TIME_" varchar,'
        #        '"F"."PRESALE_END_TIME_" varchar,"F"."PRESALE_ADDR_" varchar,"F"."PRESALE_PREPARE_" varchar,'
        #        '"F"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 增加列
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        # 遍历 ENTITY_CODE_ 列表
        # self.file_list = ["CommonBidding_86JCW"]
        for f in self.file_list:
            status = False
            entity_code = f.replace(".py", "")
            module_name = __import__(entity_code)
            entity_code_mongo = entity_code.replace("CommonBidding_", "")
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code_mongo))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                try:
                    self.find_count += mongo_data_list.count()
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(1)
                    self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    self.remove_id_list.append(data_id)
                    del data["_id"]
                    # 深拷贝源数据,用于插入 spider_data 库中
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    # 数据清洗
                    try:
                        re_data = module_name.data_shuffle(data)
                        final_data = self.shuffle_data(re_data)
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue
                    # phoenix_HBase 插入数据
                    if final_data:
                        try:
                            p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=final_data)
                            once_count += 1
                        except Exception as e:
                            self.remove_id_list.remove(data_id)
                            self.copy_mongo_data_list.remove(copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

                # 删除数据
                if status:
                    delete_count = self.delete_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code_mongo,
                        remove_id_list=self.remove_id_list)
                    self.remove_count += delete_count
                else:
                    self.logger.info("HBase 插入成功条数0条, 不执行删除")

                # 将数据插入 spider_data_old 中
                if status:
                    try:
                        old_client.mongo_db = "spider_data_old"
                        insert_count = old_client.all_to_mongodb(
                            collection=collection_old,
                            insert_list=self.copy_mongo_data_list)
                        self.old_count += insert_count
                        # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
                    except pymongo.errors.ServerSelectionTimeoutError as e:
                        time.sleep(1)
                        self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
                        insert_count = old_client.all_to_mongodb(
                            collection=collection_old,
                            insert_list=self.copy_mongo_data_list)
                        self.old_count += insert_count
                        # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
                    except Exception as e:
                        self.logger.info(e)

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
Пример #18
0
    def get_data_from_mongodb(self,
                              collection,
                              entity_code=None,
                              exclude_code=None,
                              limit_number=None,
                              data_id=None,
                              find_query=None):
        """
        从 MongoDB 获取数据
        :param collection:
        :param entity_code: 需要查取的 实体编码
        :param exclude_code: 需要排除的 实体编码
        :param limit_number: 查取的数据条数
        :param data_id: 查取 data_id 后的数据
        :return:
        """
        mon_logger = Logger().logger
        query_list = list()
        if isinstance(find_query, dict):
            query_list.append(find_query)
        elif isinstance(find_query, list):
            query_list.extend(find_query)
        if isinstance(entity_code, str):
            query_list.append({"ENTITY_CODE_": entity_code})
        elif isinstance(entity_code, (list, tuple)):
            query_list.append({"ENTITY_CODE_": {"$in": list(entity_code)}})

        if isinstance(exclude_code, str):
            query_list.append({"ENTITY_CODE_": {"$ne": exclude_code}})
        elif isinstance(exclude_code, (list, tuple)):
            query_list.append({"ENTITY_CODE_": {"$nin": list(exclude_code)}})

        if data_id:
            find_id = ObjectId(data_id)
            query_list.append({"_id": {"$gte": find_id}})

        if query_list:
            query = {"$and": query_list}
        else:
            query = {}
        try:
            mon_logger.info("MongoDB 开始查取数据")
            result_one = collection.find_one(query)
            if result_one:
                if limit_number:
                    result = collection.find(query,
                                             no_cursor_timeout=True).limit(
                                                 int(limit_number))
                    if entity_code:
                        mon_logger.info(
                            f"ENTITY: {entity_code} 数据查取成功共 {limit_number} 条")
                    else:
                        mon_logger.info("数据查取成功共 {}条".format(limit_number))
                else:
                    result = collection.find(query, no_cursor_timeout=True)
                    if entity_code:
                        mon_logger.info(
                            f"ENTITY: {entity_code} 数据查取成功共 {result.count()}条")
                    else:
                        mon_logger.info("数据查取成功共 {}条".format(result.count()))

                return result
            else:
                if entity_code:
                    mon_logger.info("ENTITY: {} 数据查取为空".format(
                        self.mongo_entity_code))
                else:
                    mon_logger.info("数据查取为空".format(self.mongo_entity_code))
                return None
        except TypeError as e:
            mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format(
                e, self.mongo_entity_code))
        except pymongo.errors.ServerSelectionTimeoutError as e:
            mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e))
            result_one = collection.find_one(query)
            if result_one:
                if limit_number:
                    result = collection.find(query,
                                             no_cursor_timeout=True).limit(
                                                 int(limit_number))
                    if entity_code:
                        mon_logger.info(
                            f"ENTITY: {entity_code} 数据查取成功共 {limit_number} 条")
                    else:
                        mon_logger.info("数据查取成功共 {}条".format(limit_number))
                else:
                    result = collection.find(query, no_cursor_timeout=True)
                    if entity_code:
                        mon_logger.info(
                            f"ENTITY: {entity_code} 数据查取成功共 {result.count()}条")
                    else:
                        mon_logger.info("数据查取成功共 {}条".format(result.count()))

                return result
            else:
                if entity_code:
                    mon_logger.info("ENTITY: {} 数据查取为空".format(
                        self.mongo_entity_code))
                else:
                    mon_logger.info("数据查取为空".format(self.mongo_entity_code))
                return None
Пример #19
0
class AllToPhoenix(object):
    def __init__(self):
        self.code_list = [
            "ABCORGANIZE", "BOCOMORGANIZE", "BOCORGANIZE", "CBHBORGANIZE",
            "CCBORGANIZE", "CEBORGANIZE", "CGBORGANIZE", "CIBORGANIZE",
            "CMBCORGANIZE", "CMBORGANIZE", "CZBORGANIZE", "EBCLORGANIZE",
            "ECITICORGANIZE", "HXBORGANIZE", "ICBCORGANIZE", "PABORGANIZE",
            "PSBCORGANIZE", "SPDBORGANIZE"
        ]
        self.logger = Logger().logger
        self.count = 0

    # 从 MongoDB 获取数据
    def get_data_from_mongo(self, m_client, collection, entity_code):
        m_client.mongo_entity_code = entity_code
        try:
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(collection)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None
        except KeyError as e:
            self.logger.info(e)
            # print(e)
            return None

    # 主函数
    def run(self):
        # 创建 MongoDB 对象
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # 遍历 ENTITY_CODE_ 列表
        # self.code_list = self.code_list[14:]
        for entity_code in self.code_list:
            self.count = 0
            hash_list = list()
            status = False
            mongo_data_list = self.get_data_from_mongo(m_client=m_client,
                                                       collection=collection,
                                                       entity_code=entity_code)
            if mongo_data_list:
                self.logger.warning("{} 查取成功".format(entity_code))
                self.logger.warning("当前共有{}条".format(mongo_data_list.count()))
                status = True
            else:
                self.logger.warning("{} 无数据".format(entity_code))

            if status:
                for data in mongo_data_list:
                    if "ADDR_" in data:
                        hash_m = hashlib.md5()
                        hash_m.update(data["ADDR_"].encode("utf-8"))
                        hash_title = hash_m.hexdigest()
                        if hash_title in hash_list:
                            self.count += 1
                        else:
                            hash_list.append(hash_title)
                    else:
                        if "CONTENT_" in data:
                            hash_m = hashlib.md5()
                            hash_m.update(data["CONTENT_"].encode("utf-8"))
                            hash_title = hash_m.hexdigest()
                            if hash_title in hash_list:
                                self.count += 1
                            else:
                                hash_list.append(hash_title)
            self.logger.warning("重复数据{}条".format(self.count))

        # 关闭连接
        m_client.client_close()
        self.logger.handlers.clear()
Пример #20
0
class JsInsuranceCcbData(object):
    def __init__(self):
        # 创建 MongoDB 对象
        self.m_client = MongoClient(mongo_collection="JSINSURANCE_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # 创建 MySQL 对象
        __mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }

        __mysql_client = MysqlClient(**__mysql_config)
        __mysql_connection = __mysql_client.client_to_mysql()

        self.type = __mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'TYPE\'",
            connection=__mysql_connection)

        __mysql_client.close_client(connection=__mysql_connection)

        # 创建 Phoenix 对象
        self.p_client = PhoenixHbase(table_name="INSURANCE")
        # 连接 Phoenix
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""
        self.a = list()

    def data_shuffle(self, data):
        if data["ENTITY_CODE_"] == "PAINSURANCE":
            return None
        elif data["ENTITY_CODE_"] == "BJBINSURANCE":
            data["CONTET_"] = data["CONTET_"].replace("|主险2:", "主险2:")
            first_shuffle = data["CONTET_"].split("|")
            data_list = list()
            company_dict = dict()
            index_list = list()
            for first in first_shuffle:
                if first[-2:] == "公司":
                    company_index = first_shuffle.index(first)
                    company_dict[company_index] = first
                    index_list.append(company_index)
                else:
                    continue

            for key in index_list:
                # print(index_list)
                j = key + 1
                for i in range(100):
                    if index_list.index(key) == len(index_list) - 1:
                        if j == len(first_shuffle) - 1:
                            break
                    else:
                        if j == index_list[index_list.index(key) + 1]:
                            break

                    data_dict = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(first_shuffle[j].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    data_dict["ID_"] = row_key
                    data_dict["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    data_dict["ENTITY_NAME_"] = data["ENTITY_NAME_"].replace(
                        "模板", "产品")
                    data_dict["BANK_CODE_"] = "BJB"
                    data_dict["BANK_NAME_"] = "北京银行"
                    data_dict["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    data_dict["URL_"] = data["URL_"]
                    data_dict["PRODUCT_NAME_"] = first_shuffle[j]
                    j += 1
                    # data_dict["TYPE_"] = first_shuffle[j]
                    data_dict["TYPE_"] = ""
                    data_dict["TYPE_CODE_"] = ""
                    for i in self.type:
                        if i["ITEM_LABEL_"][:-1] in first_shuffle[j]:
                            data_dict["TYPE_"] = data_dict["TYPE_"] + i[
                                "ITEM_LABEL_"] + "|"
                            data_dict["TYPE_CODE_"] = data_dict[
                                "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                    data_dict["TYPE_"] = data_dict["TYPE_"][:-1]
                    data_dict["TYPE_CODE_"] = data_dict["TYPE_CODE_"][:-1]
                    j += 1
                    # data_dict["RISK_LEVEL_"] = first_shuffle[j]
                    j += 1
                    data_dict["PAY_METHOD_"] = first_shuffle[j]
                    j += 1
                    # data_dict["INSURANCE_DATE_"] = first_shuffle[j]
                    j += 1
                    # data_dict["TOUZIZHE_TYPE_"] = first_shuffle[j]
                    j += 1
                    data_dict["COM_NAME_"] = company_dict[key]
                    # data_dict["CONSIGNMENT_"] = "代销"
                    # if "CONTENT_" in data:
                    #     data_dict["CONTENT_"] = data["CONTENT_"]
                    data_dict["DEALTIME_"] = data["DEALTIME_"]
                    data_dict["CREATE_TIME_"] = data["DATETIME_"]
                    data_dict["STATUS_"] = "1"
                    # print(data_dict)
                    data_list.append(data_dict)

            return data_list

        elif data["ENTITY_CODE_"] == "CIBINSURANCE":
            data_list = list()
            insurance_name = re.findall(r".*?计划", data["PRODUCT_NAME_"])
            for name in insurance_name:
                re_data = dict()
                # HBase row_key
                hash_m = hashlib.md5()
                hash_m.update(name.encode("utf-8"))
                hash_title = hash_m.hexdigest()
                row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                # "C"
                re_data["ID_"] = row_key
                re_data["PRODUCT_NAME_"] = name
                re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                re_data["BANK_CODE_"] = "CIB"
                re_data["BANK_NAME_"] = "兴业银行"
                re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                    "-", "")
                re_data["URL_"] = data["URL_"]
                re_data["DEALTIME_"] = data["DEALTIME_"]
                re_data["CREATE_TIME_"] = data["DATETIME_"]
                re_data["STATUS_"] = "1"

                re_data["TYPE_"] = ""
                re_data["TYPE_CODE_"] = ""
                for i in self.type:
                    if i["ITEM_LABEL_"][:-1] in name:
                        re_data["TYPE_"] = re_data["TYPE_"] + i[
                            "ITEM_LABEL_"] + "|"
                        re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"] + i[
                            "ITEM_VALUE_"] + "|"
                re_data["TYPE_"] = re_data["TYPE_"][:-1]
                re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                data_list.append(re_data)

            return data_list

        else:
            if "INSURANCE_NAME_" not in data and ("PRODUCT_NAME_" not in data):
                return None
            else:
                if "INSURANCE_NAME_" in data:
                    # # 承保年龄
                    # if ("INSURANCE_AGE_" not in data) or (not data["INSURANCE_AGE_"]):
                    #     age = re.findall(r"(\d*)周岁", data["INSURANCE_NAME_"])
                    #     if age:
                    #         data["INSURANCE_AGE_"] = age[0]

                    # 保障期限
                    # if ("INSURANCE_DATE_" not in data) or (not data["INSURANCE_DATE_"]):
                    #     limit = re.findall(r"保(终身)|保(\d*年)|(\d*年)期", data["INSURANCE_NAME_"])
                    #     if limit:
                    #         for l in limit[0]:
                    #             if l:
                    #                 data["INSURANCE_DATE_"] = l
                    #                 break

                    re_data = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(data["INSURANCE_NAME_"].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    re_data["ID_"] = row_key
                    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                    re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
                        "INSURANCE", "")
                    re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace(
                        "保险产品", "")
                    if "INSURANCE_NAME_" in data:
                        re_data["PRODUCT_NAME_"] = data["INSURANCE_NAME_"]
                    if ("INSURANCE_AGE_" in data) or ("AGE_" in data):
                        re_data["AGE_"] = data["INSURANCE_AGE_"]
                    if "TYPE_" in data:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        if data["TYPE_"] == "财险":
                            re_data["TYPE_"] = "财产险"
                            re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                        else:
                            for i in self.type:
                                if i["ITEM_LABEL_"][:-1] in data["TYPE_"]:
                                    re_data["TYPE_"] = re_data["TYPE_"] + i[
                                        "ITEM_LABEL_"] + "|"
                                    re_data["TYPE_CODE_"] = re_data[
                                        "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                            re_data["TYPE_"] = re_data["TYPE_"][:-1]
                            re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    else:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        for i in self.type:
                            if i["ITEM_LABEL_"][:-1] in data["ENTITY_NAME_"]:
                                re_data["TYPE_"] = re_data["TYPE_"] + i[
                                    "ITEM_LABEL_"] + "|"
                                re_data["TYPE_CODE_"] = re_data[
                                    "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                        re_data["TYPE_"] = re_data["TYPE_"][:-1]
                        re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    # if "INSURANCE_DATE_" in data:
                    #     re_data["INSURANCE_DATE_"] = data["INSURANCE_DATE_"]
                    if "INSURANCE_DETAIL_" in data:
                        re_data["PRODUCT_DETAIL_"] = data["INSURANCE_DETAIL_"]
                    if "COMPANY_NAME_" in data:
                        re_data["COM_NAME_"] = data["COMPANY_NAME_"]
                    if "LIMIT_NUMBER_" in data:
                        re_data["BUY_LIMIT_"] = data["LIMIT_NUMBER_"]
                    # re_data["AREA_CODE_"]
                    # re_data["UNIT_CODE_"]
                    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    if "CONTENT_" in data:
                        re_data["CONTENT_"] = data["CONTENT_"]
                    # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
                    re_data["STATUS_"] = "1"
                    # re_data["REMARK_"] = ""
                    re_data["CREATE_TIME_"] = data["DATETIME_"]
                    # re_data["UPDATE_TIME_"]
                    # re_data["TITLE_"] = data["TITLE_"]
                    re_data["URL_"] = data["URL_"]
                    re_data["DEALTIME_"] = data["DEALTIME_"]
                    # re_data["DATETIME_"] = data["DATETIME_"]

                    return re_data

                elif "PRODUCT_NAME_" in data:
                    re_data = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(data["PRODUCT_NAME_"].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    re_data["ID_"] = row_key
                    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                    # re_data["BANK_CODE_"] = data["ENTITY_CODE_"]
                    # re_data["BANK_NAME_"] = data["ENTITY_NAME_"]
                    if "PRODUCT_NAME_" in data:
                        re_data["PRODUCT_NAME_"] = data["PRODUCT_NAME_"]
                    if "FEATURE_NAME_" in data:
                        re_data["FEATURE_NAME_"] = data["FEATURE_NAME_"]
                    if "TYPE_" in data:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        if data["TYPE_"] == "财险":
                            re_data["TYPE_"] = "财产险"
                            re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                        elif data["TYPE_"] == "100种疾病保障":
                            re_data["TYPE_"] = "健康险"
                            re_data["TYPE_CODE_"] = "HEALTH_INSURANCE"
                        else:
                            for i in self.type:
                                if i["ITEM_LABEL_"][:-1] in data["TYPE_"]:
                                    re_data["TYPE_"] = re_data["TYPE_"] + i[
                                        "ITEM_LABEL_"] + "|"
                                    re_data["TYPE_CODE_"] = re_data[
                                        "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                            re_data["TYPE_"] = re_data["TYPE_"][:-1]
                            re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    if "POLICY_DUTY_" in data:
                        re_data["POLICY_DUTY_"] = data["POLICY_DUTY_"]
                    if "PRODUCT_CASE_" in data:
                        re_data["PRODUCT_CASE_"] = data["PRODUCT_CASE_"]
                    if "BUY_LIMIT_" in data:
                        re_data["BUY_LIMIT_"] = data["BUY_LIMIT_"]
                    if "ENSURE_PRICE_" in data:
                        re_data["ENSURE_PRICE_"] = data["ENSURE_PRICE_"]
                    # re_data["AREA_CODE_"]
                    # re_data["UNIT_CODE_"]
                    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    if "PRODUCT_PRICE_" in data:
                        re_data["PRODUCT_PRICE_"] = data["PRODUCT_PRICE_"]
                    if "PRODUCT_ID_" in data:
                        re_data["PRODUCT_ID_"] = data["PRODUCT_ID_"]
                    if "PRODUCT_CLAUSE_" in data:
                        re_data["PRODUCT_CLAUSE_"] = data["PRODUCT_CLAUSE_"]
                    if "GENDER_" in data:
                        re_data["GENDER_"] = data["GENDER_"]
                    if "AGE_" in data:
                        re_data["AGE_"] = data["AGE_"]
                    if "COM_NAME_" in data:
                        re_data["COM_NAME_"] = data["COM_NAME_"]
                    if "PAY_METHOD_" in data:
                        re_data["PAY_METHOD_"] = data["PAY_METHOD_"]
                    if "PROBLEM_" in data:
                        re_data["PROBLEM_"] = data["PROBLEM_"]
                    if "CLAIM_" in data:
                        re_data["CLAIM_"] = data["CLAIM_"]
                    if "COMMENT_" in data:
                        re_data["COMMENT_"] = data["COMMENT_"]
                    if "ENSURE_CONTENT_" in data:
                        re_data["ENSURE_CONTENT_"] = data["ENSURE_CONTENT_"]
                    if "INSURE_INFO_" in data:
                        re_data["INSURE_INFO_"] = data["INSURE_INFO_"]
                    if "RATE_INFO_" in data:
                        re_data["RATE_INFO_"] = data["RATE_INFO_"]
                    if "SALE_SERVICE_" in data:
                        re_data["SALE_SERVICE_"] = data["SALE_SERVICE_"]

                    # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
                    re_data["STATUS_"] = "1"
                    # re_data["REMARK_"] = ""
                    re_data["CREATE_TIME_"] = data["DATETIME_"]
                    # re_data["UPDATE_TIME_"]
                    # re_data["TITLE_"] = data["TITLE_"]
                    re_data["URL_"] = data["URL_"]
                    re_data["DEALTIME_"] = data["DEALTIME_"]
                    # re_data["DATETIME_"] = data["DATETIME_"]

                    return re_data

    def run(self):
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "INSURANCE" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,'
        #              ' "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, '
        #              '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."TYPE_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C".PRODUCT_CLAUSE_ varchar,'
        #              '"C"."SOURCE_" varchar, "C"."PRODUCT_NAME_" varchar, "C"."FEATURE_NAME_" varchar,'
        #              '"C"."POLICY_DUTY_" varchar, "C"."PRODUCT_CASE_" varchar, "C"."BUY_LIMIT_" varchar,'
        #              '"C"."ENSURE_PRICE_" varchar, "C"."PRODUCT_PRICE_" varchar, "C"."PRODUCT_ID_" varchar,'
        #              '"C"."GENDER_" varchar, "C"."AGE_" varchar, "C"."COM_NAME_" varchar, "C"."TYPE_CODE_" varchar,'
        #              '"C"."PAY_METHOD_" varchar, "C"."PRODUCT_DETAIL_" varchar, "C"."PROBLEM_" varchar,'
        #              '"C"."CLAIM_" varchar, "C"."COMMENT_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."ENSURE_CONTENT_" varchar, "C"."INSURE_INFO_" varchar, "C"."RATE_INFO_" varchar,'
        #              '"C"."SALE_SERVICE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            # print(data["_id"])
            # todo remove and upsert data from mongo

            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if re_data:
                if isinstance(re_data, dict):
                    # upsert data to HBase
                    try:
                        success_count = self.p_client.upsert_to_phoenix_by_one(
                            connection=self.connection, data=re_data)
                    except jaydebeapi.DatabaseError as e:
                        self.logger.info("错误 id: {}, 错误信息 {}".format(
                            self.data_id, e))
                        continue

                elif isinstance(re_data, list):
                    for r_d in re_data:
                        # upsert data to HBase
                        try:
                            success_count = self.p_client.upsert_to_phoenix_by_one(
                                connection=self.connection, data=r_d)
                        except jaydebeapi.DatabaseError as e:
                            self.logger.info("错误 id: {}, 错误信息 {}".format(
                                self.data_id, e))
                            continue

            #     # add {d:1}
            #     try:
            #         self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id,
            #                                         data_dict={"d": 1})
            #         self.remove_count += 1
            #         if self.remove_count % 10 == 0:
            #             self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count))
            #     except Exception as e:
            #         self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e))
            #         continue
                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #21
0
    def all_from_mongodb(self, collection, data_id=None, d=False):
        mon_logger = Logger().logger
        if data_id:
            find_id = ObjectId(data_id)
            try:
                mon_logger.info("开始查取数据")
                # result = collection.find_one({"_id": {"$gte": find_id}})
                result = collection.find_one({
                    "$and": [{
                        "_id": {
                            "$gte": find_id
                        }
                    }, {
                        "ORDER_ID": {
                            "$exists": False
                        }
                    }]
                })
                if result is not None:
                    result = collection.find(
                        {
                            "$and": [{
                                "_id": {
                                    "$gte": find_id
                                }
                            }, {
                                "ORDER_ID": {
                                    "$exists": False
                                }
                            }]
                        },
                        no_cursor_timeout=True)
                    mon_logger.info("数据查取成功, 共 {} 条".format(result.count()))
                    return result
                else:
                    mon_logger.info("MongoDB 查取数据为空")
                    return None
            except TypeError as e:
                mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format(
                    e, self.mongo_entity_code))
            except pymongo.errors.ServerSelectionTimeoutError as e:
                mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e))
                result = collection.find_one({
                    "$and": [{
                        "_id": {
                            "$gte": find_id
                        }
                    }, {
                        "ORDER_ID": {
                            "$exists": False
                        }
                    }]
                })
                if result:
                    result = collection.find(
                        {
                            "$and": [{
                                "_id": {
                                    "$gte": find_id
                                }
                            }, {
                                "ORDER_ID": {
                                    "$exists": False
                                }
                            }]
                        },
                        no_cursor_timeout=True)
                    mon_logger.info("数据查取成功, 共 {} 条".format(result.count()))
                    return result
                else:
                    mon_logger.info("MongoDB 查取数据为空")
                    return None
        else:
            if d:
                try:
                    mon_logger.info("开始查取数据")
                    result = collection.find_one()
                    if result is not None:
                        result = collection.find(no_cursor_timeout=True)
                        mon_logger.info("数据查取成功, 共 {} 条".format(
                            result.count()))
                        return result
                    else:
                        mon_logger.info("MongoDB 查取数据为空")
                        return None
                except TypeError as e:
                    mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format(
                        e, self.mongo_entity_code))
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e))
                    result = collection.find_one()
                    if result:
                        result = collection.find(no_cursor_timeout=True)
                        mon_logger.info("数据查取成功, 共 {} 条".format(
                            result.count()))
                        return result
                    else:
                        mon_logger.info("MongoDB 查取数据为空")
                        return None
            else:
                try:
                    mon_logger.info("开始查取数据")
                    result = collection.find_one({"d": {"$exists": False}})
                    if result is not None:
                        result = collection.find({"d": {
                            "$exists": False
                        }},
                                                 no_cursor_timeout=True)
                        mon_logger.info("数据查取成功, 共 {} 条".format(
                            result.count()))
                        return result
                    else:
                        mon_logger.info("MongoDB 查取数据为空")
                        return None

                except TypeError as e:
                    mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format(
                        e, self.mongo_entity_code))
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e))
                    result = collection.find_one()
                    if result:
                        result = collection.find(no_cursor_timeout=True)
                        mon_logger.info("数据查取成功, 共 {} 条".format(
                            result.count()))
                        return result
                    else:
                        mon_logger.info("MongoDB 查取数据为空")
                        return None
Пример #22
0
class DataShuffle(object):
    def __init__(self, param):
        self.logger = Logger().logger
        self.invoke_type = "BRANCH"
        self.base_dir = os.path.dirname(os.getcwd())
        self.param_dict = eval(param)
        self.param = "\"" + param + "\""
        if self.param_dict:
            try:
                self.entity_type = self.param_dict["entityType"]
            except Exception:
                raise Exception

    def invoking_the_script(self):
        """
        调用脚本
        :return:
        """
        if self.invoke_type == "BRANCH":
            # 微博和微信
            if self.entity_type == "WEIBOINFO" or self.entity_type == "WEIBOBASICINFO" or self.entity_type == "WECHAT":
                file_path = "/".join([
                    "./branch_scripts2", self.entity_type,
                    f"{self.entity_type}.py"
                ])
            # 金融产品
            elif self.entity_type[:4] == "JRCP":
                file_path = "/".join([
                    "./branch_scripts2", "FIN_PRODUCT", self.entity_type,
                    "__init_____.py"
                ])
            # 网点
            elif self.entity_type[:2] == "WD":
                file_path = "/".join([
                    "./branch_scripts2", "SURROUNDING_FACILITIES",
                    self.entity_type, "__init_____.py"
                ])
            # 招投标
            elif self.entity_type == "COMMONBIDDING" or self.entity_type == "commonbidding" or self.entity_type == "CommonBidding":
                file_path = "/".join(
                    ["./scripts", "CommonBidding", "__init_____.py"])
            # 招投标
            elif self.entity_type == "ORGANIZE":
                file_path = "/".join(
                    ["./scripts", "CommonBidding", "__init_____.py"])
            else:
                file_path = "/".join(
                    ["./crm.scripts", self.entity_type, "__init_____.py"])
        else:
            if self.entity_type == "WEIBOINFO" or self.entity_type == "WEIBOBASICINFO" or self.entity_type == "WECHAT":
                file_path = "/".join([
                    "./branch_scripts2", self.entity_type,
                    f"{self.entity_type}.py"
                ])
            else:
                file_path = "/".join(
                    ["./branch_scripts2", self.entity_type, "__init_____.py"])

        self.logger.info("调用的脚本文件为{}".format(file_path))

        os.system(" ".join(["python ", file_path, self.param]))

    def main(self):
        self.invoking_the_script()
Пример #23
0
 def remove_from_mysql(self, connection, list_remove):
     mysql_logger = Logger().logger
     sql = "DELETE FROM {} WHERE ID_ IN {}".format(self.mysql_table,
                                                   list_remove)
     count = self.cs_commit(connection=connection, sql=sql)
     mysql_logger.info("删除成功 {}条".format(count))
Пример #24
0
    def get_data_and_update(self,
                            collection,
                            entity_code,
                            exclude_code,
                            update_dict,
                            data_id=None,
                            other_query=None,
                            sort_query=None):
        """
        查询一条数据并更新
        :param collection:
        :param entity_code: 需要查取的 实体编码
        :param exclude_code: 需要排除的 实体编码
        :param update_dict: 需要更新的字段与值
        :param data_id: 查取 data_id 后的数据
        :param other_query: 其他过滤条件
        :return:
        """
        mon_logger = Logger().logger
        query_list = list()
        if isinstance(entity_code, str):
            query_list.append({"ENTITY_CODE_": entity_code})
        elif isinstance(entity_code, (list, tuple)):
            query_list.append({"ENTITY_CODE_": {"$in": list(entity_code)}})

        if isinstance(exclude_code, str):
            query_list.append({"ENTITY_CODE_": {"$ne": exclude_code}})
        elif isinstance(exclude_code, (list, tuple)):
            query_list.append({"ENTITY_CODE_": {"$nin": list(exclude_code)}})

        if isinstance(other_query, dict):
            query_list.append(other_query)
        elif isinstance(other_query, list):
            query_list.extend(other_query)

        if data_id:
            find_id = ObjectId(data_id)
            query_list.append({"_id": {"$gte": find_id}})

        if query_list:
            query = {"$and": query_list}
        else:
            query = {}
        try:
            mon_logger.info(f"query={query}")
            result_one = collection.find_one_and_update(query,
                                                        update_dict,
                                                        sort=sort_query)
            if result_one:
                mon_logger.info(f"MongoDB--数据查取并更新成功")
                return result_one
            else:
                mon_logger.info("MongoDB 数据查取为空".format(
                    self.mongo_entity_code))
                return None
        except TypeError as e:
            mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format(
                e, self.mongo_entity_code))
        except pymongo.errors.ServerSelectionTimeoutError as e:
            mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e))
            result_one = collection.find_one_and_update(query, update_dict)
            if result_one:
                mon_logger.info("MongoDB--数据查取并更新成功")
                return result_one
            else:
                mon_logger.info("MongoDB 数据查取为空".format(
                    self.mongo_entity_code))
                return None
Пример #25
0
class WechatScript(object):
    def __init__(self, entity_type="FOR_TEST_WECHAT"):
        """
        初始化参数
        :param entity_type: WECHAT
        """
        self.entity_type = entity_type
        self.logger = Logger().logger

        # 创建 Phoenix 对象
        self.p_client = PhoenixHbase(table_name=self.entity_type)
        # 连接 Phoenix
        self.connection = self.p_client.connect_to_phoenix()

        self.remove_id_list = list()
        self.copy_mongo_data_list = list()

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""
        self.row_key_count = 0

    # def write_to_local(self, data):
    #     with open("wechat_test.txt", "a", encoding="utf-8") as f:
    #         f.write(str(data))

    def read_from_local(self):
        with open("wechat_test.txt", "r", encoding="utf-8") as f:
            data = f.read()
            return data

    def main(self, data):
        """

        :return:
        """
        # # 删除表
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()

        # # 建表语句
        # table_sql = ('create table "FOR_TEST_WECHAT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar,'
        #              '"C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar,'
        #              '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, '
        #              '"T"."CONTENT_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."TITLE_" varchar,'
        #              '"C"."WECHAT_ID_" varchar, "C"."WECHAT_NAME_" varchar, "C"."ENTITY_NAME_" varchar,'
        #              '"C"."DEALTIME_" varchar,"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        #
        # # 创建表
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        i = random.randint(0, 100000000)
        re_data = eval(data)
        re_data["ID_"] = re_data["ID_"] + "_" + str(i)

        if self.success_count % 10 == 0:
            self.logger.info("HBase 插入成功 {} 条".format(self.success_count))
        # 向 HBase 插入数据
        try:
            count = self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=re_data)
            self.success_count += count
            return self.success_count
        except jaydebeapi.DatabaseError as e:
            self.logger.info("错误 id: {}, 错误信息 {}".format(self.data_id, e))
            return None

    def final(self):
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #26
0
class AllToPhoenix(object):
    def __init__(self):
        # "CNINFONEWS" pdf too long
        self.code_list = [
            "CAIJINGNEWS", "CNINFONEWS", "CSFINACIAL", "CSFINACIALNEWS",
            "CSNEWS", "CSNOTICE", "FINAQQNEWS", "XLCJYHMKNEWS", "XLCJNEWS",
            "XLCJGSNEWS", "WYCJNEWS", "WYCJGSNEWS", "NEWS163DOM",
            "NEWS10JQKA2", "NEWS10JQKA", "HOUSEQQNEWS"
        ]

        self.logger = Logger().logger
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # 插入 spider_data_old 的数据列表
        # self.copy_mongo_data_list = list()
        # 删除 spider_data 的数据 _id 列表
        # self.remove_id_list = list()
        # self.branch_code_list = list()

        self.verify_list = [
            "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_", "STATUS_",
            "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "BANK_NAME_",
            "BANK_CODE_", "CONTENT_", "DATA_SOURCE_", "KEYWORDS_",
            "ENTITY_NAME_", "ID_"
        ]

    # 从 MongoDB 获取数据
    def get_data_from_mongo(self, m_client, collection, entity_code, find_id):
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code
        try:
            mongo_data_list = m_client.search_from_mongodb(
                collection=collection, data_id=find_id)
            return mongo_data_list
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            mongo_data_list = m_client.search_from_mongodb(
                collection=collection, data_id=find_id)
            return mongo_data_list
        except Exception as e:
            self.logger.info(e)
            return None

    # 从 MongoDB 删除数据
    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        m_client.mongo_entity_code = entity_code

        try:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except pymongo.errors.ServerSelectionTimeoutError:
            remove_count = m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
            return remove_count
        except Exception as e:
            self.logger.info(e)
            return None

    def get_brief_from_ai(self, data):
        data["CONTENT_"] = data["CONTENT_"].replace("|", "")
        if data["PUBLISH_TIME_"]:
            data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"][:10]
        # ID
        hash_m = hashlib.md5()
        hash_m.update(data["URL_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        data["ID_"] = data["ENTITY_CODE_"] + "_" + str(hash_title)

        text = data["CONTENT_"]

        ex_line = ("python3" + " " + AI_PATH + " " + '\"{}\"'.format(text) +
                   " " + "1")
        # print(ex_line)
        r = os.popen(ex_line)

        # print(1, r.read())
        data["BRIEF_"] = r.read()

        return data

    # 主函数
    def run(self):
        # # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
        p_client.verify_list = self.verify_list
        # # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)
        #
        # # 表创建语句
        # sql = ('create table "NEWS_FINASSIST" ("ID_" varchar primary key, "T"."CONTENT_" varchar, '
        #        '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."TITLE_" varchar, "C"."BRIEF_" varchar, '
        #        '"C"."PUBLISH_TIME_" varchar, "C"."KEYWORDS_" varchar, "C"."URL_" varchar, "C"."DATA_SOURCE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,'
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "F"."STATUS_" varchar)'
        #        'IMMUTABLE_ROWS = true')

        # # 创建表
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 遍历 ENTITY_CODE_ 列表
        for entity_code in self.code_list:
            status = False
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            # self.remove_id_list = []
            # self.copy_mongo_data_list = []
            # self.branch_code_list = []
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""
            # find_id = ""
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count = mongo_data_list.count()
                for i in range(1000000):
                    try:
                        data = mongo_data_list.__next__()
                    except pymongo.errors.ServerSelectionTimeoutError:
                        continue
                    except StopIteration:
                        break

                    # for data in mongo_data_list:
                    data_id = data["_id"]
                    if self.success_count % 100 == 0:
                        self.logger.info(
                            "running on data_id: {}".format(data_id))
                    # print(data_id)
                    # copy_data = {}
                    # self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        # copy_data = deepcopy(data)
                        # self.copy_mongo_data_list.append(copy_data)
                        data_list = [data]
                        re_data = module_name.data_shuffle(data_list)

                        if not re_data:
                            self.bad_count += 1
                            # self.remove_id_list.remove(data_id)
                            continue
                    except Exception as e:
                        # self.remove_id_list.remove(data_id)
                        # self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            # phoenix_HBase 插入数据
                            if list_data:
                                try:
                                    if entity_code != "CNINFONEWS":
                                        ai_data = self.get_brief_from_ai(
                                            data=list_data)
                                    else:
                                        ai_data = list_data
                                    # print(ai_data["CONTENT_"])
                                except Exception as e:
                                    self.logger.info("AI 调取失败, 错误信息", e)
                                    ai_data = re_data
                                try:
                                    success_count = p_client.upsert_to_phoenix_by_one(
                                        connection=connection, data=ai_data)
                                    once_count += success_count
                                    self.success_count += success_count
                                    if self.success_count % 10 == 0:
                                        self.logger.info(
                                            "HBase 插入成功, 成功条数{}条".format(
                                                once_count))
                                except Exception as e:
                                    # self.remove_id_list.remove(data_id)
                                    # self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.warning(
                                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue
                                try:
                                    # 添加 {d:1}
                                    update_count = m_client.update_to_mongodb(
                                        collection=collection,
                                        data_id=data_id,
                                        data_dict={"d": 1})
                                    self.remove_count += update_count
                                    # self.logger.info("MongoDB 更新成功")
                                    if self.remove_count % 10 == 0:
                                        self.logger.info(
                                            "MongoDB 更新成功, 成功条数 {} 条".format(
                                                "10"))
                                except Exception as e:
                                    # self.remove_id_list.remove(data_id)
                                    # self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.warning(
                                        "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue

                    elif isinstance(re_data, dict):
                        # phoenix_HBase 插入数据
                        if re_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=re_data)
                                once_count += success_count
                                self.success_count += success_count
                                self.logger.info(
                                    "HBase 插入成功, 成功条数 {} 条".format(
                                        success_count))
                            except Exception as e:
                                # self.remove_id_list.remove(data_id)
                                # self.copy_mongo_data_list.remove(copy_data)
                                self.logger.warning(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue

                if once_count > 0:
                    status = True
                    self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                        entity_code, once_count))
                mongo_data_list.close()
            else:
                continue

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #27
0
class Trend(object):
    def __init__(self):
        # 创建 MongoDB 对象
        self.m_client = MongoClient(mongo_collection="TREND")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # 创建 Phoenix 对象
        self.p_client = PhoenixHbase(table_name="CHA_BRANCH_MARKET_ACT")
        # 连接 Phoenix
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        re_data = dict()
        # HBase row_key
        hash_m = hashlib.md5()
        hash_m.update(data["TITLE_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

        # 分行
        copy_result = dict()
        copy_result["ID_"] = row_key
        copy_result["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        copy_result["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        copy_result["URL_"] = data["URL_"]
        # copy_result["PROVINCE_CODE_"] = result[""]
        # copy_result["PROVINCE_NAME_"] = result[""]
        # copy_result["CITY_CODE_"] = result[""]
        # copy_result["CITY_NAME_"] = result[""]
        # copy_result["AREA_CODE_"] = result[""]
        # copy_result["AREA_NAME_"] = result[""]
        # copy_result["LAT_"] = result[""]
        # copy_result["LNG_"] = result[""]
        copy_result["APP_VERSION_"] = "BRANCH"
        copy_result["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
            "PRIVATEINFO", "")
        copy_result["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "")
        # copy_result["UNIT_CODE_"] = result["UNIT_CODE_"]
        # copy_result["UNIT_NAME_"] = result[""]
        copy_result["PERIOD_CODE_"] = data["NOTICE_TIME_"].replace("-", "")
        # copy_result["REMARK_"] = result[""]
        time_array = time.localtime()
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
        copy_result["CREATE_TIME_"] = create_time
        copy_result["SPIDER_TIME_"] = data["DATETIME_"]
        # copy_result["MODIFIED_TIME_"] = result[""]
        copy_result["CREATE_BY_ID_"] = "P0131857"
        copy_result["CREATE_BY_NAME_"] = "钟楷文"
        # copy_result["MODIFIED_BY_ID_"] = result[""]
        # copy_result["MODIFIED_BY_NAME_"] = result[""]
        copy_result["M_STATUS_"] = "0"
        copy_result["DELETE_STATUS_"] = "0"
        copy_result["DATA_STATUS_"] = "uncheck"
        # copy_result["TAGS_"] = result[""]
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        copy_result["SOURCE_"] = source[0]
        copy_result["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        # copy_result["SOURCE_TYPE_"] = result[""]
        # copy_result["HOT_"] = result[""]
        # copy_result["IMPORTANCE_"] = result[""]
        copy_result["ACT_NAME_"] = data["TITLE_"]
        # copy_result["IMAGES_"] = data[""]
        # copy_result["TARGET_"] = data[""]
        # copy_result["BRIEF_"] = data[""]
        copy_result["DETAILS_"] = data["CONTENT_"]
        # copy_result["RULE_"] = data[""]
        # copy_result["START_TIME_"] = data[""]
        # copy_result["END_TIME_"] = data[""]
        # copy_result["ACT_TYPE1_"] = data[""]
        # copy_result["ACT_TYPE2_"] = data[""]
        # copy_result["ACT_TYPE3_"] = data[""]
        copy_result["PUBLISH_TIME_"] = data["NOTICE_TIME_"]
        # copy_result["READS_"] = data[""]
        # copy_result["LIKES_"] = data[""]
        # copy_result["COMMENTS_"] = data[""]
        # copy_result["JOINS_"] = data[""]
        # copy_result["RELAYS_"] = data[""]
        # copy_result["SOURCE_ID_"] = data[""]
        # copy_result["HTML_"] = data[""]
        # copy_result["SOURCE_OWN_NAME_"] = data[""]
        # copy_result["SOURCE_OWN_ID_"] = data[""]
        return copy_result

        # "C"
        # re_data["ID_"] = row_key
        # re_data["TYPE_"] = random.choice(
        #     ["税务法律", "子女教育", "健康医养", "财富管理", "生活娱乐", "旅游出行", "艺术/艺术品", "节日庆贺", "其他"])
        # re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        # re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        # re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace("PRIVATEINFO", "")
        # re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "")
        # # re_data["AREA_CODE_"]
        # # re_data["UNIT_CODE_"]
        # period_code = data["NOTICE_TIME_"].replace("-", "")
        # re_data["PERIOD_CODE_"] = period_code
        # re_data["CONTENT_"] = data["CONTENT_"]
        # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
        # re_data["STATUS_"] = "1"
        # # re_data["REMARK_"] = ""
        # re_data["CREATE_TIME_"] = data["DATETIME_"]
        # # re_data["UPDATE_TIME_"]
        # re_data["TITLE_"] = data["TITLE_"]
        # re_data["URL_"] = data["URL_"]
        # re_data["DEALTIME_"] = data["DEALTIME_"]
        # # re_data["DATETIME_"] = data["DATETIME_"]
        # # re_data["SOURCE_TYPE_"]
        #
        # return re_data

    def run(self):
        # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # quit()

        # add colum
        # self.p_client.add_column_phoenix(connection=self.connection, column="SOURCE_TYPE_")
        # quit()

        # create table sql
        # table_sql = ('create table "MARKETING_ACT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."TITLE_" varchar,"C"."NOTICE_TIME_" varchar,'
        #              '"T"."CONTENT_" varchar,"C"."OBJ_" varchar, "C"."ATENDANCE_" varchar, "C"."PERIOD_CODE_" varchar,'
        #              '"C"."IMAGES_" varchar, "C"."RESULTS_" varchar,"C"."PLACE_" varchar, "C"."TYPE_" varchar,'
        #              '"C"."READ_NUM_" varchar, "C"."CONTENT_NUM_" varchar, "C"."COMMENT_CONTENT_" varchar, '
        #              '"C"."FORWARD_NUM_" varchar, "C"."COLLECTION_NUM_" varchar, "C"."PRAISE_NUM_" varchar,'
        #              '"C"."BANK_NAME_" varchar, "C"."STATUS_" varchar, "C"."REMARK_" varchar, "C"."SOURCE_ID_" varchar,'
        #              '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,"C"."SOURCE_" varchar,'
        #              '"C"."URL_" varchar, "C"."BANK_CODE_" varchar, "C"."DEALTIME_" varchar, '
        #              '"C"."SOURCE_TYPE_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true')

        # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        # for i in range(mongo_data_list.count() + 100):
        for i in range(100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo

            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # add {d:1}
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue

                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #28
0
class Meipian(object):
    def __init__(self):
        # 创建 MongoDB 对象
        self.m_client = MongoClient(mongo_collection="meipian_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # 创建 Phoenix 对象
        self.p_client = PhoenixHbase(table_name="MEIPIAN_CCBDATA")
        # 连接 Phoenix
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        re_data = dict()
        if data["TITLE_"]:
            # HBase row_key
            hash_m = hashlib.md5()
            hash_m.update(data["TITLE_"].encode("utf-8"))
            hash_title = hash_m.hexdigest()
            row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

            # "C" 通用列族字段
            re_data["ID_"] = row_key
            re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
            # re_data["BANK_NAME_"]

            time_arrary = arrow.get(data["CREATE_TIME"])
            period_code = time_arrary.format("YYYYMMDD")
            publish_time = time_arrary.format("YYYY-MM-DD HH:mm:ss")
            re_data["PERIOD_CODE_"] = str(period_code)
            re_data["PUBLISH_TIME_"] = str(publish_time)
            re_data["STATUS_"] = "UNPROCESSED"
            re_data["CONTENT_"] = data["CONTENT_"]
            re_data["REMARK_"] = ""
            # re_data["AREA_CODE_"]
            # re_data["UNIT_CODE_"]
            re_data["CREATE_TIME_"] = data["DATETIME_"]

            re_data["URL_"] = data["URL_"]
            re_data["TITLE_"] = data["TITLE_"]
            re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"]
            re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
            re_data["DEALTIME_"] = str(data["DEALTIME_"])
            re_data["VISIT_COUNT_"] = data["VISIT_COUNT"]
            re_data["PRAISE_COUNT_"] = data["PRAISE_COUNT"]
            re_data["COMMENT_COUNT_"] = data["COMMENT_COUNT"]
            re_data["SOURCE_"] = data["SOURCE_"]

            return re_data
        else:
            return None

    def run(self):
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "MEIPIAN_CCBDATA" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."URL_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,'
        #              ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, '
        #              '"C"."TITLE_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."ENTITY_NAME_" varchar,'
        #              '"C"."VISIT_COUNT_" varchar, "C"."PRAISE_COUNT_" varchar, "C"."COMMENT_COUNT_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."SOURCE_" varchar, "C"."PUBLISH_TIME_" varchar,'
        #              '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')

        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        # f_id = "5c6fa1328d7fee306de9463d"  # quit()
        # f_id = "5c6fe1ba8d7fee1d44775989"  # quit()
        # f_id = "5c6fdb448d7fee394da6a5fb"  # quit() Exception while executing batch.
        # f_id = "5c6fe1ba8d7fee1d44775989"
        f_id = "5c6fe11b9bb3df6b0ec6168b"  # gt 10M
        mongo_data_list = self.m_client.all_from_mongodb(self.collection,
                                                         data_id=f_id)
        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo

            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # add {d:1}
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue

                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #29
0
class BaiduSearch(object):
    def __init__(self):
        # 创建 MongoDB 对象
        self.m_client = MongoClient(mongo_collection="BAIDU_SEARCH")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # 创建 Phoenix 对象
        self.p_client = PhoenixHbase(table_name="BAIDU_SEARCH")
        # 连接 Phoenix
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

        # BANK_NAME_ 字典  交通银行 BOCOM 改为 COMM 中信银行 ECITIC 改为 CITIC  增加 平安银行 北京银行 上海银行
        self.name_dict = {
            "ICBC": "中国工商银行",
            "ABC": "中国农业银行",
            "BOC": "中国银行",
            "CCB": "中国建设银行",
            "COMM": "交通银行",
            "PSBC": "中国邮政储蓄银行",
            "CZB": "浙商银行",
            "CBHB": "渤海银行",
            "CITIC": "中信银行",
            "CEB": "中国光大银行",
            "HXB": "华夏银行",
            "CMBC": "中国民生银行",
            "CMB": "招商银行",
            "CIB": "兴业银行",
            "CGB": "广发银行",
            "PAB": "平安银行",
            "SPDB": "浦发银行",
            "EBCL": "恒丰银行",
            "PINGAN": "平安银行",
            "LTD": "中国光大银行",
            "BEIJING": "北京银行",
            "BOSC": "上海银行"
        }

        # TYPE_ 列表
        self.type_list = [
            "Market", "Activity", "GoodStart", "MidSeason", "PrivateBank",
            "Recommendation"
        ]

    def data_shuffle(self, data):
        re_data = dict()
        # HBase row_key
        hash_m = hashlib.md5()
        hash_m.update(data["TITLE_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

        # "C"
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        # re_data["AREA_CODE_"]
        for bank_c in self.name_dict:
            if bank_c in data["ENTITY_CODE_"]:
                re_data["BANK_CODE_"] = bank_c
                break
        if "BANK_CODE_" in re_data:
            re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]]
        else:
            print(data["ENTITY_CODE_"])

        # re_data["UNIT_CODE_"]
        # re_data["PERIOD_CODE_"] = ""
        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["STATUS_"] = "UNPROCESSED"
        # re_data["REMARK_"] = ""
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        # re_data["UPDATE_TIME_"]

        for type in self.type_list:
            if type in data["ENTITY_CODE_"]:
                re_data["TYPE_"] = type
                break
        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        # re_data["DATETIME_"] = data["DATETIME_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]

        return re_data

    def run(self):
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "BAIDU_SEARCH" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,"C"."REMARK_" varchar,'
        #              ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, '
        #              '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."ENTITY_NAME_" varchar,"C"."TYPE_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo

            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # add {d:1}
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue

                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Пример #30
0
class Entrust(object):
    def __init__(self):
        # 创建 MongoDB 对象
        self.m_client = MongoClient(mongo_collection="JSENTRUST_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # 创建 Phoenix 对象
        self.p_client = PhoenixHbase(table_name="ENTRUST")
        # 连接 Phoenix
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        re_data = dict()
        # HBase row_key
        hash_m = hashlib.md5()
        hash_m.update(data["NAME_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["STATUS_"] = "1"
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["URL_"] = data["URL_"]

        if data["ENTITY_CODE_"] == "CHINATRC":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]

            pub_date = eval(data["PUB_DATE_"])
            date = str(pub_date["time"])[:-3]
            t = arrow.get(int(date))
            publish_date = str(t)[:10]
            period_code = publish_date.replace("-", "")
            re_data["PERIOD_CODE_"] = period_code
            # re_data["REMARK_"]

            # re_data["UPDATE_TIME_"]

            re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["FUNCTION_"] = data["FUNCTION_"]

            pro_date = eval(data["PRO_START_"])
            pro_date = str(pro_date["time"])[:-3]
            p_t = arrow.get(int(pro_date))
            product_date = str(p_t)[:10]
            re_data["PRO_START_"] = product_date
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = publish_date
            # re_data["SCALE_"] = data[""]
            # re_data["MONTH_"]
            # re_data["YIELD_RATE_"]
            # re_data["START_FUNDS_"]
            # re_data["PURPOSE_"]
            # re_data["ESTAB_ANNOUNCEMENT_"]
            # re_data["ENTRUST_STATUS_"]
            #
            # re_data["DISTRIBU_MODE_"]
            # re_data["INVEST_AREA_"]
            # re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            # re_data["INVEST_DIRECTION_"]
            # re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            # re_data["CURRENCY_"]
            # re_data["MANAGE_TYPE_"]
            # re_data["SALE_TARGET_"]
            # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            # re_data["ISSUER_AREA_"]
            # re_data["RESERVE_INFO_"]
            # re_data["TRUSTEESHIP_BANK_"]
            # re_data["OTHER_INFO_"]
            # re_data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "TRUSTHEXUN":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            # re_data["REMARK_"]
            # re_data["UPDATE_TIME_"]
            # re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            # re_data["FUNCTION_"] = data["FUNCTION_"]
            # re_data["PRO_START_"]
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace(
                "至月", "")
            # re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            # re_data["MONTH_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # re_data["PURPOSE_"]
            # re_data["ESTAB_ANNOUNCEMENT_"]
            # re_data["ENTRUST_STATUS_"]
            #
            # re_data["DISTRIBU_MODE_"]
            # re_data["INVEST_AREA_"]
            # re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            # re_data["INVEST_DIRECTION_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            re_data["CURRENCY_"] = data["CURRENCY_"]
            re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"]
            # re_data["TRUSTEESHIP_BANK_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "YANGLEE":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            # # re_data["REMARK_"]
            # # re_data["UPDATE_TIME_"]
            # re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            # re_data["FUNCTION_"] = data["FUNCTION_"]
            # # re_data["PRO_START_"]
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            # re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            # re_data["SCALE_"] = data["SCALE_"]
            # # re_data["MONTH_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # # re_data["PURPOSE_"]
            # # re_data["ESTAB_ANNOUNCEMENT_"]
            re_data["ENTRUST_STATUS_"] = data["STATUS_"]
            #
            re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"]
            # # re_data["INVEST_AREA_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            # # re_data["INVEST_DIRECTION_"]
            # re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            # re_data["CURRENCY_"] = data["CURRENCY_"]
            # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            # re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            # re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"]
            re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "TRUSTONE":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            # # re_data["REMARK_"]
            # # re_data["UPDATE_TIME_"]
            # re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            # re_data["FUNCTION_"] = data["FUNCTION_"]
            # # re_data["PRO_START_"]
            # re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            # re_data["RUN_MODE_"] = data["RUN_MODE_"]
            # re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            # # re_data["MONTH_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # # re_data["PURPOSE_"]
            # # re_data["ESTAB_ANNOUNCEMENT_"]
            # re_data["ENTRUST_STATUS_"] = data["STATUS_"]
            # #
            re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"]
            re_data["INVEST_AREA_"] = data["INVEST_AREA_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            re_data["INVEST_DIRECTION_"] = data["INVEST_DIRECTION_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            # re_data["CURRENCY_"] = data["CURRENCY_"]
            # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            # re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            # re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            re_data["RESERVE_INFO_"] = re.sub(r"</?\w*>", "",
                                              data["RESERVE_INFO_"])
            # re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"]
            # re_data["OTHER_INFO_"] = data["OTHER_INFO_"]

        return re_data

    def run(self):
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "ENTRUST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."CREATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."FUNCTION_" varchar,'
        #              '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar,'
        #              '"C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."ISSUER_" varchar, "C"."PRO_START_" varchar,'
        #              '"C"."INVEST_PERIOD_" varchar,"C"."RUN_MODE_" varchar, "C"."INDUSTRY_" varchar,'
        #              '"C"."PUB_DATE_" varchar, "C"."SCALE_" varchar, "C"."MONTH_" varchar, "C"."YIELD_RATE_" varchar,'
        #              '"C"."START_FUNDS_" varchar, "C"."PURPOSE_" varchar, "C"."ESTAB_ANNOUNCEMENT_" varchar,'
        #              '"C"."ENTRUST_STATUS_" varchar, "C"."DISTRIBU_MODE_" varchar, "C"."INVEST_AREA_" varchar,'
        #              '"C"."TERM_TYPE_" varchar, "C"."INVEST_DIRECTION_" varchar, "C"."INVEST_MODE_" varchar,'
        #              '"C"."CURRENCY_" varchar, "C"."MANAGE_TYPE_" varchar, "C"."SALE_TARGET_" varchar,'
        #              '"C"."PROFIT_TYPE_" varchar, "C"."ISSUER_AREA_" varchar, "C"."RESERVE_INFO_" varchar,'
        #              '"C"."TRUSTEESHIP_BANK_" varchar, "C"."OTHER_INFO_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection, data_id="5c67307d9bb3df76b4229f79")

        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo

            # shuffle data
            # try:
            re_data = self.data_shuffle(data=data)
            # except Exception as e:
            #     self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
            #     continue

            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # # add {d:1}
                # try:
                #     self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id,
                #                                     data_dict={"d": 1})
                #     self.remove_count += 1
                #     if self.remove_count % 10 == 0:
                #         self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count))
                # except Exception as e:
                #     self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e))
                #     continue

                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()