예제 #1
0
class BaiduSearch(object):
    """Copy documents from the MongoDB collection ``BAIDU_SEARCH`` into the
    Phoenix/HBase table ``BAIDU_SEARCH``.

    Each document is reshaped by :meth:`data_shuffle`, upserted through the
    Phoenix client, and then updated in MongoDB with ``{"d": 1}``.
    """

    def __init__(self):
        # MongoDB client and the collection the documents are read from.
        self.m_client = MongoClient(mongo_collection="BAIDU_SEARCH")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and its open connection.
        self.p_client = PhoenixHbase(table_name="BAIDU_SEARCH")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run statistics reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

        # BANK_CODE_ -> BANK_NAME_ lookup.  Bank of Communications uses COMM
        # (formerly BOCOM), CITIC uses CITIC (formerly ECITIC); Ping An Bank,
        # Bank of Beijing and Bank of Shanghai were added later.
        # Insertion order matters: data_shuffle() takes the FIRST code that is
        # a substring of ENTITY_CODE_ (e.g. "CMBC" must be tried before "CMB").
        self.name_dict = {
            "ICBC": "中国工商银行",
            "ABC": "中国农业银行",
            "BOC": "中国银行",
            "CCB": "中国建设银行",
            "COMM": "交通银行",
            "PSBC": "中国邮政储蓄银行",
            "CZB": "浙商银行",
            "CBHB": "渤海银行",
            "CITIC": "中信银行",
            "CEB": "中国光大银行",
            "HXB": "华夏银行",
            "CMBC": "中国民生银行",
            "CMB": "招商银行",
            "CIB": "兴业银行",
            "CGB": "广发银行",
            "PAB": "平安银行",
            "SPDB": "浦发银行",
            "EBCL": "恒丰银行",
            "PINGAN": "平安银行",
            "LTD": "中国光大银行",
            "BEIJING": "北京银行",
            "BOSC": "上海银行"
        }

        # Candidate TYPE_ values, matched as substrings of ENTITY_CODE_.
        self.type_list = [
            "Market", "Activity", "GoodStart", "MidSeason", "PrivateBank",
            "Recommendation"
        ]

    def data_shuffle(self, data):
        """Build the HBase row for one MongoDB document.

        :param data: document with at least the keys ``TITLE_``,
            ``ENTITY_CODE_``, ``CONTENT_``, ``DATETIME_``, ``URL_``,
            ``DEALTIME_`` and ``ENTITY_NAME_``.
        :return: dict of column name -> value.  ``BANK_CODE_``/``BANK_NAME_``
            and ``TYPE_`` are only present when a known code / type is a
            substring of ``ENTITY_CODE_``.
        :raises KeyError: when a required key is missing from ``data``.
        """
        # Row key: "<ENTITY_CODE_>_<md5 hex digest of the UTF-8 title>".
        title_digest = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        entity_code = data["ENTITY_CODE_"]

        re_data = dict()
        re_data["ID_"] = str(entity_code) + "_" + title_digest
        re_data["ENTITY_CODE_"] = entity_code

        # First bank code (in name_dict order) contained in the entity code.
        for bank_code in self.name_dict:
            if bank_code in entity_code:
                re_data["BANK_CODE_"] = bank_code
                re_data["BANK_NAME_"] = self.name_dict[bank_code]
                break
        else:
            # No bank recognised: surface the entity code on stdout.
            print(entity_code)

        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["STATUS_"] = "UNPROCESSED"
        re_data["CREATE_TIME_"] = data["DATETIME_"]

        # First known type contained in the entity code, if any.
        for type_name in self.type_list:
            if type_name in entity_code:
                re_data["TYPE_"] = type_name
                break

        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]

        return re_data

    def _next_document(self, cursor):
        """Return the next document of ``cursor`` or ``None`` when exhausted.

        A server-selection timeout is logged and retried once after a
        3 second pause; a second timeout propagates to the caller.
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
            try:
                return next(cursor)
            except StopIteration:
                # The retry may legitimately hit the end of the cursor.
                return None

    def run(self):
        """Process every document of the collection and log a summary.

        For each document: reshape it, upsert it into Phoenix, then update
        the MongoDB document with ``{"d": 1}``.  Documents that fail any step
        are logged and skipped.  The cursor is always closed, even when an
        unexpected exception aborts the run.
        """
        # Table maintenance (drop/create via the Phoenix client) used to live
        # here as commented-out code; it is not part of a normal run.
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        try:
            # Iterate until the cursor is exhausted instead of relying on
            # Cursor.count(), which newer PyMongo releases no longer provide.
            while True:
                data = self._next_document(mongo_data_list)
                if data is None:
                    break
                self.find_count += 1

                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])

                # Reshape the document.
                try:
                    re_data = self.data_shuffle(data=data)
                except Exception as e:
                    self.logger.info(
                        "数据清洗失败 {}, id: {}".format(e, self.data_id))
                    continue

                if not re_data:
                    self.bad_count += 1
                    continue

                # Upsert into HBase.
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                # Count the insert right away so that a failing MongoDB
                # update below cannot hide a row that did reach HBase.
                if success_count > 0:
                    self.success_count += success_count
                    if self.success_count % 10 == 0:
                        self.logger.info("HBase 插入成功 {} 条".format(
                            self.success_count))

                # Add {d: 1} to the source document.
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #2
0
파일: trend.py 프로젝트: ILKKAI/dataETL
class Trend(object):
    """Copy documents from the MongoDB collection ``TREND`` into the
    Phoenix/HBase table ``CHA_BRANCH_MARKET_ACT``.

    Each document is reshaped by :meth:`data_shuffle`, upserted through the
    Phoenix client, and then updated in MongoDB with ``{"d": 1}``.
    """

    def __init__(self):
        # MongoDB client and the collection the documents are read from.
        self.m_client = MongoClient(mongo_collection="TREND")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and its open connection.
        self.p_client = PhoenixHbase(table_name="CHA_BRANCH_MARKET_ACT")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run statistics reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        """Build the HBase row for one MongoDB document.

        :param data: document with at least the keys ``TITLE_``,
            ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``URL_``, ``NOTICE_TIME_``,
            ``DATETIME_`` and ``CONTENT_``.
        :return: dict of column name -> value.
        :raises KeyError: when a required key is missing from ``data``.
        :raises ValueError: when ``URL_`` contains no ``http(s)://host`` part
            from which ``SOURCE_`` could be derived.
        """
        # Row key: "<ENTITY_CODE_>_<md5 hex digest of the UTF-8 title>".
        title_digest = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + title_digest

        # Branch-level record.  Only the fields assigned below are populated;
        # the remaining columns that were stubbed out here (province/city/area,
        # coordinates, unit, tags, images, act types, reader statistics, ...)
        # are intentionally left unset.
        copy_result = dict()
        copy_result["ID_"] = row_key
        copy_result["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        copy_result["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        copy_result["URL_"] = data["URL_"]
        copy_result["APP_VERSION_"] = "BRANCH"
        # Bank code / name are the entity code / name with the
        # private-banking-news marker stripped.
        copy_result["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
            "PRIVATEINFO", "")
        copy_result["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "")
        copy_result["PERIOD_CODE_"] = data["NOTICE_TIME_"].replace("-", "")
        # CREATE_TIME_ is the local wall-clock time of this conversion.
        copy_result["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S")
        copy_result["SPIDER_TIME_"] = data["DATETIME_"]
        copy_result["CREATE_BY_ID_"] = "P0131857"
        copy_result["CREATE_BY_NAME_"] = "钟楷文"
        copy_result["M_STATUS_"] = "0"
        copy_result["DELETE_STATUS_"] = "0"
        copy_result["DATA_STATUS_"] = "uncheck"

        # SOURCE_ is the "scheme://host" prefix of the URL.  Matching up to
        # (but not requiring) the first "/" also accepts URLs without a path,
        # which previously failed with an IndexError.
        source = re.search(r"https?://[^/]+", data["URL_"])
        if source is None:
            raise ValueError(
                "cannot derive SOURCE_ from URL_ {!r}".format(data["URL_"]))
        copy_result["SOURCE_"] = source.group(0)
        copy_result["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        copy_result["ACT_NAME_"] = data["TITLE_"]
        copy_result["DETAILS_"] = data["CONTENT_"]
        copy_result["PUBLISH_TIME_"] = data["NOTICE_TIME_"]
        return copy_result

    def _next_document(self, cursor):
        """Return the next document of ``cursor`` or ``None`` when exhausted.

        A server-selection timeout is logged and retried once after a
        3 second pause; a second timeout propagates to the caller.
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
            try:
                return next(cursor)
            except StopIteration:
                # The retry may legitimately hit the end of the cursor.
                return None

    def run(self, limit=100):
        """Process documents of the collection and log a summary.

        For each document: reshape it, upsert it into Phoenix, then update
        the MongoDB document with ``{"d": 1}``.  Documents that fail any step
        are logged and skipped.  The cursor is always closed, even when an
        unexpected exception aborts the run.

        :param limit: maximum number of documents to read in this run.  The
            default of 100 matches the previously hard-coded cap; pass
            ``None`` to process the whole cursor.
        """
        # Table maintenance (drop/add column/create via the Phoenix client)
        # used to live here as commented-out code; it is not part of a run.
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        fetched = 0
        try:
            while limit is None or fetched < limit:
                data = self._next_document(mongo_data_list)
                if data is None:
                    break
                fetched += 1
                self.find_count += 1

                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])

                # Reshape the document.
                try:
                    re_data = self.data_shuffle(data=data)
                except Exception as e:
                    self.logger.info(
                        "数据清洗失败 {}, id: {}".format(e, self.data_id))
                    continue

                if not re_data:
                    self.bad_count += 1
                    continue

                # Upsert into HBase.
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                # Count the insert right away so that a failing MongoDB
                # update below cannot hide a row that did reach HBase.
                if success_count > 0:
                    self.success_count += success_count
                    if self.success_count % 10 == 0:
                        self.logger.info("HBase 插入成功 {} 条".format(
                            self.success_count))

                # Add {d: 1} to the source document.
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #3
0
    def run(self):
        """Clean the documents of every entity code and load them into the
        Phoenix table ``FUND``.

        For each entry of ``self.code_list`` the module of the same name is
        imported and its ``data_shuffle(data, list_SUBS_STATUS, list_TYPE)``
        is applied to every MongoDB document returned by
        ``self.get_data_from_mongo``.  The resulting row(s) are upserted into
        Phoenix, and the ids of the handled documents are updated in MongoDB
        with ``{"d": 1}`` in batches.  Both clients are closed when the run
        ends, including when it ends with an exception.
        """
        # Phoenix client for the target table (mind the table name).
        p_client = PhoenixHbase(table_name="FUND")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client for the source collection.
        m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Table maintenance (drop/create of "FUND") used to live here as
        # commented-out code; it is not part of a normal run.

        def forget(data_id, copy_data):
            """Drop a failed document from the pending bookkeeping lists.

            Guarded so that a second failure for the same document (or a
            document whose id was already flushed) cannot raise ValueError
            from inside an ``except`` handler.
            """
            if data_id in self.remove_id_list:
                self.remove_id_list.remove(data_id)
            if copy_data in self.copy_mongo_data_list:
                self.copy_mongo_data_list.remove(copy_data)

        def flag_pending():
            """Add {d: 1} to all pending ids in MongoDB and reset the list."""
            if not self.remove_id_list:
                return
            update_count = m_client.update_to_mongodb(
                collection=collection,
                data_id=self.remove_id_list,
                data_dict={"d": 1})
            self.remove_count += update_count
            # Start a fresh list so already-flagged ids are not sent again.
            self.remove_id_list = []
            self.logger.info("MongoDB 更新成功")

        total_found = 0
        try:
            # Lookup tables handed to every data_shuffle() call.
            list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
            list_TYPE = self.dict_from_mysql("FUND_TYPE")

            # Walk the ENTITY_CODE_ list.
            for entity_code in self.code_list:
                module_name = __import__(entity_code)
                self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

                self.remove_id_list = []
                self.copy_mongo_data_list = []
                self.branch_code_list = []
                try:
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code)
                except pymongo.errors.ServerSelectionTimeoutError:
                    # One retry after a short pause.
                    sleep(1)
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code)

                if not mongo_data_list:
                    continue

                # Clean the data and insert it into HBase.
                once_count = 0
                for data in mongo_data_list:
                    total_found += 1
                    data_id = data["_id"]
                    copy_data = {}
                    self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        copy_data = deepcopy(data)
                        self.copy_mongo_data_list.append(copy_data)
                        re_data = module_name.data_shuffle(
                            data, list_SUBS_STATUS, list_TYPE)
                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        forget(data_id, copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    # data_shuffle may return one row (dict) or many (list).
                    if isinstance(re_data, list):
                        rows, log_each_row = re_data, True
                    elif isinstance(re_data, dict):
                        rows, log_each_row = [re_data], False
                    else:
                        rows, log_each_row = [], False

                    for row in rows:
                        if not row:
                            continue
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=row)
                            once_count += success_count
                            self.success_count += success_count
                            if log_each_row:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数 {} 条".format(
                                        success_count))
                            elif self.success_count % 100 == 0:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数 {} 条".format(
                                        self.success_count))
                            # Add {d: 1} every 50 successful inserts.
                            if self.success_count % 50 == 0:
                                flag_pending()
                        except Exception as e:
                            forget(data_id, copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                # find_count now covers every entity handled so far instead
                # of only the last one.
                self.find_count = total_found

                if once_count > 0:
                    # Flush the tail of the batch: without this the last
                    # (< 50) handled documents were never flagged.
                    try:
                        flag_pending()
                    except Exception as e:
                        self.logger.warning("MongoDB 更新失败, {}".format(e))
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
        finally:
            # Close the connections.
            m_client.client_close()
            p_client.close_client_phoenix(connection=connection)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #4
0
파일: __init__.py 프로젝트: ILKKAI/dataETL
    def run(self):
        """Clean the documents of every entity code and load them into the
        Phoenix table ``NEWS_FINASSIST``.

        For each entry of ``self.code_list`` the module of the same name is
        imported and its ``data_shuffle([data])`` is applied to every MongoDB
        document returned by ``self.get_data_from_mongo``.  Rows returned as
        a list are enriched through ``self.get_brief_from_ai`` (except for
        ``CNINFONEWS``), upserted into Phoenix, and the source document is
        then updated in MongoDB with ``{"d": 1}``.  Both clients are closed
        when the run ends, including when it ends with an exception.
        """
        # Phoenix client for the target table.
        p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client for the source collection.
        m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Table maintenance (drop/create of "NEWS_FINASSIST") used to live
        # here as commented-out code; it is not part of a normal run.

        total_found = 0
        try:
            # Walk the ENTITY_CODE_ list.
            for entity_code in self.code_list:
                module_name = __import__(entity_code)
                self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

                # NOTE(review): hard-coded find_id for CAIJINGNEWS, passed on
                # to get_data_from_mongo - presumably a resume point; confirm
                # it is still wanted.
                if entity_code == "CAIJINGNEWS":
                    find_id = "5c6bfa508d7fee512a4ca68f"
                else:
                    find_id = ""

                try:
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code,
                        find_id=find_id)
                except pymongo.errors.ServerSelectionTimeoutError:
                    # One retry after a short pause.
                    sleep(1)
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code,
                        find_id=find_id)

                if not mongo_data_list:
                    continue

                # Clean the data and insert it into HBase.
                once_count = 0
                for _ in range(1000000):
                    try:
                        data = mongo_data_list.__next__()
                    except pymongo.errors.ServerSelectionTimeoutError:
                        continue
                    except StopIteration:
                        break
                    total_found += 1

                    data_id = data["_id"]
                    if self.success_count % 100 == 0:
                        self.logger.info(
                            "running on data_id: {}".format(data_id))
                    try:
                        del data["_id"]
                        re_data = module_name.data_shuffle([data])
                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            if not list_data:
                                continue
                            # Enrich the row through the AI service; on
                            # failure fall back to the un-enriched ROW (not
                            # the whole result list, which cannot be
                            # upserted).
                            try:
                                if entity_code != "CNINFONEWS":
                                    ai_data = self.get_brief_from_ai(
                                        data=list_data)
                                else:
                                    ai_data = list_data
                            except Exception as e:
                                self.logger.info(
                                    "AI 调取失败, 错误信息 {}".format(e))
                                ai_data = list_data

                            # Upsert into HBase.
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=ai_data)
                                once_count += success_count
                                self.success_count += success_count
                                if self.success_count % 10 == 0:
                                    self.logger.info(
                                        "HBase 插入成功, 成功条数{}条".format(
                                            once_count))
                            except Exception as e:
                                self.logger.warning(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue

                            # Add {d: 1} to the source document.
                            try:
                                update_count = m_client.update_to_mongodb(
                                    collection=collection,
                                    data_id=data_id,
                                    data_dict={"d": 1})
                                self.remove_count += update_count
                                if self.remove_count % 10 == 0:
                                    self.logger.info(
                                        "MongoDB 更新成功, 成功条数 {} 条".format(
                                            "10"))
                            except Exception as e:
                                self.logger.warning(
                                    "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue

                    elif isinstance(re_data, dict):
                        # NOTE(review): unlike the list branch, a single-dict
                        # result is upserted without the AI step and without
                        # the {d: 1} update - confirm this is intended.
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=re_data)
                            once_count += success_count
                            self.success_count += success_count
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    success_count))
                        except Exception as e:
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                # find_count now covers every entity handled so far instead
                # of only the last one.
                self.find_count = total_found

                if once_count > 0:
                    self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                        entity_code, once_count))
                mongo_data_list.close()
        finally:
            # Close the connections.
            m_client.client_close()
            p_client.close_client_phoenix(connection=connection)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #5
0
class Meipian(object):
    """Copy records from the "meipian_CCBDATA" MongoDB collection into the
    "MEIPIAN_CCBDATA" Phoenix/HBase table.

    Each record is reshaped by :meth:`data_shuffle`, upserted through the
    Phoenix client, and then flagged in MongoDB with ``{"d": 1}``.
    Counters on the instance are logged as a summary at the end of
    :meth:`run`.
    """

    def __init__(self):
        # MongoDB client wrapper and the collection it resolves to
        self.m_client = MongoClient(mongo_collection="meipian_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client wrapper and an open Phoenix connection
        self.p_client = PhoenixHbase(table_name="MEIPIAN_CCBDATA")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run statistics reported at the end of run()
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the record currently being processed
        self.data_id = ""

    def data_shuffle(self, data):
        """Reshape one MongoDB document into a row dict for the HBase table.

        :param data: source document; must provide the keys read below
            (``TITLE_``, ``ENTITY_CODE_``, ``CREATE_TIME``, ``CONTENT_``, ...).
        :return: the row dict, or ``None`` when ``TITLE_`` is empty/falsy.
        :raises KeyError: when a required key is missing from ``data``.
        """
        if not data["TITLE_"]:
            return None

        re_data = dict()

        # HBase row key: "<ENTITY_CODE_>_<md5 hex digest of the title>"
        hash_title = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

        # Common "C" column-family fields
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]

        # Both the period code and the publish time derive from CREATE_TIME
        created = arrow.get(data["CREATE_TIME"])
        re_data["PERIOD_CODE_"] = str(created.format("YYYYMMDD"))
        re_data["PUBLISH_TIME_"] = str(created.format("YYYY-MM-DD HH:mm:ss"))
        re_data["STATUS_"] = "UNPROCESSED"
        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["REMARK_"] = ""
        re_data["CREATE_TIME_"] = data["DATETIME_"]

        re_data["URL_"] = data["URL_"]
        re_data["TITLE_"] = data["TITLE_"]
        re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["DEALTIME_"] = str(data["DEALTIME_"])
        # NOTE(review): the source keys below have no trailing underscore,
        # unlike the others - presumably that is how the crawler stores them.
        re_data["VISIT_COUNT_"] = data["VISIT_COUNT"]
        re_data["PRAISE_COUNT_"] = data["PRAISE_COUNT"]
        re_data["COMMENT_COUNT_"] = data["COMMENT_COUNT"]
        re_data["SOURCE_"] = data["SOURCE_"]

        return re_data

    def run(self, f_id="5c6fe11b9bb3df6b0ec6168b"):
        """Process every document returned by the MongoDB query.

        :param f_id: value passed as ``data_id`` to ``all_from_mongodb``;
            presumably the ``_id`` to resume from - confirm against that
            helper. Defaults to the id that used to be hard-coded here.
        """
        # One-off table setup, kept for reference:
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # table_sql = ('create table "MEIPIAN_CCBDATA" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."URL_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,'
        #              ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, '
        #              '"C"."TITLE_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."ENTITY_NAME_" varchar,'
        #              '"C"."VISIT_COUNT_" varchar, "C"."PRAISE_COUNT_" varchar, "C"."COMMENT_COUNT_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."SOURCE_" varchar, "C"."PUBLISH_TIME_" varchar,'
        #              '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        mongo_data_list = self.m_client.all_from_mongodb(self.collection,
                                                         data_id=f_id)
        try:
            # The loop is bounded by the reported count plus some slack;
            # StopIteration ends it as soon as the cursor is exhausted.
            for _ in range(mongo_data_list.count() + 100):
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    self.logger.info(
                        "MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                    time.sleep(3)
                    # The single retry may itself hit the end of the cursor.
                    try:
                        data = next(mongo_data_list)
                    except StopIteration:
                        break

                self.find_count += 1
                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])
                # todo remove and upsert data from mongo

                # Reshape the document; any failure skips this record only.
                try:
                    re_data = self.data_shuffle(data=data)
                except Exception as e:
                    self.logger.info(
                        "数据清洗失败 {}, id: {}".format(e, self.data_id))
                    continue

                if not re_data:
                    self.bad_count += 1
                    continue

                # Upsert the row into HBase through Phoenix.
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                # Count the HBase write right away so a later MongoDB
                # failure does not hide a row that was actually stored.
                if success_count > 0:
                    self.success_count += success_count

                # Flag the source document as handled ({d: 1}).
                try:
                    self.m_client.update_to_mongodb(
                        collection=self.collection,
                        data_id=self.data_id,
                        data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info(
                        "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                            self.data_id, e))
                    continue

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
        finally:
            # Release the cursor even when the loop aborts with an error.
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #6
0
class HexunOpinion(object):
    """Copy records from the "HEXUNOPINION" MongoDB collection into the
    "SENTIMENT" Phoenix/HBase table.

    Each record is reshaped by :meth:`data_shuffle`, upserted through the
    Phoenix client, and then flagged in MongoDB with ``{"d": 1}``.
    Counters on the instance are logged as a summary at the end of
    :meth:`run`.
    """

    def __init__(self):
        # MongoDB client wrapper and the collection it resolves to
        self.m_client = MongoClient(mongo_collection="HEXUNOPINION")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client wrapper and an open Phoenix connection
        self.p_client = PhoenixHbase(table_name="SENTIMENT")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run statistics reported at the end of run()
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the record currently being processed
        self.data_id = ""

    def data_shuffle(self, data):
        """Reshape one MongoDB document into a row dict for the HBase table.

        :param data: source document; must provide ``NOTICE_TIME_``,
            ``TITLE_``, ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``CONTENT_``,
            ``DATETIME_``, ``URL_`` and ``DEALTIME_``.
        :return: the row dict, or ``None`` when ``NOTICE_TIME_`` contains
            no ``":"`` (i.e. carries no time-of-day part).
        :raises KeyError: when a required key is missing from ``data``.
        """
        if ":" not in data["NOTICE_TIME_"]:
            return None

        re_data = dict()

        # HBase row key: "<ENTITY_CODE_>_<md5 hex digest of the title>"
        hash_title = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

        # "C" column-family fields
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        # First ten characters of NOTICE_TIME_ with the dashes removed
        re_data["PERIOD_CODE_"] = data["NOTICE_TIME_"][:10].replace("-", "")
        # Drop the copyright notice and everything after it on that line
        re_data["CONTENT_"] = re.sub(r"本报告版权归和讯财经传播研究所所有,未经书面授权允许,不得复制转载。\|.*",
                                     "", data["CONTENT_"])
        re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"][:10]
        re_data["STATUS_"] = "1"
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]

        return re_data

    def run(self):
        """Process every document returned by the MongoDB query.

        Valid records among the first four cursor positions are
        additionally marked with ``HOME_PAGE_ = "Y"``.
        """
        # One-off table setup, kept for reference:
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # table_sql = ('create table "SENTIMENT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."REMARK_" varchar, "C"."PERIOD_CODE_" varchar,'
        #              '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, '
        #              '"C"."SOURCE_" varchar, "C"."BRIEF_" varchar, "C"."IMAGE_" varchar, "C"."IMPORTANCE_" varchar,'
        #              '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."NOTICE_TIME_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."HOME_PAGE_" varchar) IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        try:
            # The loop is bounded by the reported count plus some slack;
            # StopIteration ends it as soon as the cursor is exhausted.
            for i in range(mongo_data_list.count() + 100):
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    self.logger.info(
                        "MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                    time.sleep(3)
                    # The single retry may itself hit the end of the cursor.
                    try:
                        data = next(mongo_data_list)
                    except StopIteration:
                        break

                self.find_count += 1
                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])
                # todo remove and upsert data from mongo

                # Reshape the document; any failure skips this record only.
                try:
                    re_data = self.data_shuffle(data=data)
                except Exception as e:
                    self.logger.info(
                        "数据清洗失败 {}, id: {}".format(e, self.data_id))
                    continue

                # A rejected record (None) is a bad record, not a crash:
                # it must not be subscripted for the HOME_PAGE_ flag.
                if not re_data:
                    self.bad_count += 1
                    continue

                if i < 4:
                    re_data["HOME_PAGE_"] = "Y"

                # Upsert the row into HBase through Phoenix.
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                # Count the HBase write right away so a later MongoDB
                # failure does not hide a row that was actually stored.
                if success_count > 0:
                    self.success_count += success_count

                # Flag the source document as handled ({d: 1}).
                try:
                    self.m_client.update_to_mongodb(
                        collection=self.collection,
                        data_id=self.data_id,
                        data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info(
                        "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                            self.data_id, e))
                    continue

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
        finally:
            # Release the cursor even when the loop aborts with an error.
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()