예제 #1
0
 def match_weibo_code(self, match):
     """Run a match query against the "WEIBOBASICINFO" collection.

     :param match: match condition forwarded unchanged to
         ``MongoClient.match_from_mongo``
     :return: whatever ``match_from_mongo`` returns when asked for the
         "WEIBO_CODE_" output field
     """
     client = MongoClient(mongo_collection="WEIBOBASICINFO")
     database, names = client.client_to_mongodb()
     target = client.get_check_collection(database, names)
     return client.match_from_mongo(
         collection=target, match=match, output="WEIBO_CODE_")
예제 #2
0
파일: generator.py 프로젝트: ILKKAI/dataETL
class Generate_task(threading.Thread):
    def __init__(self, store, queue, values=()):
        '''
        Consumer thread.

        FIFO means First In, First Out. Queue provides a basic FIFO
        container; maxsize is an integer giving the upper bound on the number
        of items the queue can hold. Once that bound is reached, inserting
        blocks until items are consumed. If maxsize is less than or equal to
        0 the queue size is unlimited.

        The task function (``rask``) can be customised by subclasses.

        :param store: label for this worker; within this class it is only
            printed by ``run``
        :param queue: queue object the task is taken from
        :param values: value used for slicing (the original note says int,
            the default is an empty tuple); not read inside this class
        '''
        threading.Thread.__init__(self)
        self.queue = queue
        self.store = store
        self.data = []
        self.values = values
        self.result = ''

        collection_name = ''
        # MongoDB connection
        self.m_client = MongoClient(mongo_collection=collection_name)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        # MongoDB "old" connection.
        # The collection handles of spider_data_old are found by iterating
        # over query results, so this client has to be set up by hand.
        self.old_client = MongoClient(mongo_collection=collection_name)
        self.old_client.mongo_db = "spider_data_old"

    def rask(self, *args, **kwargs):
        """Task hook; does nothing here and is meant to be overridden."""
        pass

    def get_result(self, result):
        '''
        Store the task result on the instance.

        A thread's return value is hard to get hold of, so it is exposed as
        the ``result`` attribute instead.

        :param result: value returned by ``rask``
        :return: None
        '''
        self.result = result

    def run(self):
        """Take one item from the queue, run ``rask`` and record its result."""
        # Fetch the task before entering the try block so that task_done()
        # in ``finally`` is only ever paired with a get() that succeeded.
        self.queue.get()
        try:
            # Wrap in a 1-tuple: with a bare tuple ``store`` the %-operator
            # would treat it as the argument list and raise TypeError.
            print('This is store %s' % (self.store,))
            result = self.rask()
            self.get_result(result)
        except Exception as e:
            print(e)
        finally:
            self.queue.task_done()  # signal that the task is finished
예제 #3
0
    def run(self):
        """Count duplicate records per entity code in "ORGANIZE_FINASSIST".

        For every entity code in ``self.code_list`` the records returned by
        ``self.get_data_from_mongo`` are fingerprinted with the MD5 of their
        "ADDR_" field, or of "CONTENT_" when "ADDR_" is absent. Records with
        neither field are ignored. ``self.count`` is reset for each entity
        code, incremented for every fingerprint seen before, and logged.
        """
        # Create the MongoDB client and resolve the collection handle.
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        try:
            # Walk the ENTITY_CODE_ list.
            for entity_code in self.code_list:
                self.count = 0
                # A set gives O(1) membership tests; digests from both
                # fields share one pool, as before.
                seen_hashes = set()
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

                if mongo_data_list:
                    self.logger.warning("{} 查取成功".format(entity_code))
                    self.logger.warning(
                        "当前共有{}条".format(mongo_data_list.count()))
                    for data in mongo_data_list:
                        if "ADDR_" in data:
                            text = data["ADDR_"]
                        elif "CONTENT_" in data:
                            text = data["CONTENT_"]
                        else:
                            continue
                        digest = hashlib.md5(text.encode("utf-8")).hexdigest()
                        if digest in seen_hashes:
                            self.count += 1
                        else:
                            seen_hashes.add(digest)
                else:
                    self.logger.warning("{} 无数据".format(entity_code))

                # Logged for every entity code, including empty ones.
                self.logger.warning("重复数据{}条".format(self.count))
        finally:
            # Close the connection even when a query or a record fails.
            m_client.client_close()
        self.logger.handlers.clear()
예제 #4
0
def run():
    """Fetch records from the "CommonBidding" collection and clean each one.

    The search is retried once, after a one-second pause, when the server
    selection times out; a second timeout propagates to the caller. Every
    record returned is passed through ``data_shuffle``.
    """
    shuffle_list = []
    count = 0

    # Create the MongoDB query client and resolve the collection handle.
    client = MongoClient(mongo_collection="CommonBidding")
    database, names = client.client_to_mongodb()
    target = client.get_check_collection(db=database, collection_list=names)
    # These attributes are assigned only after the collection is resolved.
    client.mongo_db = "spider_data"
    client.mongo_entity_code = "500000CQGGZY"

    try:
        records = client.search_title_from_mongodb(target)
    except pymongo.errors.ServerSelectionTimeoutError:
        print("正在重新连接")
        time.sleep(1)
        records = client.search_title_from_mongodb(target)

    for record in records:
        data_list = data_shuffle(record)
예제 #5
0
    def run(self):
        """Clean "JSFUND_CCBDATA" records and upsert them into Phoenix "FUND".

        For every entity code in ``self.code_list`` a module of the same name
        is imported and its ``data_shuffle`` is applied to each MongoDB
        record. The cleaned result (a dict, or a list of dicts) is upserted
        row by row through ``upsert_to_phoenix_by_one``. Whenever
        ``self.success_count`` is a multiple of 50 the ids collected in
        ``self.remove_id_list`` are flagged in MongoDB with ``{"d": 1}``.
        Records that fail cleaning or insertion are logged and dropped from
        the bookkeeping lists; processing continues with the next record.
        """
        count = 0

        def forget_record(data_id, copy_data):
            """Drop a failed record from the pending bookkeeping lists."""
            # One source record can expand into several rows, so it may fail
            # more than once; its copy may also never have been appended.
            # An unguarded list.remove() would then raise ValueError from
            # inside the except handler and abort the whole run.
            if data_id in self.remove_id_list:
                self.remove_id_list.remove(data_id)
            if copy_data in self.copy_mongo_data_list:
                self.copy_mongo_data_list.remove(copy_data)

        # Create the Phoenix client -- mind the table name.
        p_client = PhoenixHbase(table_name="FUND")
        p_client.verify_list = self.verify_list
        # Connect to Phoenix.
        connection = p_client.connect_to_phoenix()
        # Create the MongoDB query client.
        m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Lookup tables handed to each module's data_shuffle.
        list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
        list_TYPE = self.dict_from_mysql("FUND_TYPE")

        # # Drop the table
        # p_client.drop_table_phoenix(connection=connection)

        # # CREATE statement of the fund table
        # sql = ('create table "FUND" ("ID_" varchar primary key,"C"."ENTITY_CODE_" varchar,"C"."AREA_CODE_" varchar,'
        #     '"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,"C"."UNIT_CODE_" varchar,"C"."PERIOD_CODE_" varchar,"C"."REMARK_" varchar,'
        #     '"C"."CREATE_TIME_" varchar,"C"."UPDATE_TIME_" varchar,"C"."STATUS_" varchar,"C"."CODE_" varchar,"C"."NAME_" varchar,'
        #     '"C"."FUND_OLD_VALUE_" varchar,"C"."TOTAL_OLD_VALUE_" varchar,"C"."FUND_NEW_VALUE_" varchar,"C"."TOTAL_NEW_VALUE_" varchar,'
        #     '"C"."INVEST_PERIOD_" varchar,"C"."DAILY_RATE_" varchar,"C"."YEAR_REWARD_" varchar,"C"."SUBS_STATUS_" varchar,'
        #     '"C"."ATONEM_STATUS_" varchar,"C"."TYPE_" varchar,"C"."NEWEST_VALUE_" varchar,"C"."TOTAL_VALUE_" varchar,'
        #     '"C"."POPULARITY_" varchar,"C"."RATING_" varchar,"C"."ENTITY_NAME_" varchar,"C"."OLD_VALUE_" varchar,'
        #     '"C"."UNIT_VALUE_" varchar,"C"."SCALE_" varchar,"C"."ESTABLISH_DATE_" varchar,"C"."RISK_LEVEL_" varchar,'
        #     '"C"."BASE_INFO_" varchar,"C"."YIELD_" varchar,"C"."INVEST_" varchar,"C"."MONTH_RATE_" varchar,'
        #     '"C"."QUARTER_RATE_" varchar,"C"."HALF_YEAR_RATE_" varchar,"C"."URL_" varchar,"C"."HISTORY_RATE_" varchar,'
        #     '"C"."FUND_STATUS_" varchar,"C"."COMPANY_" varchar,"C"."SUBS_STATUS_CODE_" varchar,"C"."TYPE_CODE_" varchar)IMMUTABLE_ROWS = true')
        #
        # # Create the table
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Walk the ENTITY_CODE_ list.
        for entity_code in self.code_list:
            # Each entity code has a cleaning module of the same name.
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause; a second timeout propagates.
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase.
            once_count = 0
            # NOTE(review): assigned, not accumulated, so the final summary
            # reports only the last non-empty entity's count -- confirm that
            # is intended.
            self.find_count = mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = module_name.data_shuffle(
                        data, list_SUBS_STATUS, list_TYPE)

                    if not re_data:
                        # Counted as bad data; its id stays in
                        # remove_id_list, exactly as before.
                        self.bad_count += 1
                        continue
                except Exception as e:
                    forget_record(data_id, copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue
                        # Insert one row into Phoenix/HBase.
                        try:
                            count += 1
                            print(count)
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=list_data)
                            once_count += success_count
                            self.success_count += success_count
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    success_count))
                            # Flag the collected ids with {d: 1}.
                            if self.success_count % 50 == 0:
                                update_count = m_client.update_to_mongodb(
                                    collection=collection,
                                    data_id=self.remove_id_list,
                                    data_dict={"d": 1})

                                self.remove_count += update_count
                                self.logger.info("MongoDB 更新成功")
                        except Exception as e:
                            forget_record(data_id, copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                elif isinstance(re_data, dict):
                    # re_data is known to be non-empty here (checked above).
                    # Insert one row into Phoenix/HBase.
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        if self.success_count % 100 == 0:
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    self.success_count))
                        # Flag the collected ids with {d: 1}.
                        if self.success_count % 50 == 0:
                            update_count = m_client.update_to_mongodb(
                                collection=collection,
                                data_id=self.remove_id_list,
                                data_dict={"d": 1})

                            self.remove_count += update_count
                            self.logger.info("MongoDB 更新成功")
                    except Exception as e:
                        forget_record(data_id, copy_data)
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))

            if once_count > 0:
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

        # Close the connections and log the run summary.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #6
0
파일: WECHAT.py 프로젝트: ILKKAI/dataETL
class WechatScript(object):
    def __init__(self, entity_type="WECHAT"):
        """
        Set up the clients, lookup tables and counters used by the script.

        :param entity_type: name used as the Phoenix table name
            (defaults to "WECHAT")
        """
        self.entity_type = entity_type
        self.logger = Logger().logger

        # Phoenix client and its connection.
        self.p_client = PhoenixHbase(table_name=self.entity_type)
        self.connection = self.p_client.connect_to_phoenix()

        # MongoDB client and the "WECHAT" collection handle.
        self.m_client = MongoClient(mongo_collection="WECHAT")
        mongo_db, mongo_names = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=mongo_db, collection_list=mongo_names)

        # MySQL helper.
        self.mysql_client = GenericScript(entity_code=None, entity_type=None)

        self.remove_id_list = []
        self.copy_mongo_data_list = []
        self.verify_list = [
            "ID_",
            "ENTITY_CODE_",
            "URL_",
            "AREA_CODE_",
            "BANK_CODE_",
            "BANK_NAME_",
            "UNIT_CODE_",
            "PERIOD_CODE_",
            "CONTENT_",
            "CONTENT_TYPE_",
            "REMARK_",
            "CREATE_TIME_",
            "UPDATE_TIME_",
            "TITLE_",
            "ENTITY_NAME_",
            "DEALTIME_",
            "DATETIME_",
            "STATUS_",
            "WECHAT_NAME_",
            "WECHAT_ID_",
        ]

        # Bank code -> BANK_NAME_ lookup.
        self.name_dict = dict([
            ("ICBC", "中国工商银行"),
            ("ABC", "中国农业银行"),
            ("BOC", "中国银行"),
            ("CCB", "中国建设银行"),
            ("BOCOM", "交通银行"),
            ("PSBC", "中国邮政储蓄银行"),
            ("CZB", "浙商银行"),
            ("CBHB", "渤海银行"),
            ("ECITIC", "中信银行"),
            ("CEB", "中国光大银行"),
            ("HXB", "华夏银行"),
            ("CMBC", "中国民生银行"),
            ("CMB", "招商银行"),
            ("CIB", "兴业银行"),
            ("CGB", "广发银行"),
            ("PAB", "平安银行"),
            ("SPDB", "浦发银行"),
            ("EBCL", "恒丰银行"),
        ])

        # Request headers sent with the name-lookup request.
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
        self.headers = {
            "User-Agent": user_agent,
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Host": "weixin.sogou.com",
            "Referer": "http://weixin.sogou.com/",
        }

        # Search URL template; "{}" receives the WeChat id.
        self.url = "http://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_="

        # Run counters.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def check_name(self, wechat_id):
        check_dict = dict()
        with open("wechat_id_name.txt", "r", encoding="utf-8") as rf:
            read_data = rf.read()
            if read_data:
                read_data = read_data.replace("\'", "\"")
                read_data = read_data.replace(": None", ": \"None\"")
                # print(read_data)
                check_dict = json.loads(read_data)
            else:
                wechat_name = self.req_for_name(wechat_id)
                check_dict[wechat_id] = wechat_name
                with open("wechat_id_name.txt", "w", encoding="utf-8") as wf:
                    wf.write(str(check_dict))
                return check_dict[wechat_id]
        if wechat_id in check_dict:
            return check_dict[wechat_id]
        else:
            wechat_name = self.req_for_name(wechat_id)
            check_dict[wechat_id] = wechat_name
            with open("wechat_id_name.txt", "w", encoding="utf-8") as wf:
                wf.write(str(check_dict))
            return check_dict[wechat_id]

    def req_for_name(self, wechat_id):
        """Look up an account name by requesting ``self.url`` through a proxy.

        Every attempt fetches a fresh proxy from the proxy-list service,
        requests the search page through it and takes the first text node
        matching ``//p[@class="tit"]/a``. A failed page request is retried
        with a new proxy. ``self.error_count`` counts consecutive pages
        without a name; it is reset on success and when giving up.

        :param wechat_id: value substituted into ``self.url``
        :return: the first name found, or the string "None" after five
            consecutive pages without a match
        """
        max_misses = 5
        url = self.url.format(wechat_id)

        # A loop rather than recursion: repeated proxy failures can no
        # longer exhaust the recursion limit, and every exit path returns a
        # value (the recursive retry used to drop its result).
        while True:
            resp1 = requests.get(
                url=
                r"http://h.wandouip.com/get/ip-list?pack=853&num=1&xy=1&type=2&lb=\r\n&mr=1&"
            )
            resp2 = resp1.json()["data"][0]
            time.sleep(2)
            try:
                response = requests.get(
                    url=url,
                    headers=self.headers,
                    proxies={
                        "http": "{}:{}".format(resp2["ip"], resp2["port"])
                    })
            except Exception as e:
                print(1, e)
                self.logger.info("error ip: {}".format(resp2))
                time.sleep(5)
                continue

            html = HTML(response.content.decode())
            name = html.xpath('//p[@class="tit"]/a/text()')
            if name:
                self.error_count = 0
                return name[0]

            self.error_count += 1
            # ">=" plus the reset below: with "== 5" and no reset, the
            # counter moved past 5 after the first give-up and later
            # lookups could never terminate.
            if self.error_count >= max_misses:
                self.logger.info("wetchat id error: \"{}\"".format(wechat_id))
                self.error_count = 0
                return "None"
            time.sleep(2)

    def data_shuffle(self, data, province_list, city_list, area_list):
        """
        数据清洗
        :param data:
        :param province_list:
        :param city_list:
        :param area_list:
        :return: re_data or None
        """
        # BANK_CODE_正则匹配规则
        pattern = re.compile(
            r'ICBC|ABC|BOCOM|CCB|BOC|PSBC|CZB|CBHB|ECITIC|CEB|HXB|CMBC|CMB|CIB|CGB|PAB|SPDB|EBCL'
        )

        re_data = dict()

        if data["TITLE_"]:
            # HBase row_key
            hash_m = hashlib.md5()
            hash_m.update(data["TITLE_"].encode("utf-8"))
            hash_title = hash_m.hexdigest()
            row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

            # "C" 通用列族字段
            re_data["ID_"] = row_key
            re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
            # re_data["URL_"] = ""

            prov_c = None
            prov_n = None
            city_c = None
            city_n = None
            area_c = None
            area_n = None
            bank_n = None

            bank_c = pattern.match(data["ENTITY_CODE_"])
            if bank_c:
                re_data["BANK_CODE_"] = bank_c.group()
            else:
                return None

            # 正则去除银行名称,方便匹配地区编码
            bank_n = re.sub(
                r"{}银?行?|微信|[总分支]行".format(
                    self.name_dict[re_data["BANK_CODE_"]][:-2]), "",
                data["ENTITY_NAME_"])

            re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]]
            re_data["PERIOD_CODE_"] = data["PERIOD_CODE_"].replace("-", "")
            re_data["NOTICE_TIME_"] = data["PERIOD_CODE_"]
            re_data["STATUS_"] = "1"
            re_data["CONTENT_"] = data["CONTENT_"]
            re_data["REMARK_"] = ""

            for area in area_list:
                if area["NAME_"] in bank_n:
                    area_c = area["CODE_"]
                    area_n = area["NAME_"]
            if area_c:
                pass
            else:
                for prov in province_list:
                    if prov["NAME_"] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n, "")
                        break
                    elif prov["NAME_"][:-1] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:-1], "")
                        break
                    elif prov["NAME_"][:4] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:4], "")
                        break
                    elif prov["NAME_"][:3] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:3], "")
                        break
                    elif prov["NAME_"][:2] in bank_n:
                        prov_c = prov["CODE_"]
                        prov_n = prov["NAME_"]
                        bank_n = bank_n.replace(prov_n[:2], "")
                        break

                for city in city_list:
                    if len(city["NAME_"]) == 1:
                        continue
                    if prov_c:
                        if city["CODE_"][:2] == prov_c[:2]:
                            if city["NAME_"] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n, "")
                                break
                            elif city["NAME_"][:-1] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:-1], "")
                                break
                            elif city["NAME_"][:4] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:4], "")
                                break
                            elif city["NAME_"][:3] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:3], "")
                                break
                            elif city["NAME_"][:2] in bank_n:
                                city_c = city["CODE_"]
                                city_n = city["NAME_"]
                                bank_n = bank_n.replace(city_n[:2], "")
                                break
                    else:
                        if city["NAME_"] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n, "")
                            break
                        elif city["NAME_"][:-1] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:-1], "")
                            break
                        elif city["NAME_"][:4] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:4], "")
                            break
                        elif city["NAME_"][:3] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:3], "")
                            break
                        elif city["NAME_"][:2] in bank_n:
                            city_c = city["CODE_"]
                            city_n = city["NAME_"]
                            bank_n = bank_n.replace(city_n[:2], "")
                            break

                for area in area_list:
                    if city_c:
                        if area["CODE_"][:2] == city_c[:2]:
                            if area["NAME_"] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:-1] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:4] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:3] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:2] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                    elif prov_c:
                        if area["CODE_"][:2] == prov_c[:2]:
                            if area["NAME_"] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:-1] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:4] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:3] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                            elif area["NAME_"][:2] in bank_n:
                                area_c = area["CODE_"]
                                area_n = area["NAME_"]
                                break
                    else:
                        if area["NAME_"][:-1] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break
                        elif area["NAME_"][:4] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break
                        elif area["NAME_"][:3] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break
                        elif area["NAME_"][:2] in bank_n:
                            area_c = area["CODE_"]
                            area_n = area["NAME_"]
                            break

                # 特殊情况 星子县现为庐山市 喻家山位于武汉洪山区
                if "星子县" in data["ENTITY_NAME_"]:
                    area_c = "360483"
                    area_n = "庐山市"
                elif "喻家山" in data["ENTITY_NAME_"]:
                    area_c = "420111"
                    area_n = "洪山区"
                elif "江南西" in data["ENTITY_NAME_"]:
                    area_c = "440105"
                    area_n = "海珠区"
                elif "两路口" in data["ENTITY_NAME_"]:
                    area_c = "500103"
                    area_n = "渝中区"
                elif "大兴安岭" in data["ENTITY_NAME_"]:
                    area_c = "232700"
                    area_n = "大兴安岭地区"
                elif "张家港" in data["ENTITY_NAME_"]:
                    area_c = "320582"
                    area_n = "张家港市"
                elif "兴业银行新阳支行" in data["ENTITY_NAME_"]:
                    area_c = "230102"
                    area_n = "道里区"

                if area_c:
                    pass
                elif (not area_c) and city_c:
                    area_c = city_c
                    area_n = city_n
                elif (not area_c) and (not city_c) and prov_c:
                    area_c = prov_c
                    area_n = prov_n
                # 总行地区处理
                elif (not area_c) and (not city_c) and (not prov_c):
                    if re_data["BANK_CODE_"] == "ICBC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "ABC":
                        area_c = "110101"
                        area_n = "东城区"
                    elif re_data["BANK_CODE_"] == "BOCOM":
                        area_c = "310115"
                        area_n = "浦东新区"
                    elif re_data["BANK_CODE_"] == "CCB":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "BOC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "PSBC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "CZB":
                        area_c = "330103"
                        area_n = "下城区"
                    elif re_data["BANK_CODE_"] == "CBHB":
                        area_c = "120103"
                        area_n = "河西区"
                    elif re_data["BANK_CODE_"] == "ECITIC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "CEB":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "HXB":
                        area_c = "110101"
                        area_n = "东城区"
                    elif re_data["BANK_CODE_"] == "CMBC":
                        area_c = "110102"
                        area_n = "西城区"
                    elif re_data["BANK_CODE_"] == "CMB":
                        area_c = "440304"
                        area_n = "福田区"
                    elif re_data["BANK_CODE_"] == "CIB":
                        area_c = "350102"
                        area_n = "鼓楼区"
                    elif re_data["BANK_CODE_"] == "CGB":
                        area_c = "440104"
                        area_n = "越秀区"
                    elif re_data["BANK_CODE_"] == "PAB":
                        area_c = "440303"
                        area_n = "罗湖区"
                    elif re_data["BANK_CODE_"] == "SPDB":
                        area_c = "310101"
                        area_n = "黄浦区"
                    elif re_data["BANK_CODE_"] == "EBCL":
                        area_c = "370602"
                        area_n = "芝罘区"
            re_data["AREA_CODE_"] = area_c

            if area_c:
                re_data["UNIT_CODE_"] = re_data[
                    "BANK_CODE_"] + "_" + area_c[:4] + "00"
            if ("b" in data["BANK_NAME_"]) or ("B" in data["BANK_NAME_"]):
                return None

            if "DATETIME_" not in data:
                time_array = time.localtime(int(float(data["DEALTIME_"])))
                value_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
                re_data["CREATE_TIME_"] = value_time
            else:
                re_data["CREATE_TIME_"] = data["DATETIME_"]

            # data["UPDATE_TIME_"] = ""

            re_data["TITLE_"] = data["TITLE_"]
            re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"]
            re_data["WECHAT_ID_"] = data["WECHAT_"].replace(" ", "")
            re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
            re_data["DEALTIME_"] = str(data["DEALTIME_"])

            # print(area_c, area_n, data["ENTITY_NAME_"])
            return re_data
        else:
            return None

    def delete_data_from_mongo(self):
        """
        Delete the documents listed in ``self.remove_id_list`` from MongoDB.

        A server-selection timeout triggers exactly one retry.  Any other
        failure, including a failed retry, is logged and reported as ``None``.

        :return: the value returned by ``self.m_client.remove_from_mongo``
            (presumably the number of deleted documents -- confirm against
            the client implementation), or ``None`` when the deletion failed
        """
        try:
            try:
                return self.m_client.remove_from_mongo(
                    collection=self.collection,
                    remove_id_list=self.remove_id_list)
            except pymongo.errors.ServerSelectionTimeoutError:
                # Transient connection problem: try once more.  The retry sits
                # inside the outer ``try`` so that a second failure is logged
                # below instead of escaping from this handler.
                return self.m_client.remove_from_mongo(
                    collection=self.collection,
                    remove_id_list=self.remove_id_list)
        except Exception as e:
            # KeyError is a subclass of Exception, so the dedicated KeyError
            # handler that used to follow this clause could never run; one
            # generic handler covers both cases.
            self.logger.info(e)
            return None

    def upsert_and_delete(self, mongo_data_list, province_list, city_list,
                          area_list):
        """
        Clean every document of a MongoDB cursor and upsert it into HBase.

        Despite the name, only the Phoenix upsert is performed here: the
        MongoDB delete step and the copy into ``spider_data_old`` that used
        to follow it are disabled.

        For each document the method records its ``_id`` in ``self.data_id``,
        runs ``self.data_shuffle`` on it, resolves the account name through
        ``self.check_name`` and upserts the cleaned record.  Documents whose
        cleaning raises, or whose upsert raises ``jaydebeapi.DatabaseError``,
        are logged and skipped; documents for which ``data_shuffle`` returns a
        falsy value are counted in ``self.bad_count``.

        :param mongo_data_list: iterator (MongoDB cursor) yielding documents
        :param province_list: province reference data passed to data_shuffle
        :param city_list: city reference data passed to data_shuffle
        :param area_list: area reference data passed to data_shuffle
        :return: None; progress is kept in ``self.data_id``,
            ``self.success_count`` and ``self.bad_count``
        """
        # Iterate until the cursor is exhausted.  The previous
        # ``range(1000000)`` loop silently stopped after one million documents.
        while True:
            self.data_id = ""
            try:
                data = next(mongo_data_list)
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    # The cursor may simply be exhausted after the reconnect;
                    # treat that as a normal end of the run instead of letting
                    # StopIteration escape from the handler.
                    break

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))

            # Clean the raw document.
            try:
                re_data = self.data_shuffle(data=data,
                                            province_list=province_list,
                                            city_list=city_list,
                                            area_list=area_list)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if not re_data:
                # data_shuffle rejected the document.
                self.bad_count += 1
                continue

            # Resolve the official-account name from its id.
            re_data["WECHAT_NAME_"] = self.check_name(re_data["WECHAT_ID_"])

            # Upsert the cleaned record into HBase through Phoenix.
            try:
                count = self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=re_data)
            except jaydebeapi.DatabaseError as e:
                self.logger.info("错误 id: {}, 错误信息 {}".format(
                    self.data_id, e))
                continue

            if count > 0:
                self.success_count += count

            if self.success_count % 10 == 0:
                self.logger.info("HBase 插入成功 {} 条".format(
                    self.success_count))

            # NOTE: the follow-up stages are intentionally disabled: flagging
            # the source document with {"d": 1}, deleting it through
            # delete_data_from_mongo() and archiving a copy into the
            # "spider_data_old" database.  Re-enable them here, gated on a
            # successful upsert, if the source collection must be drained.

    def main(self):
        """
        Run one WECHAT migration pass from MongoDB into HBase.

        Loads the area reference lists from MySQL, queries MongoDB with the
        hard-coded checkpoint ``_id``, hands the resulting cursor to
        ``upsert_and_delete`` and finally logs the run counters.  When the
        pass raises ``jaydebeapi.DatabaseError`` it is restarted once, using
        the ``_id`` currently stored in ``self.data_id``.

        :return: None
        """
        # # Drop the table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()

        # # Table creation statement
        # table_sql = ('create table "WECHAT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar,'
        #              '"C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar,'
        #              '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, '
        #              '"T"."CONTENT_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."TITLE_" varchar,'
        #              '"C"."WECHAT_ID_" varchar, "C"."WECHAT_NAME_" varchar, "C"."ENTITY_NAME_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."PRAISES_" varchar,'
        #              '"C"."READ_NUM_" varchar, "C"."REPLIES_" varchar, "C"."RELAYS_" varchar,'
        #              '"C"."NOTICE_TIME_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # Create the table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # Checkpoint _id values used by earlier runs, kept for reference.
        # f_id = "5c1267258d7fee59f7d089f8"  # gte 10M
        # f_id = "5c1271a28d7fee66df0fdd83"  # gte 10M
        # f_id = "5c127e7b9bb3df7412b53b04"  # gte 10M
        # f_id = "5c1330d28d7fee4d9c87d6e1"  # gte 10M
        # f_id = "5c1330ed9bb3df2de33bb746"  # gte 10M
        # f_id = "5c13490a8d7fee79f1d9e87f"  # gte 10M
        # f_id = "5c1350ee8d7fee2d29b601ef"  # gte 10M
        # f_id = "5c1351c79bb3df0e23ee68c1"  # gte 10M
        # f_id = "5c13547d9bb3df06d41997d5"  # gte 10M
        # f_id = "5c1354849bb3df202508ee3e"  # gte 10M
        # f_id = "5c1354bd8d7fee44b881b11a"  # gte 10M
        # f_id = "5c1354e89bb3df1b2a6ef59c"  # gte 10M
        # f_id = "5c1355139bb3df197beb11c0"  # gte 10M
        # f_id = "5c1355328d7fee2f0997a3ac"  # gte 10M
        # f_id = "5c13558e8d7fee50ea04bd0a"  # gte 10M
        # f_id = "5c135a5f8d7fee5bf7db91b8"  # gte 10M
        # f_id = "5c135b0c8d7fee697fa5bd80"  # gte 10M
        # f_id = "5c135bd59bb3df4d7aa66cad"  # gte 10M
        # f_id = "5c135bdb9bb3df454c0157a3"  # gte 10M
        # f_id = "5c135bfc8d7fee73c8f84567"  # gte 10M
        # f_id = "5c135c119bb3df48aeb8fe63"  # gte 10M
        # f_id = "5c135dfe9bb3df4d7aa66cc2"  # gte 10M
        # f_id = "5c13602d8d7fee7f7a48c485"  # gte 10M
        # f_id = "5c1361858d7fee223825f805"  # gte 10M
        # f_id = "5c1361d68d7fee561806fc4d"  # gte 10M
        # f_id = "5c1362068d7fee223825f808"  # gte 10M
        # f_id = "5c1362159bb3df26bba60a05"  # gte 10M
        # f_id = "5c1366248d7fee6741adb5be"  # gte 10M
        # f_id = "5c1366418d7fee673f6c95cb"  # gte 10M
        # f_id = "5c1367099bb3df5a0e013c4d"  # gte 10M
        # f_id = "5c13686d8d7fee76ac78735b"  # gte 10M
        # f_id = "5c1368788d7fee6fcb24daa3"  # gte 10M
        # f_id = "5c1369438d7fee63412b04ff"  # gte 10M
        # f_id = "5c13697b9bb3df60429b5d31"  # gte 10M
        # f_id = "5c1389468d7fee6a94c413c3"  # gte 10M
        # f_id = "5c1389c29bb3df75adc8861a"  # gte 10M
        # f_id = "5c138b039bb3df75adc88620"  # gte 10M
        # f_id = "5c138e3d9bb3df074c4ec0b3"  # gte 10M
        # f_id = "5c138e4d8d7fee06a4f8fd59"  # gte 10M
        # f_id = "5c1391318d7fee168749a96e"  # gte 10M
        # f_id = "5c25a4f19bb3df51eba386b8"  # gte 10M
        # f_id = "5c2601ef9bb3df7d42fe2084"  # gte 10M
        # f_id = "5c2608099bb3df24f5db4527"  # gte 10M
        # f_id = "5c2608be9bb3df2d58d08e32"  # gte 10M
        # f_id = "5c260d2b9bb3df3c084d2a83"  # gte 10M
        # f_id = "5c2615868d7fee2771bb3914"  # gte 10M
        # f_id = "5c261d528d7fee3c1383db85"  # gte 10M
        # f_id = "5c26340e8d7fee66d784fe8a"  # gte 10M
        # f_id = "5c263b818d7fee630f0d3ac4"  # gte 10M
        # f_id = "5c263ee28d7fee04ddc62e31"  # gte 10M
        # f_id = "5c263f269bb3df0d29d1e1e5"  # gte 10M
        # f_id = "5c2766718d7fee2aa36fa166"  # gte 10M
        # f_id = "5c2b79ef8d7fee3025e02575"  # gte 10M
        # f_id = "5c2b854a9bb3df27dc669d5a"  # gte 10M
        # f_id = "5c2e00078d7fee1b60443cf3"  # gte 10M
        # f_id = "5c2f69028d7fee62d31a72db"  # gte 10M
        # f_id = "5c36a7948d7fee18d9333327"  # gte 10M
        # f_id = "5c36b9ff9bb3df332dfebe39"  # gte 10M
        # f_id = "5c3754579bb3df02b680150b"  # gte 10M
        # f_id = "5c375c969bb3df6afd18e22d"  # gte 10M
        # f_id = "5c38a1e59bb3df6b2ff2f269"  # gte 10M
        # f_id = "5c394e058d7fee6a2582d1d3"  # gte 10M
        # f_id = "5c3c983e9bb3df21ddf94a92"  # gte 10M
        # f_id = "5c3ca38a9bb3df60bca07833"  # gte 10M

        # Checkpoint for the current run.
        f_id = "5c3c983e9bb3df21ddf94a92"
        # f_id = ""
        self.data_id = f_id

        # The fourth element returned by the MySQL helper is not used here.
        provinces, cities, areas, _dir_areas = (
            self.mysql_client.area_from_mysql())

        def open_cursor():
            # Query MongoDB with whatever _id self.data_id currently holds.
            return self.m_client.all_from_mongodb(
                collection=self.collection, data_id=self.data_id)

        def process(cursor):
            self.upsert_and_delete(mongo_data_list=cursor,
                                   province_list=provinces,
                                   city_list=cities,
                                   area_list=areas)

        first_cursor = open_cursor()
        self.find_count += first_cursor.count()

        try:
            process(first_cursor)
        except jaydebeapi.DatabaseError:
            # Restart once with a fresh cursor built from self.data_id.
            self.logger.info("error id is: {}".format(self.data_id))
            process(open_cursor())

        # Final run summary, one log line per counter.
        summary = (
            ("本次共向 MongoDB 查取数据{}条", self.find_count),
            ("本次共向 HBase 插入数据{}条", self.success_count),
            ("本次共向 MongoDB 删除数据{}条", self.remove_count),
            ("本次共向 MongoDB 插入数据{}条", self.old_count),
            ("本次坏数据共 {} 条", self.bad_count),
        )
        for template, value in summary:
            self.logger.info(template.format(value))
        self.logger.handlers.clear()
예제 #7
0
    def run(self):
        """
        Migrate wealth-product documents from MongoDB into the
        ``FINPRODUCT_FINASSIST`` Phoenix/HBase table.

        For each entity the matching cleaning module is imported by name, the
        entity's documents are fetched from MongoDB (with one retry on a
        server-selection timeout), cleaned, and upserted one record at a time.
        ``self.remove_id_list`` and ``self.copy_mongo_data_list`` collect the
        ``_id`` values and pristine copies of the documents that were handled
        without a cleaning or insert error; a failing document is taken out
        of both lists again.

        :return: None; counters are accumulated on ``self.find_count``,
            ``self.success_count`` and ``self.bad_count``
        """
        # Create the Phoenix client and connect.
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # Create the MongoDB client used for querying.
        m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(db=db, collection_list=collection_list)
        # MongoDB "spider_data_old" archive client (disabled).
        # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        # Local testing
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["FINPRODUCT_FINASSIST"]

        # # Drop the table
        # p_client.drop_table_phoenix(connection=connection)

        # # Table creation statement
        # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, '
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar,'
        #        '"C"."YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar, "C"."START_FUNDS_" varchar,'
        #        '"C"."INVEST_PERIOD_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,'
        #        '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,'
        #        '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #        '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,'
        #        '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,'
        #        '"C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar, "C"."YIELD_STATMENT_" varchar,'
        #        '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,'
        #        '"C"."PURCHASE_" varchar, "T"."CONTENT_" varchar, "C"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # Create the table
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Add a column
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        def fetch_cursor(entity, module_name, find_id):
            """Select the entity's collection and return its document cursor."""
            if entity == "JSFIN_CCBDATA":
                m_client.mongo_collection = "JSFIN_CCBDATA"
                entity_collection = m_client.get_check_collection(
                    db=db, collection_list=collection_list)
                return module_name.ScriptCCB.get_data_from_mongo(
                    self=self, m_client=m_client,
                    collection=entity_collection, data_id=None)
            m_client.mongo_collection = "FINPRODUCT_FINASSIST"
            entity_collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)
            return self.get_data_from_mongo(
                m_client=m_client, collection=entity_collection,
                entity_code=entity, data_id=find_id)

        def unqueue(data_id, copy_data):
            """Take one document out of the pending lists (safe to repeat)."""
            if data_id in self.remove_id_list:
                self.remove_id_list.remove(data_id)
            # Remove by identity: the queued copy may no longer compare equal
            # to ``data`` once data_shuffle has modified it, and nothing was
            # queued at all when the failure happened before the append.
            for index, queued in enumerate(self.copy_mongo_data_list):
                if queued is copy_data:
                    del self.copy_mongo_data_list[index]
                    break

        for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
            # for entity in self.entity_list:
            module_name = __import__(entity)
            self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            # find_id = "5c3f118f8d7fee068da6ef53"
            find_id = None
            try:
                mongo_data_list = fetch_cursor(entity, module_name, find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry.  The collection is resolved again for every
                # entity, so the retry never reuses a stale collection handle.
                sleep(1)
                mongo_data_list = fetch_cursor(entity, module_name, find_id)

            if not mongo_data_list:
                continue

            # Clean the documents and upsert them into HBase.
            once_count = 0
            self.find_count += mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = None
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    if entity == "CHINANETFINANCIAL":
                        re_data = module_name.data_shuffle(data=data, sales_status=self.sales_status,
                                                           produc_category=self.produc_category,
                                                           revenue_type=self.revenue_type,
                                                           operaton_pattern=self.operaton_pattern,
                                                           purchase_amount=self.purchase_amount,
                                                           duration_type=self.duration_type)
                    elif entity == "JSFIN_CCBDATA":
                        re_data = module_name.ScriptCCB.data_shuffle(self=self, data=data)
                    else:
                        re_data = module_name.data_shuffle(data)
                except Exception as e:
                    unqueue(data_id, copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e))
                    continue

                if not re_data:
                    # Rejected by the cleaning step; it stays in the pending
                    # lists, exactly as before.
                    self.bad_count += 1
                    continue

                print(data_id)

                # A cleaning module returns either one record (dict) or
                # several (list); any other type is ignored.
                if isinstance(re_data, dict):
                    rows = [re_data]
                elif isinstance(re_data, list):
                    rows = re_data
                else:
                    rows = []

                for row in rows:
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=row)
                        once_count += success_count
                        self.success_count += success_count
                    except Exception as e:
                        # May run for several rows of the same document;
                        # unqueue() tolerates the repeated call.
                        unqueue(data_id, copy_data)
                        self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))

            if once_count > 0:
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

            # NOTE(review): this unconditional break stops after the first
            # entity whose query returned a truthy cursor, so the remaining
            # entities are never processed in the same run -- confirm whether
            # that is intended or a debugging leftover.
            break
            # NOTE: the follow-up stages are disabled: deleting the documents
            # in self.remove_id_list from MongoDB and inserting
            # self.copy_mongo_data_list into the "spider_data_old" database.
            # They were gated on once_count > 0 (at least one successful
            # upsert).

        # Close the MongoDB connection.
        m_client.client_close()
        # p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
예제 #8
0
class WeiboBasicInfoUpdate(object):
    """
    Refresh the ``FANS_`` column of a Phoenix table from MongoDB.

    ``main`` reads ``WEIBO_CODE_``/``FANS_`` pairs from the MongoDB
    collection, copies the fan count onto every Phoenix row with the same
    ``WEIBO_CODE_`` and upserts each row back.
    """

    def __init__(self,
                 table_name="CHA_BRANCH_WEIBO_BASIC",
                 collection_name="WEIBOBASICINFO"):
        """
        Open the Phoenix connection and prepare the MongoDB collection.

        :param table_name: Phoenix table to read from and upsert into
        :param collection_name: MongoDB collection holding the fan counts
        """
        # phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()
        # Mongo connection
        self.m_client = MongoClient(entity_code="CMBCMICROBLOG",
                                    mongo_collection=collection_name)
        self.mongo_host = "172.22.69.35"
        self.mongo_port = 20000
        # NOTE(review): both timeouts are given in milliseconds, so 60 means
        # 60 ms -- confirm this was not meant to be 60 seconds.
        self.m_client.client = pymongo.MongoClient(host=self.mongo_host,
                                                   port=self.mongo_port,
                                                   serverSelectionTimeoutMS=60,
                                                   connectTimeoutMS=60,
                                                   connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)
        # Log
        self.logger = Logger().logger

    def get_mongo_column_dict(self, collection, column1, column2):
        """
        Project two columns out of every document of *collection*.

        :param collection: MongoDB collection to aggregate over
        :param column1: name of the first field to keep
        :param column2: name of the second field to keep
        :return: the aggregation cursor; each document carries only the two
            requested fields (``_id`` is excluded)
        :raises Exception: when the aggregation raises ``TypeError``
        """
        mon_logger = Logger().logger
        try:
            mon_logger.info("开始查取数据")
            result = collection.aggregate([{
                "$project": {
                    "_id": 0,
                    column1: 1,
                    column2: 1
                }
            }])
            return result
        except TypeError as e:
            mon_logger.error(
                "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确".format(e))
            # Chain the original error so its traceback is not lost.
            raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e)) from e

        finally:
            # NOTE(review): the client is closed before the caller iterates
            # the returned cursor -- confirm that later batches can still be
            # fetched with the pymongo version in use.
            self.m_client.client.close()

    def main(self):
        """
        Copy the MongoDB fan counts onto the Phoenix rows and upsert them.

        Every Phoenix row is upserted, whether or not a fan count was found
        for it.  The Phoenix connection is closed when the method finishes,
        including when an upsert raises.

        :return: None
        """
        mongo_data_list = self.get_mongo_column_dict(
            collection=self.collection, column1="WEIBO_CODE_", column2="FANS_")

        # The MongoDB result can only be iterated once.  Scanning it again
        # for every Phoenix row left later rows without a match, so build the
        # lookup a single time.  setdefault keeps the first document seen for
        # a duplicated code, as a fresh scan would.
        fans_by_code = {}
        for mongo_data in mongo_data_list:
            # $project drops fields a document does not have; skip those.
            if "WEIBO_CODE_" in mongo_data and "FANS_" in mongo_data:
                fans_by_code.setdefault(mongo_data["WEIBO_CODE_"],
                                        mongo_data["FANS_"])

        # update to hbase
        result_generator = self.p_client.search_all_from_phoenix(
            connection=self.connection, dict_status=True)
        try:
            for result in result_generator:
                weibo_code = result["WEIBO_CODE_"]
                if weibo_code in fans_by_code:
                    result["FANS_"] = fans_by_code[weibo_code]
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=result)
        finally:
            self.connection.close()
예제 #9
0
def judge(item):
    """Tell whether ``item`` contains at least one of the tracked keywords.

    ``item`` may be anything that supports the ``in`` operator: for a string
    this is a substring test, for a list it is an element-membership test.

    :param item: container or string to inspect.
    :return: ``True`` if any keyword is found in ``item``, else ``False``.
    """
    keywords = ('长城', '长城科技', '长城信息')
    return any(keyword in item for keyword in keywords)


if __name__ == '__main__':
    # run()

    pass
    import pandas as pd

    main_mongo = MongoClient(entity_code="", mongo_collection="CommonBidding")
    db, collection_list = main_mongo.client_to_mongodb()
    collection = main_mongo.get_check_collection(
        db=db, collection_list=collection_list)
    # mon_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    mon_list = [
        '08',
        '09',
        '10',
        '11',
        '12',
        # '06',
    ]
    for _ in range(13):
        try:
            data_list = collection.find(
                {
예제 #10
0
파일: __init__.py 프로젝트: ILKKAI/dataETL
    def run(self):
        """Move cleaned ``CommonBidding`` documents from MongoDB into HBase.

        For every entry of ``self.file_list`` the cleaning module of the same
        name is imported, its documents are read from MongoDB, cleaned and
        upserted into Phoenix/HBase one at a time. When at least one row was
        upserted, the processed documents are deleted from the source
        collection and copies of the raw documents are written to the
        ``spider_data_old`` database.

        All three connections (source MongoDB, ``spider_data_old`` MongoDB and
        Phoenix) are released in a ``finally`` clause. Previously the
        ``spider_data_old`` client was never closed, and none of the
        connections were closed when the loop aborted with an error.

        Counters updated on ``self``: ``find_count``, ``remove_count`` and
        ``old_count``. ``remove_id_list`` / ``copy_mongo_data_list`` are reset
        for each entity.
        """
        # Phoenix client and the connection used for the HBase upserts.
        p_client = PhoenixHbase(table_name="CommonBidding")
        connection = p_client.connect_to_phoenix()
        # MongoDB client / collection the raw documents are read from.
        m_client = MongoClient(mongo_collection="CommonBidding")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Second MongoDB client pointed at the "spider_data_old" database,
        # which receives copies of the processed raw documents.
        old_client = MongoClient(mongo_collection="CommonBidding")
        old_client.mongo_db = "spider_data_old"
        db_old, collection_list_old = old_client.client_to_mongodb()
        collection_old = db_old["CommonBidding"]

        # Table maintenance is not performed here. The wrapper exposes
        # p_client.drop_table_phoenix / create_new_table_phoenix /
        # add_column_phoenix for that; the bidding table DDL, for reference:
        # sql = ('create table "CommonBidding" ("ID_" varchar primary key, "F"."CONTENT_" varchar,'
        #        '"F"."NOTICE_TIME_" varchar,"F"."TITLE_" varchar,"F"."PROJECT_NAME_" varchar,'
        #        '"F"."BID_CONTENT_" varchar, "F"."SIGN_START_TIME_" varchar, "F"."SIGN_END_TIME_" varchar,'
        #        '"F"."OPEN_BID_TIME_" varchar, "F"."OPEN_BID_PLACE_" varchar, "F"."BID_AGENCY_" varchar,'
        #        '"F"."APPLY_CONDITION_" varchar, "F"."SIGN_QUALIFICATION_" varchar, "F"."PROJECT_ID_" varchar,'
        #        '"F"."WIN_CANDIDATE_" varchar, "F"."CANDIDATE_RANK_" varchar, "F"."BID_" varchar,"F"."URL_" varchar,'
        #        '"F"."DEALTIME_" varchar, "F"."ENTITY_NAME_" varchar, "F"."ENTITY_CODE_" varchar,'
        #        '"F"."ENTITY_STATUS_" varchar, "F"."SIGN_MATERIAL_" varchar, "F"."BID_TYPE_" varchar,'
        #        '"F"."DATETIME_" varchar, "F"."BUDGET_PRICE_" varchar, "F"."PASS_REASON_" varchar,'
        #        '"F"."PRESALE_CONTENT_" varchar, "F"."PRESALE_WAY_" varchar,"F"."PRESALE_START_TIME_" varchar,'
        #        '"F"."PRESALE_END_TIME_" varchar,"F"."PRESALE_ADDR_" varchar,"F"."PRESALE_PREPARE_" varchar,'
        #        '"F"."IMAGE_" varchar) IMMUTABLE_ROWS = true')

        try:
            # Walk the ENTITY_CODE_ list (one cleaning module per entry).
            for f in self.file_list:
                entity_code = f.replace(".py", "")
                module_name = __import__(entity_code)
                entity_code_mongo = entity_code.replace("CommonBidding_", "")
                self.logger.info(
                    "开始进行 ENTITY_CODE_ {}".format(entity_code_mongo))
                self.remove_id_list = []
                self.copy_mongo_data_list = []
                try:
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code_mongo)
                except pymongo.errors.ServerSelectionTimeoutError:
                    # One retry after a short pause; a second failure
                    # propagates to the caller.
                    time.sleep(1)
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code_mongo)

                if not mongo_data_list:
                    continue

                # Clean the documents and upsert them into HBase.
                once_count = 0
                try:
                    self.find_count += mongo_data_list.count()
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(1)
                    self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    self.remove_id_list.append(data_id)
                    del data["_id"]
                    # Deep copy of the raw document, taken before cleaning,
                    # for the later insert into spider_data_old.
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    # Data cleaning.
                    try:
                        re_data = module_name.data_shuffle(data)
                        final_data = self.shuffle_data(re_data)
                    except Exception as e:
                        # A document that failed cleaning must be neither
                        # deleted from the source nor archived.
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue
                    # NOTE(review): a document whose cleaned result is falsy
                    # stays in remove_id_list / copy_mongo_data_list, so it is
                    # deleted and archived without reaching HBase - confirm
                    # this is intended.
                    if final_data:
                        try:
                            p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=final_data)
                            once_count += 1
                        except Exception as e:
                            self.remove_id_list.remove(data_id)
                            self.copy_mongo_data_list.remove(copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                # Nothing reached HBase: leave the source data untouched.
                if once_count <= 0:
                    self.logger.info("HBase 插入成功条数0条, 不执行删除")
                    continue
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

                # Delete the processed documents from the source collection.
                delete_count = self.delete_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo,
                    remove_id_list=self.remove_id_list)
                self.remove_count += delete_count

                # Archive the raw copies into spider_data_old.
                try:
                    old_client.mongo_db = "spider_data_old"
                    insert_count = old_client.all_to_mongodb(
                        collection=collection_old,
                        insert_list=self.copy_mongo_data_list)
                    self.old_count += insert_count
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    time.sleep(1)
                    self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
                    insert_count = old_client.all_to_mongodb(
                        collection=collection_old,
                        insert_list=self.copy_mongo_data_list)
                    self.old_count += insert_count
                except Exception as e:
                    self.logger.info(e)
        finally:
            # Close every connection, including the archive client.
            m_client.client_close()
            old_client.client_close()
            p_client.close_client_phoenix(connection=connection)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
예제 #11
0
    def run(self):
        """Clean ``ORGANIZE_FINASSIST`` documents and upsert them into HBase.

        For every entity code in ``self.code_list`` the cleaning module of the
        same name is imported and its ``data_shuffle`` is applied to each
        MongoDB document. ``data_shuffle`` may return a single dict or a list
        of dicts; each one is passed through ``self.shuffle_for_area`` and, if
        that yields a truthy record, upserted through Phoenix.

        Changes compared with the earlier version:

        * ``self.find_count`` accumulates over all entities instead of being
          overwritten, so the final summary no longer reports only the last
          entity.
        * The HBase upsert, duplicated in the list and dict branches, lives in
          ``_upsert_area_record``.
        * The progress line is emitted only after a document that actually
          inserted rows, rather than for every document while the running
          total happens to be a multiple of 100.
        * The MongoDB and Phoenix connections are closed in ``finally``.

        Source documents are neither deleted nor archived by this method.
        """
        # Phoenix client and connection.
        p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client / collection the raw documents are read from.
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Table maintenance is not performed here; the branch table DDL is
        # kept for reference:
        # sql = ('create table "ORGANIZE_FINASSIST" ("ID_" varchar primary key, "C"."BANK_NAME_" varchar,'
        #        '"C"."BANK_CODE_" varchar, "C"."NAME_" varchar,'
        #        '"C"."CODE_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."UNIT_CODE_" varchar, "C"."ADDR_" varchar,'
        #        '"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar, "C"."CITY_" varchar,'
        #        '"C"."CITY_CODE_" varchar, "C"."DISTRICT_NAME_" varchar, "C". "DISTRICT_CODE_" varchar,'
        #        '"C"."LAT_" varchar, "C"."LNG_" varchar, "C"."CREATE_TIME_" varchar, "C"."DEALTIME_" varchar,'
        #        '"C"."URL_" varchar, "C"."TEL_" varchar, "C"."BUSINESS_HOURS_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."IMPORTANCE_" varchar) IMMUTABLE_ROWS = true')

        try:
            # Region code tables handed to every cleaning module.
            province_list, city_list, area_list, _dir_area_list = (
                GenericScript(
                    entity_code=None,
                    entity_type="ORGANIZE_FINASSIST").area_from_mysql())

            # Walk the ENTITY_CODE_ list.
            for entity_code in self.code_list:
                module_name = __import__(entity_code)
                self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

                self.remove_id_list = []
                self.copy_mongo_data_list = []
                self.branch_code_list = []
                # Only ECITICORGANIZE is queried with a fixed document id;
                # how get_data_from_mongo uses it is not visible here.
                if entity_code == "ECITICORGANIZE":
                    find_id = "5c3f48479bb3df1d97d762e1"
                else:
                    find_id = None
                try:
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code,
                        data_id=find_id)
                except pymongo.errors.ServerSelectionTimeoutError:
                    # One retry after a short pause.
                    sleep(1)
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code,
                        data_id=find_id)

                if not mongo_data_list:
                    continue

                # Clean the documents and upsert them into HBase.
                once_count = 0
                self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    try:
                        del data["_id"]
                        re_data = module_name.data_shuffle(
                            data, province_list, city_list, area_list)
                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        self.logger.exception("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    print(data_id)

                    inserted = 0  # rows this document contributed to HBase
                    if isinstance(re_data, list):
                        for list_data in re_data:
                            try:
                                area_data = self.shuffle_for_area(list_data)
                            except Exception as e:
                                self.logger.exception(
                                    "_id:{} 获取经纬度失败, {}".format(data_id, e))
                                continue
                            if area_data:
                                inserted += self._upsert_area_record(
                                    p_client, connection, area_data, data_id)
                    elif isinstance(re_data, dict):
                        try:
                            area_data = self.shuffle_for_area(re_data)
                        except Exception as e:
                            self.logger.exception("_id: {}获取经纬度失败, {}".format(
                                data_id, e))
                            continue
                        if area_data:
                            inserted += self._upsert_area_record(
                                p_client, connection, area_data, data_id)

                    once_count += inserted
                    if inserted and self.success_count % 100 == 0:
                        self.logger.info("HBase 插入成功, 成功条数 {} 条".format(
                            self.success_count))

                mongo_data_list.close()

                if once_count > 0:
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
        finally:
            # Close the connections.
            m_client.client_close()
            p_client.close_client_phoenix(connection=connection)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()

    def _upsert_area_record(self, p_client, connection, area_data, data_id):
        """Upsert one cleaned record into HBase and return the rows written.

        Adds the number returned by ``upsert_to_phoenix_by_one`` to
        ``self.success_count``. Any exception is logged with its traceback
        and reported as ``0`` rows so the caller can carry on with the next
        record.

        :param p_client: Phoenix wrapper exposing ``upsert_to_phoenix_by_one``.
        :param connection: open Phoenix connection.
        :param area_data: cleaned record to upsert.
        :param data_id: MongoDB ``_id`` of the source document (for logging).
        :return: rows written, or ``0`` on failure.
        """
        try:
            success_count = p_client.upsert_to_phoenix_by_one(
                connection=connection, data=area_data)
            self.success_count += success_count
        except Exception as e:
            self.logger.exception(
                "HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
            return 0
        return success_count
예제 #12
0
class HexunOpinion(object):
    """Copy ``HEXUNOPINION`` documents from MongoDB into the ``SENTIMENT``
    Phoenix/HBase table, flagging each copied document with ``{"d": 1}``."""

    def __init__(self):
        # MongoDB client and the collection the documents are read from.
        self.m_client = MongoClient(mongo_collection="HEXUNOPINION")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and its connection.
        self.p_client = PhoenixHbase(table_name="SENTIMENT")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run statistics reported at the end of run().
        self.find_count = 0      # documents fetched from MongoDB
        self.success_count = 0   # rows upserted into HBase
        self.remove_count = 0    # documents flagged {"d": 1} in MongoDB
        self.old_count = 0       # never changed in this class; logged as-is
        self.bad_count = 0       # documents rejected by data_shuffle
        self.error_count = 0
        self.data_id = ""        # _id of the document being processed

    def data_shuffle(self, data):
        """Build the HBase row for one MongoDB document.

        The row key ``ID_`` is ``<ENTITY_CODE_>_<md5 hex digest of TITLE_>``.
        ``NOTICE_TIME_`` is truncated to its first 10 characters and
        ``PERIOD_CODE_`` is that prefix with the dashes removed. The
        copyright notice (and everything after the ``|`` that follows it on
        the same line) is stripped from ``CONTENT_``.

        :param data: source document; must contain ``NOTICE_TIME_``,
            ``TITLE_``, ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``CONTENT_``,
            ``DATETIME_``, ``URL_`` and ``DEALTIME_``.
        :return: dict of column values, or ``None`` when ``NOTICE_TIME_``
            contains no ``":"``.
        :raises KeyError: if a required field is missing.
        """
        notice_time = data["NOTICE_TIME_"]
        if ":" not in notice_time:
            return None

        # HBase row key.
        hash_title = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        notice_date = notice_time[:10]

        return {
            "ID_": str(data["ENTITY_CODE_"]) + "_" + str(hash_title),
            "ENTITY_CODE_": data["ENTITY_CODE_"],
            "ENTITY_NAME_": data["ENTITY_NAME_"],
            "PERIOD_CODE_": notice_date.replace("-", ""),
            "CONTENT_": re.sub(
                r"本报告版权归和讯财经传播研究所所有,未经书面授权允许,不得复制转载。\|.*",
                "", data["CONTENT_"]),
            "NOTICE_TIME_": notice_date,
            "STATUS_": "1",
            "CREATE_TIME_": data["DATETIME_"],
            "TITLE_": data["TITLE_"],
            "URL_": data["URL_"],
            "DEALTIME_": data["DEALTIME_"],
        }

    def run(self):
        """Process every document returned by ``all_from_mongodb``.

        Each document is cleaned with ``data_shuffle``, upserted through
        Phoenix and then flagged ``{"d": 1}`` in MongoDB. The first four
        fetched documents that survive cleaning get ``HOME_PAGE_ = "Y"``.

        Fixes compared with the earlier version:

        * A document rejected by ``data_shuffle`` (``None``) among the first
          four is counted in ``bad_count`` instead of tripping over
          ``None["HOME_PAGE_"]`` and being logged as a cleaning failure.
        * ``find_count`` is incremented per fetched document (it used to stay
          0 in the final summary).
        * A ``StopIteration`` raised by the retry after a MongoDB timeout
          ends the loop instead of escaping from ``run``.
        * HBase successes are counted right after the upsert, so a failing
          MongoDB flag update no longer hides them.
        * The cursor is closed even when an unexpected error aborts the loop.
        """
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        try:
            # The "+ 100" head-room over count() bounds the loop; the cursor
            # itself normally ends it via StopIteration.
            for i in range(mongo_data_list.count() + 100):
                try:
                    data = mongo_data_list.__next__()
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    self.logger.info(
                        "MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                    time.sleep(3)
                    try:
                        data = mongo_data_list.__next__()
                    except StopIteration:
                        break

                self.find_count += 1
                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])

                # shuffle data
                try:
                    re_data = self.data_shuffle(data=data)
                except Exception as e:
                    self.logger.info(
                        "数据清洗失败 {}, id: {}".format(e, self.data_id))
                    continue

                if not re_data:
                    self.bad_count += 1
                    continue

                if i < 4:
                    re_data["HOME_PAGE_"] = "Y"

                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                if success_count > 0:
                    self.success_count += success_count

                # add {d:1}
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #13
0
class Meipian(object):
    """Copy ``meipian_CCBDATA`` documents from MongoDB into the
    ``MEIPIAN_CCBDATA`` Phoenix/HBase table, flagging each copied document
    with ``{"d": 1}``."""

    def __init__(self):
        # MongoDB client and the collection the documents are read from.
        self.m_client = MongoClient(mongo_collection="meipian_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and its connection.
        self.p_client = PhoenixHbase(table_name="MEIPIAN_CCBDATA")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run statistics reported at the end of run().
        self.find_count = 0      # documents fetched from MongoDB
        self.success_count = 0   # rows upserted into HBase
        self.remove_count = 0    # documents flagged {"d": 1} in MongoDB
        self.old_count = 0       # never changed in this class; logged as-is
        self.bad_count = 0       # documents rejected by data_shuffle
        self.error_count = 0
        self.data_id = ""        # _id of the document being processed

    def data_shuffle(self, data):
        """Build the HBase row for one MongoDB document.

        The row key ``ID_`` is ``<ENTITY_CODE_>_<md5 hex digest of TITLE_>``.
        ``CREATE_TIME`` is parsed with ``arrow.get`` and rendered as
        ``PERIOD_CODE_`` (``YYYYMMDD``) and ``PUBLISH_TIME_``
        (``YYYY-MM-DD HH:mm:ss``).

        Note that the source fields ``CREATE_TIME``, ``VISIT_COUNT``,
        ``PRAISE_COUNT`` and ``COMMENT_COUNT`` carry no trailing underscore,
        unlike the target columns.

        :param data: source document.
        :return: dict of column values, or ``None`` when ``TITLE_`` is falsy.
        :raises KeyError: if a required field is missing.
        """
        if not data["TITLE_"]:
            return None

        # HBase row key.
        hash_title = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

        time_arrary = arrow.get(data["CREATE_TIME"])
        period_code = time_arrary.format("YYYYMMDD")
        publish_time = time_arrary.format("YYYY-MM-DD HH:mm:ss")

        # Columns of the common "C" family, then the table-specific ones.
        return {
            "ID_": row_key,
            "ENTITY_CODE_": data["ENTITY_CODE_"],
            "PERIOD_CODE_": str(period_code),
            "PUBLISH_TIME_": str(publish_time),
            "STATUS_": "UNPROCESSED",
            "CONTENT_": data["CONTENT_"],
            "REMARK_": "",
            "CREATE_TIME_": data["DATETIME_"],
            "URL_": data["URL_"],
            "TITLE_": data["TITLE_"],
            "CONTENT_TYPE_": data["CONTENT_TYPE_"],
            "ENTITY_NAME_": data["ENTITY_NAME_"],
            "DEALTIME_": str(data["DEALTIME_"]),
            "VISIT_COUNT_": data["VISIT_COUNT"],
            "PRAISE_COUNT_": data["PRAISE_COUNT"],
            "COMMENT_COUNT_": data["COMMENT_COUNT"],
            "SOURCE_": data["SOURCE_"],
        }

    def run(self, f_id="5c6fe11b9bb3df6b0ec6168b"):
        """Process the documents returned by ``all_from_mongodb``.

        Each document is cleaned with ``data_shuffle``, upserted through
        Phoenix and then flagged ``{"d": 1}`` in MongoDB.

        Fixes compared with the earlier version:

        * The document id, previously a hard-coded local, is now a parameter
          whose default is the old value, so existing ``run()`` calls behave
          the same.
        * ``find_count`` is incremented per fetched document (it used to stay
          0 in the final summary).
        * A ``StopIteration`` raised by the retry after a MongoDB timeout
          ends the loop instead of escaping from ``run``.
        * HBase successes are counted right after the upsert, so a failing
          MongoDB flag update no longer hides them.
        * The cursor is closed even when an unexpected error aborts the loop.

        :param f_id: passed as ``data_id`` to ``all_from_mongodb``.
            NOTE(review): presumably the ``_id`` the query starts from or
            filters on - confirm against that helper.
        """
        mongo_data_list = self.m_client.all_from_mongodb(self.collection,
                                                         data_id=f_id)
        try:
            # The "+ 100" head-room over count() bounds the loop; the cursor
            # itself normally ends it via StopIteration.
            for _ in range(mongo_data_list.count() + 100):
                try:
                    data = mongo_data_list.__next__()
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    self.logger.info(
                        "MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                    time.sleep(3)
                    try:
                        data = mongo_data_list.__next__()
                    except StopIteration:
                        break

                self.find_count += 1
                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])

                # shuffle data
                try:
                    re_data = self.data_shuffle(data=data)
                except Exception as e:
                    self.logger.info(
                        "数据清洗失败 {}, id: {}".format(e, self.data_id))
                    continue

                if not re_data:
                    self.bad_count += 1
                    continue

                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                if success_count > 0:
                    self.success_count += success_count

                # add {d:1}
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #14
0
    def run(self):
        """Rebuild the Phoenix table "WEIBOBASICINFO" from MongoDB.

        The method connects to Phoenix and MongoDB, fetches the area code
        lists through ``GenericScript.area_from_mysql()``, drops and recreates
        the Phoenix table, then cleans every MongoDB document with
        ``self.data_shuffle`` and upserts the result one row at a time.
        Totals are logged at the end and the logger's handlers are cleared.

        Every document is first tracked in ``self.remove_id_list`` and
        ``self.copy_mongo_data_list``.  A document whose cleaning or upsert
        raises is logged, removed from both lists again and skipped.  A
        document that cleans to an empty result is counted in
        ``self.bad_count`` and skipped.

        :return: None
        """
        # Create the Phoenix client
        p_client = PhoenixHbase(table_name="WEIBOBASICINFO")
        p_client.verify_list = self.verify_list
        # Connect to Phoenix
        connection = p_client.connect_to_phoenix()
        # Create the MongoDB client used for reading
        m_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # # Create the MongoDB client for the spider_data_old database
        # old_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        # # local testing
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60,
        #                                         connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["ORGANIZE_FINASSIST"]

        # Fetch the area codes (only province_list is used below)
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None, entity_type=None).area_from_mysql())

        # Drop the table
        p_client.drop_table_phoenix(connection=connection)
        # quit()

        # Create the table
        sql = (
            'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,'
            '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,'
            '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,'
            '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,'
            '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,'
            '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
            '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,'
            '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Add a column
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        # ``status`` is only consumed by the disabled delete/archive steps
        # further down; it is kept so they can be re-enabled unchanged.
        status = False
        self.logger.info("开始进行 WEIBOBASICINFO")

        # One retry after a short pause when the server cannot be selected.
        try:
            mongo_data_list = m_client.all_from_mongodb(collection=collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection=collection)

        # Clean the data and insert it into HBase
        if mongo_data_list:
            # NOTE(review): if this is a PyMongo cursor, Cursor.count() no
            # longer exists in PyMongo 4 -- confirm the pinned driver version.
            self.find_count = mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                self.remove_id_list.append(data_id)
                # Stays None until the copy has really been appended, so the
                # error path never tries to remove something never tracked.
                copy_data = None
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = self.data_shuffle(data=data,
                                                province_list=province_list)
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    if copy_data is not None:
                        self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    # There is nothing to insert.  Falling through would hit
                    # the upsert handler below, whose second
                    # remove_id_list.remove(data_id) raises ValueError and
                    # aborts the whole run.
                    continue

                if not re_data:
                    self.bad_count += 1
                    continue

                # Insert the cleaned row through Phoenix
                try:
                    success_count = p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=re_data)
                    self.success_count += success_count
                    # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                except Exception as e:
                    self.remove_id_list.remove(data_id)
                    self.copy_mongo_data_list.remove(copy_data)
                    self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(
                        data_id, e))
                    continue
            if self.success_count > 0:
                status = True
                self.logger.info("HBase 插入成功, 成功条数 {}".format(
                    self.success_count))
        else:
            quit()

        # # Delete the data
        # if status:
        #     delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
        #                                                remove_id_list=self.remove_id_list)
        #     self.remove_count += delete_count
        # else:
        #     self.logger.info("HBase 插入成功条数0条, 不执行删除")
        #
        # # Insert the data into spider_data_old
        # if status:
        #     try:
        #         old_client.mongo_db = "spider_data_old"
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #     except pymongo.errors.ServerSelectionTimeoutError as e:
        #         time.sleep(1)
        #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
        #         insert_count = old_client.all_to_mongodb(collection=collection_old,
        #                                                  insert_list=self.copy_mongo_data_list)
        #         self.old_count += insert_count
        #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
        #     except Exception as e:
        #         self.logger.info(e)

        # Close the connections
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #15
0
class JsInsuranceCcbData(object):
    def __init__(self):
        """Open the MongoDB and Phoenix connections and load the type dictionary.

        ``self.type`` receives the result of the ``sys_dict_item`` query for
        ``DICT_CODE_='TYPE'``; ``data_shuffle`` iterates it and reads the
        ``ITEM_LABEL_`` and ``ITEM_VALUE_`` entries of each item.  The MySQL
        connection is only needed for that single query and is closed again
        before the constructor returns, even when the query raises.
        """
        # Create the MongoDB client and resolve the source collection
        self.m_client = MongoClient(mongo_collection="JSINSURANCE_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Create the MySQL client.  Plain local names are used on purpose:
        # double-underscore names inside a class body are name-mangled, which
        # gains nothing for function locals.
        mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }

        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()
        try:
            self.type = mysql_client.search_area_code(
                sql=
                "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'TYPE\'",
                connection=mysql_connection)
        finally:
            # Release the connection even if the dictionary query fails.
            mysql_client.close_client(connection=mysql_connection)

        # Create the Phoenix client
        self.p_client = PhoenixHbase(table_name="INSURANCE")
        # Connect to Phoenix
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run counters reported by run()
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the document currently being processed
        self.data_id = ""
        self.a = list()

    def data_shuffle(self, data):
        if data["ENTITY_CODE_"] == "PAINSURANCE":
            return None
        elif data["ENTITY_CODE_"] == "BJBINSURANCE":
            data["CONTET_"] = data["CONTET_"].replace("|主险2:", "主险2:")
            first_shuffle = data["CONTET_"].split("|")
            data_list = list()
            company_dict = dict()
            index_list = list()
            for first in first_shuffle:
                if first[-2:] == "公司":
                    company_index = first_shuffle.index(first)
                    company_dict[company_index] = first
                    index_list.append(company_index)
                else:
                    continue

            for key in index_list:
                # print(index_list)
                j = key + 1
                for i in range(100):
                    if index_list.index(key) == len(index_list) - 1:
                        if j == len(first_shuffle) - 1:
                            break
                    else:
                        if j == index_list[index_list.index(key) + 1]:
                            break

                    data_dict = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(first_shuffle[j].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    data_dict["ID_"] = row_key
                    data_dict["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    data_dict["ENTITY_NAME_"] = data["ENTITY_NAME_"].replace(
                        "模板", "产品")
                    data_dict["BANK_CODE_"] = "BJB"
                    data_dict["BANK_NAME_"] = "北京银行"
                    data_dict["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    data_dict["URL_"] = data["URL_"]
                    data_dict["PRODUCT_NAME_"] = first_shuffle[j]
                    j += 1
                    # data_dict["TYPE_"] = first_shuffle[j]
                    data_dict["TYPE_"] = ""
                    data_dict["TYPE_CODE_"] = ""
                    for i in self.type:
                        if i["ITEM_LABEL_"][:-1] in first_shuffle[j]:
                            data_dict["TYPE_"] = data_dict["TYPE_"] + i[
                                "ITEM_LABEL_"] + "|"
                            data_dict["TYPE_CODE_"] = data_dict[
                                "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                    data_dict["TYPE_"] = data_dict["TYPE_"][:-1]
                    data_dict["TYPE_CODE_"] = data_dict["TYPE_CODE_"][:-1]
                    j += 1
                    # data_dict["RISK_LEVEL_"] = first_shuffle[j]
                    j += 1
                    data_dict["PAY_METHOD_"] = first_shuffle[j]
                    j += 1
                    # data_dict["INSURANCE_DATE_"] = first_shuffle[j]
                    j += 1
                    # data_dict["TOUZIZHE_TYPE_"] = first_shuffle[j]
                    j += 1
                    data_dict["COM_NAME_"] = company_dict[key]
                    # data_dict["CONSIGNMENT_"] = "代销"
                    # if "CONTENT_" in data:
                    #     data_dict["CONTENT_"] = data["CONTENT_"]
                    data_dict["DEALTIME_"] = data["DEALTIME_"]
                    data_dict["CREATE_TIME_"] = data["DATETIME_"]
                    data_dict["STATUS_"] = "1"
                    # print(data_dict)
                    data_list.append(data_dict)

            return data_list

        elif data["ENTITY_CODE_"] == "CIBINSURANCE":
            data_list = list()
            insurance_name = re.findall(r".*?计划", data["PRODUCT_NAME_"])
            for name in insurance_name:
                re_data = dict()
                # HBase row_key
                hash_m = hashlib.md5()
                hash_m.update(name.encode("utf-8"))
                hash_title = hash_m.hexdigest()
                row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                # "C"
                re_data["ID_"] = row_key
                re_data["PRODUCT_NAME_"] = name
                re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                re_data["BANK_CODE_"] = "CIB"
                re_data["BANK_NAME_"] = "兴业银行"
                re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                    "-", "")
                re_data["URL_"] = data["URL_"]
                re_data["DEALTIME_"] = data["DEALTIME_"]
                re_data["CREATE_TIME_"] = data["DATETIME_"]
                re_data["STATUS_"] = "1"

                re_data["TYPE_"] = ""
                re_data["TYPE_CODE_"] = ""
                for i in self.type:
                    if i["ITEM_LABEL_"][:-1] in name:
                        re_data["TYPE_"] = re_data["TYPE_"] + i[
                            "ITEM_LABEL_"] + "|"
                        re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"] + i[
                            "ITEM_VALUE_"] + "|"
                re_data["TYPE_"] = re_data["TYPE_"][:-1]
                re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                data_list.append(re_data)

            return data_list

        else:
            if "INSURANCE_NAME_" not in data and ("PRODUCT_NAME_" not in data):
                return None
            else:
                if "INSURANCE_NAME_" in data:
                    # # 承保年龄
                    # if ("INSURANCE_AGE_" not in data) or (not data["INSURANCE_AGE_"]):
                    #     age = re.findall(r"(\d*)周岁", data["INSURANCE_NAME_"])
                    #     if age:
                    #         data["INSURANCE_AGE_"] = age[0]

                    # 保障期限
                    # if ("INSURANCE_DATE_" not in data) or (not data["INSURANCE_DATE_"]):
                    #     limit = re.findall(r"保(终身)|保(\d*年)|(\d*年)期", data["INSURANCE_NAME_"])
                    #     if limit:
                    #         for l in limit[0]:
                    #             if l:
                    #                 data["INSURANCE_DATE_"] = l
                    #                 break

                    re_data = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(data["INSURANCE_NAME_"].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    re_data["ID_"] = row_key
                    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                    re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
                        "INSURANCE", "")
                    re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace(
                        "保险产品", "")
                    if "INSURANCE_NAME_" in data:
                        re_data["PRODUCT_NAME_"] = data["INSURANCE_NAME_"]
                    if ("INSURANCE_AGE_" in data) or ("AGE_" in data):
                        re_data["AGE_"] = data["INSURANCE_AGE_"]
                    if "TYPE_" in data:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        if data["TYPE_"] == "财险":
                            re_data["TYPE_"] = "财产险"
                            re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                        else:
                            for i in self.type:
                                if i["ITEM_LABEL_"][:-1] in data["TYPE_"]:
                                    re_data["TYPE_"] = re_data["TYPE_"] + i[
                                        "ITEM_LABEL_"] + "|"
                                    re_data["TYPE_CODE_"] = re_data[
                                        "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                            re_data["TYPE_"] = re_data["TYPE_"][:-1]
                            re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    else:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        for i in self.type:
                            if i["ITEM_LABEL_"][:-1] in data["ENTITY_NAME_"]:
                                re_data["TYPE_"] = re_data["TYPE_"] + i[
                                    "ITEM_LABEL_"] + "|"
                                re_data["TYPE_CODE_"] = re_data[
                                    "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                        re_data["TYPE_"] = re_data["TYPE_"][:-1]
                        re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    # if "INSURANCE_DATE_" in data:
                    #     re_data["INSURANCE_DATE_"] = data["INSURANCE_DATE_"]
                    if "INSURANCE_DETAIL_" in data:
                        re_data["PRODUCT_DETAIL_"] = data["INSURANCE_DETAIL_"]
                    if "COMPANY_NAME_" in data:
                        re_data["COM_NAME_"] = data["COMPANY_NAME_"]
                    if "LIMIT_NUMBER_" in data:
                        re_data["BUY_LIMIT_"] = data["LIMIT_NUMBER_"]
                    # re_data["AREA_CODE_"]
                    # re_data["UNIT_CODE_"]
                    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    if "CONTENT_" in data:
                        re_data["CONTENT_"] = data["CONTENT_"]
                    # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
                    re_data["STATUS_"] = "1"
                    # re_data["REMARK_"] = ""
                    re_data["CREATE_TIME_"] = data["DATETIME_"]
                    # re_data["UPDATE_TIME_"]
                    # re_data["TITLE_"] = data["TITLE_"]
                    re_data["URL_"] = data["URL_"]
                    re_data["DEALTIME_"] = data["DEALTIME_"]
                    # re_data["DATETIME_"] = data["DATETIME_"]

                    return re_data

                elif "PRODUCT_NAME_" in data:
                    re_data = dict()
                    # HBase row_key
                    hash_m = hashlib.md5()
                    hash_m.update(data["PRODUCT_NAME_"].encode("utf-8"))
                    hash_title = hash_m.hexdigest()
                    row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)

                    # "C"
                    re_data["ID_"] = row_key
                    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                    # re_data["BANK_CODE_"] = data["ENTITY_CODE_"]
                    # re_data["BANK_NAME_"] = data["ENTITY_NAME_"]
                    if "PRODUCT_NAME_" in data:
                        re_data["PRODUCT_NAME_"] = data["PRODUCT_NAME_"]
                    if "FEATURE_NAME_" in data:
                        re_data["FEATURE_NAME_"] = data["FEATURE_NAME_"]
                    if "TYPE_" in data:
                        re_data["TYPE_"] = ""
                        re_data["TYPE_CODE_"] = ""
                        if data["TYPE_"] == "财险":
                            re_data["TYPE_"] = "财产险"
                            re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                        elif data["TYPE_"] == "100种疾病保障":
                            re_data["TYPE_"] = "健康险"
                            re_data["TYPE_CODE_"] = "HEALTH_INSURANCE"
                        else:
                            for i in self.type:
                                if i["ITEM_LABEL_"][:-1] in data["TYPE_"]:
                                    re_data["TYPE_"] = re_data["TYPE_"] + i[
                                        "ITEM_LABEL_"] + "|"
                                    re_data["TYPE_CODE_"] = re_data[
                                        "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|"
                            re_data["TYPE_"] = re_data["TYPE_"][:-1]
                            re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1]
                    if "POLICY_DUTY_" in data:
                        re_data["POLICY_DUTY_"] = data["POLICY_DUTY_"]
                    if "PRODUCT_CASE_" in data:
                        re_data["PRODUCT_CASE_"] = data["PRODUCT_CASE_"]
                    if "BUY_LIMIT_" in data:
                        re_data["BUY_LIMIT_"] = data["BUY_LIMIT_"]
                    if "ENSURE_PRICE_" in data:
                        re_data["ENSURE_PRICE_"] = data["ENSURE_PRICE_"]
                    # re_data["AREA_CODE_"]
                    # re_data["UNIT_CODE_"]
                    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    if "PRODUCT_PRICE_" in data:
                        re_data["PRODUCT_PRICE_"] = data["PRODUCT_PRICE_"]
                    if "PRODUCT_ID_" in data:
                        re_data["PRODUCT_ID_"] = data["PRODUCT_ID_"]
                    if "PRODUCT_CLAUSE_" in data:
                        re_data["PRODUCT_CLAUSE_"] = data["PRODUCT_CLAUSE_"]
                    if "GENDER_" in data:
                        re_data["GENDER_"] = data["GENDER_"]
                    if "AGE_" in data:
                        re_data["AGE_"] = data["AGE_"]
                    if "COM_NAME_" in data:
                        re_data["COM_NAME_"] = data["COM_NAME_"]
                    if "PAY_METHOD_" in data:
                        re_data["PAY_METHOD_"] = data["PAY_METHOD_"]
                    if "PROBLEM_" in data:
                        re_data["PROBLEM_"] = data["PROBLEM_"]
                    if "CLAIM_" in data:
                        re_data["CLAIM_"] = data["CLAIM_"]
                    if "COMMENT_" in data:
                        re_data["COMMENT_"] = data["COMMENT_"]
                    if "ENSURE_CONTENT_" in data:
                        re_data["ENSURE_CONTENT_"] = data["ENSURE_CONTENT_"]
                    if "INSURE_INFO_" in data:
                        re_data["INSURE_INFO_"] = data["INSURE_INFO_"]
                    if "RATE_INFO_" in data:
                        re_data["RATE_INFO_"] = data["RATE_INFO_"]
                    if "SALE_SERVICE_" in data:
                        re_data["SALE_SERVICE_"] = data["SALE_SERVICE_"]

                    # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
                    re_data["STATUS_"] = "1"
                    # re_data["REMARK_"] = ""
                    re_data["CREATE_TIME_"] = data["DATETIME_"]
                    # re_data["UPDATE_TIME_"]
                    # re_data["TITLE_"] = data["TITLE_"]
                    re_data["URL_"] = data["URL_"]
                    re_data["DEALTIME_"] = data["DEALTIME_"]
                    # re_data["DATETIME_"] = data["DATETIME_"]

                    return re_data

    def run(self):
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "INSURANCE" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,'
        #              ' "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, '
        #              '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."TYPE_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C".PRODUCT_CLAUSE_ varchar,'
        #              '"C"."SOURCE_" varchar, "C"."PRODUCT_NAME_" varchar, "C"."FEATURE_NAME_" varchar,'
        #              '"C"."POLICY_DUTY_" varchar, "C"."PRODUCT_CASE_" varchar, "C"."BUY_LIMIT_" varchar,'
        #              '"C"."ENSURE_PRICE_" varchar, "C"."PRODUCT_PRICE_" varchar, "C"."PRODUCT_ID_" varchar,'
        #              '"C"."GENDER_" varchar, "C"."AGE_" varchar, "C"."COM_NAME_" varchar, "C"."TYPE_CODE_" varchar,'
        #              '"C"."PAY_METHOD_" varchar, "C"."PRODUCT_DETAIL_" varchar, "C"."PROBLEM_" varchar,'
        #              '"C"."CLAIM_" varchar, "C"."COMMENT_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."ENSURE_CONTENT_" varchar, "C"."INSURE_INFO_" varchar, "C"."RATE_INFO_" varchar,'
        #              '"C"."SALE_SERVICE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            # print(data["_id"])
            # todo remove and upsert data from mongo

            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            if re_data:
                if isinstance(re_data, dict):
                    # upsert data to HBase
                    try:
                        success_count = self.p_client.upsert_to_phoenix_by_one(
                            connection=self.connection, data=re_data)
                    except jaydebeapi.DatabaseError as e:
                        self.logger.info("错误 id: {}, 错误信息 {}".format(
                            self.data_id, e))
                        continue

                elif isinstance(re_data, list):
                    for r_d in re_data:
                        # upsert data to HBase
                        try:
                            success_count = self.p_client.upsert_to_phoenix_by_one(
                                connection=self.connection, data=r_d)
                        except jaydebeapi.DatabaseError as e:
                            self.logger.info("错误 id: {}, 错误信息 {}".format(
                                self.data_id, e))
                            continue

            #     # add {d:1}
            #     try:
            #         self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id,
            #                                         data_dict={"d": 1})
            #         self.remove_count += 1
            #         if self.remove_count % 10 == 0:
            #             self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count))
            #     except Exception as e:
            #         self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e))
            #         continue
                if success_count > 0:
                    status = True
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #16
0
class Entrust(object):
    """Copy trust-product documents from MongoDB into Phoenix/HBase.

    Documents are read from the ``JSENTRUST_CCBDATA`` MongoDB collection,
    reshaped by :meth:`data_shuffle` and upserted one by one through a
    ``PhoenixHbase`` client created with ``table_name="ENTRUST"``.
    """

    def __init__(self):
        # MongoDB client and the collection the documents are read from
        self.m_client = MongoClient(mongo_collection="JSENTRUST_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and its connection
        self.p_client = PhoenixHbase(table_name="ENTRUST")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Counters reported in the summary at the end of run()
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the document currently being processed (used in log lines)
        self.data_id = ""

    @staticmethod
    def _epoch_millis_to_date(raw):
        """Turn a serialized timestamp mapping into a ``YYYY-MM-DD`` string.

        :param raw: text of a dict literal that stores an epoch value under
            the ``"time"`` key.
        :return: the first ten characters of the arrow timestamp string.
        :raises ValueError, SyntaxError: if *raw* is not a plain literal.
        """
        # NOTE(security): this value comes from scraped documents and used to
        # be passed to eval(), which executes arbitrary expressions.
        # ast.literal_eval accepts only literals, which is all a timestamp
        # mapping needs.
        import ast

        stamp = ast.literal_eval(raw)
        # The last three digits are stripped before conversion - presumably
        # the value is a millisecond epoch; verify against the source data.
        seconds = int(str(stamp["time"])[:-3])
        return str(arrow.get(seconds))[:10]

    def data_shuffle(self, data):
        """Build the HBase row for one MongoDB document.

        The row key is ``<ENTITY_CODE_>_<md5 of NAME_>``. A fixed set of
        common columns is always filled; the remaining columns depend on
        ``ENTITY_CODE_`` (``CHINATRC``, ``TRUSTHEXUN``, ``YANGLEE`` or
        ``TRUSTONE``). Any other entity code yields only the common columns.

        :param data: source document (mapping of field name to value).
        :return: dict of column name to value.
        :raises KeyError: if a field required for the entity is missing.
        """
        name_digest = hashlib.md5(data["NAME_"].encode("utf-8")).hexdigest()
        entity_code = data["ENTITY_CODE_"]

        re_data = {
            "ID_": str(entity_code) + "_" + str(name_digest),
            "ENTITY_CODE_": entity_code,
            "ENTITY_NAME_": data["ENTITY_NAME_"],
            "CREATE_TIME_": data["DATETIME_"],
            "STATUS_": "1",
            "DEALTIME_": data["DEALTIME_"],
            "URL_": data["URL_"],
        }

        if entity_code == "CHINATRC":
            # Dates arrive as serialized timestamp mappings for this source.
            publish_date = self._epoch_millis_to_date(data["PUB_DATE_"])
            re_data["PERIOD_CODE_"] = publish_date.replace("-", "")
            for key in ("CODE_", "NAME_", "ISSUER_", "FUNCTION_"):
                re_data[key] = data[key]
            re_data["PRO_START_"] = self._epoch_millis_to_date(
                data["PRO_START_"])
            for key in ("INVEST_PERIOD_", "RUN_MODE_", "INDUSTRY_"):
                re_data[key] = data[key]
            re_data["PUB_DATE_"] = publish_date

        elif entity_code == "TRUSTHEXUN":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace(
                "至月", "")
            for key in ("INDUSTRY_", "PUB_DATE_", "SCALE_", "YIELD_RATE_",
                        "START_FUNDS_", "INVEST_MODE_", "CURRENCY_",
                        "MANAGE_TYPE_", "SALE_TARGET_", "PROFIT_TYPE_",
                        "ISSUER_AREA_", "RESERVE_INFO_", "OTHER_INFO_"):
                re_data[key] = data[key]

        elif entity_code == "YANGLEE":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            for key in ("NAME_", "ISSUER_", "INVEST_PERIOD_", "INDUSTRY_",
                        "PUB_DATE_", "YIELD_RATE_", "START_FUNDS_"):
                re_data[key] = data[key]
            # The source STATUS_ is stored separately because the STATUS_
            # column itself is always "1".
            re_data["ENTRUST_STATUS_"] = data["STATUS_"]
            for key in ("DISTRIBU_MODE_", "TERM_TYPE_", "ISSUER_AREA_",
                        "TRUSTEESHIP_BANK_", "OTHER_INFO_"):
                re_data[key] = data[key]

        elif entity_code == "TRUSTONE":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            for key in ("NAME_", "ISSUER_", "PUB_DATE_", "SCALE_",
                        "YIELD_RATE_", "START_FUNDS_", "DISTRIBU_MODE_",
                        "INVEST_AREA_", "TERM_TYPE_", "INVEST_DIRECTION_",
                        "INVEST_MODE_", "PROFIT_TYPE_"):
                re_data[key] = data[key]
            # Strip simple HTML tags such as <p> or </span>.
            re_data["RESERVE_INFO_"] = re.sub(r"</?\w*>", "",
                                              data["RESERVE_INFO_"])

        return re_data

    def _next_document(self, cursor):
        """Fetch the next document from *cursor*.

        A server-selection timeout is logged and retried once after a short
        pause.

        :return: the next document, or ``None`` when the cursor is exhausted
            (including when it runs out during the retry).
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
            return next(cursor, None)

    def run(self):
        """Read every document from MongoDB and upsert it into HBase.

        Documents whose upsert raises ``jaydebeapi.DatabaseError`` are logged
        and skipped. The cursor is always closed, and a summary of the
        counters is logged at the end.
        """
        # NOTE(review): the hard-coded data_id looks like a resume point for
        # all_from_mongodb - confirm against that helper.
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection, data_id="5c67307d9bb3df76b4229f79")

        try:
            while True:
                data = self._next_document(mongo_data_list)
                if data is None:
                    break
                self.find_count += 1

                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])

                # Clean the document; errors here are not caught.
                re_data = self.data_shuffle(data=data)
                if not re_data:
                    self.bad_count += 1
                    continue

                # Upsert the cleaned row into HBase.
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # NOTE: flagging the source document as processed
                # ({"d": 1}) is not performed by this job.

                if success_count > 0:
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
        finally:
            # Release the cursor even if processing aborted with an error.
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #17
0
파일: trend.py 프로젝트: ILKKAI/dataETL
class Trend(object):
    """Copy private-banking news documents from MongoDB into Phoenix/HBase.

    Documents are read from the ``TREND`` MongoDB collection, reshaped by
    :meth:`data_shuffle` and upserted through a ``PhoenixHbase`` client
    created with ``table_name="CHA_BRANCH_MARKET_ACT"``. Each document that
    was upserted is then flagged in MongoDB with ``{"d": 1}``.
    """

    def __init__(self):
        # MongoDB client and the collection the documents are read from
        self.m_client = MongoClient(mongo_collection="TREND")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and its connection
        self.p_client = PhoenixHbase(table_name="CHA_BRANCH_MARKET_ACT")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Counters reported in the summary at the end of run()
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the document currently being processed (used in log lines)
        self.data_id = ""

    def data_shuffle(self, data):
        """Build the HBase row for one MongoDB document.

        The row key is ``<ENTITY_CODE_>_<md5 of TITLE_>``. ``CREATE_TIME_``
        is the local time at which the row is built.

        :param data: source document (mapping of field name to value).
        :return: dict of column name to value.
        :raises KeyError: if a required field is missing.
        :raises IndexError: if ``URL_`` contains no ``http(s)://`` prefix.
        """
        title_digest = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        entity_code = data["ENTITY_CODE_"]
        entity_name = data["ENTITY_NAME_"]
        notice_time = data["NOTICE_TIME_"]

        copy_result = dict()
        copy_result["ID_"] = str(entity_code) + "_" + str(title_digest)
        copy_result["ENTITY_CODE_"] = entity_code
        copy_result["ENTITY_NAME_"] = entity_name
        copy_result["URL_"] = data["URL_"]
        copy_result["APP_VERSION_"] = "BRANCH"
        # Bank code/name are the entity code/name without their suffix.
        copy_result["BANK_CODE_"] = entity_code.replace("PRIVATEINFO", "")
        copy_result["BANK_NAME_"] = entity_name.replace("私行动态", "")
        copy_result["PERIOD_CODE_"] = notice_time.replace("-", "")
        copy_result["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                    time.localtime())
        copy_result["SPIDER_TIME_"] = data["DATETIME_"]
        copy_result["CREATE_BY_ID_"] = "P0131857"
        copy_result["CREATE_BY_NAME_"] = "钟楷文"
        copy_result["M_STATUS_"] = "0"
        copy_result["DELETE_STATUS_"] = "0"
        copy_result["DATA_STATUS_"] = "uncheck"
        # Scheme + host of the article URL. Matching up to (not including)
        # the first "/" also covers URLs without any path, which the former
        # pattern "(https?://.*?)/" failed on.
        copy_result["SOURCE_"] = re.findall(r"https?://[^/]*",
                                            data["URL_"])[0]
        copy_result["SOURCE_NAME_"] = entity_name
        copy_result["ACT_NAME_"] = data["TITLE_"]
        copy_result["DETAILS_"] = data["CONTENT_"]
        copy_result["PUBLISH_TIME_"] = notice_time
        return copy_result

    def _next_document(self, cursor):
        """Fetch the next document from *cursor*.

        A server-selection timeout is logged and retried once after a short
        pause.

        :return: the next document, or ``None`` when the cursor is exhausted
            (including when it runs out during the retry).
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
            return next(cursor, None)

    def run(self, limit=100):
        """Upsert documents into HBase and flag them in MongoDB.

        :param limit: maximum number of documents fetched in this run;
            ``None`` processes the whole cursor. Defaults to 100, the cap
            that used to be hard-coded.

        Documents that fail cleaning, the HBase upsert or the MongoDB flag
        update are logged and skipped. The cursor is always closed, and a
        summary of the counters is logged at the end.
        """
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        fetched = 0
        try:
            while limit is None or fetched < limit:
                data = self._next_document(mongo_data_list)
                if data is None:
                    break
                fetched += 1
                self.find_count += 1

                self.data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])

                # Clean the document.
                try:
                    re_data = self.data_shuffle(data=data)
                except Exception as e:
                    self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                    continue

                if not re_data:
                    self.bad_count += 1
                    continue

                # Upsert the cleaned row into HBase.
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue

                # Count the upsert right away so that a failure of the
                # MongoDB flag update below cannot hide a row that already
                # reached HBase.
                if success_count > 0:
                    self.success_count += success_count

                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))

                # Flag the source document as processed: {"d": 1}.
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
        finally:
            # Release the cursor even if processing aborted with an error.
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #18
0
파일: __init__.py 프로젝트: ILKKAI/dataETL
    def run(self):
        """Clean ``NEWS_FINASSIST`` documents per entity code and store them.

        For every entry of ``self.code_list`` a module of the same name is
        imported and its ``data_shuffle`` function is applied to each MongoDB
        document (wrapped in a one-element list). The cleaned result is
        upserted into HBase through Phoenix:

        * a ``list`` result is processed item by item; unless the entity is
          ``CNINFONEWS`` each item first goes through
          ``self.get_brief_from_ai``, and after a successful upsert the
          source document is flagged in MongoDB with ``{"d": 1}``;
        * a ``dict`` result is upserted as is (no MongoDB flag update).

        Failures on a single document are logged and the document is
        skipped. Both connections are closed and a summary of the counters
        is logged at the end.
        """
        # Phoenix client and connection
        p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client and the collection the documents are read from
        m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        for entity_code in self.code_list:
            # The cleaning module is named after the entity code.
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            # NOTE(review): hard-coded id for CAIJINGNEWS, presumably a
            # resume point for get_data_from_mongo - confirm.
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""

            # Query MongoDB, retrying once after a server-selection timeout.
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)

            if not mongo_data_list:
                continue

            once_count = 0  # rows upserted for this entity code
            try:
                # Bounded loop: a timeout simply moves on to the next
                # attempt instead of aborting the entity.
                for _ in range(1000000):
                    try:
                        data = next(mongo_data_list)
                    except pymongo.errors.ServerSelectionTimeoutError:
                        continue
                    except StopIteration:
                        break

                    # Accumulate over all entity codes so the final summary
                    # reports the total for the whole run.
                    self.find_count += 1

                    data_id = data["_id"]
                    if self.success_count % 100 == 0:
                        self.logger.info(
                            "running on data_id: {}".format(data_id))

                    # Clean the document.
                    try:
                        del data["_id"]
                        re_data = module_name.data_shuffle([data])
                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            if not list_data:
                                continue

                            # Enrich the item via the AI service; on failure
                            # fall back to the un-enriched item itself.
                            try:
                                if entity_code != "CNINFONEWS":
                                    ai_data = self.get_brief_from_ai(
                                        data=list_data)
                                else:
                                    ai_data = list_data
                            except Exception as e:
                                self.logger.info(
                                    "AI 调取失败, 错误信息 {}".format(e))
                                ai_data = list_data

                            # Upsert into HBase.
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=ai_data)
                                once_count += success_count
                                self.success_count += success_count
                                if self.success_count % 10 == 0:
                                    self.logger.info(
                                        "HBase 插入成功, 成功条数{}条".format(
                                            once_count))
                            except Exception as e:
                                self.logger.warning(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue

                            # Flag the source document as processed: {"d": 1}.
                            try:
                                update_count = m_client.update_to_mongodb(
                                    collection=collection,
                                    data_id=data_id,
                                    data_dict={"d": 1})
                                self.remove_count += update_count
                                if self.remove_count % 10 == 0:
                                    self.logger.info(
                                        "MongoDB 更新成功, 成功条数 {} 条".format(
                                            "10"))
                            except Exception as e:
                                self.logger.warning(
                                    "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue

                    elif isinstance(re_data, dict):
                        # re_data is known to be non-empty at this point.
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=re_data)
                            once_count += success_count
                            self.success_count += success_count
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    success_count))
                        except Exception as e:
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                if once_count > 0:
                    self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                        entity_code, once_count))
            finally:
                # Release the cursor even if processing aborted with an error.
                mongo_data_list.close()

        # Close the connections and report the totals.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
예제 #19
0
class BaiduSearch(object):
    """Transfer documents from the MongoDB ``BAIDU_SEARCH`` collection to the
    Phoenix/HBase ``BAIDU_SEARCH`` table.

    ``run`` walks the collection, reshapes every document with
    ``data_shuffle``, upserts the resulting row through the Phoenix client and
    then asks the MongoDB client to update the source document with
    ``{"d": 1}``.  Per-run counters are kept on the instance and logged when
    the run finishes.
    """

    def __init__(self):
        # MongoDB client and the collection to read from
        self.m_client = MongoClient(mongo_collection="BAIDU_SEARCH")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Phoenix client and its connection
        self.p_client = PhoenixHbase(table_name="BAIDU_SEARCH")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger

        # Run counters (reported by ``run``)
        self.find_count = 0      # documents fetched from MongoDB
        self.success_count = 0   # rows upserted into HBase
        self.remove_count = 0    # MongoDB documents updated with {"d": 1}
        self.old_count = 0       # not modified by this class, only reported
        self.bad_count = 0       # documents that could not be reshaped
        self.error_count = 0     # not used by this class
        self.data_id = ""        # _id of the document currently processed

        # BANK_CODE_ -> BANK_NAME_ lookup.  Bank of Communications was
        # changed from BOCOM to COMM and China CITIC Bank from ECITIC to
        # CITIC; Ping An Bank, Bank of Beijing and Bank of Shanghai were added.
        # Codes are matched as substrings in insertion order, so a longer
        # code must stay ahead of any code it contains (CMBC before CMB).
        self.name_dict = {
            "ICBC": "中国工商银行",
            "ABC": "中国农业银行",
            "BOC": "中国银行",
            "CCB": "中国建设银行",
            "COMM": "交通银行",
            "PSBC": "中国邮政储蓄银行",
            "CZB": "浙商银行",
            "CBHB": "渤海银行",
            "CITIC": "中信银行",
            "CEB": "中国光大银行",
            "HXB": "华夏银行",
            "CMBC": "中国民生银行",
            "CMB": "招商银行",
            "CIB": "兴业银行",
            "CGB": "广发银行",
            "PAB": "平安银行",
            "SPDB": "浦发银行",
            "EBCL": "恒丰银行",
            "PINGAN": "平安银行",
            "LTD": "中国光大银行",
            "BEIJING": "北京银行",
            "BOSC": "上海银行"
        }

        # Known TYPE_ values, matched as substrings of ENTITY_CODE_
        self.type_list = [
            "Market", "Activity", "GoodStart", "MidSeason", "PrivateBank",
            "Recommendation"
        ]

    def data_shuffle(self, data):
        """Build the HBase row for one MongoDB document.

        The row key (``ID_``) is ``<ENTITY_CODE_>_<md5 hex of TITLE_>``.
        ``BANK_CODE_``/``BANK_NAME_`` and ``TYPE_`` are set from the first
        entry of ``self.name_dict`` / ``self.type_list`` that occurs as a
        substring of ``ENTITY_CODE_``; they are left out when nothing
        matches (an unmatched bank code is logged as a warning).

        :param data: source document; must contain ``TITLE_``,
            ``ENTITY_CODE_``, ``CONTENT_``, ``DATETIME_``, ``URL_``,
            ``DEALTIME_`` and ``ENTITY_NAME_``
        :return: dict of column name -> value for the HBase row
        :raises KeyError: if a required field is missing
        """
        entity_code = data["ENTITY_CODE_"]
        title_hash = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()

        re_data = dict()
        # Column family "C"
        re_data["ID_"] = str(entity_code) + "_" + str(title_hash)
        re_data["ENTITY_CODE_"] = entity_code

        for bank_code in self.name_dict:
            if bank_code in entity_code:
                re_data["BANK_CODE_"] = bank_code
                re_data["BANK_NAME_"] = self.name_dict[bank_code]
                break
        else:
            self.logger.warning(
                "未匹配到银行编码, ENTITY_CODE_: {}".format(entity_code))

        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["STATUS_"] = "UNPROCESSED"
        # CREATE_TIME_ is taken from the document's DATETIME_ field
        re_data["CREATE_TIME_"] = data["DATETIME_"]

        for type_name in self.type_list:
            if type_name in entity_code:
                re_data["TYPE_"] = type_name
                break
        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]

        return re_data

    def run(self):
        """Process every document of the collection and log a summary.

        The cursor is iterated until it is exhausted (it is no longer bounded
        by ``Cursor.count()``, which newer PyMongo releases removed) and is
        always closed, even if an unexpected exception escapes.  After a
        ``ServerSelectionTimeoutError`` the fetch is retried once after a
        3 second pause; an exhausted cursor on that retry ends the run
        normally instead of raising ``StopIteration``.
        """
        # Table definition kept for reference (drop / create by hand):
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # table_sql = ('create table "BAIDU_SEARCH" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,"C"."REMARK_" varchar,'
        #              ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, '
        #              '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."ENTITY_NAME_" varchar,"C"."TYPE_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        try:
            while True:
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    self.logger.info(
                        "MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                    time.sleep(3)
                    try:
                        data = next(mongo_data_list)
                    except StopIteration:
                        break

                self._transfer_one(data)
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()

    def _transfer_one(self, data):
        """Reshape one document, upsert it into HBase and flag it in MongoDB.

        Failures of a single step are logged and end the processing of this
        document only; the counters reflect every step that did succeed.
        """
        self.find_count += 1
        self.data_id = data["_id"]
        if self.success_count % 100 == 0:
            self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
        self.logger.debug(self.data_id)

        # Reshape the document into an HBase row
        try:
            re_data = self.data_shuffle(data=data)
        except Exception as e:
            self.bad_count += 1
            self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
            return
        if not re_data:
            self.bad_count += 1
            return

        # Upsert the row into HBase
        try:
            success_count = self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=re_data)
        except jaydebeapi.DatabaseError as e:
            self.logger.info("错误 id: {}, 错误信息 {}".format(self.data_id, e))
            return

        # Count the upsert right away so that a later MongoDB failure does
        # not hide a row that is already stored in HBase.
        if success_count > 0:
            self.success_count += success_count
            if self.success_count % 10 == 0:
                self.logger.info("HBase 插入成功 {} 条".format(
                    self.success_count))

        # Flag the source document with {"d": 1}
        try:
            self.m_client.update_to_mongodb(collection=self.collection,
                                            data_id=self.data_id,
                                            data_dict={"d": 1})
        except Exception as e:
            self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                self.data_id, e))
            return
        self.remove_count += 1
        if self.remove_count % 10 == 0:
            self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                self.remove_count))
예제 #20
0
class MapBarTransfer(object):
    def __init__(self,
                 table_name="CHA_BRANCH_MAPBAR",
                 collection_name="mapbar"):
        """Open the Phoenix and MongoDB connections used by the transfer.

        :param table_name: Phoenix table handed to ``PhoenixHbase``
        :param collection_name: MongoDB collection handed to ``MongoClient``
        """
        # The MongoDB endpoint is fixed for this job.
        mongo_host = "172.22.69.35"
        mongo_port = 20000

        # Phoenix side
        phoenix = PhoenixHbase(table_name=table_name)
        self.p_client = phoenix
        self.connection = phoenix.connect_to_phoenix()

        # MongoDB side: point the project client at the fixed endpoint by
        # overriding its host/port and replacing its pymongo client.
        mongo = MongoClient(mongo_collection=collection_name,
                            entity_code="MAPBAR_DEATAIL_BJ")
        self.m_client = mongo
        mongo.mongo_host = mongo_host
        mongo.mongo_port = mongo_port
        # NOTE(review): both timeouts are given in milliseconds, so 60 means
        # 60 ms - confirm that 60 s was not intended.
        mongo.client = pymongo.MongoClient(host=mongo_host,
                                           port=mongo_port,
                                           serverSelectionTimeoutMS=60,
                                           connectTimeoutMS=60,
                                           connect=False)
        self.db, self.collection_list = mongo.client_to_mongodb()
        self.collection = mongo.get_check_collection(
            db=self.db, collection_list=self.collection_list)

        # Logging
        self.logger = Logger().logger
        # Rows written by ``check_lat``
        self.count = 0

    def main(self):
        # # 创建表
        # table_sql = (f'create table "{self.p_client.table_name}" ("ID_" varchar primary key,'
        #              '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,'
        #              '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,'
        #              '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar'
        #              ') IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # 获取数据
        # mongo_data_list = self.m_client.all_from_mongodb(collection=self.collection)
        mongo_data_list = self.m_client.search_from_mongodb(
            collection=self.collection,
            field_name="DEALTIME_",
            field_value={"$gt": "1555136656.0579224"},
            data_id="5cb65fac9bb3df61a09c6625")

        count = 0
        while True:
            # 取一条处理
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(3)
                data = mongo_data_list.__next__()

            # 清洗
            try:
                data["PHONE_"] = data["PHONE_"].replace("无,", "")
                u_time_list = re.findall(r"(\d{4}年\d{1,2}月\d{1,2})日",
                                         data["UPDATETIME_"])
                if u_time_list:
                    u_ = u_time_list[0].replace("年", "-")
                    u_ = u_.replace("月", "-")
                    u_l = u_.split("-")
                    if len(u_l[1]) == 1:
                        u_l[1] = "0" + u_l[1]
                    if len(u_l[2]) == 1:
                        u_l[2] = "0" + u_l[2]
                    data["UPDATETIME_"] = "-".join(u_l)
            except Exception as e:
                self.logger.exception(f"数据清洗出错, _id: {data['_id']}, error {e}")
                continue

            # 获取经纬度
            try:
                if data["ADDRESS_"]:
                    data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
                    location_result = get_lat_lng(address=data["ADDRESS_"])
                    if location_result["status"] == 0:
                        data["LNG_"] = str(
                            location_result["result"]["location"]["lng"])
                        data["LAT_"] = str(
                            location_result["result"]["location"]["lat"])
                    else:
                        self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
                else:
                    continue
            except Exception as e:
                self.logger.exception(
                    f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                continue
            # upsert to HBase
            try:
                re_data = self.__check_lat(data=data)
                # 向 HBase 中插入一条
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=re_data)
                count += 1
                if count % 100 == 0:
                    self.logger.info(
                        f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
            except Exception as e:
                self.logger.exception(
                    f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                continue

        # 关闭 MongoDB cursor
        mongo_data_list.close()
        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )

    def check_lat(self):
        # # 删除表
        # self.p_client.drop_table_phoenix(connection=self.connection, table_name="CHA_BRANCH_MAPBAR")
        #
        # table_sql = (f'create table "CHA_BRANCH_MAPBAR" ("ID_" varchar primary key,'
        #              '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,'
        #              '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,'
        #              '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar,'
        #              '"C"."CHECK_LAT_" varchar, "C"."CHECK_LNG_" varchar'
        #              ') IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        self.p_client.table_name = "FANSILE"
        data_cursor = self.p_client.search_all_from_phoenix(
            connection=self.connection, dict_status=True)
        self.p_client.table_name = "CHA_BRANCH_MAPBAR"
        while True:
            try:
                data = data_cursor.__next__()

                # del data["('C', 'CHECK_LNG_')"]
                # if not data["LAT_"]:
                #     self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                # if 30.7083860773 < float(data["LAT_"]) < 31.8739003864:
                #     pass
                # else:
                #     self.logger.warning(f"错误 _id: {data['ID_']}, 经纬度: {data['LAT_']},{data['LNG_']}")
                #     data["CHECK_LAT_"] = data["LAT_"]
                #     data["CHECK_LNG_"] = data["LNG_"]
                #     data["LAT_"] = ""
                #     data["LNG_"] = ""
                #
                #     self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                # if 120.8778122800 < float(data["LNG_"]) < 122.1248433443:
                #     self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                # else:
                #     self.logger.warning(f"错误 _id: {data['ID_']}, 经纬度: {data['LAT_']},{data['LNG_']}")
                #     data["CHECK_LAT_"] = data["LAT_"]
                #     data["CHECK_LNG_"] = data["LNG_"]
                #     data["LAT_"] = ""
                #     data["LNG_"] = ""
                #     self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=data)
                self.count += 1
                if self.count % 100 == 0:
                    self.logger.info(
                        f"HBase 插入成功, _id: {data['_id']}, 成功条数 {self.count} 条")

            except StopIteration:
                break

    def __check_lat(self, data):
        if "LAT_" not in data:
            return data
        # 上海
        # if 30.7083860773 < float(data["LAT_"]) < 31.8739003864:
        # 北京
        if 39.4498800000 < float(data["LAT_"]) < 41.1684980000:
            pass
        else:
            self.logger.warning(
                f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
            data["CHECK_LAT_"] = data["LAT_"]
            data["CHECK_LNG_"] = data["LNG_"]
            data["LAT_"] = ""
            data["LNG_"] = ""
            return data
        # 上海
        # if 120.8778122800 < float(data["LNG_"]) < 122.1248433443:
        # 北京
        if 115.4534230000 < float(data["LNG_"]) < 117.5461160000:
            return data
        else:
            self.logger.warning(
                f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
            data["CHECK_LAT_"] = data["LAT_"]
            data["CHECK_LNG_"] = data["LNG_"]
            data["LAT_"] = ""
            data["LNG_"] = ""
            return data