Exemplo n.º 1
0
        if "IMAGES_" in data:
            pattern = re.compile(r"https:(http://.*)")
            if re.match(pattern, data["IMAGES_"]):
                a = re.match(pattern, data["IMAGES_"])
                image_url = a.group(1)
            else:
                image_url = data["IMAGES_"]
            response = req_for_something(url=image_url)
            if response:
                t = base64.b64encode(response.content)
                re_data["IMAGE_"] = t.decode("utf-8")

        re_data = super(BranchXyk, self).generic_shuffle(data=data,
                                                         re_data=re_data,
                                                         field=None)
        # print(re_data)
        re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
        return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]


if __name__ == '__main__':
    # The first command-line argument is the parameter string handed to the
    # script, e.g. "{'limitNumber':'1000'}".
    param = sys.argv[1]
    verify_field = {"URL_": "URL_"}
    script = BranchXyk(table_name=TABLE_NAME("CHA_BRANCH_CREDITCARDARD"),
                       collection_name="JRCP_XYK",
                       param=param,
                       verify_field=verify_field)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()
Exemplo n.º 2
0
                re_data["SUBWAY_NAME_"] = SUBWAY_NAME_ + "-" + re_data[
                    "STATION_NAME_"]
                re_data = super(Branchjtdt,
                                self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
                return [{
                    "TABLE_NAME_": self.p_client.table_name,
                    "DATA_": re_data
                }]


if __name__ == '__main__':
    # The first command-line argument is the parameter string handed to the
    # script, e.g. "{'limitNumber':'1000'}".
    param = sys.argv[1]
    verify_field = {'SUBWAY_NAME_': 'SUBWAY_NAME_'}
    script = Branchjtdt(table_name=TABLE_NAME("CHA_BRANCH_SUBWAY"),
                        collection_name="WD_JT_DT",
                        param=param,
                        verify_field=verify_field)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()
    # filelist = ['WD_JT_DT_BDDT_BJ',
    # 'WD_JT_DT_BDDT_CD',
    # 'WD_JT_DT_BDDT_NB',
    # 'WD_JT_DT_BDDT_NN',
    # 'WD_JT_DT_BDDT_SH',
    # 'WD_JT_DT_BDDT_XM']
    # for i in filelist:
    #     with open("{}.py".format(i), "w") as f:
    #         f.write("from database._mongodb import MongoClient\n\n\ndef data_shuffle(data):\n\n    return data\n\n\nif __name__ == '__main__':\n    main_mongo = MongoClient(entity_code=\"{}\", mongo_collection=\"WD_JT_DT\")".format(i))
Exemplo n.º 3
0
                phone_number = re.sub(pattern5, r"\1  \2", data["TEL_"])
            elif re.match(pattern6, data["TEL_"]):
                phone_number = re.sub(pattern6, r"\1  \2", data["TEL_"])
            elif re.match(pattern7, data["TEL_"]):
                phone_number = re.sub(pattern7, r"\1  \2", data["TEL_"])
            elif re.match(pattern8, data["TEL_"]):
                phone_number = re.sub(pattern8, r"\1  \2  \3", data["TEL_"])
            elif re.match(pattern9, data["TEL_"]):
                phone_number = re.sub(pattern9, r"\1  \2", data["TEL_"])
            else:
                phone_number = data["TEL_"]
            re_data["TEL_"] = phone_number
        # 地址
        if "ADDR_" in data:
            re_data["ADDR_"] = data["ADDR_"]
        re_data = super(Branchssxx, self).generic_shuffle(data=data,
                                                          re_data=re_data,
                                                          field=None)
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]


if __name__ == '__main__':
    # The first command-line argument is the parameter string handed to the
    # script, e.g. "{'limit_Number': '1000'}".
    param = sys.argv[1]
    verify_field = {'URL_': 'URL_'}
    script = Branchssxx(table_name=TABLE_NAME("CHA_BRANCH_SCHOOL"),
                        collection_name="WD_SS_XX",
                        param=param,
                        verify_field=verify_field)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()
Exemplo n.º 4
0
if __name__ == '__main__':
    import ast

    # Hard-coded debug parameter; replace with ``sys.argv[1]`` to take the
    # parameter string from the command line.
    param = "{'entityType':'NEWS','limitNumber':10000,'entityCode':['ZX_GWDT_HEBYH_NHXW']}"

    # literal_eval accepts Python literals only, so a parameter string cannot
    # execute arbitrary code the way eval() would.
    param_dict = ast.literal_eval(param)
    entity_code = param_dict.get("entityCode")

    # Build one (entity code, parameter string) job per script run.
    if isinstance(entity_code, str):
        jobs = [(entity_code, param)]
    elif isinstance(entity_code, list):
        # Each run gets its own copy of the parameters with a single entity
        # code, instead of mutating the shared dict while looping over it.
        jobs = [(code, str(dict(param_dict, entityCode=code)))
                for code in entity_code]
    else:
        jobs = []

    for code, job_param in jobs:
        parts = code.split("_")
        # "CJXW" entity codes map to a three-segment collection name, every
        # other code to a two-segment one.
        depth = 3 if len(parts) > 1 and parts[1] == "CJXW" else 2
        coll = "_".join(parts[:depth])
        script = BranchNews(table_name=TABLE_NAME("CHA_BRANCH_NEWS"), collection_name=coll, param=job_param)
        try:
            script.main()
        finally:
            # Always call close_client(), even when main() raises.
            script.close_client()
Exemplo n.º 5
0
    def generic_shuffle(self, data):
        """Clean one raw Mapbar record into output rows.

        Always builds a facility row for ``TABLE_NAME("CHA_BRANCH_FACILITY")``
        and, when ``data["TYPE_"]`` contains "道路", an additional road row for
        ``TABLE_NAME("CHA_BRANCH_MAIN_ROUTE")``.

        :param data: raw record; reads NAME_, ADDRESS_, URL_, TYPE_, BTYPE_,
            PHONE_, BUS_, BUSSTOP_ and (for log messages only) _id.
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts with the
            facility row first and the road row, if any, second.
        :raises Exception: when ``data["BTYPE_"]`` is not a key of
            ``self.type1_dict``.
        """
        re_data = dict()
        re_data["ID_"] = req_for_serial_number(code="MAPBAR")
        re_data["NAME_"] = data["NAME_"]
        re_data["ADDRESS_"] = data["ADDRESS_"].replace("|", "").replace(
            "地址:", "")
        re_data["HOT_"] = 0
        # Data source: scheme and host taken from the record URL.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        # Data source: site name.
        re_data["SOURCE_NAME_"] = "图吧"
        re_data["SOURCE_TYPE_"] = "图吧"

        # Longitude / latitude. Default to empty strings first so the keys
        # exist even when the lookup below raises; previously a logged lookup
        # failure was followed by a KeyError on re_data["LAT_"].
        re_data["LNG_"] = ""
        re_data["LAT_"] = ""
        try:
            if re_data["ADDRESS_"]:
                location_result = get_lat_lng(address=re_data["ADDRESS_"])
                if location_result["status"] == 0:
                    location = location_result["result"]["location"]
                    # Read both values before storing either, so a malformed
                    # response cannot leave only one coordinate set.
                    lng, lat = str(location["lng"]), str(location["lat"])
                    re_data["LNG_"], re_data["LAT_"] = lng, lat
                else:
                    self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
        except Exception as e:
            self.logger.exception(f"_id: {data['_id']} 获取经纬度失败, error: {e}")

        # Area lookup from the coordinates.
        if re_data["LAT_"]:
            try:
                area_result = get_area(",".join(
                    [re_data["LAT_"], re_data["LNG_"]]))
            except Exception as e:
                self.logger.exception(f"_id: {data['_id']} 获取地址失败, error: {e}")
            else:
                try:
                    re_data["AREA_NAME_"] = area_result["result"][
                        "addressComponent"]["district"]
                except KeyError:
                    re_data["AREA_NAME_"] = ""
                try:
                    re_data["AREA_CODE_"] = area_result["result"][
                        "addressComponent"]["adcode"]
                except KeyError:
                    re_data["AREA_CODE_"] = ""
                else:
                    # City / province codes are prefixes of the area code
                    # padded with "00".
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data[
                        "PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                    for city in self.city_list:
                        if city["CODE_"] == re_data["CITY_CODE_"]:
                            re_data["CITY_NAME_"] = city["NAME_"]
                            break
                    for prov in self.province_list:
                        if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                            re_data["PROVINCE_NAME_"] = prov["NAME_"]
                            break

        # Fallback: match the first two characters of a city name against
        # TYPE_ when the area lookup produced no city name.
        if not re_data.get("CITY_NAME_", ""):
            for city in self.city_list:
                if city["NAME_"][:2] in data["TYPE_"]:
                    re_data["CITY_CODE_"] = city["CODE_"]
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            if re_data.get("CITY_NAME_", ""):
                for prov in self.province_list:
                    if prov["CODE_"][:2] == re_data["CITY_CODE_"][:2]:
                        re_data["PROVINCE_CODE_"] = prov["CODE_"]
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break

        # CHA_BRANCH_MAIN_ROUTE (main roads): copy of the fields collected so
        # far, with its own ID and ADDRESS_ renamed to ADDR_.
        road_shuffle_data = None
        is_road = "道路" in data["TYPE_"]
        if is_road:
            road_data = dict(re_data)
            road_data["ID_"] = req_for_serial_number(code="WD_GD")
            road_data["ADDR_"] = road_data.pop("ADDRESS_")
            road_shuffle_data = super(MapbarScript,
                                      self).generic_shuffle(data=data,
                                                            re_data=road_data,
                                                            field=None)

        # CHA_BRANCH_FACILITY (Mapbar facilities).
        re_data["TYPE1_"] = data["BTYPE_"]
        try:
            re_data["TYPE1_CODE_"] = self.type1_dict[re_data["TYPE1_"]]
        except KeyError:
            raise Exception("暂不需要清洗的数据")
        # Sub-category cleaning: several source categories are merged.
        source_type2 = data["TYPE_"][2:]
        if source_type2 in ("户外运动俱乐部", "赛马场及马术俱乐部", "室内运动健身俱乐部"):
            re_data["TYPE2_"] = "俱乐部"
            re_data["TYPE2_CODE_"] = "JLB"
        elif source_type2 in ("连锁店", "便利店"):
            re_data["TYPE2_"] = "便利店"
            re_data["TYPE2_CODE_"] = "BLD"
        elif source_type2 in ("电子商城", "电器商城"):
            re_data["TYPE2_"] = "家电数码"
            re_data["TYPE2_CODE_"] = "JDSM"
        elif source_type2 in ("诊所/卫生所", "门诊/急诊部"):
            re_data["TYPE2_"] = "门诊/卫生所"
            re_data["TYPE2_CODE_"] = "MZWSS"
        else:
            re_data["TYPE2_"] = source_type2
            re_data["TYPE2_CODE_"] = self.type2_dict.get(re_data["TYPE2_"])
        re_data["SOURCE_TYPE1_"] = data["BTYPE_"]
        re_data["SOURCE_TYPE1_CODE_"] = self.type1_dict.get(
            re_data["SOURCE_TYPE1_"])
        re_data["SOURCE_TYPE2_"] = source_type2
        re_data["SOURCE_TYPE2_CODE_"] = self.source_type2_dict.get(
            re_data["SOURCE_TYPE2_"])
        re_data["PHONE_"] = data["PHONE_"].replace("无,", "")
        re_data["BUS_"] = data["BUS_"]
        re_data["BUSSTOP_"] = data["BUSSTOP_"]

        shuffle_data = super(MapbarScript,
                             self).generic_shuffle(data=data,
                                                   re_data=re_data,
                                                   field=None)

        return_list = [{
            "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FACILITY"),
            "DATA_": shuffle_data
        }]
        if is_road:
            return_list.append({
                "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"),
                "DATA_": road_shuffle_data
            })
        return return_list
Exemplo n.º 6
0
            basic_dict["DELETE_STATUS_"] = "N"
            # basic_dict["TAGS_"] = data[""]
            # 数据来源 URL
            source = re.findall(r"(https?://.*?)/", data["URL_"])
            re_data["SOURCE_"] = source[0]
            # 数据来源 网站名称
            re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
            basic_dict["SOURCE_TYPE_"] = "链家"
            # basic_dict["PRICE_TYPE_"] = data[""]
            basic_dict["ADDR_"] = data["ADDR_"]

            return [{"TABLE_NAME_": self.data_table_name, "DATA_": data_dict},
                    {"TABLE_NAME_": self.base_table_name, "DATA_": basic_dict}]


if __name__ == '__main__':
    # Hard-coded debug parameter; replace with ``sys.argv[1]`` to take the
    # parameter string from the command line.
    # Entity codes seen here: 'WD_JZ_FJ_LJXQFJ_FS', 'WD_JZ_FJ_LIXQZL_FS'
    param = "{'entityType':'WD_JZ_FJ_FS','limitNumber':10000,'entityCode':['WD_JZ_FJ_LJXQFJ_FS']}"

    script = BranchNews(table_name=TABLE_NAME("CHA_BRANCH_HOUSE_DATA"), collection_name="WD_JZ_FJ_FS", param=param)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()




Exemplo n.º 7
0
                                comment_data["SENSITIVE_"] = "Y"
                                comment_data["SENSITIVE_WORD_"] = censor["words"]
                        else:
                            comment_data["SENSITIVE_"] = "N"
                    except Exception as e:
                        self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                        comment_data["SENSITIVE_"] = "N"

                comment_data["VERSION_"] = "0"
                comment_data["CREATE_BY_ID_"] = "P0131857"
                comment_data["CREATE_BY_NAME_"] = "钟楷文"
                re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"), "DATA_": comment_data})
                comment_count += 1
            # 打相关评论日志方便调试
            self.logger.info(f'清洗的URL为{info_data["URL_"]}')
            self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}')
            self.logger.info(f'插入到comment表的数量为{comment_count}')
        # print(re_data)
        return re_data


if __name__ == '__main__':
    # Hard-coded debug parameter; replace with ``sys.argv[1]`` to take the
    # parameter string from the command line.
    param = "{'limitNumber':'1000'}"
    script = WeiboScript(table_name=TABLE_NAME("CHA_BRANCH_WEIBO_INFO"), collection_name="WEIBOINFO", param=param)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()
Exemplo n.º 8
0
        # 站点描述
        re_data["DESCRIBE_"] = data["DESCRIBE_"]
        # 周边站点
        re_data["AROUND_STATIONS_"] = self.handle_special_text(
            data["AROUND_STATIONS_"]).replace("|", ",")
        # 途径路线
        re_data["AROUND_ROUTE_"] = self.handle_special_text(
            data["AROUND_ROUTE_"]).replace("|", ",")
        if re_data["AROUND_ROUTE_"]:
            re_data["AROUND_ROUTE_"] = re_data["AROUND_ROUTE_"].replace(
                "公交线路", "")
        # 站点名称
        re_data["NAME_"] = data["NAME_"]
        re_data = super(Branchjtgj, self).generic_shuffle(data=data,
                                                          re_data=re_data,
                                                          field=None)

        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]


if __name__ == '__main__':
    # The first command-line argument is the parameter string handed to the
    # script, e.g. "{'limitNumber':'10000'}".
    param = sys.argv[1]
    verify_field = {'URL_': 'URL_'}
    script = Branchjtgj(table_name=TABLE_NAME("CHA_BRANCH_BUS_STATION"),
                        collection_name="WD_JT_GJ",
                        param=param,
                        verify_field=verify_field)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()
Exemplo n.º 9
0
        if "NAME_" in data:
            re_data["NAME_"] = data["NAME_"]
        # 地址
        if "ADDR_" in data:
            re_data["ADDR_"] = data["ADDR_"]
        # 床位
        if "BEDS_" in data:
            re_data["BEDS_"] = data["BEDS_"]
        # 医院性质
        if "TYPE_" in data:
            re_data["TYPE_"] = data["TYPE_"]
        # 网站
        if "WEBSITE_" in data:
            re_data["WEBSITE_"] = data["WEBSITE_"]
        # 门诊量
        if "VOLNUM_" in data:
            re_data["VOLNUM_"] = data["VOLNUM_"]
        # print(re_data)
        re_data = super(Branchssyy, self).generic_shuffle(data=data, re_data=re_data, field=None)
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]


if __name__ == '__main__':
    # The first command-line argument is the parameter string handed to the
    # script, e.g. "{'limitNumber':'1000'}".
    param = sys.argv[1]
    verify_field = {'URL_': 'URL_'}
    script = Branchssyy(table_name=TABLE_NAME("CHA_BRANCH_HOSPITAL"), collection_name="WD_SS_YY", param=param,
                        verify_field=verify_field)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()
Exemplo n.º 10
0
        return data

    def generic_shuffle(self, data, field="PRO_NAME_"):
        """
        Generic cleaning entry point; subclasses that need no generic
        cleaning should not inherit/override it.

        :param data: a single record (dict) or a list of records
        :param field: not used by this implementation
        :return: list of ``{"TABLE_NAME_": self.script_name, "DATA_": ...}``
            dicts, one per record; ``None`` when data is neither dict nor list
        """
        # Treat a single record as a one-element batch.
        if isinstance(data, dict):
            records = [data]
        elif isinstance(data, list):
            records = data
        else:
            return None

        rows = []
        for record in records:
            cleaned = self.__shuffle(record)
            rows.append({"TABLE_NAME_": self.script_name, "DATA_": cleaned})
        return rows


if __name__ == '__main__':
    # Hard-coded debug parameter; replace with ``sys.argv[1]`` to take the
    # parameter string from the command line.
    param = "{'entityType':'CRMMARKETACT','limitNumber':1,'entityCode':['CRMJPFX_YXHD_PFYH']}"
    script = BranchOrganize(table_name=TABLE_NAME("CRM_MARKET_ACT"), collection_name="CRMJPFX_YXHD", param=param)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()

Exemplo n.º 11
0
    def __shuffle(self, data):
        """Clean one raw wealth-management product record.

        Records whose ``ENTITY_NAME_`` contains "中国理财网" are copied field
        by field into a row for ``TABLE_NAME("CRMLCCP")``; the listed source
        keys are mandatory there and a missing one raises ``KeyError``.
        Every other record is normalised into a row for
        ``TABLE_NAME("CHA_BRANCH_FINANCIAL_PRODUCT")``, where most source keys
        are optional.

        :param data: raw record dict; ENTITY_CODE_, ENTITY_NAME_, URL_ and
            PRO_NAME_ are always required.
        :return: ``{"TABLE_NAME_": ..., "DATA_": ...}`` for the cleaned record.
        """
        re_data = dict()
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]

        if "中国理财网" in data["ENTITY_NAME_"]:
            re_data["ID_"] = req_for_serial_number(code="JRCP_LCCP_INFO")
            for key in ("PRO_NAME_", "PRO_ORG_", "REGIST_CODE_", "PRO_STATUS_",
                        "OPT_MODE_", "YIELD_TYPE_", "CURRENCY_TYPE_",
                        "START_FUNDS_"):
                re_data[key] = data[key]
            self._set_start_funds_code(re_data, data["START_FUNDS_"])

            org = {
                '01': '国有银行',
                '02': '股份制银行',
                '03': '城商行',
                '04': '外资银行',
                '05': '农村合作金融机构',
                '06': '其他',
                '07': '其他',
                '08': '其他',
                '09': '其他',
                '00': '其他',
                '10': '理财子公司'
            }

            re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]
            re_data['ORG_TYPE_'] = org.get(data.get('ORG_TYPE_'))
            for key in ("RAISE_START_", "RAISE_END_", "PRO_START_", "PRO_END_",
                        "YIELD_LOW_", "YIELD_HIGH_", "REAL_DAYS_",
                        "INVEST_TYPE_", "DATE_TYPE_", "YIELD_", "RAISE_TYPE_",
                        "INVEST_PROPERTIES_", "BUS_START_", "BUS_END_",
                        "START_VALUE_", "PRO_VALUE_", "TOTAL_VALUE_",
                        "RECENT_YIELD_", "PRO_TYPE_", "SALE_AREA_"):
                re_data[key] = data[key]
            # Each optional key is guarded by its own presence; the province
            # code used to be guarded by PROVINCE_NAME_ and could raise
            # KeyError.
            for key in ("PROVINCE_NAME_", "PROVINCE_CODE_", "CITY_NAME_",
                        "CITY_CODE_"):
                if key in data:
                    re_data[key] = data[key]

            self._set_bank_fields(data, re_data)

            re_data = super(BranchFinProduct,
                            self).generic_shuffle(data=data,
                                                  re_data=re_data,
                                                  field=None)
            self._mark_empty_values(data, re_data)
            return {"TABLE_NAME_": TABLE_NAME("CRMLCCP"), "DATA_": re_data}

        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["ID_"] = req_for_serial_number(code="JRCP_LCCP")
        re_data["SOURCE_TYPE_"] = ""
        re_data["PRO_NAME_"] = data["PRO_NAME_"]
        # Text before the first "-"; the whole name when there is no "-"
        # (slicing with find() == -1 used to drop the last character).
        re_data["PRO_ORG_"] = data["ENTITY_NAME_"].partition("-")[0]
        if "PRO_CODE_" in data:
            re_data["PRO_CODE_"] = data["PRO_CODE_"]
        # Registration code: taken from the record, else searched in the PDF.
        if "REGIST_CODE_" in data:
            re_data["REGIST_CODE_"] = data["REGIST_CODE_"]
        elif "PDF_" in data:
            try:
                text = parse(data["PDF_"])
                registration_code = re.findall(r"C\d{13}", text)
                if registration_code:
                    re_data["REGIST_CODE_"] = registration_code[0]
            except Exception as e:
                self.logger.exception(
                    f"2.1--err: PDF."
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};"
                    f" error: {e}.")
        # Status is one of pre-sale (PRE), on sale (ON), stopped (STOP);
        # every record here is marked as on sale.
        re_data["PRO_STATUS_"] = "ON"
        for key in ("OPT_MODE_", "YIELD_TYPE_", "CURRENCY_TYPE_"):
            if key in data:
                re_data[key] = data[key]
        # Minimum purchase amount.
        if "START_FUNDS_" in data:
            re_data["START_FUNDS_"] = self._expand_amount_units(
                data["START_FUNDS_"])
            self._set_start_funds_code(re_data, re_data["START_FUNDS_"])

        if "RISK_LEVEL_CODE_" in data:
            re_data["RISK_LEVEL_"] = self.risk_dict[
                data["RISK_LEVEL_CODE_"]]
            re_data["RISK_LEVEL_CODE_"] = data["RISK_LEVEL_CODE_"]

        if "RISK_LEVEL_" in data:
            re_data["SOURCE_RISK_LEVEL_"] = data["RISK_LEVEL_"]
        elif "SOURCE_RISK_LEVEL_" in data:
            re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]
        # Raise start/end dates and product start/end dates.
        for key in ("RAISE_START_", "RAISE_END_", "PRO_START_", "PRO_END_"):
            if key in data:
                re_data[key] = data[key]
        # Expected lowest / highest yield, without the percent sign.
        for key in ("YIELD_LOW_", "YIELD_HIGH_"):
            if key in data:
                re_data[key] = data[key].replace("%", "")
        # Actual number of days.
        if "REAL_DAYS_" in data:
            real_days = data["REAL_DAYS_"].replace(" ", "")
            # The space-stripped text is also written back to the input record.
            data["REAL_DAYS_"] = real_days
            if "年" in real_days:
                try:
                    re_data["REAL_DAYS_"] = int(
                        real_days.replace("年", "")) * 365
                except Exception:
                    re_data["REAL_DAYS_"] = 0
            elif "月" in real_days:
                # Test the text, not the record dict, for the month marker.
                try:
                    re_data["REAL_DAYS_"] = int(
                        real_days.replace("月", "")) * 30
                except Exception:
                    re_data["REAL_DAYS_"] = 0
            else:
                re_data["REAL_DAYS_"] = real_days.replace("天", "")
        elif "PRO_START_" in data and "PRO_END_" in data:
            t_start = arrow.get(data["PRO_START_"], "YYYY-MM-DD")
            t_end = arrow.get(data["PRO_END_"], "YYYY-MM-DD")
            real_days = (t_end - t_start).days
            # Store the derived value in the output row; the input record is
            # still updated as well, as before.
            re_data["REAL_DAYS_"] = real_days
            data["REAL_DAYS_"] = real_days

        # Investment type, investor type and sale area.
        for key in ("INVEST_TYPE_", "PRO_TYPE_", "SALE_AREA_"):
            if key in data:
                re_data[key] = data[key]
        # Redeemable or not.
        if "REDEEM_" in data:
            re_data["REDEEM_"] = "N" if "不" in data["REDEEM_"] else "Y"
        if "INCREASE_" in data:
            re_data["INCREASE_"] = self._expand_amount_units(
                data["INCREASE_"])
        re_data["RECOMMEND_"] = "N"
        re_data["GOOD_SALE_"] = "N"
        re_data["NEW_SALE_"] = "N"
        re_data["SALE_SOURCE_"] = "NET"

        self._set_bank_fields(data, re_data)
        # These keys are optional in this branch, so absent ones are treated
        # like empty ones instead of raising KeyError.
        self._mark_empty_values(data, re_data)

        re_data = super(BranchFinProduct,
                        self).generic_shuffle(data=data,
                                              re_data=re_data,
                                              field=None)
        re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
        return {
            "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FINANCIAL_PRODUCT"),
            "DATA_": re_data
        }

    @staticmethod
    def _set_start_funds_code(re_data, amount):
        """Store the START_FUNDS_CODE_ bucket for *amount* in *re_data*.

        When *amount* cannot be converted to a number, START_FUNDS_ is reset
        to 0 and no bucket is stored.
        """
        try:
            value = float(amount)
        except (TypeError, ValueError, OverflowError):
            re_data["START_FUNDS_"] = 0
            return
        if value <= 10000:
            code = "S0_1"
        elif value <= 50000:
            code = "S1_5"
        elif value <= 100000:
            # Upper bound is inclusive so exactly 100000 gets a bucket.
            code = "S5_10"
        elif value > 100000:
            code = "S10_"
        else:
            # NaN compares false everywhere: leave the bucket unset.
            return
        re_data["START_FUNDS_CODE_"] = code

    @staticmethod
    def _expand_amount_units(text):
        """Remove spaces and replace Chinese magnitude words with zeros."""
        text = text.replace(" ", "")
        # Longer units come before the single characters they contain.
        for unit, zeros in (("亿", "00000000"), ("千万", "0000000"),
                            ("百万", "000000"), ("十万", "00000"),
                            ("万", "0000"), ("千", "000"), ("百", "00"),
                            ("元", "")):
            text = text.replace(unit, zeros)
        return text

    def _set_bank_fields(self, data, re_data):
        """Store "|"-joined names/codes of banks named in ENTITY_NAME_."""
        entity_name = data.get("ENTITY_NAME_", "")
        matches = [each for each in self.bank_list
                   if each["NAME_"] in entity_name]
        if matches:
            re_data["BANK_NAME_"] = "|".join(each["NAME_"] for each in matches)
            re_data["BANK_CODE_"] = "|".join(each["CODE_"] for each in matches)

    @staticmethod
    def _mark_empty_values(data, re_data):
        """Write the '--' placeholder for empty or absent source values."""
        for key in ("YIELD_LOW_", "YIELD_HIGH_", "START_FUNDS_"):
            if not data.get(key):
                re_data[key] = '--'
Exemplo n.º 12
0
                # else:
                #     pro_code = None
                # if (pro_name, pro_code) in self.verify_list:
                #     self.logger.info(f"重复值: {(pro_name, pro_code)}")
                #     continue
                # else:
                re_data = self.__shuffle(each)
                re_list.append(re_data)
        return re_list


if __name__ == '__main__':
    # Hard-coded debug parameter; replace with ``sys.argv[1]`` to take the
    # parameter string from the command line.
    param = "{'entityType':'CRMJPFX_LCCP','limitNumber':10000,'entityCode':['CRMJPFX_LCCP_ZGLCW']}"
    # The target table and the verification fields depend on whether the
    # parameter string mentions "ZGLCW".
    if "ZGLCW" in param:
        table_name = "CRMLCCP"  # HBase table
        verify_field = {
            "PRO_NAME_": "PRO_NAME_",
            "REGIST_CODE_": "REGIST_CODE_"
        }
    else:
        table_name = "CHA_BRANCH_FINANCIAL_PRODUCT"
        verify_field = {"PRO_NAME_": "PRO_NAME_", "PRO_CODE_": "PRO_CODE_"}
    script = BranchFinProduct(table_name=TABLE_NAME(table_name),
                              collection_name="CRMJPFX_LCCP",
                              param=param,
                              verify_field=verify_field)
    try:
        script.main()
    finally:
        # Always call close_client(), even when main() raises.
        script.close_client()
Exemplo n.º 13
0
    #  GDSZ_SZS_FGW_GHJH           2
    #  GDSZ_GDS_FGW_FZGH           2
    #  GDSZ_GDS_CJJ_GG             2
    #  GDSZ_GZS_TZCJJ_GKXX         2
    #  GDSZ_GDS_TZJG_XMBLJGGS      2
    #  GDSZ_SWS_FGW_GHJH           2
    #  GDSZ_SGS_FGW_FZGGGZ         2
    #  GDSZ_ZHS_FGW_FZGH           2
    #  GDSZ_SZS_SWJ_TZGG           2
    #  GDSZ_YFS_FGW_GHJH           2  3
    #  GDSZ_FSS_FGW_JHGH           2
    #  GDSZ_ZHS_SWJ_TZGG           2
    #  GDSZ_HZS_FGW_FZGH_BMXGWJ    2
    #  GDSZ_HYS_FGW_XMXX           2
    #  GDSZ_QYS_FGW_ZDLYZL         2
    #  GDSZ_CZS_FGW_ZDXM           2
    #  GDSZ_JYS_FGW_ZDXM           2
    #  GDSZ_FSS_SWJ_TPXW           2
    #  GDSZ_HZS_SWJ_SWZX           2
    #  GDSZ_STS_SWJ_GZDT           2
    #  GZSZ_MZS_SWJ_TZGG           2    3
    #  GDSZ_ZQS_SWJ_GZDT           2    3
    #  GDSZ_MMS_FGW_FZGH_TZGG      2
    #  GDSZ_SGS_SWJ_SWDT           2
    #  GDSZ_MMS_SWJ_SWXW           2    3

    param = "{'entityType':'GOV_ZX_GDS','limitNumber':2000,'entityCode':['GDSZ_SWS_FGW_GHJH']}"
    script = BranchOrganize(table_name=TABLE_NAME("GOV_ZX_GDS"), collection_name="GOV_ZX_GDS", param=param)
    script.main()
    script.close_client()
Exemplo n.º 14
0
        data["SOURCE_NAME_"] = data["ENTITY_NAME_"]

        return data

    def generic_shuffle(self, data, field="BANK_NAME_"):
        """
        Run the private per-record cleaning step and wrap each result.

        :param data: a single record (dict) or a list of records
        :param field: accepted for interface compatibility; not read here
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts, or
                 ``None`` when ``data`` is neither a dict nor a list
        """
        # Normalise both accepted input shapes to one list of records.
        if isinstance(data, dict):
            records = [data]
        elif isinstance(data, list):
            records = data
        else:
            return None

        def wrap(record):
            # Clean first, then read the table name, as the row is built.
            cleaned = self.__shuffle(record)
            return {"TABLE_NAME_": self.script_name, "DATA_": cleaned}

        return [wrap(record) for record in records]


if __name__ == '__main__':
    # The command-line variant (param = sys.argv[1]) is disabled here; a
    # fixed parameter string is used instead.
    param = "{'entityType':'CRMJPFX_XYK','limitNumber':1000,'entityCode':['CRMJPFX_XYK_KBB']}"
    script = BranchXYK(
        table_name=TABLE_NAME("CRMXYK"),
        collection_name="CRMJPFX_XYK",
        param=param,
    )
    script.main()
    script.close_client()
Exemplo n.º 15
0
    def generic_shuffle(self, data):
        """
        Clean one crawled Weibo post into table rows.

        Produces one ``CHA_BRANCH_WEIBO_INFO`` row for the post itself,
        followed by one ``CHA_BRANCH_WEIBO_COMMENT`` row for every entry of
        ``data["INFO_COMMENTS_"]``.

        :param data: raw post document (dict). Keys read directly include
                     BANK_CODE_, CONTENT_URL_, PUBLISH_TIME_, ENTITY_NAME_,
                     PRAISES_, REPLIES_, RELAYS_, CONTENT_ and INFO_COMMENTS_;
                     CONTENT_IMAGES_ and OWN_ are optional.
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts, the
                 info row first
        :raises IndexError: when CONTENT_URL_ has no ``scheme://host/`` prefix
        """
        re_data = list()
        # ---- CHA_BRANCH_WEIBO_INFO row ----
        info_data = dict()
        serial_number = req_for_serial_number(code="WEIBO_INFO")
        info_data["ID_"] = serial_number
        print(serial_number)

        info_data["ENTITY_CODE_"] = data["BANK_CODE_"]

        info_data["URL_"] = data["CONTENT_URL_"]

        info_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")
        # Data source URL (scheme + host of the post URL)
        source = re.findall(r"(https?://.*?)/", data["CONTENT_URL_"])
        info_data["SOURCE_"] = source[0]
        # Data source site name
        info_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]

        info_data["SOURCE_TYPE_"] = "WEIBO"

        # Counters default to 0 when the crawler delivered an empty value.
        info_data["LIKES_"] = data["PRAISES_"]
        if not info_data["LIKES_"]:
            info_data["LIKES_"] = 0
        info_data["COMMENTS_"] = data["REPLIES_"]
        if not info_data["COMMENTS_"]:
            info_data["COMMENTS_"] = 0
        info_data["RELAYS_"] = data["RELAYS_"]
        if not info_data["RELAYS_"]:
            info_data["RELAYS_"] = 0
        info_data["IMPORTANCE_"] = "N"
        info_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        info_data["CONTENT_"] = data["CONTENT_"]

        # Download every image and store it base64-encoded as IMAGE_<n>.
        # enumerate() gives each list position its own 1-based slot; the
        # earlier list.index() lookup mapped duplicate URLs onto the same
        # key and rescanned the list for every image.
        for position, each_image in enumerate(data.get("CONTENT_IMAGES_") or [], start=1):
            response = req_for_something(url=each_image)
            if response:
                t = base64.b64encode(response.content)
                info_data[f"IMAGE_{position}"] = t.decode("utf-8")
                response.close()

        # Fields to be filled in later (manual supplement)
        # info_data["TYPE_"] = data[""]
        # info_data["TYPE_CODE_"] = data[""]
        info_data["PUBLISH_STATUS_"] = "N"
        if "OWN_" in data:
            # "转载" marks a repost, i.e. not the account's own content.
            if data["OWN_"] == "转载":
                info_data["OWN_"] = "N"
            else:
                info_data["OWN_"] = "Y"

        # Attach the Weibo account code whose name matches this entity.
        for each in self.weibo_list:
            if each["WEIBO_NAME_"] == data["ENTITY_NAME_"]:
                info_data["WEIBO_CODE_"] = each["WEIBO_CODE_"]
                info_data["WEIBO_NAME_"] = each["WEIBO_NAME_"]
                break
        # ---- model calls ----
        # Summary
        try:
            brief = req_for_ts(info_data["CONTENT_"])
            if brief:
                info_data["BRIEF_"] = brief["summary"]
        except Exception as e:
            self.logger.info(f"调用模型req_for_ts失败,原因为{e}")
            info_data["BRIEF_"] = ""
        # Sensitivity check
        try:
            censor = req_for_censor(info_data["CONTENT_"])
            if censor:
                if censor["censor"] == "N":
                    info_data["SENSITIVE_"] = "N"
                else:
                    info_data["SENSITIVE_"] = "Y"
                    info_data["SENSITIVE_WORD_"] = censor["words"]
        except Exception as e:
            self.logger.info(f"调用模型censor失败,错误为{e}")
            info_data["SENSITIVE_"] = "N"

        info_data["VERSION_"] = "0"
        info_data = super(WeiboScript, self).generic_shuffle(data=data, re_data=info_data, field="ENTITY_NAME_")
        # Override BANK_NAME_ / BANK_CODE_ for accounts that need a fixed mapping
        if info_data["ENTITY_NAME_"] == "上海浦东发展银行微博":
            info_data["BANK_NAME_"] = "浦发银行"
            info_data["BANK_CODE_"] = "SPDB"
        if info_data["ENTITY_NAME_"] == "南海农商银行微博":
            info_data["BANK_NAME_"] = "广东南海农村商业银行股份有限公司"
            info_data["BANK_CODE_"] = "NRC"
        if info_data["ENTITY_NAME_"] == "顺德农商银行微博":
            info_data["BANK_NAME_"] = "广东顺德农村商业银行股份有限公司"
            info_data["BANK_CODE_"] = "sdebank"

        comment = data["INFO_COMMENTS_"]
        # Number of comments whose VERIFIED_ flag is truthy.
        verifieds = 0
        for c in comment:
            if c.get("VERIFIED_", ""):
                verifieds += 1

        # Weibo hotness level
        try:
            hot = req_for_weibo_hot(publish_time=info_data["PUBLISH_TIME_"], relays=info_data["RELAYS_"],
                                    replies=len(comment), praises=info_data["LIKES_"], verifieds=verifieds)
            if hot:
                info_data["HOT_"] = hot["level"]
        except Exception as e:
            self.logger.info(f"调用模型weibo_hot失败,错误为{e}")

        re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_INFO"), "DATA_": info_data})
        if comment:
            comment_count = 0
            for each in comment:
                # ---- CHA_BRANCH_WEIBO_COMMENT row ----
                # A fresh dict per comment; reusing one would duplicate data.
                comment_data = dict()
                # HBase row key
                serial_number = req_for_serial_number(code="WEIBO_COMMENT")
                comment_data["ID_"] = serial_number
                comment_data["INFO_ID_"] = info_data["ID_"]
                comment_data["COMMENT_"] = each["COMMENT_"]
                comment_data["REPLIER_TIME_"] = each["REPLIER_TIME_"]
                comment_data["REPLIER_HEAD_"] = each["REPLIER_HEAD_"]
                comment_data["REPLIER_PRAISES_"] = each["REPLIER_PRAISES_"]
                comment_data["REPLIER_"] = each["REPLIER_"]
                comment_data["REPLIER_REPLIES_"] = each["REPLIER_REPLIES_"]

                # The model calls only run for non-empty comment text.
                if each.get("COMMENT_"):
                    # Sentiment analysis
                    try:
                        sentiment = req_for_comment(each["COMMENT_"])
                        if sentiment:
                            if sentiment["sentiment"] == "中性":
                                comment_data["EMOTION_"] = "NORMAL"
                            if sentiment["sentiment"] == "积极":
                                comment_data["EMOTION_"] = "POSITIVE"
                            if sentiment["sentiment"] == "敏感":
                                # NOTE(review): "NAGETIVE" looks like a typo of
                                # "NEGATIVE"; kept because stored rows and
                                # consumers may already use this value.
                                comment_data["EMOTION_"] = "NAGETIVE"
                        else:
                            comment_data["EMOTION_"] = "NORMAL"
                    except Exception as e:
                        self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                        comment_data["EMOTION_"] = "NORMAL"

                    # Sensitivity check
                    try:
                        censor = req_for_censor(each["COMMENT_"])
                        if censor:
                            if censor["censor"] == "N":
                                comment_data["SENSITIVE_"] = "N"
                            else:
                                comment_data["SENSITIVE_"] = "Y"
                                comment_data["SENSITIVE_WORD_"] = censor["words"]
                        else:
                            comment_data["SENSITIVE_"] = "N"
                    except Exception as e:
                        # The log now names the call that actually failed.
                        self.logger.info(f"调用模型req_for_censor失败,错误为{e}")
                        comment_data["SENSITIVE_"] = "N"

                comment_data["VERSION_"] = "0"
                comment_data["CREATE_BY_ID_"] = "P0131857"
                comment_data["CREATE_BY_NAME_"] = "钟楷文"
                re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"), "DATA_": comment_data})
                comment_count += 1
            # Log comment statistics to ease debugging
            self.logger.info(f'清洗的URL为{info_data["URL_"]}')
            self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}')
            self.logger.info(f'插入到comment表的数量为{comment_count}')
        # print(re_data)
        return re_data
Exemplo n.º 16
0
        """
        通用清洗规则写这里, 如不需要通用清洗规则则不继承重写
        :param data:
        :param field:
        :return:
        """
        if isinstance(data, dict):
            re_data = self.__shuffle(data)
            return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
        elif isinstance(data, list):
            re_list = list()
            for each in data:
                re_data = self.__shuffle(each)
                re_list.append({
                    "TABLE_NAME_": self.script_name,
                    "DATA_": re_data
                })
            return re_list
        else:
            return


if __name__ == '__main__':
    # The command-line variant (param = sys.argv[1]) is disabled here; a
    # fixed parameter string is used instead.
    param = "{'entityType':'CRMJPFX_XT','limitNumber':1000,'entityCode':['CRMJPFX_XT_YYXTW']}"
    script = BranchOrganize(
        table_name=TABLE_NAME("CRMXT"),
        collection_name="CRMJPFX_XT",
        param=param,
    )
    script.main()
    script.close_client()
Exemplo n.º 17
0
                re_list.append({
                    "TABLE_NAME_": self.script_name,
                    "DATA_": re_data
                })
            return re_list
        else:
            return


if __name__ == '__main__':
    # The command-line variant (param = sys.argv[1]) is disabled here.

    # Entity codes kept for reference. The list is not passed to the
    # script; the commented entries are currently switched off.
    code_list = [
        # 'CRMJPFX_WD_JSYH',
        # 'CRMJPFX_WD_HXYH',
        # 'CRMJPFX_WD_BJYH',
        'CRMJPFX_WD_JTYH',
        # 'CRMJPFX_WD_XYYH',
        'CRMJPFX_WD_KBB_ALL',
        'CRMJPFX_WD_ZXYH',
        'CRMJPFX_WD_PAYH',
        'CRMJPFX_WD_PFYH',
        # 'CRMJPFX_WD_SHYH',
    ]

    # Fixed parameter string selecting a single entity code.
    param = "{'entityType':'ORGANIZE','limitNumber':20000,'entityCode':['CRMJPFX_WD_PAYH']}"
    script = BranchOrganize(
        table_name=TABLE_NAME("CRM_ORGANIZE"),
        collection_name="CRMJPFX_WD",
        param=param,
    )
    script.main()
    script.close_client()
Exemplo n.º 18
0
                "TABLE_NAME_":
                TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"),
                "DATA_":
                road_shuffle_data
            })
        return return_list


if __name__ == '__main__':
    # Reading the parameter from sys.argv (with a "{}" fallback) is disabled
    # here; a fixed parameter string is used instead.
    param = "{'entityType':'MAP_BAR','limitNumber':1,'entityCode':['MAPBAR_DEATAIL_BJ']}"

    # TODO: remove this selection once the MongoDB collections are unified.
    # The collection is chosen from markers found in the parameter string.
    beijing_markers = ("beijing", "MAPBAR_DEATAIL_BJ")
    shanghai_markers = ("shanghai", "'MAPBAR_DEATAIL'")
    if any(marker in param for marker in beijing_markers):
        collection = "mapbar_beijing"
    elif any(marker in param for marker in shanghai_markers):
        collection = "mapbar_shanghai"
    else:
        collection = "mapbar"

    script = MapbarScript(
        table_name=TABLE_NAME("CHA_BRANCH_FACILITY"),
        collection_name=collection,
        param=param,
        verify_field={"URL_": "URL_"},
    )
    script.main()
Exemplo n.º 19
0
if __name__ == '__main__':
    # The command-line variant (param = sys.argv[1]) is disabled here.
    # Fund cleaning has to look up the related PRO_CODE_.
    param = "{'entityType':'JRCP_JJ','limitNumber':100000,'entityCode':['JRCP_JJ_TTJJ_FJZ','JRCP_JJ_TTJJ_JZ']}"
    # Alternative parameter for the basic-information run:
    # param = "{'entityType':'JRCP_JJ','limitNumber':100000,'entityCode':['JRCP_JJ_TTJJ_JZ_ALL', 'JRCP_JJ_TTJJ_FJZ_ALL']}"

    # Every variant reads from the same collection; only the target table
    # and the verification fields differ.
    collection = "JRCP_JJ"
    if "JRCP_JJ_TTJJ_FJZ_ALL" in param or "JRCP_JJ_TTJJ_JZ_ALL" in param:
        # Tiantian Fund BASIC table (formerly CHA_BRANCH_FUND_BASIC)
        table_name = "CRMFUND_BASIC"
        verify_field = {"URL_": "URL_"}
    elif "GW_ALL" in param and "TTJJ" not in param:
        # Agency sales; CRM HBase table (formerly CHA_BRANCH_FUND_AGENCY).
        # verify_field is used for the mongo de-duplication query.
        table_name = "CRMFUND_AGENCY"
        verify_field = {"URL_": "URL_"}
    else:
        # Historical net value; CRM HBase table (formerly CHA_BRANCH_FUND_DATA).
        # Verification fields: the key is the HBase column, the value is the
        # key in the cleaned data. Joined they read like:
        # PRO_CODE_='000406' and TIME_='2019-05-30'
        table_name = "CRMFUND_DATA"
        verify_field = {'PRO_CODE_': 'PRO_CODE_', 'TIME_': 'TIME_'}

    script = BranchFund(
        table_name=TABLE_NAME(table_name),
        collection_name=collection,
        param=param,
        verify_field=verify_field,
    )
    script.main()
    script.close_client()
Exemplo n.º 20
0
        清洗规则写这里, 如不需要通用清洗规则则不继承
        :param data:
        :param field:
        :return:
        """

        if isinstance(data, dict):
            re_data = self.__shuffle(data)
            return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
        elif isinstance(data, list):
            re_list = list()
            for each in data:
                re_data = self.__shuffle(each)
                re_list.append({
                    "TABLE_NAME_": self.script_name,
                    "DATA_": re_data
                })
            return re_list
        else:
            return


if __name__ == '__main__':
    # The command-line variant (param = sys.argv[1]) is disabled here; a
    # fixed parameter string is used instead.
    param = "{'entityType':'WD_TY','limitNumber':1000,'entityCode':['WD_TY_HEBYH_GW_ALL']}"
    script = BranchOrganize(
        table_name=TABLE_NAME("CHA_BRANCH_ORGANIZE"),
        collection_name="WD_TY",
        param=param,
    )
    script.main()
    script.close_client()
Exemplo n.º 21
0
    def generic_shuffle(self, data, field="CONTENT_"):
        """
        Clean one crawled fund document and route it to its target table.

        The branch is chosen from ``data["ENTITY_CODE_"]``:

        * ``JRCP_JJ_TTJJ_FJZ_ALL`` / ``JRCP_JJ_TTJJ_JZ_ALL``: fund basic
          information, written to ``CRMFUND_BASIC``.
        * codes containing ``GW_ALL``: agency-sold fund, written to
          ``CRMFUND_AGENCY`` and enriched from ``CRMFUND_BASIC`` and
          ``CRMFUND_DATA``.
        * ``JRCP_JJ_TTJJ_FJZ`` / ``JRCP_JJ_TTJJ_JZ``: historical net value,
          written to ``CRMFUND_DATA``; matching agency rows are updated too.

        :param data: raw document (dict); must contain PRO_NAME_, PRO_CODE_
                     and ENTITY_CODE_
        :param field: accepted for interface compatibility; not read here
        :return: a one-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``,
                 or ``None`` when ENTITY_CODE_ matches none of the branches
        """
        # different shuffle rule
        re_data = dict()

        if "TAGS_" in data:
            re_data["TAGS_"] = ""

        # re_data["HOT_"] = data[""]

        re_data["PRO_NAME_"] = data["PRO_NAME_"]
        re_data["PRO_CODE_"] = data["PRO_CODE_"]
        # Basic information -> basic information table
        if data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_FJZ_ALL", "JRCP_JJ_TTJJ_JZ_ALL"]:
            data_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_BASIC"))
            # self.p_client.table_name = TABLE_NAME("CRMFUND_BASIC")
            source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
            re_data["SOURCE_"] = source[0]
            re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]

            # todo
            # re_data["SOURCE_CODE_"] = ""
            re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][8:12]
            basic_field_list = ["COM_NAME_", "FUND_TYPE_", "RISK_LEVEL_", "RELEASE_DATE_", "BUILD_DATE_",
                                "BUILD_SCAL_", "ASSET_SCAL_", "SHARE_SCAL_", "MANAGER_", "TRUSTEE_", "HANDLER_",
                                "DIVIDEND_", "MANAGE_FEE_RATE_", "HOST_FEE_RATE_", "SALE_FEE_RATE_", "MAX_SUB_RATE_",
                                "MAX_APPLY_RATE_", "MAX_REDEEM_RATE_", "BENCHMARK_", "BID_", "CLOSE_", "DIM_"]
            for basic_field in basic_field_list:
                if basic_field == "FUND_TYPE_":
                    fund_type = data.get("FUND_TYPE_", "其他")
                    re_data["FUND_TYPE_"] = fund_type
                    try:
                        re_data["FUND_TYPE_CODE_"] = self.ft_dict[data["FUND_TYPE_"]]
                    except KeyError:
                        # No exact match: fall back to a prefix match on the
                        # first two characters (the last matching key wins),
                        # then to the catch-all code "QT".
                        for ft in self.ft_dict:
                            if ft[:2] in fund_type:
                                re_data["FUND_TYPE_CODE_"] = self.ft_dict[ft]
                        if "FUND_TYPE_CODE_" not in re_data:
                            # self.logger.info(f"FUND_TYPE_CODE_ {fund_type}")
                            re_data["FUND_TYPE_CODE_"] = "QT"
                elif basic_field == "RISK_LEVEL_":
                    risk_level_ = data.get("RISK_LEVEL_", "未知")
                    # Keep only the part after the last "|" separator.
                    risk_level_ = re.split(r'[|]', risk_level_.strip())[-1] if risk_level_ else "未知"
                    re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
                    re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")
                elif basic_field == "MAX_REDEEM_RATE_":
                    max_redeem_rate_ = data.get("MAX_REDEEM_RATE_", "")
                    re_data["MAX_REDEEM_RATE_"] = re.split(r'[|]', max_redeem_rate_)[-1].replace(
                        "%", "") if max_redeem_rate_ else ""
                elif basic_field == "BENCHMARK_":
                    re_data[basic_field] = data.get(basic_field, "")
                elif basic_field == "BUILD_DATE_" or basic_field == "RELEASE_DATE_":
                    # "2019年05月30日" -> "2019-05-30"
                    basic_date = re.findall(r"(\d{4}年\d{2}月\d{1,2})日", data[basic_field])
                    if basic_date:
                        re_data[basic_field] = re.sub(r"[\u4e00-\u9fa5]", "-", basic_date[0])
                elif basic_field == "HANDLER_":
                    re_data[basic_field] = data.get(basic_field, "").replace('|', '')
                else:
                    re_data[basic_field] = data.get(basic_field, "").replace("%", "")
            # Extra total-assets field to simplify statistics
            if re_data["ASSET_SCAL_"]:
                # NOTE(review): the pattern has two capture groups, so
                # findall() yields tuples and ASSET_TOTAL_ receives a tuple.
                # Confirm whether the second pair of parentheses was meant
                # to be literal.
                asset_total = re.findall(r"(.*?亿元)(截止至:\d+年\d+月\d+日)", re_data["ASSET_SCAL_"])
                if len(asset_total) > 0:
                    re_data["ASSET_TOTAL_"] = asset_total[0]
                else:
                    re_data["ASSET_TOTAL_"] = '0'
            # Fund basic information always starts as CHECK
            re_data["DATA_STATUS_"] = "CHECK"
            re_data["DATA_VERSION_"] = "0"

            re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="TRUSTEE_")
            data_dict["DATA_"] = re_data
            return [data_dict]
        # Agency-sold fund -> agency fund table
        elif "GW_ALL" in data["ENTITY_CODE_"]:
            agency_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_AGENCY"))
            # self.p_client.table_name = "CRMFUND_AGENCY"
            # self.p_client.table_name = TABLE_NAME("CRMFUND_AGENCY")

            serial_number = req_for_serial_number(code="JRCP_JJ_AGENT")
            re_data["ID_"] = serial_number
            source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
            re_data["SOURCE_"] = source[0]
            re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
            re_data["PUBLISH_TIME_"] = data["DATETIME_"]
            re_data["SOURCE_TYPE_"] = ""
            # HOT_: agency funds currently do not need a hotness field
            # re_data["HOT_"] = data[""]

            re_data["RECOMMEND_"] = "N"
            re_data["GOOD_SALE_"] = "N"
            re_data["NEW_SALE_"] = "N"
            re_data["PUBLISH_STATUS_"] = "Y"
            re_data["DATA_STATUS_"] = "CHECK"
            re_data["VERSION_"] = "0"
            re_data["DATA_VERSION_"] = "0"
            # Enrich from the fund and fund-basic-information tables
            pro_code_ = data.get("PRO_CODE_")
            pro_name = data.get("PRO_NAME_")
            cur = self.connection.cursor()
            # NOTE(review): the queries below interpolate crawled values
            # straight into SQL text; consider bound parameters if the
            # driver supports them.
            # TODO lookup sometimes finds nothing
            # Query the fund basic information table
            if pro_code_:
                try:
                    re_data["PRO_CODE_"] = pro_code_
                    # The comma after "BUILD_DATE_" was missing, which fused
                    # it with "COM_NAME_" into one non-existent column name
                    # and shifted the column-to-key mapping below.
                    detail_list = ["RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_", "FUND_TYPE_CODE_", "BUILD_DATE_",
                                   "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
                    cur.execute(f"SELECT {','.join(detail_list)} "
                                f"FROM CRMFUND_BASIC WHERE PRO_CODE_='{str(data['PRO_CODE_'])}' "
                                f"ORDER BY CREATE_TIME_ DESC LIMIT 1")
                    # fetchone() returning None raises here and lands in the
                    # except branch, marking the row as UNCHECK.
                    for index, item in enumerate(cur.fetchone()):
                        re_data[detail_list[index]] = item
                except Exception as e:
                    self.logger.info(f"CRMFUND_BASIC lookup by PRO_CODE_ failed: {e}")
                    re_data["PUBLISH_STATUS_"] = "N"
                    re_data["DATA_STATUS_"] = "UNCHECK"
            elif pro_name:
                try:
                    pro_name = pro_name if not data.get("PRO_LIKE_NAME_") else data.get("PRO_LIKE_NAME_")
                    detail_list = ["PRO_CODE_", "RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_", "FUND_TYPE_CODE_",
                                   "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
                    cur.execute(f"SELECT {','.join(detail_list)} "
                                f"FROM CRMFUND_BASIC WHERE PRO_NAME_ LIKE '{pro_name}%' "
                                f"ORDER BY CREATE_TIME_ DESC LIMIT 1")
                    for index, item in enumerate(cur.fetchone()):
                        re_data[detail_list[index]] = item
                except Exception as e:
                    self.logger.info(f"CRMFUND_BASIC lookup by PRO_NAME_ failed: {e}")
                    re_data["PUBLISH_STATUS_"] = "N"
                    re_data["DATA_STATUS_"] = "UNCHECK"
            # Query the fund historical net value table
            if re_data.get("PRO_CODE_"):
                try:
                    cur.execute(f"SELECT BUY_STATUS_, NEW_NAV_, NEW_SYR_ "
                                f"FROM CRMFUND_DATA "
                                f"WHERE PRO_CODE_= '{str(re_data['PRO_CODE_'])}' "
                                f"ORDER BY TIME_ DESC LIMIT 1")
                    re_data["BUY_STATUS_"], re_data["NEW_NAV_"], re_data["NEW_SYR_"] = cur.fetchone()
                    if re_data["BUY_STATUS_"] and re_data["BUY_STATUS_"] in self.new_bs_dict:
                        re_data["BUY_STATUS_CODE_"] = self.new_bs_dict[re_data["BUY_STATUS_"]]
                except Exception as e:
                    self.logger.info(f"CRMFUND_DATA lookup failed: {e}")
                    re_data["PUBLISH_STATUS_"] = "N"
                    re_data["DATA_STATUS_"] = "UNCHECK"

                finally:
                    cur.close()
            else:
                # No product code resolved: nothing more to query, but the
                # cursor still has to be released.
                cur.close()
            if not re_data.get("RISK_LEVEL_"):
                if "RISK_LEVEL_" not in data:
                    risk_level_ = "未知"
                else:
                    risk_level_ = data["RISK_LEVEL_"]
                risk_level_ = re.split(r'[|]', risk_level_.strip())[-1] if risk_level_ else "未知"
                re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
                re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")
            # re_data["NEW_SYR_"] = data[""]
            # Without both fund type and release date the row is not published.
            if not (re_data.get("FUND_TYPE_") or re_data.get("RELEASE_DATE_")):
                re_data["PUBLISH_STATUS_"] = "N"
                re_data["DATA_STATUS_"] = "UNCHECK"
            re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
            agency_dict["DATA_"] = re_data
            return [agency_dict]
        # Historical net value -> fund data table
        elif data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_FJZ", "JRCP_JJ_TTJJ_JZ"]:
            serial_number = req_for_serial_number(code=data["ENTITY_CODE_"][:7])
            re_data["ID_"] = serial_number
            # re_data["FUND_BASIC_ID_"] = data[""]   link to BASIC_ID
            re_data["SERVICE_CHARGE_"] = data["SERVICE_CHARGE_"]
            re_data["RATING_AGENCIES_"] = data["RATING_AGENCIES_"].replace('jjpj', '')
            nom_field_list = ["TIME_", "NEW_NAV_", "NEW_ANV_", "OLD_TIME_", "OLD_NAV_", "OLD_ANV_", "DAY_GROWTH_",
                              "DAY_GROWTH_RATE_", "ONE_MONTH_RATE_", "THREE_MONTH_RATE_", "SIX_MONTH_RATE_",
                              "ONE_YEAR_RATE_", "THREE_YEAR_RATE_", "BUILD_RATE_", "NEW_TOI_", "NEW_SYR_", "OLD_TOI_",
                              "OLD_SYR_", "FYR_", "TYR_", "MARKET_PRICE_", "DISCOUNT_RATE_", "VERSION_",
                              "BUY_STATUS_", "REDEEM_STATUS_"]
            for nom_field in nom_field_list:
                if nom_field == "VERSION_":
                    re_data[nom_field] = "0"
                elif nom_field == "BUY_STATUS_":
                    re_data["BUY_STATUS_"] = data.get("BUY_STATUS_", "")
                    re_data["BUY_STATUS_CODE_"] = self.new_bs_dict.get(re_data["BUY_STATUS_"], "")
                elif nom_field == "REDEEM_STATUS_":
                    re_data["REDEEM_STATUS_"] = data.get("REDEEM_STATUS_")
                    re_data["REDEEM_STATUS_CODE_"] = self.rs_dict.get(re_data["REDEEM_STATUS_"], "")
                else:
                    # Strip percent signs and the "--" placeholder.
                    re_data[nom_field] = data.get(nom_field, "").replace("%", "")
                    re_data[nom_field] = re_data[nom_field].replace("--", "")
            re_data["CREATE_BY_ID_"] = CREATE_ID
            re_data["CREATE_BY_NAME_"] = CREATE_NAME
            if data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_FJZ",]:
                re_data["APY_FOURTEEN_"] = data.get("APY_FOURTEEN_")
                re_data["APY_TWENTY_EIGHT_"] = data.get("APY_TWENTY_EIGHT_")
                re_data["NEW_TOI_"] = data.get("NEW_TOI_")
                re_data["NEW_SYR_"] = data.get("NEW_SYR_")
                try:
                    # The 35-day figure comes from a separate crawled
                    # document with the same product code and date.
                    nh_doc = self.db_spider_data.JRCP_JJ.find_one({
                        'PRO_CODE_': data['PRO_CODE_'],
                        'TIME_': data['TIME_'],
                        'ENTITY_CODE_': 'JRCP_JJ_TTJJ_35NH'})
                    re_data["APY_THIRTY_FIVE_"] = round(float(dict(nh_doc).get('APY_THIRTY_FIVE_')) * 100) / 100.0
                except Exception:
                    # Missing document or non-numeric value: leave it empty.
                    re_data["APY_THIRTY_FIVE_"] = ''
            elif data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_JZ",]:
                re_data["NEW_WORTH_"] = data.get("NEW_WORTH_")

            # Fill in the T-1 (previous record) net values
            # self.p_client.table_name =
            cur = self.connection.cursor()
            try:
                cur.execute(f"SELECT NEW_NAV_,NEW_ANV_,NEW_TOI_,NEW_SYR_ FROM  CRMFUND_DATA where PRO_CODE_='{re_data['PRO_CODE_']}' and TIME_<'{re_data['TIME_']}' order by TIME_ desc limit 1")
                t_1data = cur.fetchone()
            finally:
                # Release the cursor whether or not the query succeeded.
                cur.close()
            if t_1data:
                self.logger.info(f"====T-1日数据===={t_1data}")
                # print(t_1data)
                re_data['OLD_NAV_'] = t_1data[0]
                re_data['OLD_ANV_'] = t_1data[1]
                re_data['OLD_TOI_'] = t_1data[2]
                re_data['OLD_SYR_'] = t_1data[3]

            # Propagate the latest values to the agency fund rows.
            # The shared client is pointed at the agency table for this.
            self.p_client.table_name = TABLE_NAME('CRMFUND_AGENCY')
            agences = self.p_client.search_all_from_phoenix(connection=self.connection, dict_status=True,
                                                            where_condition=f"PRO_CODE_='{re_data['PRO_CODE_']}'")
            if agences:
                while True:
                    try:
                        agence_data = next(agences)
                        self.logger.info(f"====更新代销基金数据===={agence_data}")
                        agence_data['NEW_NAV_'] = re_data['NEW_NAV_']
                        agence_data['NEW_SYR_'] = re_data['NEW_SYR_']
                        agence_data['BUY_STATUS_'] = re_data['BUY_STATUS_']
                        agence_data['BUY_STATUS_CODE_'] = re_data['BUY_STATUS_CODE_']
                    except Exception:
                        # StopIteration ends the loop normally; any other
                        # failure while reading also stops the update.
                        break
                    try:
                        self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=agence_data)
                    except jaydebeapi.DatabaseError:
                        continue
            # Point the client back at the data table.
            self.p_client.table_name = TABLE_NAME('CRMFUND_DATA')
            return [{"TABLE_NAME_": TABLE_NAME("CRMFUND_DATA"), "DATA_": re_data}]
Exemplo n.º 22
0
            del data["CONTENT_"]
            data["HTML_"] = html

            re_data = super(WechatScript,
                            self).generic_shuffle(data=data,
                                                  re_data=re_data,
                                                  field="ENTITY_NAME_")
            if re_data.get('_id'):
                del re_data['_id']
            return [{
                "TABLE_NAME_": self.p_client.table_name,
                "DATA_": re_data
            }]
        else:
            return


if __name__ == '__main__':
    # The command-line variant (param = sys.argv[1]) is disabled here; a
    # fixed parameter string limiting the run to two records is used.
    param = "{'limitNumber':'2'}"
    # Verification currently uses the title only; the title + WECHAT_ID_
    # variant is switched off.
    verify_field = {'TITLE_': 'TITLE_'}
    script = WechatScript(
        table_name=TABLE_NAME("CHA_BRANCH_WECHAT"),
        collection_name="WECHAT",
        param=param,
        verify_field=verify_field,
    )
    script.main()
    script.close_client()
Exemplo n.º 23
0
            else:
                if response:
                    try:
                        p_response = req_for_file_save(id=re_data["ID_"], type_code=f"CHA_INSURANCE_WORD",
                                                       file_name=data["PDF_NAME_"].replace(".doc", ""), postfix="doc",
                                                       file=response.content)
                        self.logger.info(f"{p_response.content.decode('utf-8')}")
                        p_response.close()
                    except Exception as e:
                        self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")
                    finally:
                        response.close()
                else:
                    self.logger.warning(f'id: {data["_id"]},获取PDF失败')

        if "HTML_" in data:
            del data["HTML_"]
        re_data = super(BranchInsurance, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
        re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
        return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]


if __name__ == '__main__':
    # Fixed parameter string selecting a single entity code.
    param = "{'entityType':'JRCP_BX','limitNumber':1000,'entityCode':['JRCP_BX_HEBYH_GW_ALL']}"
    # Verification currently uses the URL only; the URL + PRO_NAME_ variant
    # is switched off.
    verify_field = {'URL_': 'URL_'}

    script = BranchInsurance(
        table_name=TABLE_NAME("CHA_BRANCH_INSURANCE"),
        collection_name="JRCP_BX",
        param=param,
        verify_field=verify_field,
    )
    script.main()
    script.close_client()
Exemplo n.º 24
0
        return data

    def generic_shuffle(self, data, field="PRO_NAME_"):
        """
        Run the private per-record cleaning step and wrap each result.

        :param data: a single record (dict) or a list of records
        :param field: accepted for interface compatibility; not read here
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts, or
                 ``None`` when ``data`` is neither a dict nor a list
        """
        # Normalise both accepted input shapes to one list of records.
        if isinstance(data, dict):
            records = [data]
        elif isinstance(data, list):
            records = data
        else:
            return None

        def wrap(record):
            # Clean first, then read the table name, as the row is built.
            cleaned = self.__shuffle(record)
            return {"TABLE_NAME_": self.script_name, "DATA_": cleaned}

        return [wrap(record) for record in records]


if __name__ == '__main__':
    # Cleaning first checks the verify_field entries and then uses them for
    # a de-duplication query against HBase.
    # Bond data has no de-duplication, so it must not be inserted twice.
    # The command-line variant (param = sys.argv[1]) is disabled here.
    param = "{'entityType':'CRMJPFX_ZQ','limitNumber':10000,'entityCode':['CRMJPFX_ZQ_HXZQ']}"
    script = BranchOrganize(
        table_name=TABLE_NAME("CRMZQ"),
        collection_name="CRMJPFX_ZQ",
        param=param,
    )
    script.main()
    script.close_client()
Exemplo n.º 25
0
                            re_data["PROVINCE_NAME_"] = None
                            re_data["CITY_NAME_"] = data["CITY_"] + "市"
                            re_data["AREA_NAME_"] = None
                            re_data["AREA_CODE_"] = None
                            re_data["CITY_CODE_"] = None
                            re_data["PROVINCE_CODE_"] = None

                re_data["NAME_"] = shopping_name
                re_data = super(Branchsssq,
                                self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
                re_data_list.append({
                    "TABLE_NAME_": self.p_client.table_name,
                    "DATA_": re_data
                })
        # print(re_data_list)
        return re_data_list


if __name__ == '__main__':
    # Reading the parameters from the command line (param = sys.argv[1]) is
    # disabled here; a fixed record limit is used instead.
    verify_field = {'NAME_': 'NAME_'}
    param = "{'limitNumber':'20'}"

    script = Branchsssq(
        table_name=TABLE_NAME("CHA_BRANCH_BUSINESS"),
        collection_name="WD_SS_SQ",
        param=param,
        verify_field=verify_field,
    )
    script.main()
    script.close_client()
Exemplo n.º 26
0
            # basic_dict["TAGS_"] = data[""]
            # 数据来源 URL
            source = re.findall(r"(https?://.*?)/", data["URL_"])
            re_data["SOURCE_"] = source[0]
            # 数据来源 网站名称
            re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
            basic_dict["SOURCE_TYPE_"] = "链家"
            # basic_dict["PRICE_TYPE_"] = data[""]
            basic_dict["ADDR_"] = data["ADDR_"]

            return [{
                "TABLE_NAME_": self.data_table_name,
                "DATA_": data_dict
            }, {
                "TABLE_NAME_": self.base_table_name,
                "DATA_": basic_dict
            }]


if __name__ == '__main__':
    # Take the job parameters from the first command-line argument and fall
    # back to an empty parameter set when no argument was supplied.
    try:
        param = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; anything else is a real
        # error and should surface instead of silently running with "{}".
        param = "{}"

    script = BranchNews(table_name=TABLE_NAME("CHA_BRANCH_HOUSE_DATA"),
                        collection_name="WD_JZ_FJ_XM",
                        param=param)
    script.main()
    script.close_client()
Exemplo n.º 27
0
                    for each in self.bank_list:
                        if re_data.get('BANK_NAME_') in each['ALIAS_']:
                            bank_list.append(each["NAME_"])
                            bank_code_list.append(each["CODE_"])
                    if bank_list:
                        re_data["BANK_NAME_"] = "|".join(bank_list)
                    if bank_code_list:
                        re_data["BANK_CODE_"] = "|".join(bank_code_list)

        return [{"TABLE_NAME_": 'CRM_NEWS', "DATA_": re_data}]


if __name__ == '__main__':
    import traceback

    # Reading the parameters from the command line (param = sys.argv[1]) is
    # disabled here; the parameter string below is fixed.
    # Entity codes noted by the original author:
    #   CRMJPFX_ZXYQ_XLCJ_JRBGT
    #   CRMJPFX_ZXYQ_XLCJ_JRTS
    #   CRMJPFX_ZXYQ_XL_HMTS
    #   CRMJPFX_ZXYQ_XLCJ_GSDT
    # Only CRMJPFX_ZXYQ_XL_HMTS is selected at the moment.
    param = "{'entityType':'CRMNEWS','limitNumber':10000,'entityCode':['CRMJPFX_ZXYQ_XL_HMTS' ]}"

    # Run 100 passes; a failing pass is reported and the next pass starts.
    for _ in range(100):
        script = None
        try:
            script = BranchNews(table_name=TABLE_NAME("CRM_NEWS"), collection_name="CRMJPFX_ZXYQ", param=param)
            script.main()
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C and SystemExit
            # can still stop the loop, and report the failure instead of
            # discarding it.
            traceback.print_exc()
        finally:
            # Release the client even when the pass failed, so a failing run
            # does not leave up to 100 clients open.
            if script is not None:
                try:
                    script.close_client()
                except Exception:
                    traceback.print_exc()