if "IMAGES_" in data: pattern = re.compile(r"https:(http://.*)") if re.match(pattern, data["IMAGES_"]): a = re.match(pattern, data["IMAGES_"]) image_url = a.group(1) else: image_url = data["IMAGES_"] response = req_for_something(url=image_url) if response: t = base64.b64encode(response.content) re_data["IMAGE_"] = t.decode("utf-8") re_data = super(BranchXyk, self).generic_shuffle(data=data, re_data=re_data, field=None) # print(re_data) re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"] return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}] if __name__ == '__main__': param = sys.argv[1] # param = "{'limitNumber':'1000'}" verify_field = {"URL_": "URL_"} script = BranchXyk(table_name=TABLE_NAME("CHA_BRANCH_CREDITCARDARD"), collection_name="JRCP_XYK", param=param, verify_field=verify_field) script.main() script.close_client()
re_data["SUBWAY_NAME_"] = SUBWAY_NAME_ + "-" + re_data[ "STATION_NAME_"] re_data = super(Branchjtdt, self).generic_shuffle(data=data, re_data=re_data, field=None) return [{ "TABLE_NAME_": self.p_client.table_name, "DATA_": re_data }] if __name__ == '__main__': param = sys.argv[1] # param = "{'limitNumber':'1000'}" verify_field = {'SUBWAY_NAME_': 'SUBWAY_NAME_'} script = Branchjtdt(table_name=TABLE_NAME("CHA_BRANCH_SUBWAY"), collection_name="WD_JT_DT", param=param, verify_field=verify_field) script.main() script.close_client() # filelist = ['WD_JT_DT_BDDT_BJ', # 'WD_JT_DT_BDDT_CD', # 'WD_JT_DT_BDDT_NB', # 'WD_JT_DT_BDDT_NN', # 'WD_JT_DT_BDDT_SH', # 'WD_JT_DT_BDDT_XM'] # for i in filelist: # with open("{}.py".format(i), "w") as f: # f.write("from database._mongodb import MongoClient\n\n\ndef data_shuffle(data):\n\n return data\n\n\nif __name__ == '__main__':\n main_mongo = MongoClient(entity_code=\"{}\", mongo_collection=\"WD_JT_DT\")".format(i))
phone_number = re.sub(pattern5, r"\1 \2", data["TEL_"]) elif re.match(pattern6, data["TEL_"]): phone_number = re.sub(pattern6, r"\1 \2", data["TEL_"]) elif re.match(pattern7, data["TEL_"]): phone_number = re.sub(pattern7, r"\1 \2", data["TEL_"]) elif re.match(pattern8, data["TEL_"]): phone_number = re.sub(pattern8, r"\1 \2 \3", data["TEL_"]) elif re.match(pattern9, data["TEL_"]): phone_number = re.sub(pattern9, r"\1 \2", data["TEL_"]) else: phone_number = data["TEL_"] re_data["TEL_"] = phone_number # 地址 if "ADDR_" in data: re_data["ADDR_"] = data["ADDR_"] re_data = super(Branchssxx, self).generic_shuffle(data=data, re_data=re_data, field=None) return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}] if __name__ == '__main__': param = sys.argv[1] # param = "{'limit_Number': '1000'}" verify_field = {'URL_': 'URL_'} script = Branchssxx(table_name=TABLE_NAME("CHA_BRANCH_SCHOOL"), collection_name="WD_SS_XX", param=param, verify_field=verify_field) script.main() script.close_client()
if __name__ == '__main__':
    # Local import: this driver is the only user of ``ast`` in the block.
    import ast

    def _collection_for(entity_code):
        """Derive the Mongo collection name from an entity code.

        Codes whose second "_"-separated segment is "CJXW" keep their first
        three segments; every other code keeps the first two.
        """
        parts = entity_code.split("_")
        keep = 3 if parts[1] == "CJXW" else 2
        return "_".join(parts[:keep])

    def _run(collection, raw_param):
        """Run one BranchNews job and always release its client afterwards."""
        script = BranchNews(table_name=TABLE_NAME("CHA_BRANCH_NEWS"),
                            collection_name=collection,
                            param=raw_param)
        try:
            script.main()
        finally:
            script.close_client()

    # param = sys.argv[1]
    param = "{'entityType':'NEWS','limitNumber':10000,'entityCode':['ZX_GWDT_HEBYH_NHXW']}"

    # The parameter is a Python dict literal; literal_eval parses it without
    # executing arbitrary code the way eval() would.
    param_dict = ast.literal_eval(param)
    entity_code = param_dict.get("entityCode")

    if isinstance(entity_code, str):
        # Single code: hand the original parameter string through unchanged.
        _run(_collection_for(entity_code), param)
    elif isinstance(entity_code, list):
        # Several codes: run one job per code, each with its own copy of the
        # parameters so the parsed dict is never mutated while iterating.
        for each in entity_code:
            param_each = dict(param_dict, entityCode=each)
            _run(_collection_for(each), str(param_each))
def generic_shuffle(self, data):
    """Clean one raw Mapbar record into output rows.

    Reads ``NAME_``, ``ADDRESS_``, ``URL_``, ``_id``, ``TYPE_``, ``BTYPE_``,
    ``PHONE_``, ``BUS_`` and ``BUSSTOP_`` from ``data``.

    :param data: raw record (dict) taken from the source collection
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts; the
        CHA_BRANCH_FACILITY row always, plus a CHA_BRANCH_MAIN_ROUTE row
        when ``"道路"`` occurs in ``data["TYPE_"]``
    :raises Exception: when ``data["BTYPE_"]`` is not a key of
        ``self.type1_dict`` (record is not meant to be cleaned)
    """
    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="MAPBAR")
    re_data["NAME_"] = data["NAME_"]
    re_data["ADDRESS_"] = data["ADDRESS_"].replace("|", "").replace("地址:", "")
    re_data["HOT_"] = 0

    # Data source: scheme + host of the record URL.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # Data source: site name.
    re_data["SOURCE_NAME_"] = "图吧"
    re_data["SOURCE_TYPE_"] = "图吧"

    # Geocode the address. Coordinates default to "" up front so that a
    # failing lookup cannot leave the keys missing for the checks below.
    re_data["LNG_"] = ""
    re_data["LAT_"] = ""
    try:
        if re_data["ADDRESS_"]:
            location_result = get_lat_lng(address=re_data["ADDRESS_"])
            if location_result["status"] == 0:
                location = location_result["result"]["location"]
                lng, lat = str(location["lng"]), str(location["lat"])
                re_data["LNG_"], re_data["LAT_"] = lng, lat
            else:
                self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
    except Exception as e:
        self.logger.exception(f"_id: {data['_id']} 获取经纬度失败, error: {e}")

    # Reverse-geocode the coordinates into district / city / province.
    if re_data["LAT_"]:
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.exception(f"_id: {data['_id']} 获取地址失败, error: {e}")
        else:
            try:
                re_data["AREA_NAME_"] = area_result["result"][
                    "addressComponent"]["district"]
            except KeyError:
                re_data["AREA_NAME_"] = ""
            try:
                re_data["AREA_CODE_"] = area_result["result"][
                    "addressComponent"]["adcode"]
            except KeyError:
                re_data["AREA_CODE_"] = ""
            else:
                # City / province codes are prefixes of the area code
                # padded with "00".
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                for city in self.city_list:
                    if city["CODE_"] == re_data["CITY_CODE_"]:
                        re_data["CITY_NAME_"] = city["NAME_"]
                        break
                for prov in self.province_list:
                    if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break

    # Fallback: no city resolved above, so match the first two characters of
    # each known city name against the record's TYPE_ text.
    if not re_data.get("CITY_NAME_", ""):
        for city in self.city_list:
            if city["NAME_"][:2] in data["TYPE_"]:
                re_data["CITY_CODE_"] = city["CODE_"]
                re_data["CITY_NAME_"] = city["NAME_"]
                break
        if re_data.get("CITY_NAME_", ""):
            for prov in self.province_list:
                if prov["CODE_"][:2] == re_data["CITY_CODE_"][:2]:
                    re_data["PROVINCE_CODE_"] = prov["CODE_"]
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # CHA_BRANCH_MAIN_ROUTE (main roads): built from a snapshot of the fields
    # collected so far, with its own serial number and ADDR_ instead of
    # ADDRESS_.
    road_shuffle_data = None
    if "道路" in data["TYPE_"]:
        road_data = dict(re_data)
        road_data["ID_"] = req_for_serial_number(code="WD_GD")
        road_data["ADDR_"] = road_data.pop("ADDRESS_")
        road_shuffle_data = super(MapbarScript, self).generic_shuffle(
            data=data, re_data=road_data, field=None)

    # CHA_BRANCH_FACILITY (Mapbar facility row).
    re_data["TYPE1_"] = data["BTYPE_"]
    try:
        re_data["TYPE1_CODE_"] = self.type1_dict[re_data["TYPE1_"]]
    except KeyError:
        raise Exception("暂不需要清洗的数据")

    # Sub-category clean-up: several source categories are merged into one.
    merged_type2 = {
        "户外运动俱乐部": ("俱乐部", "JLB"),
        "赛马场及马术俱乐部": ("俱乐部", "JLB"),
        "室内运动健身俱乐部": ("俱乐部", "JLB"),
        "连锁店": ("便利店", "BLD"),
        "便利店": ("便利店", "BLD"),
        "电子商城": ("家电数码", "JDSM"),
        "电器商城": ("家电数码", "JDSM"),
        "诊所/卫生所": ("门诊/卫生所", "MZWSS"),
        "门诊/急诊部": ("门诊/卫生所", "MZWSS"),
    }
    # TYPE_ carries a two-character prefix before the category name.
    source_type2 = data["TYPE_"][2:]
    if source_type2 in merged_type2:
        re_data["TYPE2_"], re_data["TYPE2_CODE_"] = merged_type2[source_type2]
    else:
        re_data["TYPE2_"] = source_type2
        re_data["TYPE2_CODE_"] = self.type2_dict.get(re_data["TYPE2_"])

    re_data["SOURCE_TYPE1_"] = data["BTYPE_"]
    re_data["SOURCE_TYPE1_CODE_"] = self.type1_dict.get(
        re_data["SOURCE_TYPE1_"])
    re_data["SOURCE_TYPE2_"] = source_type2
    re_data["SOURCE_TYPE2_CODE_"] = self.source_type2_dict.get(
        re_data["SOURCE_TYPE2_"])
    re_data["PHONE_"] = data["PHONE_"].replace("无,", "")
    re_data["BUS_"] = data["BUS_"]
    re_data["BUSSTOP_"] = data["BUSSTOP_"]

    shuffle_data = super(MapbarScript, self).generic_shuffle(
        data=data, re_data=re_data, field=None)

    return_list = [{
        "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FACILITY"),
        "DATA_": shuffle_data
    }]
    if road_shuffle_data is not None:
        return_list.append({
            "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"),
            "DATA_": road_shuffle_data
        })
    return return_list
basic_dict["DELETE_STATUS_"] = "N" # basic_dict["TAGS_"] = data[""] # 数据来源 URL source = re.findall(r"(https?://.*?)/", data["URL_"]) re_data["SOURCE_"] = source[0] # 数据来源 网站名称 re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0] basic_dict["SOURCE_TYPE_"] = "链家" # basic_dict["PRICE_TYPE_"] = data[""] basic_dict["ADDR_"] = data["ADDR_"] return [{"TABLE_NAME_": self.data_table_name, "DATA_": data_dict}, {"TABLE_NAME_": self.base_table_name, "DATA_": basic_dict}] if __name__ == '__main__': try: # param = sys.argv[1] # 'WD_JZ_FJ_LJXQFJ_FS', 'WD_JZ_FJ_LIXQZL_FS' param = "{'entityType':'WD_JZ_FJ_FS','limitNumber':10000,'entityCode':['WD_JZ_FJ_LJXQFJ_FS']}" except Exception: param = "{}" script = BranchNews(table_name=TABLE_NAME("CHA_BRANCH_HOUSE_DATA"), collection_name="WD_JZ_FJ_FS", param=param) script.main() script.close_client()
comment_data["SENSITIVE_"] = "Y" comment_data["SENSITIVE_WORD_"] = censor["words"] else: comment_data["SENSITIVE_"] = "N" except Exception as e: self.logger.info(f"调用模型req_for_comment失败,错误为{e}") comment_data["SENSITIVE_"] = "N" comment_data["VERSION_"] = "0" comment_data["CREATE_BY_ID_"] = "P0131857" comment_data["CREATE_BY_NAME_"] = "钟楷文" re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"), "DATA_": comment_data}) comment_count += 1 # 打相关评论日志方便调试 self.logger.info(f'清洗的URL为{info_data["URL_"]}') self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}') self.logger.info(f'插入到comment表的数量为{comment_count}') # print(re_data) return re_data if __name__ == '__main__': # try: # param = sys.argv[1] # except Exception: # param = '{}' param = "{'limitNumber':'1000'}" script = WeiboScript(table_name=TABLE_NAME("CHA_BRANCH_WEIBO_INFO"), collection_name="WEIBOINFO", param=param) script.main() script.close_client()
# 站点描述 re_data["DESCRIBE_"] = data["DESCRIBE_"] # 周边站点 re_data["AROUND_STATIONS_"] = self.handle_special_text( data["AROUND_STATIONS_"]).replace("|", ",") # 途径路线 re_data["AROUND_ROUTE_"] = self.handle_special_text( data["AROUND_ROUTE_"]).replace("|", ",") if re_data["AROUND_ROUTE_"]: re_data["AROUND_ROUTE_"] = re_data["AROUND_ROUTE_"].replace( "公交线路", "") # 站点名称 re_data["NAME_"] = data["NAME_"] re_data = super(Branchjtgj, self).generic_shuffle(data=data, re_data=re_data, field=None) return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}] if __name__ == '__main__': param = sys.argv[1] verify_field = {'URL_': 'URL_'} # param = "{'limitNumber':'10000'}" script = Branchjtgj(table_name=TABLE_NAME("CHA_BRANCH_BUS_STATION"), collection_name="WD_JT_GJ", param=param, verify_field=verify_field) script.main() script.close_client()
if "NAME_" in data: re_data["NAME_"] = data["NAME_"] # 地址 if "ADDR_" in data: re_data["ADDR_"] = data["ADDR_"] # 床位 if "BEDS_" in data: re_data["BEDS_"] = data["BEDS_"] # 医院性质 if "TYPE_" in data: re_data["TYPE_"] = data["TYPE_"] # 网站 if "WEBSITE_" in data: re_data["WEBSITE_"] = data["WEBSITE_"] # 门诊量 if "VOLNUM_" in data: re_data["VOLNUM_"] = data["VOLNUM_"] # print(re_data) re_data = super(Branchssyy, self).generic_shuffle(data=data, re_data=re_data, field=None) return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}] if __name__ == '__main__': param = sys.argv[1] # param = "{'limitNumber':'1000'}" verify_field = {'URL_': 'URL_'} script = Branchssyy(table_name=TABLE_NAME("CHA_BRANCH_HOSPITAL"), collection_name="WD_SS_YY", param=param, verify_field=verify_field) script.main() script.close_client()
return data def generic_shuffle(self, data, field="PRO_NAME_"): """ 通用清洗规则写这里, 如不需要通用清洗规则则不继承重写 :param data: :param field: :return: """ if isinstance(data, dict): re_data = self.__shuffle(data) return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}] elif isinstance(data, list): re_list = list() for each in data: re_data = self.__shuffle(each) re_list.append({"TABLE_NAME_": self.script_name, "DATA_": re_data}) return re_list else: return if __name__ == '__main__': # param = sys.argv[1] param = "{'entityType':'CRMMARKETACT','limitNumber':1,'entityCode':['CRMJPFX_YXHD_PFYH']}" script = BranchOrganize(table_name=TABLE_NAME("CRM_MARKET_ACT"), collection_name="CRMJPFX_YXHD", param=param) script.main() script.close_client()
def __shuffle(self, data):
    """Clean one raw wealth-management product record.

    Records whose ``ENTITY_NAME_`` contains "中国理财网" are copied almost
    verbatim into a CRMLCCP row; every other record goes through the generic
    per-field clean-up and becomes a CHA_BRANCH_FINANCIAL_PRODUCT row.

    :param data: raw record (dict); ``data["REAL_DAYS_"]`` may be rewritten
        in place, as the original implementation did
    :return: ``{"TABLE_NAME_": ..., "DATA_": cleaned_dict}``
    """

    def funds_code(amount):
        """Return the START_FUNDS_CODE_ bucket for ``amount`` (None for NaN)."""
        amount = float(amount)
        if amount <= 10000:
            return "S0_1"
        if amount <= 50000:
            return "S1_5"
        if amount <= 100000:
            return "S5_10"
        if amount > 100000:
            return "S10_"
        return None

    def expand_units(text):
        """Strip spaces / "元" and expand Chinese magnitude words to zeros."""
        # Longer units come first so "千万" is not consumed as "千" + "万".
        for unit, digits in ((" ", ""), ("亿", "00000000"), ("千万", "0000000"),
                             ("百万", "000000"), ("十万", "00000"),
                             ("万", "0000"), ("千", "000"), ("百", "00"),
                             ("元", "")):
            text = text.replace(unit, digits)
        return text

    def attach_banks(target):
        """Add "|"-joined names/codes of banks found in ENTITY_NAME_."""
        entity_name = data.get("ENTITY_NAME_", "")
        matched = [b for b in self.bank_list if b["NAME_"] in entity_name]
        if matched:
            target["BANK_NAME_"] = "|".join(b["NAME_"] for b in matched)
            target["BANK_CODE_"] = "|".join(b["CODE_"] for b in matched)

    def mark_blank_numbers(target):
        """Replace empty or missing yield / start-funds values with "--"."""
        for key in ("YIELD_LOW_", "YIELD_HIGH_", "START_FUNDS_"):
            if not data.get(key):
                target[key] = '--'

    re_data = dict()
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]

    if "中国理财网" in data["ENTITY_NAME_"]:
        re_data["ID_"] = req_for_serial_number(code="JRCP_LCCP_INFO")
        for key in ("PRO_NAME_", "PRO_ORG_", "REGIST_CODE_", "PRO_STATUS_",
                    "OPT_MODE_", "YIELD_TYPE_", "CURRENCY_TYPE_",
                    "START_FUNDS_"):
            re_data[key] = data[key]
        try:
            code = funds_code(data["START_FUNDS_"])
            if code is not None:
                re_data["START_FUNDS_CODE_"] = code
        except (TypeError, ValueError):
            # Amount is not numeric: fall back to 0 and leave the code unset.
            re_data["START_FUNDS_"] = 0

        org = {
            '01': '国有银行',
            '02': '股份制银行',
            '03': '城商行',
            '04': '外资银行',
            '05': '农村合作金融机构',
            '06': '其他',
            '07': '其他',
            '08': '其他',
            '09': '其他',
            '00': '其他',
            '10': '理财子公司'
        }
        re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]
        re_data['ORG_TYPE_'] = org.get(data.get('ORG_TYPE_'))
        for key in ("RAISE_START_", "RAISE_END_", "PRO_START_", "PRO_END_",
                    "YIELD_LOW_", "YIELD_HIGH_", "REAL_DAYS_", "INVEST_TYPE_",
                    "DATE_TYPE_", "YIELD_", "RAISE_TYPE_",
                    "INVEST_PROPERTIES_", "BUS_START_", "BUS_END_",
                    "START_VALUE_", "PRO_VALUE_", "TOTAL_VALUE_",
                    "RECENT_YIELD_", "PRO_TYPE_", "SALE_AREA_"):
            re_data[key] = data[key]
        # Optional region fields; each one is guarded by its own key.
        for key in ("PROVINCE_NAME_", "PROVINCE_CODE_", "CITY_NAME_",
                    "CITY_CODE_"):
            if key in data:
                re_data[key] = data[key]

        attach_banks(re_data)
        re_data = super(BranchFinProduct, self).generic_shuffle(
            data=data, re_data=re_data, field=None)
        mark_blank_numbers(re_data)
        return {"TABLE_NAME_": TABLE_NAME("CRMLCCP"), "DATA_": re_data}

    # ---- generic (non-中国理财网) records ----
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
    re_data["ID_"] = req_for_serial_number(code="JRCP_LCCP")
    re_data["SOURCE_TYPE_"] = ""
    re_data["PRO_NAME_"] = data["PRO_NAME_"]
    # Issuer is the part of ENTITY_NAME_ before the first "-"; the whole name
    # is used when there is no "-".
    re_data["PRO_ORG_"] = data["ENTITY_NAME_"].split("-")[0]
    if "PRO_CODE_" in data:
        re_data["PRO_CODE_"] = data["PRO_CODE_"]

    # Registration code: take it from the record, otherwise look for a
    # "C" + 13 digits token in the attached PDF text.
    if "REGIST_CODE_" in data:
        re_data["REGIST_CODE_"] = data["REGIST_CODE_"]
    elif "PDF_" in data:
        try:
            text = parse(data["PDF_"])
            registration_code = re.findall(r"C\d{13}", text)
            if registration_code:
                re_data["REGIST_CODE_"] = registration_code[0]
        except Exception as e:
            self.logger.exception(
                f"2.1--err: PDF."
                f" 原始数据 collection = {self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                f" 原始数据 _id = {data['_id']};"
                f" error: {e}.")

    # Status is one of PRE / ON / STOP; every record here is marked ON.
    re_data["PRO_STATUS_"] = "ON"
    for key in ("OPT_MODE_", "YIELD_TYPE_", "CURRENCY_TYPE_"):
        if key in data:
            re_data[key] = data[key]

    # Minimum purchase amount.
    if "START_FUNDS_" in data:
        re_data["START_FUNDS_"] = expand_units(data["START_FUNDS_"])
        try:
            code = funds_code(re_data["START_FUNDS_"])
            if code is not None:
                re_data["START_FUNDS_CODE_"] = code
        except (TypeError, ValueError):
            re_data["START_FUNDS_"] = 0

    if "RISK_LEVEL_CODE_" in data:
        re_data["RISK_LEVEL_"] = self.risk_dict[data["RISK_LEVEL_CODE_"]]
        re_data["RISK_LEVEL_CODE_"] = data["RISK_LEVEL_CODE_"]
    if "RISK_LEVEL_" in data:
        re_data["SOURCE_RISK_LEVEL_"] = data["RISK_LEVEL_"]
    elif "SOURCE_RISK_LEVEL_" in data:
        re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]

    # Raise start/end and product start/end dates.
    for key in ("RAISE_START_", "RAISE_END_", "PRO_START_", "PRO_END_"):
        if key in data:
            re_data[key] = data[key]
    # Expected lowest / highest yield, without the percent sign.
    for key in ("YIELD_LOW_", "YIELD_HIGH_"):
        if key in data:
            re_data[key] = data[key].replace("%", "")

    # Term in days.
    if "REAL_DAYS_" in data:
        data["REAL_DAYS_"] = data["REAL_DAYS_"].replace(" ", "")
        term = data["REAL_DAYS_"]
        if "年" in term:
            try:
                re_data["REAL_DAYS_"] = int(term.replace("年", "")) * 365
            except ValueError:
                re_data["REAL_DAYS_"] = 0
        elif "月" in term:
            try:
                re_data["REAL_DAYS_"] = int(term.replace("月", "")) * 30
            except ValueError:
                re_data["REAL_DAYS_"] = 0
        else:
            re_data["REAL_DAYS_"] = term.replace("天", "")
    elif "PRO_START_" in data and "PRO_END_" in data:
        # No explicit term: derive it from the product start/end dates.
        t_start = arrow.get(data["PRO_START_"], "YYYY-MM-DD")
        t_end = arrow.get(data["PRO_END_"], "YYYY-MM-DD")
        real_days = (t_end - t_start).days
        re_data["REAL_DAYS_"] = real_days
        # The original stored the value on ``data`` only; that write is kept
        # in case the parent generic_shuffle reads it from there.
        data["REAL_DAYS_"] = real_days

    # INVEST_TYPE_, investor type (PRO_TYPE_) and sales area.
    for key in ("INVEST_TYPE_", "PRO_TYPE_", "SALE_AREA_"):
        if key in data:
            re_data[key] = data[key]
    # Redeemable or not: any "不" in the text means not redeemable.
    if "REDEEM_" in data:
        re_data["REDEEM_"] = "N" if "不" in data["REDEEM_"] else "Y"
    if "INCREASE_" in data:
        re_data["INCREASE_"] = expand_units(data["INCREASE_"])

    re_data["RECOMMEND_"] = "N"
    re_data["GOOD_SALE_"] = "N"
    re_data["NEW_SALE_"] = "N"
    re_data["SALE_SOURCE_"] = "NET"

    attach_banks(re_data)
    mark_blank_numbers(re_data)
    re_data = super(BranchFinProduct, self).generic_shuffle(
        data=data, re_data=re_data, field=None)
    re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
    return {
        "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FINANCIAL_PRODUCT"),
        "DATA_": re_data
    }
# else: # pro_code = None # if (pro_name, pro_code) in self.verify_list: # self.logger.info(f"重复值: {(pro_name, pro_code)}") # continue # else: re_data = self.__shuffle(each) re_list.append(re_data) return re_list if __name__ == '__main__': # param = sys.argv[1] param = "{'entityType':'CRMJPFX_LCCP','limitNumber':10000,'entityCode':['CRMJPFX_LCCP_ZGLCW']}" if "ZGLCW" in param: table_name = "CRMLCCP" # hbase 表 verify_field = { "PRO_NAME_": "PRO_NAME_", "REGIST_CODE_": "REGIST_CODE_" } # verify_field = {} else: table_name = "CHA_BRANCH_FINANCIAL_PRODUCT" verify_field = {"PRO_NAME_": "PRO_NAME_", "PRO_CODE_": "PRO_CODE_"} script = BranchFinProduct(table_name=TABLE_NAME(table_name), collection_name="CRMJPFX_LCCP", param=param, verify_field=verify_field) script.main() script.close_client()
# GDSZ_SZS_FGW_GHJH 2 # GDSZ_GDS_FGW_FZGH 2 # GDSZ_GDS_CJJ_GG 2 # GDSZ_GZS_TZCJJ_GKXX 2 # GDSZ_GDS_TZJG_XMBLJGGS 2 # GDSZ_SWS_FGW_GHJH 2 # GDSZ_SGS_FGW_FZGGGZ 2 # GDSZ_ZHS_FGW_FZGH 2 # GDSZ_SZS_SWJ_TZGG 2 # GDSZ_YFS_FGW_GHJH 2 3 # GDSZ_FSS_FGW_JHGH 2 # GDSZ_ZHS_SWJ_TZGG 2 # GDSZ_HZS_FGW_FZGH_BMXGWJ 2 # GDSZ_HYS_FGW_XMXX 2 # GDSZ_QYS_FGW_ZDLYZL 2 # GDSZ_CZS_FGW_ZDXM 2 # GDSZ_JYS_FGW_ZDXM 2 # GDSZ_FSS_SWJ_TPXW 2 # GDSZ_HZS_SWJ_SWZX 2 # GDSZ_STS_SWJ_GZDT 2 # GZSZ_MZS_SWJ_TZGG 2 3 # GDSZ_ZQS_SWJ_GZDT 2 3 # GDSZ_MMS_FGW_FZGH_TZGG 2 # GDSZ_SGS_SWJ_SWDT 2 # GDSZ_MMS_SWJ_SWXW 2 3 param = "{'entityType':'GOV_ZX_GDS','limitNumber':2000,'entityCode':['GDSZ_SWS_FGW_GHJH']}" script = BranchOrganize(table_name=TABLE_NAME("GOV_ZX_GDS"), collection_name="GOV_ZX_GDS", param=param) script.main() script.close_client()
data["SOURCE_NAME_"] = data["ENTITY_NAME_"] return data def generic_shuffle(self, data, field="BANK_NAME_"): """ 通用清洗规则写这里, 如不需要通用清洗规则则不继承重写 :param data: :param field: :return: """ if isinstance(data, dict): re_data = self.__shuffle(data) return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}] elif isinstance(data, list): re_list = list() for each in data: re_data = self.__shuffle(each) re_list.append({"TABLE_NAME_": self.script_name, "DATA_": re_data}) return re_list else: return if __name__ == '__main__': # param = sys.argv[1] param = "{'entityType':'CRMJPFX_XYK','limitNumber':1000,'entityCode':['CRMJPFX_XYK_KBB']}" script = BranchXYK(table_name=TABLE_NAME("CRMXYK"), collection_name="CRMJPFX_XYK", param=param) script.main() script.close_client()
def generic_shuffle(self, data):
    """Clean one raw Weibo post and its comments into output rows.

    :param data: raw post record (dict); its ``INFO_COMMENTS_`` entry is the
        list of raw comment dicts
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts; first the
        CHA_BRANCH_WEIBO_INFO row, then one CHA_BRANCH_WEIBO_COMMENT row per
        comment
    """
    re_data = list()

    # ---- CHA_BRANCH_WEIBO_INFO ----
    info_data = dict()
    serial_number = req_for_serial_number(code="WEIBO_INFO")
    info_data["ID_"] = serial_number
    self.logger.debug(serial_number)
    info_data["ENTITY_CODE_"] = data["BANK_CODE_"]
    info_data["URL_"] = data["CONTENT_URL_"]
    info_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")
    # Data source: scheme + host of the post URL.
    source = re.findall(r"(https?://.*?)/", data["CONTENT_URL_"])
    info_data["SOURCE_"] = source[0]
    # Data source: site name (text before the first "-").
    info_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    info_data["SOURCE_TYPE_"] = "WEIBO"
    # Counters: empty values become 0.
    info_data["LIKES_"] = data["PRAISES_"] or 0
    info_data["COMMENTS_"] = data["REPLIES_"] or 0
    info_data["RELAYS_"] = data["RELAYS_"] or 0
    info_data["IMPORTANCE_"] = "N"
    info_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
    info_data["CONTENT_"] = data["CONTENT_"]

    # Images are stored base64-encoded as IMAGE_1, IMAGE_2, ... by position.
    for position, each_image in enumerate(data.get("CONTENT_IMAGES_") or [],
                                          start=1):
        response = req_for_something(url=each_image)
        if not response:
            continue
        try:
            encoded = base64.b64encode(response.content)
            info_data[f"IMAGE_{position}"] = encoded.decode("utf-8")
        finally:
            response.close()

    info_data["PUBLISH_STATUS_"] = "N"
    if "OWN_" in data:
        # "转载" (repost) means the post is not original.
        info_data["OWN_"] = "N" if data["OWN_"] == "转载" else "Y"
    for each in self.weibo_list:
        if each["WEIBO_NAME_"] == data["ENTITY_NAME_"]:
            info_data["WEIBO_CODE_"] = each["WEIBO_CODE_"]
            info_data["WEIBO_NAME_"] = each["WEIBO_NAME_"]
            break

    # Model: summary. Stays "" when the model fails or returns nothing.
    info_data["BRIEF_"] = ""
    try:
        brief = req_for_ts(info_data["CONTENT_"])
        if brief:
            info_data["BRIEF_"] = brief["summary"]
    except Exception as e:
        self.logger.info(f"调用模型req_for_ts失败,原因为{e}")
        info_data["BRIEF_"] = ""

    # Model: sensitivity. Stays "N" when the model fails or returns nothing.
    info_data["SENSITIVE_"] = "N"
    try:
        censor = req_for_censor(info_data["CONTENT_"])
        if censor and censor["censor"] != "N":
            info_data["SENSITIVE_"] = "Y"
            info_data["SENSITIVE_WORD_"] = censor["words"]
    except Exception as e:
        self.logger.info(f"调用模型censor失败,错误为{e}")
        info_data["SENSITIVE_"] = "N"
    info_data["VERSION_"] = "0"

    info_data = super(WeiboScript, self).generic_shuffle(
        data=data, re_data=info_data, field="ENTITY_NAME_")

    # Fixed BANK_NAME_ / BANK_CODE_ overrides for specific accounts.
    bank_overrides = {
        "上海浦东发展银行微博": ("浦发银行", "SPDB"),
        "南海农商银行微博": ("广东南海农村商业银行股份有限公司", "NRC"),
        "顺德农商银行微博": ("广东顺德农村商业银行股份有限公司", "sdebank"),
    }
    override = bank_overrides.get(info_data["ENTITY_NAME_"])
    if override is not None:
        info_data["BANK_NAME_"], info_data["BANK_CODE_"] = override

    comment = data["INFO_COMMENTS_"]
    verifieds = sum(1 for c in comment if c.get("VERIFIED_", ""))

    # Model: post hotness.
    try:
        hot = req_for_weibo_hot(publish_time=info_data["PUBLISH_TIME_"],
                                relays=info_data["RELAYS_"],
                                replies=len(comment),
                                praises=info_data["LIKES_"],
                                verifieds=verifieds)
        if hot:
            info_data["HOT_"] = hot["level"]
    except Exception as e:
        self.logger.info(f"调用模型weibo_hot失败,错误为{e}")

    re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_INFO"),
                    "DATA_": info_data})

    # ---- CHA_BRANCH_WEIBO_COMMENT ----
    # NOTE(review): "NAGETIVE" is kept as spelled because stored data and
    # consumers may depend on it; confirm before correcting.
    emotion_by_sentiment = {"中性": "NORMAL", "积极": "POSITIVE",
                            "敏感": "NAGETIVE"}
    comment_count = 0
    for each in comment:
        # A fresh dict per comment, otherwise rows would share state.
        comment_data = dict()
        # HBase row key.
        comment_data["ID_"] = req_for_serial_number(code="WEIBO_COMMENT")
        comment_data["INFO_ID_"] = info_data["ID_"]
        for key in ("COMMENT_", "REPLIER_TIME_", "REPLIER_HEAD_",
                    "REPLIER_PRAISES_", "REPLIER_", "REPLIER_REPLIES_"):
            comment_data[key] = each[key]

        comment_data["SENSITIVE_"] = "N"
        if each["COMMENT_"]:
            # Model: sentiment. Anything unrecognised counts as NORMAL.
            try:
                sentiment = req_for_comment(each["COMMENT_"])
                if sentiment:
                    comment_data["EMOTION_"] = emotion_by_sentiment.get(
                        sentiment["sentiment"], "NORMAL")
                else:
                    comment_data["EMOTION_"] = "NORMAL"
            except Exception as e:
                self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                comment_data["EMOTION_"] = "NORMAL"

            # Model: sensitivity.
            try:
                censor = req_for_censor(each["COMMENT_"])
                if censor and censor["censor"] != "N":
                    comment_data["SENSITIVE_"] = "Y"
                    comment_data["SENSITIVE_WORD_"] = censor["words"]
            except Exception as e:
                self.logger.info(f"调用模型req_for_censor失败,错误为{e}")
                comment_data["SENSITIVE_"] = "N"

        comment_data["VERSION_"] = "0"
        comment_data["CREATE_BY_ID_"] = "P0131857"
        comment_data["CREATE_BY_NAME_"] = "钟楷文"
        re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"),
                        "DATA_": comment_data})
        comment_count += 1

    # Comment statistics, logged to make debugging easier.
    self.logger.info(f'清洗的URL为{info_data["URL_"]}')
    self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}')
    self.logger.info(f'插入到comment表的数量为{comment_count}')
    return re_data
""" 通用清洗规则写这里, 如不需要通用清洗规则则不继承重写 :param data: :param field: :return: """ if isinstance(data, dict): re_data = self.__shuffle(data) return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}] elif isinstance(data, list): re_list = list() for each in data: re_data = self.__shuffle(each) re_list.append({ "TABLE_NAME_": self.script_name, "DATA_": re_data }) return re_list else: return if __name__ == '__main__': # param = sys.argv[1] param = "{'entityType':'CRMJPFX_XT','limitNumber':1000,'entityCode':['CRMJPFX_XT_YYXTW']}" script = BranchOrganize(table_name=TABLE_NAME("CRMXT"), collection_name="CRMJPFX_XT", param=param) script.main() script.close_client()
re_list.append({ "TABLE_NAME_": self.script_name, "DATA_": re_data }) return re_list else: return if __name__ == '__main__': # param = sys.argv[1] code_list = [ # 'CRMJPFX_WD_JSYH', # 'CRMJPFX_WD_HXYH', # 'CRMJPFX_WD_BJYH', 'CRMJPFX_WD_JTYH', # 'CRMJPFX_WD_XYYH', 'CRMJPFX_WD_KBB_ALL', 'CRMJPFX_WD_ZXYH', 'CRMJPFX_WD_PAYH', 'CRMJPFX_WD_PFYH', # 'CRMJPFX_WD_SHYH', ] param = "{'entityType':'ORGANIZE','limitNumber':20000,'entityCode':['CRMJPFX_WD_PAYH']}" script = BranchOrganize(table_name=TABLE_NAME("CRM_ORGANIZE"), collection_name="CRMJPFX_WD", param=param) script.main() script.close_client()
"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"), "DATA_": road_shuffle_data }) return return_list if __name__ == '__main__': # try: # param = sys.argv[1] # except Exception: # # param = "{'entityCode': 'MAPBAR_DEATAIL', 'limitNumber':2}" # param = "{}" param = "{'entityType':'MAP_BAR','limitNumber':1,'entityCode':['MAPBAR_DEATAIL_BJ']}" # todo remove these code if MongoDB collection is unified if "beijing" in param or "MAPBAR_DEATAIL_BJ" in param: collection = "mapbar_beijing" elif "shanghai" in param or "'MAPBAR_DEATAIL'" in param: collection = "mapbar_shanghai" else: collection = "mapbar" script = MapbarScript(table_name=TABLE_NAME("CHA_BRANCH_FACILITY"), collection_name=collection, param=param, verify_field={"URL_": "URL_"}) script.main()
if __name__ == '__main__':
    # param = sys.argv[1]
    # Funds need the related PRO_CODE_ to be looked up.
    param = "{'entityType':'JRCP_JJ','limitNumber':100000,'entityCode':['JRCP_JJ_TTJJ_FJZ','JRCP_JJ_TTJJ_JZ']}"
    # param = "{'entityType':'JRCP_JJ','limitNumber':100000,'entityCode':['JRCP_JJ_TTJJ_JZ_ALL', 'JRCP_JJ_TTJJ_FJZ_ALL']}"

    # Every variant reads from the same Mongo collection.
    collection = "JRCP_JJ"

    # Pick the CRM HBase table and the de-duplication fields from the
    # entity codes mentioned in the parameter string.
    basic_codes = ("JRCP_JJ_TTJJ_FJZ_ALL", "JRCP_JJ_TTJJ_JZ_ALL")
    if any(code in param for code in basic_codes):
        # Tiantian fund BASIC table.
        table_name, verify_field = "CRMFUND_BASIC", {"URL_": "URL_"}
    elif "GW_ALL" in param and not ("TTJJ" in param):
        # Agency (distribution) funds; verify_field drives the Mongo
        # de-duplication query.
        table_name, verify_field = "CRMFUND_AGENCY", {"URL_": "URL_"}
    else:
        # Historical net values. Keys are HBase columns and values are keys
        # of the cleaned record, combined as:
        # PRO_CODE_='000406' and TIME_='2019-05-30'
        table_name = "CRMFUND_DATA"
        verify_field = {'PRO_CODE_': 'PRO_CODE_', 'TIME_': 'TIME_'}

    script = BranchFund(
        table_name=TABLE_NAME(table_name),
        collection_name=collection,
        param=param,
        verify_field=verify_field,
    )
    script.main()
    script.close_client()
清洗规则写这里, 如不需要通用清洗规则则不继承 :param data: :param field: :return: """ if isinstance(data, dict): re_data = self.__shuffle(data) return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}] elif isinstance(data, list): re_list = list() for each in data: re_data = self.__shuffle(each) re_list.append({ "TABLE_NAME_": self.script_name, "DATA_": re_data }) return re_list else: return if __name__ == '__main__': # param = sys.argv[1] param = "{'entityType':'WD_TY','limitNumber':1000,'entityCode':['WD_TY_HEBYH_GW_ALL']}" script = BranchOrganize(table_name=TABLE_NAME("CHA_BRANCH_ORGANIZE"), collection_name="WD_TY", param=param) script.main() script.close_client()
def generic_shuffle(self, data, field="CONTENT_"):
    """
    Clean one raw fund record and route it to the table it belongs to.

    The branch is selected by ``data["ENTITY_CODE_"]``:

    * ``JRCP_JJ_TTJJ_FJZ_ALL`` / ``JRCP_JJ_TTJJ_JZ_ALL``: fund basic
      information, returned for ``CRMFUND_BASIC``.
    * any code containing ``GW_ALL``: agency (distribution) fund, enriched
      from ``CRMFUND_BASIC`` and ``CRMFUND_DATA`` and returned for
      ``CRMFUND_AGENCY``.
    * ``JRCP_JJ_TTJJ_FJZ`` / ``JRCP_JJ_TTJJ_JZ``: historical net value,
      returned for ``CRMFUND_DATA``. As a side effect the ``CRMFUND_AGENCY``
      rows with the same ``PRO_CODE_`` are upserted with the new figures.

    :param data: raw record (dict). ``PRO_NAME_``, ``PRO_CODE_`` and
        ``ENTITY_CODE_`` are read unconditionally, so a missing key raises
        ``KeyError``.
    :param field: not used by this override; kept for signature compatibility.
    :return: ``[{"TABLE_NAME_": ..., "DATA_": ...}]`` for a recognised entity
        code, otherwise ``None``.
    """

    def sql_text(value):
        # Double embedded single quotes so scraped text cannot terminate the
        # SQL string literal it is interpolated into.
        return str(value).replace("'", "''")

    def last_risk_level(raw):
        # Keep the last "|"-separated segment; empty/None means "unknown".
        return re.split(r'[|]', raw.strip())[-1] if raw else "未知"

    # Flags applied whenever an agency record cannot be fully enriched.
    unchecked = {"PUBLISH_STATUS_": "N", "DATA_STATUS_": "UNCHECK"}

    re_data = dict()
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    re_data["PRO_NAME_"] = data["PRO_NAME_"]
    re_data["PRO_CODE_"] = data["PRO_CODE_"]
    entity_code = data["ENTITY_CODE_"]

    # ---- Basic information -> CRMFUND_BASIC ----
    if entity_code in ("JRCP_JJ_TTJJ_FJZ_ALL", "JRCP_JJ_TTJJ_JZ_ALL"):
        data_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_BASIC"))
        # Scheme + host of the source URL.
        source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
        re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        # TODO: SOURCE_CODE_ is not populated yet.
        re_data["SOURCE_TYPE_"] = entity_code[8:12]

        basic_field_list = ["COM_NAME_", "FUND_TYPE_", "RISK_LEVEL_", "RELEASE_DATE_", "BUILD_DATE_",
                            "BUILD_SCAL_", "ASSET_SCAL_", "SHARE_SCAL_", "MANAGER_", "TRUSTEE_",
                            "HANDLER_", "DIVIDEND_", "MANAGE_FEE_RATE_", "HOST_FEE_RATE_",
                            "SALE_FEE_RATE_", "MAX_SUB_RATE_", "MAX_APPLY_RATE_", "MAX_REDEEM_RATE_",
                            "BENCHMARK_", "BID_", "CLOSE_", "DIM_"]
        for basic_field in basic_field_list:
            if basic_field == "FUND_TYPE_":
                fund_type = data.get("FUND_TYPE_", "其他")
                re_data["FUND_TYPE_"] = fund_type
                try:
                    re_data["FUND_TYPE_CODE_"] = self.ft_dict[data["FUND_TYPE_"]]
                except KeyError:
                    # No exact match: fall back to any known type whose first
                    # two characters occur in the raw value (last match wins).
                    for ft, ft_code in self.ft_dict.items():
                        if ft[:2] in fund_type:
                            re_data["FUND_TYPE_CODE_"] = ft_code
                    if "FUND_TYPE_CODE_" not in re_data:
                        re_data["FUND_TYPE_CODE_"] = "QT"
            elif basic_field == "RISK_LEVEL_":
                risk_level_ = last_risk_level(data.get("RISK_LEVEL_", "未知"))
                re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
                re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")
            elif basic_field == "MAX_REDEEM_RATE_":
                max_redeem_rate_ = data.get("MAX_REDEEM_RATE_", "")
                re_data["MAX_REDEEM_RATE_"] = (
                    re.split(r'[|]', max_redeem_rate_)[-1].replace("%", "")
                    if max_redeem_rate_ else ""
                )
            elif basic_field == "BENCHMARK_":
                re_data[basic_field] = data.get(basic_field, "")
            elif basic_field in ("BUILD_DATE_", "RELEASE_DATE_"):
                # A missing date is treated like an unparsable one: the field is
                # simply left out, consistent with the .get() used elsewhere.
                basic_date = re.findall(r"(\d{4}年\d{2}月\d{1,2})日", data.get(basic_field) or "")
                if basic_date:
                    # Replace the CJK separators with "-" to get YYYY-MM-DD.
                    re_data[basic_field] = re.sub(r"[\u4e00-\u9fa5]", "-", basic_date[0])
            elif basic_field == "HANDLER_":
                re_data[basic_field] = data.get(basic_field, "").replace('|', '')
            else:
                re_data[basic_field] = data.get(basic_field, "").replace("%", "")

        # Extra total-assets field to make statistics easier. Only the amount
        # (first group) is stored; '0' when the scale is empty or unparsable.
        asset_match = re.search(r"(.*?亿元)(截止至:\d+年\d+月\d+日)", re_data["ASSET_SCAL_"])
        re_data["ASSET_TOTAL_"] = asset_match.group(1) if asset_match else '0'

        # Basic fund information always starts in CHECK status.
        re_data["DATA_STATUS_"] = "CHECK"
        re_data["DATA_VERSION_"] = "0"
        re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="TRUSTEE_")
        data_dict["DATA_"] = re_data
        return [data_dict]

    # ---- Agency (distribution) fund -> CRMFUND_AGENCY ----
    elif "GW_ALL" in entity_code:
        agency_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_AGENCY"))
        re_data["ID_"] = req_for_serial_number(code="JRCP_JJ_AGENT")
        source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
        re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["PUBLISH_TIME_"] = data["DATETIME_"]
        re_data["SOURCE_TYPE_"] = ""
        # HOT_ is intentionally omitted: agency funds do not need a
        # popularity field for now.
        re_data["RECOMMEND_"] = "N"
        re_data["GOOD_SALE_"] = "N"
        re_data["NEW_SALE_"] = "N"
        re_data["PUBLISH_STATUS_"] = "Y"
        re_data["DATA_STATUS_"] = "CHECK"
        re_data["VERSION_"] = "0"
        re_data["DATA_VERSION_"] = "0"

        pro_code_ = data.get("PRO_CODE_")
        pro_name = data.get("PRO_NAME_")

        # Pick the lookup key for the basic-information table: the fund code
        # when present, otherwise a prefix match on the fund name.
        detail_list = None
        where_clause = None
        if pro_code_:
            re_data["PRO_CODE_"] = pro_code_
            detail_list = ["RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_", "FUND_TYPE_CODE_",
                           "BUILD_DATE_", "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
            where_clause = f"PRO_CODE_='{sql_text(pro_code_)}'"
        elif pro_name:
            like_name = data.get("PRO_LIKE_NAME_") or pro_name
            detail_list = ["PRO_CODE_", "RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_",
                           "FUND_TYPE_CODE_", "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
            where_clause = f"PRO_NAME_ LIKE '{sql_text(like_name)}%'"

        cur = self.connection.cursor()
        try:
            # Enrich from the fund basic-information table.
            if detail_list:
                try:
                    cur.execute(f"SELECT {','.join(detail_list)} "
                                f"FROM CRMFUND_BASIC WHERE {where_clause} "
                                f"ORDER BY CREATE_TIME_ DESC LIMIT 1")
                    basic_row = cur.fetchone()
                    if basic_row is None:
                        re_data.update(unchecked)
                    else:
                        re_data.update(zip(detail_list, basic_row))
                except Exception as e:
                    # Best effort: keep the record but flag it for review.
                    self.logger.warning(f"CRMFUND_BASIC lookup failed: {e}")
                    re_data.update(unchecked)

            # Enrich from the fund historical net-value table.
            if re_data.get("PRO_CODE_"):
                try:
                    cur.execute(f"SELECT BUY_STATUS_, NEW_NAV_, NEW_SYR_ "
                                f"FROM CRMFUND_DATA "
                                f"WHERE PRO_CODE_= '{sql_text(re_data['PRO_CODE_'])}' "
                                f"ORDER BY TIME_ DESC LIMIT 1")
                    nav_row = cur.fetchone()
                    if nav_row is None:
                        re_data.update(unchecked)
                    else:
                        re_data["BUY_STATUS_"], re_data["NEW_NAV_"], re_data["NEW_SYR_"] = nav_row
                        if re_data["BUY_STATUS_"] and re_data["BUY_STATUS_"] in self.new_bs_dict:
                            re_data["BUY_STATUS_CODE_"] = self.new_bs_dict[re_data["BUY_STATUS_"]]
                except Exception as e:
                    self.logger.warning(f"CRMFUND_DATA lookup failed: {e}")
                    re_data.update(unchecked)
        finally:
            # Close the cursor on every path, including when PRO_CODE_ is empty.
            cur.close()

        # Fall back to the crawled risk level when the basic table gave none.
        if not re_data.get("RISK_LEVEL_"):
            risk_level_ = last_risk_level(data.get("RISK_LEVEL_", "未知"))
            re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
            re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")

        # Without a fund type or a release date the record is not publishable.
        if not (re_data.get("FUND_TYPE_") or re_data.get("RELEASE_DATE_")):
            re_data.update(unchecked)

        re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
        agency_dict["DATA_"] = re_data
        return [agency_dict]

    # ---- Historical net value -> CRMFUND_DATA ----
    elif entity_code in ("JRCP_JJ_TTJJ_FJZ", "JRCP_JJ_TTJJ_JZ"):
        re_data["ID_"] = req_for_serial_number(code=entity_code[:7])
        re_data["SERVICE_CHARGE_"] = data["SERVICE_CHARGE_"]
        re_data["RATING_AGENCIES_"] = data["RATING_AGENCIES_"].replace('jjpj', '')

        nom_field_list = ["TIME_", "NEW_NAV_", "NEW_ANV_", "OLD_TIME_", "OLD_NAV_", "OLD_ANV_",
                          "DAY_GROWTH_", "DAY_GROWTH_RATE_", "ONE_MONTH_RATE_", "THREE_MONTH_RATE_",
                          "SIX_MONTH_RATE_", "ONE_YEAR_RATE_", "THREE_YEAR_RATE_", "BUILD_RATE_",
                          "NEW_TOI_", "NEW_SYR_", "OLD_TOI_", "OLD_SYR_", "FYR_", "TYR_",
                          "MARKET_PRICE_", "DISCOUNT_RATE_", "VERSION_", "BUY_STATUS_",
                          "REDEEM_STATUS_"]
        for nom_field in nom_field_list:
            if nom_field == "VERSION_":
                re_data[nom_field] = "0"
            elif nom_field == "BUY_STATUS_":
                re_data["BUY_STATUS_"] = data.get("BUY_STATUS_", "")
                re_data["BUY_STATUS_CODE_"] = self.new_bs_dict.get(re_data["BUY_STATUS_"], "")
            elif nom_field == "REDEEM_STATUS_":
                re_data["REDEEM_STATUS_"] = data.get("REDEEM_STATUS_")
                re_data["REDEEM_STATUS_CODE_"] = self.rs_dict.get(re_data["REDEEM_STATUS_"], "")
            else:
                # Strip percent signs and the "--" placeholder.
                re_data[nom_field] = data.get(nom_field, "").replace("%", "").replace("--", "")

        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME

        if entity_code == "JRCP_JJ_TTJJ_FJZ":
            re_data["APY_FOURTEEN_"] = data.get("APY_FOURTEEN_")
            re_data["APY_TWENTY_EIGHT_"] = data.get("APY_TWENTY_EIGHT_")
            re_data["NEW_TOI_"] = data.get("NEW_TOI_")
            re_data["NEW_SYR_"] = data.get("NEW_SYR_")
            try:
                # The 35-day yield lives in a separate Mongo document for the
                # same fund and date; round it to two decimals.
                apy_doc = self.db_spider_data.JRCP_JJ.find_one(
                    {'PRO_CODE_': data['PRO_CODE_'], 'TIME_': data['TIME_'],
                     'ENTITY_CODE_': 'JRCP_JJ_TTJJ_35NH'})
                re_data["APY_THIRTY_FIVE_"] = round(
                    float(dict(apy_doc).get('APY_THIRTY_FIVE_')) * 100) / 100.0
            except Exception:
                # Missing document or unparsable value: leave the field blank.
                re_data["APY_THIRTY_FIVE_"] = ''
        else:
            re_data["NEW_WORTH_"] = data.get("NEW_WORTH_")

        pro_code_sql = sql_text(re_data['PRO_CODE_'])

        # Fill the T-1 figures from the most recent earlier row.
        cur = self.connection.cursor()
        try:
            cur.execute(f"SELECT NEW_NAV_,NEW_ANV_,NEW_TOI_,NEW_SYR_ FROM CRMFUND_DATA "
                        f"where PRO_CODE_='{pro_code_sql}' and TIME_<'{sql_text(re_data['TIME_'])}' "
                        f"order by TIME_ desc limit 1")
            t_1data = cur.fetchone()
        finally:
            cur.close()
        if t_1data:
            self.logger.info(f"====T-1日数据===={t_1data}")
            re_data['OLD_NAV_'] = t_1data[0]
            re_data['OLD_ANV_'] = t_1data[1]
            re_data['OLD_TOI_'] = t_1data[2]
            re_data['OLD_SYR_'] = t_1data[3]

        # Propagate the new figures to the agency-fund rows. The client's
        # table name is switched for this and always switched back, even on
        # error, so later writes cannot land in the wrong table.
        self.p_client.table_name = TABLE_NAME('CRMFUND_AGENCY')
        try:
            agences = self.p_client.search_all_from_phoenix(
                connection=self.connection, dict_status=True,
                where_condition=f"PRO_CODE_='{pro_code_sql}'")
            if agences:
                while True:
                    try:
                        agence_data = next(agences)
                    except StopIteration:
                        break
                    except Exception as e:
                        # Reading the result set failed: stop updating but
                        # still return the cleaned record.
                        self.logger.warning(f"reading CRMFUND_AGENCY rows failed: {e}")
                        break
                    self.logger.info(f"====更新代销基金数据===={agence_data}")
                    agence_data['NEW_NAV_'] = re_data['NEW_NAV_']
                    agence_data['NEW_SYR_'] = re_data['NEW_SYR_']
                    agence_data['BUY_STATUS_'] = re_data['BUY_STATUS_']
                    agence_data['BUY_STATUS_CODE_'] = re_data['BUY_STATUS_CODE_']
                    try:
                        self.p_client.upsert_to_phoenix_by_one(connection=self.connection,
                                                               data=agence_data)
                    except jaydebeapi.DatabaseError:
                        # Skip rows the database rejects and keep going.
                        continue
        finally:
            self.p_client.table_name = TABLE_NAME('CRMFUND_DATA')

        return [{"TABLE_NAME_": TABLE_NAME("CRMFUND_DATA"), "DATA_": re_data}]

    # Unrecognised entity code: nothing to write.
    return None
del data["CONTENT_"] data["HTML_"] = html re_data = super(WechatScript, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_") if re_data.get('_id'): del re_data['_id'] return [{ "TABLE_NAME_": self.p_client.table_name, "DATA_": re_data }] else: return if __name__ == '__main__': # param = sys.argv[1] # verify_field = {'TITLE_': 'TITLE_', 'WECHAT_ID_': 'WECHAT_ID_'} verify_field = {'TITLE_': 'TITLE_'} # param = "{}" param = "{'limitNumber':'2'}" script = WechatScript(table_name=TABLE_NAME("CHA_BRANCH_WECHAT"), collection_name="WECHAT", param=param, verify_field=verify_field) script.main() script.close_client()
else: if response: try: p_response = req_for_file_save(id=re_data["ID_"], type_code=f"CHA_INSURANCE_WORD", file_name=data["PDF_NAME_"].replace(".doc", ""), postfix="doc", file=response.content) self.logger.info(f"{p_response.content.decode('utf-8')}") p_response.close() except Exception as e: self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}") finally: response.close() else: self.logger.warning(f'id: {data["_id"]},获取PDF失败') if "HTML_" in data: del data["HTML_"] re_data = super(BranchInsurance, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_") re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"] return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}] if __name__ == '__main__': # verify_field = {'URL_': 'URL_', 'PRO_NAME_': 'PRO_NAME_'} verify_field = {'URL_': 'URL_'} param = "{'entityType':'JRCP_BX','limitNumber':1000,'entityCode':['JRCP_BX_HEBYH_GW_ALL']}" script = BranchInsurance(table_name=TABLE_NAME("CHA_BRANCH_INSURANCE"), collection_name="JRCP_BX", param=param, verify_field=verify_field) script.main() script.close_client()
return data def generic_shuffle(self, data, field="PRO_NAME_"): """ 通用清洗规则写这里, 如不需要通用清洗规则则不继承重写 :param data: :param field: :return: """ if isinstance(data, dict): re_data = self.__shuffle(data) return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}] elif isinstance(data, list): re_list = list() for each in data: re_data = self.__shuffle(each) re_list.append({"TABLE_NAME_": self.script_name, "DATA_": re_data}) return re_list else: return if __name__ == '__main__': # 清洗数据会先检验 verify_field 字段, 然后依照 verify_field 字段查询 hbase 去重查询 # 债券数据未做去重处理, 所有不能重复插入 # param = sys.argv[1] param = "{'entityType':'CRMJPFX_ZQ','limitNumber':10000,'entityCode':['CRMJPFX_ZQ_HXZQ']}" script = BranchOrganize(table_name=TABLE_NAME("CRMZQ"), collection_name="CRMJPFX_ZQ", param=param) script.main() script.close_client()
re_data["PROVINCE_NAME_"] = None re_data["CITY_NAME_"] = data["CITY_"] + "市" re_data["AREA_NAME_"] = None re_data["AREA_CODE_"] = None re_data["CITY_CODE_"] = None re_data["PROVINCE_CODE_"] = None re_data["NAME_"] = shopping_name re_data = super(Branchsssq, self).generic_shuffle(data=data, re_data=re_data, field=None) re_data_list.append({ "TABLE_NAME_": self.p_client.table_name, "DATA_": re_data }) # print(re_data_list) return re_data_list if __name__ == '__main__': # param = sys.argv[1] param = "{'limitNumber':'20'}" verify_field = {'NAME_': 'NAME_'} script = Branchsssq(table_name=TABLE_NAME("CHA_BRANCH_BUSINESS"), collection_name="WD_SS_SQ", param=param, verify_field=verify_field) script.main() script.close_client()
# basic_dict["TAGS_"] = data[""] # 数据来源 URL source = re.findall(r"(https?://.*?)/", data["URL_"]) re_data["SOURCE_"] = source[0] # 数据来源 网站名称 re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0] basic_dict["SOURCE_TYPE_"] = "链家" # basic_dict["PRICE_TYPE_"] = data[""] basic_dict["ADDR_"] = data["ADDR_"] return [{ "TABLE_NAME_": self.data_table_name, "DATA_": data_dict }, { "TABLE_NAME_": self.base_table_name, "DATA_": basic_dict }] if __name__ == '__main__': try: param = sys.argv[1] except Exception: param = "{}" script = BranchNews(table_name=TABLE_NAME("CHA_BRANCH_HOUSE_DATA"), collection_name="WD_JZ_FJ_XM", param=param) script.main() script.close_client()
for each in self.bank_list: if re_data.get('BANK_NAME_') in each['ALIAS_']: bank_list.append(each["NAME_"]) bank_code_list.append(each["CODE_"]) if bank_list: re_data["BANK_NAME_"] = "|".join(bank_list) if bank_code_list: re_data["BANK_CODE_"] = "|".join(bank_code_list) return [{"TABLE_NAME_": 'CRM_NEWS', "DATA_": re_data}] if __name__ == '__main__': # param = sys.argv[1] ''' CRMJPFX_ZXYQ_XLCJ_JRBGT CRMJPFX_ZXYQ_XLCJ_JRTS CRMJPFX_ZXYQ_XL_HMTS CRMJPFX_ZXYQ_XLCJ_GSDT ''' for _ in range(100): try: # param = "{'entityType':'CRMNEWS','limitNumber':10000,'entityCode':['CRMJPFX_ZXYQ_XLCJ_GSDT','CRMJPFX_ZXYQ_XL_HMTS', 'CRMJPFX_ZXYQ_XLCJ_JRTS','CRMJPFX_ZXYQ_XLCJ_JRBGT', ]}" param = "{'entityType':'CRMNEWS','limitNumber':10000,'entityCode':['CRMJPFX_ZXYQ_XL_HMTS' ]}" script = BranchNews(table_name=TABLE_NAME("CRM_NEWS"), collection_name="CRMJPFX_ZXYQ", param=param) script.main() script.close_client() except: continue