def login_morning_star(self, cookie_str=None):
    """Log the instance's Chrome driver in to morningstar.cn.

    Two login modes are supported:
        1. ``cookie_str`` is given: it is handed to ``set_cookies`` together
           with the driver and the sign-in URL (presumably installing the
           cookies of an already logged-in session — confirm against
           ``set_cookies``).
        2. Otherwise: a credential login through ``login_site`` (account,
           password and captcha; the original author notes roughly 30%
           captcha recognition accuracy, with retries supported).

    The Chrome driver is created on first use and kept on
    ``self._chrome_driver``. In mode 2 the driver's cookies are stored on
    ``self._morning_cookies``; when that attribute is already set, the
    login step is skipped and the cookies are simply re-read.

    Args:
        cookie_str: Optional cookie string; when truthy, mode 1 is used.

    Raises:
        SystemExit: if ``login_site`` reports a failed login.
    """
    login_url = 'https://www.morningstar.cn/membership/signin.aspx'

    if self._chrome_driver is None:
        # Imported lazily so selenium is only needed when a driver is built.
        from selenium import webdriver
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--no-sandbox")
        self._chrome_driver = webdriver.Chrome(options=chrome_options)
        self._chrome_driver.set_page_load_timeout(12000)

    if cookie_str:
        set_cookies(self._chrome_driver, login_url, cookie_str)
        return

    if self._morning_cookies is None:
        if not login_site(self._chrome_driver, login_url):
            print('login fail')
            exit()
        print('login success')
        sleep(3)

    # Store the cookies on the instance. The previous code bound them to a
    # throwaway local after a fresh login, so the cache was never populated
    # and every call logged in again.
    self._morning_cookies = self._chrome_driver.get_cookies()
def handle_logout(request):
    """Log the caller out and redirect them to ``/`` with blanked cookies.

    When the request carries the refresh cookie, its value is posted to the
    ``end_session_endpoint`` from ``wkc_data`` along with the client
    credentials. In every case the auth and refresh cookies are set to the
    empty string on a redirect response to ``/``.

    Args:
        request: Mapping with a ``"headers"`` entry that ``get_cookies``
            can read.

    Returns:
        The redirect response as returned by ``set_cookies``.
    """
    logger.info("Starting handle_logout")
    jar = get_cookies(request["headers"])

    # The end-session call needs the refresh token, so it is skipped when
    # the cookie is absent.
    refresh_name = CONFIG["REFRESH_COOKIE"]
    if refresh_name in jar:
        logger.info("Posting to logout URL")
        logout_reply = post_to_url(
            url=wkc_data["end_session_endpoint"],
            client_id=CONFIG["CLIENT_ID"],
            client_secret=CONFIG["CLIENT_SECRET"],
            refresh_token=jar[refresh_name],
        )
        logger.info(f"Back from post to logout URL, resp={logout_reply}")

    # Blank both cookies on the way out.
    response = redirect("/")
    jar[CONFIG["AUTH_COOKIE"]] = ""
    jar[refresh_name] = ""
    response = set_cookies(response=response, cookies=jar)
    logger.info("Returning response to client")
    return response
def get_fund_list(cookie_str=None):
    """Scrape the Morningstar China fund selector into a CSV file.

    Opens Chrome, logs in, then walks every page of the fund selector
    table. For each page the rows are parsed with BeautifulSoup, printed,
    and appended to ``../output/fund_morning_star.csv`` (the file is
    truncated and given a header line first).

    Two login modes are supported:
        1. ``cookie_str`` is given: it is handed to ``set_cookies`` together
           with the driver and the selector URL.
        2. Otherwise: a credential login through ``login_site`` (account,
           password and captcha; the original author notes roughly 30%
           captcha recognition accuracy, with retries supported).

    Args:
        cookie_str: Optional cookie string; when truthy, mode 1 is used.

    Raises:
        SystemExit: if ``login_site`` reports a failed login.
    """
    from selenium import webdriver

    morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
    result_dir = '../output/'
    csv_path = result_dir + 'fund_morning_star.csv'
    page_count = 25  # rows per result page
    # NOTE(review): the header names 7 columns and is joined with ',' while
    # each data row has 8 values (the generated id comes first) joined with
    # ', '. Left unchanged so existing consumers of the file keep working.
    output_head = ','.join(
        ['代码', '晨星专属号', '名称', '类型', '三年评级', '五年评级', '今年回报率']) + '\n'
    # The two CSS classes that carry table rows. All 'gridItem' rows are
    # collected before the 'gridAlternateItem' rows.
    row_classes = ('gridItem', 'gridAlternateItem')

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    chrome_driver = webdriver.Chrome(
        './chromedriver/chromedriver.exe', chrome_options=options)
    try:
        # Keep a page load from hanging forever.
        chrome_driver.set_page_load_timeout(12000)

        if cookie_str:
            set_cookies(chrome_driver, morning_fund_selector_url, cookie_str)
        elif login_site(chrome_driver, morning_fund_selector_url):
            print('login success')
            sleep(3)
        else:
            print('login fail')
            exit()

        total_text = chrome_driver.find_element_by_xpath(
            '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text
        page_num_total = math.ceil(int(total_text) / page_count)
        # Loop-invariant: number of pages in the final pager group.
        # Presumably the pager shows ten page links at a time — verify
        # against the live page.
        remainder = page_num_total % 10

        # Start the output file fresh with the header line.
        with open(csv_path, 'w+') as csv_file:
            csv_file.write(output_head)

        for page_num in range(1, page_num_total + 1):
            # Index of the pager link clicked to advance: a[12] normally,
            # a[remainder + 2] once inside the final pager group.
            num = (remainder + 2) if page_num > (page_num_total - remainder) else 12
            xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (
                num)
            print('page_num', page_num)
            # Wait until the highlighted (current) page label equals page_num.
            WebDriverWait(chrome_driver, timeout=600).until(
                text_to_be_present_in_element(
                    "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']",
                    str(page_num), xpath_str))
            sleep(1)

            id_list = []                 # generated ids from IdWorker
            code_list = []               # fund code
            morning_star_code_list = []  # Morningstar-specific code
            name_list = []               # fund name
            fund_cat = []                # fund category
            fund_rating_3 = []           # Morningstar rating (3 years)
            fund_rating_5 = []           # Morningstar rating (5 years)
            rate_of_return = []          # year-to-date return (%)

            bs = BeautifulSoup(chrome_driver.page_source, 'lxml')
            for row_class in row_classes:
                for tr in bs.find_all('tr', {'class': row_class}):
                    worker = IdWorker()
                    id_list.append(worker.get_id())
                    tds_text = tr.find_all('td', {'class': "msDataText"})
                    tds_nume = tr.find_all('td', {'class': "msDataNumeric"})
                    code_a_element = tds_text[0].find_all('a')[0]
                    code_list.append(code_a_element.string)
                    # The Morningstar code is the last path segment of the
                    # link target, after '/quicktake/'.
                    current_morning_code = re.findall(
                        r'(?<=/quicktake/)(\w+)$',
                        code_a_element.get('href')).pop(0)
                    morning_star_code_list.append(current_morning_code)
                    name_list.append(tds_text[1].find_all('a')[0].string)
                    fund_cat.append(tds_text[2].string)
                    fund_rating_3.append(
                        get_star_count(tds_text[3].find_all('img')[0]['src']))
                    fund_rating_5.append(
                        get_star_count(tds_text[4].find_all('img')[0]['src']))
                    # A '-' cell means no value; store None instead.
                    ytd_text = tds_nume[3].string
                    rate_of_return.append(ytd_text if ytd_text != '-' else None)

            print('数据准备完毕')
            fund_df = pd.DataFrame({
                'id': id_list,
                'fund_code': code_list,
                'morning_star_code': morning_star_code_list,
                'fund_name': name_list,
                'fund_cat': fund_cat,
                'fund_rating_3': fund_rating_3,
                'fund_rating_5': fund_rating_5,
                'rate_of_return': rate_of_return
            })
            fund_list = fund_df.values.tolist()
            print('fund_list', fund_list)
            with open(csv_path, 'a') as csv_file:
                csv_file.writelines(
                    ', '.join(str(x) for x in fund_item) + '\n'
                    for fund_item in fund_list)

            # Advance to the next page.
            chrome_driver.find_element_by_xpath(xpath_str).click()
    finally:
        # Always release the browser, including on a scrape error or failed
        # login. quit() ends the whole driver session rather than only
        # closing the current window.
        chrome_driver.quit()
    print('end')
def handle_login(request):
    """Complete the login: exchange the auth code for tokens and set cookies.

    Reads ``state`` and ``code`` from the request query string. When the
    state carries a ``source_url``, its hash is checked against
    ``CONFIG["STATE_SECRET"]`` before it is used as the redirect target;
    otherwise the redirect target is ``/``. The code is exchanged at the
    token endpoint and, if ``CONFIG["ALLOWED_GROUP"]`` is a non-empty
    string, the access token must contain that group in its ``groups``
    claim.

    Args:
        request: Mapping with a ``"querystring"`` entry.

    Returns:
        A redirect response carrying the auth and refresh cookies on
        success; a bad-request response when the state cannot be decoded
        (base64/UTF-8/JSON), its hash is missing or wrong, or a required
        key is missing; a forbidden response when the group check fails.
    """
    # Local import: only this handler needs the constant-time comparison.
    import hmac

    logger.info("Starting handle_login")
    args = parse_qs(request["querystring"])
    redirect_to = "/"

    if "state" in args:
        state = args["state"][0]
        logger.info("Got state of: {state}".format(state=state))
        # The state is attacker-controllable, so a malformed value must end
        # in a 400 rather than an unhandled exception. binascii.Error,
        # UnicodeDecodeError and json.JSONDecodeError are all ValueErrors.
        try:
            state = base64.b64decode(state).decode("utf-8")
        except ValueError:
            return return_bad_request("Error validating state")
        logger.info("Decoded state: {state}".format(state=state))
        try:
            state = json.loads(state)
        except ValueError:
            return return_bad_request("Error validating state")

        # Only a JSON object can carry a source_url; anything else is
        # treated as "no source_url" and falls back to "/".
        if isinstance(state, dict) and "source_url" in state:
            source_url = state["source_url"]
            supplied_hash = state.get("hash")
            if not isinstance(source_url, str) or not isinstance(supplied_hash, str):
                return return_bad_request("Error validating state")
            source_url_secret = CONFIG["STATE_SECRET"] + source_url
            source_url_secret_hash = sha256(
                source_url_secret.encode("utf-8")).hexdigest()
            # Constant-time compare so response timing does not leak how
            # much of a forged hash matched. Bytes are used because
            # compare_digest rejects non-ASCII str arguments.
            if not hmac.compare_digest(
                    source_url_secret_hash.encode("utf-8"),
                    supplied_hash.encode("utf-8", "backslashreplace")):
                return return_bad_request("Error validating state")
            redirect_to = source_url
            logger.info(
                "Got source url from state of {url}".format(url=redirect_to))

    try:
        # NOTE(review): if parse_qs is urllib.parse.parse_qs this is a list
        # of values, not a single string — confirm post_to_url expects that.
        auth_code = args["code"]
        logger.info(
            "Got auth code from query string: {code}".format(code=auth_code))

        # Swap the authorisation code for tokens.
        resp = post_to_url(url=wkc_data["token_endpoint"],
                           grant_type="authorization_code",
                           client_id=CONFIG["CLIENT_ID"],
                           client_secret=CONFIG["CLIENT_SECRET"],
                           code=auth_code,
                           redirect_uri=CONFIG["REDIRECT_URI"])
        resp = json.loads(resp)
        logger.info("Exchanged authorisation code for tokens")
        access_token = resp["access_token"]
        refresh_token = resp["refresh_token"]

        # If an allowed group is configured, the token must carry it.
        if "ALLOWED_GROUP" in CONFIG:
            allowed_group = CONFIG["ALLOWED_GROUP"]
            if allowed_group != "":
                logger.info("Need to check groups")
                decoded_token = validate_jwt(api=CONFIG["VAL_API_URL"],
                                             token=access_token,
                                             key_set=keys,
                                             aud=CONFIG["CLIENT_ID"])
                decoded_token = json.loads(decoded_token.decode("utf-8"))
                if "groups" not in decoded_token:
                    logger.info("no groups claim in token")
                    return forbidden(message=CONFIG["ACCESS_DENIED_MESSAGE"])
                if allowed_group not in decoded_token["groups"]:
                    logger.info(
                        f"user is missing expected group of {allowed_group}"
                    )
                    return forbidden(message=CONFIG["ACCESS_DENIED_MESSAGE"])
                logger.info(f"user has expected group of {allowed_group}")

        # Prepare the response.
        r = redirect(redirect_to)
        cookies = {}
        cookies[CONFIG["AUTH_COOKIE"]] = access_token
        cookies[CONFIG["REFRESH_COOKIE"]] = refresh_token
        r = set_cookies(response=r,
                        cookies=cookies,
                        max_age=CONFIG.get("MAX_AGE", "10"))
        logger.info("Returning response to client")
        return r
    except KeyError:
        # Covers a missing "code" parameter as well as missing token fields
        # or config keys raised anywhere in the block above.
        return return_bad_request("Bad request missing parameter")
def check_session(request):
    """Let an authenticated request through, or send the user to log in.

    Builds a signed ``state`` (original URI plus query string, a SHA-256
    hash of ``CONFIG["STATE_SECRET"]`` + that URL, and a random nonce),
    base64-encodes it and appends it to the authorisation URL ``aurl``.

    Outcomes:
        * valid access-token cookie: the ``request`` is returned unchanged;
        * expired access token with a refresh cookie: the token is
          refreshed and a redirect to ``/`` with the new auth cookie is
          returned;
        * no access token, no refresh token, or a failed refresh: a
          redirect to the authorisation URL (with state) is returned.

    Only ``ExpiredSignatureError`` from ``validate_jwt`` is handled; any
    other exception it raises propagates to the caller.

    Args:
        request: Mapping with ``"headers"``, ``"uri"`` and
            ``"querystring"`` entries.

    Returns:
        Either ``request`` itself or a redirect response.
    """
    # Local import: used only to URL-encode the state value below.
    from urllib.parse import quote

    logger.info("Starting check_session")
    cookies = get_cookies(request["headers"])

    # Work out where the user was heading so login can send them back.
    source_url = request["uri"]
    if request["querystring"]:
        source_url = source_url + "?{qs}".format(qs=request["querystring"])
    logger.info("Determined source_url is {s}".format(s=source_url))

    source_url_secret = CONFIG["STATE_SECRET"] + source_url
    state = {
        "source_url": source_url,
        "hash": sha256(source_url_secret.encode("utf-8")).hexdigest(),
        "nonce": get_rand_string(10)
    }
    state = base64.b64encode(json.dumps(state).encode("utf-8")).decode("utf-8")
    logger.info("Encoded state: {state}".format(state=state))
    # Standard base64 can contain '+', '/' and '='. An unescaped '+' in a
    # query string is read back as a space, which corrupts the state, so
    # the value is percent-encoded before it goes into the URL.
    aurl_with_state = "{aurl}&state={state}".format(
        aurl=aurl, state=quote(state, safe=""))

    if CONFIG["AUTH_COOKIE"] not in cookies:
        logger.info("No access token present, so need to log in")
        return redirect(aurl_with_state)

    logger.info("Got access token")
    try:
        validate_jwt(api=CONFIG["VAL_API_URL"],
                     token=cookies[CONFIG["AUTH_COOKIE"]],
                     key_set=keys,
                     aud=CONFIG["CLIENT_ID"])
    except ExpiredSignatureError:
        logger.info(
            "Access token has expired, so going to attempt to refresh")
    else:
        logger.info("Access token is valid")
        return request

    if CONFIG["REFRESH_COOKIE"] not in cookies:
        # Nothing to refresh with; the user has to log in again.
        logger.info(
            "No refresh token present, so need to log in again")
        return redirect(aurl_with_state)

    logger.info("Refreshing token")
    resp = post_to_url(
        url=wkc_data["token_endpoint"],
        grant_type="refresh_token",
        client_id=CONFIG["CLIENT_ID"],
        client_secret=CONFIG["CLIENT_SECRET"],
        refresh_token=cookies[CONFIG["REFRESH_COOKIE"]])
    logger.info("Called to refresh token")
    resp = json.loads(resp)
    if "error" in resp:
        logger.info(
            "There was an error refreshing the token, so need to log in again"
        )
        # Include the state here as well, matching the other login
        # redirects, so the original URL survives the new login.
        return redirect(aurl_with_state)

    access_token = resp["access_token"]
    logger.info("Got new access token, returning to client")
    # NOTE(review): this sends the user to "/" rather than back to
    # source_url after a successful refresh — confirm that is intended.
    r = redirect("/")
    cookies[CONFIG["AUTH_COOKIE"]] = access_token
    r = set_cookies(response=r,
                    cookies=cookies,
                    max_age=CONFIG.get("MAX_AGE", "10"))
    return r