def login_morning_star(self, cookie_str=None):
    """
    Simulated login. Two methods are supported:
    1. Reuse the cookies of an already logged-in session (cookie_str).
    2. Log in with account, password and captcha (captcha recognition is
       about 30% accurate, so recognition is retried on failure).
    """
    login_url = 'https://www.morningstar.cn/membership/signin.aspx'
    if self._chrome_driver is None:
        from selenium import webdriver
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--no-sandbox")
        # _chrome_driver = webdriver.Chrome("/usr/local/chromedriver")
        self._chrome_driver = webdriver.Chrome(options=chrome_options)
        self._chrome_driver.set_page_load_timeout(12000)
    if cookie_str:
        set_cookies(self._chrome_driver, login_url, cookie_str)
    else:
        if self._morning_cookies is None:
            login_status = login_site(self._chrome_driver, login_url)
            if login_status:
                print('login success')
                sleep(3)
            else:
                print('login fail')
                exit()
            # Save the site cookies obtained after logging in
            self._morning_cookies = self._chrome_driver.get_cookies()
        else:
            self._morning_cookies = self._chrome_driver.get_cookies()
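
# The set_cookies() helper used above is not defined in this section. Below is a
# minimal sketch of what such a helper might look like. It assumes the cookie
# string is formatted as "name1=value1; name2=value2" (an assumption, not taken
# from the original code). Selenium requires the target domain to be open before
# add_cookie() can be called, hence the initial get().
def set_cookies(chrome_driver, url, cookie_str):
    chrome_driver.get(url)  # open the page first so cookies attach to its domain
    for item in cookie_str.split('; '):
        name, _, value = item.partition('=')
        if name:
            chrome_driver.add_cookie({'name': name, 'value': value})
    chrome_driver.get(url)  # reload so the page is rendered with the injected cookies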
def login():
    from selenium import webdriver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    chrome_driver = webdriver.Chrome(options=chrome_options)
    chrome_driver.set_page_load_timeout(12000)
    login_url = 'https://www.morningstar.cn/membership/signin.aspx'
    login_status = login_site(chrome_driver, login_url)
    if login_status:
        print('login success')
    else:
        print('login fail')
        exit()
    return chrome_driver
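
# login_site() drives the actual sign-in; the original version also performs
# captcha OCR with retries, which is omitted here. The sketch below only shows
# the basic shape: the element IDs ('emailTxt', 'pwdValue', 'loginGo'), the use
# of environment variables for credentials, and the URL-based success check are
# all hypothetical placeholders, not the real page structure or implementation.
import os
from time import sleep

def login_site(chrome_driver, login_url):
    chrome_driver.get(login_url)
    try:
        chrome_driver.find_element_by_id('emailTxt').send_keys(os.environ['MS_USERNAME'])
        chrome_driver.find_element_by_id('pwdValue').send_keys(os.environ['MS_PASSWORD'])
        # A real implementation would recognise and fill in the captcha here,
        # retrying on failure (the original reports roughly 30% OCR accuracy).
        chrome_driver.find_element_by_id('loginGo').click()
        sleep(3)
        # Heuristic: treat leaving the sign-in page as a successful login.
        return 'signin' not in chrome_driver.current_url
    except Exception as e:
        print('login error:', e)
        return False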
def get_fund_list(cookie_str=None):
    from selenium import webdriver
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    chrome_driver = webdriver.Chrome('./chromedriver/chromedriver.exe', options=options)
    chrome_driver.set_page_load_timeout(12000)  # keep page loads from hanging forever
    morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
    # "https://cn.morningstar.com/quickrank/default.aspx"
    """
    Simulated login. Two methods are supported:
    1. Reuse the cookies of an already logged-in session (cookie_str).
    2. Log in with account, password and captcha (captcha recognition is
       about 30% accurate, so recognition is retried on failure).
    """
    if cookie_str:
        set_cookies(chrome_driver, morning_fund_selector_url, cookie_str)
    else:
        morning_cookies = ""
        if morning_cookies == "":
            login_status = login_site(chrome_driver, morning_fund_selector_url)
            if login_status:
                print('login success')
                sleep(3)
            else:
                print('login fail')
                exit()
            # Save the site cookies obtained after logging in
            morning_cookies = chrome_driver.get_cookies()
        else:
            chrome_driver.get(morning_fund_selector_url)  # reopen the page to scrape
            print(chrome_driver.get_cookies())  # print the cookies that were just set
    # Starting page number and page size
    page_num = 1
    page_count = 25
    page_num_total = math.ceil(
        int(chrome_driver.find_element_by_xpath(
            '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count)
    result_dir = '../output/'
    output_head = '代码' + ',' + '晨星专属号' + ',' + '名称' + ',' + \
        '类型' + ',' + '三年评级' + ',' + '五年评级' + ',' + '今年回报率' + '\n'
    # Write the CSV header on the first page
    if page_num == 1:
        with open(result_dir + 'fund_morning_star.csv', 'w+') as csv_file:
            csv_file.write(output_head)
    while page_num <= page_num_total:
        # Remainder tells us whether we are in the last block of pages
        remainder = page_num_total % 10
        # The index of the "next page" link differs on the last block of pages
        num = (remainder + 2) if page_num > (page_num_total - remainder) else 12
        xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (num)
        print('page_num', page_num)
        # Wait until the current-page indicator (identified by its style) equals page_num
        WebDriverWait(chrome_driver, timeout=600).until(
            text_to_be_present_in_element(
                "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']",
                str(page_num), xpath_str))
        sleep(1)
        # Lists that hold the scraped data
        id_list = []                 # snowflake id
        code_list = []               # fund code
        morning_star_code_list = []  # Morningstar-specific code
        name_list = []               # fund name
        fund_cat = []                # fund category
        fund_rating_3 = []           # Morningstar rating (3 years)
        fund_rating_5 = []           # Morningstar rating (5 years)
        rate_of_return = []          # return since the start of the year (%)
        # Parse the current page's source with BeautifulSoup
        data = chrome_driver.page_source
        bs = BeautifulSoup(data, 'lxml')
        class_list = ['gridItem', 'gridAlternateItem']  # data rows use these two classes
        # Walk every row of both classes and store the fields in the lists above
        for i in range(len(class_list)):
            for tr in bs.find_all('tr', {'class': class_list[i]}):
                # snowflake id
                worker = IdWorker()
                id_list.append(worker.get_id())
                tds_text = tr.find_all('td', {'class': "msDataText"})
                tds_nume = tr.find_all('td', {'class': "msDataNumeric"})
                # fund code
                code_a_element = tds_text[0].find_all('a')[0]
                code_list.append(code_a_element.string)
                # extract the Morningstar-specific code from the href
                current_morning_code = re.findall(
                    r'(?<=/quicktake/)(\w+)$', code_a_element.get('href')).pop(0)
                morning_star_code_list.append(current_morning_code)
                name_list.append(tds_text[1].find_all('a')[0].string)
                # fund category
                fund_cat.append(tds_text[2].string)
                # 3-year rating
                rating = get_star_count(tds_text[3].find_all('img')[0]['src'])
                fund_rating_3.append(rating)
                # 5-year rating
                rating = get_star_count(tds_text[4].find_all('img')[0]['src'])
                fund_rating_5.append(rating)
                # return since the start of the year (%)
                return_value = tds_nume[3].string if tds_nume[3].string != '-' else None
                rate_of_return.append(return_value)
        print('page data ready')
        fund_df = pd.DataFrame({
            'id': id_list,
            'fund_code': code_list,
            'morning_star_code': morning_star_code_list,
            'fund_name': name_list,
            'fund_cat': fund_cat,
            'fund_rating_3': fund_rating_3,
            'fund_rating_5': fund_rating_5,
            'rate_of_return': rate_of_return
        })
        sql_insert = "replace into fund_morning_star(`id`, `fund_code`, `morning_star_code`, `fund_name`, `fund_cat`, `fund_rating_3`, `fund_rating_5`, `rate_of_return`) values(%s, %s, %s, %s, %s, %s, %s, %s)"
        # print('fund_df', fund_df)
        fund_list = fund_df.values.tolist()
        # cursor.executemany(sql_insert, fund_list)
        # connect.commit()
        print('fund_list', fund_list)
        with open(result_dir + 'fund_morning_star.csv', 'a') as csv_file:
            for fund_item in fund_list:
                output_line = ', '.join(str(x) for x in fund_item) + '\n'
                csv_file.write(output_line)
        # Find and click the "next page" link
        next_page = chrome_driver.find_element_by_xpath(xpath_str)
        next_page.click()
        page_num += 1
    chrome_driver.close()
    print('end')
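
# get_star_count() turns a rating image URL into a numeric star count but is not
# defined in this section. The sketch below assumes the number of stars is
# encoded in the image file name (e.g. a file name containing "4"); the exact
# naming scheme is an assumption, so it simply returns the last digit found.
import re

def get_star_count(img_src):
    file_name = img_src.rsplit('/', 1)[-1]
    digits = re.findall(r'\d', file_name)
    return int(digits[-1]) if digits else None

# Example entry point, assuming the helpers sketched above are available:
# if __name__ == '__main__':
#     get_fund_list()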