def login_morning_star(self, cookie_str=None):
    """Log in to morningstar.cn using the Chrome driver held on ``self``.

    Two login modes are supported (per the original author's note):
        1. ``cookie_str`` is given: the cookies of an already logged-in
           session are applied through ``set_cookies``.
        2. Otherwise ``login_site`` performs the login with account,
           password and captcha (the author reports roughly 30% captcha
           recognition accuracy, with retries).

    If ``self._chrome_driver`` is ``None`` a new Chrome driver is created
    and stored there first. In mode 2 the driver's cookies are stored on
    ``self._morning_cookies`` afterwards; when cookies were already cached,
    the login step is skipped and the cache is refreshed from the driver.

    Args:
        cookie_str: Cookie string of a logged-in session, or a falsy value
            to go through the interactive login.

    Raises:
        SystemExit: raised by ``exit()`` when ``login_site`` reports failure.
    """
    login_url = 'https://www.morningstar.cn/membership/signin.aspx'
    if self._chrome_driver is None:
        from selenium import webdriver
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--no-sandbox")
        self._chrome_driver = webdriver.Chrome(options=chrome_options)
        self._chrome_driver.set_page_load_timeout(12000)

    if cookie_str:
        set_cookies(self._chrome_driver, login_url, cookie_str)
        return

    if self._morning_cookies is None:
        if not login_site(self._chrome_driver, login_url):
            print('login fail')
            exit()
        print('login success')
        sleep(3)

    # Store the cookies on the instance. The previous code assigned the
    # freshly obtained cookies to a local variable, so the cache stayed
    # ``None`` and every call went through the full login again.
    self._morning_cookies = self._chrome_driver.get_cookies()
def login():
    """Open a Chrome session and sign in to morningstar.cn.

    Returns:
        The Chrome WebDriver created here, once ``login_site`` reports
        success. When it reports failure, 'login fail' is printed and
        ``exit()`` is called instead of returning.
    """
    from selenium import webdriver

    signin_url = 'https://www.morningstar.cn/membership/signin.aspx'

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(12000)

    # Guard clause: bail out on a failed login, otherwise hand the live
    # driver back to the caller.
    if not login_site(driver, signin_url):
        print('login fail')
        exit()
    print('login success')
    return driver
예제 #3
0
def _parse_fund_rows(page_source, id_worker):
    """Turn one fund-selector result page into a list of CSV rows.

    Each row is ``[id, fund_code, morning_star_code, fund_name, fund_cat,
    fund_rating_3, fund_rating_5, rate_of_return]``. ``rate_of_return`` is
    ``None`` when the page shows ``'-'``.

    Args:
        page_source: HTML source of the current result page.
        id_worker: Object whose ``get_id()`` supplies the row id.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    rows = []
    # The data rows carry one of these two classes. Going class by class
    # keeps the original ordering: all 'gridItem' rows first, then all
    # 'gridAlternateItem' rows.
    for row_class in ('gridItem', 'gridAlternateItem'):
        for tr in soup.find_all('tr', {'class': row_class}):
            row_id = id_worker.get_id()
            text_cells = tr.find_all('td', {'class': "msDataText"})
            numeric_cells = tr.find_all('td', {'class': "msDataNumeric"})
            code_link = text_cells[0].find_all('a')[0]
            # The Morningstar-specific code is the last path segment of the
            # link's href, after '/quicktake/'.
            morning_star_code = re.findall(r'(?<=/quicktake/)(\w+)$',
                                           code_link.get('href'))[0]
            year_return = numeric_cells[3].string
            rows.append([
                row_id,
                code_link.string,
                morning_star_code,
                text_cells[1].find_all('a')[0].string,
                text_cells[2].string,
                get_star_count(text_cells[3].find_all('img')[0]['src']),
                get_star_count(text_cells[4].find_all('img')[0]['src']),
                None if year_return == '-' else year_return,
            ])
    return rows


def get_fund_list(cookie_str=None):
    """Scrape the Morningstar fund selector into ``../output/fund_morning_star.csv``.

    Login supports two modes (per the original author's note):
        1. ``cookie_str`` is given: cookies of an already logged-in session
           are applied through ``set_cookies``.
        2. Otherwise ``login_site`` logs in with account, password and
           captcha (the author reports roughly 30% captcha recognition
           accuracy, with retries).

    The function then walks every result page, parses the fund rows and
    appends them to the CSV file after each page. The header and the rows
    are both written with the ``csv`` module so that they share one
    delimiter and the same eight columns (the header previously had no
    label for the leading id column). Missing values are written as empty
    fields. The file uses the platform default encoding, as before.

    The Chrome driver is always shut down, including on errors and on the
    ``exit()`` path.

    Args:
        cookie_str: Cookie string of a logged-in session, or a falsy value
            to go through the interactive login.

    Raises:
        SystemExit: raised by ``exit()`` when ``login_site`` reports failure.
    """
    import csv

    from selenium import webdriver

    morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
    csv_path = '../output/' + 'fund_morning_star.csv'
    page_count = 25  # divisor used to turn the total shown on the page into a page count

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword and
    # matches how the other login helpers in this file create the driver.
    chrome_driver = webdriver.Chrome('./chromedriver/chromedriver.exe',
                                     options=options)
    try:
        # Keep a page from loading forever.
        # NOTE(review): selenium documents this value in seconds, so 12000
        # is over three hours - confirm that is intended.
        chrome_driver.set_page_load_timeout(12000)

        if cookie_str:
            set_cookies(chrome_driver, morning_fund_selector_url, cookie_str)
        else:
            if not login_site(chrome_driver, morning_fund_selector_url):
                print('login fail')
                exit()
            print('login success')
            sleep(3)

        total_text = chrome_driver.find_element_by_xpath(
            '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text
        page_num_total = math.ceil(int(total_text) / page_count)

        # Start a fresh file containing only the header row.
        with open(csv_path, 'w') as csv_file:
            csv.writer(csv_file, lineterminator='\n').writerow([
                'id', '代码', '晨星专属号', '名称', '类型', '三年评级', '五年评级',
                '今年回报率'
            ])

        # One id generator for the whole run instead of a new one per row.
        id_worker = IdWorker()
        remainder = page_num_total % 10
        page_num = 1
        while page_num <= page_num_total:
            # Position of the "next page" link inside the pager.
            # Presumably the pager shows ten page links at a time, so the
            # link sits at a[12] except in the final, shorter group -
            # TODO confirm against the live page.
            num = (remainder +
                   2) if page_num > (page_num_total - remainder) else 12
            xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (
                num)
            print('page_num', page_num)
            # Wait until the highlighted (current) page number equals page_num.
            WebDriverWait(chrome_driver, timeout=600).until(
                text_to_be_present_in_element(
                    "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']",
                    str(page_num), xpath_str))
            sleep(1)

            fund_list = _parse_fund_rows(chrome_driver.page_source, id_worker)
            print('数据准备完毕')
            print('fund_list', fund_list)
            # Append page by page so already scraped data survives a later
            # failure.
            with open(csv_path, 'a') as csv_file:
                csv.writer(csv_file, lineterminator='\n').writerows(fund_list)

            # Move on to the next page; there is nothing to click after the
            # last one.
            if page_num < page_num_total:
                chrome_driver.find_element_by_xpath(xpath_str).click()
            page_num += 1
    finally:
        # quit() ends the whole driver session, not just the current window.
        chrome_driver.quit()
    print('end')