예제 #1
0
    def get_one_weixin_gzh(self, weixin_id, weixin_name, url):
        """Scrape the article list of one WeChat account page and store it.

        Opens ``url`` in a web driver, clicks the "查看更多" link, collects one
        ``Article`` per ``div.txt-box`` entry found inside ``div#wxbox``, then
        visits every article link to fetch its body and calls
        ``article.to_mysql()``. The driver is always quit, even on failure.

        :param weixin_id: value stored as ``user_id`` on every article.
        :param weixin_name: value stored as ``user_name`` on every article.
        :param url: page opened with ``wh.get_web_driver``.
        :return: None
        """

        def _get_articl_detail(driver, article):
            """Load the article page and fill in ``href`` and ``detail``."""
            driver.get(article.href)
            # Keep the URL the browser actually ended up on.
            article.href = driver.current_url

            soup = bs(driver.page_source, 'lxml')
            # NOTE(review): the class value carries a trailing space; confirm
            # it matches the page markup.
            detail_div = soup.find('div', {'class':'rich_media_content '})
            if detail_div:
                article.detail = html2text.html2text(detail_div.prettify())

            # Random pause of 3 to 6 seconds between article fetches.
            stop = 3 * (1 + random.random())
            time.sleep(stop)
            print(article.href)
            print('sleep:{:.2f}s\n'.format(stop))

        driver = wh.get_web_driver(url)
        try:
            # Click "查看更多" (view more) to load further articles.
            # NOTE(review): non-logged-in users reportedly see at most 100
            # articles, which would take 10 clicks; only one click is
            # performed here - confirm the intended count.
            try:
                for _ in range(0,1):
                    driver.find_element_by_link_text('查看更多').click()
                    time.sleep(3)
            except Exception:
                # Best effort: continue with whatever the page already shows.
                pass

            soup = bs(driver.page_source, 'lxml')

            article_list = []

            wxBox = soup.find('div', {'id':'wxbox'})
            # A page without the container yields no articles instead of an
            # AttributeError on None.
            txtBoxs = wxBox.find_all('div', 'txt-box') if wxBox else []
            for box in txtBoxs:
                a = box.find('a')
                if not a:
                    continue

                article = Article()
                article.title = a.get_text()
                article.user_id = weixin_id
                article.user_name = weixin_name
                article.href = self.site + a['href']

                # The 't' attribute of div.s-p is converted with cc.GetTime2.
                s_p = box.find('div', {'class':'s-p'})
                if s_p:
                    article.publish_time = cc.GetTime2(int(s_p['t']))
                print(article.user_name)
                article_list.append(article)

            for article in article_list:
                _get_articl_detail(driver, article)
                article.to_mysql()
        finally:
            # Always release the browser, even when scraping fails midway.
            driver.quit()
예제 #2
0
    def get_logined_webdriver(self, username='18410182275', password='12356789'):
        """Open ``self.general_url``, log in, and return the web driver.

        Clicks the "登录" link, then "账号登录", fills in the username and
        password inputs, clicks the ``div.item_btn`` element and sleeps three
        seconds. If any step raises, the driver is quit before the exception
        propagates.

        :param username: text typed into the ``username`` input.
        :param password: text typed into the ``password`` input.
        :return: the web driver after the login form was submitted; the
            caller is responsible for quitting it.
        """
        # NOTE(review): the default credentials are hard-coded in source;
        # consider loading them from configuration instead.
        driver = get_web_driver(self.general_url, has_proxy=False)
        try:
            # Poll every 0.5s, for up to 10s, for each element to appear.
            wait = WebDriverWait(driver, 10, 0.5)

            wait.until(EC.presence_of_element_located((By.LINK_TEXT, '登录')))
            driver.find_element_by_link_text('登录').click()

            wait.until(
                EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
            driver.find_element_by_link_text('账号登录').click()

            wait.until(EC.presence_of_element_located(
                (By.XPATH, '//input[@name="username"]')))

            driver.find_element_by_xpath('//input[@name="username"]').clear()
            driver.find_element_by_xpath('//input[@name="password"]').clear()
            driver.find_element_by_xpath(
                '//input[@name="username"]').send_keys(username)
            driver.find_element_by_xpath(
                '//input[@name="password"]').send_keys(password)

            driver.find_element_by_xpath('//div[@class="item_btn"]').click()
            # Fixed pause after submitting the form.
            time.sleep(3)
        except Exception:
            # Do not leak the browser when the login flow fails.
            driver.quit()
            raise
        return driver
예제 #3
0
    def get_user_activity_info(self, user_id):
        """Crawl a Xueqiu user's status pages and store every entry found.

        Opens ``http://xueqiu.com/<user_id>``, then walks the status list page
        by page, calling ``to_mysql()`` on each parsed entry. The walk stops
        when the last entry of a page is older than the newest publish time
        already stored, when it is older than one year, when clicking the
        next page fails, or after five pages. The driver is always quit.

        :param user_id: user identifier appended to the Xueqiu base URL.
        :return: None
        """
        url = 'http://xueqiu.com/{}'.format(user_id)
        print(url)
        driver = get_web_driver(url, has_proxy=False)
        try:
            max_window(driver)

            # Total page count of the user's original posts.
            soup = BeautifulSoup(driver.page_source, 'html5lib')
            page_count = self._get_page_count(soup, 'statusLists')

            # Publish time of the newest article already in the database.
            publish_time_lastest = self._get_lastest_publish_time(mysql_table_xueqiu_article, user_id)

            # Same moment one year ago: swap only the leading four-digit year
            # so digits later in the string are never touched.
            nowDate = GetNowTime2()
            last_year = str(int(nowDate[:4]) - 1) + nowDate[4:]

            # Walk the article list page by page.
            current_page = 1
            while current_page < page_count + 1:
                print("Page:%d / %d" % (current_page, page_count))

                archiveList = self._get_archive_list_in_one_page(soup, user_id)

                # Store every entry without checking for existing rows first.
                # NOTE(review): presumably to_mysql skips duplicates itself;
                # verify it does not raise out of this loop.
                for archive in archiveList:
                    archive.to_mysql()

                if archiveList:
                    archive = archiveList[-1]

                    # Stop once the page reaches entries already stored.
                    # NOTE(review): this compares strings, so both values
                    # must share one sortable format - confirm.
                    if archive.publish_time < str(publish_time_lastest):
                        print(encode_wrap('雪球: 已经是最新的动态了'))
                        break

                    # Stop once entries are more than one year old.
                    if archive.publish_time < last_year:
                        break

                # Click through to the next page.
                clickStatus = self._click_next_page(driver,'//ul[@class="status-list"]', current_page+1)
                if clickStatus:
                    soup = BeautifulSoup(driver.page_source, 'html5lib')
                    current_page += 1
                    wait_time = self._get_wait_time()
                    time.sleep(wait_time)
                    print('Page:{}   Wait time:{}'.format(current_page, wait_time))
                else:
                    print(encode_wrap('点击下一页出错, 退出...'))
                    break

                # Hard cap: at most five pages are processed per call.
                if current_page > 5:
                    break
        finally:
            # Always release the browser, even when crawling fails midway.
            driver.quit()
예제 #4
0
    def __init__(self):
        """Set the Taobao search configuration and start a web driver."""
        BaseVideo.__init__(self)

        self.engine, self.site = '淘宝', 'taobao'
        self.album_url = ''  # album URL
        # URL template for an ordinary search; {key} and {page} are the
        # placeholders it contains.
        self.general_url = 'https://s.taobao.com/search?q={key}&s={page}'
        self.filePath = 'taobao'

        # Duration category index -> text of the matching button on the page.
        duration_labels = ['全部', '10分钟以下', '10-30分钟', '30-60分钟', '60分钟以上']
        self.timelengthDict = dict(enumerate(duration_labels))

        self.driver = get_web_driver(has_proxy=False)
예제 #5
0
        def get_cookie_from_network(username='18410182275', password='12356789'):
            """Log in with a fresh web driver and persist the session cookies.

            The full cookie list is pickled to ``total_weibo.cookie`` and each
            cookie to ``<name>.weibo`` under ``self.dir_temp``. The driver is
            always quit, even when the login flow fails.

            :param username: text typed into the ``username`` input.
            :param password: text typed into the ``password`` input.
            :return: dict mapping cookie name to cookie value.
            """
            # NOTE(review): the default credentials are hard-coded in source;
            # consider loading them from configuration instead.
            driver = get_web_driver(self.general_url, has_proxy=False)
            try:
                # Poll every 0.5s, for up to 10s, for each element to appear.
                wait = WebDriverWait(driver, 10, 0.5)

                wait.until(
                    EC.presence_of_element_located((By.LINK_TEXT, '登录')))
                driver.find_element_by_link_text('登录').click()

                wait.until(
                    EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
                driver.find_element_by_link_text('账号登录').click()

                wait.until(EC.presence_of_element_located(
                    (By.XPATH, '//input[@name="username"]')))

                driver.find_element_by_xpath('//input[@name="username"]').clear()
                driver.find_element_by_xpath('//input[@name="password"]').clear()
                driver.find_element_by_xpath(
                    '//input[@name="username"]').send_keys(username)
                driver.find_element_by_xpath(
                    '//input[@name="password"]').send_keys(password)

                driver.find_element_by_xpath('//div[@class="item_btn"]').click()
                # Fixed pause after submitting the form.
                time.sleep(3)

                # Read the cookies of the current session.
                cookie_list = driver.get_cookies()
            finally:
                # The browser is no longer needed once the cookies are read.
                driver.quit()

            # NOTE(review): this prints session cookies to stdout.
            print(cookie_list)

            # Whole list in one file, for use with a web driver. Binary mode
            # keeps the pickle portable across platforms and Python versions.
            with open(self.dir_temp + 'total_weibo.cookie', 'wb') as f:
                pickle.dump(cookie_list, f)

            # One file per cookie plus a name -> value dict, for requests.
            cookie_dict = {}
            for cookie in cookie_list:
                with open(self.dir_temp + cookie['name'] + '.weibo', 'wb') as f:
                    pickle.dump(cookie, f)

                if 'name' in cookie and 'value' in cookie:
                    cookie_dict[cookie['name']] = cookie['value']
            return cookie_dict
예제 #6
0
    def __init__(self):
        """Configure the Taobao search settings and launch a web driver."""
        BaseVideo.__init__(self)

        self.engine, self.site = '淘宝', 'taobao'
        self.album_url = ''  # album URL
        # Ordinary search URL template with {key} and {page} placeholders.
        self.general_url = 'https://s.taobao.com/search?q={key}&s={page}'
        self.filePath = 'taobao'

        # Duration category index -> text of the matching button on the page.
        button_texts = ('全部', '10分钟以下', '10-30分钟', '30-60分钟', '60分钟以上')
        self.timelengthDict = dict(zip(range(len(button_texts)), button_texts))

        self.driver = get_web_driver(has_proxy=False)
예제 #7
0
        def get_cookie_from_network(username='18410182275', password='12356789'):
            """Log in with a fresh web driver and persist the session cookies.

            The full cookie list is pickled to ``total_weibo.cookie`` and each
            cookie to ``<name>.weibo`` under ``self.dir_temp``. The driver is
            always quit, even when the login flow fails.

            :param username: text typed into the ``username`` input.
            :param password: text typed into the ``password`` input.
            :return: dict mapping cookie name to cookie value.
            """
            # NOTE(review): the default credentials are hard-coded in source;
            # consider loading them from configuration instead.
            driver = get_web_driver(self.general_url, has_proxy=False)
            try:
                # Poll every 0.5s, for up to 10s, for each element to appear.
                wait = WebDriverWait(driver, 10, 0.5)

                wait.until(EC.presence_of_element_located((By.LINK_TEXT, '登录')))
                driver.find_element_by_link_text('登录').click()

                wait.until(EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
                driver.find_element_by_link_text('账号登录').click()

                wait.until(EC.presence_of_element_located((By.XPATH, '//input[@name="username"]')))

                driver.find_element_by_xpath('//input[@name="username"]').clear()
                driver.find_element_by_xpath('//input[@name="password"]').clear()
                driver.find_element_by_xpath('//input[@name="username"]').send_keys(username)
                driver.find_element_by_xpath('//input[@name="password"]').send_keys(password)

                driver.find_element_by_xpath('//div[@class="item_btn"]').click()
                # Fixed pause after submitting the form.
                time.sleep(3)

                # Read the cookies of the current session.
                cookie_list = driver.get_cookies()
            finally:
                # The browser is no longer needed once the cookies are read.
                driver.quit()

            # NOTE(review): this prints session cookies to stdout.
            print(cookie_list)

            # Whole list in one file, for use with a web driver. Binary mode
            # keeps the pickle portable across platforms and Python versions.
            with open(self.dir_temp + 'total_weibo.cookie', 'wb') as f:
                pickle.dump(cookie_list, f)

            # One file per cookie plus a name -> value dict, for requests.
            cookie_dict = {}
            for cookie in cookie_list:
                with open(self.dir_temp + cookie['name'] + '.weibo', 'wb') as f:
                    pickle.dump(cookie, f)

                if 'name' in cookie and 'value' in cookie:
                    cookie_dict[cookie['name']] = cookie['value']
            return cookie_dict
예제 #8
0
    def get_logined_webdriver(self, username='18410182275', password='12356789'):
        """Open ``self.general_url``, log in, and return the web driver.

        Clicks the "登录" link, then "账号登录", fills in the username and
        password inputs, clicks the ``div.item_btn`` element and sleeps three
        seconds. If any step raises, the driver is quit before the exception
        propagates.

        :param username: text typed into the ``username`` input.
        :param password: text typed into the ``password`` input.
        :return: the web driver after the login form was submitted; the
            caller is responsible for quitting it.
        """
        # NOTE(review): the default credentials are hard-coded in source;
        # consider loading them from configuration instead.
        driver = get_web_driver(self.general_url, has_proxy=False)
        try:
            # Poll every 0.5s, for up to 10s, for each element to appear.
            wait = WebDriverWait(driver, 10, 0.5)

            wait.until(EC.presence_of_element_located((By.LINK_TEXT, '登录')))
            driver.find_element_by_link_text('登录').click()

            wait.until(EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
            driver.find_element_by_link_text('账号登录').click()

            wait.until(EC.presence_of_element_located((By.XPATH, '//input[@name="username"]')))

            driver.find_element_by_xpath('//input[@name="username"]').clear()
            driver.find_element_by_xpath('//input[@name="password"]').clear()
            driver.find_element_by_xpath('//input[@name="username"]').send_keys(username)
            driver.find_element_by_xpath('//input[@name="password"]').send_keys(password)

            driver.find_element_by_xpath('//div[@class="item_btn"]').click()
            # Fixed pause after submitting the form.
            time.sleep(3)
        except Exception:
            # Do not leak the browser when the login flow fails.
            driver.quit()
            raise
        return driver
예제 #9
0
    def get_new_driver(self, username="cbb6150", password="xx.785906"):
        """Log in through the login page, save the cookies, return the driver.

        Fills the text and password inputs, clicks the submit input, then
        pickles ``driver.get_cookies()`` to ``self.dir_temp + "cookie"``. If
        any step raises, the driver is quit before the exception propagates.

        :param username: text typed into the ``type="text"`` input.
        :param password: text typed into the ``type="password"`` input.
        :return: the web driver after the form was submitted; the caller is
            responsible for quitting it.
        """
        # NOTE(review): the default credentials are hard-coded in source;
        # consider loading them from configuration instead.
        url_login = "******"
        driver = get_web_driver(url_login, has_proxy=False)
        try:
            driver.find_element_by_xpath('//input[@type="text"]').send_keys(username)
            driver.find_element_by_xpath('//input[@type="password"]').send_keys(password)

            driver.find_element_by_xpath('//input[@type="submit"]').click()

            # NOTE(review): cookies are read right after the click with no
            # wait; confirm the login has completed by this point.
            cookie = driver.get_cookies()

            print(cookie)
            print(len(cookie))

            # The context manager closes the file. Binary mode keeps the
            # pickle portable across platforms and Python versions.
            with open(self.dir_temp + "cookie", "wb") as f:
                pickle.dump(cookie, f)
        except Exception:
            # Do not leak the browser when login or saving cookies fails.
            driver.quit()
            raise

        return driver