def get_one_weixin_gzh(self, weixin_id, weixin_name, url):
    """Crawl one WeChat official-account listing page: collect the article
    list, fetch each article's detail page, and persist everything to MySQL.

    :param weixin_id: account id, stored as ``Article.user_id``
    :param weixin_name: account display name, stored as ``Article.user_name``
    :param url: listing page for this account's articles
    """

    def _get_article_detail(driver, article):
        """Fill ``article.href`` (final URL) and ``article.detail`` (markdown)."""
        driver.get(article.href)
        # The listing link redirects; keep the canonical URL.
        article.href = driver.current_url
        soup = bs(driver.page_source, 'lxml')
        # The trailing space in the class name matches the site's markup.
        detail_div = soup.find('div', {'class': 'rich_media_content '})
        if detail_div:
            # Store the article body as markdown rather than raw text.
            article.detail = html2text.html2text(detail_div.prettify())
        # Random 3-6 s pause between article fetches to avoid rate limiting.
        stop = 3 * (1 + random.random())
        time.sleep(stop)
        print(article.href)
        print('sleep:{:.2f}s\n'.format(stop))

    driver = wh.get_web_driver(url)
    try:
        # Click "查看更多" (load more); anonymous users see at most 100
        # articles.  NOTE(review): the original comment said "click 10
        # times" but range(0, 1) clicks only once -- behavior kept as-is.
        try:
            for _ in range(0, 1):
                driver.find_element_by_link_text('查看更多').click()
                time.sleep(3)
        except Exception:
            pass  # the link is absent when there is nothing more to load

        soup = bs(driver.page_source, 'lxml')
        article_list = []
        wx_box = soup.find('div', {'id': 'wxbox'})
        for box in wx_box.find_all('div', 'txt-box'):
            a = box.find('a')
            if not a:
                continue
            article = Article()
            article.title = a.get_text()
            article.user_id = weixin_id
            article.user_name = weixin_name
            article.href = self.site + a['href']
            s_p = box.find('div', {'class': 's-p'})
            if s_p:
                # The 't' attribute carries the publish timestamp
                # (epoch seconds) -- convert to the project's time format.
                article.publish_time = cc.GetTime2(int(s_p['t']))
            print(article.user_name)
            article_list.append(article)

        for article in article_list:
            _get_article_detail(driver, article)
            article.to_mysql()
    finally:
        # BUG FIX: always release the browser, even if scraping fails
        # midway (the original leaked the driver on any exception).
        driver.quit()
def get_logined_webdriver(self, username='18410182275', password='12356789'):
    """Drive the site's login flow and return an authenticated webdriver.

    :param username: account name typed into the login form; the default
        keeps the previously hard-coded value for backward compatibility.
    :param password: account password.
        SECURITY NOTE(review): credentials should live in config/env, not
        in source code -- parameterized here so callers can supply them.
    :return: a logged-in selenium webdriver
    """
    driver = get_web_driver(self.general_url, has_proxy=False)
    try:
        wait = WebDriverWait(driver, 10, 0.5)
        # Open the login dialog.
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, '登录')))
        driver.find_element_by_link_text('登录').click()
        # Switch to account/password login.
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
        driver.find_element_by_link_text('账号登录').click()
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//input[@name="username"]')))
        driver.find_element_by_xpath('//input[@name="username"]').clear()
        driver.find_element_by_xpath('//input[@name="password"]').clear()
        driver.find_element_by_xpath(
            '//input[@name="username"]').send_keys(username)
        driver.find_element_by_xpath(
            '//input[@name="password"]').send_keys(password)
        driver.find_element_by_xpath('//div[@class="item_btn"]').click()
        time.sleep(3)  # give the site time to finish the login redirect
        return driver
    except Exception:
        # BUG FIX: don't leak the browser process when any wait/click in
        # the login flow fails (the original left it running).
        driver.quit()
        raise
def get_user_activity_info(self, user_id):
    """Crawl a xueqiu.com user's activity feed page by page, storing new
    posts to MySQL; stops at already-stored posts, posts older than one
    year, or after 5 pages.

    :param user_id: xueqiu user id used to build the profile URL
    """
    url = 'http://xueqiu.com/{}'.format(user_id)
    print(url)
    driver = get_web_driver(url, has_proxy=False)
    try:
        max_window(driver)
        # Total number of pages in the user's original-post list.
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        page_count = self._get_page_count(soup, 'statusLists')
        # Newest publish time already stored in MySQL for this user.
        publish_time_lastest = self._get_lastest_publish_time(
            mysql_table_xueqiu_article, user_id)
        current_page = 1
        while current_page <= page_count:
            print("Page:%d / %d" % (current_page, page_count))
            archiveList = self._get_archive_list_in_one_page(soup, user_id)
            # Insert unconditionally: to_mysql raises on duplicates and the
            # row is simply not inserted, so no existence check is needed.
            [archive.to_mysql() for archive in archiveList]
            if len(archiveList) > 0:
                archive = archiveList[-1]
                # Stop once we reach posts we already have.
                # NOTE(review): lexical string comparison of timestamps --
                # only correct while both sides share the same fixed-width
                # format; confirm against GetTime2/GetNowTime2 output.
                if archive.publish_time < str(publish_time_lastest):
                    print(encode_wrap('雪球: 已经是最新的动态了'))
                    break
                # Stop when posts are older than one year ("last year
                # today", built by swapping the year in today's date).
                nowDate = GetNowTime2()
                now_year = int(nowDate[:4])
                last_year = nowDate.replace(str(now_year), str(now_year - 1))
                if archive.publish_time < last_year:
                    break
            clickStatus = self._click_next_page(
                driver, '//ul[@class="status-list"]', current_page + 1)
            if clickStatus:
                soup = BeautifulSoup(driver.page_source, 'html5lib')
                current_page += 1
                # Randomized delay between pages to avoid rate limiting.
                wait_time = self._get_wait_time()
                time.sleep(wait_time)
                print('Page:{} Wait time:{}'.format(current_page, wait_time))
            else:
                print(encode_wrap('点击下一页出错, 退出...'))
                break
            # Hard cap: never crawl more than 5 pages per run.
            if current_page > 5:
                break
    finally:
        # BUG FIX: guarantee the browser is released even when scraping
        # raises (the original only quit on the normal path).
        driver.quit()
def __init__(self):
    """Configure the Taobao crawler on top of the BaseVideo defaults."""
    BaseVideo.__init__(self)
    self.engine = '淘宝'
    self.site = 'taobao'
    # URL templates: album search is unused here; the general search URL
    # takes a keyword and a result offset.
    self.album_url = ''
    self.general_url = 'https://s.taobao.com/search?q={key}&s={page}'
    self.filePath = 'taobao'
    # Duration-filter code -> button caption shown on the page.
    self.timelengthDict = {
        0: '全部',
        1: '10分钟以下',
        2: '10-30分钟',
        3: '30-60分钟',
        4: '60分钟以上',
    }
    self.driver = get_web_driver(has_proxy=False)
def get_cookie_from_network(self):
    """Log in through a fresh webdriver and persist the session cookies.

    Writes the full cookie list to ``<dir_temp>/total_weibo.cookie`` (for
    webdriver reuse) and each individual cookie to
    ``<dir_temp>/<name>.weibo`` (for requests).

    :return: dict mapping cookie name -> value, usable with requests
    """
    # BUG FIX: the original signature omitted ``self`` although the body
    # reads self.general_url / self.dir_temp, so every call failed.
    driver = get_web_driver(self.general_url, has_proxy=False)
    wait = WebDriverWait(driver, 10, 0.5)
    # Open the login dialog, then switch to account/password login.
    wait.until(EC.presence_of_element_located((By.LINK_TEXT, '登录')))
    driver.find_element_by_link_text('登录').click()
    wait.until(EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
    driver.find_element_by_link_text('账号登录').click()
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//input[@name="username"]')))
    driver.find_element_by_xpath('//input[@name="username"]').clear()
    driver.find_element_by_xpath('//input[@name="password"]').clear()
    # SECURITY NOTE(review): hard-coded credentials; move to config/env.
    driver.find_element_by_xpath(
        '//input[@name="username"]').send_keys('18410182275')
    driver.find_element_by_xpath(
        '//input[@name="password"]').send_keys('12356789')
    driver.find_element_by_xpath('//div[@class="item_btn"]').click()
    time.sleep(3)  # let the login redirect settle

    cookie_list = driver.get_cookies()
    print(cookie_list)
    # ``with`` guarantees the files close even on error (the original
    # leaked handles if pickle.dump raised).
    with open(self.dir_temp + 'total_weibo.cookie', 'w') as f:
        pickle.dump(cookie_list, f)
    cookie_dict = {}
    for cookie in cookie_list:
        # One pickle file per cookie, for requests-based sessions.
        with open(self.dir_temp + cookie['name'] + '.weibo', 'w') as f:
            pickle.dump(cookie, f)
        # ``in`` replaces dict.has_key (removed in Python 3).
        if 'name' in cookie and 'value' in cookie:
            cookie_dict[cookie['name']] = cookie['value']
    driver.quit()
    return cookie_dict
def __init__(self):
    """Set up the Taobao crawler: identity, URL templates, and webdriver."""
    BaseVideo.__init__(self)
    # Engine identity used throughout the project.
    self.engine = '淘宝'
    self.site = 'taobao'
    self.filePath = 'taobao'
    # General search URL; '{key}' is the keyword, '{page}' the offset.
    # The album URL is not used by this crawler.
    self.general_url = 'https://s.taobao.com/search?q={key}&s={page}'
    self.album_url = ''
    # Maps each duration-filter code to the button caption on the page.
    self.timelengthDict = {
        0: '全部',
        1: '10分钟以下',
        2: '10-30分钟',
        3: '30-60分钟',
        4: '60分钟以上',
    }
    self.driver = get_web_driver(has_proxy=False)
def get_cookie_from_network(self):
    """Perform a browser login and dump the resulting cookies to disk.

    Saves the complete cookie list to ``<dir_temp>/total_weibo.cookie``
    and each cookie separately to ``<dir_temp>/<name>.weibo``.

    :return: dict of cookie name -> value for use with requests
    """
    # BUG FIX: ``self`` was missing from the original signature even
    # though the body uses self.general_url and self.dir_temp, making the
    # method uncallable.
    driver = get_web_driver(self.general_url, has_proxy=False)
    wait = WebDriverWait(driver, 10, 0.5)
    wait.until(EC.presence_of_element_located((By.LINK_TEXT, '登录')))
    driver.find_element_by_link_text('登录').click()
    wait.until(EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
    driver.find_element_by_link_text('账号登录').click()
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//input[@name="username"]')))
    driver.find_element_by_xpath('//input[@name="username"]').clear()
    driver.find_element_by_xpath('//input[@name="password"]').clear()
    # SECURITY NOTE(review): credentials are hard-coded; load them from
    # configuration instead.
    driver.find_element_by_xpath(
        '//input[@name="username"]').send_keys('18410182275')
    driver.find_element_by_xpath(
        '//input[@name="password"]').send_keys('12356789')
    driver.find_element_by_xpath('//div[@class="item_btn"]').click()
    time.sleep(3)  # allow the post-login redirect to complete

    cookie_list = driver.get_cookies()
    print(cookie_list)
    # Context managers close the files even if pickling fails (the
    # original could leak open handles on error).
    with open(self.dir_temp + 'total_weibo.cookie', 'w') as f:
        pickle.dump(cookie_list, f)
    cookie_dict = {}
    for cookie in cookie_list:
        with open(self.dir_temp + cookie['name'] + '.weibo', 'w') as f:
            pickle.dump(cookie, f)
        # dict.has_key was removed in Python 3; ``in`` works in both.
        if 'name' in cookie and 'value' in cookie:
            cookie_dict[cookie['name']] = cookie['value']
    driver.quit()
    return cookie_dict
def get_logined_webdriver(self, username='18410182275', password='12356789'):
    """Return a webdriver that has completed the site's login flow.

    :param username: login account name; default preserves the previously
        hard-coded value so existing callers keep working.
    :param password: login password.
        SECURITY NOTE(review): hard-coded credential defaults should be
        replaced by config/env values.
    :return: the logged-in selenium webdriver
    """
    driver = get_web_driver(self.general_url, has_proxy=False)
    try:
        wait = WebDriverWait(driver, 10, 0.5)
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, '登录')))
        driver.find_element_by_link_text('登录').click()
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, '账号登录')))
        driver.find_element_by_link_text('账号登录').click()
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//input[@name="username"]')))
        driver.find_element_by_xpath('//input[@name="username"]').clear()
        driver.find_element_by_xpath('//input[@name="password"]').clear()
        driver.find_element_by_xpath(
            '//input[@name="username"]').send_keys(username)
        driver.find_element_by_xpath(
            '//input[@name="password"]').send_keys(password)
        driver.find_element_by_xpath('//div[@class="item_btn"]').click()
        time.sleep(3)  # wait for the login redirect to finish
        return driver
    except Exception:
        # BUG FIX: quit the browser if any step of the flow fails so the
        # process isn't leaked, then re-raise for the caller.
        driver.quit()
        raise
def get_new_driver(self):
    """Open the login page, submit credentials, cache the session cookies
    to ``<dir_temp>/cookie``, and return the logged-in webdriver.

    :return: a selenium webdriver with an authenticated session
    """
    url_login = "******"
    driver = get_web_driver(url_login, has_proxy=False)
    # SECURITY NOTE(review): hard-coded credentials; move to config/env.
    driver.find_element_by_xpath(
        '//input[@type="text"]').send_keys("cbb6150")
    driver.find_element_by_xpath(
        '//input[@type="password"]').send_keys("xx.785906")
    driver.find_element_by_xpath('//input[@type="submit"]').click()
    cookie = driver.get_cookies()
    print(cookie)
    print(len(cookie))
    # BUG FIX: the original opened this file and never closed it;
    # ``with`` guarantees the handle is released.
    with open(self.dir_temp + "cookie", "w") as f:
        pickle.dump(cookie, f)
    return driver