def get_user_activity_info(self, user_id):
    url = 'http://xueqiu.com/{}'.format(user_id)
    print url
    #r = get_requests(url, self.df_ip)
    driver = get_web_driver(url, has_proxy=False)
    max_window(driver)
    # Total page count of the user's original posts
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    page_count = self._get_page_count(soup, 'statusLists')
    # Latest article publish time already recorded in the database
    publish_time_latest = self._get_lastest_publish_time(mysql_table_xueqiu_article, user_id)
    # Walk the article list page by page
    current_page = 1
    while current_page < page_count + 1:
        print "Page:%d / %d" % (current_page, page_count)
        archive_list = self._get_archive_list_in_one_page(soup, user_id)
        # Save to MySQL; no duplicate check needed -- an existing row
        # raises an exception and the insert is skipped
        for archive in archive_list:
            archive.to_mysql()
        if len(archive_list) > 0:
            archive = archive_list[-1]
            # Stop once we reach posts no newer than the latest one stored
            #d1 = str_to_datatime(archive.publish_time)
            #d2 = str_to_datatime(str(publish_time_latest))
            #if d1 < d2:
            if archive.publish_time < str(publish_time_latest):
                print encode_wrap('Xueqiu: timeline already up to date')
                break
            # Stop if the post was published more than a year ago
            now_date = GetNowTime2()
            now_year = int(now_date[:4])
            last_year = now_date.replace(str(now_year), str(now_year - 1))  # this day last year
            if archive.publish_time < last_year:
                break
        # Click "next page"
        click_status = self._click_next_page(driver, '//ul[@class="status-list"]', current_page + 1)
        if click_status:
            soup = BeautifulSoup(driver.page_source, 'html5lib')
            current_page += 1
            wait_time = self._get_wait_time()
            time.sleep(wait_time)
            print 'Page:{} Wait time:{}'.format(current_page, wait_time)
        else:
            print encode_wrap('Failed to click next page, exiting...')
            break
        # Hard cap: never crawl more than 5 pages per user
        if current_page > 5:
            break
    driver.quit()
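# A minimal sketch of the datetime-based comparison hinted at in the
# commented-out lines above. Assumptions: publish_time strings follow the
# 'YYYY-MM-DD HH:MM:SS' layout, and str_to_datetime is a hypothetical name
# (the commented-out code spells it 'str_to_datatime'). Lexicographic
# comparison of strings in that layout happens to match chronological
# order, which is why the plain string comparison above also works.
from datetime import datetime

def str_to_datetime(s):
    # Parse a timestamp string into a datetime for explicit comparison.
    return datetime.strptime(s, '%Y-%m-%d %H:%M:%S')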
def get_follow_list(self, id, list_type='friends_content'):
    try:
        if list_type == 'friends_content':
            search_time_column = 'follow_search_time'
        else:
            search_time_column = 'fans_search_time'
        # Skip users whose follow/fans list has already been crawled
        query = "select %s from %s where user_id='%s'" % (search_time_column, big_v_table_mysql, id)
        df_query = pd.read_sql_query(query, engine)
        if len(df_query) > 0:
            data = df_query.loc[0, search_time_column]
            if data is not None and len(data) > 0:
                print 'follow list already fetched (%s)' % id
                return
        self._check_if_need_update_ip()
        url = 'http://xueqiu.com/%s' % str(id)
        driver = self.get_web_driver(url)
        #driver.get(url)
        max_window(driver)
        # Simulate a click on the "follow"/"fans" tab
        driver.find_element_by_xpath('//a[@href="#{0}"]'.format(list_type)).click()
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        # Total page count of the follow list
        page_count = self._get_page_count(soup, list_type)
        follow_list = []
        current_page = 1
        while current_page < page_count + 1:
            print "Page:%d / %d" % (current_page, page_count)
            try:
                # Collect the users on this page (friends with follows > 1000)
                follow_list_one_page = self._get_fans_list_in_one_page(soup)
                follow_list.extend(follow_list_one_page)
                t0 = time.time()
                # Fetch big-V info for this page's users with a thread pool
                pool = ThreadPool(processes=10)
                pool.map(self.get_BigV_Info, follow_list_one_page)
                pool.close()
                pool.join()
                current_page += 1
                t1 = time.time()
                # Deduct the time already spent from the politeness delay
                wait_time = max(self._get_wait_time() - (t1 - t0), 0)
                time.sleep(wait_time)
                print 'Page:{} Wait time:{}'.format(current_page, wait_time)
                # Click "next page"
                click_status = self._click_next_page(driver, '//ul[@class="users-list"]', current_page)
                if click_status:
                    soup = BeautifulSoup(driver.page_source, 'html5lib')
                else:
                    print encode_wrap('No next page {0}, exiting...'.format(current_page))
                    break
            except Exception as e:
                print e
                break
        print 'fans count:', len(follow_list)
        driver.quit()
        if not len(follow_list):
            return []
        # for follow in follow_list:
        #     self.get_BigV_Info(follow)
        # Save to the database: big-V users found among the fans
        self._big_v_in_fans_to_sql(follow_list, id)
        # Mark this user's follow list as searched
        sql = 'update {0} set {1} = "{2}" where user_id = "{3}"'.format(
            big_v_table_mysql, search_time_column, GetNowTime(), id)
        engine.execute(sql)
        return follow_list
    except Exception as e:
        # Catch-all so a failure on one user does not abort the whole crawl
        print e
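# A minimal sketch of the _click_next_page helper both methods rely on.
# Assumptions: Xueqiu's pager renders the target page number as the link
# text, and the list XPath the original passes only scopes the lookup --
# this simplified standalone sketch omits that argument entirely.
from selenium.common.exceptions import NoSuchElementException

def click_next_page(driver, page_num):
    # Click the pager link labelled with the target page number;
    # report whether such a link existed.
    try:
        driver.find_element_by_link_text(str(page_num)).click()
        return True
    except NoSuchElementException:
        return False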