def getBrowser(self):
    # Launch a Firefox browser through Selenium and open the start URL.
    browser = webdriver.Firefox()
    try:
        browser.get(self.startUrl)
    except Exception:
        mylog.info('open the %s failed' % self.startUrl)
    # Poll up to 20 seconds for elements looked up later on this browser.
    browser.implicitly_wait(20)
    return browser
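# A hedged companion sketch to getBrowser above: the same open-and-wait flow as a
# standalone function, but with an explicit WebDriverWait instead of implicitly_wait
# so the caller decides what "page ready" means. The name build_browser and its
# parameters are illustrative, not from the original code.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def build_browser(start_url, timeout=20):
    browser = webdriver.Firefox()
    try:
        browser.get(start_url)
        # Block until the <body> element is present, or raise after `timeout` seconds.
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body')))
    except Exception as exc:
        print('open the %s failed: %s' % (start_url, exc))
    return browser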
class TestTime(object):
    """Walk through the most common functions of the time module."""

    def __init__(self):
        self.log = MyLog()
        self.testTime()
        self.testLocaltime()
        self.testSleep()
        self.testStrftime()

    def testTime(self):
        self.log.info(u'start testing time.time()')
        print(u'current timestamp: time.time()=%f' % time.time())
        print(u'this returns a float: the number of seconds elapsed since the 1970 epoch')
        print('\n')

    def testLocaltime(self):
        self.log.info(u'start testing time.localtime()')
        # str() is needed: struct_time is a tuple subclass, so passing it to %s directly raises TypeError.
        print(u'current local time: time.localtime()=%s' % str(time.localtime()))
        print(u'this returns a struct_time tuple')
        print('\n')

    def testSleep(self):
        self.log.info(u'start testing time.sleep()')
        print(u'this is a timer: time.sleep(5)')
        print(u'close your eyes and count to five')
        time.sleep(5)
        print('\n')

    def testStrftime(self):
        self.log.info(u'start testing time.strftime()')
        print(u'this function returns a formatted time string')
        print(u'time.strftime("%%Y-%%m-%%d %%X", time.localtime())=%s'
              % time.strftime('%Y-%m-%d %X', time.localtime()))
        print('\n')
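# A small companion sketch (not part of the TestTime class above): round-tripping a
# strftime-formatted string back to a timestamp with time.strptime and time.mktime,
# tying together the functions the class demonstrates.
import time

stamp = time.time()                                    # float seconds since the 1970 epoch
text = time.strftime('%Y-%m-%d %X', time.localtime(stamp))
parsed = time.mktime(time.strptime(text, '%Y-%m-%d %X'))
print(text, int(parsed) == int(stamp))                 # the two agree to the second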
class GetData(object):

    def __init__(self):
        self.url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
        self.log = MyLog()
        self.items = self.spider()
        self.pipelines()

    def get_response(self):
        # Fetch the page through a random https proxy read from new3proxy.txt,
        # retrying with a new proxy until the request succeeds.
        flag = True
        ua = UserAgent()
        while flag:
            with open('new3proxy.txt', 'r') as fp:
                lines = fp.readlines()
            index = random.randint(1, len(lines))
            proxys = 'https://' + lines[index - 1].strip()
            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(self.url, headers=fakeHeaders)
            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info(u'load URL: success')
                return response
            except Exception:
                flag = True
                self.log.error(u'load URL: failed')

    def spider(self):
        # Extract rank, name, location and score from the ranking table.
        items = []
        response = self.get_response()
        soup = BeautifulSoup(response.read(), 'html.parser')
        datas = soup.find('div', {'class': 'news-text'}).find_all('tr')
        for data in datas[1:5]:          # skip the header row, keep four rows
            item = Item()
            item.paihang = data.find_all('td')[0].text
            item.name = data.find_all('td')[1].text
            item.address = data.find_all('td')[2].text
            item.score = data.find_all('td')[3].text
            items.append(item)
            self.log.info(u'fetched %s: success' % item.name)
        return items

    def pipelines(self):
        # Clean the scraped fields and save them as a tab-separated text file.
        filename = 'daxuedata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%d \t %s \t %s \t %.1f \n' % (
                    int(item.paihang), item.name, item.address, float(item.score)))
                self.log.info(u'%s saved to %s: success' % (item.name, filename))
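# A hedged alternative sketch of get_response above using requests instead of urllib.
# It assumes the same new3proxy.txt format (one host:port per line) and the
# fake_useragent package already used above; the function name fetch is illustrative.
import random
import requests
from fake_useragent import UserAgent

def fetch(url, proxy_file='new3proxy.txt', timeout=10):
    with open(proxy_file, 'r') as fp:
        proxies = [line.strip() for line in fp if line.strip()]
    proxy = 'https://' + random.choice(proxies)
    headers = {'User-Agent': UserAgent().random}
    # No retry loop here: a bad proxy simply raises and the caller picks again.
    return requests.get(url, headers=headers,
                        proxies={'https': proxy}, timeout=timeout)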
class GetData(object):

    def __init__(self):
        # Search page for the keyword 街头篮球 (streetball) on toutiao.com.
        self.url = 'https://www.toutiao.com/search/?keyword=街头篮球'
        self.log = MyLog()
        self.items = self.spider()
        self.pipelines()

    def get_html(self):
        # Render the search page with PhantomJS, switch to the image tab and
        # return the resulting page source.
        driver = webdriver.PhantomJS()
        driver.get(self.url)
        driver.implicitly_wait(10)
        submitelement = driver.find_element_by_xpath(
            '//div[@class="tabBar"]//li[@class="y-left tab-item "]')
        submitelement.click()
        time.sleep(5)
        pageSource = driver.page_source
        self.log.info(u'successful')
        return pageSource

    def spider(self):
        # Collect the image URL of every article card on the rendered page.
        items = []
        pageSource = self.get_html()
        try:
            soup = BeautifulSoup(pageSource, 'html.parser')
            datas = soup.find_all('div', {'class': 'articleCard'})
            for data in datas:
                item = Item()
                try:
                    item.image_url = data.find('a', {'class': 'img-wrap'}).find(
                        'img', {'alt': ''})['src']
                    items.append(item)
                except KeyError:
                    pass
            self.log.info(u'fetched image URLs: success')
        except AttributeError:
            self.log.info(u'url None')
        return items

    def pipelines(self):
        # Download every collected image into the 街头篮球1 directory.
        filename = '街头篮球1'
        if not os.path.exists(filename):
            os.mkdir(filename)
        os.chdir(filename)
        i = 1
        for item in self.items:
            with open(str(i) + '.jpg', 'wb') as fp:
                i += 1
                pic = requests.get(item.image_url)
                fp.write(pic.content)
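# PhantomJS has been deprecated and newer Selenium releases drop support for it; a
# hedged sketch of the same page-source grab as get_html above using headless Chrome
# instead. The XPath is copied from the original; a matching chromedriver on PATH is
# an assumption about the environment.
import time
from selenium import webdriver

def get_html_headless(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        driver.find_element_by_xpath(
            '//div[@class="tabBar"]//li[@class="y-left tab-item "]').click()
        time.sleep(5)                     # give the image tab time to render
        return driver.page_source
    finally:
        driver.quit()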
class GetData(object):

    def __init__(self):
        self.url = 'https://movie.douban.com/subject/26266893/reviews?start='
        self.log = MyLog()
        self.urls = self.get_urls()
        self.items = self.spider()
        self.pipelines()

    def get_urls(self):
        # Build the paginated review URLs; Douban lists 20 reviews per page.
        pages = 60
        urls = []
        for i in range(0, pages, 20):
            url = self.url + str(i)
            urls.append(url)
        self.log.info(u'built URL list: success')
        return urls

    def get_response(self, url):
        # Fetch one page through a random https proxy read from new4proxy.txt,
        # retrying with a new proxy until the request succeeds.
        flag = True
        ua = UserAgent()
        while flag:
            with open('new4proxy.txt', 'r') as fp:
                lines = fp.readlines()
            index = random.randint(1, len(lines))
            proxys = 'https://' + lines[index - 1].strip()
            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(url, headers=fakeHeaders)
            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info(u'load URL: success')
                return response
            except (HTTPError, URLError):
                flag = True
                self.log.error(u'load URL: failed')

    def spider(self):
        # Keep the first reviewer name and review snippet found on each page.
        items = []
        for url in self.urls:
            response = self.get_response(url)
            try:
                item = Item()
                soup = BeautifulSoup(response.read(), 'html.parser')
                item.name = soup.find('a', {'class': 'name'}).text
                item.content = soup.find('div', {'class': 'short-content'}).text
                items.append(item)
                self.log.info(u'fetched %s: success' % item.name)
            except AttributeError:
                self.log.info(u'url None')
        return items

    def pipelines(self):
        # Save the reviews as a tab-separated text file.
        filename = 'newdata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%s \t %s \n' % (item.name, item.content))
                self.log.info(u'%s saved to %s: success' % (item.name, filename))
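# A hedged alternative to pipelines() above: writing the scraped reviews as CSV with
# the standard csv module instead of a tab-separated text file. The items argument is
# assumed to be the list of Item objects produced by spider(); the function name
# save_as_csv is illustrative.
import csv

def save_as_csv(items, filename='newdata.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        writer.writerow(['name', 'content'])          # header row
        for item in items:
            writer.writerow([item.name, item.content.strip()])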