Example #1
def getBrowser(self):
    # Launch Firefox and open the start URL; log a failure instead of crashing.
    browser = webdriver.Firefox()
    try:
        browser.get(self.startUrl)
    except WebDriverException:
        mylog.info('open the %s failed' % self.startUrl)
    browser.implicitly_wait(20)
    return browser
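
The method above is clearly cut out of a crawler class and leans on names the snippet never defines: Selenium's webdriver, a module-level mylog logger, and a self.startUrl attribute. A minimal sketch of that assumed scaffolding (the class name and logger setup are guesses, not from the source):

import logging

from selenium import webdriver
from selenium.common.exceptions import WebDriverException

logging.basicConfig(level=logging.INFO)
mylog = logging.getLogger(__name__)  # stand-in for the logger the method calls

class Crawler:
    def __init__(self, start_url):
        self.startUrl = start_url  # attribute getBrowser() reads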
Example #2
class TestTime(object):
    def __init__(self):
        self.log = MyLog()
        self.testTime()
        self.testLocaltime()
        self.testSleep()
        self.testStrftime()

    def testTime(self):
        self.log.info('Start testing the time.time() function')
        print('Current timestamp: time.time()=%f' % time.time())
        print('The return value is a float: the number of seconds elapsed since the 1970 epoch')
        print('\n')

    def testLocaltime(self):
        self.log.info('Start testing the time.localtime() function')
        print('Current local time: time.localtime()=%s' % time.localtime())
        print('The return value is a struct_time tuple')
        print('\n')

    def testSleep(self):
        self.log.info('Start testing the time.sleep() function')
        print('This is a timer: time.sleep(5)')
        print('Just close your eyes and count to five')
        time.sleep(5)
        print('\n')

    def testStrftime(self):
        self.log.info('Start testing the time.strftime() function')
        print('This function returns a formatted time string')
        print('time.strftime("%%Y-%%m-%%d %%X", time.localtime())=%s' %
              time.strftime("%Y-%m-%d %X", time.localtime()))
        print('\n')
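
The class uses the time module and a MyLog helper, neither of which the listing imports or defines. A minimal stand-in for MyLog built on the standard logging module (an assumption; the real MyLog is not shown), plus the missing import and a usage line:

import time
import logging

class MyLog:
    # Hypothetical stand-in for the MyLog helper used throughout these examples.
    def __init__(self):
        logging.basicConfig(level=logging.INFO)
        self._logger = logging.getLogger(self.__class__.__name__)

    def info(self, msg):
        self._logger.info(msg)

    def error(self, msg):
        self._logger.error(msg)

if __name__ == '__main__':
    TestTime()  # runs all four tests on construction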
Example #3
class GetData(object):
    def __init__(self):
        self.url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
        self.log = MyLog()
        self.items = self.spider()
        self.pipelines()

    def get_response(self):
        # Fetch the page through a random proxy, retrying until one works
        flag = True
        ua = UserAgent()
        while flag:
            with open('new3proxy.txt', 'r') as fp:
                lines = fp.readlines()
                index = random.randint(1, len(lines))
                # strip the trailing newline, or the proxy URL is malformed
                proxys = 'https://' + lines[index - 1].strip()

            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(self.url, headers=fakeHeaders)

            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)

            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info('URL opened: success')
                return response
            except (HTTPError, URLError):
                flag = True
                self.log.error('URL open: failed')

    def spider(self):
        # Extract the data
        items = []
        response = self.get_response()
        soup = BeautifulSoup(response.read(), 'html.parser')
        datas = soup.find('div', {'class': 'news-text'}).find_all('tr')
        for data in datas[1:5]:
            item = Item()
            cells = data.find_all('td')
            item.paihang = cells[0].text   # rank
            item.name = cells[1].text      # university name
            item.address = cells[2].text   # province/region
            item.score = cells[3].text     # total score
            items.append(item)
            self.log.info('Fetched info for %s: success' % item.name)
        return items

    def pipelines(self):
        # Clean the data and save it
        filename = 'daxuedata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%d \t %s \t %s \t %.f \n' % (
                    int(item.paihang), item.name, item.address, float(item.score)))
                self.log.info('%s saved to %s: success' % (item.name, filename))
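
The scraper assumes several imports and an Item container that the listing omits; the HTTPError/URLError pair used in the retry loop lives in urllib.error. A sketch of the missing pieces (Item's fields are inferred from what spider() assigns, so the class here is a hypothetical stand-in):

import codecs
import random
import urllib.request
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup
from fake_useragent import UserAgent  # provides the ua.random header value

class Item:
    # Hypothetical container inferred from the attributes spider() sets.
    paihang = None  # rank
    name = None     # university name
    address = None  # province/region
    score = None    # total score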
Example #4
class GetData(object):
    def __init__(self):
        # keyword is the UTF-8 byte escape of '街头篮球' (street basketball)
        self.url = 'https://www.toutiao.com/search/?keyword=\xe8\xa1\x97\xe5\xa4\xb4\xe7\xaf\xae\xe7\x90\x83'
        self.log = MyLog()
        self.items = self.spider()
        self.pipelines()

    def get_html(self):
        driver = webdriver.PhantomJS()  # deprecated; see the note after this example
        driver.get(self.url)
        driver.implicitly_wait(10)
        submitelement = driver.find_element_by_xpath('//div[@class="tabBar"]//li[@class="y-left tab-item "]')
        submitelement.click()
        time.sleep(5)
        pageSource = driver.page_source
        self.log.info('page source fetched: success')
        return pageSource
    
    def spider(self):
        items = []
        pageSource = self.get_html()
        try:
            soup = BeautifulSoup(pageSource, 'html.parser')
            datas = soup.find_all('div', {'class': 'articleCard'})
            for data in datas:
                item = Item()
                try:
                    item.image_url = data.find('a', {'class': 'img-wrap'}).find('img', {'alt': ''})['src']
                    items.append(item)
                except KeyError:
                    pass  # image tag without a src attribute; skip it
                self.log.info('Fetched info: success')
        except AttributeError:
            self.log.info('page missing expected structure')
        return items
    
    def pipelines(self):
        # Save each image into a directory named after the search keyword
        dirname = '街头篮球1'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        os.chdir(dirname)
        for i, item in enumerate(self.items, 1):
            with open(str(i) + '.jpg', 'wb') as fp:
                pic = requests.get(item.image_url)
                fp.write(pic.content)
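
PhantomJS support and the find_element_by_xpath method were both removed in Selenium 4, so this example only runs on older Selenium releases. One possible modernization, together with the imports the snippet needs (the headless-Firefox swap is an assumption, not from the source):

import os
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.FirefoxOptions()
options.add_argument('-headless')  # headless Firefox instead of PhantomJS
driver = webdriver.Firefox(options=options)
# Selenium 4 spelling of the XPath lookup used in get_html():
submitelement = driver.find_element(By.XPATH, '//div[@class="tabBar"]//li[@class="y-left tab-item "]')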
Example #5
class GetData(object):
    def __init__(self):
        self.url = 'https://movie.douban.com/subject/26266893/reviews?start='
        self.log = MyLog()
        self.urls = self.get_urls()
        self.items = self.spider()
        self.pipelines()

    def get_urls(self):
        # Build the paginated review URLs, 20 reviews per page
        pages = 60
        urls = []
        for i in range(0, pages, 20):
            url = self.url + str(i)
            urls.append(url)
            self.log.info('URL queued: success')
        return urls

    def get_response(self, url):
        # Fetch one page through a random proxy, retrying until one works
        flag = True
        ua = UserAgent()
        while flag:
            with open('new4proxy.txt', 'r') as fp:
                lines = fp.readlines()
                index = random.randint(1, len(lines))
                # strip the trailing newline, or the proxy URL is malformed
                proxys = 'https://' + lines[index - 1].strip()

            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(url, headers=fakeHeaders)

            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)

            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info('URL opened: success')
                return response
            except (HTTPError, URLError):
                flag = True
                self.log.error('URL open: failed')

    def spider(self):
        items = []
        for url in self.urls:
            response = self.get_response(url)
            try:
                item = Item()
                soup = BeautifulSoup(response.read(), 'html.parser')
                item.name = soup.find('a', {'class': 'name'}).text
                item.content = soup.find('div', {
                    'class': 'short-content'
                }).text
                items.append(item)
                self.log.info('Fetched info for %s: success' % item.name)
            except AttributeError:
                self.log.info('page missing expected structure')
        return items

    def pipelines(self):
        filename = 'newdata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%s \t %s \n' % (item.name, item.content))
                self.log.info('%s saved to %s: success' % (item.name, filename))
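
As in Example #3, the listing omits the imports and the Item container; here spider() only ever sets name and content. A sketch of the assumed missing pieces, with a usage line:

import codecs
import random
import urllib.request
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

class Item:
    # Hypothetical container inferred from the attributes spider() sets.
    name = None     # reviewer name
    content = None  # review snippet

if __name__ == '__main__':
    GetData()  # crawls, scrapes, and writes newdata.txt on construction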