Example #1
 def html(self, href):
     html = request.get(href, 3)
     html_soup = BeautifulSoup(html.text, 'lxml')
     max_span = html_soup.find('div', class_="pagenavi").find_all('span')[-2].get_text()
     for page in range(1, int(max_span) + 1):
         page_url = href + '/' + str(page)
         self.img(page_url)
         time.sleep(3)
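
Every example in this listing calls request.get(url, timeout) on a custom download helper rather than on the requests library directly; the helper itself is never shown. A minimal sketch of what it might look like, assuming it simply wraps requests.get with a small retry (the Download class name and retry count below are assumptions, not the original source):

import time
import requests

class Download(object):
    def get(self, url, timeout, num_retries=3):
        # hypothetical helper: thin wrapper around requests.get with retries
        try:
            return requests.get(url, timeout=timeout)
        except requests.RequestException:
            if num_retries > 0:
                time.sleep(3)  # back off briefly before retrying
                return self.get(url, timeout, num_retries - 1)
            raise

request = Download()
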
Example #2
def start(start_url):
    start_html = request.get(start_url, 3)
    soup = BeautifulSoup(start_html.text, 'lxml')
    li_list = soup.find('div', {'class': 'all'}).find_all('a')
    for li in li_list:
        title = li.get_text()
        url = li['href']
        spider_queue.push(url, title)
Example #3
 def all_url(self, start_url):
     start_html = request.get(start_url, 3)
     soup = BeautifulSoup(start_html.text, 'lxml')
     li_list = soup.find('div', {'class': 'all'}).find_all('a')
     for li in li_list:
         title = li.get_text()
         print(u'Saving:', title)
         path = str(title).replace('?', '_')
         self.mkdir(path)  # call the mkdir function to create the folder
         os.chdir('/home/rising/图片/meizitu/' + path)  # switch into the matching directory
         href = li['href']
         self.html(href)
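
Example #3 (and several of the later examples) call a mkdir helper whose definition is not part of this listing. A minimal sketch, assuming it only creates the target folder when it is missing (the body below is an assumption):

import os

def mkdir(path):
    # hypothetical helper: create the folder if it does not exist yet
    path = path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False
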
Example #4
 def all_url(self, url):
     html = request.get(url, 3)  ## changed here (notice that self is gone?)
     all_a = BeautifulSoup(html.text,
                           'lxml').find('div', class_='all').find_all('a')
     for a in all_a[1:]:
         title = a.get_text()
         self.title = title
         print(u'Saving:', title)  ## add a little feedback, otherwise it's too dull
         path = str(title).replace("?", ' ')  ## one title contains a ? which Windows cannot use in a folder name, so replace it
         self.mkdir(path)
         os.chdir('E:\\图片\\mzitu\\' + path)
         href = a['href']
         self.url = href  ## save the page URL in self.url
         if self.meizitu_collection.find_one({'主题页面': href}):  ## check whether this topic is already in the database; if not, run the else branch, otherwise skip it
             print(u'This page has already been crawled')
         else:
             self.html(href)
Example #5
 def get(self, url, timeout=10, num_retries=5):
     print('Fetching:', url)
     UA = random.choice(self.user_agent)
     headers = {'User-Agent': UA}
     try:
         IP = str(random.choice(self.iplist)).strip()
         proxy = {'http': IP}
         return requests.get(url,
                             headers=headers,
                             proxies=proxy,
                             timeout=timeout)
     except Exception:
         if num_retries > 0:
             print('Fetch failed, retrying in 10s, attempts left:', num_retries)
             time.sleep(10)
             return self.get(url, timeout, num_retries - 1)
         else:
             print('The proxy has stopped working! Dropping the proxy')
             return request.get(url)
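
Example #5 retries by calling itself recursively, which works but consumes a stack frame per retry. A loop-based variant of the same idea (a sketch, assuming the same self.user_agent and self.iplist attributes and the same random/time/requests imports, not the original code):

 def get(self, url, timeout=10, num_retries=5):
     headers = {'User-Agent': random.choice(self.user_agent)}
     for attempt in range(num_retries):
         try:
             # pick a fresh proxy on every attempt
             proxy = {'http': str(random.choice(self.iplist)).strip()}
             return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
         except requests.RequestException:
             print('Fetch failed, retrying in 10s, attempts left:', num_retries - attempt - 1)
             time.sleep(10)
     # all proxied attempts failed: fall back to a direct request
     return requests.get(url, headers=headers, timeout=timeout)
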
Example #6
 def pageurl_crawler():
     while True:
         try:
             (url, name) = img_queue.pop()
             print(url)
         except KeyError:
             print('Queue is empty')
             break
         else:
             title = img_queue.pop_title_(url)
             path = str(title).replace('?', '')
             mkdir(path)
             os.chdir('C:\\Data\\' + path)
             html = request.get(url, 3)
             html_soup = BeautifulSoup(html.text, 'lxml')
             img = html_soup.find('div', id='i3').find('img')
             img_url = img['src']
             print(u'Got the image link')
             save(img_url, name)
             img_queue.complete(url)
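
Examples #2 and #6 depend on spider_queue/img_queue objects with push, pop, pop_title_ and complete methods that are not defined in this listing. An in-memory sketch of that interface (the real implementation is unknown and may well be backed by Redis or MongoDB; everything below is an assumption):

class ImgQueue(object):
    # hypothetical queue: pending url -> name pairs, titles kept after pop
    def __init__(self):
        self.pending = {}
        self.titles = {}
        self.done = set()

    def push(self, url, title):
        self.pending[url] = title
        self.titles[url] = title

    def pop(self):
        return self.pending.popitem()  # raises KeyError when empty, as Example #6 expects

    def pop_title_(self, url):
        return self.titles.get(url)

    def complete(self, url):
        self.done.add(url)
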
Example #7
 def all_url(self, url):
     html = request.get(url, 3)
     html.encoding = html.apparent_encoding
     Soup = BeautifulSoup(html.text, 'lxml')
     all_p = Soup.find('div', class_='box mtop').find_all('div', class_='p')
     for p in all_p:
         a = p.find('a')
         title = a.find('img').attrs["alt"]
         self.title = title
         print(u'Saving:', title)
         href = a['href']
         self.url = href
         if self.meizitu_collection.find_one({'主题页面': href}):
             print(u'This page has already been crawled')
         else:
             path0 = str(title).replace("?", "_")
             path = str(path0).replace(r'/', "-")
             self.mkdir(path)
             os.chdir("E:\meitulu\\" + path)
             self.html(href)
Example #8
    def save(self, url):
        html = self.get(url)
        soup = BeautifulSoup(html, 'lxml')
        tag_all = soup.find_all('div', class_='text')
        for tag in tag_all:
            img_url = tag.find('img')['src']

            if img_url.startswith('http:'):
                img_url = img_url[7:]  # strip the 'http://' scheme
            elif img_url.startswith('//'):
                img_url = img_url[2:]  # strip the protocol-relative '//'
            else:
                continue

            [filename, filetype] = img_url.split('/')[-1].split('.')
            with open(
                    '/media/wangs/Docs/pic/jiandan/{}.{}'.format(
                        filename, filetype), 'wb') as img:
                img.write(request.get('http://' + img_url, 3).content)
Example #9
 def img(self, page_url, max_span, page_num):  # extra parameters passed in from the caller
     img_html = request.get(page_url, 3)  ## changed here (removed self; 3 is the timeout argument)
     img_url = BeautifulSoup(img_html.text, 'lxml').find(
         'div', class_='main-image').find('img')['src']
     self.img_urls.append(img_url)  # every pass of for page in range(1, int(max_span) + 1) appends the fetched image URL to img_urls
     if int(max_span) == page_num:  # when max_span equals page_num this is the last image: download it and write the record to the database
         self.save(img_url)
         post = {  # build a dict
             '标题': self.title,
             '主题页面': self.url,
             '图片地址': self.img_urls,
             '获取时间': datetime.datetime.now()
         }
         self.meizitu_collection.save(post)  # write the post dict to the database
         print(u'Inserted into the database')
     else:  # max_span != page_num: just save the image
         self.save(img_url)
Example #10
 def img(self, page_url, max_span, page_num):
     img_html = request.get(page_url, 3)
     img_url = BeautifulSoup(img_html.text, 'lxml').find(
         'div', class_='main-image').find('img')['src']
     # append the image URL to the list on every pass
     self.img_urls.append(img_url)
     # when max_span equals page_num, this is the last image: download it one final time and save the record to the database
     print('img_url:', img_url)
     if int(max_span) == page_num:
         self.save(img_url)
         # build the dict to store in MongoDB
         post = {
             '标题': self.title,
             '主题页面': self.url,
             '图片地址': self.img_urls,
             '获取时间': datetime.datetime.now()
         }
         # write the record to MongoDB
         self.meizitu_collection.save(post)
         print(post, 'stored in MongoDB')
     else:
         self.save(img_url)
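
Examples #9 and #10 persist the record with self.meizitu_collection.save(post). In pymongo, Collection.save was deprecated in 3.0 and removed in 4.0, so on a current driver the equivalent write would be one of:

# plain insert of a new document
self.meizitu_collection.insert_one(post)

# or an upsert keyed on the topic page, closer to save()'s replace semantics
self.meizitu_collection.replace_one({'主题页面': self.url}, post, upsert=True)
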
Example #11
 def all_url(self, url):
     ## call the request function with the gallery URL; it returns a response
     html = request.get(url, 3)
     all_a = BeautifulSoup(html.text,
                           'lxml').find('div', class_='all').find_all('a')
     for a in all_a:
         title = a.get_text()
         self.title = title
         # one title contains a ? which Windows cannot use in a folder name, so replace it
         path = str(title).replace('?', '_')
         # call the mkdir function to create the folder; path here is the title
         self.mkdir(path)
         os.chdir("D:\\mzitu\\" + path)
         print('Saving:', title, 'path:', 'D:\\mzitu\\', path)
         href = a['href']
         # save the page URL in self.url
         self.url = href
         ## call the html function with href, the gallery address
         print('href:', href)
         if self.meizitu_collection.find_one({'主题页面': href}):
             print(href, 'has already been crawled')
         else:
             self.html(href)
Example #12
 def save(self, img_url):
     name = img_url[-9:-4]
     print(u'Saving', img_url)
     img = request.get(img_url, 3)
     with open(name + '.jpg', 'ab') as f:
         f.write(img.content)
Example #13
 def img(self, page_url):
     img_html = request.get(page_url, 3)
     img_soup = BeautifulSoup(img_html.text, 'lxml')
     img_url = img_soup.find('div', class_="main-image").find('img')['src']
     self.save(img_url)
Example #14
 def get(self, url):
     html = request.get(url, 3)
     return html.text
Example #15
 def request(self, url):
     return request.get(url, self.timeout)
Example #16
 def img(self, page_url):
     img_html = request.get(page_url, 3)
     # img_url = Download.get(self, page_url, 3)
     img_url = BeautifulSoup(img_html.text, 'lxml').find(
         'div', class_='main-image').find('img')['src']
     self.save(img_url)
Example #17
 def download_img(self, img_url, referer):
     name = img_url[-9:-4] + '.jpg'
     # img = self.request(img_url)
     img = request.get(img_url, 3, referer=referer)
     with open(name, 'wb') as f:
         f.write(img.content)
Example #18
 def get_img(self, page_url, referer):
     headers = request.build_headers(referer=referer)
     img_html = request.get(page_url, headers, 10)
     src = BeautifulSoup(img_html.text, 'lxml').find(
         'div', class_='main-image').find('img')['src']
     self.save_img(src, referer)
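
Example #18 calls request.build_headers(referer=referer), another helper the listing never defines. A plausible sketch, assuming it just merges a Referer into a default header set (the body and the User-Agent string below are placeholders, not the original source):

def build_headers(referer=None):
    # hypothetical helper: default headers plus an optional Referer
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    if referer:
        headers['Referer'] = referer
    return headers
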
Example #19
 def save(self, img_url):
     name = img_url[-9:-4]
     img = request.get(img_url, 3)
     with open(name + '.jpg', 'ab') as f:
         f.write(img.content)
Example #20
headers = {
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'jandan.net',
    'Referer': 'http://jandan.net/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}

url = 'http://jandan.net/ooxx/page-{}#comments'.format(3)
html = request.get(url, 3).text
j = re.search(
    r'.*<script\ssrc=\"\/\/(cdn.jandan.net\/static\/min.*?)\"><\/script>.*',
    html)
jsFileUrl = "http://" + j.group(1)
jsFile = requests.get(jsFileUrl, headers=headers).text

index = 0


class jiandan():
    def get(self, url):
        html = request.get(url, 3)
        return html.text

    def save(self, url):
Example #21
 def img(self, page_url):
     img_html = request.get(page_url, 3)  ## changed here (notice that self is gone?)
     img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
     self.save(img_url)
Example #22
 def html(self, href):
     html = request.get(href, 3)  ## changed here (notice that self is gone?)
     max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
     for page in range(1, int(max_span) + 1):
         page_url = href + '/' + str(page)
         self.img(page_url)
Example #23
 def save(self, img_url):  # this function saves the image
     name = img_url[-9:-4]
     img = request.get(img_url, 3)
     with open(name + '.jpg', 'ab') as f:  # create the file; binary media must be opened with the b flag!
         f.write(img.content)  # binary media must be written from .content!
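
Examples #12, #19 and #23 open the output file in 'ab' (append-binary) mode, so a second run appends another copy of the bytes to the existing .jpg instead of replacing it. If a re-run should overwrite, 'wb' is the usual choice:

with open(name + '.jpg', 'wb') as f:  # 'wb' truncates on re-run instead of appending
    f.write(img.content)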