# Exemplo n.º 1
# 0
def html(title, href):  # Collect and crawl every image page of one gallery
    """Walk all pages of a gallery and crawl each image page not yet stored.

    title -- gallery title; used (stringified) as the directory name
    href  -- gallery index URL; page URLs are built as href + '/' + page_no
    """
    path = str(title)
    mkdir(path)
    html = request.get(href, 3)
    html_Soup = BeautifulSoup(html.text, 'html.parser')
    # The second-to-last <span> of the pagination bar holds the last page number.
    max_span = html_Soup.find('div', {
        'class': 'pagenavi'
    }).find_all('span')[-2].get_text()
    # Open ONE database connection for the whole gallery instead of one per
    # page (the original leaked a fresh, never-closed connection per loop turn).
    connect = pymysql.Connect(host='localhost',
                              port=3306,
                              user='******',
                              passwd='87869973lhy',
                              db='mzitu',
                              charset='utf8')
    try:
        cursor = connect.cursor()
        for page in range(1, int(max_span) + 1):
            page_url = href + '/' + str(page)
            # Parameterized query: the driver escapes the value, preventing
            # SQL injection and quoting bugs from URLs containing quotes.
            cursor.execute("SELECT url FROM mzitu_1 WHERE url = %s",
                           (page_url,))
            if cursor.fetchall():
                print('已爬取过该图片')  # already crawled; skip this page
            else:
                img(page_url, path)
    finally:
        connect.close()
# Exemplo n.º 2
# 0
 def img(self, page_url):
     """Resolve the real image URL from a single-image page and save it."""
     response = request.get(page_url, 3)
     soup = BeautifulSoup(response.text, 'html.parser')
     # The actual image lives in the <img> tag inside div.main-image.
     main_div = soup.find('div', {'class': 'main-image'})
     self.save(main_div.find('img')['src'])
# Exemplo n.º 3
# 0
 def html(self, href):
     """Visit every page of a gallery and hand each page URL to img()."""
     response = request.get(href, 3)
     soup = BeautifulSoup(response.text, 'html.parser')
     # The next-to-last <span> of the pagination bar is the last page number.
     pagenavi = soup.find('div', {'class': 'pagenavi'})
     last_page = int(pagenavi.find_all('span')[-2].get_text())
     for page in range(1, last_page + 1):
         self.img(href + '/' + str(page))
# Exemplo n.º 4
# 0
def save(img_url, path):  # Save one image given its direct URL
    """Download the image at *img_url* into the gallery directory *path*.

    The file name is sliced from the tail of the URL (e.g. '.../12345.jpg'
    -> '12345'). NOTE: this changes the process working directory as a
    side effect, as the original did.
    """
    name = img_url[-9:-4]
    img = request.get(img_url, 3)
    file = '/Users/luhuiyang/PycharmProjects/LearnWebcrawler/meitu/'  # base dir (absolute path)
    os.chdir(os.path.join(file, path))  # switch into the gallery directory
    # 'with' guarantees the handle is closed even if the write raises
    # (the original leaked the handle on error).
    with open(name + '.jpg', 'ab') as f:
        f.write(img.content)
# Exemplo n.º 5
# 0
 def all_url(self, url):
     """Crawl every gallery linked from the index page at *url*."""
     index_page = request.get(url, 3)
     soup = BeautifulSoup(index_page.text, 'html.parser')
     anchors = soup.find('div', {'class': 'postlist'}).find_all('a')
     for anchor in anchors:
         title = anchor.get_text()
         # Anchors with empty text carry no gallery title; skip them.
         if not title:
             continue
         self.mkdir(str(title))
         self.html(anchor['href'])
# Exemplo n.º 6
# 0
def all_url(url):  # Collect the url and title of every gallery on the index page
    """Return a dict mapping gallery title -> gallery URL.

    The last five anchors of the post list are navigation links rather
    than galleries, so they are sliced off before iterating.
    """
    start_html = request.get(url, 3)
    Soup = BeautifulSoup(start_html.text, 'html.parser')
    all_a = Soup.find('div', {'class': 'postlist'}).find_all('a')[:-5]
    img_all_dict = {}
    for a in all_a:
        title = a.get_text()
        print(title)
        # Anchors with empty text carry no gallery title; record only real ones
        # (replaces the original 'if not title: pass / else:' anti-idiom).
        if title:
            img_all_dict[title] = a['href']
    return img_all_dict
# Exemplo n.º 7
# 0
    def img(self, page_url, max_span, page_num):  # Resolve the real image URL from an image page
        """Fetch one image page, save the image, and record it in MySQL.

        page_url -- URL of the single-image page
        max_span, page_num -- pagination bookkeeping from the caller; kept
                              for interface compatibility (not used here)
        """
        img_html = request.get(page_url, 3)
        img_Soup = BeautifulSoup(img_html.text, 'html.parser')
        img_url = img_Soup.find('div', {
            'class': 'main-image'
        }).find('img')['src']
        self.img_urls = img_url
        print(str(self.title), str(self.url), str(self.img_urls))
        self.save(img_url)

        # Parameterized query: the driver escapes the values, preventing
        # SQL injection and breakage on titles/URLs containing quotes
        # (the original interpolated them into the SQL string with %).
        sql = "INSERT INTO mzitu_1 VALUES ( %s, %s, %s )"
        data = (str(self.title), str(self.url), str(self.img_urls))
        self.cursor.execute(sql, data)
        self.connect.commit()
        print('成功插入数据')
# Exemplo n.º 8
# 0
    def html(self, href):  # Collect and crawl every image page of one gallery
        """Walk each page of a gallery, skipping pages already recorded in the DB."""
        html = request.get(href, 3)
        html_Soup = BeautifulSoup(html.text, 'html.parser')
        # The second-to-last <span> of the pagination bar is the last page number.
        max_span = html_Soup.find('div', {
            'class': 'pagenavi'
        }).find_all('span')[-2].get_text()
        page_num = 0
        for page in range(1, int(max_span) + 1):
            page_num += 1
            page_url = href + '/' + str(page)
            self.url = page_url

            # Parameterized query instead of %-interpolating the URL into the
            # SQL string (SQL injection / quoting bugs).
            self.cursor.execute("SELECT url FROM mzitu_1 WHERE url = %s",
                                (page_url,))
            if self.cursor.fetchall():
                print('已爬取过该图片')  # already crawled; skip this page
            else:
                self.img(page_url, max_span, page_num)
# Exemplo n.º 9
# 0
def img(page_url, path):  # Resolve the real image URL from an image page
    """Fetch one image page, save its image under *path*, and record the crawl in MySQL."""
    img_html = request.get(page_url, 3)
    img_Soup = BeautifulSoup(img_html.text, 'html.parser')
    img_url = img_Soup.find('div', {'class': 'main-image'}).find('img')['src']
    save(img_url, path)
    # Connect to the database
    connect = pymysql.Connect(host='localhost',
                              port=3306,
                              user='******',
                              passwd='87869973lhy',
                              db='mzitu',
                              charset='utf8')
    try:
        cursor = connect.cursor()
        # Parameterized query: the driver escapes the values, preventing SQL
        # injection and breakage on values containing quotes (the original
        # interpolated them into the SQL string with %).
        sql = "INSERT INTO mzitu_1 VALUES ( %s, %s, %s )"
        cursor.execute(sql, (str(path), str(page_url), str(img_url)))
        connect.commit()
        print('成功插入数据')
    finally:
        connect.close()  # the original leaked one connection per image
# Exemplo n.º 10
# 0
 def save(self, img_url):  # Save one image given its direct URL
     """Download *img_url* and append its bytes to '<name>.jpg' in the current directory."""
     name = img_url[-9:-4]  # file name sliced from the tail of the URL
     img = request.get(img_url, 3)
     # 'with' guarantees the handle is closed even if the write raises
     # (the original leaked the handle on error).
     with open(name + '.jpg', 'ab') as f:
         f.write(img.content)