import os
import pymysql
from bs4 import BeautifulSoup

def html(title, href):  # Get the address of every image page in one gallery
    path = str(title)
    mkdir(path)
    html = request.get(href, 3)
    html_Soup = BeautifulSoup(html.text, 'html.parser')
    max_span = html_Soup.find('div', {'class': 'pagenavi'}).find_all('span')[-2].get_text()
    # Connect to the database once, outside the per-page loop
    connect = pymysql.Connect(host='localhost', port=3306, user='******',
                              passwd='******', db='mzitu', charset='utf8')
    # Get a cursor
    cursor = connect.cursor()
    page_num = 0
    for page in range(1, int(max_span) + 1):
        page_num += 1
        page_url = href + '/' + str(page)
        # Parameterized query instead of string formatting, to avoid quoting problems
        sql = "SELECT url FROM mzitu_1 WHERE url = %s"
        cursor.execute(sql, (page_url,))
        if cursor.fetchall():
            print('This image has already been crawled')
        else:
            img(page_url, path)
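The SELECT and INSERT statements in this section assume a mzitu_1 table with three text columns (gallery title, page url, image url). The schema is not shown in the original code; a minimal sketch that matches those queries, with column names and types as assumptions, could be created from Python like this:

# Minimal sketch of the assumed mzitu_1 table; column names/types are guesses
import pymysql

connect = pymysql.Connect(host='localhost', port=3306, user='******',
                          passwd='******', db='mzitu', charset='utf8')
cursor = connect.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS mzitu_1 (
        title   VARCHAR(255),
        url     VARCHAR(255),
        img_url VARCHAR(255)
    ) CHARACTER SET utf8
""")
connect.commit()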
def img(self, page_url):  # Get the actual image address from an image page
    img_html = request.get(page_url, 3)
    img_Soup = BeautifulSoup(img_html.text, 'html.parser')
    img_url = img_Soup.find('div', {'class': 'main-image'}).find('img')['src']
    self.save(img_url)
def html(self, href):  # Get the address of every image page in one gallery
    html = request.get(href, 3)
    html_Soup = BeautifulSoup(html.text, 'html.parser')
    max_span = html_Soup.find('div', {'class': 'pagenavi'}).find_all('span')[-2].get_text()
    for page in range(1, int(max_span) + 1):
        page_url = href + '/' + str(page)
        self.img(page_url)
def save(img_url, path):  # Save the image given its direct URL
    name = img_url[-9:-4]
    img = request.get(img_url, 3)
    file = '/Users/luhuiyang/PycharmProjects/LearnWebcrawler/meitu/'  # absolute base path
    os.chdir(os.path.join(file, path))  # switch into this gallery's directory
    with open(name + '.jpg', 'ab') as f:
        f.write(img.content)
    # print('Saved %s' % name)
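mkdir(path) (and self.mkdir(path) in the class-based version) is called above but not defined in this section. A minimal sketch that matches how save() later chdirs into the gallery folder, with the base path taken from save() and everything else an assumption, might look like this:

import os

BASE_DIR = '/Users/luhuiyang/PycharmProjects/LearnWebcrawler/meitu/'  # same base path as save()

def mkdir(path):
    # Create a per-gallery folder under the base path; reuse it if it already exists
    path = path.strip()
    full_path = os.path.join(BASE_DIR, path)
    os.makedirs(full_path, exist_ok=True)
    return full_path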
def all_url(self, url):  # Get the url and title of every gallery
    start_html = request.get(url, 3)  # call the downloader
    Soup = BeautifulSoup(start_html.text, 'html.parser')
    all_a = Soup.find('div', {'class': 'postlist'}).find_all('a')
    for a in all_a:
        title = a.get_text()
        if not title:
            continue
        self.title = title  # remember the gallery title so img() can write it to the database
        path = str(title)
        self.mkdir(path)
        href = a['href']
        self.html(href)
def all_url(url):  # Get the url and title of every gallery
    start_html = request.get(url, 3)  # call the downloader
    Soup = BeautifulSoup(start_html.text, 'html.parser')
    all_a = Soup.find('div', {'class': 'postlist'}).find_all('a')[:-5]
    img_all_dict = {}
    for a in all_a:
        title = a.get_text()
        print(title)
        if not title:
            continue
        img_all_dict[title] = a['href']
    return img_all_dict
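To tie the function-based version together, a driver can take the {title: href} dictionary returned by all_url and feed each pair to html(), which in turn crawls every page and records it in MySQL. The start URL and the exact call order below are assumptions, not shown in the original:

if __name__ == '__main__':
    # Hypothetical entry point for the function-based version
    img_all_dict = all_url('http://www.mzitu.com/all')  # start page is an assumption
    for title, href in img_all_dict.items():
        html(title, href)  # downloads every page of the gallery and records it in the database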
def img(self, page_url, max_span, page_num):  # Get the actual image address from an image page
    img_html = request.get(page_url, 3)
    img_Soup = BeautifulSoup(img_html.text, 'html.parser')
    img_url = img_Soup.find('div', {'class': 'main-image'}).find('img')['src']
    self.img_urls = img_url
    print(str(self.title), str(self.url), str(self.img_urls))
    # if int(max_span) == page_num:
    self.save(img_url)
    # Parameterized insert: (title, page url, image url)
    sql = "INSERT INTO mzitu_1 VALUES (%s, %s, %s)"
    data = (str(self.title), str(self.url), str(self.img_urls))
    self.cursor.execute(sql, data)
    self.connect.commit()
    print('Row inserted successfully')
def html(self, href):  # Get the address of every image page in one gallery
    html = request.get(href, 3)
    html_Soup = BeautifulSoup(html.text, 'html.parser')
    max_span = html_Soup.find('div', {'class': 'pagenavi'}).find_all('span')[-2].get_text()
    page_num = 0
    for page in range(1, int(max_span) + 1):
        page_num += 1
        page_url = href + '/' + str(page)
        self.url = page_url
        # Skip pages whose url is already in the table
        sql = "SELECT url FROM mzitu_1 WHERE url = %s"
        self.cursor.execute(sql, (page_url,))
        if self.cursor.fetchall():
            print('This image has already been crawled')
        else:
            self.img(page_url, max_span, page_num)
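The class-based methods rely on self.connect, self.cursor, self.title, self.url and self.img_urls being set up elsewhere. A possible constructor, with the class name and defaults as assumptions since the original does not show it, is:

class Mzitu:
    def __init__(self):
        # One shared connection and cursor for the whole crawl (assumed; not shown in the original)
        self.connect = pymysql.Connect(host='localhost', port=3306, user='******',
                                       passwd='******', db='mzitu', charset='utf8')
        self.cursor = self.connect.cursor()
        self.title = ''     # gallery title, set in all_url()
        self.url = ''       # current page url, set in html()
        self.img_urls = ''  # direct image url, set in img()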
def img(page_url, path):  # Get the actual image address from an image page
    img_html = request.get(page_url, 3)
    img_Soup = BeautifulSoup(img_html.text, 'html.parser')
    img_url = img_Soup.find('div', {'class': 'main-image'}).find('img')['src']
    save(img_url, path)
    # Connect to the database
    connect = pymysql.Connect(host='localhost', port=3306, user='******',
                              passwd='******', db='mzitu', charset='utf8')
    # Get a cursor
    cursor = connect.cursor()
    # Parameterized insert: (title/path, page url, image url)
    sql = "INSERT INTO mzitu_1 VALUES (%s, %s, %s)"
    data = (str(path), str(page_url), str(img_url))
    cursor.execute(sql, data)
    connect.commit()
    print('Row inserted successfully')
def save(self, img_url):  # Save the image given its direct URL
    name = img_url[-9:-4]
    img = request.get(img_url, 3)
    with open(name + '.jpg', 'ab') as f:
        f.write(img.content)
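Every method above calls request.get(url, 3), which here is a download helper rather than the requests library itself (requests.get would not accept a retry count as its second argument). A minimal sketch of such a wrapper, where the class name, headers and retry behaviour are assumptions, could be:

import requests

class Download:
    # Hypothetical wrapper: retries a GET up to `retries` times with a browser User-Agent
    headers = {'User-Agent': 'Mozilla/5.0'}

    def get(self, url, retries):
        for attempt in range(retries):
            try:
                return requests.get(url, headers=self.headers, timeout=10)
            except requests.RequestException:
                if attempt == retries - 1:
                    raise
        return None

request = Download()  # the module-level name used throughout this section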