import queue

import bs4

# _Settings, Utils and PATH_DIR are assumed to be defined elsewhere in the
# project (the settings parser, the HTTP/file-system helpers and the output
# directory, respectively).


class GitSpider:

    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    # Fetch the page and return the text of its <article> element.
    def _get_words(self, url):
        text = self.util.req(url)
        if not text:
            return
        soup = bs4.BeautifulSoup(text, 'lxml')
        soup_article = soup.find('article')
        return soup_article.get_text(' ') if soup_article else None

    # Write the extracted text to PATH_DIR, named after the URL's last segment.
    def _save(self, url, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w', encoding='utf-8') as f:
            f.write(words)

    # Crawl every configured link and save its article text.
    def start(self):
        if not self.links:
            return
        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully got {0}'.format(url))
class Stspider:

    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    # Get all text content of the page's <body>.
    def _get_words(self, url):
        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        body = soup.find('body')
        if not body:
            return
        return body.get_text(' ')

    # Save the text content to PATH_DIR, named after the URL's last segment.
    def _save(self, url, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w', encoding='utf-8') as f:
            f.write(words)

    # Start the crawl.
    def start(self):
        if not self.links:
            return
        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully got {0}'.format(url))
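# Usage sketch for the two spiders above (illustrative only, assuming
# _Settings().parse() returns the configured URL list and Utils supplies
# the req()/checkpath() helpers used throughout this module):
#
#     GitSpider().start()   # saves each page's <article> text under PATH_DIR
#     Stspider().start()    # saves each page's <body> text under PATH_DIR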
class _Down:

    def __init__(self):
        self.util = Utils()

    # Append the extracted text to a single file named by title.
    def _save(self, title, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        with open(PATH_DIR + title, 'a', encoding='utf-8') as f:
            f.write(words)

    # Crawl all links of a document: the first attempted page is scanned for
    # links once, then every queued page is downloaded and saved.
    def _download(self, qu, domain, title):
        switch = True
        while not qu.empty():
            url = qu.get()
            text = self.util.req(url)
            if text:
                if switch:
                    for link in self._download_links(domain, text):
                        qu.put(link)
                words = self._download_docs(text)
                self._save(title, words)
            # Links are only harvested from the first attempted page.
            switch = False

    # Extract all text content from the page's <body>.
    def _download_docs(self, page):
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_body = soup.find('body')
        words = ''
        if soup_body:
            words += soup_body.get_text(' ')
        return words

    # Collect every href on the page, prefixed with the site's domain.
    def _download_links(self, domain, page):
        lst = []
        soup = bs4.BeautifulSoup(page, 'lxml')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                lst.append(domain + href)
        return lst

    # e.g. title = 'Problem Solving with Algorithms and Data Structures using Python.pdf'
    def download(self, url, domain, title):
        qu = queue.Queue()
        qu.put(url)
        return self._download(qu, domain, title)
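# Hypothetical invocation of _Down (the seed URL, domain prefix and output
# title below are placeholders for illustration, not values from the project):
if __name__ == '__main__':
    _Down().download(
        'https://example.com/docs/index.html',  # seed page whose links are harvested
        'https://example.com',                  # prefix prepended to each href found there
        'docs.txt',                             # all page text is appended to PATH_DIR + 'docs.txt'
    )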