import bs4  # Utils, _Settings and PATH_DIR come from the surrounding project

class GitSpider:
    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    # Fetch a page and return the text of its <article> element, if any
    def _get_words(self, url):
        text = self.util.req(url)
        if not text:
            return

        soup = bs4.BeautifulSoup(text, 'lxml')
        soup_article = soup.find('article')

        return soup_article.get_text(' ') if soup_article else None

    # Save the extracted text to PATH_DIR/<last URL segment>.txt
    def _save(self, url, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w', encoding='utf-8') as f:  # utf-8 avoids encode errors on non-ASCII page text
            f.write(words)

    # Crawl every configured link and save its text
    def start(self):
        if not self.links:
            return

        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully fetched {0}'.format(url))
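Utils, _Settings, and PATH_DIR are project-level helpers that these examples do not show. A minimal sketch of what the calls above imply (req fetches a page, checkpath ensures the output directory exists, parse yields the link list); the bodies below are assumptions for illustration, not the project's actual implementation:

import os
import requests

PATH_DIR = './texts/'  # assumed output directory; the real value is not shown

class Utils:
    def req(self, url):
        # Return the page body as text, or None if the request fails.
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            return None

    def checkpath(self, path):
        # Create the output directory if it does not already exist.
        os.makedirs(path, exist_ok=True)

class _Settings:
    def parse(self):
        # Return the URLs to crawl; the real source is not shown,
        # so a hard-coded placeholder list stands in here.
        return ['https://example.com/article/demo']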
Example #2
import bs4

class Stspider:
    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    # Fetch all of the page's text content
    def _get_words(self, url):
        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        body = soup.find('body')
        if not body:
            return

        return body.get_text(' ')

    # Save the text content
    def _save(self, url, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w', encoding='utf-8') as f:  # utf-8 avoids encode errors on non-ASCII page text
            f.write(words)

    # Start the crawl
    def start(self):
        if not self.links:
            return

        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully fetched {0}'.format(url))
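Stspider differs from GitSpider only in scraping the whole <body> instead of the <article> element. Assuming the helper sketch above, a run is simply:

spider = Stspider()
spider.start()  # fetches each configured link and writes its text under PATH_DIR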
Example #3
import bs4
import queue

class _Down:
    def __init__(self):
        self.util = Utils()

    # Append the extracted text to PATH_DIR/<title>
    def _save(self, title, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        with open(PATH_DIR + title, 'a+', encoding='utf-8') as f:  # utf-8 avoids encode errors on non-ASCII page text
            f.write(words)

    # Recursively crawl all links of a document
    # (switch=True harvests links from the first fetched page only)
    def _download(self, qu, domain, title, switch=True):
        # print(title)
        if qu.empty():
            return

        url = qu.get()
        text = self.util.req(url)

        if not text:
            # qu.put(url)
            return self._download(qu, domain, title, False)

        if switch:
            res = self._download_links(domain, text)
            for i in res:
                qu.put(i)

        words = self._download_docs(text)
        self._save(title, words)

        return self._download(qu, domain, title, switch=False)

    # Extract all text from a page's <body>
    def _download_docs(self, page):
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_body = soup.find('body')
        words = ''
        if soup_body:
            words += soup_body.get_text(' ')

        return words

    # Collect every <a> href on the page, prefixed with the domain
    def _download_links(self, domain, page):
        lst = []
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_link = soup.find_all('a')
        for link in soup_link:
            href = link.get('href')  # <a> tags without an href would raise KeyError
            if href:
                lst.append(domain + href)

        return lst

    # Public entry point: seed the queue with the starting URL
    def download(self, url, domain, title):
        # title = 'Problem Solving with Algorithms and Data Structures using Python.pdf'
        qu = queue.Queue()
        qu.put(url)

        return self._download(qu, domain, title)
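_download recurses once per queued URL, so a first page with more than roughly a thousand links can exceed Python's default recursion limit (sys.getrecursionlimit() is 1000 by default). An iterative loop is a safer shape for long queues; this sketch (the name _download_iter is mine) is a drop-in alternative method for _Down that preserves the original behavior, including harvesting links only on the first fetch:

    def _download_iter(self, qu, domain, title):
        # Same logic as _download, but a loop instead of recursion.
        switch = True  # harvest links only from the first fetch attempt
        while not qu.empty():
            url = qu.get()
            text = self.util.req(url)
            if not text:
                switch = False
                continue
            if switch:
                for link in self._download_links(domain, text):
                    qu.put(link)
                switch = False
            self._save(title, self._download_docs(text))

A run would look like _Down().download('https://example.com/index.html', 'https://example.com', 'book.txt'), where the URL, domain, and title are placeholders.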