Пример #1
0
    def recursive_listing(self, cwd):
        """Walk a web directory listing recursively, optionally mirroring it.

        Fetches ``self.sitebase + cwd`` through ``self.get_content``, parses
        the page with ``htmllistparse.parse`` and prints every entry.  Entries
        whose name ends with ``/`` are descended into; other entries are
        handed to ``self.download_file`` when ``self.savebase`` is set.
        ``self.delay`` seconds are slept after each processed entry.

        Args:
            cwd: Directory path appended to ``self.sitebase``.  A trailing
                slash is added when missing; an empty string is treated as
                the root (``/``).

        Returns:
            None.  Returns early (after printing a message) when the page
            cannot be fetched or is not recognised as a directory listing.

        Note:
            If the local target directory cannot be created the process is
            terminated via ``sys.exit(0)``.
        """
        savepath = None
        # endswith() also copes with an empty path, where cwd[-1] would raise.
        if not cwd.endswith('/'):
            cwd += '/'
        req = self.get_content(self.sitebase + cwd, use_bs=True)
        if not req:
            print("crawler error")
            return

        # From here on use the directory name reported by the page itself.
        cwd, listing = htmllistparse.parse(req)
        if cwd is None:
            print("It does not seem to be 'Web Directory'.")
            return
        if not cwd.endswith('/'):
            cwd += '/'

        if self.savebase:
            savepath = self.savebase + cwd
            try:
                print("Create directory: " + savepath)
                if not os.path.exists(savepath):
                    os.makedirs(savepath)
            except Exception as err:
                print("Cannot create directory..: " + str(err))
                # NOTE(review): exits with status 0 although this is a
                # failure path -- kept as-is for callers that rely on it.
                sys.exit(0)

        for f in listing:
            if not f.name:
                # Nothing to print, download or descend into.
                continue
            print(cwd + f.name)
            if f.name.endswith("/"):
                self.recursive_listing(cwd + f.name)
            elif self.savebase:
                self.download_file(cwd, savepath, f.name)
            time.sleep(self.delay)
Пример #2
0
def get_book_url_from_id(book_id):
    """Return the URL of the plain-text file of a book on a Gutenberg mirror.

    The mirrors are tried in order until one answers.  The directory listing
    of the book is then parsed and searched for ``<book_id>.txt``; if that
    exact file is missing, the first ``.txt`` entry whose name starts with
    the book ID is used instead.

    Args:
        book_id: Book number (int or str).

    Returns:
        The file URL as a string, or ``None`` when no mirror could be
        reached, the mirror answered with an HTTP error status, or the
        listing contains no matching ``.txt`` file.
    """
    mirrors = [
        "https://gutenberg.pglaf.org",
        "https://mirrors.xmission.com/gutenberg",
        "https://mirror.csclub.uwaterloo.ca/gutenberg",
        "https://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg",
        "http://gutenberg.readingroo.ms",
    ]

    # Mirror layout: one directory per digit except the last, then the full
    # ID (12345 -> /1/2/3/4/12345).  Single-digit IDs are filed under "0"
    # (5 -> /0/5), hence the fallback.
    book_id_str = str(book_id)
    leading_digits = book_id_str[:-1] or "0"
    url_append = "/{}/{}".format("/".join(leading_digits), book_id_str)

    book_directory = None
    book_url_content = None
    last_index = len(mirrors) - 1
    for index, mirror in enumerate(mirrors):
        book_directory = mirror + url_append
        logger.info("Trying book URL: {}".format(book_directory))
        try:
            book_url_content = requests.get(
                book_directory,
                proxies=TOR_PROXIES,
                headers={'User-Agent': generate_user_agent()},
                allow_redirects=True,
                timeout=10)
        except (ConnectTimeout, ConnectionError,
                requests.RequestException) as err:
            # RequestException also covers read timeouts and similar
            # transport failures, so they fall through to the next mirror
            # instead of aborting the lookup.
            logger.error("Mirror {} error: {}".format(book_directory, err))
            if index < last_index:
                logger.info("Trying next mirror")
        else:
            break

    if book_url_content is None:
        logger.error("Could not connect to any mirrors")
        return None
    if not book_url_content.ok:
        # A Response is falsy for 4xx/5xx; report that distinctly from a
        # connection failure.
        logger.error("Mirror {} returned HTTP status {}".format(
            book_directory, book_url_content.status_code))
        return None

    soup = BeautifulSoup(book_url_content.text, "html.parser")
    _, listing = htmllistparse.parse(soup)
    names = [entry.name for entry in listing]

    # Prefer the canonical "<id>.txt" over variants that merely share the
    # ID as a prefix.
    exact_name = "{}.txt".format(book_id)
    if exact_name in names:
        return "{}/{}".format(book_directory, exact_name)

    for name in names:
        if name.startswith(book_id_str) and name.endswith(".txt"):
            return "{}/{}".format(book_directory, name)

    return None
Пример #3
0
def parse_dir(html):
    """Parse the HTML of a directory-listing page.

    Builds a BeautifulSoup tree from *html* with the ``html5lib`` parser and
    returns the result of ``htmllistparse.parse`` for that tree.
    """
    soup = bs4.BeautifulSoup(html, 'html5lib')
    return htmllistparse.parse(soup)
#!/usr/bin/python3

import bs4
import htmllistparse
import re
import requests

# Directory listing that is scanned for FreedomBox image files.
url = 'https://ftp.freedombox.org/pub/freedombox/latest/'

# Fetch the listing page (30 s timeout); raise_for_status() raises on an
# HTTP error status instead of letting an error page be parsed.
req = requests.get(url, timeout=30)
req.raise_for_status()

# Parse the page; the result is unpacked into the current directory and
# the list of entries.
soup = bs4.BeautifulSoup(req.content, 'html5lib')
cwd, listing = htmllistparse.parse(soup)

# Print the header fields, separated by '---' lines.  release_version and
# release_date are emitted as <TODO> placeholders.
print('title: FreedomBox Download')
print('---')
print('release_version: <TODO>')
print('---')
print('release_date: <TODO>')
print('---')
print('images:')

for f in listing:
    if f.name.endswith('.xz'):
        if 'a20-olinuxino-lime-' in f.name:
            target = 'A20 OLinuXino LIME'
        elif 'a20-olinuxino-lime2' in f.name:
            target = 'A20 OLinuXino LIME2'
        elif 'a20-olinuxino-micro' in f.name:
            target = 'A20 OLinuXino MICRO'