import os
import sys
import time

import htmllistparse


# Method of a crawler class: self provides sitebase, savebase, delay,
# get_content() and download_file().
def recursive_listing(self, cwd):
    """Recursively walk an autoindex-style directory listing starting at cwd."""
    savepath = None
    if cwd != '/' and cwd[-1] != '/':
        cwd += '/'
    url = self.sitebase + cwd
    req = self.get_content(url, use_bs=True)
    if not req:
        print("crawler error")
        return
    # parse() returns the directory path as advertised by the page itself,
    # plus the list of entries; cwd is None for non-listing pages.
    cwd, listing = htmllistparse.parse(req)
    if cwd is None:
        print("It does not seem to be a 'Web Directory'.")
        return
    if cwd != '/' and cwd[-1] != '/':
        cwd += '/'
    if self.savebase:
        savepath = self.savebase + cwd
        try:
            print("Create directory: " + savepath)
            if not os.path.exists(savepath):
                os.makedirs(savepath)
        except Exception as err:
            print("Cannot create directory: " + str(err))
            sys.exit(1)
    for f in listing:
        if f.name[-1] != '/':
            # Regular file: print it, and download it when a save location is set.
            print(cwd + f.name)
            if self.savebase:
                self.download_file(cwd, savepath, f.name)
        else:
            # Directory entries carry a trailing slash; recurse into them.
            print(cwd + f.name)
            self.recursive_listing(cwd + f.name)
        time.sleep(self.delay)
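# The method above assumes its surrounding crawler class. As a minimal
# standalone sketch of the same walk pattern, htmllistparse's fetch_listing
# helper fetches and parses a listing in one call; the URL is a placeholder,
# and real code should also handle a None cwd for non-listing pages.
import time

import htmllistparse


def walk(base, path='/', delay=1.0):
    cwd, listing = htmllistparse.fetch_listing(base + path, timeout=30)
    for f in listing:
        print(path + f.name)
        if f.name.endswith('/'):
            # A trailing slash marks a subdirectory; descend into it.
            walk(base, path + f.name, delay)
            time.sleep(delay)


walk('https://example.com/pub')  # placeholder URL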
import logging

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ConnectTimeout
from user_agent import generate_user_agent

import htmllistparse

logger = logging.getLogger(__name__)

# TOR_PROXIES is defined elsewhere in the original project
# (a requests proxies dict routing traffic through Tor).


def get_book_url_from_id(book_id):
    """Find the plain-text URL for a Project Gutenberg book id, trying mirrors in turn."""
    book_directory = None
    book_url_content = None
    mirror_index = 0
    mirrors = [
        "https://gutenberg.pglaf.org",
        "https://mirrors.xmission.com/gutenberg",
        "https://mirror.csclub.uwaterloo.ca/gutenberg",
        "https://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg",
        "http://gutenberg.readingroo.ms",
    ]

    # Gutenberg mirrors shard books by the digits of the id:
    # every digit except the last becomes one directory level.
    url_append = ""
    for i, each_char in enumerate(str(book_id)):
        if i + 1 == len(str(book_id)):
            break
        url_append += "/{}".format(each_char)
    url_append += "/{}".format(book_id)

    for _ in range(len(mirrors)):
        try:
            book_directory = mirrors[mirror_index] + url_append
            logger.info("Trying book URL: {}".format(book_directory))
            book_url_content = requests.get(
                book_directory,
                proxies=TOR_PROXIES,
                headers={'User-Agent': generate_user_agent()},
                allow_redirects=True,
                timeout=10)
            break
        except (ConnectTimeout, ConnectionError) as err:
            logger.error("Mirror {} error: {}".format(book_directory, err))
            if mirror_index + 1 < len(mirrors):
                mirror_index += 1
                logger.info("Trying next mirror")
            else:
                logger.info("Returning to first mirror")
                mirror_index = 0

    if not book_url_content:
        logger.error("Could not connect to any mirrors")
        return

    soup = BeautifulSoup(book_url_content.text, "html.parser")
    wd, listing = htmllistparse.parse(soup)

    # Prefer an exact "<id>.txt" match, then fall back to any
    # "<id>*.txt" variant.
    for each_list in listing:
        if each_list.name == "{}.txt".format(book_id):
            return "{}/{}".format(book_directory, each_list.name)
    for each_list in listing:
        if each_list.name.startswith(
                str(book_id)) and each_list.name.endswith(".txt"):
            return "{}/{}".format(book_directory, each_list.name)
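# Worked example of the mirror path scheme built above: all digits of the
# book id except the last form the directory chain, then the full id is
# appended. _mirror_path is a hypothetical helper added for illustration;
# it reproduces the url_append loop in get_book_url_from_id.
def _mirror_path(book_id):
    digits = str(book_id)
    return ''.join('/' + d for d in digits[:-1]) + '/' + digits


assert _mirror_path(2701) == '/2/7/0/2701'
assert _mirror_path(5) == '/5'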
import bs4

import htmllistparse


def parse_dir(html):
    """Parse an HTML directory-listing string into (cwd, entries)."""
    return htmllistparse.parse(bs4.BeautifulSoup(html, 'html5lib'))
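# Possible usage of parse_dir (the URL is a placeholder): the listing
# entries are FileEntry namedtuples with name, modified, size and
# description fields.
import requests

html = requests.get('https://example.com/pub/', timeout=30).text
cwd, listing = parse_dir(html)
for entry in listing:
    print(entry.name, entry.size, entry.description)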
#!/usr/bin/python3
import bs4
import htmllistparse
import re
import requests

url = 'https://ftp.freedombox.org/pub/freedombox/latest/'
req = requests.get(url, timeout=30)
req.raise_for_status()
soup = bs4.BeautifulSoup(req.content, 'html5lib')
cwd, listing = htmllistparse.parse(soup)

print('title: FreedomBox Download')
print('---')
print('release_version: <TODO>')
print('---')
print('release_date: <TODO>')
print('---')
print('images:')

for f in listing:
    if f.name.endswith('.xz'):
        if 'a20-olinuxino-lime-' in f.name:
            target = 'A20 OLinuXino LIME'
        elif 'a20-olinuxino-lime2' in f.name:
            target = 'A20 OLinuXino LIME2'
        elif 'a20-olinuxino-micro' in f.name:
            target = 'A20 OLinuXino MICRO'