def add_file(self, path, title, download_url, write_data=True, license=None, **node_data):
    """ add_file: Creates file in csv and writes file to zip
        Args:
            path: (str) where in zip to write file
            title: (str) content's title
            download_url: (str) url or local path to download from
            write_data: (boolean) indicates whether to add as a csv entry (optional)
            license (str): content's license
            source_id: (str) content's original id (optional)
            description: (str) description of content (optional)
            author (str): who created the content (optional)
            language (str): language of content (optional)
            license_description (str): description of content's license (optional)
            copyright_holder (str): holder of content's license (optional)
            thumbnail (str): path to thumbnail in zip (optional)
        Returns: path to file in zip
    """
    assert license, "Files must have a license"
    self._parse_path(path)
    _name, ext = os.path.splitext(download_url or "")
    filepath = "{}/{}{}".format(path, title, ext)
    if download_url and filepath:
        self._write_to_zip(filepath, read(download_url))
        if write_data:
            self._commit(filepath, title, license=license, **node_data)
    return filepath

def write_url(self, url, filename, directory="."):
    """ write_url: Write contents from url to filename in zip
        Args:
            url: (str) url to file to download
            filename: (str) name of file in zip
            directory: (str) directory in zipfile to write file to (optional)
        Returns: path to file in zip
    """
    return self.write_contents(filename, read(url), directory=directory)

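# Usage sketch (not from the original source): assumes `writer` is an instance of the
# zip-writer class that defines write_url above, and that the URL and file names shown
# are illustrative placeholders.
zip_path = writer.write_url("https://example.com/images/logo.png", "logo.png", directory="img")
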
def read_source(base, endpoint=None, loadjs=False):
    """ Read page source from a base url, optionally resolving an endpoint against it """
    if base.count('http://') > 1:
        # Special case: http://web.archive.org/web/.../http://2012books.lardbucket.org/books/...
        return downloader.read("http://{}".format(base.split('http://')[-1]), loadjs=loadjs)
    elif not endpoint:
        return downloader.read(base, loadjs=loadjs)
    elif endpoint.startswith('http'):
        return downloader.read(endpoint, loadjs=loadjs)
    elif endpoint.startswith('/'):
        # Keep the leading slash so there is a separator between the base's directory and the endpoint
        return downloader.read(os.path.dirname(base) + endpoint, loadjs=loadjs)
    else:
        return downloader.read(os.path.dirname(base).rstrip("/") + "/" + endpoint, loadjs=loadjs)

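# Illustrative calls (assumed URLs): an absolute endpoint is fetched as-is, while a
# relative endpoint is joined to the directory of `base`.
page = read_source("http://2012books.lardbucket.org/books/index.html")
css = read_source("http://2012books.lardbucket.org/books/index.html", endpoint="shared/style.css")
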
def add_file(self, path, title, download_url, write_data=True, ext=None, license=None, copyright_holder=None, **node_data):
    """ add_file: Creates file in csv and writes file to zip
        Args:
            path: (str) where in zip to write file
            title: (str) content's title
            download_url: (str) url or local path to download from
            write_data: (boolean) indicates whether to add as a csv entry (optional)
            ext: (str) extension to use for file
            license (str): content's license
            copyright_holder (str): holder of content's license (required except for PUBLIC_DOMAIN)
            license_description (str): description of content's license (optional)
            source_id: (str) content's original id (optional)
            description: (str) description of content (optional)
            author (str): who created the content (optional)
            language (str): language of content (optional)
            thumbnail (str): path to thumbnail in zip (optional)
        Returns: path to file in zip
    """
    if write_data:
        assert license, "Files must have a license"
        # Treat a missing or whitespace-only copyright holder as None
        copyright_holder = None if not copyright_holder or copyright_holder.strip() == '' else copyright_holder
        assert license in NO_COPYRIGHT_HOLDER_REQUIRED or copyright_holder, "Licenses must have a copyright holder if they are not public domain"
    self._parse_path(path)
    if not ext:
        _name, ext = os.path.splitext(download_url or "")
    filepath = "{}/{}{}".format(path, title, ext)
    if download_url and filepath:
        self._write_to_zip(filepath, read(download_url))
        if write_data:
            self._commit(filepath, title, license=license, copyright_holder=copyright_holder, **node_data)
    return filepath

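# Usage sketch (not from the original source): assumes `writer` is an instance of the
# class providing add_file above; the path, title, URL, license string, and copyright
# holder are illustrative placeholders.
file_in_zip = writer.add_file(
    "Channel/Topic",                       # folder inside the zip
    "Sample Document",                     # content title
    "https://example.com/sample.pdf",      # url to download from
    license="CC BY",                       # placeholder license name
    copyright_holder="Example Publisher",  # required unless the license needs no holder
)
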
def open(self):
    """ open: Opens pdf file to read from
        Args: None
        Returns: None
    """
    filename = os.path.basename(self.download_url)
    folder, _ext = os.path.splitext(filename)
    self.path = os.path.sep.join([self.directory, folder, filename])
    if not os.path.exists(os.path.dirname(self.path)):
        os.makedirs(os.path.dirname(self.path))

    # Download full pdf if it hasn't already been downloaded
    if not os.path.isfile(self.path):
        with open(self.path, "wb") as fobj:
            fobj.write(read(self.download_url))

    self.file = open(self.path, 'rb')
    self.pdf = CustomPDFReader(self.file)

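# Hypothetical usage sketch: assumes the open() method above belongs to a pdf-parsing
# class, here called PDFParser, constructed from a download_url and a target directory;
# the class name, constructor signature, and URL are assumptions for illustration.
parser = PDFParser("https://example.com/textbook.pdf", directory="downloads")
parser.open()  # downloads the pdf if needed and wraps it in CustomPDFReader as parser.pdf
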
def read_source(url):
    """ Read page source as beautiful soup """
    html = downloader.read(url)
    return BeautifulSoup(html, 'html.parser')

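# Example call (assumed URL): fetch a page with the helper above and pull its title.
soup = read_source("https://example.com/")
print(soup.title.get_text() if soup.title else "no title found")
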
def read(self, path, loadjs=False):
    # Delegate to the module-level read(), reusing this instance's session and driver
    return read(path, loadjs=loadjs, session=self.session, driver=self.driver)

def read_source(endpoint="books"):
    """ Reads page source using the downloader module and parses it as json data """
    page_contents = downloader.read("{baseurl}/{endpoint}".format(baseurl=BASE_URL, endpoint=endpoint))
    return json.loads(page_contents)  # OpenStax url returns a json object

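# Example calls: with the default endpoint this fetches "{BASE_URL}/books" and returns
# the decoded json; the second endpoint name is an assumption shown only for illustration.
books = read_source()
resources = read_source("resources")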