def download(self):
    """Fetch the source PDF and convert it to XML with ``pdftohtml``.

    The PDF is fetched through ``ambry_sources.fetch.download`` into the
    library's download cache. The conversion is skipped when the expected
    output file already exists in the bundle's build filesystem.

    Returns:
        The system path of the XML file, i.e. the build-filesystem path of
        the PDF's base name with a trailing ``.pdf`` stripped and ``.xml``
        appended.

    NOTE(review): a non-zero exit status from ``pdftohtml`` is only
    reported on stdout; the path is still returned, so callers may receive
    a path to a missing or incomplete file -- confirm this best-effort
    behaviour is intended.
    """
    import os.path
    import subprocess

    from ambry_sources.fetch import download

    if not self.cmd_exists('pdftohtml'):
        # Presumably self.fatal() aborts; verify against its definition.
        self.fatal("pdftohtml program does not exist")

    # Only the first returned value (the cached file name) is needed here.
    pdf_file_name, _ = download(self._url, self._library.download_cache)

    # Strip only a trailing '.pdf'. A blanket str.replace() would also
    # remove '.pdf' occurring in the middle of the name.
    xml_file_name = os.path.basename(pdf_file_name)
    if xml_file_name.endswith('.pdf'):
        xml_file_name = xml_file_name[:-len('.pdf')]

    pdf_path = self._library.download_cache.getsyspath(pdf_file_name)
    xml_path = self._bundle.build_fs.getsyspath(xml_file_name)

    # pdftohtml is given the output name without extension; the result is
    # expected at <xml_path>.xml.
    xml_output_path = xml_path + '.xml'

    if not os.path.exists(xml_output_path):
        exit_code = subprocess.call(['pdftohtml', '-xml', pdf_path, xml_path])
        if exit_code != 0:
            print('Errors while converting pdf to xml.')

    return xml_output_path
def _download(self):
    """Fetch ``self._url`` and remember the resulting path in ``self._path``.

    Delegates to ``ambry_sources.fetch.download``, passing the instance's
    filesystem, account accessor and logger. The helper's second return
    value is ignored.
    """
    from ambry_sources.fetch import download as fetch

    fetched_path, _ignored = fetch(
        self._url,
        self._fs,
        self._account_accessor,
        logger=self._logger,
    )
    self._path = fetched_path