def extract_files_to_process(options, company_file):
    """Unpack a company file archive and index the files to be processed.

    The archive is first copied into the local cache folder when it is not
    already there. It is then extracted into the working folder unless that
    folder already holds files and ``force_download`` is not set, in which
    case the existing listing is reused.

    For ``ITR`` and ``DFP`` documents, every extracted entry is classified:

    * entries matching ``RE_FILE_BY_XML`` are indexed by their base name;
    * entries matching ``RE_FILE_BY_ITR`` / ``RE_FILE_BY_DFP`` are themselves
      extracted into ``itr_content/`` / ``dfp_content/`` under the working
      folder, and their members are indexed as ``itr/<name>`` /
      ``dfp/<name>``.

    :param options: mapping of settings; ``force_download`` is read here and
        the whole mapping is forwarded to the storage helpers.
    :param company_file: record exposing ``file_name``, ``file_url`` and
        ``doc_type``.
    :return: dict mapping the index key to the path reported by the
        extraction/listing helper. Empty for any other ``doc_type``.
    """
    refresh = options.get("force_download", False)
    cache_dir = _doc_local_base_path(options, company_file)

    # Make sure the archive is available in the local cache.
    archive_path = "{0}/{1}".format(cache_dir, company_file.file_name)
    if not exists(options, archive_path):
        copy_file(options, company_file.file_url, archive_path)

    work_dir = _doc_local_working_base_path(options, company_file)

    if not exists(options, work_dir):
        mkdirs(options, work_dir)
        extracted = extract_zip(options, archive_path, work_dir)
    elif refresh:
        # Wipe the working folder of this company file before re-extracting.
        delete_all(options, work_dir)
        extracted = extract_zip(options, archive_path, work_dir)
    else:
        extracted = listdir(options, work_dir)
        if not extracted:
            # The folder exists but is empty: extract into it.
            mkdirs(options, work_dir)
            extracted = extract_zip(options, archive_path, work_dir)

    collected = {}
    if company_file.doc_type not in ["ITR", "DFP"]:
        return collected

    # (pattern of the nested archive, destination folder template, key template)
    nested_archives = (
        (RE_FILE_BY_ITR, "{0}/itr_content/", "itr/{}"),
        (RE_FILE_BY_DFP, "{0}/dfp_content/", "dfp/{}"),
    )

    for entry in extracted:
        if re.match(RE_FILE_BY_XML, entry, re.IGNORECASE):
            collected[ntpath.basename(entry)] = entry
            continue

        for pattern, folder_template, key_template in nested_archives:
            if not re.match(pattern, entry, re.IGNORECASE):
                continue
            destination = folder_template.format(work_dir)
            # NOTE: the nested archive itself is left in place after being
            # unzipped; nothing here deletes it.
            for member in extract_zip(options, entry, destination):
                key = key_template.format(ntpath.basename(member))
                collected[key] = member
            break

    return collected
def download_file(options, ccvm, doc_type, fiscal_date, version):
    """Ensure a company file is stored, then extract the files to process.

    The ``BovespaCompanyFile`` record identified by the four key arguments is
    loaded. Its stored copy is refreshed when ``force_download`` is set in
    ``options``, when the record has no ``file_url``, or when that URL does
    not exist in storage. Refreshing means:

    * if the cache folder lists no entries, the file is fetched from
      ``source_url`` through ``fetch_tenaciously`` (using ``fetch_file``),
      copied into the cache folder, and the record is updated;
    * otherwise the first listed entry is adopted and the record is updated
      to point at it.

    :param options: mapping of settings, forwarded to every helper.
    :param ccvm: value matched against the record's ``ccvm`` field.
    :param doc_type: value matched against the record's ``doc_type`` field.
    :param fiscal_date: value matched against the record's ``fiscal_date``.
    :param version: value matched against the record's ``version`` field.
    :return: whatever ``extract_files_to_process`` returns for the record.
    """
    company_file = BovespaCompanyFile.objects.get(
        ccvm=ccvm,
        doc_type=doc_type,
        fiscal_date=fiscal_date,
        version=version,
    )

    # Build the local (download) and cache (permanent) folder paths.
    local_dir = _doc_local_base_path(options, company_file)
    cache_dir = _doc_base_path(options, company_file)

    needs_refresh = (
        options.get("force_download", False)
        or not company_file.file_url
        or not exists(options, company_file.file_url)
    )
    if not needs_refresh:
        return extract_files_to_process(options, company_file)

    # Check whether the cache folder already holds a file.
    # NOTE(review): with force_download set and a non-empty cache folder,
    # the cached entry is reused and nothing is fetched again - confirm
    # that this is the intended meaning of "force".
    cached_entries = listdir(options, cache_dir)
    if len(cached_entries) != 0:
        cached_url = cached_entries[0]
        cached_name = os.path.split(cached_url)[1]
        company_file.update(
            file_url=cached_url,
            file_name=cached_name,
            file_extension=get_extension(cached_name),
        )
    else:
        # "base_path" tells fetch_file where to place the download; any
        # value already present in options takes precedence.
        fetch_params = {"base_path": local_dir, **options}
        fetched = fetch_tenaciously(
            fetcher=fetch_file,
            url=company_file.source_url,
            n=10,
            s=10,
            data=fetch_params,
        )
        stored_url = "{0}/{1}".format(cache_dir, fetched.filename)

        # Keep a copy of the download in the permanent storage.
        copy_file(options, fetched.file, stored_url)
        company_file.update(
            file_url=stored_url,
            file_name=fetched.filename,
            file_extension=get_extension(fetched.file),
        )

    return extract_files_to_process(options, company_file)
def fetch_file(url, options):
    """Download ``url`` and copy the result under ``options["base_path"]``.

    The body is streamed into a temporary directory, which is removed when
    the function exits, and then copied to ``<base_path>/<filename>`` with
    ``copy_file``. The file name comes from the ``filename`` parameter of the
    ``Content-Disposition`` response header when present, otherwise from the
    last path segment of ``url``; in both cases only the base name is kept.

    :param url: address to download from.
    :param options: mapping of settings; ``base_path`` (default ``"."``) is
        read here and the mapping is forwarded to ``copy_file``.
    :return: ``File(status, dest_file, filename, response)`` where
        ``dest_file`` is the value returned by ``copy_file``.
    :raises DownloadException: on any failure; the original exception is
        chained as the cause.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        logger.info(
            "Download from [%s] and store into [%s]" % (url, scratch_dir))
        try:
            proxies = get_proxy_address()
            # NOTE(review): verify=False turns off TLS certificate checks and
            # the timeout pair allows 1800 s each for connect and read -
            # confirm both are intended.
            response = requests.get(
                url,
                stream=True,
                timeout=(1800, 1800),
                verify=False,
                proxies=proxies,
            )

            disposition = response.headers.get("Content-Disposition", "")
            header_params = cgi.parse_header(disposition)[-1]
            raw_name = (
                header_params["filename"]
                if "filename" in header_params
                else url.rpartition("/")[2]
            )
            # Keep only the final component so the name cannot carry a path.
            filename = os.path.basename(raw_name)
            download_path = os.path.join(scratch_dir, filename)

            status = response.status_code
            if status != HTTP_OK:
                response = __content_error(status)

            with open(download_path, "wb") as sink:
                pieces = response.iter_content(chunk_size=1024 * 1024)
                # Empty chunks are skipped.
                for piece in filter(None, pieces):
                    sink.write(piece)

            target = "{0}/{1}".format(options.get("base_path", "."), filename)
            stored_at = copy_file(options, download_path, target)
            return File(status, stored_at, filename, response)
        except Exception as ex:
            logger.exception("Failed to download %s" % url)
            raise DownloadException(url) from ex