def download(url):
    response = requests.get(url, stream=True)
    filename = parse_requests_response(response).filename_unsafe
    if filename is None:
        raise Exception('No filename could be found for this URL')
    filename = sanitize(filename)
    with open(filename, 'wb') as f:
        total = response.headers.get('content-length')
        if total is None:
            f.write(response.content)
        else:
            downloaded = 0
            total = int(total)
            for data in response.iter_content(
                    chunk_size=max(int(total / 1000), 1024 * 1024)):
                downloaded += len(data)
                f.write(data)
                done = int(50 * downloaded / total)
                sys.stdout.write('\r[{}{}]'.format('█' * done,
                                                   '.' * (50 - done)))
                sys.stdout.flush()
    sys.stdout.write('\n')
    return filename
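# A minimal usage sketch for the download() helper above, assuming the
# names it relies on are importable: requests, sys, parse_requests_response
# from the rfc6266 package, and a sanitize() helper. sanitize is stubbed
# here with werkzeug's secure_filename, which is an assumption about the
# original module, and the URL is a placeholder.
import sys

import requests
from rfc6266 import parse_requests_response
from werkzeug.utils import secure_filename as sanitize

saved = download('https://example.com/archive.zip')
print('Saved as', saved)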
def get_attachment(url):
    url = utilities.fix_google_drive_download_url(url)
    url = utilities.fix_dropbox_download_url(url)
    response = requests.get(url)
    response.raise_for_status()
    mimetype = response.headers.get("content-type").lower().split(";")[0].strip()
    filename = rfc6266.parse_requests_response(response).filename_unsafe
    if "X-Auto-Login" in response.headers:
        raise Exception("Login needed for {0}".format(url))
    if not mimetype.startswith("text/") and not mimetype.startswith("image/"):
        raise Exception("Unhandled file type {0}, {1}, {2}".format(
            url, mimetype, filename))
    if mimetype.startswith("text/"):
        response.encoding = "utf-8"
    attachment = {"name": filename, "mime": mimetype, "url": response.url}
    if mimetype.startswith("text/"):
        attachment["content"] = response.text
    else:
        attachment["content"] = response.content
    return attachment
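# utilities.fix_dropbox_download_url() above is project-specific; a hedged
# sketch of its likely behavior (an assumption, not the project's code):
# Dropbox share links end in ?dl=0, which serves an HTML preview page, so
# flipping the flag to dl=1 makes the link return the file itself. The
# Google Drive helper presumably performs an analogous rewrite.
def fix_dropbox_download_url(url):
    if 'dropbox.com' in url:
        return url.replace('?dl=0', '?dl=1')
    return url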
def download(self):
    if self.source_url:
        if self.is_google_doc():
            get_google_doc(self)
        else:
            r = requests.get(
                self.source_url,
                headers={'User-Agent': 'Cove (cove.opendataservice.coop)'})
            r.raise_for_status()
            content_type = r.headers.get('content-type', '').split(';')[0].lower()
            file_extension = CONTENT_TYPE_MAP.get(content_type)
            if not file_extension:
                possible_extension = rfc6266.parse_requests_response(
                    r).filename_unsafe.split('.')[-1]
                if possible_extension in CONTENT_TYPE_MAP.values():
                    file_extension = possible_extension
            file_name = r.url.split('/')[-1].split('?')[0][:100]
            if file_name == '':
                file_name = 'file'
            if file_extension:
                if not file_name.endswith(file_extension):
                    file_name = file_name + '.' + file_extension
            self.original_file.save(file_name, ContentFile(r.content))
    else:
        raise ValueError('No source_url specified.')
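# CONTENT_TYPE_MAP above maps MIME types to file extensions and lives
# elsewhere in the project; a plausible minimal shape (an assumption, not
# the project's actual table) is:
CONTENT_TYPE_MAP = {
    'application/json': 'json',
    'text/csv': 'csv',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
        'xlsx',
}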
def filename_from_content_disposition(requests_response):
    """
    Parses the RFC 6266 Content-Disposition header to determine the
    server-suggested filename for content.
    """
    components = urlparse(requests_response.url)
    head, tail = posixpath.split(components.path)
    expected_extension = posixpath.splitext(tail)[1]
    cd = rfc6266.parse_requests_response(requests_response)
    return cd.filename_sanitized(expected_extension.lstrip('.') or 'dat')
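# A small illustration of the rfc6266 calls the helper above relies on,
# assuming the rfc6266 package is installed. parse_headers() accepts a raw
# Content-Disposition value; filename_unsafe is the verbatim (untrusted)
# server suggestion, while filename_sanitized() strips path components and
# enforces the given extension. The exact sanitized output is the library's
# choice, so treat the comments as expectations rather than guarantees.
import rfc6266

cd = rfc6266.parse_headers('attachment; filename="../report.pdf"')
print(cd.filename_unsafe)            # '../report.pdf' -- never use directly
print(cd.filename_sanitized('pdf'))  # a safe basename such as 'report.pdf'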
def save_response(response):
    tmp = tempdir()
    filename = secure_filename(
        parse_requests_response(response).filename_unsafe)
    filepath = os.path.join(tmp, filename)
    with open(filepath, 'wb') as out:
        copyfileobj(response.raw, out)
    return filepath
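# save_response() copies from response.raw, which only has usable content
# when the request was made with stream=True; a hedged usage sketch (the
# URL is a placeholder, and tempdir(), secure_filename() and copyfileobj()
# are assumed to be imported by the surrounding module):
import requests

response = requests.get('https://example.com/data.csv', stream=True)
response.raise_for_status()
print('written to', save_response(response))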
def handleMatch(self, match):
    el = super(ImageDownloadPattern, self).handleMatch(match)
    urlparts = urllib.parse.urlparse(el.attrib["src"])
    if urlparts.netloc:
        response = requests.get(urlparts.geturl())
        response.raise_for_status()
        filename = rfc6266.parse_requests_response(response).filename_unsafe
        with open(os.path.join(settings.get("folder"), filename), "wb") as f:
            f.write(response.content)
        el.attrib["src"] = filename
        utilities.fix_image(os.path.join(settings.get("folder"), filename),
                            settings.get("features")["width"])
    return el
def download_originals() -> Generator:
    """
    Download the original images from IMGS_URL.

    Yields the downloaded images as PIL Image objects, each tagged with the
    filename suggested by the server's Content-Disposition header.
    """
    r = requests.get(IMGS_URL)
    j = json.loads(r.text)
    for img in j['images']:
        r = requests.get(img['url'])
        stream = BytesIO(r.content)
        img = Image.open(stream)
        img.filename = rfc6266.parse_requests_response(r).filename_unsafe
        yield img
def download_file(url, dest=None, chunk_size=1024, replace="ask",
                  label="Downloading {dest_basename} ({size:.2f}MB)",
                  expected_extension=None):
    """Download a file from a given URL and display progress.

    :param dest: If the destination exists and is a directory, the filename
        will be guessed from the Content-Disposition header. If the
        destination is an existing file, the user will either be prompted to
        overwrite, or the file will be replaced (depending on the value of
        **replace**). If the destination does not exist, it will be used as
        the filename.
    :param int chunk_size: bytes read in at a time.
    :param replace: If `False`, an existing destination file will not be
        overwritten.
    :param label: a string which is formatted and displayed as the progress
        bar label. Variables provided include *dest_basename*, *dest*, and
        *size*.
    :param expected_extension: if set, the filename will be sanitized to
        ensure it has the given extension. The extension should not start
        with a dot (`.`).
    """
    dest = Path(dest or url.split("/")[-1])
    response = get(url, stream=True)
    if (dest.exists() and dest.is_dir()
            and "Content-Disposition" in response.headers):
        content_disposition = rfc6266.parse_requests_response(response)
        if expected_extension is not None:
            filename = content_disposition.filename_sanitized(
                expected_extension)
        else:
            # fall back to the unsanitized name so filename is always bound
            filename = content_disposition.filename_unsafe
        filename = secure_filename(filename)
        dest = dest / filename
    if dest.exists() and not dest.is_dir():
        if (replace is False or
                replace == "ask" and not click.confirm(
                    "Replace {}?".format(dest))):
            return str(dest)
    size = int(response.headers.get("content-length", 0))
    label = label.format(dest=dest, dest_basename=dest.name,
                         size=size / 1024.0 / 1024)
    with click.open_file(str(dest), "wb") as f:
        content_iter = response.iter_content(chunk_size=chunk_size)
        with click.progressbar(content_iter, length=size / 1024,
                               label=label) as bar:
            for chunk in bar:
                if chunk:
                    f.write(chunk)
                    f.flush()
    return str(dest)
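# Hedged example of calling download_file() above. It assumes the module
# imports get from requests, Path from pathlib, click, rfc6266 and
# werkzeug's secure_filename; the URL and directory are placeholders.
# Passing an existing directory as dest lets the filename come from the
# Content-Disposition header.
path = download_file('https://example.com/dataset.csv',
                     dest='downloads/',
                     expected_extension='csv',
                     replace=True)
print(path)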
def get_filename_from_request(r):
    filename = rfc6266.parse_requests_response(r).filename_unsafe
    extension = re.search(r'\.\w\w\w$', filename)
    if extension:
        return filename
    else:
        try:
            value, params = cgi.parse_header(r.headers['Content-Disposition'])
            return params.get('filename') or params.get('filename*').replace(
                "UTF-8''", '')
        except KeyError:
            assert get_url_queries(r.url), 'No filename could be extracted'
            url = strip_url_queries(r.url)
            r = requests.get(url, stream=True)
            return get_filename_from_request(r)
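# get_url_queries() and strip_url_queries() above are project helpers; a
# plausible sketch of the behavior get_filename_from_request() needs from
# them (assumptions, not the originals):
from urllib.parse import urlsplit, urlunsplit

def get_url_queries(url):
    # Return the query string; truthy only when the URL actually has one.
    return urlsplit(url).query

def strip_url_queries(url):
    # Drop the query string so the bare resource URL can be re-fetched.
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))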
def fetch(url):
    response = requests.get(url)
    url = urlparse(response.url)  # Follow redirects

    # range(200, 300) covers all 2xx success codes
    if response.status_code not in range(200, 300):
        sys.exit("Bad response {}".format(response.status_code))

    # If the Content-Type header is not found, assume MP3
    content_type = response.headers.get('content-type', 'audio/mpeg')
    if not content_type.startswith("audio/"):
        puts_err("Bad content-type")

    extension = mimetypes.guess_extension(content_type)
    content_disposition = parse_requests_response(response)
    filename = content_disposition.filename_sanitized(extension.lstrip('.'))

    expected_ittimes = None
    content_length = response.headers.get('content-length')
    iter_content = response.iter_content(CHUNK_SIZE)

    puts(colored.blue(u"Downloading {}\n".format(filename)))

    # Display a progress bar if Content-Length is set
    if content_length is not None:
        expected_ittimes = int(content_length) / CHUNK_SIZE
        iter_content = progress.bar(iter_content,
                                    expected_size=expected_ittimes)

    with tempfile.NamedTemporaryFile(prefix='song-', suffix=extension,
                                     delete=False) as fp:
        for chunk in iter_content:
            fp.write(chunk)

    proc = subprocess.Popen(['osascript', '-'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            stdin=subprocess.PIPE)
    out, err = proc.communicate(add_to_itunes.format(path=fp.name))
    if proc.returncode:
        puts_err("Couldn't add to iTunes ({})".format(err))
        sys.exit(proc.returncode)

    match = re.match(_itunes_stdout_re, out)
    if match is not None:
        song, playlist, source = match.groups()
        puts(colored.cyan("Added {} to iTunes!".format(song)))
def download(url, name=""):
    """
    Download from a URL, extracting the filename from the
    Content-Disposition header.

    Returns the original filename, a sanitized one, and the content of the
    file, or (None, None, None) on a non-200 response.
    """
    resp = requests.get(url)
    if resp.status_code == 200:
        parsed = parse_requests_response(resp)
        if name:
            fname = name + os.path.splitext(parsed.filename_unsafe)[-1]
        else:
            fname = parsed.filename_sanitized(
                os.path.splitext(parsed.filename_unsafe)[-1].strip("."))
        return parsed.filename_unsafe, fname, resp.content
    else:
        return None, None, None
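# Hedged usage of the download() helper above, assuming the server sets a
# Content-Disposition header (the URL is a placeholder). On success it
# returns the server-suggested name, a sanitized variant, and the raw
# bytes; on any non-200 status it returns (None, None, None).
original, safe_name, content = download('https://example.com/report.pdf')
if content is not None:
    with open(safe_name, 'wb') as f:
        f.write(content)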
def get_filename(response, expected_extension):
    """Get the filename from the Content-Disposition header."""
    try:
        content_disposition = rfc6266.parse_requests_response(response)
    except Exception as error:
        # let's not depend on the dependencies of rfc6266
        logger.warning(
            'Failed to parse content_disposition header from %r, error: %r',
            response.url, error)
        # fall back on guessing the filename from the URL
        basename = requests.utils.unquote_unreserved(response.url).rsplit(
            '/', 1)[-1]
        return (basename or 'file') + '.' + expected_extension
    extension = content_disposition.filename_unsafe.rsplit('.', 1)[-1]
    # allow for some common file extension variations
    safe_aliases = (extension.lower(),
                    extension.lower().replace('jpeg', 'jpg'),
                    extension.lower().replace('jpg', 'jpeg'))
    if expected_extension in safe_aliases:
        expected_extension = extension
    return content_disposition.filename_sanitized(expected_extension)
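# Hedged usage of get_filename(): pair a response with the extension you
# expect. The helper keeps the server's extension when it is a harmless
# variant (jpg vs jpeg) and otherwise enforces yours. The URL is a
# placeholder and is assumed to serve a Content-Disposition header.
import requests

resp = requests.get('https://example.com/photo', stream=True)
print(get_filename(resp, 'jpg'))  # e.g. 'photo.jpeg' if the server says so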
def postDownloadCsv(self, marketCode, commodity, commodity2, setMon, pcCode):
    print('.', end='')
    payload = {
        'captcha': '',
        'commodity_id2t': str(commodity2),
        'commodity_idt': str(commodity),
        'commodityId': str(commodity),
        'commodityId2': str(commodity2),
        'curpage': '1',
        'doQuery': '1',
        'doQueryPage': '',
        'marketcode': str(marketCode),
        'MarketCode': str(marketCode),
        'pccode': str(pcCode),
        'queryDate': str(self.QueryDate),
        'queryDateAh': str(self.QueryDateAh),
        'settlemon': str(setMon),
        'totalpage': ''
    }
    res = self.session.post(self.DownURL, data=payload, headers=self.header,
                            cookies=self.cookies)
    if res.status_code != requests.codes.ok:
        raise Exception("Post Download Failed")
    if res.headers.get('Content-Disposition') is None:
        print('Download Failed', marketCode, commodity, commodity2, setMon,
              pcCode)
        return
    fileName = rfc6266.parse_requests_response(res).filename_unsafe
    with open(self.directory + fileName, 'wb') as fd:
        for chunk in res.iter_content(256):
            fd.write(chunk)
def extract_downloadinfo(self):
    '''Extract download information from the URL.'''
    protocol = get_fileprotocol(self.url)
    if protocol == 'http' or protocol == 'https':
        file = File()
        try:
            proxies = http_proxies()
            r = requests_retry_session().get(self.url, stream=True,
                                             proxies=proxies)
            r.raise_for_status()
            file.name = rfc6266.parse_requests_response(r).filename_unsafe
            file.url = self.url
            try:
                file.size = int(r.headers['Content-Length'])
            except (KeyError, ValueError):
                file.size = 0
            file.setResume(0)
            return file
        except Exception as e:
            print(e.args[0])
            return file
    else:
        print(protocol, 'not supported yet!')
        return None
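# File above is a small project-specific container; a hedged sketch of the
# minimal shape extract_downloadinfo() needs (an assumption, not the
# project's real class):
class File:
    def __init__(self):
        self.name = ''
        self.url = ''
        self.size = 0
        self._resume = 0

    def setResume(self, value):
        # Record how many bytes have already been downloaded.
        self._resume = value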
def main():
    for item in parse_cmake_dependencies():
        if isinstance(item, GitRepository):
            repo_url, tag = item
            repo_name = repo_url.rstrip("/").split("/")[-1].replace(".git", "")
            with TemporaryDirectory(prefix="AppImageKit-") as tempdir:
                log("Cloning Git repository: {}".format(repo_url))
                porcelain.clone(repo_url, tempdir)
                # TODO: replace with dulwich solution
                # maybe have a look at Pext source code
                version = subprocess.check_output(
                    ["git", "describe", "--always", "--tags", tag],
                    cwd=tempdir).decode().split("\n")[0]
                tarball_name = "{}-{}.tar.gz".format(repo_name, version)
                if os.path.exists(tarball_name):
                    log("Warning: {} exists, skipping".format(tarball_name))
                    continue
                tarball_path = os.path.join(tempdir, tarball_name)
                log("Creating tarball for tag/branch {}: {}".format(
                    tag, tarball_name))
                # TODO: replace with dulwich call to remove dependency on
                # Git binary
                subprocess.check_call(
                    ["git", "archive", "--format", "tar.gz",
                     "-o", tarball_path,
                     "--prefix", "{}/".format(repo_name), tag],
                    cwd=tempdir)
                destination = os.path.join(os.getcwd(), "sources")
                shutil.copyfile(tarball_path,
                                os.path.join(destination, tarball_name))
        elif isinstance(item, (TarballURL, PatchURL)):
            if isinstance(item, TarballURL):
                url, hash = item
            elif isinstance(item, PatchURL):
                url, hash = item[0], None
            else:
                url, hash = str(item), None
            log("Downloading URL: {}".format(url))
            digest = None
            hash_algorithm = hash_value = None
            if hash is not None:
                hash_algorithm, hash_value = hash
                if hash_algorithm not in hashlib.algorithms_available:
                    log("Warning: hashing algorithm {} not supported by "
                        "Python interpreter".format(hash_algorithm))
                    hash_algorithm = None
                else:
                    digest = hashlib.new(hash_algorithm.upper())
            response = requests.get(url, stream=True)
            response.raise_for_status()
            content_disposition = rfc6266.parse_requests_response(response)
            ext = os.path.splitext(content_disposition.filename_unsafe)[-1]
            filename = content_disposition.filename_sanitized(ext.strip("."))
            if isinstance(item, TarballURL):
                path = os.path.join("sources", filename)
            elif isinstance(item, PatchURL):
                path = os.path.join("patches", filename)
            else:
                path = filename
            try:
                total = int(response.headers.get("Content-Length", None))
            except (ValueError, TypeError):
                total = None
            if os.path.exists(path):
                # if a hash value is available, use that to verify whether
                # the file on the system is up to date
                if hash_algorithm is not None:
                    local_digest = hashlib.new(hash_algorithm)
                    with open(path, "rb") as f:
                        while True:
                            data = f.read(4096)
                            if not data:
                                break
                            local_digest.update(data)
                    if hash_value == local_digest.hexdigest():
                        log("Warning: file {} exists, "
                            "skipping download".format(path))
                        continue
                if total is None:
                    log("Warning: size of file {} unknown, overwriting "
                        "local file".format(path))
                else:
                    if os.path.getsize(path) == total:
                        log("Warning: file {} exists, "
                            "skipping download".format(path))
                        continue
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "wb") as f:
                with tqdm(total=total) as pbar:
                    for chunk in response.iter_content():
                        f.write(chunk)
                        if digest is not None:
                            digest.update(chunk)
                        pbar.update(len(chunk))
            if digest is not None:
                if hash_value != digest.hexdigest():
                    log("Warning: could not verify file integrity: "
                        "expected digest: {}, received: {}".format(
                            hash_value, digest.hexdigest()))
def main(argv):
    parser = argparse.ArgumentParser(description='litres.ru backup tool')
    parser.add_argument("-u", "--user", help="Username")
    parser.add_argument("-p", "--password", help="Password")
    parser.add_argument("-f", "--format", default="ios.epub",
                        help="Downloading format. 'list' for available")
    parser.add_argument("-d", "--debug", action="store_true",
                        help="Add debug output")
    parser.add_argument("-v", "--verbosedebug", action="store_true",
                        help="You really want to see what happens?")
    args = parser.parse_args()

    if args.format == 'list':
        for f in FORMATS:
            print(f)
        exit(0)
    elif args.format not in FORMATS:
        print("I don't know this format: " + args.format)
        exit(1)
    if args.user is None or args.password is None:
        print("I can't work without a username and password")
        exit(1)
    if args.debug:
        print("Will ask for downloading " + args.format)
        print("Trying to log in to the site as " + args.user)

    r = requests.post(URL + "catalit_authorise/",
                      data={'login': args.user, 'pwd': args.password})
    if args.debug:
        print("Response:", r.status_code, r.reason)
        print("Response text: " + r.text)
    root = ET.fromstring(r.content)
    if root.tag == "catalit-authorization-failed":
        print("Authorization failed")
        exit(1)
    sid = root.attrib['sid']
    if args.debug:
        print("Welcome,", root.attrib['login'], "(", root.attrib['mail'], ")")
        print("Asking litres.ru for the list of books (can take some time)")
        print("sid", sid)

    r = requests.post(URL + "catalit_browser/",
                      data={'sid': sid, 'my': "1", 'limit': "0,1000"})
    if args.verbosedebug:
        print("Response", r.status_code, r.reason)
        print("Response text", r.text)
    root = ET.fromstring(r.content)
    count_total = root.attrib['records']
    if args.debug:
        print("Total books:", count_total)
    if args.verbosedebug:
        print(root.tag, root.attrib)

    count = 1
    for child in root:
        if args.verbosedebug:
            print(child.tag, child.attrib)
        hub_id = child.attrib['hub_id']
        file_size = 0
        for elem in child.iter():
            if elem.tag == 'file' and elem.attrib['type'] == args.format:
                file_size = elem.attrib['size']
                if args.verbosedebug:
                    print(elem.tag, elem.attrib, elem.text, file_size)
        r = requests.post(URL + "catalit_download_book/",
                          data={'sid': sid, 'art': hub_id,
                                'type': args.format},
                          stream=True)
        if args.debug:
            print("Response", r.status_code, r.reason)
        filename = rfc6266.parse_requests_response(r).filename_unsafe
        print("(", count, "/", count_total, ")", filename)
        with open(filename, "wb") as handle:
            for data in tqdm(r.iter_content(), unit='b',
                             total=int(file_size)):
                handle.write(data)
        time.sleep(1)  # do not DDoS litres.
        count = count + 1

    r = requests.get(URL_www + "/pages/my_books_fresh/",
                     cookies={'SID': sid})
    items = ET.HTML(r.content).xpath("//div[contains(@class, 'art-item')]")
    for item in items:
        link = item.xpath(
            ".//a[contains(@class, 'art-buttons__read_purchased')]")
        info = item.xpath(".//div[@data-obj]")
        if len(link) != 1 or len(info) != 1:
            continue
        link = link[0]
        info = info[0]
        if args.verbosedebug:
            print("Book link", link.attrib['href'])
            print("Book info", info.attrib['data-obj'])
        data_obj = dict(demjson.decode(info.attrib['data-obj']))
        book_name = data_obj['author'] + '_' + data_obj['alt']
        fid = re.search(r"file=(\d+)&", link.attrib['href']).group(1)
        while len(fid) < 8:
            fid = "0" + fid
        m = re.match(r"(\d\d)(\d\d)(\d\d)(\d\d)", fid)
        r = requests.get(URL_www + "/static/pdfjs/" + m.group(1) + "/" +
                         m.group(2) + "/" + m.group(3) + "/" + fid + ".js",
                         cookies={'SID': sid})
        m = re.search(r"=\s(\{.+\});", r.text)
        js_obj = dict(demjson.decode(m.group(1)))
        max_w_index = 0
        for i, page in enumerate(js_obj['pages']):
            if page['p'][0]['w'] > js_obj['pages'][max_w_index]['p'][0]['w']:
                max_w_index = i
        pages = js_obj['pages'][max_w_index]['p']
        rt = js_obj['pages'][max_w_index]['rt']
        os.mkdir(TMP_DIR)
        imgs = []
        for i, page in enumerate(pages):
            r = requests.get(URL_www + "/pages/read_book_online/?file=" +
                             fid + "&page=" + str(i) + "&rt=" + rt +
                             "&ft=" + page['ext'],
                             cookies={'SID': sid})
            img = TMP_DIR + '/' + str(i) + '.' + str(page['ext'])
            with open(img, 'wb') as f:
                f.write(r.content)
            imgs.append(img)
            if i % 10 == 0:
                time.sleep(1)
        with open(book_name + '.pdf', "wb") as f:
            f.write(img2pdf.convert(imgs))
        shutil.rmtree(TMP_DIR)
def fetch_and_convert(args, dataset):
    r = None
    metadata = dataset.get('datagetter_metadata', {})
    dataset['datagetter_metadata'] = metadata
    if dataset['license'] not in acceptable_licenses + unacceptable_licenses:
        raise ValueError('Unrecognised license ' + dataset['license'])
    url = dataset['distribution'][0]['downloadURL']

    if args.download:
        proxies = None
        metadata['datetime_downloaded'] = \
            strict_rfc3339.now_to_rfc3339_localoffset()
        if args.socks5_proxy:
            proxies = {
                'http': args.socks5_proxy,
                'https': args.socks5_proxy,
            }
        try:
            print("Fetching %s" % url)
            r = requests.get(
                url,
                headers={'User-Agent': 'datagetter '
                         '(https://github.com/ThreeSixtyGiving/datagetter)'},
                proxies=proxies)
            r.raise_for_status()
            metadata['downloads'] = True
        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise
            print("\n\nDownload {} failed for dataset {}\n".format(
                url, dataset['identifier']))
            traceback.print_exc()
            metadata['downloads'] = False
            metadata['error'] = str(e)
            if not isinstance(e, requests.exceptions.HTTPError):
                return

        content_type = r.headers.get('content-type', '').split(';')[0].lower()
        if content_type and content_type in CONTENT_TYPE_MAP:
            file_type = CONTENT_TYPE_MAP[content_type]
        elif 'content-disposition' in r.headers:
            file_type = rfc6266.parse_requests_response(
                r).filename_unsafe.split('.')[-1]
        else:
            file_type = url.split('.')[-1]
        if file_type not in CONTENT_TYPE_MAP.values():
            print("\n\nUnrecognised file type {}\n".format(file_type))
            return

        # Check that the downloaded JSON file is valid JSON and not junk
        # from the webserver, e.g. a 500 error being output without the
        # proper status code.
        if file_type == "json":
            try:
                json.loads(r.text)
            except ValueError:
                print("\n\nJSON file provided by webserver is invalid")
                metadata['downloads'] = False
                metadata['error'] = "Invalid JSON file provided by webserver"
                return

        metadata['file_type'] = file_type
        file_name = (args.data_dir + '/original/' + dataset['identifier'] +
                     '.' + file_type)
        with open(file_name, 'wb') as fp:
            fp.write(r.content)
    else:
        # --no-download arg
        # We require the metadata to exist; it won't if the file failed to
        # download correctly.
        if metadata['downloads'] == False:
            print("Skipping %s as it was not marked as successfully "
                  "downloaded" % dataset['identifier'])
            return
        file_type = metadata['file_type']
        file_name = (args.data_dir + '/original/' + dataset['identifier'] +
                     '.' + file_type)

    json_file_name = '{}/json_all/{}.json'.format(args.data_dir,
                                                  dataset['identifier'])
    metadata['file_size'] = os.path.getsize(file_name)

    if args.convert and (args.convert_big_files
                         or metadata['file_size'] < 10 * 1024 * 1024):
        if file_type == 'json':
            os.link(file_name, json_file_name)
            metadata['json'] = json_file_name
        else:
            try:
                print("Running convert on %s to %s" % (file_name,
                                                       json_file_name))
                convert_spreadsheet(file_name, json_file_name, file_type)
            except KeyboardInterrupt:
                raise
            except Exception:
                print("\n\nUnflattening failed for file {}\n".format(
                    file_name))
                traceback.print_exc()
                metadata['json'] = None
                metadata["valid"] = False
                metadata["error"] = "Could not unflatten file"
            else:
                metadata['json'] = json_file_name

    metadata['acceptable_license'] = dataset['license'] in acceptable_licenses

    # We can only do anything with the JSON if it did successfully convert.
    if metadata.get('json'):
        format_checker = FormatChecker()
        if args.validate:
            try:
                with open(json_file_name, 'r') as fp:
                    validate(json.load(fp), schema,
                             format_checker=format_checker)
            except (ValidationError, ValueError):
                metadata['valid'] = False
            else:
                metadata['valid'] = True
        if metadata['valid']:
            os.link(json_file_name,
                    '{}/json_valid/{}.json'.format(args.data_dir,
                                                   dataset['identifier']))
            data_valid.append(dataset)
            if metadata['acceptable_license']:
                os.link(json_file_name,
                        '{}/json_acceptable_license_valid/{}.json'.format(
                            args.data_dir, dataset['identifier']))
                data_acceptable_license_valid.append(dataset)
        if metadata['acceptable_license']:
            os.link(json_file_name,
                    '{}/json_acceptable_license/{}.json'.format(
                        args.data_dir, dataset['identifier']))
            data_acceptable_license.append(dataset)
def download(self, **state):
    # Load the status file for the URL.
    split = urlsplit(self.url)
    url_status_file = '{}.json'.format(
        os.path.join(state['archives_dir'], 'status', split.netloc,
                     split.path[1:]))
    url_status = get_status(url_status_file)

    # Get the headers for the URL
    if 'downloaded' not in url_status or not url_status['downloaded']:
        req = requests.get(self.url, allow_redirects=True, stream=True)
        headers = req.headers

        # Extract a filename
        filename = rfc6266.parse_requests_response(req).filename_unsafe

        # Work out our output path
        output_file = os.path.join(state['archives_dir'], filename)
        status_file = os.path.join(state['status_dir'],
                                   '{}.json'.format(filename))

        # Load the status file.
        status = get_status(status_file)

        # If our file exists, compare its last modified time with the one
        # on the server
        if os.path.exists(output_file):
            # Check if we have a Last-Modified value in the headers
            if 'Last-Modified' in headers:
                # Get when the local and remote files were last modified
                l_modified = os.path.getmtime(output_file)
                r_modified = time.mktime(
                    parser.parse(headers['Last-Modified']).timetuple())

                # If ours was modified later, we don't need to download
                # again
                if l_modified > r_modified:
                    cprint(indent(
                        'URL {} not modified... Skipping...'.format(filename),
                        8), 'yellow', attrs=['bold'])
                    url_status = update_status(url_status_file, {
                        'downloaded': True,
                        'archive': output_file
                    })
                    return {'archive': output_file}

            # If there is an Etag we can use, we can check whether it has
            # changed
            elif 'Etag' in headers:
                if ('download_etag' not in status
                        or status['download_etag'] != headers['Etag']):
                    status = update_status(
                        status_file, {'download_etag': headers['Etag']})
                else:
                    cprint(indent(
                        'URL {} not modified... Skipping...'.format(filename),
                        8), 'yellow', attrs=['bold'])
                    url_status = update_status(url_status_file, {
                        'downloaded': True,
                        'archive': output_file
                    })
                    return {'archive': output_file}

        cprint(indent('Downloading {}'.format(filename), 8),
               'green', attrs=['bold'])

        # Total size in bytes.
        total_size = int(headers.get('content-length', 0))

        # Get the file
        with open(output_file, 'wb') as f:
            with tqdm(total=total_size, unit='B',
                      unit_scale=True) as progress:
                for data in req.iter_content(32 * 1024):
                    f.write(data)
                    progress.update(len(data))

        url_status = update_status(url_status_file, {
            'downloaded': True,
            'archive': output_file
        })

        # Return our updates to state
        return {'archive': output_file}
    else:
        cprint(indent(
            'URL {} not modified... Skipping...'.format(
                os.path.basename(url_status['archive'])),
            8), 'yellow', attrs=['bold'])
        return {'archive': url_status['archive']}
def download(id: int, file: int = typer.Argument(None)):
    """Download a dataset by passing in the dataset id and optionally the
    file number.
    """
    datasets = get_datasets(API_URL)
    # if the dataset id exists
    if id in dataset_ids(datasets):
        # loop through each dataset
        for row in datasets:
            # find the dataset with the given id
            if int(row["id"]) == id:
                # make a directory in the format id_DatasetName
                id = row["id"]
                name = row["Name"]
                dest_folder = f"{id}_{name.replace(' ', '')}"
                if not os.path.exists(dest_folder):
                    os.makedirs(dest_folder)
                # if the optional argument was passed, download only the
                # specified file
                if file is not None:
                    # convert the user input into proper Python indexing and
                    # make sure the requested file exists for this dataset
                    file_index = file - 1
                    if file_index >= len(row["Datasets"]) or file_index < 0:
                        typer.echo(
                            "The dataset you selected does not have that "
                            "file.")
                    else:
                        # get the download URL
                        url = row["Datasets"][file_index]["URL"]
                        r = requests.get(url, allow_redirects=True,
                                         stream=True)
                        # get the individual file's name
                        filename = rfc6266.parse_requests_response(
                            r).filename_unsafe
                        file_path = os.path.join(dest_folder, filename)
                        typer.echo("Downloading file now!")
                        # loading bar code from Stack Overflow:
                        # https://stackoverflow.com/questions/37573483/progress-bar-while-download-file-over-http-with-requests/37573701#37573701
                        total_size_in_bytes = int(
                            r.headers.get("content-length", 0))
                        block_size = 1024  # 1 kibibyte
                        progress_bar = tqdm(total=total_size_in_bytes,
                                            unit="iB", unit_scale=True)
                        # save the file
                        with open(file_path, "wb") as fid:
                            for data in r.iter_content(block_size):
                                progress_bar.update(len(data))
                                fid.write(data)
                        progress_bar.close()
                # if no file is specified, download all files
                else:
                    urls = [data["URL"] for data in row["Datasets"]]
                    typer.echo("Downloading files now!")
                    for i, url in enumerate(urls):
                        r = requests.get(url, allow_redirects=True,
                                         stream=True)
                        filename = rfc6266.parse_requests_response(
                            r).filename_unsafe
                        file_path = os.path.join(dest_folder, filename)
                        total_size_in_bytes = int(
                            r.headers.get("content-length", 0))
                        block_size = 1024  # 1 kibibyte
                        progress_bar = tqdm(total=total_size_in_bytes,
                                            unit="iB", unit_scale=True)
                        with open(file_path, "wb") as fid:
                            for data in r.iter_content(block_size):
                                progress_bar.update(len(data))
                                fid.write(data)
                        progress_bar.close()
    # the dataset id doesn't exist
    else:
        typer.echo(
            "That dataset doesn't exist or you've made a typo in the id.")
        typer.echo(
            "Use the 'see all datasets' command to view the available "
            "datasets.")
def test_requests(httpserver):
    requests = pytest.importorskip('requests')
    httpserver.serve_content(
        'eep', headers={'Content-Disposition': 'attachment; filename="a b="'})
    resp = requests.get(httpserver.url)
    assert parse_requests_response(resp).filename_unsafe == 'a b='
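# A companion test in the same style, exercising the RFC 6266 extended
# filename* parameter. The expected decoding follows the RFC, but treat the
# test as a sketch rather than a verified fixture of the original suite.
def test_requests_ext_filename(httpserver):
    requests = pytest.importorskip('requests')
    httpserver.serve_content(
        'eep',
        headers={'Content-Disposition': "attachment; filename*=UTF-8''%C3%A9.txt"})
    resp = requests.get(httpserver.url)
    assert parse_requests_response(resp).filename_unsafe == 'é.txt'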