def test_internet_archive():
    from datetime import timedelta

    from django.conf import settings
    from django.template.defaultfilters import truncatechars
    from django.utils import timezone
    import internetarchive

    from perma.models import Link

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)
    links = Link.objects.filter(
        internet_archive_upload_status="completed",
        creation_timestamp__range=(start_date, end_date),
    )
    all_results = dict()
    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY,
                "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    # Session configured with the IA S3 keys; passed to the item and file lookups below.
    session = internetarchive.get_session(config=c)

    for link in links:
        # Fresh per-link result dict.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier, archive_session=session)
        warc_name = "%s.warc.gz" % link.guid
        try:
            fnames = [f.name for f in internetarchive.get_files(
                identifier, glob_pattern="*gz", archive_session=session)]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid, truncatechars(link.submitted_title, 50))
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url, link.creation_timestamp)
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization, link.organization.registrar)
        except Exception as e:
            guid_results["error"] = e
        all_results[link.guid] = guid_results

    print(all_results)
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open("ia_test.ini", "w") as fh:
        fh.write(test_conf)
    s = get_session(config_file="ia_test.ini")
    assert s.access_key == "key2"
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)

        s = get_session(config={"s3": {"access": "key"}})
        files = get_files("nasa", files="nasa_meta.xml", archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml",
                          config={"logging": {"level": "INFO"}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        test_conf = """[s3]\naccess = key2"""
        with open("ia_test.ini", "w") as fh:
            fh.write(test_conf)
        files = get_files("nasa", files="nasa_meta.xml", config_file="ia_test.ini")
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml",
                          http_adapter_kwargs={"max_retries": 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml",
                          request_kwargs={"timeout": 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"
def get_ia_session():
    # `config` here is the application's own config object (keyed by
    # slash-separated paths), not an internetarchive config dict.
    try:
        assert config.has_key('s3/access_key')
        assert config.has_key('s3/secret_key')
        assert config.has_key('cookie')
        assert config.has_key('email')
    except Exception:
        return None

    ia_session = get_session(
        config={
            'general': {'secure': True},
            's3': {
                'access': config.get('s3/access_key'),
                'secret': config.get('s3/secret_key'),
            },
            'cookies': {
                'logged-in-user': config.get('email'),
                'logged-in-sig': config.get('cookie'),
            },
        },
        http_adapter_kwargs={'max_retries': 10},
    )
    return ia_session
def getStats():
    today = date.today()
    back30_days = (datetime.now() - timedelta(days=30)).date()
    back7_days = (datetime.now() - timedelta(days=7)).date()
    collection = 'greekgovernmentgazette'
    query_all = f'collection:({collection})'

    res = {}
    try:
        s = get_session()
        s.mount_http_adapter()
        search_results = s.search_items(query_all, fields=['identifier', 'addeddate'])
        lst_res = list(search_results)
        docs_last30days = [i for i in lst_res
                           if isodate.parse_date(i['addeddate']) >= back30_days]
        docs_last7days = [i for i in lst_res
                          if isodate.parse_date(i['addeddate']) >= back7_days]
        docs_today = [i for i in lst_res
                      if isodate.parse_date(i['addeddate']) == today]
        res['count_all'] = len(lst_res)
        res['count_last30days'] = len(docs_last30days)
        res['count_last7days'] = len(docs_last7days)
        res['count_today'] = len(docs_today)
    finally:
        # Returning from finally swallows any exception raised above;
        # callers get whatever counts were computed before the failure.
        return res
def __init__(self, archive_id, metadata=None, config_file_path=None, repo_base=None):
    """
    :param archive_id:
    :param config_file_path:
    :param repo_base: In the archive item, place each file in a folder mirroring its local location.
    """
    self.repo_base = repo_base
    self.archive_id = archive_id
    self.archive_session = internetarchive.get_session(config_file=config_file_path)
    self.archive_item = internetarchive.get_item(archive_id, config_file=config_file_path)
    self.metadata = metadata
    logging.info(self.archive_item.identifier)
    # Keep only the uploader-supplied files, skipping item-level and system files.
    self.original_item_files = [
        f for f in self.archive_item.files
        if f["source"] == "original"
        and not f["name"].startswith(self.archive_item.identifier)
        and not f["name"].startswith("_")
    ]
    self.original_item_file_names = sorted(f["name"] for f in self.original_item_files)
def create_subcollection(collection_id, metadata=None, parent_collection=None):
    """
    The expected sub-collection hierarchy is as follows:
    top-level OSF collection -> provider collection ->
    collection for nodes with multiple children -> all only-child nodes

    :param metadata: dict of attributes for the provider sub-collection being created
    :param parent_collection: str the name of the sub-collection's parent
    :return:
    """
    if metadata is None:
        metadata = {}

    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )
    collection = internetarchive.Item(session, collection_id)
    collection.upload(
        files={"dummy.txt": BytesIO(b"dummy")},
        metadata={
            "mediatype": "collection",
            "collection": parent_collection,
            **metadata,
        },
    )
def get_ia_item(guid):
    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )
    return session.get_item(guid)
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open('ia_test.ini', 'w') as fh:
        fh.write(test_conf)
    s = get_session(config_file='ia_test.ini')
    assert s.access_key == 'key2'
def stream_from_pbox(itemname, filename):
    # TODO: handle errors etc
    archive_session = get_session(config_file=settings.IATOOL_CONFIG_PATH)
    item = archive_session.get_item(itemname)
    files = item.get_files(filename)
    file = next(files)
    return file.download(return_responses=True)
def new_session():
    global SESSION
    if SESSION is not None:
        raise Exception('have another session!')
    SESSION = get_session(config=CONFIG)
def createSession(self):
    iaKey = decryptEnvVar('IA_ACCESS_KEY')
    iaSecret = decryptEnvVar('IA_SECRET_KEY')
    return get_session(config={'s3': {
        'access': iaKey,
        'secret': iaSecret
    }})
def start_ia_session(self):
    """ starts an internet archive session """
    config = dict(s3=dict(access=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                          secret=settings.INTERNET_ARCHIVE_SECRET_KEY))
    s = get_session(config=config, debug=True)
    s.access_key = settings.INTERNET_ARCHIVE_ACCESS_KEY
    s.secret_key = settings.INTERNET_ARCHIVE_SECRET_KEY
    return s
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key3'}})
        item = get_item('nasa', archive_session=s)
        assert item.session.access_key == 'key3'
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={"general": {"secure": True}})
        rsps.add_metadata_mock("nasa")
        item = s.get_item("nasa")
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE)
        r = item.upload(NASA_METADATA_PATH)
        assert r[0].url == "https://s3.us.archive.org/nasa/nasa.json"
def upload_to_ia(force=set()):
    s = get_session()
    item = s.get_item("NotoFonts")
    hashdict = {f["name"]: f["md5"] for f in item.files}
    fonts_modified = False
    for path in tqdm(sorted(pathset)):
        filename = path.name
        file = open(path, "rb").read()
        hash = md5(file).hexdigest()
        if "fonts" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
        fonts_modified = True
        print("WORKING: " + filename)
        upload_paths = []
        ttf = TTFont(path)
        print("  CONVERTING TO woff2...")
        ttf.flavor = "woff2"
        woff2_path = "upload/" + path.with_suffix(".woff2").name
        try:
            ttf.save(open(woff2_path, "wb"))
            upload_paths.append(woff2_path)
        except TTLibError:
            print("could not convert to woff2")
        print("  CONVERTING TO woff...")
        ttf.flavor = "woff"
        woff_path = "upload/" + path.with_suffix(".woff").name
        ttf.save(open(woff_path, "wb"))
        upload_paths.append(woff_path)
        print("  UPLOADING...")
        r = item.upload(files=[*upload_paths, str(path)], retries=100)
        for upath in [woff2_path, woff_path]:
            remove(upath)
    if "css" in force or fonts_modified:
        from generate_css import build_all_css
        print("  GENERATING CSS...")
        build_all_css()
        css_files = glob("*.css")
        for path in [Path(p) for p in sorted(css_files)]:
            filename = path.name
            file = open(path, "rb").read()
            hash = md5(file).hexdigest()
            # if "css" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
            print("  UPLOADING " + filename)
            r = item.upload(files=css_files, retries=100)
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE)
        r = item.upload(NASA_METADATA_PATH)
        assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
def test_get_session_with_config():
    s = get_session(config={
        's3': {'access': 'key'},
        'general': {'secure': False}
    })
    assert s.access_key == 'key'
def __init__(
    self,
    issue_db: IssueDB,
    sandcrawler_db_client: SandcrawlerPostgrestClient,
    sandcrawler_s3_client: SandcrawlerMinioClient,
):
    self.issue_db: IssueDB = issue_db
    self.ia_client = internetarchive.get_session()
    self.sandcrawler_db_client = sandcrawler_db_client
    self.sandcrawler_s3_client = sandcrawler_s3_client
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        c = {'s3': {'access': 'foo', 'secret': 'bar'}, 'general': {'secure': True}}
        s = get_session(config=c)
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE)
        r = item.upload(NASA_METADATA_PATH)
        assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
def get_stats(days=30):
    back_days = (datetime.now() - timedelta(days=days)).date()
    query_all = 'collection:(greekgovernmentgazette) AND date:[{1} TO {0}]' \
        .format(datetime.now().strftime('%Y-%m-%d'), back_days.strftime('%Y-%m-%d'))
    s = get_session()
    s.mount_http_adapter()
    search_results = s.search_items(query_all, fields=['identifier', 'addeddate'])
    return '{}\t{}'.format(query_all, len(search_results))
def test_upload_secure_session(testitem_metadata, json_filename):
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add(responses.GET, 'https://archive.org/metadata/nasa',
                 body=testitem_metadata,
                 status=200)
        item = s.get_item('nasa')
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE, status=200)
        r = item.upload(json_filename)
        assert r[0].url == 'https://s3.us.archive.org/nasa/nasa_meta.json'
def __init__(self, file_storage, access_key, secret_key, loglevel, logfile):
    self.file_storage = file_storage
    self.access_key = access_key
    self.secret_key = secret_key
    session_data = {'access': access_key, 'secret': secret_key}
    # get_session() expects the logging options nested once, as
    # {'logging': {'level': ..., 'file': ...}}.
    if logfile:
        logconfig = {'level': loglevel, 'file': logfile}
    else:
        logconfig = {'level': loglevel}
    self.session = get_session({'s3': session_data, 'logging': logconfig})
    self.logger = logging.getLogger('iasync')
def __init__(self, top_dir, access_key, secret_key, loglevel, logfile):
    self.top_dir = top_dir
    self.access_key = access_key
    self.secret_key = secret_key
    session_data = {'access': access_key, 'secret': secret_key}
    # get_session() expects the logging options nested once, as
    # {'logging': {'level': ..., 'file': ...}}.
    if logfile:
        logconfig = {'level': loglevel, 'file': logfile}
    else:
        logconfig = {'level': loglevel}
    self.session = get_session({'s3': session_data, 'logging': logconfig})
    self.logger = logging.getLogger('gvision.ia')
def ia_get_session() -> ArchiveSession:
    """
    Get an IA session based on the configuration supplied via environment variables.

    Because get_session() starts with values found in ~/.ia for the current user
    and then merges in the "additional" supplied values, the configuration file
    reference is explicitly eliminated. For desktop testing, this guarantees that
    the application has the same dependence on environment variables as it would
    when deployed.
    """
    return internetarchive.get_session(IA_CONFIG, config_file='/dev/null')
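# A minimal sketch of how the IA_CONFIG dict consumed above might be assembled.
# The environment-variable names (IA_S3_ACCESS_KEY, IA_S3_SECRET_KEY) are
# assumptions for illustration; only the config shape ({'s3': {'access', 'secret'}})
# is taken from the other snippets in this collection.
import os

IA_CONFIG = {
    's3': {
        'access': os.environ.get('IA_S3_ACCESS_KEY'),
        'secret': os.environ.get('IA_S3_SECRET_KEY'),
    },
    'general': {'secure': True},
}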
def main(collection: str, name: str, concurrency: int, dummy: bool = False, dummy_text: str = None):
    if dummy:
        t = DummyTrainer(dummy_text)
    else:
        urls = from_cdx_url(cdx_url(collection), session=internetarchive.get_session())
        t = Trainer(urls, concurrency=concurrency)
    identifier = str(int(time.time()))
    filename_base = collection + '_dictionary_' + identifier
    upload_urls = t.upload(filename_base + '.zstdict.zst', filename_base)
    print(upload_urls)
    add_entry(identifier, name, t.sha256, upload_urls['public_url'],
              upload_urls['backup_url'])
def __init__(self, ckm_repo: CkanMetaRepo, ia_access: str, ia_secret: str,
             ia_collection: str, token: str = None) -> None:
    self.ckm_repo = ckm_repo
    self.ia_collection = ia_collection
    self.ia_access = ia_access
    self.ia_session = internetarchive.get_session(config={'s3': {
        'access': ia_access,
        'secret': ia_secret,
    }})
    self._gh = github.Github(token) if token else github.Github()
def get_dictionary(filename: str) -> zstandard.ZstdCompressionDict:
    s = internetarchive.get_session()
    # Read the first 8 bytes: the zstd skippable-frame magic plus the frame size.
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=0-7'})
    if r.content[:4] != b'\x5D\x2A\x4D\x18':
        return None
    data_size = struct.unpack('<L', r.content[4:])[0]
    # Fetch the frame payload, which holds the (possibly compressed) dictionary.
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=8-{}'.format(8 + data_size - 1)})
    dictionary = r.content
    if r.content[:4] == b'\x28\xB5\x2F\xFD':  # zstd frame magic: decompress first
        dictionary = zstandard.ZstdDecompressor().decompress(dictionary)
    if dictionary[:4] != b'\x37\xA4\x30\xEC':  # zstd dictionary magic
        raise ValueError('Not a dictionary.')
    return zstandard.ZstdCompressionDict(dictionary)
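# For reference, a hedged sketch of the layout get_dictionary() expects: a zstd
# skippable frame (magic bytes 5D 2A 4D 18, little-endian) whose payload is the
# dictionary, optionally itself zstd-compressed. write_dictionary_frame() is a
# hypothetical helper for illustration, not part of the original project.
import struct
import zstandard

def write_dictionary_frame(path: str, dict_bytes: bytes) -> None:
    payload = zstandard.ZstdCompressor().compress(dict_bytes)
    with open(path, 'wb') as fh:
        fh.write(b'\x5D\x2A\x4D\x18')              # skippable-frame magic, little-endian
        fh.write(struct.pack('<L', len(payload)))  # payload size as uint32 LE
        fh.write(payload)                          # compressed dictionary bytes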
def main():
    session = internetarchive.get_session()
    if len(sys.argv) == 3:
        item_name = sys.argv[1]
        release_id = sys.argv[2]
        item_to_fileset(item_name, release_id=release_id, session=session)
    else:
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            assert len(fields) == 2
            item_name = fields[0]
            release_id = fields[1]
            item_to_fileset(item_name, release_id=release_id, session=session)
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)

        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml', config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml', config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
def cdx_url(collection: str) -> str:
    session = internetarchive.get_session()
    response = session.get(
        'https://archive.org/advancedsearch.php',
        params={
            'q': ('collection:archiveteam* '
                  'AND format:(Item CDX Index) '
                  'AND identifier:{}*'.format(collection)),
            'fl[]': 'identifier',
            'sort[]': 'addeddate desc',
            'rows': '1',
            'output': 'json',
            'scope': 'all'
        }
    ).json()
    print(response)
    identifier = response['response']['docs'][0]['identifier']
    return 'https://archive.org/download/{0}/{0}.cdx.gz'.format(identifier)
def __init__(self, **kwargs):
    super().__init__()
    self.ingest_strategy = IngestStrategy.ArchiveorgFileset

    # TODO: enable cleanup when confident (eg, safe path parsing)
    self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
    self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
    try:
        os.mkdir(self.working_dir)
    except FileExistsError:
        pass

    self.http_session = requests_retry_session()
    self.ia_session = internetarchive.get_session(
        config={
            "s3": {
                "access": os.environ.get("IA_ACCESS_KEY"),
                "secret": os.environ.get("IA_SECRET_KEY"),
            },
        })
def archive_url(data: typing.Dict[str, bytes],
                url_data: typing.Tuple[str, int, int, str, bool]):
    url, offset, length, filename, redownload = url_data
    if redownload:
        data[url] = requests.get(url, allow_redirects=False).content
    else:
        if filename.endswith('.zst'):
            with dictionary_lock:
                dictionary = get_dictionary(filename)
        r = internetarchive.get_session().get(
            'https://archive.org/download/' + filename,
            headers={'Range': 'bytes={}-{}'.format(offset, offset + length - 1)})
        if filename.endswith('.zst'):
            data[url] = zstandard.ZstdDecompressor(dict_data=dictionary) \
                .decompressobj().decompress(r.content)
        elif filename.endswith('.gz'):
            data[url] = gzip.decompress(r.content)
        elif filename.endswith('.warc'):
            data[url] = r.content
        else:
            raise ValueError('WARC type not supported.')
    print(len(data[url]), url)
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)

    if all(r.ok for r in responses):
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)


access_key = settings.IA_ACCESS_KEY
secret_key = settings.IA_SECRET_KEY
session = ia.get_session({'s3': {
    'access': access_key,
    'secret': secret_key,
}})


def upload_to_ia(identifier, files, metadata=None):
    """Upload an item and its files to the Internet Archive

    On the Internet Archive there are Items and files. Items have a global
    identifier, and files go inside the item:

        https://internetarchive.readthedocs.io/en/latest/items.html

    This function mirrors the IA library's similar upload function, but builds
    in retries and various assumptions that make sense. Note that according to
    emails with IA staff, it is best to maximize the number of files uploaded
    to an Item at a time, rather than uploading each file in a separate go.
    """
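# The body of upload_to_ia() is not included above. A minimal sketch of the
# retry-around-upload idea its docstring describes, reusing the module-level
# `session` defined earlier; _upload_with_retries() is a hypothetical
# illustration, not the original implementation.
import time
import requests

def _upload_with_retries(identifier, files, metadata=None, attempts=3):
    item = session.get_item(identifier)
    for attempt in range(1, attempts + 1):
        try:
            # Upload all files in one call, per the guidance quoted in the docstring.
            return item.upload(files, metadata=metadata or {})
        except requests.exceptions.RequestException:
            if attempt == attempts:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff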
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'},
                            'general': {'secure': False}})
    assert s.access_key == 'key'
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'}})
    assert s.access_key == 'key'
def test_get_session_with_config():
    s = get_session(config={"s3": {"access": "key"}})
    assert s.access_key == "key"
inc_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, inc_path)

from copy import deepcopy

import responses

from internetarchive.cli import ia_list
from internetarchive import get_session

protocol = 'https:'

ROOT_DIR = os.getcwd()
TEST_JSON_FILE = os.path.join(ROOT_DIR, 'tests/data/nasa_meta.json')
SESSION = get_session()

with open(TEST_JSON_FILE, 'r') as fh:
    ITEM_METADATA = fh.read().strip()

NASA_FILES = set([
    'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_reviews.xml',
    'nasa_meta.xml', 'nasa_archive.torrent', 'nasa_files.xml'
])


def test_ia_list(capsys):
    with responses.RequestsMock() as rsps:
def nasa_item():
    session = get_session()
    with IaRequestsMock() as mocker:
        mocker.add_metadata_mock('nasa')
        yield session.get_item('nasa')
def session():
    return get_session(config=dict(s3=dict(access='access', secret='secret')))
def session_with_logging():
    return get_session(config={'logging': {'level': 'INFO'}})
def session():
    return get_session()
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={"s3": {"access": "key3"}})
        item = get_item("nasa", archive_session=s)
        assert item.session.access_key == "key3"
def test_get_item_with_archive_session(nasa_mocker):
    s = get_session(config={'s3': {'access': 'key3'}})
    item = get_item('nasa', archive_session=s)
    assert item.session.access_key == 'key3'