def test_get_files_with_get_item_kwargs(tmpdir):
    """get_files() should forward session/config kwargs to get_item()."""
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)

        def assert_single_meta_file(result):
            # Every variant below should yield exactly nasa_meta.xml.
            result = list(result)
            assert len(result) == 1
            assert result[0].name == "nasa_meta.xml"

        s = get_session(config={"s3": {"access": "key"}})
        assert_single_meta_file(
            get_files("nasa", files="nasa_meta.xml", archive_session=s))
        assert_single_meta_file(
            get_files("nasa", files="nasa_meta.xml",
                      config={"logging": {"level": "INFO"}}))

        test_conf = """[s3]\naccess = key2"""
        with open("ia_test.ini", "w") as fh:
            fh.write(test_conf)
        assert_single_meta_file(
            get_files("nasa", files="nasa_meta.xml",
                      config_file="ia_test.ini"))
        assert_single_meta_file(
            get_files("nasa", files="nasa_meta.xml",
                      http_adapter_kwargs={"max_retries": 3}))
        assert_single_meta_file(
            get_files("nasa", files="nasa_meta.xml",
                      request_kwargs={"timeout": 4}))
def test_get_files_formats():
    """Filtering by one format string or by a list of formats."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        jpeg_files = list(get_files("nasa", formats="JPEG"))
        assert len(jpeg_files) == 1
        assert jpeg_files[0].name == "globe_west_540.jpg"
        names = {f.name for f in
                 get_files("nasa", formats=["JPEG", "Collection Header"])}
        assert names == {"globe_west_540.jpg", "NASAarchiveLogo.jpg"}
def test_get_files_glob_pattern():
    """Glob patterns select files; '|' separates alternative patterns."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        torrents = list(get_files("nasa", glob_pattern="*torrent"))
        assert len(torrents) == 1
        assert torrents[0].name == "nasa_archive.torrent"
        names = {f.name for f in
                 get_files("nasa", glob_pattern="*torrent|*jpg")}
        assert names == {"globe_west_540.jpg",
                         "NASAarchiveLogo.jpg",
                         "nasa_archive.torrent"}
def test_get_files_formats():
    """Filtering by one format string or by a list of formats."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        jpeg_files = list(get_files('nasa', formats='JPEG'))
        assert len(jpeg_files) == 1
        assert jpeg_files[0].name == 'globe_west_540.jpg'
        names = {f.name for f in
                 get_files('nasa', formats=['JPEG', 'Collection Header'])}
        assert names == {'globe_west_540.jpg', 'NASAarchiveLogo.jpg'}
def test_get_files_glob_pattern():
    """Glob patterns select files; '|' separates alternative patterns."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        torrents = list(get_files('nasa', glob_pattern='*torrent'))
        assert len(torrents) == 1
        assert torrents[0].name == 'nasa_archive.torrent'
        names = {f.name for f in
                 get_files('nasa', glob_pattern='*torrent|*jpg')}
        assert names == {'globe_west_540.jpg',
                         'NASAarchiveLogo.jpg',
                         'nasa_archive.torrent'}
def test_get_files_multiple():
    """Every file returned is one of the explicitly requested names."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        requested = ["nasa_meta.xml", "nasa_files.xml"]
        for f in get_files("nasa", files=requested):
            assert f.name in requested
def test_get_files_non_existing():
    """Requesting a filename the item does not have yields nothing."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        missing = get_files('nasa', files='none')
        assert list(missing) == []
def test_get_files_formats():
    """Filtering by one format string or by a list of formats."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        jpeg_files = list(get_files('nasa', formats='JPEG'))
        assert len(jpeg_files) == 1
        assert jpeg_files[0].name == 'globe_west_540.jpg'
        names = {f.name for f in
                 get_files('nasa', formats=['JPEG', 'Collection Header'])}
        assert names == {'globe_west_540.jpg', 'NASAarchiveLogo.jpg'}
def test_get_files_non_existing():
    """Requesting a filename the item does not have yields nothing."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        missing = get_files('nasa', files='none')
        assert list(missing) == []
def test_get_files_formats():
    """Filtering by one format string or by a list of formats."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        jpeg_files = list(get_files('nasa', formats='JPEG'))
        assert len(jpeg_files) == 1
        assert jpeg_files[0].name == 'globe_west_540.jpg'
        names = {f.name for f in
                 get_files('nasa', formats=['JPEG', 'Collection Header'])}
        assert names == {'globe_west_540.jpg', 'NASAarchiveLogo.jpg'}
def test_internet_archive():
    """Check that recently-uploaded Perma links exist on the Internet Archive
    with the expected WARC file and item metadata.

    Scans links whose upload completed 2-3 days ago and collects, per guid,
    a dict of boolean checks (or an ``error`` entry) into ``all_results``.
    """
    from datetime import timedelta
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)
    links = Link.objects.filter(
        internet_archive_upload_status="completed",
        creation_timestamp__range=(start_date, end_date),
    )
    all_results = dict()
    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY,
                "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    internetarchive.get_session(config=c)
    for link in links:
        # BUG FIX: previously one shared dict was reused for every link, so
        # every all_results entry aliased the same (last) result set.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid
        try:
            fnames = [f.name for f in
                      internetarchive.get_files(identifier, glob_pattern="*gz")]
            guid_results["uploaded_file"] = warc_name in fnames
            # The test collection stores `collection` as a plain string;
            # real collections store it as a list.
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid,
                truncatechars(link.submitted_title, 50),
            )
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            # BUG FIX: this string was split across physical lines without a
            # continuation, which was a syntax error.
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url,
                link.creation_timestamp,
            )
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization,
                    link.organization.registrar,
                )
        except Exception as e:
            # Record the failure for this guid rather than aborting the scan.
            guid_results["error"] = e
        all_results[link.guid] = guid_results
    # BUG FIX: `print all_results` was Python 2 statement syntax.
    print(all_results)
def test_get_files_with_get_item_kwargs(tmpdir):
    """get_files() should forward session/config kwargs to get_item()."""
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)

        def assert_single_meta_file(result):
            # Every variant below should yield exactly nasa_meta.xml.
            result = list(result)
            assert len(result) == 1
            assert result[0].name == 'nasa_meta.xml'

        s = get_session(config={'s3': {'access': 'key'}})
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml', archive_session=s))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config={'logging': {'level': 'INFO'}}))

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config_file='ia_test.ini'))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      http_adapter_kwargs={'max_retries': 3}))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      request_kwargs={'timeout': 4}))
def test_get_files_glob_pattern():
    """Glob patterns select files; '|' separates alternative patterns."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        torrents = list(get_files('nasa', glob_pattern='*torrent'))
        assert len(torrents) == 1
        assert torrents[0].name == 'nasa_archive.torrent'
        names = {f.name for f in
                 get_files('nasa', glob_pattern='*torrent|*jpg')}
        assert names == {'globe_west_540.jpg',
                         'NASAarchiveLogo.jpg',
                         'nasa_archive.torrent'}
def test_get_files_multiple():
    """Every file returned is one of the explicitly requested names."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        requested = ['nasa_meta.xml', 'nasa_files.xml']
        for f in get_files('nasa', files=requested):
            assert f.name in requested
def test_get_files_with_get_item_kwargs(tmpdir):
    """get_files() should forward session/config kwargs to get_item()."""
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        def assert_single_meta_file(result):
            # Every variant below should yield exactly nasa_meta.xml.
            result = list(result)
            assert len(result) == 1
            assert result[0].name == 'nasa_meta.xml'

        s = get_session(config={'s3': {'access': 'key'}})
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml', archive_session=s))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config={'logging': {'level': 'INFO'}}))

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config_file='ia_test.ini'))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      http_adapter_kwargs={'max_retries': 3}))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      request_kwargs={'timeout': 4}))
def test_get_files_source():
    """Filter files by their 'source' field (string or list)."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        originals = {f.name for f in get_files('nasa', source='original')}
        assert originals == {'NASAarchiveLogo.jpg', 'globe_west_540.jpg'}
        combined = {f.name for f in
                    get_files('nasa', source=['original', 'metadata'])}
        assert combined == {'NASAarchiveLogo.jpg',
                            'globe_west_540.jpg',
                            'nasa_meta.xml',
                            'nasa_files.xml',
                            'nasa_reviews.xml',
                            'nasa_archive.torrent'}
def test_get_files_with_get_item_kwargs(tmpdir):
    """get_files() should forward session/config kwargs to get_item()."""
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)

        def assert_single_meta_file(result):
            # Every variant below should yield exactly nasa_meta.xml.
            result = list(result)
            assert len(result) == 1
            assert result[0].name == 'nasa_meta.xml'

        s = get_session(config={'s3': {'access': 'key'}})
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml', archive_session=s))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config={'logging': {'level': 'INFO'}}))

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config_file='ia_test.ini'))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      http_adapter_kwargs={'max_retries': 3}))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      request_kwargs={'timeout': 4}))
def test_get_files():
    """With no filters, all of the item's files are returned."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        names = {f.name for f in get_files('nasa')}
        assert names == {'NASAarchiveLogo.jpg',
                         'globe_west_540.jpg',
                         'nasa_reviews.xml',
                         'nasa_meta.xml',
                         'nasa_archive.torrent',
                         'nasa_files.xml'}
def test_get_files_with_get_item_kwargs(tmpdir):
    """get_files() should forward session/config kwargs to get_item()."""
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        def assert_single_meta_file(result):
            # Every variant below should yield exactly nasa_meta.xml.
            result = list(result)
            assert len(result) == 1
            assert result[0].name == 'nasa_meta.xml'

        s = get_session(config={'s3': {'access': 'key'}})
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml', archive_session=s))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config={'logging': {'level': 'INFO'}}))

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      config_file='ia_test.ini'))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      http_adapter_kwargs={'max_retries': 3}))
        assert_single_meta_file(
            get_files('nasa', files='nasa_meta.xml',
                      request_kwargs={'timeout': 4}))
def get_from_ia(reporter, volume):
    """
    Download cases from internet archive via case law and write them to
    disk.

    :param reporter: (str) Reporter abbreviation used by IA to identify
        cases to download. (Ex. T.C. => tc)
    :param volume: (int or None) Specific volume number of the reporter.
        If None the function cycles through all volumes of the reporter
        on IA.
    :return: None
    """
    reporter_key = ".".join(['law.free.cap', reporter])
    for ia_identifier in search_items(reporter_key):
        # Checks that the returned reporter is the requested one.
        # Ex. searching for Mich will return both Mich-app. and Mich.
        ia_key = ia_identifier['identifier']
        if ia_key.split(".")[3] != reporter:
            continue
        # Skip all other volumes if a specific volume was requested.
        # BUG FIX: `volume` is an int while `ia_volume` is a string, so
        # the old `volume != ia_volume` comparison was always True and
        # every volume was skipped; compare string representations.
        ia_volume = ia_key.split(".")[-1]
        if volume is not None and str(volume) != ia_volume:
            continue
        for item in get_files(ia_key):
            # "json.json" is a manifest-style file we never want; only
            # capture the per-case json files.
            if "json.json" in item.name:
                continue
            if "json" not in item.name:
                continue
            url = "https://archive.org/download/%s/%s" % (ia_key, item.name)
            file_path = os.path.join(settings.MEDIA_ROOT,
                                     'harvard_corpus',
                                     '%s' % ia_key,
                                     '%s' % item.name,
                                     )
            directory = file_path.rsplit("/", 1)[0]
            if os.path.exists(file_path):
                logger.info("Already captured: %s", url)
                continue
            logger.info("Capturing: %s", url)
            mkdir_p(directory)
            data = requests.get(url, timeout=10).json()
            with open(file_path, 'w') as outfile:
                json.dump(data, outfile, indent=2)
def ia_get_files(identifier, **kwargs):
    """
    Retrieve information about the files associated with the given IA item.

    :param str identifier: IA title identifier.
    :param kwargs: Passed to internetarchive.get_files().
    :rtype: list[dict]
    """
    # Default to including on-the-fly (derived) files unless the caller
    # explicitly opted out.
    kwargs.setdefault('on_the_fly', True)
    return list(internetarchive.get_files(identifier, **kwargs))
def test_get_files():
    """With no filters, all of the item's files are returned."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        names = {f.name for f in get_files("nasa")}
        assert names == {"NASAarchiveLogo.jpg",
                         "globe_west_540.jpg",
                         "nasa_reviews.xml",
                         "nasa_meta.xml",
                         "nasa_archive.torrent",
                         "nasa_files.xml"}
def test_get_files():
    """With no filters, all of the item's files are returned."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        names = {f.name for f in get_files('nasa')}
        assert names == {'NASAarchiveLogo.jpg',
                         'globe_west_540.jpg',
                         'nasa_reviews.xml',
                         'nasa_meta.xml',
                         'nasa_archive.torrent',
                         'nasa_files.xml'}
def test_internet_archive():
    """Check that recently-uploaded Perma links exist on the Internet Archive
    with the expected WARC file and item metadata.

    Scans links whose upload completed 2-3 days ago and collects, per guid,
    a dict of boolean checks (or an ``error`` entry) into ``all_results``.
    """
    from datetime import timedelta
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)
    links = Link.objects.filter(
        internet_archive_upload_status="completed",
        creation_timestamp__range=(start_date, end_date))
    all_results = dict()
    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY,
                "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    internetarchive.get_session(config=c)
    for link in links:
        # BUG FIX: previously one shared dict was reused for every link, so
        # every all_results entry aliased the same (last) result set.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid
        try:
            fnames = [f.name for f in
                      internetarchive.get_files(identifier, glob_pattern="*gz")]
            guid_results["uploaded_file"] = warc_name in fnames
            # The test collection stores `collection` as a plain string;
            # real collections store it as a list.
            if settings.INTERNET_ARCHIVE_COLLECTION == 'test_collection':
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid, truncatechars(link.submitted_title, 50))
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            # BUG FIX: this string was split across physical lines without a
            # continuation, which was a syntax error.
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url, link.creation_timestamp,)
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization, link.organization.registrar)
        except Exception as e:
            # Record the failure for this guid rather than aborting the scan.
            guid_results["error"] = e
        all_results[link.guid] = guid_results
    # BUG FIX: `print all_results` was Python 2 statement syntax.
    print(all_results)
def get_marc21_files(item):
    """Return the item's files whose names match the MARC_EXT pattern."""
    marc_files = []
    for ia_file in ia.get_files(item):
        if MARC_EXT.match(ia_file.name):
            marc_files.append(ia_file)
    return marc_files
def test_get_files_non_existing(nasa_mocker):
    """Requesting a filename the item does not have yields nothing."""
    missing = get_files('nasa', files='none')
    assert list(missing) == []
def test_get_files_multiple(nasa_mocker):
    """Every file returned is one of the explicitly requested names."""
    requested = ['nasa_meta.xml', 'nasa_files.xml']
    for f in get_files('nasa', files=requested):
        assert f.name in requested
#!/usr/bin/env python2.7
"""List every file of every item in an IA collection.

Usage: script.py COLLECTION [GLOB ...]

Writes "<item>\t<filename>" lines to ./<COLLECTION>_files.txt; exits
immediately if that listing already exists.
"""
import sys
import os.path
from internetarchive import search_items
from internetarchive import get_files

collection = sys.argv[1]
list_filename = './' + collection + '_files.txt'
if os.path.exists(list_filename):
    sys.exit(0)
# Context manager ensures the listing is flushed and closed even on error
# (the original left the file handle open).
with open(list_filename, 'w+') as list_file:
    search = search_items('collection:' + collection)
    for result in search:
        item_id = result['identifier']
        files = []
        # BUG FIX: glob patterns start at argv[2]; the old code checked
        # len(sys.argv) < 2 and iterated argv[1:], which treated the
        # collection name itself as a glob pattern.
        if len(sys.argv) < 3:
            files = get_files(item_id, glob_pattern='*')
        else:
            for glob in sys.argv[2:]:
                files += get_files(item_id, glob_pattern=glob)
        for ia_file in files:
            list_file.write(item_id + "\t" + ia_file.name + "\n")
def get_marc21_files(item):
    """Return the names of the item's files that end with '.mrc'."""
    names = []
    for ia_file in ia.get_files(item):
        if ia_file.name.endswith('.mrc'):
            names.append(ia_file.name)
    return names
"""Interactive Internet Archive downloader: list an item's files by
extension and optionally download one of them."""
from internetarchive import get_files, get_item

print('Bem vindo ao IA downloader!!')
repositorio = input('Qual o nome do repositorio que deseja baixar?')
extensão = input('Qual a extensão do arquivo que deseja baixar?')
print(
    f"Listando arquivos no repositorio {repositorio} no formato *{extensão} abaixo: "
)
fnames = [f.name for f in get_files(repositorio, glob_pattern=f"*{extensão}")]
print(fnames)
index = input('Qual arquivo voce quer? \n')
# BUG FIX: `arquivo` was undefined when no filename matched; keep the last
# match (original behavior) but start from None so we can detect "no match".
arquivo = None
for i in fnames:
    if index in i:
        arquivo = i
if arquivo is None:
    print('exit')
else:
    print('Arquivo selecionado: ' + arquivo + '\n')
    bole = input('Deseja baixar o arquivo?\n 0-Não\n 1-Sim\n')
    # BUG FIX: bool('0') is True, so any non-empty answer (including "0")
    # used to trigger the download; compare the answer explicitly.
    if bole == '1':
        print('Baixando arquivo: ' + arquivo)
        item = get_item(repositorio)
        item.download(arquivo)
        print('Download concluido!')
    else:
        print('exit')
## Example
# Tiesto's club life downloader
from internetarchive import get_files, get_item

fnames = [f.name for f in get_files('TiestosClubLife', glob_pattern='*ogg')]
index = input('Qual episodio voce quer? \n')
# BUG FIX: `episodio` was undefined when no filename matched; keep the last
# match (original behavior) but start from None so we can detect "no match".
episodio = None
for i in fnames:
    if index in i:
        episodio = i
if episodio is not None:
    print('Episodio selecionado: ' + episodio + '\n')
    bole = input('Deseja baixar o arquivo?\n 0-Não\n 1-Sim\n')
    # BUG FIX: bool('0') is True, so answering "0" used to download anyway;
    # compare the answer explicitly.
    if bole == '1':
        print('Baixando episodio: ' + episodio)
        item = get_item('TiestosClubLife')
        item.download(episodio)
"""tag_ia_texts.py - Downloads raw text of an Internet Archive book and TEI tags it """ import io import re import json from internetarchive import get_files from src.ner.flair_ner import NamedEntityRecognizer from src.tei.assemble_document import create_document ia_idents = ["reminiscencesoft00tangrich"] with open('settings.json', 'r') as f: settings = json.load(f) ner = NamedEntityRecognizer( settings['wolfram_kernel_path'], settings['content_types_precedence_order'], settings['minimum_confidence'], ) for ident in ia_idents: files = get_files(ident, glob_pattern="*djvu.txt", formats="txt") txt_file = next(files) txt_file.download(file_path=f"./txt_files/{ident}.txt") with open(f"./tei_files/{ident}.tei", "w") as output_file: with open(f"./txt_files/{ident}.txt", "r") as book: content = book.read() output_file.write(create_document(ner, content)) ner.close()
ol = OpenLibrary(base_url=local_dev, credentials=c) limit = 50000 # if non-zero, a limit to only process this many records from each file count = 0 completed_mrc = [ 'lbrn.mrc', 'multi1.mrc', 'tier1.mrc', # DONE 'tier2.mrc', # DONE 'tier3.mrc', # DONE 'tier4.mrc', # DONE 'multi2.mrc', # NOT DONE, skipping 2nd multi for now in case there are issues ] for f in ia.get_files(item): if f.name.endswith('.mrc'): print('FILENAME: %s' % f.name) if f.name in completed_mrc: continue offset = 0 if f.name == 'tier5.mrc': offset = 19885954 length = 5 # we only need to get the length of the first record (first 5 bytes), the API will seek to the end. while length: count += 1 if limit and count >= limit: # Stop if a limit has been set, and we are over it. break identifier = '{}/{}:{}:{}'.format(item, f.name, offset, length)