Пример #1
0
def test_get_files_with_get_item_kwargs(tmpdir):
    """Check that get_files accepts each get_item keyword argument.

    Every call asks for ``nasa_meta.xml`` from the mocked ``nasa`` item and
    must return exactly that one file, whichever extra keyword is supplied.
    """
    tmpdir.chdir()

    def expect_only_meta_xml(**get_item_kwargs):
        # Shared check: one result, and it is the requested file.
        found = list(get_files("nasa", files="nasa_meta.xml", **get_item_kwargs))
        assert len(found) == 1
        assert found[0].name == "nasa_meta.xml"

    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)

        session = get_session(config={"s3": {"access": "key"}})
        expect_only_meta_xml(archive_session=session)

        expect_only_meta_xml(config={"logging": {"level": "INFO"}})

        # The config file is written into the current (temporary) directory.
        with open("ia_test.ini", "w") as fh:
            fh.write("""[s3]\naccess = key2""")
        expect_only_meta_xml(config_file="ia_test.ini")

        expect_only_meta_xml(http_adapter_kwargs={"max_retries": 3})

        expect_only_meta_xml(request_kwargs={"timeout": 4})
Пример #2
0
def test_get_files_formats():
    """Filter the mocked ``nasa`` item by one format and by a list of formats."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)

        # Single format given as a string.
        jpegs = list(get_files("nasa", formats="JPEG"))
        assert len(jpegs) == 1
        assert jpegs[0].name == "globe_west_540.jpg"

        # Several formats given as a list.
        matched = get_files("nasa", formats=["JPEG", "Collection Header"])
        wanted = {"globe_west_540.jpg", "NASAarchiveLogo.jpg"}
        assert {f.name for f in matched} == wanted
Пример #3
0
def test_get_files_glob_pattern():
    """Filter the mocked ``nasa`` item by a single glob and by a ``|``-joined glob."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)

        torrents = list(get_files("nasa", glob_pattern="*torrent"))
        assert len(torrents) == 1
        assert torrents[0].name == "nasa_archive.torrent"

        matched = get_files("nasa", glob_pattern="*torrent|*jpg")
        wanted = {"globe_west_540.jpg", "NASAarchiveLogo.jpg", "nasa_archive.torrent"}
        assert {f.name for f in matched} == wanted
Пример #4
0
def test_get_files_formats():
    """Filter the mocked 'nasa' item by one format and by a list of formats."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        # Single format given as a string.
        jpegs = list(get_files('nasa', formats='JPEG'))
        assert len(jpegs) == 1
        assert jpegs[0].name == 'globe_west_540.jpg'

        # Several formats given as a list.
        matched = get_files('nasa', formats=['JPEG', 'Collection Header'])
        wanted = {'globe_west_540.jpg', 'NASAarchiveLogo.jpg'}
        assert {f.name for f in matched} == wanted
Пример #5
0
def test_get_files_formats():
    """Exercise the ``formats`` filter with a string and with a list."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        by_single_format = [f for f in get_files('nasa', formats='JPEG')]
        assert len(by_single_format) == 1
        only_hit = by_single_format[0]
        assert only_hit.name == 'globe_west_540.jpg'

        by_format_list = get_files('nasa', formats=['JPEG', 'Collection Header'])
        names_expected = {
            'globe_west_540.jpg',
            'NASAarchiveLogo.jpg',
        }
        names_seen = set(f.name for f in by_format_list)
        assert names_seen == names_expected
Пример #6
0
def test_get_files_glob_pattern():
    """Filter the mocked 'nasa' item by a single glob and by a '|'-joined glob."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        torrents = list(get_files('nasa', glob_pattern='*torrent'))
        assert len(torrents) == 1
        assert torrents[0].name == 'nasa_archive.torrent'

        matched = get_files('nasa', glob_pattern='*torrent|*jpg')
        wanted = {
            'globe_west_540.jpg',
            'NASAarchiveLogo.jpg',
            'nasa_archive.torrent',
        }
        assert {f.name for f in matched} == wanted
Пример #7
0
def test_get_files_glob_pattern():
    """Exercise the ``glob_pattern`` filter with one pattern and with two."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        single_pattern_hits = [f for f in get_files('nasa', glob_pattern='*torrent')]
        assert len(single_pattern_hits) == 1
        only_hit = single_pattern_hits[0]
        assert only_hit.name == 'nasa_archive.torrent'

        two_pattern_hits = get_files('nasa', glob_pattern='*torrent|*jpg')
        names_expected = {
            'globe_west_540.jpg',
            'NASAarchiveLogo.jpg',
            'nasa_archive.torrent',
        }
        names_seen = set(f.name for f in two_pattern_hits)
        assert names_seen == names_expected
Пример #8
0
def test_get_files_multiple():
    """Requesting several file names returns exactly the requested files."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        _files = ["nasa_meta.xml", "nasa_files.xml"]
        returned_names = [f.name for f in get_files("nasa", files=_files)]
        # A membership-only check passes vacuously on an empty result, so
        # require that every requested name actually came back.
        assert set(returned_names) == set(_files)
Пример #9
0
def test_get_files_non_existing():
    """Asking for a file name the item does not contain yields nothing."""
    metadata_url = '{0}//archive.org/metadata/nasa'.format(protocol)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, metadata_url, body=ITEM_METADATA, status=200)
        matches = list(get_files('nasa', files='none'))
        assert matches == []
Пример #10
0
def test_get_files_formats():
    """Filter the mocked 'nasa' item by one format and by a list of formats."""
    metadata_url = '{0}//archive.org/metadata/nasa'.format(protocol)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, metadata_url, body=ITEM_METADATA, status=200)

        # Single format given as a string.
        jpegs = list(get_files('nasa', formats='JPEG'))
        assert len(jpegs) == 1
        assert jpegs[0].name == 'globe_west_540.jpg'

        # Several formats given as a list.
        matched = get_files('nasa', formats=['JPEG', 'Collection Header'])
        wanted = {'globe_west_540.jpg', 'NASAarchiveLogo.jpg'}
        assert {f.name for f in matched} == wanted
Пример #11
0
def test_get_files_non_existing():
    """A file name that is absent from the item produces an empty result."""
    mock = responses.RequestsMock(assert_all_requests_are_fired=False)
    with mock as rsps:
        rsps.add(
            responses.GET,
            '{0}//archive.org/metadata/nasa'.format(protocol),
            body=ITEM_METADATA,
            status=200,
        )
        assert list(get_files('nasa', files='none')) == []
Пример #12
0
def test_get_files_formats():
    """Exercise the ``formats`` filter with a string and with a list."""
    mock = responses.RequestsMock(assert_all_requests_are_fired=False)
    with mock as rsps:
        rsps.add(
            responses.GET,
            '{0}//archive.org/metadata/nasa'.format(protocol),
            body=ITEM_METADATA,
            status=200,
        )

        by_single_format = [f for f in get_files('nasa', formats='JPEG')]
        assert len(by_single_format) == 1
        assert by_single_format[0].name == 'globe_west_540.jpg'

        by_format_list = get_files('nasa', formats=['JPEG', 'Collection Header'])
        names_expected = {'globe_west_540.jpg', 'NASAarchiveLogo.jpg'}
        names_seen = set(f.name for f in by_format_list)
        assert names_seen == names_expected
Пример #13
0
def test_internet_archive():
    """Compare recently uploaded links against their Internet Archive items.

    Selects links whose upload status is "completed" and whose creation
    timestamp lies between three and two days ago, then checks the uploaded
    file name and each metadata field of the matching IA item. The per-link
    outcome (one boolean per check, or an "error" entry) is printed as a dict
    keyed by link GUID.
    """
    from datetime import timedelta
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(
        internet_archive_upload_status="completed", creation_timestamp__range=(start_date, end_date)
    )

    all_results = dict()

    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY, "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    # Keep the configured session and hand it to every call; a session that is
    # created and then discarded has no effect on later requests.
    session = internetarchive.get_session(config=c)

    for link in links:
        # A fresh dict per link: a single shared dict would make every entry
        # of all_results point at the same object and carry over stale keys
        # (such as "error") from earlier links.
        guid_results = dict()

        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier, archive_session=session)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [
                f.name
                for f in internetarchive.get_files(identifier, glob_pattern="*gz", archive_session=session)
            ]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                # Outside the test collection the first element of the
                # "collection" value is compared instead of the whole value.
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid,
                truncatechars(link.submitted_title, 50),
            )
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url,
                link.creation_timestamp,
            )
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization,
                    link.organization.registrar,
                )

        except Exception as e:
            # Best-effort report: record the failure for this link and move on.
            guid_results["error"] = e

        all_results[link.guid] = guid_results

    # Parenthesised form is valid on both Python 2 and Python 3.
    print(all_results)
Пример #14
0
def test_get_files_with_get_item_kwargs(tmpdir):
    """Check that get_files accepts each get_item keyword argument.

    Every call asks for 'nasa_meta.xml' from the mocked 'nasa' item and must
    return exactly that one file, whichever extra keyword is supplied.
    """
    tmpdir.chdir()

    def expect_only_meta_xml(**get_item_kwargs):
        # Shared check: one result, and it is the requested file.
        found = list(get_files('nasa', files='nasa_meta.xml', **get_item_kwargs))
        assert len(found) == 1
        assert found[0].name == 'nasa_meta.xml'

    metadata_url = '{0}//archive.org/metadata/nasa'.format(protocol)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, metadata_url, body=ITEM_METADATA, status=200)

        session = get_session(config={'s3': {'access': 'key'}})
        expect_only_meta_xml(archive_session=session)

        expect_only_meta_xml(config={'logging': {'level': 'INFO'}})

        # The config file is written into the current (temporary) directory.
        with open('ia_test.ini', 'w') as fh:
            fh.write("""[s3]\naccess = key2""")
        expect_only_meta_xml(config_file='ia_test.ini')

        expect_only_meta_xml(http_adapter_kwargs={'max_retries': 3})

        expect_only_meta_xml(request_kwargs={'timeout': 4})
Пример #15
0
def test_get_files_glob_pattern():
    """Filter the mocked 'nasa' item by a single glob and by a '|'-joined glob."""
    metadata_url = '{0}//archive.org/metadata/nasa'.format(protocol)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, metadata_url, body=ITEM_METADATA, status=200)

        torrents = list(get_files('nasa', glob_pattern='*torrent'))
        assert len(torrents) == 1
        assert torrents[0].name == 'nasa_archive.torrent'

        matched = get_files('nasa', glob_pattern='*torrent|*jpg')
        wanted = {
            'globe_west_540.jpg',
            'NASAarchiveLogo.jpg',
            'nasa_archive.torrent',
        }
        assert {f.name for f in matched} == wanted
Пример #16
0
def test_get_files_multiple():
    """Requesting several file names returns exactly the requested files."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        _files = ['nasa_meta.xml', 'nasa_files.xml']
        returned_names = [f.name for f in get_files('nasa', files=_files)]
        # A membership-only check passes vacuously on an empty result, so
        # require that every requested name actually came back.
        assert set(returned_names) == set(_files)
Пример #17
0
def test_get_files_with_get_item_kwargs(tmpdir):
    """Check that get_files accepts each get_item keyword argument.

    Every call asks for 'nasa_meta.xml' from the mocked 'nasa' item and must
    return exactly that one file, whichever extra keyword is supplied.
    """
    tmpdir.chdir()

    def expect_only_meta_xml(**get_item_kwargs):
        # Shared check: one result, and it is the requested file.
        found = list(get_files('nasa', files='nasa_meta.xml', **get_item_kwargs))
        assert len(found) == 1
        assert found[0].name == 'nasa_meta.xml'

    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        session = get_session(config={'s3': {'access': 'key'}})
        expect_only_meta_xml(archive_session=session)

        expect_only_meta_xml(config={'logging': {'level': 'INFO'}})

        # The config file is written into the current (temporary) directory.
        with open('ia_test.ini', 'w') as fh:
            fh.write("""[s3]\naccess = key2""")
        expect_only_meta_xml(config_file='ia_test.ini')

        expect_only_meta_xml(http_adapter_kwargs={'max_retries': 3})

        expect_only_meta_xml(request_kwargs={'timeout': 4})
Пример #18
0
def test_get_files_source():
    """Filter the mocked 'nasa' item by one source and by a list of sources."""
    mock = responses.RequestsMock(assert_all_requests_are_fired=False)
    with mock as rsps:
        rsps.add(
            responses.GET,
            '{0}//archive.org/metadata/nasa'.format(protocol),
            body=ITEM_METADATA,
            status=200,
        )

        # Single source given as a string.
        originals = get_files('nasa', source='original')
        wanted_originals = {'NASAarchiveLogo.jpg', 'globe_west_540.jpg'}
        assert {f.name for f in originals} == wanted_originals

        # Several sources given as a list.
        originals_and_metadata = get_files('nasa', source=['original', 'metadata'])
        wanted_both = {
            'NASAarchiveLogo.jpg',
            'globe_west_540.jpg',
            'nasa_meta.xml',
            'nasa_files.xml',
            'nasa_reviews.xml',
            'nasa_archive.torrent',
        }
        assert {f.name for f in originals_and_metadata} == wanted_both
Пример #19
0
def test_get_files_with_get_item_kwargs(tmpdir):
    """Check that get_files accepts each get_item keyword argument.

    Every call asks for 'nasa_meta.xml' from the mocked 'nasa' item and must
    return exactly that one file, whichever extra keyword is supplied.
    """
    tmpdir.chdir()

    def expect_only_meta_xml(**get_item_kwargs):
        # Shared check: one result, and it is the requested file.
        found = list(get_files('nasa', files='nasa_meta.xml', **get_item_kwargs))
        assert len(found) == 1
        assert found[0].name == 'nasa_meta.xml'

    mock = responses.RequestsMock(assert_all_requests_are_fired=False)
    with mock as rsps:
        rsps.add(
            responses.GET,
            '{0}//archive.org/metadata/nasa'.format(protocol),
            body=ITEM_METADATA,
            status=200,
        )

        session = get_session(config={'s3': {'access': 'key'}})
        expect_only_meta_xml(archive_session=session)

        expect_only_meta_xml(config={'logging': {'level': 'INFO'}})

        # The config file is written into the current (temporary) directory.
        with open('ia_test.ini', 'w') as fh:
            fh.write("""[s3]\naccess = key2""")
        expect_only_meta_xml(config_file='ia_test.ini')

        expect_only_meta_xml(http_adapter_kwargs={'max_retries': 3})

        expect_only_meta_xml(request_kwargs={'timeout': 4})
Пример #20
0
def test_get_files():
    """With no filters, get_files yields the six expected 'nasa' file names."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        returned_names = {f.name for f in get_files('nasa')}
        assert returned_names == {
            'NASAarchiveLogo.jpg',
            'globe_west_540.jpg',
            'nasa_reviews.xml',
            'nasa_meta.xml',
            'nasa_archive.torrent',
            'nasa_files.xml',
        }
Пример #21
0
def test_get_files():
    """An unfiltered get_files call returns the expected set of file names."""
    names_expected = {
        'NASAarchiveLogo.jpg',
        'globe_west_540.jpg',
        'nasa_reviews.xml',
        'nasa_meta.xml',
        'nasa_archive.torrent',
        'nasa_files.xml',
    }
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        names_seen = set(f.name for f in get_files('nasa'))
        assert names_seen == names_expected
Пример #22
0
def test_get_files_with_get_item_kwargs(tmpdir):
    """Run the same single-file lookup once per supported get_item keyword."""
    tmpdir.chdir()

    def lookup_and_verify(**extra):
        # One result is expected, and it must be the requested metadata file.
        hits = [f for f in get_files('nasa', files='nasa_meta.xml', **extra)]
        assert len(hits) == 1
        assert hits[0].name == 'nasa_meta.xml'

    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        s = get_session(config={'s3': {'access': 'key'}})
        lookup_and_verify(archive_session=s)

        lookup_and_verify(config={'logging': {'level': 'INFO'}})

        # Written to the current (temporary) directory before it is referenced.
        ini_text = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as ini_file:
            ini_file.write(ini_text)
        lookup_and_verify(config_file='ia_test.ini')

        lookup_and_verify(http_adapter_kwargs={'max_retries': 3})

        lookup_and_verify(request_kwargs={'timeout': 4})
Пример #23
0
def get_from_ia(reporter, volume):
    """
    Download cases from internet archive via case law and write them to
    disk.

    Items are found by searching for ``law.free.cap.<reporter>``. For each
    matching item, every file whose name contains "json" (but not
    "json.json") is fetched and saved under
    ``MEDIA_ROOT/harvard_corpus/<identifier>/<file name>``. Files that already
    exist on disk are not downloaded again.

    :param reporter: (str) Requires a reporter abbreviation to identify
    cases to download as used by IA.  (Ex. T.C. => tc)
    :param volume: (int) Specific volume number of the reporter.  If None
    function will cycle through all volumes of the reporter on IA.
    :return: None
    :raises requests.HTTPError: if a download answers with an error status.
    """

    reporter_key = ".".join(['law.free.cap', reporter])

    # The volume taken from the identifier is a string, so normalise the
    # requested volume once; comparing an int to a str is never equal and
    # would skip every volume.
    wanted_volume = None if volume is None else str(volume)

    for ia_identifier in search_items(reporter_key):
        ia_key = ia_identifier['identifier']
        key_parts = ia_key.split(".")

        # Checks that the returned reporter is the requested one.
        # Ex. searching for Mich will return both Mich-app. and Mich.
        # Identifiers too short to carry a reporter segment are skipped too.
        if len(key_parts) < 4 or key_parts[3] != reporter:
            continue

        # Checks if we requested a specific volume of the
        # reporter and if so skips all other volumes of that reporter
        if wanted_volume is not None and wanted_volume != key_parts[-1]:
            continue

        for item in get_files(ia_key):
            if "json.json" in item.name:
                continue
            if "json" not in item.name:
                continue

            url = "https://archive.org/download/%s/%s" % (ia_key, item.name)
            file_path = os.path.join(
                settings.MEDIA_ROOT,
                'harvard_corpus',
                '%s' % ia_key,
                '%s' % item.name,
            )
            if os.path.exists(file_path):
                logger.info("Already captured: %s", url)
                continue

            logger.info("Capturing: %s", url)
            # dirname uses the platform's path separator, unlike splitting on "/".
            mkdir_p(os.path.dirname(file_path))
            response = requests.get(url, timeout=10)
            # Fail before writing: a saved error payload would be treated as
            # "already captured" on every later run.
            response.raise_for_status()
            data = response.json()
            with open(file_path, 'w') as outfile:
                json.dump(data, outfile, indent=2)
Пример #24
0
def ia_get_files(identifier, **kwargs):
    """
    List the files that internetarchive.get_files() reports for an IA item.

    :param str identifier:  IA title identifier.
    :param kwargs:          Forwarded to internetarchive.get_files();
                            ``on_the_fly`` is set to True unless the caller
                            passes it explicitly.

    :rtype: list

    """
    kwargs.setdefault('on_the_fly', True)
    return list(internetarchive.get_files(identifier, **kwargs))
Пример #25
0
def test_get_files():
    """With no filters, get_files yields the six expected ``nasa`` file names."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        returned_names = {f.name for f in get_files("nasa")}
        assert returned_names == {
            "NASAarchiveLogo.jpg",
            "globe_west_540.jpg",
            "nasa_reviews.xml",
            "nasa_meta.xml",
            "nasa_archive.torrent",
            "nasa_files.xml",
        }
Пример #26
0
def test_get_files():
    """An unfiltered get_files call returns the expected set of file names."""
    names_expected = {
        'NASAarchiveLogo.jpg',
        'globe_west_540.jpg',
        'nasa_reviews.xml',
        'nasa_meta.xml',
        'nasa_archive.torrent',
        'nasa_files.xml',
    }
    metadata_url = '{0}//archive.org/metadata/nasa'.format(protocol)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, metadata_url, body=ITEM_METADATA, status=200)
        names_seen = set(f.name for f in get_files('nasa'))
        assert names_seen == names_expected
Пример #27
0
def test_internet_archive():
    """Compare recently uploaded links against their Internet Archive items.

    Selects links whose upload status is "completed" and whose creation
    timestamp lies between three and two days ago, then checks the uploaded
    file name and each metadata field of the matching IA item. The per-link
    outcome (one boolean per check, or an "error" entry) is printed as a dict
    keyed by link GUID.
    """
    from datetime import timedelta
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date   = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(internet_archive_upload_status="completed", creation_timestamp__range=(start_date, end_date))

    all_results = dict()

    c = {"s3":{"access":settings.INTERNET_ARCHIVE_ACCESS_KEY, "secret":settings.INTERNET_ARCHIVE_SECRET_KEY}}
    # Keep the configured session and hand it to every call; a session that is
    # created and then discarded has no effect on later requests.
    session = internetarchive.get_session(config=c)

    for link in links:
        # A fresh dict per link: a single shared dict would make every entry
        # of all_results point at the same object and carry over stale keys
        # (such as "error") from earlier links.
        guid_results = dict()

        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier, archive_session=session)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [f.name for f in internetarchive.get_files(identifier, glob_pattern="*gz", archive_session=session)]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == 'test_collection':
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                # Outside the test collection the first element of the
                # "collection" value is compared instead of the whole value.
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (link.guid, truncatechars(link.submitted_title, 50))
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (link.submitted_url, link.creation_timestamp,)
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (link.organization, link.organization.registrar)

        except Exception as e:
            # Best-effort report: record the failure for this link and move on.
            guid_results["error"] = e

        all_results[link.guid] = guid_results

    # Parenthesised form is valid on both Python 2 and Python 3.
    print(all_results)
Пример #28
0
def get_marc21_files(item):
    """Return the files of *item* whose name is matched by ``MARC_EXT``."""
    matching = []
    for candidate in ia.get_files(item):
        if MARC_EXT.match(candidate.name):
            matching.append(candidate)
    return matching
Пример #29
0
def test_get_files_non_existing(nasa_mocker):
    """Asking for a file name the item does not contain yields nothing."""
    matches = list(get_files('nasa', files='none'))
    assert matches == []
Пример #30
0
def test_get_files_multiple(nasa_mocker):
    """Requesting several file names returns exactly the requested files."""
    _files = ['nasa_meta.xml', 'nasa_files.xml']
    returned_names = [f.name for f in get_files('nasa', files=_files)]
    # A membership-only check passes vacuously on an empty result, so require
    # that every requested name actually came back.
    assert set(returned_names) == set(_files)
Пример #31
0
#!/usr/bin/env python2.7

import sys
from internetarchive import search_items
import os.path
from internetarchive import get_files

# Usage: <script> COLLECTION [GLOB ...]
# Writes one "<item id>\t<file name>" line per matching file to
# ./<COLLECTION>_files.txt.
collection = sys.argv[1]

# Glob patterns follow the collection name; with none given, list every file.
# (Starting the slice at argv[1] would treat the collection name as a glob.)
patterns = sys.argv[2:] or ['*']

list_filename = './' + collection + '_files.txt'
if os.path.exists(list_filename):
    # A listing for this collection already exists; leave it untouched.
    sys.exit(0)

# The with-block guarantees the listing is flushed and closed on exit.
with open(list_filename, 'w+') as list_file:
    for result in search_items('collection:' + collection):
        item_id = result['identifier']
        for pattern in patterns:
            for ia_file in get_files(item_id, glob_pattern=pattern):
                list_file.write(item_id + "\t" + ia_file.name + "\n")
Пример #32
0
def test_get_files_non_existing(nasa_mocker):
    """A file name that is absent from the item produces an empty result."""
    assert list(get_files('nasa', files='none')) == []
Пример #33
0
def get_marc21_files(item):
    """Return the names of the files of *item* that end with '.mrc'."""
    mrc_names = []
    for candidate in ia.get_files(item):
        file_name = candidate.name
        if file_name.endswith('.mrc'):
            mrc_names.append(file_name)
    return mrc_names
Пример #34
0
def test_get_files_multiple(nasa_mocker):
    """Requesting several file names returns exactly the requested files."""
    _files = ['nasa_meta.xml', 'nasa_files.xml']
    returned_names = [f.name for f in get_files('nasa', files=_files)]
    # A membership-only check passes vacuously on an empty result, so require
    # that every requested name actually came back.
    assert set(returned_names) == set(_files)
Пример #35
0
from internetarchive import get_files, get_item

# Interactive downloader: ask for an item and a file extension, list the
# matching files, let the user pick one by substring, then download it.
print('Bem vindo ao IA downloader!!')

repositorio = input('Qual o nome do repositorio que deseja baixar?')
extensão = input('Qual a extensão do arquivo que deseja baixar?')

print(
    f"Listando arquivos no repositorio {repositorio} no formato *{extensão} abaixo: "
)

fnames = [f.name for f in get_files(repositorio, glob_pattern=f"*{extensão}")]
print(fnames)

index = input('Qual arquivo voce quer? \n')

# The last file whose name contains the typed text is selected.
matches = [name for name in fnames if index in name]
if not matches:
    # Without this guard the selection below would be an unbound name.
    raise SystemExit('Nenhum arquivo encontrado para: ' + index)
arquivo = matches[-1]

print('Arquivo selecionado: ' + arquivo + '\n')
bole = input('Deseja baixar o arquivo?\n 0-Não\n 1-Sim\n')
# input() returns a string and any non-empty string (including "0") is
# truthy, so compare against the "yes" choice explicitly.
if bole.strip() == '1':
    print('Baixando arquivo: ' + arquivo)
    item = get_item(repositorio)
    item.download(arquivo)
    print('Download concluido!')
else:
    print('exit')
Пример #36
0
## Example
#Tiesto's club life downloader

from internetarchive import get_files, get_item
# List the .ogg files of the item, let the user pick one by substring, then
# download it on confirmation.
fnames = [f.name for f in get_files('TiestosClubLife', glob_pattern='*ogg')]

index = input('Qual episodio voce quer? \n')

# The last file whose name contains the typed text is selected.
matches = [name for name in fnames if index in name]
if not matches:
    # Without this guard the selection below would be an unbound name.
    raise SystemExit('Nenhum episodio encontrado para: ' + index)
episodio = matches[-1]

print('Episodio selecionado: ' + episodio + '\n')
bole = input('Deseja baixar o arquivo?\n 0-Não\n 1-Sim\n')
# input() returns a string and any non-empty string (including "0") is
# truthy, so compare against the "yes" choice explicitly.
if bole.strip() == '1':
    print('Baixando episodio: ' + episodio)
    item = get_item('TiestosClubLife')
    item.download(episodio)
#!/usr/bin/env python2.7

import sys
from internetarchive import search_items
import os.path
from internetarchive import get_files

# Usage: <script> COLLECTION [GLOB ...]
# Writes one "<item id>\t<file name>" line per matching file to
# ./<COLLECTION>_files.txt.
collection = sys.argv[1]

# Glob patterns follow the collection name; with none given, list every file.
# (Starting the slice at argv[1] would treat the collection name as a glob.)
patterns = sys.argv[2:] or ['*']

list_filename = './' + collection + '_files.txt'
if os.path.exists(list_filename):
    # A listing for this collection already exists; leave it untouched.
    sys.exit(0)

# The with-block guarantees the listing is flushed and closed on exit.
with open(list_filename, 'w+') as list_file:
    for result in search_items('collection:' + collection):
        item_id = result['identifier']
        for pattern in patterns:
            for ia_file in get_files(item_id, glob_pattern=pattern):
                list_file.write(item_id + "\t" + ia_file.name + "\n")
Пример #38
0
"""tag_ia_texts.py - Downloads raw text of an Internet Archive book and TEI tags it
"""
import io
import re
import json
from internetarchive import get_files
from src.ner.flair_ner import NamedEntityRecognizer
from src.tei.assemble_document import create_document

# Internet Archive identifiers of the books to download and tag.
ia_idents = ["reminiscencesoft00tangrich"]

with open('settings.json', 'r') as f:
    settings = json.load(f)

ner = NamedEntityRecognizer(
    settings['wolfram_kernel_path'],
    settings['content_types_precedence_order'],
    settings['minimum_confidence'],
)
try:
    for ident in ia_idents:
        files = get_files(ident, glob_pattern="*djvu.txt", formats="txt")
        # Only the first matching text file is used.
        txt_file = next(files)
        txt_path = f"./txt_files/{ident}.txt"
        txt_file.download(file_path=txt_path)
        with open(txt_path, "r") as book:
            content = book.read()
        # Build the document before opening the output file so a tagging
        # failure does not leave an empty .tei file behind.
        document = create_document(ner, content)
        with open(f"./tei_files/{ident}.tei", "w") as output_file:
            output_file.write(document)
finally:
    # Close the recognizer even when a download or tagging step raises.
    ner.close()
Пример #39
0
    ol = OpenLibrary(base_url=local_dev, credentials=c)

# Record cap checked inside the processing loop; a value of 0 disables it.
limit = 50000  # if non-zero, a limit to only process this many records from each file
# Incremented once per loop iteration and compared against `limit`.
count = 0

# File names that the loop over the item's files skips.
completed_mrc = [
    'lbrn.mrc',
    'multi1.mrc',
    'tier1.mrc',  # DONE
    'tier2.mrc',  # DONE
    'tier3.mrc',  # DONE
    'tier4.mrc',  # DONE
    'multi2.mrc',  # NOT DONE, skipping 2nd multi for now in case there are issues
]

for f in ia.get_files(item):
    if f.name.endswith('.mrc'):
        print('FILENAME: %s' % f.name)
        if f.name in completed_mrc:
            continue
        offset = 0
        if f.name == 'tier5.mrc':
            offset = 19885954
        length = 5  # we only need to get the length of the first record (first 5 bytes), the API will seek to the end.

        while length:
            count += 1
            if limit and count >= limit:
                # Stop if a limit has been set, and we are over it.
                break
            identifier = '{}/{}:{}:{}'.format(item, f.name, offset, length)