def test_internet_archive():
    from datetime import timedelta

    from django.conf import settings
    from django.template.defaultfilters import truncatechars
    from django.utils import timezone
    import internetarchive

    from perma.models import Link

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)
    links = Link.objects.filter(
        internet_archive_upload_status="completed",
        creation_timestamp__range=(start_date, end_date),
    )
    all_results = dict()
    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY,
                "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    # Session configured with the IA S3 keys; passed to the item and file lookups below.
    session = internetarchive.get_session(config=c)

    for link in links:
        # Fresh per-link result dict.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier, archive_session=session)
        warc_name = "%s.warc.gz" % link.guid
        try:
            fnames = [f.name for f in internetarchive.get_files(
                identifier, glob_pattern="*gz", archive_session=session)]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid, truncatechars(link.submitted_title, 50))
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url, link.creation_timestamp)
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization, link.organization.registrar)
        except Exception as e:
            guid_results["error"] = e
        all_results[link.guid] = guid_results

    print(all_results)
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open("ia_test.ini", "w") as fh:
        fh.write(test_conf)
    s = get_session(config_file="ia_test.ini")
    assert s.access_key == "key2"
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)

        s = get_session(config={"s3": {"access": "key"}})
        files = get_files("nasa", files="nasa_meta.xml", archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml",
                          config={"logging": {"level": "INFO"}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        test_conf = """[s3]\naccess = key2"""
        with open("ia_test.ini", "w") as fh:
            fh.write(test_conf)
        files = get_files("nasa", files="nasa_meta.xml", config_file="ia_test.ini")
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml",
                          http_adapter_kwargs={"max_retries": 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml",
                          request_kwargs={"timeout": 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"
def get_ia_session():
    # `config` here is the application's own config object (keyed by
    # slash-separated paths), not an internetarchive config dict.
    try:
        assert config.has_key('s3/access_key')
        assert config.has_key('s3/secret_key')
        assert config.has_key('cookie')
        assert config.has_key('email')
    except Exception:
        return None

    ia_session = get_session(
        config={
            'general': {'secure': True},
            's3': {
                'access': config.get('s3/access_key'),
                'secret': config.get('s3/secret_key'),
            },
            'cookies': {
                'logged-in-user': config.get('email'),
                'logged-in-sig': config.get('cookie'),
            },
        },
        http_adapter_kwargs={'max_retries': 10},
    )
    return ia_session
def getStats():
    today = date.today()
    back30_days = (datetime.now() - timedelta(days=30)).date()
    back7_days = (datetime.now() - timedelta(days=7)).date()
    collection = 'greekgovernmentgazette'
    query_all = f'collection:({collection})'

    res = {}
    try:
        s = get_session()
        s.mount_http_adapter()
        search_results = s.search_items(query_all, fields=['identifier', 'addeddate'])
        lst_res = list(search_results)
        docs_last30days = [i for i in lst_res
                           if isodate.parse_date(i['addeddate']) >= back30_days]
        docs_last7days = [i for i in lst_res
                          if isodate.parse_date(i['addeddate']) >= back7_days]
        docs_today = [i for i in lst_res
                      if isodate.parse_date(i['addeddate']) == today]
        res['count_all'] = len(lst_res)
        res['count_last30days'] = len(docs_last30days)
        res['count_last7days'] = len(docs_last7days)
        res['count_today'] = len(docs_today)
    finally:
        # Returning from finally swallows any exception raised above;
        # callers get whatever counts were computed before the failure.
        return res
def __init__(self, archive_id, metadata=None, config_file_path=None, repo_base=None):
    """
    :param archive_id:
    :param config_file_path:
    :param repo_base: In the archive item, place each file in a folder mirroring its local location.
    """
    self.repo_base = repo_base
    self.archive_id = archive_id
    self.archive_session = internetarchive.get_session(config_file=config_file_path)
    self.archive_item = internetarchive.get_item(archive_id, config_file=config_file_path)
    self.metadata = metadata
    logging.info(self.archive_item.identifier)
    # Keep only the uploader-supplied files, skipping item-level and system files.
    self.original_item_files = [
        f for f in self.archive_item.files
        if f["source"] == "original"
        and not f["name"].startswith(self.archive_item.identifier)
        and not f["name"].startswith("_")
    ]
    self.original_item_file_names = sorted(f["name"] for f in self.original_item_files)
def create_subcollection(collection_id, metadata=None, parent_collection=None):
    """
    The expected sub-collection hierarchy is as follows:
    top-level OSF collection -> provider collection ->
    collection for nodes with multiple children -> all only-child nodes

    :param metadata: dict of attributes for the provider sub-collection being created
    :param parent_collection: str the name of the sub-collection's parent
    :return:
    """
    if metadata is None:
        metadata = {}

    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )
    collection = internetarchive.Item(session, collection_id)
    collection.upload(
        files={"dummy.txt": BytesIO(b"dummy")},
        metadata={
            "mediatype": "collection",
            "collection": parent_collection,
            **metadata,
        },
    )
def get_ia_item(guid):
    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )
    return session.get_item(guid)
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open('ia_test.ini', 'w') as fh:
        fh.write(test_conf)
    s = get_session(config_file='ia_test.ini')
    assert s.access_key == 'key2'
def stream_from_pbox(itemname, filename):
    # TODO: handle errors etc
    archive_session = get_session(config_file=settings.IATOOL_CONFIG_PATH)
    item = archive_session.get_item(itemname)
    files = item.get_files(filename)
    file = next(files)
    return file.download(return_responses=True)
def new_session():
    global SESSION
    if SESSION is not None:
        raise Exception('have another session!')
    SESSION = get_session(config=CONFIG)
def createSession(self):
    iaKey = decryptEnvVar('IA_ACCESS_KEY')
    iaSecret = decryptEnvVar('IA_SECRET_KEY')
    return get_session(config={'s3': {
        'access': iaKey,
        'secret': iaSecret
    }})
def start_ia_session(self):
    """ starts an internet archive session """
    config = dict(s3=dict(access=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                          secret=settings.INTERNET_ARCHIVE_SECRET_KEY))
    s = get_session(config=config, debug=True)
    s.access_key = settings.INTERNET_ARCHIVE_ACCESS_KEY
    s.secret_key = settings.INTERNET_ARCHIVE_SECRET_KEY
    return s
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key3'}})
        item = get_item('nasa', archive_session=s)
        assert item.session.access_key == 'key3'
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={"general": {"secure": True}})
        rsps.add_metadata_mock("nasa")
        item = s.get_item("nasa")
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE)
        r = item.upload(NASA_METADATA_PATH)
        assert r[0].url == "https://s3.us.archive.org/nasa/nasa.json"
def upload_to_ia(force=set()):
    s = get_session()
    item = s.get_item("NotoFonts")
    hashdict = {f["name"]: f["md5"] for f in item.files}
    fonts_modified = False
    for path in tqdm(sorted(pathset)):
        filename = path.name
        file = open(path, "rb").read()
        hash = md5(file).hexdigest()
        if "fonts" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
        fonts_modified = True
        print("WORKING: " + filename)
        upload_paths = []
        ttf = TTFont(path)
        print("  CONVERTING TO woff2...")
        ttf.flavor = "woff2"
        woff2_path = "upload/" + path.with_suffix(".woff2").name
        try:
            ttf.save(open(woff2_path, "wb"))
            upload_paths.append(woff2_path)
        except TTLibError:
            print("could not convert to woff2")
        print("  CONVERTING TO woff...")
        ttf.flavor = "woff"
        woff_path = "upload/" + path.with_suffix(".woff").name
        ttf.save(open(woff_path, "wb"))
        upload_paths.append(woff_path)
        print("  UPLOADING...")
        r = item.upload(files=[*upload_paths, str(path)], retries=100)
        for upath in [woff2_path, woff_path]:
            remove(upath)
    if "css" in force or fonts_modified:
        from generate_css import build_all_css
        print("  GENERATING CSS...")
        build_all_css()
        css_files = glob("*.css")
        for path in [Path(p) for p in sorted(css_files)]:
            filename = path.name
            file = open(path, "rb").read()
            hash = md5(file).hexdigest()
            # if "css" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
            print("  UPLOADING " + filename)
            r = item.upload(files=css_files, retries=100)
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE)
        r = item.upload(NASA_METADATA_PATH)
        assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
def test_get_session_with_config():
    s = get_session(config={
        's3': {'access': 'key'},
        'general': {'secure': False}
    })
    assert s.access_key == 'key'
def __init__(
    self,
    issue_db: IssueDB,
    sandcrawler_db_client: SandcrawlerPostgrestClient,
    sandcrawler_s3_client: SandcrawlerMinioClient,
):
    self.issue_db: IssueDB = issue_db
    self.ia_client = internetarchive.get_session()
    self.sandcrawler_db_client = sandcrawler_db_client
    self.sandcrawler_s3_client = sandcrawler_s3_client
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        c = {'s3': {'access': 'foo', 'secret': 'bar'}, 'general': {'secure': True}}
        s = get_session(config=c)
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE)
        r = item.upload(NASA_METADATA_PATH)
        assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
def get_stats(days=30):
    back_days = (datetime.now() - timedelta(days=days)).date()
    query_all = 'collection:(greekgovernmentgazette) AND date:[{1} TO {0}]' \
        .format(datetime.now().strftime('%Y-%m-%d'), back_days.strftime('%Y-%m-%d'))
    s = get_session()
    s.mount_http_adapter()
    search_results = s.search_items(query_all, fields=['identifier', 'addeddate'])
    return '{}\t{}'.format(query_all, len(search_results))
def test_upload_secure_session(testitem_metadata, json_filename):
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add(responses.GET, 'https://archive.org/metadata/nasa',
                 body=testitem_metadata,
                 status=200)
        item = s.get_item('nasa')
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE, status=200)
        r = item.upload(json_filename)
        assert r[0].url == 'https://s3.us.archive.org/nasa/nasa_meta.json'
def __init__(self, file_storage, access_key, secret_key, loglevel, logfile):
    self.file_storage = file_storage
    self.access_key = access_key
    self.secret_key = secret_key
    session_data = {'access': access_key, 'secret': secret_key}
    # get_session() expects the logging options nested once, as
    # {'logging': {'level': ..., 'file': ...}}.
    if logfile:
        logconfig = {'level': loglevel, 'file': logfile}
    else:
        logconfig = {'level': loglevel}
    self.session = get_session({'s3': session_data, 'logging': logconfig})
    self.logger = logging.getLogger('iasync')
def __init__(self, top_dir, access_key, secret_key, loglevel, logfile):
    self.top_dir = top_dir
    self.access_key = access_key
    self.secret_key = secret_key
    session_data = {'access': access_key, 'secret': secret_key}
    # get_session() expects the logging options nested once, as
    # {'logging': {'level': ..., 'file': ...}}.
    if logfile:
        logconfig = {'level': loglevel, 'file': logfile}
    else:
        logconfig = {'level': loglevel}
    self.session = get_session({'s3': session_data, 'logging': logconfig})
    self.logger = logging.getLogger('gvision.ia')
def ia_get_session() -> ArchiveSession:
    """
    Get an IA session based on the configuration supplied via environment variables.

    Because get_session() starts with values found in ~/.ia for the current user
    and then merges in the "additional" supplied values, the configuration file
    reference is explicitly eliminated. For desktop testing, this guarantees that
    the application has the same dependence on environment variables as it would
    when deployed.
    """
    return internetarchive.get_session(IA_CONFIG, config_file='/dev/null')
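# A minimal sketch of how the IA_CONFIG dict consumed above might be assembled.
# The environment-variable names (IA_S3_ACCESS_KEY, IA_S3_SECRET_KEY) are
# assumptions for illustration; only the config shape ({'s3': {'access', 'secret'}})
# is taken from the other snippets in this collection.
import os

IA_CONFIG = {
    's3': {
        'access': os.environ.get('IA_S3_ACCESS_KEY'),
        'secret': os.environ.get('IA_S3_SECRET_KEY'),
    },
    'general': {'secure': True},
}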
def main(collection: str, name: str, concurrency: int, dummy: bool = False, dummy_text: str = None):
    if dummy:
        t = DummyTrainer(dummy_text)
    else:
        urls = from_cdx_url(cdx_url(collection), session=internetarchive.get_session())
        t = Trainer(urls, concurrency=concurrency)
    identifier = str(int(time.time()))
    filename_base = collection + '_dictionary_' + identifier
    upload_urls = t.upload(filename_base + '.zstdict.zst', filename_base)
    print(upload_urls)
    add_entry(identifier, name, t.sha256, upload_urls['public_url'],
              upload_urls['backup_url'])
def __init__(self, ckm_repo: CkanMetaRepo, ia_access: str, ia_secret: str,
             ia_collection: str, token: str = None) -> None:
    self.ckm_repo = ckm_repo
    self.ia_collection = ia_collection
    self.ia_access = ia_access
    self.ia_session = internetarchive.get_session(config={'s3': {
        'access': ia_access,
        'secret': ia_secret,
    }})
    self._gh = github.Github(token) if token else github.Github()
def get_dictionary(filename: str) -> zstandard.ZstdCompressionDict:
    s = internetarchive.get_session()
    # Read the first 8 bytes: the zstd skippable-frame magic plus the frame size.
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=0-7'})
    if r.content[:4] != b'\x5D\x2A\x4D\x18':
        return None
    data_size = struct.unpack('<L', r.content[4:])[0]
    # Fetch the frame payload, which holds the (possibly compressed) dictionary.
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=8-{}'.format(8 + data_size - 1)})
    dictionary = r.content
    if r.content[:4] == b'\x28\xB5\x2F\xFD':  # zstd frame magic: decompress first
        dictionary = zstandard.ZstdDecompressor().decompress(dictionary)
    if dictionary[:4] != b'\x37\xA4\x30\xEC':  # zstd dictionary magic
        raise ValueError('Not a dictionary.')
    return zstandard.ZstdCompressionDict(dictionary)
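# For reference, a hedged sketch of the layout get_dictionary() expects: a zstd
# skippable frame (magic bytes 5D 2A 4D 18, little-endian) whose payload is the
# dictionary, optionally itself zstd-compressed. write_dictionary_frame() is a
# hypothetical helper for illustration, not part of the original project.
import struct
import zstandard

def write_dictionary_frame(path: str, dict_bytes: bytes) -> None:
    payload = zstandard.ZstdCompressor().compress(dict_bytes)
    with open(path, 'wb') as fh:
        fh.write(b'\x5D\x2A\x4D\x18')              # skippable-frame magic, little-endian
        fh.write(struct.pack('<L', len(payload)))  # payload size as uint32 LE
        fh.write(payload)                          # compressed dictionary bytes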
def main():
    session = internetarchive.get_session()
    if len(sys.argv) == 3:
        item_name = sys.argv[1]
        release_id = sys.argv[2]
        item_to_fileset(item_name, release_id=release_id, session=session)
    else:
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            assert len(fields) == 2
            item_name = fields[0]
            release_id = fields[1]
            item_to_fileset(item_name, release_id=release_id, session=session)
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)

        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml', config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')

        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml', config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
def cdx_url(collection: str) -> str:
    session = internetarchive.get_session()
    response = session.get(
        'https://archive.org/advancedsearch.php',
        params={
            'q': ('collection:archiveteam* '
                  'AND format:(Item CDX Index) '
                  'AND identifier:{}*'.format(collection)),
            'fl[]': 'identifier',
            'sort[]': 'addeddate desc',
            'rows': '1',
            'output': 'json',
            'scope': 'all'
        }
    ).json()
    print(response)
    identifier = response['response']['docs'][0]['identifier']
    return 'https://archive.org/download/{0}/{0}.cdx.gz'.format(identifier)
def __init__(self, **kwargs):
    super().__init__()
    self.ingest_strategy = IngestStrategy.ArchiveorgFileset

    # TODO: enable cleanup when confident (eg, safe path parsing)
    self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
    self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
    try:
        os.mkdir(self.working_dir)
    except FileExistsError:
        pass

    self.http_session = requests_retry_session()
    self.ia_session = internetarchive.get_session(
        config={
            "s3": {
                "access": os.environ.get("IA_ACCESS_KEY"),
                "secret": os.environ.get("IA_SECRET_KEY"),
            },
        })
def archive_url(data: typing.Dict[str, bytes],
                url_data: typing.Tuple[str, int, int, str, bool]):
    url, offset, length, filename, redownload = url_data
    if redownload:
        data[url] = requests.get(url, allow_redirects=False).content
    else:
        if filename.endswith('.zst'):
            with dictionary_lock:
                dictionary = get_dictionary(filename)
        r = internetarchive.get_session().get(
            'https://archive.org/download/' + filename,
            headers={'Range': 'bytes={}-{}'.format(offset, offset + length - 1)})
        if filename.endswith('.zst'):
            data[url] = zstandard.ZstdDecompressor(dict_data=dictionary) \
                .decompressobj().decompress(r.content)
        elif filename.endswith('.gz'):
            data[url] = gzip.decompress(r.content)
        elif filename.endswith('.warc'):
            data[url] = r.content
        else:
            raise ValueError('WARC type not supported.')
    print(len(data[url]), url)
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)

    if all(r.ok for r in responses):
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)


access_key = settings.IA_ACCESS_KEY
secret_key = settings.IA_SECRET_KEY
session = ia.get_session({'s3': {
    'access': access_key,
    'secret': secret_key,
}})


def upload_to_ia(identifier, files, metadata=None):
    """Upload an item and its files to the Internet Archive

    On the Internet Archive there are Items and files. Items have a global
    identifier, and files go inside the item:

        https://internetarchive.readthedocs.io/en/latest/items.html

    This function mirrors the IA library's similar upload function, but builds
    in retries and various assumptions that make sense. Note that according to
    emails with IA staff, it is best to maximize the number of files uploaded
    to an Item at a time, rather than uploading each file in a separate go.
    """
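# The body of upload_to_ia() is not included above. A minimal sketch of the
# retry-around-upload idea its docstring describes, reusing the module-level
# `session` defined earlier; _upload_with_retries() is a hypothetical
# illustration, not the original implementation.
import time
import requests

def _upload_with_retries(identifier, files, metadata=None, attempts=3):
    item = session.get_item(identifier)
    for attempt in range(1, attempts + 1):
        try:
            # Upload all files in one call, per the guidance quoted in the docstring.
            return item.upload(files, metadata=metadata or {})
        except requests.exceptions.RequestException:
            if attempt == attempts:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff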
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'},
                            'general': {'secure': False}})
    assert s.access_key == 'key'
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'}})
    assert s.access_key == 'key'
def test_get_session_with_config():
    s = get_session(config={"s3": {"access": "key"}})
    assert s.access_key == "key"
inc_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, inc_path)

from copy import deepcopy

import responses

from internetarchive.cli import ia_list
from internetarchive import get_session

protocol = 'https:'

ROOT_DIR = os.getcwd()
TEST_JSON_FILE = os.path.join(ROOT_DIR, 'tests/data/nasa_meta.json')
SESSION = get_session()

with open(TEST_JSON_FILE, 'r') as fh:
    ITEM_METADATA = fh.read().strip()

NASA_FILES = set([
    'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_reviews.xml',
    'nasa_meta.xml', 'nasa_archive.torrent', 'nasa_files.xml'
])


def test_ia_list(capsys):
    with responses.RequestsMock() as rsps:
def nasa_item():
    session = get_session()
    with IaRequestsMock() as mocker:
        mocker.add_metadata_mock('nasa')
        yield session.get_item('nasa')
def session():
    return get_session(config=dict(s3=dict(access='access', secret='secret')))
def session_with_logging():
    return get_session(config={'logging': {'level': 'INFO'}})
def session():
    return get_session()
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={"s3": {"access": "key3"}})
        item = get_item("nasa", archive_session=s)
        assert item.session.access_key == "key3"
def test_get_item_with_archive_session(nasa_mocker):
    s = get_session(config={'s3': {'access': 'key3'}})
    item = get_item('nasa', archive_session=s)
    assert item.session.access_key == 'key3'