from collections import defaultdict

from tqdm import tqdm

from internetarchive.search import Search
from internetarchive.session import ArchiveSession


def grab_key_values(key):
    s = ArchiveSession()
    search = Search(
        s,
        '(mediatype:audio OR mediatype:movies) AND (closed_captioning:yes OR format:SubRip OR format:"Web Video Text Tracks")',
        fields=[key])
    licenses = defaultdict(int)
    for result in tqdm(search):
        if key not in result:
            print(f"No {key} result:", result)
            continue
        if isinstance(result[key], str):
            licenses[result[key]] += 1
        elif isinstance(result[key], list):
            for licenseurl in result[key]:
                licenses[licenseurl] += 1
        else:
            raise ValueError(
                f"Unexpected type for {key}: {type(result[key])} ({result})")

    print(f"Counts for key={key}")
    print("\n".join(
        str(x) for x in sorted((v, k) for k, v in licenses.items())))
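A minimal usage sketch for the helper above. The field name licenseurl is an illustrative choice (it matches the variable names inside the function); any single Archive.org metadata field works:

# Tally license URLs across captioned audio/video items.
grab_key_values('licenseurl')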
Example #2
def main(argv, session):
    args = docopt(__doc__, argv=argv)
    ERRORS = False

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<identifier>': Or(None, And(str, validate_ia_identifier,
            error=('<identifier> should be between 3 and 80 characters in length, and '
                   'can only contain alphanumeric characters, underscores ( _ ), or '
                   'dashes ( - )'))),
        '<file>': And(
            And(lambda f: all(os.path.exists(x) for x in f if x != '-'),
                error='<file> should be a readable file or directory.'),
            And(lambda f: False if f == ['-'] and not args['--remote-name'] else True,
                error='--remote-name must be provided when uploading from stdin.')),
        '--remote-name': Or(None, And(str)),
        '--spreadsheet': Or(None, os.path.isfile,
            error='--spreadsheet should be a readable file.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
            error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
            error='--header must be formatted as --header="key:value"'),
        '--retries': Use(lambda x: int(x[0]) if x else 0),
        '--sleep': Use(lambda l: int(l[0]), error='--sleep value must be an integer.'),
        '--size-hint': Or(Use(lambda l: int(l[0]) if l else None), int, None,
            error='--size-hint value must be an integer.'),
        '--status-check': bool,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)),
              file=sys.stderr)
        sys.exit(1)

    # Status check.
    if args['--status-check']:
        if session.s3_is_overloaded():
            print('warning: {0} is over limit, and not accepting requests. '
                  'Expect 503 SlowDown errors.'.format(args['<identifier>']),
                  file=sys.stderr)
            sys.exit(1)
        else:
            print('success: {0} is accepting requests.'.format(
                args['<identifier>']))
            sys.exit()

    elif args['<identifier>']:
        item = session.get_item(args['<identifier>'])

    # Upload keyword arguments.
    if args['--size-hint']:
        args['--header']['x-archive-size-hint'] = args['--size-hint']

    queue_derive = not args['--no-derive']
    verbose = not args['--quiet']

    upload_kwargs = dict(
        metadata=args['--metadata'],
        headers=args['--header'],
        debug=args['--debug'],
        queue_derive=queue_derive,
        checksum=args['--checksum'],
        verbose=verbose,
        retries=args['--retries'],
        retries_sleep=args['--sleep'],
        delete=args['--delete'],
    )

    # Upload files.
    if not args['--spreadsheet']:
        if args['-']:
            local_file = TemporaryFile()
            local_file.write(sys.stdin.buffer.read())  # raw bytes for the binary-mode TemporaryFile
            local_file.seek(0)
        else:
            local_file = args['<file>']

        if isinstance(local_file,
                      (list, tuple, set)) and args['--remote-name']:
            local_file = local_file[0]
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        else:
            files = local_file

        for _r in _upload_files(item, files, upload_kwargs):
            if args['--debug']:
                break
            if (not _r) or (not _r.ok):
                ERRORS = True

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        session = ArchiveSession()
        spreadsheet = csv.DictReader(open(args['--spreadsheet'], newline=''))
        prev_identifier = None
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            item = session.get_item(identifier)
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = [
                '{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v
            ]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            r = _upload_files(item, local_file, upload_kwargs, prev_identifier,
                              session)
            for _r in r:
                if args['--debug']:
                    break
                if (not _r) or (not _r.ok):
                    ERRORS = True
            prev_identifier = identifier

    if ERRORS:
        sys.exit(1)
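For the spreadsheet branch above: the CSV must contain identifier and file columns, every other non-empty column is lower-cased and folded into the item metadata, and a row with a blank identifier reuses the identifier from the previous row. A hypothetical example file:

identifier,file,title,creator
my-test-item,clip1.mp4,First clip,Example Uploader
,clip2.mp4,Second clip,Example Uploader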
Example #3

def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<identifier>': Or(None, And(str, validate_ia_identifier,
            error=('<identifier> should be between 3 and 80 characters in length, and '
                   'can only contain alphanumeric characters, underscores ( _ ), or '
                   'dashes ( - )'))),
        '<file>': And(
            And(lambda f: all(os.path.exists(x) for x in f if x != '-'),
                error='<file> should be a readable file or directory.'),
            And(lambda f: False if f == ['-'] and not args['--remote-name'] else True,
                error='--remote-name must be provided when uploading from stdin.')),
        '--remote-name': Or(None, And(str)),
        '--spreadsheet': Or(None, os.path.isfile,
            error='--spreadsheet should be a readable file.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
            error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
            error='--header must be formatted as --header="key:value"'),
        '--retries': Use(lambda x: int(x[0]) if x else 0),
        '--sleep': Use(lambda l: int(l[0]), error='--sleep value must be an integer.'),
        '--size-hint': Or(Use(lambda l: int(l[0]) if l else None), int, None,
            error='--size-hint value must be an integer.'),
        '--status-check': bool,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)), file=sys.stderr)
        sys.exit(1)

    # Status check.
    if args['--status-check']:
        if session.s3_is_overloaded():
            print('warning: {0} is over limit, and not accepting requests. '
                  'Expect 503 SlowDown errors.'.format(args['<identifier>']),
                  file=sys.stderr)
            sys.exit(1)
        else:
            print('success: {0} is accepting requests.'.format(args['<identifier>']))
            sys.exit()

    elif args['<identifier>']:
        item = session.get_item(args['<identifier>'])

    # Upload keyword arguments.
    if args['--size-hint']:
        args['--header']['x-archive-size-hint'] = args['--size-hint']

    queue_derive = not args['--no-derive']
    verbose = not args['--quiet']

    upload_kwargs = dict(
        metadata=args['--metadata'],
        headers=args['--header'],
        debug=args['--debug'],
        queue_derive=queue_derive,
        checksum=args['--checksum'],
        verbose=verbose,
        retries=args['--retries'],
        retries_sleep=args['--sleep'],
        delete=args['--delete'],
    )

    # Upload files.
    if not args['--spreadsheet']:
        if args['-']:
            local_file = TemporaryFile()
            local_file.write(sys.stdin.buffer.read())
            local_file.seek(0)
        else:
            local_file = args['<file>']

        if isinstance(local_file, (list, tuple, set)) and args['--remote-name']:
            local_file = local_file[0]
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        else:
            files = local_file

        responses = _upload_files(item, files, upload_kwargs)

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        session = ArchiveSession()
        spreadsheet = csv.DictReader(open(args['--spreadsheet'], newline=''))
        prev_identifier = None
        responses = []
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            item = session.get_item(identifier)
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = ['{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            r = _upload_files(item, local_file, upload_kwargs, prev_identifier, session,
                              responses)
            responses += r
            prev_identifier = identifier

    if responses and not all(r and r.ok for r in responses):
        sys.exit(1)
Example #4
def main(argv):
    args = docopt(__doc__, argv=argv)

    headers = get_args_dict(args['--header'])
    if args['--size-hint']:
        headers['x-archive-size-hint'] = args['--size-hint']

    # Upload keyword arguments.
    upload_kwargs = dict(
        metadata=get_args_dict(args['--metadata']),
        headers=headers,
        debug=args['--debug'],
        queue_derive=not args['--no-derive'],
        ignore_preexisting_bucket=args['--ignore-bucket'],
        checksum=args['--checksum'],
        verbose=not args['--quiet'],
        retries=int(args['--retries']) if args['--retries'] else 0,
        retries_sleep=int(args['--sleep']),
        delete=args['--delete'],
    )

    if args['<file>'] == ['-'] and not args['-']:
        sys.stderr.write(
            '--remote-name is required when uploading from stdin.\n')
        call(['ia', 'upload', '--help'])
        sys.exit(1)

    # Upload from stdin.
    if args['-']:
        local_file = TemporaryFile()
        local_file.write(sys.stdin.buffer.read())
        local_file.seek(0)
        _upload_files(args, args['<identifier>'], local_file, upload_kwargs)

    # Bulk upload using spreadsheet.
    elif args['--spreadsheet']:
        # Use the same session for each upload request.
        session = ArchiveSession()

        spreadsheet = csv.DictReader(open(args['--spreadsheet'], newline=''))
        prev_identifier = None
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = [
                '{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v
            ]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            _upload_files(args, identifier, local_file, upload_kwargs,
                          prev_identifier, session)
            prev_identifier = identifier

    # Upload files.
    else:
        local_file = args['<file>']
        _upload_files(args, args['<identifier>'], local_file, upload_kwargs)
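The stdin branch above is what backs piped uploads on the command line. A sketch of the intended invocation, with the item identifier and filename as placeholders:

cat movie.mp4 | ia upload my-test-item - --remote-name=movie.mp4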
Example #5
from internetarchive.session import ArchiveSession
from internetarchive.search import Search
from internetarchive import get_item
import json
import sys
import os
import shutil
import random
import string

SEARCH_QUERY = ''
MAX_MB_FILE = 10 * 1000000  # presumably a 10 MB size cap, expressed in bytes
FILE_DIR = './kasette/'

s = ArchiveSession()

def output(text):
    sys.stdout.write("\r%s" % text)
    sys.stdout.flush()

def reset_folder(directory):
    shutil.rmtree(directory, ignore_errors=True)

    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
        output('loaded kasette')
    except OSError:
        output('could not create kasette')

    return True
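A short sketch combining the two helpers above, using the module-level FILE_DIR:

reset_folder(FILE_DIR)   # wipe and recreate ./kasette/
output('kasette ready')  # rewrite the current terminal line in place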
Example #6
def search_items(
    query: str,
    fields: Iterable | None = None,
    sorts=None,
    params: Mapping | None = None,
    full_text_search: bool = False,
    dsl_fts: bool = False,
    archive_session: session.ArchiveSession | None = None,
    config: Mapping | None = None,
    config_file: str | None = None,
    http_adapter_kwargs: MutableMapping | None = None,
    request_kwargs: Mapping | None = None,
    max_retries: int | Retry | None = None,
) -> search.Search:
    """Search for items on Archive.org.

    :param query: The Archive.org search query to yield results for. Refer to
                  https://archive.org/advancedsearch.php#raw for help formatting your
                  query.

    :param fields: The metadata fields to return in the search results.

    :param params: The URL parameters to send with each request sent to the
                   Archive.org Advancedsearch Api.

    :param full_text_search: Beta support for querying the archive.org
                             Full Text Search API [default: False].

    :param dsl_fts: Beta support for querying the archive.org Full Text
                    Search API in dsl (i.e. do not prepend ``!L `` to the
                    ``full_text_search`` query) [default: False].

    :param config: Configuration options for session.

    :param config_file: A path to a config file used to configure your session.

    :param http_adapter_kwargs: Keyword arguments that
                                :py:class:`requests.adapters.HTTPAdapter` takes.

    :param request_kwargs: Keyword arguments that
                           :py:class:`requests.Request` takes.

    :param max_retries: The number of times to retry a failed request.
                        This can also be an `urllib3.Retry` object.
                        If you need more control (e.g. `status_forcelist`), use an
                        `ArchiveSession` object, and mount your own adapter after the
                        session object has been initialized. For example::

                        >>> s = get_session()
                        >>> s.mount_http_adapter()
                        >>> search_results = s.search_items('nasa')

                        See :meth:`ArchiveSession.mount_http_adapter`
                        for more details.

    :returns: A :class:`Search` object, yielding search results.
    """
    if not archive_session:
        archive_session = get_session(config, config_file, False,
                                      http_adapter_kwargs)
    return archive_session.search_items(
        query,
        fields=fields,
        sorts=sorts,
        params=params,
        full_text_search=full_text_search,
        dsl_fts=dsl_fts,
        request_kwargs=request_kwargs,
        max_retries=max_retries,
    )
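A brief usage sketch for search_items; 'collection:nasa' is an arbitrary example query, and each yielded result is a dict containing the requested fields:

from internetarchive import search_items

for result in search_items('collection:nasa', fields=['identifier', 'title']):
    print(result['identifier'], result.get('title'))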