from collections import defaultdict

from tqdm import tqdm

from internetarchive.search import Search
from internetarchive.session import ArchiveSession


def grab_key_values(key):
    s = ArchiveSession()
    search = Search(
        s,
        '(mediatype:audio OR mediatype:movies) AND (closed_captioning:yes OR format:SubRip OR format:"Web Video Text Tracks")',
        fields=[key])
    licenses = defaultdict(int)
    for result in tqdm(search):
        if key not in result:
            print(f"No {key} result:", result)
            continue
        if isinstance(result[key], str):
            licenses[result[key]] += 1
        elif isinstance(result[key], list):
            for licenseurl in result[key]:
                licenses[licenseurl] += 1
        else:
            # result[key] is neither a str nor a list.
            raise ValueError(f"Unexpected type for {key}: ",
                             type(result[key]), result)

    print(f"Counts for key={key}")
    print("\n".join(
        str(x) for x in sorted((v, k) for k, v in licenses.items())))
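A minimal usage sketch (not part of the original example; the field name here is only an illustration):

if __name__ == '__main__':
    # Hypothetical invocation; 'licenseurl' is a real archive.org
    # metadata field, but any indexed field name works the same way.
    grab_key_values('licenseurl')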
Example #2
import csv
import os
import sys
from tempfile import TemporaryFile

import six
from docopt import docopt, printable_usage
from schema import And, Or, Schema, SchemaError, Use

from internetarchive.session import ArchiveSession

# get_args_dict, validate_ia_identifier, and _upload_files are helpers
# from elsewhere in the internetarchive CLI package (locations assumed).


def main(argv, session):
    args = docopt(__doc__, argv=argv)
    ERRORS = False

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<identifier>': Or(None, And(
            str, validate_ia_identifier,
            error=('<identifier> should be between 3 and 80 characters in '
                   'length, and can only contain alphanumeric characters, '
                   'underscores ( _ ), or dashes ( - )'))),
        '<file>': And(
            And(lambda f: all(os.path.exists(x) for x in f if x != '-'),
                error='<file> should be a readable file or directory.'),
            And(lambda f: not (f == ['-'] and not args['--remote-name']),
                error='--remote-name must be provided when uploading '
                      'from stdin.')),
        '--remote-name': Or(None, str),
        '--spreadsheet': Or(None, os.path.isfile,
                            error='--spreadsheet should be a readable file.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as '
                               '--metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as '
                             '--header="key:value"'),
        '--retries': Use(lambda x: int(x[0]) if x else 0),
        '--sleep': Use(lambda l: int(l[0]),
                       error='--sleep value must be an integer.'),
        '--size-hint': Or(Use(lambda l: int(l[0]) if l else None), int, None,
                          error='--size-hint value must be an integer.'),
        '--status-check': bool,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)),
              file=sys.stderr)
        sys.exit(1)

    # Status check.
    if args['--status-check']:
        if session.s3_is_overloaded():
            print('warning: {0} is over limit, and not accepting requests. '
                  'Expect 503 SlowDown errors.'.format(args['<identifier>']),
                  file=sys.stderr)
            sys.exit(1)
        else:
            print('success: {0} is accepting requests.'.format(
                args['<identifier>']))
            sys.exit()

    elif args['<identifier>']:
        item = session.get_item(args['<identifier>'])

    # Upload keyword arguments.
    if args['--size-hint']:
        args['--header']['x-archive-size-hint'] = args['--size-hint']

    queue_derive = not args['--no-derive']
    verbose = not args['--quiet']

    upload_kwargs = dict(
        metadata=args['--metadata'],
        headers=args['--header'],
        debug=args['--debug'],
        queue_derive=queue_derive,
        checksum=args['--checksum'],
        verbose=verbose,
        retries=args['--retries'],
        retries_sleep=args['--sleep'],
        delete=args['--delete'],
    )

    # Upload files.
    if not args['--spreadsheet']:
        if args['-']:
            local_file = TemporaryFile()
            # Read bytes: TemporaryFile defaults to binary mode ('w+b').
            local_file.write(sys.stdin.buffer.read())
            local_file.seek(0)
        else:
            local_file = args['<file>']

        if isinstance(local_file,
                      (list, tuple, set)) and args['--remote-name']:
            local_file = local_file[0]
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        else:
            files = local_file

        for _r in _upload_files(item, files, upload_kwargs):
            if args['--debug']:
                break
            if (not _r) or (not _r.ok):
                ERRORS = True

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        session = ArchiveSession()
        spreadsheet = csv.DictReader(open(args['--spreadsheet'], newline=''))
        prev_identifier = None
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            item = session.get_item(identifier)
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = [
                '{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v
            ]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            r = _upload_files(item, local_file, upload_kwargs, prev_identifier,
                              session)
            for _r in r:
                if args['--debug']:
                    break
                if (not _r) or (not _r.ok):
                    ERRORS = True
            prev_identifier = identifier

    if ERRORS:
        sys.exit(1)
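Both this example and the next call an _upload_files helper defined elsewhere in the same script. A minimal sketch of what it might look like, assuming the call signature used above and the library's Item.upload() API:

def _upload_files(item, files, upload_kwargs, prev_identifier=None,
                  session=None):
    # Hypothetical reconstruction: Item.upload() accepts the keyword
    # arguments collected in upload_kwargs and returns one response per
    # uploaded file (prepared requests instead when debug=True).
    # prev_identifier and session are unused in this sketch.
    try:
        return item.upload(files, **upload_kwargs)
    except Exception as exc:
        print('error uploading to {0}: {1}'.format(item.identifier, exc),
              file=sys.stderr)
        return []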
Example #3
# This older variant's imports mirror Example #2, plus subprocess.call
# for re-running the help text below.
def main(argv):
    args = docopt(__doc__, argv=argv)

    headers = get_args_dict(args['--header'])
    if args['--size-hint']:
        headers['x-archive-size-hint'] = args['--size-hint']

    # Upload keyword arguments.
    upload_kwargs = dict(
        metadata=get_args_dict(args['--metadata']),
        headers=headers,
        debug=args['--debug'],
        queue_derive=not args['--no-derive'],
        ignore_preexisting_bucket=args['--ignore-bucket'],
        checksum=args['--checksum'],
        verbose=not args['--quiet'],
        retries=int(args['--retries']) if args['--retries'] else 0,
        retries_sleep=int(args['--sleep']),
        delete=args['--delete'],
    )

    if args['<file>'] == ['-'] and not args['-']:
        sys.stderr.write(
            '--remote-name is required when uploading from stdin.\n')
        call(['ia', 'upload', '--help'])
        sys.exit(1)

    # Upload from stdin.
    if args['-']:
        local_file = TemporaryFile()
        # Read bytes: TemporaryFile defaults to binary mode ('w+b').
        local_file.write(sys.stdin.buffer.read())
        local_file.seek(0)
        _upload_files(args, args['<identifier>'], local_file, upload_kwargs)

    # Bulk upload using spreadsheet.
    elif args['--spreadsheet']:
        # Use the same session for each upload request.
        session = ArchiveSession()

        spreadsheet = csv.DictReader(open(args['--spreadsheet'], newline=''))
        prev_identifier = None
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = [
                '{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v
            ]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            _upload_files(args, identifier, local_file, upload_kwargs,
                          prev_identifier, session)
            prev_identifier = identifier

    # Upload files.
    else:
        local_file = args['<file>']
        _upload_files(args, args['<identifier>'], local_file, upload_kwargs)
Example #4
import json
import os
import random
import shutil
import string
import sys

from internetarchive import get_item
from internetarchive.search import Search
from internetarchive.session import ArchiveSession

SEARCH_QUERY = ''               # archive.org search query (filled in elsewhere)
MAX_MB_FILE = 10 * 1000000      # size cap in bytes (~10 MB)
FILE_DIR = './kasette/'         # local download directory

s = ArchiveSession()

def output(text):
    # Rewrite the current terminal line (progress-style status output).
    sys.stdout.write("\r%s" % text)
    sys.stdout.flush()

def reset_folder(directory):
    # Wipe the directory (and its contents) if present, then recreate it.
    shutil.rmtree(directory, ignore_errors=True)

    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
        output('loaded kasette')
    except OSError:
        output('could not create kasette')

    return True
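The snippet ends before the helpers are exercised. A hedged sketch of a driver, assuming SEARCH_QUERY has been filled in and relying on the library's Search and Item.download() APIs (fetch_small_items is a hypothetical name):

def fetch_small_items():
    # Hypothetical driver: reset the download directory, then walk the
    # search results and download each sufficiently small item.
    reset_folder(FILE_DIR)
    for result in Search(s, SEARCH_QUERY):
        item = get_item(result['identifier'])
        # item_size is reported in bytes; skip anything over the cap.
        if item.item_size and item.item_size > MAX_MB_FILE:
            continue
        item.download(destdir=FILE_DIR, ignore_existing=True)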