from collections import defaultdict

from tqdm import tqdm

from internetarchive.search import Search
from internetarchive.session import ArchiveSession


def grab_key_values(key):
    """Tally how often each value of ``key`` appears across captioned items."""
    s = ArchiveSession()
    search = Search(
        s,
        '(mediatype:audio OR mediatype:movies) AND '
        '(closed_captioning:yes OR format:SubRip OR '
        'format:"Web Video Text Tracks")',
        fields=[key])
    licenses = defaultdict(int)
    for result in tqdm(search):
        if key not in result:
            print(f"No {key} result:", result)
            continue
        if isinstance(result[key], str):
            licenses[result[key]] += 1
        elif isinstance(result[key], list):
            for licenseurl in result[key]:
                licenses[licenseurl] += 1
        else:
            raise ValueError(f"Unexpected type for {key}: ",
                             type(result[key]), result)
    print(f"Counts for key={key}")
    print("\n".join(
        str(x) for x in sorted((v, k) for k, v in licenses.items())))
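# Hypothetical usage of the function above: tally how often each license
# URL appears across captioned/subtitled audio and movies items.
# 'licenseurl' is an example field name, not something the snippet itself
# pins down; any metadata field the search API returns would work here.
if __name__ == '__main__':
    grab_key_values('licenseurl')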
# Imports assumed for this excerpt; it is drawn from the `ia upload`
# command module, and `_upload_files` is a helper defined elsewhere in
# that module.
from __future__ import print_function

import csv
import os
import sys
from tempfile import TemporaryFile

import six
from docopt import docopt, printable_usage
from schema import And, Or, Schema, SchemaError, Use

from internetarchive.cli.argparser import get_args_dict
from internetarchive.session import ArchiveSession
from internetarchive.utils import validate_ia_identifier


def main(argv, session):
    args = docopt(__doc__, argv=argv)
    ERRORS = False

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<identifier>': Or(None, And(str, validate_ia_identifier, error=(
            '<identifier> should be between 3 and 80 characters in length, and '
            'can only contain alphanumeric characters, underscores ( _ ), or '
            'dashes ( - )'))),
        '<file>': And(
            And(lambda f: all(os.path.exists(x) for x in f if x != '-'),
                error='<file> should be a readable file or directory.'),
            And(lambda f: False if f == ['-'] and not args['--remote-name'] else True,
                error='--remote-name must be provided when uploading from stdin.')),
        '--remote-name': Or(None, And(str)),
        '--spreadsheet': Or(None, os.path.isfile,
                            error='--spreadsheet should be a readable file.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--retries': Use(lambda x: int(x[0]) if x else 0),
        '--sleep': Use(lambda l: int(l[0]), error='--sleep value must be an integer.'),
        '--size-hint': Or(Use(lambda l: int(l[0]) if l else None), int, None,
                          error='--size-hint value must be an integer.'),
        '--status-check': bool,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)), file=sys.stderr)
        sys.exit(1)

    # Status check.
    if args['--status-check']:
        if session.s3_is_overloaded():
            print('warning: {0} is over limit, and not accepting requests. '
                  'Expect 503 SlowDown errors.'.format(args['<identifier>']),
                  file=sys.stderr)
            sys.exit(1)
        else:
            print('success: {0} is accepting requests.'.format(args['<identifier>']))
        sys.exit()
    elif args['<identifier>']:
        item = session.get_item(args['<identifier>'])

    # Upload keyword arguments.
    if args['--size-hint']:
        args['--header']['x-archive-size-hint'] = args['--size-hint']
    queue_derive = not args['--no-derive']
    verbose = not args['--quiet']
    upload_kwargs = dict(
        metadata=args['--metadata'],
        headers=args['--header'],
        debug=args['--debug'],
        queue_derive=queue_derive,
        checksum=args['--checksum'],
        verbose=verbose,
        retries=args['--retries'],
        retries_sleep=args['--sleep'],
        delete=args['--delete'],
    )

    # Upload files.
    if not args['--spreadsheet']:
        if args['-']:
            local_file = TemporaryFile()
            local_file.write(sys.stdin.read())
            local_file.seek(0)
        else:
            local_file = args['<file>']

        if isinstance(local_file, (list, tuple, set)) and args['--remote-name']:
            local_file = local_file[0]
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        else:
            files = local_file

        for _r in _upload_files(item, files, upload_kwargs):
            if args['--debug']:
                break
            if (not _r) or (not _r.ok):
                ERRORS = True

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        session = ArchiveSession()
        spreadsheet = csv.DictReader(open(args['--spreadsheet'], 'rU'))
        prev_identifier = None
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            item = session.get_item(identifier)
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = ['{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            r = _upload_files(item, local_file, upload_kwargs, prev_identifier, session)
            for _r in r:
                if args['--debug']:
                    break
                if (not _r) or (not _r.ok):
                    ERRORS = True
            prev_identifier = identifier

    if ERRORS:
        sys.exit(1)
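# A minimal spreadsheet for the bulk-upload branch above, inferred from the
# code: `identifier` and `file` are required columns, every other non-empty
# column is coerced into item metadata, and a blank identifier reuses the
# previous row's identifier.
#
#   identifier,file,title,creator
#   my-item-0001,./tape1.mp3,Tape One,Jane Doe
#   ,./tape2.mp3,Tape Two,Jane Doe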
def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<identifier>': Or(None, And(str, validate_ia_identifier, error=(
            '<identifier> should be between 3 and 80 characters in length, and '
            'can only contain alphanumeric characters, underscores ( _ ), or '
            'dashes ( - )'))),
        '<file>': And(
            And(lambda f: all(os.path.exists(x) for x in f if x != '-'),
                error='<file> should be a readable file or directory.'),
            And(lambda f: False if f == ['-'] and not args['--remote-name'] else True,
                error='--remote-name must be provided when uploading from stdin.')),
        '--remote-name': Or(None, And(str)),
        '--spreadsheet': Or(None, os.path.isfile,
                            error='--spreadsheet should be a readable file.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--retries': Use(lambda x: int(x[0]) if x else 0),
        '--sleep': Use(lambda l: int(l[0]), error='--sleep value must be an integer.'),
        '--size-hint': Or(Use(lambda l: int(l[0]) if l else None), int, None,
                          error='--size-hint value must be an integer.'),
        '--status-check': bool,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)), file=sys.stderr)
        sys.exit(1)

    # Status check.
    if args['--status-check']:
        if session.s3_is_overloaded():
            print('warning: {0} is over limit, and not accepting requests. '
                  'Expect 503 SlowDown errors.'.format(args['<identifier>']),
                  file=sys.stderr)
            sys.exit(1)
        else:
            print('success: {0} is accepting requests.'.format(args['<identifier>']))
        sys.exit()
    elif args['<identifier>']:
        item = session.get_item(args['<identifier>'])

    # Upload keyword arguments.
    if args['--size-hint']:
        args['--header']['x-archive-size-hint'] = args['--size-hint']
    queue_derive = not args['--no-derive']
    verbose = not args['--quiet']
    upload_kwargs = dict(
        metadata=args['--metadata'],
        headers=args['--header'],
        debug=args['--debug'],
        queue_derive=queue_derive,
        checksum=args['--checksum'],
        verbose=verbose,
        retries=args['--retries'],
        retries_sleep=args['--sleep'],
        delete=args['--delete'],
    )

    # Upload files.
    if not args['--spreadsheet']:
        if args['-']:
            local_file = TemporaryFile()
            local_file.write(sys.stdin.read())
            local_file.seek(0)
        else:
            local_file = args['<file>']

        if isinstance(local_file, (list, tuple, set)) and args['--remote-name']:
            local_file = local_file[0]
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        else:
            files = local_file

        responses = _upload_files(item, files, upload_kwargs)

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        session = ArchiveSession()
        spreadsheet = csv.DictReader(open(args['--spreadsheet'], 'rU'))
        prev_identifier = None
        responses = []
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            item = session.get_item(identifier)
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = ['{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            r = _upload_files(item, local_file, upload_kwargs, prev_identifier,
                              session, responses)
            responses += r
            prev_identifier = identifier

    if responses and not all(r and r.ok for r in responses):
        sys.exit(1)
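# `_upload_files` is a helper defined elsewhere in this module and is not
# shown here. Based on how its return value is consumed above (iterated,
# accumulated into `responses`, each element checked for `.ok`), a minimal
# sketch of a compatible helper might look like the following. This is an
# assumption for illustration only, not the module's actual implementation.
def _upload_files_sketch(item, files, upload_kwargs, prev_identifier=None,
                         session=None, responses=None):
    # Item.upload() returns a list of requests.Response objects (or
    # prepared requests in debug mode), matching what the callers expect.
    try:
        return item.upload(files, **upload_kwargs)
    except Exception as exc:
        print('error uploading to {0}: {1}'.format(item.identifier, exc),
              file=sys.stderr)
        return []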
def main(argv):
    args = docopt(__doc__, argv=argv)

    headers = get_args_dict(args['--header'])
    if args['--size-hint']:
        headers['x-archive-size-hint'] = args['--size-hint']

    # Upload keyword arguments.
    upload_kwargs = dict(
        metadata=get_args_dict(args['--metadata']),
        headers=headers,
        debug=args['--debug'],
        queue_derive=not args['--no-derive'],
        ignore_preexisting_bucket=args['--ignore-bucket'],
        checksum=args['--checksum'],
        verbose=not args['--quiet'],
        retries=int(args['--retries']) if args['--retries'] else 0,
        retries_sleep=int(args['--sleep']),
        delete=args['--delete'],
    )

    if args['<file>'] == ['-'] and not args['-']:
        sys.stderr.write('--remote-name is required when uploading from stdin.\n')
        call(['ia', 'upload', '--help'])
        sys.exit(1)

    # Upload from stdin.
    if args['-']:
        local_file = TemporaryFile()
        local_file.write(sys.stdin.read())
        local_file.seek(0)
        _upload_files(args, args['<identifier>'], local_file, upload_kwargs)

    # Bulk upload using spreadsheet.
    elif args['--spreadsheet']:
        # Use the same session for each upload request.
        session = ArchiveSession()
        spreadsheet = csv.DictReader(open(args['--spreadsheet'], 'rU'))
        prev_identifier = None
        for row in spreadsheet:
            local_file = row['file']
            identifier = row['identifier']
            del row['file']
            del row['identifier']
            if (not identifier) and (prev_identifier):
                identifier = prev_identifier
            # TODO: Clean up how indexed metadata items are coerced
            # into metadata.
            md_args = ['{0}:{1}'.format(k.lower(), v) for (k, v) in row.items() if v]
            metadata = get_args_dict(md_args)
            upload_kwargs['metadata'].update(metadata)
            _upload_files(args, identifier, local_file, upload_kwargs,
                          prev_identifier, session)
            prev_identifier = identifier

    # Upload files.
    else:
        local_file = args['<file>']
        _upload_files(args, args['<identifier>'], local_file, upload_kwargs)
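# Example invocations this entry point handles, using only flags that
# appear in the code above (the `ia` console script from the
# internetarchive package):
#
#   ia upload my-identifier file1.mp3 --metadata="title:My Tape"
#   ia upload my-identifier - --remote-name=notes.txt < notes.txt
#   ia upload --spreadsheet=items.csv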
import json
import os
import random
import shutil
import string
import sys

from internetarchive import get_item
from internetarchive.search import Search
from internetarchive.session import ArchiveSession

SEARCH_QUERY = ''
MAX_MB_FILE = 10 * 1000000  # 10 MB, in bytes
FILE_DIR = './kasette/'

s = ArchiveSession()


def output(text):
    """Rewrite the current terminal line with a status message."""
    sys.stdout.write("\r%s" % text)
    sys.stdout.flush()


def reset_folder(directory):
    """Delete and recreate ``directory``, reporting progress."""
    shutil.rmtree(directory, ignore_errors=True)
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
        output('loaded kasette')
    except OSError:
        output('could not create kasette')
    return True
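# Hypothetical usage of the helpers above. SEARCH_QUERY is left empty in
# this script, so only the folder reset is exercised here.
if __name__ == '__main__':
    reset_folder(FILE_DIR)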
# Imports assumed for this excerpt; it is drawn from internetarchive's api
# module, where `get_session` is defined alongside this function.
from __future__ import annotations

from collections.abc import Iterable, Mapping, MutableMapping

from urllib3.util.retry import Retry

from internetarchive import search, session


def search_items(
    query: str,
    fields: Iterable | None = None,
    sorts=None,
    params: Mapping | None = None,
    full_text_search: bool = False,
    dsl_fts: bool = False,
    archive_session: session.ArchiveSession | None = None,
    config: Mapping | None = None,
    config_file: str | None = None,
    http_adapter_kwargs: MutableMapping | None = None,
    request_kwargs: Mapping | None = None,
    max_retries: int | Retry | None = None,
) -> search.Search:
    """Search for items on Archive.org.

    :param query: The Archive.org search query to yield results for.
                  Refer to https://archive.org/advancedsearch.php#raw
                  for help formatting your query.

    :param fields: The metadata fields to return in the search results.

    :param params: The URL parameters to send with each request sent to
                   the Archive.org Advancedsearch API.

    :param full_text_search: Beta support for querying the archive.org
                             Full Text Search API [default: False].

    :param dsl_fts: Beta support for querying the archive.org Full Text
                    Search API in dsl (i.e. do not prepend ``!L `` to the
                    ``full_text_search`` query) [default: False].

    :param config: Configuration options for the session.

    :param config_file: A path to a config file used to configure your
                        session.

    :param http_adapter_kwargs: Keyword arguments that
                                :py:class:`requests.adapters.HTTPAdapter`
                                takes.

    :param request_kwargs: Keyword arguments that
                           :py:class:`requests.Request` takes.

    :param max_retries: The number of times to retry a failed request.
                        This can also be an `urllib3.Retry` object.
                        If you need more control (e.g. `status_forcelist`),
                        use an `ArchiveSession` object, and mount your own
                        adapter after the session object has been
                        initialized. For example::

                            >>> s = get_session()
                            >>> s.mount_http_adapter()
                            >>> search_results = s.search_items('nasa')

                        See :meth:`ArchiveSession.mount_http_adapter`
                        for more details.

    :returns: A :class:`Search` object, yielding search results.
    """
    if not archive_session:
        archive_session = get_session(config, config_file, False, http_adapter_kwargs)
    return archive_session.search_items(
        query,
        fields=fields,
        sorts=sorts,
        params=params,
        full_text_search=full_text_search,
        dsl_fts=dsl_fts,
        request_kwargs=request_kwargs,
        max_retries=max_retries,
    )
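# Example usage, following the docstring above: iterate over the results,
# each of which behaves like a dict of the requested fields.
#
#   >>> for result in search_items('nasa', fields=['identifier']):
#   ...     print(result['identifier'])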