def create_subcollection(collection_id, metadata=None, parent_collection=None):
    """
    Create a sub-collection item on the Internet Archive.

    The expected sub-collection hierarchy is as follows:
    top-level OSF collection -> provider collection ->
    collection for nodes with multiple children -> all only child nodes

    :param collection_id: str identifier for the sub-collection item
    :param metadata: dict extra attributes for the provider's sub-collection
        being created
    :param parent_collection: str the name of the sub-collection's parent
    :return: None
    """
    extra_metadata = metadata if metadata is not None else {}
    ia_session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )
    # Uploading a placeholder file materializes the item on archive.org;
    # the collection metadata rides along with that upload.
    subcollection = internetarchive.Item(ia_session, collection_id)
    subcollection.upload(
        files={"dummy.txt": BytesIO(b"dummy")},
        metadata={
            "mediatype": "collection",
            "collection": parent_collection,
            **extra_metadata,
        },
    )
def upload_attachment_to_internet_archive(pedido_protocol, filename):
    """Upload a previously downloaded attachment to the Internet Archive.

    :param pedido_protocol: protocol identifier of the request ("pedido")
        the attachment belongs to; used to build the IA item identifier.
    :param filename: name of the attachment file inside the downloads dir.
    :return: None. On success the local copy is deleted; on failure
        (missing file or upload error) the problem is only logged.
    """
    download_dir = flask.current_app.config['DOWNLOADS_PATH']
    # os.listdir() returns text (str) entries on Python 3, so the names can
    # be compared directly -- the old per-entry .decode('utf8') would raise
    # AttributeError on str objects.
    if filename not in os.listdir(download_dir):
        logger.info("Arquivo {!r} não existe!.".format(filename))
        # TODO: what to do when the file is not available?
        # There is already a case where the download never completes
        # because of a failure on the e-SIC server.
        return None

    item = internetarchive.Item('{prefix}_pedido_{protocol}'.format(
        prefix=flask.current_app.config['ATTACHMENT_URL_PREFIX'],
        protocol=pedido_protocol))
    metadata = dict(created_at=arrow.now().isoformat())
    # Build the path portably instead of hand-concatenating with '/'.
    filepath = os.path.join(download_dir, filename)
    result = item.upload(filepath, metadata=metadata)
    # upload() returns a list of responses, one per uploaded file.
    if not result or result[0].status_code != 200:
        # TODO: what should happen in this situation?
        logger.info("Erro ao executar upload.")
    else:
        # Only remove the local copy once the upload has succeeded.
        os.remove(filepath)
def try_mirror(self, ckan: CkanMirror) -> bool:
    """Mirror *ckan* to the Internet Archive if needed.

    :param ckan: the CkanMirror describing what to mirror.
    :return: True when the message is fully handled (not mirrorable,
        already mirrored, or uploaded successfully); False when the
        download could not be obtained.
    """
    # Nothing to do for ckans that cannot be mirrored at all.
    if not ckan.can_mirror:
        logging.info('Ckan %s cannot be mirrored', ckan.mirror_item())
        return True
    # Nothing to do if a mirror already exists.
    if ckan.mirrored(self.ia_session):
        logging.info('Ckan %s is already mirrored', ckan.mirror_item())
        return True
    download_file = ckan.open_download()
    if not download_file:
        logging.error("Failed to find or download %s", ckan.download)
        return False
    logging.info('Uploading %s', ckan.mirror_item())
    ia_item = internetarchive.Item(self.ia_session, ckan.mirror_item())
    ia_item.upload_file(download_file.name, ckan.mirror_filename(),
                        ckan.item_metadata, ckan.download_headers)
    # Optionally archive the source alongside the main download.
    source_url = ckan.source_download(self._default_branch(ckan))
    if source_url:
        with tempfile.NamedTemporaryFile() as source_tmp:
            logging.info('Attempting to archive source from %s', source_url)
            download_stream_to_file(source_url, source_tmp)
            source_tmp.flush()
            ia_item.upload_file(source_tmp.name,
                                ckan.mirror_source_filename(),
                                ckan.item_metadata,
                                ckan.source_download_headers(source_tmp.name))
    return True
def mk_mirror(target):
    '''Make the mirror'''
    # Prefix with the search-scope qualifier understood by archive.org.
    target = 'collection:' + target
    print("Attempting to download collection: " + target)
    search = ia.Search(target)
    ## The internetarchive module yields search results lazily, so collect
    ## them into a list first to know the total count up front.
    collection = list(search)
    total_item = len(collection)
    ## Go through all items of the collection and download
    for current_item, entry in enumerate(collection, start=1):
        item_id = entry['identifier']
        print('Downloading ' + str(current_item) + '/' + str(total_item)
              + '\t' + item_id)
        item = ia.Item(item_id)
        status = item.download()
        print('\t\t Download successful')
def get_new_item():
    """Return an ia Item object for an item that does not yet exist.

    The candidate identifier is derived from the current UTC timestamp.
    Up to 5 differently-suffixed names are tried before giving up, which
    makes the error message ("after 5 tries") truthful -- the original
    code only attempted one name.

    :return: an ia.Item whose identifier is not yet taken.
    :raises KeyError: if no unused item name is found after 5 tries.
    """
    now = datetime.datetime.utcnow()
    base_name = 'test_upload_iawrapper_' + now.strftime('%Y_%m_%d_%H%M%S')
    for attempt in range(5):
        # First try the bare timestamp name (backwards compatible with the
        # old behaviour), then fall back to suffixed variants if taken.
        item_name = base_name if attempt == 0 else '{}_{}'.format(base_name, attempt)
        item = ia.Item(item_name)
        if item.exists is False:
            return item
    # Py3 raise syntax; the old `raise KeyError, '...'` is a SyntaxError.
    raise KeyError('Could not find a unique item name after 5 tries')
def test_download():
    """Downloading the 'nasa' item should create a directory named after
    its identifier that contains the item's *_meta.xml file."""
    nasa = internetarchive.Item('nasa')
    target_dir = nasa.identifier
    assert not os.path.exists(target_dir)
    nasa.download()
    assert os.path.exists(target_dir)
    meta_path = os.path.join(target_dir, nasa.identifier + '_meta.xml')
    assert os.path.exists(meta_path)
    # Clean up so the test is repeatable.
    shutil.rmtree(target_dir)
def download_item_files(item_id):
    """Download the preferred file formats of one archive.org item.

    :param item_id: identifier of the item whose files are downloaded.
    """
    # XXX Add a repeatable --format flag for this rather than hard coding
    # XXX Alternatively: yaml config file
    wanted_formats = ["Comic Book RAR", "EPUB", "Animated GIF", "Text PDF",
                      "Image Container PDF"]
    item = internetarchive.Item(item_id)
    verboseout("Downloading files from " + item.identifier)
    # The two original call sites differed only in the verbose flag, so
    # pass it straight through instead of duplicating the download() call.
    item.download(concurrent=True, verbose=args.verbose,
                  ignore_existing=True, formats=wanted_formats)
def test_file():
    """Fetching 'nasa_meta.xml' via get_file() and downloading it should
    produce a local file whose size matches the reported size."""
    nasa = internetarchive.Item('nasa')
    target = 'nasa_meta.xml'
    meta_file = nasa.get_file(target)
    assert not os.path.exists(target)
    meta_file.download()
    assert os.stat(target).st_size == meta_file.size
    # Remove the downloaded file so the test is repeatable.
    os.unlink(target)
def test_file():
    """Downloading 'glogo.png' from the 'stairs' item should create a
    local file whose size matches the size reported by the archive."""
    stairs = internetarchive.Item('stairs')
    target = 'glogo.png'
    logo = stairs.file(target)
    assert not os.path.exists(target)
    logo.download()
    assert os.stat(target).st_size == logo.size
    # Remove the downloaded file so the test is repeatable.
    os.unlink(target)
def main(argv):
    """CLI entry point: inspect or modify a single archive.org item.

    :param argv: command-line argument list, parsed with docopt.
    """
    args = docopt(__doc__, argv=argv)
    item = internetarchive.Item(args['<identifier>'])

    # Check existence of item.
    if args['--exists']:
        if item.exists:
            stdout.write('{0} exists\n'.format(item.identifier))
            exit(0)
        stderr.write('{0} does not exist\n'.format(item.identifier))
        exit(1)
    # Modify metadata.
    elif args['--modify']:
        new_metadata = get_args_dict(args['--modify'])
        response = modify_metadata(args['<identifier>'], new_metadata)
        status_code = response['status_code']
        if not response['content']['success']:
            error_msg = response['content']['error']
            stderr.write('error: {0} ({1})\n'.format(error_msg, status_code))
            exit(1)
        stdout.write('success: {0}\n'.format(response['content']['log']))
    # List files, one tab-separated line each.
    elif args['--files']:
        for item_file in item.files():
            columns = (item_file.item.identifier, item_file.name,
                       item_file.source, item_file.format,
                       item_file.size, item_file.md5)
            stdout.write('\t'.join(str(col) for col in columns) + '\n')
    # List the distinct file formats present in the item.
    elif args['--formats']:
        unique_formats = {item_file.format for item_file in item.files()}
        stdout.write('\n'.join(unique_formats) + '\n')
    # Print selected metadata values; a target may be a 'nested/key/path'.
    elif args['--target']:
        values = []
        for key in args['--target']:
            if '/' in key:
                parts = key.split('/')
                value = item.metadata.get(parts[0])
                # Walk the remaining path segments, stopping on a falsy hop
                # (same short-circuit behaviour as the original loop).
                for part in parts[1:]:
                    if value:
                        value = value.get(part)
            else:
                value = item.metadata.get(key)
            if value:
                values.append(value)
        stdout.write('\t'.join(str(v) for v in values) + '\n')
    # Default action: dump the full metadata record as JSON.
    else:
        stdout.write(dumps(item.metadata) + '\n')
    exit(0)
def get_file(item_name, file_name):
    """Get a file from a newly-created item.

    Upload processing on archive.org is asynchronous, so poll up to 5
    times, waiting 30 seconds between attempts, for the file to land in
    the item.

    :param item_name: identifier of the item to poll.
    :param file_name: name of the file expected to appear in the item.
    :return: the file object once it is available.
    :raises KeyError: if the file has not appeared after 5 tries.
    """
    for _ in range(5):
        # Py3 print() call; the old `print '...'` statement is a
        # SyntaxError on Python 3. print(a, b) keeps the same output.
        print(' waiting 30 seconds for upload of', file_name)
        time.sleep(30)
        item = ia.Item(item_name)
        f = item.file(file_name)
        if f is not None:
            return f
    # Py3 raise syntax; `raise KeyError, '...'` no longer parses.
    raise KeyError('Could not retrieve file after 5 tries')
def export_to_ia(self, sha_value, **kwargs):
    """
    Called after a `Document` is signed and the hashes are calculated.

    Uploads the file in `Document.doc_file` to the Internet Archive,
    using the sha256 value (plus the file's original extension) as the
    filename within the configured IA item.
    """
    extension = os.path.splitext(self.doc_file.name)[1]
    upload_key = sha_value + extension
    archive_item = internetarchive.Item(settings.IA_ITEM)
    archive_item.upload_file(self.doc_file,
                             key=upload_key,
                             metadata=dict(creator=settings.IA_CREATOR),
                             access_key=settings.IA_ACCESS_KEY,
                             secret_key=settings.IA_SECRET_KEY)
def __init__(self, identifier='', retries=3, retrysleep=30):
    """
    This module is used for providing regular functions used for uploading
    files into the Internet Archive. It is an extension of the
    internetarchive python library, but with better error handling.

    - identifier (string): The identifier for the item.
    - retries (int): The number of times to retry a request to the server.
    - retrysleep (int): Time (in seconds) to sleep before the next request.
    """
    self.IAItem = internetarchive.Item(identifier, max_retries=retries)
    self.retries = retries
    # Files that are present by default in all Internet Archive items.
    default_suffixes = ('archive.torrent', 'files.xml', 'meta.sqlite', 'meta.xml')
    self.defaultFiles = ['%s_%s' % (identifier, suffix)
                         for suffix in default_suffixes]
continue dirty_metadata = dict((k, v) for k, v in zip(headers, row)) metadata = compile_metadata(dirty_metadata) if len(metadata.keys()) <= 1: continue else: yield metadata # main() #_________________________________________________________________________________________ if __name__ == '__main__': tab_file = sys.argv[-1] errors = [] for md in iter_csv(tab_file): item = internetarchive.Item(md['identifier']) r = item.modify_metadata(md) if r['status_code'] != 200: message = '{0}\tERROR! {1}\n'.format(md['identifier'], r['content']) sys.stderr.write(message) errors.append(r) else: message = '{0}\thttps:{1}\n'.format(md['identifier'], r['content']['log']) sys.stdout.write(message) if errors == []: sys.exit(0) else: sys.exit(1)
os.environ['AWS_ACCESS_KEY_ID'] = args.accesskey os.environ['AWS_SECRET_ACCESS_KEY'] = args.secretkey print 'Reading', str(args.sqlitefile) con = sqlite3.connect(args.sqlitefile) con.row_factory = sqlite3.Row cur = con.cursor() cur.execute('SELECT * FROM output') for row in cur: if not os.path.isfile(row['path']): print row['parent_item_id'], '|', '000', '|', row[ 'ia_identifier'], '|', row['path'] else: item = internetarchive.Item(row['ia_identifier']) result = item.upload( row['path'], metadata=dict( collection=args.collection, mediatype='audio', language='yid', title=row['item_title'], description=row['item_description'], author=row['item_author'], # title_yivo = row['title_yivo'], # author_last = row['author_last'], # author_first = row['author_first'], # reader_last = row['reader_last'], # reader_first = row['reader_first'], # author_last_eng = row['author_last_eng'],
def test_item():
    """The 'nasa' item's metadata should report its own identifier."""
    nasa = internetarchive.Item('nasa')
    assert nasa.metadata['identifier'] == 'nasa'
def test_export_to_ia(self):
    """After signing, the document file should be retrievable from the IA
    item under its sha256-based filename."""
    doc = Document.objects.get(id=1)
    expected_name = doc.sha256 + os.path.splitext(doc.doc_file.name)[1]
    archive_item = ia.Item(settings.IA_ITEM)
    uploaded = archive_item.get_file(expected_name)
    self.assertNotEqual(uploaded, None,
                        "The file is uploaded to the internetarchive")
def test_item():
    """The 'stairs' item should expose its identifier inside the nested
    metadata mapping."""
    stairs = internetarchive.Item('stairs')
    assert stairs.metadata['metadata']['identifier'] == 'stairs'
## If we have valid JSON, extract some metadata if data: metadata = {} metadata["title"] = str(data["fulltitle"].encode('ascii', 'ignore')) metadata["description"] = str(data["description"].encode('ascii', 'ignore')).replace("\n", "<br>") metadata["mediatype"] = "movies" metadata["collection"] = "opensource_movies" metadata["subject"] = myTags print("JSON parse successful! Checking identifier...") ## Check to see if our identifier is in use item = ia.get_item(sanitized) if not item.exists: ## Identifier not in use, let's upload print("Identifier cleared for use!") print("[uploading]") item = ia.Item(sanitized) response = item.upload(file, metadata=metadata, access_key=access_key, secret_key=secret_key) print("Server Response: " + str(response)) ## Check the response. An HTTP 200 is OK if "200" in str(response): print("Success, adding other items associated with " + sanitized) for otherFile in os.listdir(workingDirectory): if otherFile.startswith(commonFile) and not otherFile.endswith(".info.json"): print("Adding file: " + str(otherFile)) response = item.upload(otherFile, access_key=access_key, secret_key=secret_key) print("Server Response: " + str(response)) if "200" in str(response): print("Done adding file: " + str(otherFile) + " to item " + str(sanitized)) else: print(bcolors.FAIL + "[ERROR] Server responded with: " + str(response) + bcolors.ENDC + ". Skipping to next file for item") print("Success! Item populating at: " + bcolors.OKGREEN + "https://archive.org/details/" + sanitized + bcolors.ENDC)
## !! will probably crash after 10 or so items !! feel free to edit the
## script to make it better for bigger collections
## See http://programminghistorian.org/lessons/data-mining-the-internet-archive
## for more detailed info
import internetarchive as ia
import time

# fill this in -- searches for the ID of a collection in IA
## example of collection page: https://archive.org/details/johnjaycollegeofcriminaljustice
## the collection ID for that page is johnjaycollegeofcriminaljustice
## you can tell a page is a collection if it has a 'Spotlight Item' on the left
coll = ia.Search('collection:xxxxxxxx')

# enumerate() replaces the manual `num = num + 1` counter; the old Py2
# `print '...'` statements are SyntaxErrors on Python 3 and have been
# converted to print() calls with identical output.
for num, result in enumerate(coll.results(), start=1):  # for all items in a collection
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    item = ia.Item(itemid)
    item.download()  # download all associated files (large!)
    print('\t\t Download success.')
    print('Pausing for 40 minutes')
    time.sleep(2400)
    # IA restricts the number of things you can download. Be nice to
    # their servers -- limit how much you download, too. For me, this
    # time restriction is still not polite enough, and my connection gets
    # cut off all the dang time.