def ia_modify_metadata(self, identifier, metadata):
    """Apply *metadata* to the archive.org item *identifier*.

    Uses this object's access/secret key pair. Returns True on success,
    False if the remote call raised; the failure is logged, not re-raised
    (deliberate best-effort behavior).
    """
    try:
        modify_metadata(identifier, metadata=metadata,
                        access_key=self.access_key,
                        secret_key=self.secret_key)
    except Exception as e:
        # logger.warning(): .warn() is a deprecated alias in the logging module.
        self.logger.warning('Could not modify metadata %s. Error %s', identifier, e)
        return False
    return True
def update_metadata(_id, meta, for_real=False):
    """Print, and optionally apply, per-file title updates for item *_id*.

    *meta* maps a key to {'title': ..., 'target': ...}. When *for_real* is
    False (the default) this is a dry run that only prints the planned
    change; when True it also calls modify_metadata().
    """
    # Converted Python 2 print statements to the print() function.
    print("modify_metadata(%s)" % _id)
    for item in sorted(meta):
        _md = {'title': meta[item]["title"]}
        tgt = meta[item]["target"]
        if for_real:
            print("modify_metadata(%s, metadata=%s, target='%s')"
                  % (_id, _md, tgt))
            modify_metadata(_id, metadata=_md, target=tgt)
        else:
            print(" target=%s metadata=%s" % (tgt, _md))
def update_metadata(_id, meta, for_real=False):
    """Print, and optionally apply, per-file title updates for item *_id*.

    *meta* maps a key to {'title': ..., 'target': ...}. When *for_real* is
    False (the default) this is a dry run that only prints the planned
    change; when True it also calls modify_metadata().
    """
    # First and last prints were still Python 2 statements; use print().
    print("modify_metadata(%s)" % _id)
    for item in sorted(meta):
        _md = {'title': meta[item]["title"]}
        tgt = meta[item]["target"]
        if for_real:
            print("modify_metadata(%s, metadata=%s, target='%s')"
                  % (_id, _md, tgt))
            modify_metadata(_id, metadata=_md, target=tgt)
        else:
            print(" target=%s metadata=%s" % (tgt, _md))
def upload_to_internet_archive(self, link_guid):
    """Upload the WARC for *link_guid* to the Internet Archive and set its metadata.

    Skips silently when uploads are disabled in settings, and with a message
    when the link is not eligible. On upload failure, schedules a task retry.
    Returns the upload result (truthy on success) or None when skipped.
    """
    link = Link.objects.get(guid=link_guid)
    if not settings.UPLOAD_TO_INTERNET_ARCHIVE:
        return
    if not link.can_upload_to_internet_archive():
        # Converted Python 2 print statement to print().
        print("Not eligible for upload.")
        return
    metadata = {
        "collection": settings.INTERNET_ARCHIVE_COLLECTION,
        "title": '%s: %s' % (link_guid, truncatechars(link.submitted_title, 50)),
        "mediatype": 'web',
        "description": 'Perma.cc archive of %s created on %s.' % (link.submitted_url, link.creation_timestamp,),
        "contributor": 'Perma.cc',
        "submitted_url": link.submitted_url,
        "perma_url": "http://%s/%s" % (settings.HOST, link_guid),
        "external-identifier": 'urn:X-perma:%s' % link_guid,
    }
    # set sponsor if organization exists
    if link.organization:
        metadata["sponsor"] = "%s - %s" % (link.organization, link.organization.registrar)
    identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link_guid
    with default_storage.open(link.warc_storage_file(), 'rb') as warc_file:
        success = internetarchive.upload(
            identifier,
            warc_file,
            access_key=settings.INTERNET_ARCHIVE_ACCESS_KEY,
            secret_key=settings.INTERNET_ARCHIVE_SECRET_KEY,
            retries=10,
            retries_sleep=60,
            verbose=True,
        )
        if success:
            internetarchive.modify_metadata(
                identifier,
                metadata=metadata,
            )
            link.uploaded_to_internet_archive = True
            link.save()
        else:
            # NOTE(review): if self.retry() raises (typical for task queues),
            # the print below never runs — confirm intended.
            self.retry(exc=Exception("Internet Archive reported upload failure."))
            print("Failed.")
        return success
def main(argv):
    """CLI entry point: existence check, metadata modify/append, format
    listing, or a JSON dump of the item's metadata."""
    args = docopt(__doc__, argv=argv)
    item = get_item(args['<identifier>'])

    if args['--exists']:
        # Existence check only.
        if item.exists:
            sys.stdout.write('{0} exists\n'.format(item.identifier))
            sys.exit(0)
        sys.stderr.write('{0} does not exist\n'.format(item.identifier))
        sys.exit(1)
    elif args['--modify'] or args['--append']:
        # Modify (replace) or append metadata values.
        append = bool(args['--append'])
        metadata_args = args['--modify'] or args['--append']
        metadata = get_args_dict(metadata_args)
        response = modify_metadata(args['<identifier>'], metadata, append=append)
        if not response.json()['success']:
            error_msg = response.json()['error']
            sys.stderr.write('error: {0} ({1})\n'.format(error_msg, response.status_code))
            sys.exit(1)
        sys.stdout.write('success: {0}\n'.format(response.json()['log']))
    elif args['--formats']:
        # Unique file formats present in the item.
        formats = {f.format for f in item.iter_files()}
        sys.stdout.write('\n'.join(formats) + '\n')
    else:
        # Default: dump the item metadata as JSON.
        sys.stdout.write(dumps(item.metadata) + '\n')
    sys.exit(0)
def update_mp3_metadata(self, mp3_file):
    """ Update metadata for a given file.

    :param mp3_file: string or :py:class:mp3_utility.Mp3File
    """
    if isinstance(mp3_file, str):
        mp3_file = mp3_utility.Mp3File(file_path=mp3_file, load_tags_from_file=True)
    remote_name = self.get_remote_name(mp3_file.file_path)
    remote_details = self.item_files_dict.get(remote_name, None)
    local_tags = mp3_file.metadata
    if remote_details is None:
        logging.warning("The file does not exist! Skipping.")
        return
    # Compare each remote field (defaulting to "") against the local tag;
    # any mismatch triggers a remote update. Same fields/order as before.
    expected = (
        ("artist", local_tags.artist),
        ("creator", local_tags.artist),
        ("title", local_tags.title),
        ("album", local_tags.album),
        ("album_artist", local_tags.album_artist),
    )
    if any(remote_details.get(field, "") != value for field, value in expected):
        logging.info("***Updating %s in archive item." % remote_name)
        logging.info(
            internetarchive.modify_metadata(
                self.archive_id,
                metadata=dict(title=local_tags.title,
                              album=local_tags.album,
                              album_artist=local_tags.album_artist,
                              artist=local_tags.artist,
                              creator=local_tags.artist),
                target=os.path.join("files", remote_name)))
def main(argv):
    """CLI entry point: existence check, metadata modify, file/format
    listings, targeted metadata lookup, or a full JSON dump."""
    args = docopt(__doc__, argv=argv)
    item = get_item(args['<identifier>'])

    if args['--exists']:
        # Existence check only.
        if item.exists:
            stdout.write('{0} exists\n'.format(item.identifier))
            exit(0)
        stderr.write('{0} does not exist\n'.format(item.identifier))
        exit(1)
    elif args['--modify']:
        # Apply key=value metadata changes.
        changes = get_args_dict(args['--modify'])
        resp = modify_metadata(args['<identifier>'], changes)
        code = resp['status_code']
        if not resp['content']['success']:
            stderr.write('error: {0} ({1})\n'.format(resp['content']['error'], code))
            exit(1)
        stdout.write('success: {0}\n'.format(resp['content']['log']))
    elif args['--files']:
        # One tab-separated row per file; --target selects attributes.
        for f in item.files():
            if not args['--target']:
                fields = [f.identifier, f.name, f.source, f.format, f.size, f.md5]
            else:
                fields = [f.__dict__.get(k) for k in args['--target']]
            stdout.write('\t'.join(str(x) for x in fields) + '\n')
    elif args['--formats']:
        stdout.write('\n'.join({f.format for f in item.files()}) + '\n')
    elif args['--target']:
        # Look up each key; 'a/b/c' walks nested dicts one level at a time.
        metadata = []
        for key in args['--target']:
            parts = key.split('/')
            md = item.metadata.get(parts[0])
            for part in parts[1:]:
                if md:
                    md = md.get(part)
            if md:
                metadata.append(md)
        stdout.write('\t'.join(str(x) for x in metadata) + '\n')
    else:
        stdout.write(dumps(item.metadata) + '\n')
    exit(0)
def archive_update_metadata(archive_id, metadata, session):
    """Push *metadata* to the archive.org item *archive_id*.

    Logs the outcome against *session* and returns True on HTTP 200,
    False otherwise.
    """
    response = modify_metadata(archive_id, metadata)
    if response.status_code != 200:
        log(session,
            "Failed to update metadata on archive.org: " + response.reason,
            c.SESSION_FAILED, c.LOG_ERROR)
        return False
    log(session, "Session metadata updated on archive.org", c.SESSION_SYNCED)
    return True
def main(argv):
    """CLI entry point: existence check, metadata modify, file/format
    listings, targeted metadata lookup, or a full JSON dump."""
    args = docopt(__doc__, argv=argv)
    item = internetarchive.Item(args['<identifier>'])

    if args['--exists']:
        # Existence check only.
        if item.exists:
            stdout.write('{0} exists\n'.format(item.identifier))
            exit(0)
        stderr.write('{0} does not exist\n'.format(item.identifier))
        exit(1)
    elif args['--modify']:
        # Apply key=value metadata changes.
        changes = get_args_dict(args['--modify'])
        resp = modify_metadata(args['<identifier>'], changes)
        code = resp['status_code']
        if not resp['content']['success']:
            stderr.write('error: {0} ({1})\n'.format(resp['content']['error'], code))
            exit(1)
        stdout.write('success: {0}\n'.format(resp['content']['log']))
    elif args['--files']:
        # One tab-separated row of fixed attributes per file.
        for f in item.files():
            fields = [f.item.identifier, f.name, f.source, f.format, f.size, f.md5]
            stdout.write('\t'.join(str(x) for x in fields) + '\n')
    elif args['--formats']:
        stdout.write('\n'.join({f.format for f in item.files()}) + '\n')
    elif args['--target']:
        # Look up each key; 'a/b/c' walks nested dicts one level at a time.
        metadata = []
        for key in args['--target']:
            parts = key.split('/')
            md = item.metadata.get(parts[0])
            for part in parts[1:]:
                if md:
                    md = md.get(part)
            if md:
                metadata.append(md)
        stdout.write('\t'.join(str(x) for x in metadata) + '\n')
    else:
        stdout.write(dumps(item.metadata) + '\n')
    exit(0)
def test_modify_metadata():
    """modify_metadata posts to the metadata endpoint and surfaces the JSON reply."""
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        # The responses library requires a str/bytes body; the original
        # passed a dict ({}), which responses rejects at request time.
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/test".format(protocol),
                 body="{}",
                 status=200)
        rsps.add(
            responses.POST,
            "{0}//archive.org/metadata/test".format(protocol),
            body=('{"success":true,"task_id":423444944,'
                  '"log":"https://catalogd.archive.org/log/423444944"}'),
            status=200,
        )
        r = modify_metadata("test", dict(foo=1))
        assert r.status_code == 200
        assert r.json() == {"task_id": 423444944,
                            "success": True,
                            "log": "https://catalogd.archive.org/log/423444944"}
def test_modify_metadata():
    """A successful modify returns 200 and echoes the task log info."""
    endpoint = '{0}//archive.org/metadata/test'.format(PROTOCOL)
    success_body = ('{"success":true,"task_id":423444944,'
                    '"log":"https://catalogd.archive.org/log/423444944"}')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, endpoint, body='{}')
        rsps.add(responses.POST, endpoint, body=success_body)
        result = modify_metadata('test', dict(foo=1))
        assert result.status_code == 200
        assert result.json() == {
            'task_id': 423444944,
            'success': True,
            'log': 'https://catalogd.archive.org/log/423444944'
        }
def test_modify_metadata():
    """Modifying an item with existing metadata returns 200 and the task log."""
    endpoint = '{0}//archive.org/metadata/nasa'.format(PROTOCOL)
    success_body = ('{"success":true,"task_id":423444944,'
                    '"log":"https://catalogd.archive.org/log/423444944"}')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, endpoint, body='{"metadata":{"title":"foo"}}')
        rsps.add(responses.POST, endpoint, body=success_body)
        result = modify_metadata('nasa', dict(foo=1))
        assert result.status_code == 200
        assert result.json() == {
            'task_id': 423444944,
            'success': True,
            'log': 'https://catalogd.archive.org/log/423444944'
        }
def main():
    """Add a default Kiwix description to ZIM items on archive.org that lack one."""
    desc = 'You can open ZIM files with <a href="https://www.kiwix.org/">Kiwix</a> software.'
    for i in internetarchive.search_items(
            'subject:"kiwix" AND subject:"zim"').iter_as_items():
        try:
            itemid = i.item_metadata['metadata']['identifier']
            print(itemid)
        except Exception:
            # Was a bare except: — that also swallowed SystemExit/KeyboardInterrupt.
            print('Error in', i)
            continue
        if 'description' not in i.item_metadata['metadata']:
            r = internetarchive.modify_metadata(
                itemid, metadata=dict(description=desc))
            if r.status_code == 200:
                print('Description added: %s' % (desc))
            else:
                print('Error (%s) adding description: %s' % (r.status_code, desc))
        else:
            print('Already has description: %s' % (i.item_metadata['metadata']['description']))
def main():
    """Backfill the 'genre' metadata field on kiwix ZIM items, per project."""
    genres = {
        'Gutenberg': 'Literature',
        'Khan-academy-videos': 'Course',
        'Wikibooks': 'Course',
        'Wikinews': 'News',
        'Wikipedia': 'Encyclopedia',
        'Wikiquote': 'Quotes',
        'Wikisource': 'Literature',
        'Wikispecies': 'Encyclopedia',
        'Wikiversity': 'Course',
        'Wikivoyage': 'Travel',
        'Wiktionary': 'Dictionary',
    }
    for project, genre in genres.items():
        #https://archive.org/services/docs/api/internetarchive/quickstart.html#searching
        for i in internetarchive.search_items(
                'subject:"kiwix" AND subject:"zim" AND subject:"%s"'
                % (project.lower())).iter_as_items():
            try:
                itemid = i.item_metadata['metadata']['identifier']
                print(itemid)
            except Exception:
                # Was a bare except: — that also swallowed SystemExit/KeyboardInterrupt.
                print('Error in', i)
                continue
            if 'genre' not in i.item_metadata['metadata']:
                # Only tag when the project name appears in the identifier.
                if project.lower() in itemid.lower():
                    r = internetarchive.modify_metadata(
                        itemid, metadata=dict(genre=genre))
                    if r.status_code == 200:
                        print('Genre added: %s' % (genre))
                    else:
                        print('Error (%s) adding genre: %s' % (r.status_code, genre))
                else:
                    print('Unknown project')
            else:
                print('Already has genre: %s' % (i.item_metadata['metadata']['genre']))
# Backfill openlibrary_edition / openlibrary_work identifiers onto
# archive.org items from a JSON-lines input file, pausing periodically.
fname = sys.argv[1]
ol = OpenLibrary()
n = 0
with open(fname, 'r') as f:
    # Stream the file line by line; readlines() loaded it all into memory.
    for line in f:
        data = json.loads(line)
        olid = data['openlibrary']
        ocaid = data['identifier']
        try:
            # Renamed from 'e', which was shadowed by the except clause below.
            edition = ol.get(olid)
            wolid = edition.work.olid
            assert wolid
        except requests.exceptions.HTTPError:
            print('404', olid, ocaid)
            wolid = None
        to_write = {
            'openlibrary_edition': olid
        }
        if wolid:
            to_write['openlibrary_work'] = wolid
        #print(ocaid, to_write)
        r = modify_metadata(ocaid, metadata=to_write)
        print('%s: %s' % (ocaid, r.status_code))
        n += 1
        if n > 300:
            # Throttle: rest 15 minutes after every ~300 writes.
            print('PAUSE')
            time.sleep(900)
            n = 0
def upload_to_internet_archive(self, link_guid):
    """Upload the WARC for *link_guid* to the Internet Archive and set its metadata.

    Skips silently when uploads are disabled in settings, and with a message
    when the link is not eligible. On upload failure, schedules a task retry.
    Returns the upload result (truthy on success) or None when skipped.
    """
    link = Link.objects.get(guid=link_guid)
    if not settings.UPLOAD_TO_INTERNET_ARCHIVE:
        return
    if not link.can_upload_to_internet_archive():
        # Converted Python 2 print statement to print().
        print("Not eligible for upload.")
        return
    metadata = {
        "collection": settings.INTERNET_ARCHIVE_COLLECTION,
        "title": '%s: %s' % (link_guid, truncatechars(link.submitted_title, 50)),
        "mediatype": 'web',
        "description": 'Perma.cc archive of %s created on %s.' % (
            link.submitted_url,
            link.creation_timestamp,
        ),
        "contributor": 'Perma.cc',
        "submitted_url": link.submitted_url,
        "perma_url": "http://%s/%s" % (settings.HOST, link_guid),
        "external-identifier": 'urn:X-perma:%s' % link_guid,
    }
    # set sponsor if organization exists
    if link.organization:
        metadata["sponsor"] = "%s - %s" % (link.organization,
                                           link.organization.registrar)
    identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link_guid
    with default_storage.open(link.warc_storage_file(), 'rb') as warc_file:
        success = internetarchive.upload(
            identifier,
            warc_file,
            access_key=settings.INTERNET_ARCHIVE_ACCESS_KEY,
            secret_key=settings.INTERNET_ARCHIVE_SECRET_KEY,
            retries=10,
            retries_sleep=60,
            verbose=True,
        )
        if success:
            internetarchive.modify_metadata(
                identifier,
                metadata=metadata,
            )
            link.uploaded_to_internet_archive = True
            link.save()
        else:
            # NOTE(review): if self.retry() raises (typical for task queues),
            # the print below never runs — confirm intended.
            self.retry(
                exc=Exception("Internet Archive reported upload failure."))
            print("Failed.")
        return success
def main():
    """Backfill the 'language' metadata field on kiwix ZIM items, per language code."""
    #https://meta.wikimedia.org/wiki/List_of_Wikipedias
    langs = {
        "ab": "Abkhazian", "ace": "Acehnese", "ady": "Adyghe", "af": "Afrikaans",
        "ak": "Akan", "am": "Amharic", "an": "Aragonese", "ang": "Anglo-Saxon",
        "ar": "Arabic", "arc": "Aramaic", "arz": "Egyptian Arabic", "as": "Assamese",
        "ast": "Asturian", "atj": "Atikamekw", "av": "Avar", "ay": "Aymara",
        "az": "Azerbaijani", "azb": "South Azerbaijani", "ba": "Bashkir",
        "bar": "Bavarian", "bcl": "Central Bicolano", "be": "Belarusian",
        "bg": "Bulgarian", "bi": "Bislama", "bjn": "Banjar", "bm": "Bambara",
        "bn": "Bengali", "bo": "Tibetan", "bpy": "Bishnupriya Manipuri",
        "br": "Breton", "bs": "Bosnian", "bug": "Buginese", "bxr": "Buryat",
        "ca": "Catalan", "cdo": "Min Dong", "ce": "Chechen", "ceb": "Cebuano",
        "ch": "Chamorro", "cho": "Choctaw", "chr": "Cherokee", "chy": "Cheyenne",
        "ckb": "Sorani", "co": "Corsican", "cr": "Cree", "crh": "Crimean Tatar",
        "cs": "Czech", "csb": "Kashubian", "cu": "Old Church Slavonic",
        "cv": "Chuvash", "cy": "Welsh", "da": "Danish", "de": "German",
        "din": "Dinka", "diq": "Zazaki", "dsb": "Lower Sorbian", "dty": "Doteli",
        "dv": "Divehi", "dz": "Dzongkha", "ee": "Ewe", "el": "Greek",
        "eml": "Emilian-Romagnol", "en": "English", "eo": "Esperanto",
        "es": "Spanish", "et": "Estonian", "eu": "Basque", "ext": "Extremaduran",
        "fa": "Persian", "ff": "Fula", "fi": "Finnish", "fj": "Fijian",
        "fo": "Faroese", "fr": "French", "frp": "Franco-Provençal",
        "frr": "North Frisian", "fur": "Friulian", "fy": "West Frisian",
        "ga": "Irish", "gag": "Gagauz", "gan": "Gan", "gd": "Scottish Gaelic",
        "gl": "Galician", "glk": "Gilaki", "gn": "Guarani", "gom": "Goan Konkani",
        "gor": "Gorontalo", "got": "Gothic", "gu": "Gujarati", "gv": "Manx",
        "ha": "Hausa", "hak": "Hakka", "haw": "Hawaiian", "he": "Hebrew",
        "hi": "Hindi", "hif": "Fiji Hindi", "ho": "Hiri Motu", "hr": "Croatian",
        "hsb": "Upper Sorbian", "ht": "Haitian", "hu": "Hungarian",
        "hy": "Armenian", "ia": "Interlingua", "id": "Indonesian",
        "ie": "Interlingue", "ig": "Igbo", "ik": "Inupiak", "ilo": "Ilokano",
        "inh": "Ingush", "io": "Ido", "is": "Icelandic", "it": "Italian",
        "iu": "Inuktitut", "ja": "Japanese", "jam": "Jamaican Patois",
        "jbo": "Lojban", "jv": "Javanese", "ka": "Georgian", "kaa": "Karakalpak",
        "kab": "Kabyle", "kbd": "Kabardian Circassian", "kbp": "Kabiye",
        "kg": "Kongo", "ki": "Kikuyu", "kj": "Kuanyama", "kk": "Kazakh",
        "kl": "Greenlandic", "km": "Khmer", "kn": "Kannada", "ko": "Korean",
        "koi": "Komi-Permyak", "kr": "Kanuri", "krc": "Karachay-Balkar",
        "ks": "Kashmiri", "ksh": "Ripuarian", "ku": "Kurdish", "kv": "Komi",
        "kw": "Cornish", "ky": "Kirghiz", "la": "Latin", "lad": "Ladino",
        "lb": "Luxembourgish", "lbe": "Lak", "lez": "Lezgian",
        "lfn": "Lingua Franca Nova", "lg": "Luganda", "li": "Limburgish",
        "lij": "Ligurian", "lmo": "Lombard", "ln": "Lingala", "lo": "Lao",
        "lrc": "Northern Luri", "lt": "Lithuanian", "ltg": "Latgalian",
        "lv": "Latvian", "mai": "Maithili", "mdf": "Moksha", "mg": "Malagasy",
        "mh": "Marshallese", "mhr": "Meadow Mari", "mi": "Maori",
        "min": "Minangkabau", "mk": "Macedonian", "ml": "Malayalam",
        "mn": "Mongolian", "mr": "Marathi", "mrj": "Hill Mari", "ms": "Malay",
        "mt": "Maltese", "mus": "Muscogee", "mwl": "Mirandese", "my": "Burmese",
        "myv": "Erzya", "mzn": "Mazandarani", "na": "Nauruan", "nah": "Nahuatl",
        "nap": "Neapolitan", "nds": "Low Saxon", "ne": "Nepali", "new": "Newar",
        "ng": "Ndonga", "nl": "Dutch", "nn": "Norwegian (Nynorsk)",
        "no": "Norwegian (Bokmål)", "nov": "Novial", "nrm": "Norman",
        "nso": "Northern Sotho", "nv": "Navajo", "ny": "Chichewa",
        "oc": "Occitan", "olo": "Livvi-Karelian", "om": "Oromo", "or": "Oriya",
        "os": "Ossetian", "pa": "Punjabi", "pag": "Pangasinan",
        "pam": "Kapampangan", "pap": "Papiamentu", "pcd": "Picard",
        "pdc": "Pennsylvania German", "pfl": "Palatinate German", "pi": "Pali",
        "pih": "Norfolk", "pl": "Polish", "pms": "Piedmontese",
        "pnb": "Western Punjabi", "pnt": "Pontic", "ps": "Pashto",
        "pt": "Portuguese", "qu": "Quechua", "rm": "Romansh", "rmy": "Romani",
        "rn": "Kirundi", "ro": "Romanian", "ru": "Russian", "rue": "Rusyn",
        "rw": "Kinyarwanda", "sa": "Sanskrit", "sah": "Sakha", "sat": "Santali",
        "sc": "Sardinian", "scn": "Sicilian", "sco": "Scots", "sd": "Sindhi",
        "se": "Northern Sami", "sg": "Sango", "sh": "Serbo-Croatian",
        "shn": "Shan", "si": "Sinhalese", "sk": "Slovak", "sl": "Slovenian",
        "sm": "Samoan", "sn": "Shona", "so": "Somali", "sq": "Albanian",
        "sr": "Serbian", "srn": "Sranan", "ss": "Swati", "st": "Sesotho",
        "stq": "Saterland Frisian", "su": "Sundanese", "sv": "Swedish",
        "sw": "Swahili", "szl": "Silesian", "ta": "Tamil", "tcy": "Tulu",
        "te": "Telugu", "tet": "Tetum", "tg": "Tajik", "th": "Thai",
        "ti": "Tigrinya", "tk": "Turkmen", "tl": "Tagalog", "tn": "Tswana",
        "to": "Tongan", "tpi": "Tok Pisin", "tr": "Turkish", "ts": "Tsonga",
        "tt": "Tatar", "tum": "Tumbuka", "tw": "Twi", "ty": "Tahitian",
        "tyv": "Tuvan", "udm": "Udmurt", "ug": "Uyghur", "uk": "Ukrainian",
        "ur": "Urdu", "uz": "Uzbek", "ve": "Venda", "vec": "Venetian",
        "vep": "Vepsian", "vi": "Vietnamese", "vls": "West Flemish",
        "vo": "Volapük", "wa": "Walloon", "war": "Waray-Waray", "wo": "Wolof",
        "wuu": "Wu", "xal": "Kalmyk", "xh": "Xhosa", "xmf": "Mingrelian",
        "yi": "Yiddish", "yo": "Yoruba", "za": "Zhuang", "zea": "Zeelandic",
        "zh": "Chinese", "zu": "Zulu",
    }
    for langid, langword in langs.items():
        #https://archive.org/services/docs/api/internetarchive/quickstart.html#searching
        for i in internetarchive.search_items(
                'subject:"kiwix" AND subject:"zim" AND subject:"%s"'
                % (langid)).iter_as_items():
            try:
                itemid = i.item_metadata['metadata']['identifier']
                print(itemid)
            except Exception:
                # Was a bare except: — that also swallowed SystemExit/KeyboardInterrupt.
                print('Error in', i)
                continue
            if 'language' not in i.item_metadata['metadata']:
                # Only tag when the code appears delimited in the identifier.
                if '_%s_' % (langid) in itemid:
                    r = internetarchive.modify_metadata(
                        itemid, metadata=dict(language=langword))
                    if r.status_code == 200:
                        print('Language added: %s' % (langword))
                    else:
                        print('Error (%s) adding language: %s' % (r.status_code, langword))
                else:
                    print(i.item_metadata['metadata'])
                    print('Unknown language')
            else:
                print('Already has language: %s' % (i.item_metadata['metadata']['language']))