def verify_url_file_extension(url, file_exts, url_protocols=None):
    """
    Verify that a url contains a file extension and that it is allowed.

    Also checks that the protocol is whitelisted.

    @param url: the url to check
    @param file_exts: tuple of allowed file extensions
    @param url_protocols: tuple of allowed url protocols
    @return: the file extension
    @raises: common.MyError
    """
    url_protocols = url_protocols or URL_PROTOCOLS
    protocol = url.partition('://')[0]
    if protocol not in url_protocols:
        raise common.MyError(
            '{0}: Found url with a disallowed protocol'.format(url))

    # os.path.splitext always returns a 2-tuple and never raises IndexError;
    # a missing extension shows up as an empty string.
    ext = os.path.splitext(url)[1]
    if not ext:
        raise common.MyError(
            '{0}: Found url without a file extension'.format(url))
    if ext not in file_exts:
        raise common.MyError(
            '{0}: Found url with a disallowed file extension ({1})'.format(
                url, ext))
    return ext
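# Usage sketch (hypothetical url; assumes URL_PROTOCOLS includes 'https'):
#
#   ext = verify_url_file_extension(
#       'https://example.org/scans/img_01.tif', file_exts=('.tif', '.jpg'))
#   # -> '.tif'; a disallowed protocol, a missing extension or a
#   # disallowed extension each raise common.MyError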
def load_settings(args):
    """
    Load settings from file, command line or defaults.

    Command line values take precedence over settings file values. If
    neither is present then defaults are used.

    Command line > Settings file > default_options
    """
    default_options = DEFAULT_OPTIONS.copy()

    options = handle_args(args, PARAMETER_HELP.format(**default_options))

    # settings_file must be handled first
    options['settings_file'] = (options.get('settings_file')
                                or default_options.pop('settings_file'))

    # combine all loaded settings
    settings_options = common.open_and_read_file(
        options.get('settings_file'), as_json=True)
    for key, val in default_options.items():
        options[key] = options.get(key) or settings_options.get(key) or val

    # read glam-specific settings like location of mapping tables
    if not options["glam_code"]:
        err_mess = "The batch settings file ({}) is missing a GLAM code."
        raise common.MyError(err_mess.format(options.get('settings_file')))
    glam_file = os.path.join(SETTINGS_DIR, options["glam_code"])
    glam_options = common.open_and_read_file(
        "{}.json".format(glam_file), as_json=True)
    for key, val in glam_options.items():
        options[key] = val

    return options
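# Precedence sketch (hypothetical option): if DEFAULT_OPTIONS has
# 'batch_cat': None, the settings file supplies 'batch_cat' and the command
# line does not, the settings file value wins; a value parsed from the
# command line by handle_args would override both.
#
#   options = load_settings(sys.argv[1:])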
def load_mapping_lists_mappings(
        mappings_dir, update=True, mappings=None, mapping_root=None):
    """
    Add mapping lists to the loaded mappings.

    :param mappings_dir: path to directory in which mappings are found
    :param update: whether to first download the latest mappings
    :param mappings: dict to which mappings should be added. If None then a
        new dict is returned.
    :param mapping_root: root path for the mappings on wiki (required for
        an update)
    """
    mappings = mappings or {}
    mappings_dir = mappings_dir or MAPPINGS_DIR
    if update and not mapping_root:
        raise common.MyError('A mapping root is needed to load new updates.')

    ml = make_places_list(mappings_dir, mapping_root)
    mappings['places'] = ml.consume_entries(
        ml.load_old_mappings(update=update), 'name',
        require=['category', 'wikidata'])

    mk = make_keywords_list(mappings_dir, mapping_root)
    mappings['keywords'] = mk.consume_entries(
        mk.load_old_mappings(update=update), 'name',
        require='category', only='category')

    mp = make_people_list(mappings_dir, mapping_root)
    mappings['people'] = mp.consume_entries(
        mp.load_old_mappings(update=update), 'name',
        require=['creator', 'category', 'wikidata'])
    return mappings
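# Usage sketch (hypothetical wiki root; MAPPINGS_DIR from this module):
#
#   mappings = load_mapping_lists_mappings(
#       MAPPINGS_DIR, update=True,
#       mapping_root='Commons:SomeProject/mapping')
#   # -> dict with 'places', 'keywords' and 'people' keys, each mapping a
#   # name to the required fields listed above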
def makeHitlist(files, data):
    """
    Given a list of file paths and target filenames, construct a hitlist.

    The hitlist is made up of the (lower case) extension and the
    extensionless basename of each file.

    The data file should be a dict where the keys are the (extensionless)
    target filenames.

    @param files: list of file paths
    @param data: dict containing target filenames as keys
    @return: list of hitList[key] = {ext, path, data}
    """
    hitlist = []
    processed_keys = []  # stay paranoid
    for f in files:
        key, ext = os.path.splitext(os.path.basename(f))
        if key not in data:
            continue
        elif key in processed_keys:
            raise common.MyError('non-unique file key: %s' % key)
        processed_keys.append(key)
        hitlist.append({
            'path': f,
            'ext': ext.lower(),
            'data': data[key],
            'key': key
        })
    return hitlist
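# Example (hypothetical paths and data): only files whose extensionless
# basename is a key in data end up in the hitlist.
#
#   files = ['/tmp/batch/Foo_01.TIF', '/tmp/batch/skipped.tif']
#   data = {'Foo_01': {'filename': 'Foo 01.tif'}}
#   makeHitlist(files, data)
#   # -> [{'path': '/tmp/batch/Foo_01.TIF', 'ext': '.tif',
#   #      'data': {'filename': 'Foo 01.tif'}, 'key': 'Foo_01'}]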
def process_all_files(base_dir=MAIN_DIR, xml_dir=XML_DIR):
    """Identify all xml files in a directory, load the data and process."""
    # Check directories
    xml_dir = os.path.join(base_dir, xml_dir)
    for directory in (base_dir, xml_dir):
        if not os.path.isdir(directory):
            raise common.MyError(
                u'The provided directory was not a valid directory: %s'
                % directory)

    # Find candidate files
    found_files = prep.find_files(
        path=xml_dir, file_exts=('.xml', ), subdir=False)
    pywikibot.output("Found %d .xml files" % len(found_files))

    data = {}
    for xml_file in found_files:
        try:
            test = InfoEntry(load_xml(xml_file))
        except Exception as e:
            pywikibot.output(
                u"Encountered error while processing %s: %s"
                % (os.path.split(xml_file)[-1], e))
            continue
        if test.obj_id in data:
            pywikibot.output(
                u"Multiple files for same object: %s, %s, %s"
                % (test.obj_id, test.source_file,
                   data[test.obj_id]['source_file']))
            continue
        data[test.obj_id] = test.output()

    out_file = os.path.join(base_dir, u'processed_lido.json')
    common.open_and_write_file(out_file, data, as_json=True)
    pywikibot.output("Created %s with %d entries" % (out_file, len(data)))
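# Usage sketch (hypothetical path; the module defaults MAIN_DIR/XML_DIR
# are used when no arguments are given):
#
#   process_all_files()                    # use module defaults
#   process_all_files(base_dir='/data/lido')
#   # -> writes /data/lido/processed_lido.json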
def set_options(self, overriding_options):
    """
    Set various options to default or override in initialisation.

    @param overriding_options: dict of options to use instead of default
        values.
    """
    overriding_options = overriding_options or {}
    # default options
    options = {
        # the value used to indicate that a mapping is not applicable or
        # not needed (as opposed to being left unmapped).
        'na_value': '-',
        # delimiter used to separate list values.
        'list_delimiter': '/',
        # key in the mapping entry to be used for name and secondary
        # sorting. Cannot be a multi-valued (list) field.
        'name_key': 'name',
        # key in the mapping entry to be used for frequency.
        'freq_key': 'frequency'
    }
    for k, v in overriding_options.items():
        if k in options:
            options[k] = v
        else:
            raise common.MyError('{} is not a recognised option'.format(k))
    return options
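# Example: a known option can be overridden, an unknown key raises.
#
#   self.settings = self.set_options({'list_delimiter': '|'})
#   # self.settings['list_delimiter'] == '|', other options keep defaults;
#   # self.set_options({'no_such_option': 1}) raises common.MyError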
def get_glam_id(self):
    """Return the identifier used by the GLAM."""
    for (glam, idno) in self.glam_id:
        if glam == self.glam_data.get("glam_code"):
            return idno

    # without a glam_id we have to abort
    raise common.MyError('Could not find an id for this GLAM in the data')
def load_batch_settings(self, options):
    """Load batch-specific settings for categorization."""
    fpath = options.get("batch_settings")
    batch_settings = common.open_and_read_file(fpath, as_json=True)
    if ("batch_cat" not in batch_settings
            or "batch_date" not in batch_settings):
        err = "Batch settings file ({}) is missing base category or date."
        raise common.MyError(err.format(fpath))
    return batch_settings
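# Sketch of a minimal batch settings file (hypothetical values; both keys
# are required by the check above):
#
#   {
#       "batch_cat": "Media contributed by ExampleGLAM: 2020-01",
#       "batch_date": "2020-01"
#   }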
def check_for_unexpected_lists(self, data, label):
    """
    Ensure there aren't any unexpected lists.

    :param data: a single image or archive card entry
    :param label: label allowing the row to be identified in the csv
    """
    delimiter = self.settings.get('list_delimiter')
    if any(delimiter in entry for entry in data.values()):
        raise common.MyError(
            '{}: One of the columns unexpectedly contains a list\n{}'.format(
                label,
                '\n'.join('{}: {}'.format(k, v)
                          for k, v in data.items() if delimiter in v)))
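# Example (hypothetical row): with list_delimiter '/', the second column
# triggers the error and is listed in the message.
#
#   self.check_for_unexpected_lists(
#       {'motif': 'castle', 'place': 'Lund/Malmö'}, label='row 7')
#   # raises common.MyError including the line 'place: Lund/Malmö'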
def get_license_text(self):
    """Format a license template."""
    if self.copyright and self.default_copyright:
        # cannot deal with double license info yet
        raise NotImplementedError
    copyright = self.copyright or self.default_copyright

    # CC licenses are used for modern photographs
    if copyright.get('code') == 'by':
        return '{{CC-BY-4.0|%s}}' % self.get_byline()
    elif copyright.get('code') == 'by-sa':
        return '{{CC-BY-SA-4.0|%s}}' % self.get_byline()
    elif copyright.get('code') == 'pdm':
        # for PD try to get death date from creator (wikidata) else PD-70
        mapping = self.glam_info.mappings.get('people')
        # the photographer fallback is wrapped in a list so the loop below
        # can call .get('name') on it; assumes self.photographer is a dict
        # with a 'name' key
        persons = (self.creation.get('related_persons')
                   or copyright.get('persons')
                   or [self.photographer])
        death_years = []
        for person in persons:
            name = person.get('name')
            data = self.glam_info.mapped_and_wikidata(name, mapping)
            death_years.append(data.get('death_year'))
        death_years = list(filter(None, death_years))  # trim empties
        try:
            death_year = max(death_years)
        except ValueError:
            death_year = None

        if death_year and death_year < self.glam_info.pd_year:
            return '{{PD-old-auto|deathyear=%s}}' % death_year
        elif death_year and not self.is_photo:
            raise common.MyError(
                'The creator death year is not late enough for PD and '
                'this does not seem to be a photo')
        elif self.is_photo:
            return '{{PD-Sweden-photo}}'
        else:
            return '{{PD-old-70}}'
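# Illustrative outcomes (hypothetical copyright dicts):
#   {'code': 'by'}    -> '{{CC-BY-4.0|<byline>}}'
#   {'code': 'by-sa'} -> '{{CC-BY-SA-4.0|<byline>}}'
#   {'code': 'pdm'}   -> '{{PD-old-auto|deathyear=...}}' when the creator
#                        died early enough, else '{{PD-Sweden-photo}}' for
#                        photos or '{{PD-old-70}}'; a non-photo with a
#                        too-recent death year raises common.MyError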
def make_item_from_raw(entry, image_file, natmus_info):
    """
    Given the raw metadata for an item, construct a NatmusItem.

    @param entry: the raw metadata entry as a dict
    @param image_file: the image filename this item relates to
    @param natmus_info: the parent NatmusInfo instance
    @return: NatmusItem
    """
    d = entry.copy()
    # skip paintings not in wikidata
    if (d['obj_id'] not in natmus_info.wd_paintings
            and natmus_info.skip_non_wikidata):
        raise common.MyError(
            u"skip_4: "
            u"%s did not have any associated wikidata entry" % d['obj_id'])

    # add specific image info
    d['image'] = image_file
    d['photographer'] = d['images'].get(image_file)

    # collect nsid entries
    for k in d['creator'].keys():
        helpers.addOrIncrement(natmus_info.nsid, k, key='freq')
    for s in d['subjects']:
        if s.get('nsid'):
            helpers.addOrIncrement(
                natmus_info.nsid, s.get('nsid'), key='freq')
        if s.get('other_id'):
            helpers.addOrIncrement(
                natmus_info.uri_ids, s.get('other_id'), key='freq')
            natmus_info.uri_ids[s.get('other_id')]['name'] = s.get('name')

    # drop unneeded fields
    del d['images']
    return NatmusItem(d)
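# Sketch of the expected raw entry shape (illustrative, not a real record;
# only the keys this function touches are shown):
#
#   entry = {
#       'obj_id': 'NM 1234',
#       'images': {'NM_1234.tif': 'Photo: A. Photographer'},
#       'creator': {'nsid-1': {...}},
#       'subjects': [{'nsid': 'nsid-2', 'other_id': None, 'name': '...'}],
#   }
#   item = make_item_from_raw(entry, 'NM_1234.tif', natmus_info)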
def get_license_text(self):
    """Format a license template."""
    if self.license not in ('PD', 'cc0'):
        raise common.MyError(
            'A non-supported license was encountered: {}'.format(
                self.license))

    # CC0 is straightforward
    if self.license == 'cc0':
        return '{{CC0}}'

    # PD - identify creator and image type (photo/artwork)
    # creator dead more than 70 years: {{PD-old-auto}}
    # photo, creator known and image date < 1969: {{PD-Sweden-photo}}
    creator = self.get_creator_data()  # skips any uncertain
    if creator:
        death_year = creator.get('death_year')
        creation_year = utils.get_last_year(self.date_text)
        if death_year and death_year < self.smvk_info.pd_year:
            return '{{PD-old-auto|deathyear=%s}}' % death_year
        elif death_year and not self.is_photo():
            self.problems.append(
                'The creator death year ({}) is not late enough for PD '
                'and this does not seem to be a photo.'.format(death_year))
        elif self.is_photo() and creation_year and creation_year < 1969:
            return '{{PD-Sweden-photo}}'
        else:
            self.problems.append(
                'Could not determine why this image by {} is PD.'.format(
                    creator.get('name')))
    else:
        # cannot default to PD-Sweden-photo since the creator need not be
        # Swedish. Cannot default to PD-anon-70 since the date of first
        # publication is not known.
        self.problems.append(
            'The creator is unknown so PD status cannot be verified')
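# Illustrative outcomes (hypothetical creator data):
#   license 'cc0'                            -> '{{CC0}}'
#   'PD', death_year 1920 (< pd_year)        -> '{{PD-old-auto|deathyear=1920}}'
#   'PD', photo, creation_year 1950          -> '{{PD-Sweden-photo}}'
#   anything else                            -> None, with the reason
#                                               recorded in self.problems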
def run(self, in_file, base_name, update_mappings):
    """
    Entry point for outputting info data.

    Loads indata and any mappings to produce a make_info json file.

    @param in_file: filename (or tuple of such) containing the metadata
    @param base_name: base name to use for output (defaults to same as
        in_file)
    @param update_mappings: if mappings should be updated against online
        sources
    """
    if not base_name:
        if common.is_str(in_file):
            base_name, ext = os.path.splitext(in_file)
        else:
            raise common.MyError(
                'A base name must be provided if multiple in_files '
                'are provided')
    self.cwd_path = os.path.split(base_name)[0]

    raw_data = self.load_data(in_file)
    self.load_mappings(update_mappings)
    self.process_data(raw_data)
    out_data = self.make_info()

    # store output
    out_file = '%s.json' % base_name
    common.open_and_write_file(out_file, out_data, as_json=True)
    pywikibot.output('Created %s' % out_file)

    # store filenames
    out_file = '%s.filenames.txt' % base_name
    out = ''
    for k in sorted(out_data.keys()):
        out += '%s|%s\n' % (k, out_data[k]['filename'])
    common.open_and_write_file(out_file, out)
    pywikibot.output('Created %s' % out_file)
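# Usage sketch (hypothetical instance and filenames): a single in_file can
# double as the base name; multiple in_files require an explicit one.
#
#   maker.run('metadata.csv', None, update_mappings=True)
#   # -> writes metadata.json and metadata.filenames.txt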
def run(in_path, out_path, data_path, file_exts=None):
    """
    Prepare an upload.

    Prepare an upload by:
    1. Finding files in in_path (with subdirs) with a file_exts file
       extension,
    2. Matching these against the keys in the makeInfo output data,
    3. Making info files and renaming the found files (in a new target
       folder).

    @todo: throw errors on failed file read/write

    @param in_path: path to directory where unprocessed files live
    @param out_path: path to directory where renamed files and info should
        live
    @param data_path: path to .json containing makeInfo output data
    @param file_exts: tuple of allowed file extensions (case insensitive)
    """
    # Load data
    data = common.open_and_read_file(data_path, codec='utf-8', as_json=True)

    # set file_exts
    file_exts = file_exts or FILE_EXTS

    # Find candidate files
    if not os.path.isdir(in_path):
        raise common.MyError(
            'The provided inPath was not a valid directory: %s' % in_path)
    found_files = find_files(path=in_path, file_exts=file_exts)

    # Find matches
    hitlist = makeHitlist(found_files, data)

    # make info and rename
    makeAndRename(hitlist, out_path)

    # clean up any empty subdirectories
    removeEmptyDirectories(in_path)
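# Usage sketch (hypothetical paths; assumes FILE_EXTS covers the batch's
# file types):
#
#   run(in_path='/data/batch_in', out_path='/data/batch_out',
#       data_path='/data/processed_lido.json')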
def get_geo_data(self):
    """
    Find commonscat and wikidata entries for each available place level.

    Returns a dict with the most specific wikidata entry and any matching
    commonscats in decreasing order of relevance.

    If any 'other' value is matched the wikidata ids are returned and the
    categories are added as content_cats.
    """
    if (self.description_place and self.depicted_place
            and (self.description_place != self.depicted_place)):
        self.problems.append(
            'Cannot handle differing depicted_place and description_place:'
            '\nDepicted_place: {0}\nDescription_place: {1}'.format(
                self.depicted_place, self.description_place))

    depicted_place = self.depicted_place or self.description_place
    if not depicted_place:
        return {}

    if (depicted_place.get('country')
            and depicted_place.get('country').get('code') != 'Sverige'):
        self.meta_cats.add('needing categorisation (not from Sweden)')

    # set up the geo_types and their corresponding mappings ordered from
    # most to least specific
    geo_map = OrderedDict(
        [(i, self.glam_info.mappings.get(i)) for i in GEO_ORDER])
    role = depicted_place.pop('role')
    if any(key not in geo_map for key in depicted_place.keys()):
        diff = set(depicted_place.keys()) - set(geo_map.keys())
        raise common.MyError('{} should be added to GEO_ORDER'.format(
            ', '.join(diff)))

    wikidata = {}
    commonscats = []
    labels = OrderedDict()

    # handle other separately
    geo_map.pop('other')
    if depicted_place.get('other'):
        for geo_type, data in depicted_place.get('other').items():
            mapping = self.glam_info.mapped_and_wikidata(
                data.get('code'), self.glam_info.mappings['places'])
            if mapping.get('category'):
                commonscats += mapping.get('category')  # this is a list
            if mapping.get('wikidata'):
                wikidata[geo_type] = mapping.get('wikidata')
            labels[geo_type] = data.get('label')

    for geo_type, mapping in geo_map.items():
        if not depicted_place.get(geo_type):
            continue
        data = depicted_place.get(geo_type)
        # guard against unmapped codes to avoid an AttributeError on None
        mapped_data = mapping.get(data.get('code')) or {}
        if mapped_data.get('wd'):
            wikidata[geo_type] = mapped_data.get('wd')
        if mapped_data.get('commonscat'):
            commonscats.append(mapped_data.get('commonscat'))
        labels[geo_type] = data.get('label')

    # just knowing the country is pretty bad
    if len(commonscats) <= 1:
        self.meta_cats.add('needing categorisation (place)')

    return {
        'role': role,
        'wd': wikidata,
        'commonscats': commonscats,
        'labels': labels
    }
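# Sketch of a possible return value (hypothetical ids, labels and geo
# types; the actual types come from GEO_ORDER):
#
#   {
#       'role': 'depicted',
#       'wd': {'parish': 'Q10514970'},
#       'commonscats': ['Dalby, Skåne', 'Skåne'],
#       'labels': OrderedDict([('parish', 'Dalby socken'),
#                              ('province', 'Skåne')])
#   }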