Example #1
class PaintingsImageBot:
    """Bot to enrich, and create, for items about paintings on Wikidata."""
    def __init__(self, dict_generator, people_items):
        """Initialise the bot."""
        self.people_items = people_items
        self.generator = dict_generator
        self.repo = pywikibot.Site().data_repository()
        self.wd = WD(self.repo, edit_summary=EDIT_SUMMARY)

        # Set log file
        out_dir = path.split(__file__)[0]
        log_filename = path.join(out_dir, u'PaintingsImageBot.log')
        self.log = codecs.open(log_filename, 'a', 'utf-8')

    def run(self):
        """Start the robot."""
        self.creators = {}

        for painting_data in self.generator:
            # isolate ids
            lido_data, qid, commons_file = painting_data
            painting_item = self.wd.QtoItemPage(qid)
            self.process_painting(painting_item, lido_data, commons_file)

    def process_painting(self, item, lido_data, commons_file):
        """Process a single painting."""
        item.exists()  # load the item
        obj_id_ref = self.make_obj_id_ref(lido_data.get('obj_id'))
        # lido_ref = self.make_lido_ref(lido_data)  # make a reference object

        self.check_and_add_labels(item, lido_data)
        self.add_image_claim(item, commons_file, obj_id_ref)
        self.add_depicted_claim(item, lido_data, obj_id_ref)
        self.add_date_claim(item, lido_data, obj_id_ref)
        self.add_dimension_claims(item, lido_data, obj_id_ref)

    def add_dimension_claims(self, item, lido_data, ref):
        """
        Add height/P2048 and width/P2049 claims.

        Only add non-framed measurements with just height and width.
        """
        height_p = u'P2048'
        width_p = u'P2049'
        # diameter_p = u'P2386'
        # thickness_p = u'P2610'
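        # assumed shape of the non-framed measurements (inferred from the
        # checks below), e.g. {'height': '90', 'width': '72', 'unit': 'cm'}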
        dimensions = lido_data.get('measurements', {}).get('_')  # non-framed
        if not dimensions or not dimensions.get('unit'):
            return None
        elif not dimensions.get('width') or not dimensions.get('height') \
                or dimensions.get('depth'):
            # skip complicated cases for now
            return None
        elif not helpers.get_unit_q(dimensions.get('unit')):
            pywikibot.output(u'"%s" is an unmapped unit' %
                             dimensions.get('unit'))
            return None

        # prepare all parts before adding claims
        unit = helpers.get_unit_q(dimensions.get('unit'))
        # unit = self.wd.QtoItemPage(unit)
        unit = entity_url_hack(unit)

        height = pywikibot.WbQuantity(
            dimensions.get('height'),
            # unit=unit,
            entity=unit,
            site=self.wd.repo)
        width = pywikibot.WbQuantity(
            dimensions.get('width'),
            # unit=unit,
            entity=unit,
            site=self.wd.repo)

        # make claims
        self.wd.addNewClaim(height_p, WD.Statement(height), item, ref)
        self.wd.addNewClaim(width_p, WD.Statement(width), item, ref)

    def add_date_claim(self, item, lido_data, ref):
        """
        Add an inception/P571 claim.

        Only adds the claim if it's an exact year.
        """
        prop = u'P571'
        creation_date = lido_data.get('creation_date')
        wb_date = None
        if not creation_date:
            return None

        # exact date
        if creation_date.get('earliest') and \
                creation_date.get('earliest') == creation_date.get('latest'):
            wb_date = helpers.iso_to_WbTime(creation_date.get('earliest'))

        # make claim
        if wb_date:
            self.wd.addNewClaim(prop, WD.Statement(wb_date), item, ref)

    def add_depicted_claim(self, item, lido_data, ref):
        """Add a depicted/P180."""
        prop = u'P180'
        if not lido_data.get('subjects'):
            return None

        for subject in lido_data.get('subjects'):
            nsid = subject.get(u'other_id')
            if nsid in self.people_items:
                person_item = self.wd.QtoItemPage(self.people_items[nsid])
                self.wd.addNewClaim(prop, WD.Statement(person_item), item, ref)

    def add_image_claim(self, item, commons_file, ref):
        """
        Add an image/P18 claim.

        Only adds it if none exists already; otherwise output to the log.
        """
        prop = u'P18'
        if not commons_file:
            return

        file_page = pywikibot.FilePage(pywikibot.Site('commons', 'commons'),
                                       commons_file)

        # check if another image is already used
        if prop in item.claims and \
                not self.wd.has_claim(prop, file_page, item):
            self.log.write(
                u"%s already contains image claim: %s -> %s\n" %
                (item.title(), item.claims.get(prop)[0].getTarget().title(),
                 file_page.title()))
        else:
            self.wd.addNewClaim(prop, WD.Statement(file_page), item, ref)

    def check_and_add_labels(self, item, lido_data):
        """Process the title field add to the item if needed."""
        if not lido_data.get('title'):
            return

        for lang, value in lido_data.get('title').iteritems():
            if lang == '_':
                continue
            try:
                self.wd.addLabelOrAlias(lang, value, item, caseSensitive=False)
            except pywikibot.data.api.APIError as e:
                self.log.write(u"%s: had an error: %s\n" % (item.title(), e))

    def make_obj_id_ref(self, obj_id):
        """Make a reference object pointing to the objects collection page."""
        uri = u'http://collection.nationalmuseum.se/eMuseumPlus?' \
              u'service=ExternalInterface&module=collection&' \
              u'objectId=%s&viewType=detailView' % obj_id
        return self.make_url_reference(uri)

    def make_url_reference(self, uri):
        """
        Make a Reference object with a retrieval url and today's date.

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        date = helpers.today_as_WbTime()
        ref = WD.Reference(source_test=self.wd.make_simple_claim(u'P854', uri),
                           source_notest=self.wd.make_simple_claim(
                               u'P813', date))
        return ref

    # Not implemented due to uncertainty on referencing individual xml files
    def make_lido_ref(self, lido_data):
        """
        Make a Reference object for the dataset.

        Contains 4 parts:
        * P248: Stated in <the Nationalmuseum dataset>
        * P577: Publication date <from creation date of the document>
        * P854: Reference url <using the input url>
        * P813: Retrieval date <current date>
        """
        exit()
        # P248: Nationalmuseum dataset
        xml_file = lido_data.get('source_file')
        date = helpers.today_as_WbTime()
        pub_date = helpers.iso_to_WbTime(u'2016-09-30')
        zip_url = u'https://github.com/NationalmuseumSWE/WikidataCollection/' \
                  u'blob/master/valid_items_transform_1677.tgz'
        ref = WD.Reference(source_test=[
            self.wd.make_simple_claim(u'P854', zip_url),
            self.wd.make_simple_claim(u'P577', pub_date),
            self.wd.make_simple_claim(u'P?', xml_file),
        ],
                           source_notest=self.wd.make_simple_claim(
                               u'P813', date))
        return ref
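
# A minimal usage sketch (illustrative, not from the original source). It
# assumes a generator yielding (lido_data, qid, commons_file) tuples, as
# unpacked in run(), and a dict mapping external people ids to Q-ids:
#
#     people_items = {u'ns123': u'Q42'}
#     data = [(lido_data, u'Q1234', u'Example painting.jpg')]
#     bot = PaintingsImageBot(iter(data), people_items)
#     bot.run()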
Example #2
class KulturnavBot(object):
    """Bot to enrich and create information on Wikidata from KulturNav info."""

    EDIT_SUMMARY = 'import using #Kulturnav data'
    KULTURNAV_ID_P = '1248'
    GEONAMES_ID_P = '1566'
    SWE_KOMMUNKOD_P = '525'
    SWE_COUNTYKOD_P = '507'
    PLACE_P = '276'
    TIME_P = '585'  # date
    DATASET_Q = None
    DISAMBIG_Q = '4167410'
    IS_A_P = '31'
    CATALOG_P = '972'
    DATASET_ID = None
    ENTITY_TYPE = None
    MAP_TAG = None
    COUNTRIES = []  # a list of country Q's
    ADMIN_UNITS = []  # a list of municipality+county Q's
    locations = {}  # a dict of uuid to wikidata location matches
    current_uuid = ''  # for debugging

    def __init__(self, dictGenerator, cache_max_age, verbose=False):
        """
        Initialise the bot.

        Arguments:
            * dictGenerator - a generator that yields Dict objects.
        """
        self.generator = dictGenerator
        self.repo = pywikibot.Site().data_repository()
        self.cutoff = None
        self.verbose = verbose
        self.require_wikidata = True
        self.cache_max_age = cache_max_age

        # trigger wdq query
        self.itemIds = helpers.fill_cache(self.KULTURNAV_ID_P,
                                          cache_max_age=cache_max_age)

        # set up WikidataStuff instance
        self.wd = WD(self.repo, self.EDIT_SUMMARY)

        # load lists
        self.COUNTRIES = wdqsLookup.wdq_to_wdqs(u'TREE[6256][][31]')
        self.ADMIN_UNITS = wdqsLookup.wdq_to_wdqs(u'TREE[15284][][31]')

    @classmethod
    def set_variables(cls,
                      dataset_q=None,
                      dataset_id=None,
                      entity_type=None,
                      map_tag=None,
                      edit_summary=None):
        """Override any class variables.

        Used when command line arguments affect which type of run to do.

        @param dataset_q: the Q-id of the dataset
        @type dataset_q: str
        @param dataset_id: the uuid of the dataset
        @type dataset_id: str
        @param entity_type: the entity type to provide for the search API
        @type entity_type: str
        @param map_tag: the map_tag to use in the search API to find wikidata
            matches
        @type map_tag: str
        @param edit_summary: the edit_summary to use
        @type edit_summary: str
        """
        cls.DATASET_Q = dataset_q or cls.DATASET_Q
        cls.DATASET_ID = dataset_id or cls.DATASET_ID
        cls.ENTITY_TYPE = entity_type or cls.ENTITY_TYPE
        cls.MAP_TAG = map_tag or cls.MAP_TAG
        cls.EDIT_SUMMARY = edit_summary or cls.EDIT_SUMMARY

    def run(self):
        """Start the robot."""
        raise NotImplementedError("run() is not implemented in the base bot.")

    def runLayout(self, datasetRules, datasetProtoclaims, datasetSanityTest,
                  label, shuffle):
        """
        Execute the basic layout of a run.

        It should be called for a dataset-specific run which sets the
        parameters.

        param datasetRules: a dict of additional Rules or values to look for
        param datasetProtoclaims: a function for populating protoclaims
        param datasetSanityTest: a function which must return true for
                                 results to be written to Wikidata
        param label: the key in values to be used for label/alias.
                     set to None to skip addNames()
        param shuffle: whether name/label/alias is shuffled or not
                       i.e. if name = last, first
        """
        count = 0
        for hit in self.generator:
            # print count, self.cutoff
            if self.cutoff and count >= self.cutoff:
                break
            # some type of feedback
            if count % 100 == 0 and count > 0:
                pywikibot.output('%d entries handled...' % count)
            # Required rules/values to search for
            rules = {
                u'identifier': None,
                u'modified': None,
                u'seeAlso': None,
                u'sameAs': None,
                u'exactMatch': None,
                # not expected
                u'wikidata': None,
                u'libris-id': None,
                u'viaf-id': None,
                u'getty_aat': None,
                u'ulan': None
            }
            rules.update(datasetRules)

            # put together empty dict of values then populate
            values = {}
            for k in rules.keys():
                values[k] = None
            if not self.populateValues(values, rules, hit):
                # continue with next hit if a problem was encountered
                continue

            # find the matching wikidata item
            hitItem = self.wikidataMatch(values)
            self.current_uuid = values['identifier']
            # @todo: self.current_protoclaims would allow these to be
            #        accessed more easily

            # convert values to potential claims
            protoclaims = datasetProtoclaims(self, values)
            self.make_base_protoclaims(values, protoclaims)

            # output info for testing
            if self.verbose:
                pywikibot.output(values)
                pywikibot.output(protoclaims)
                pywikibot.output(hitItem)

            # Add information if a match was found
            if hitItem and hitItem.exists():
                # if redirect then get target instead

                # make sure it passes the sanityTests
                if not self.sanityTest(hitItem):
                    continue
                if not datasetSanityTest(self, hitItem):
                    continue

                # add name as label/alias
                if label is not None:
                    self.addNames(values[label], hitItem, shuffle=shuffle)

                # get the "last modified" timestamp and construct a Reference
                date = helpers.iso_to_WbTime(values[u'modified'])
                ref = self.make_ref(date)

                # add each property (if new) and source it
                self.addProperties(protoclaims, hitItem, ref)

            # allow for limited runs
            count += 1

        # done
        pywikibot.output(u'Handled %d entries' % count)

    def populateValues(self, values, rules, hit):
        """
        Populate values and check results given a hit.

        Given a dict of values and a kulturnav hit, populate the values
        and check if result is problem free.

        @todo: raise Error instead of using problemFree solution

        param values: dict with keys and every value as None
        param rules: a dict with keys and values either:
            None: the exact key is present in hit and its value is wanted
            a Rule: according to the class above
        param hit: a kulturnav entry
        return bool problemFree
        """
        ids = {}
        problemFree = True
        for entries in hit[u'@graph']:
            # populate ids for viaId rules
            if '@id' in entries.keys():
                if entries['@id'] in ids.keys():
                    pywikibot.output('Non-unique viaID key: \n%s\n%s' %
                                     (entries, ids[entries['@id']]))
                ids[entries['@id']] = entries

        for entries in hit[u'@graph']:
            # handle rules
            for key, rule in rules.iteritems():
                val = None
                if rule is None:
                    if key in entries.keys():
                        val = entries[key]
                elif isinstance(rule, Rule):
                    val = rule.resolve(entries, ids)

                # test and register found value
                if val is not None:
                    if values[key] is None:
                        values[key] = val
                    else:
                        pywikibot.output(u'duplicate entries for %s' % key)
                        problemFree = False

        # the minimum which must have been identified
        if values[u'identifier'] is None:
            raise pywikibot.Error(u'Could not isolate the identifier from the '
                                  u'KulturNav object! JSON layout must have '
                                  u'changed. Crashing!')

        # dig into sameAs/exactMatch and seeAlso
        KulturnavBot.set_sameas_values(values)

        # only look at seeAlso if we found no Wikidata link and require one
        if self.require_wikidata and \
                (not values[u'wikidata'] and values[u'seeAlso']):
            values[u'seeAlso'] = helpers.listify(values[u'seeAlso'])
            for sa in values[u'seeAlso']:
                if u'wikipedia' in sa:
                    pywikibot.output(u'Found a Wikipedia link but no '
                                     u'Wikidata link: %s %s' %
                                     (sa, values[u'identifier']))
            problemFree = False

        if not problemFree:
            pywikibot.output(u'Found an issue with %s (%s), skipping' %
                             (values['identifier'], values['wikidata']))
        return problemFree

    def sanityTest(self, hitItem):
        """
        Execute the generic sanity test which is run independent of dataset.

        return bool
        """
        return self.withoutClaimTest(hitItem, self.IS_A_P, self.DISAMBIG_Q,
                                     u'disambiguation page')

    def withoutClaimTest(self, hitItem, P, Q, descr):
        """
        Execute base test that an item does not contain a particular claim.

        param hitItem: item to check
        param P: the property to look for
        param Q: the Q claim to look for
        param descr: a descriptive text
        return bool
        """
        P = u'P%s' % P.lstrip('P')
        testItem = self.wd.QtoItemPage(Q)
        if self.wd.has_claim(P, testItem, hitItem):
            pywikibot.output(u'%s is matched to %s, '
                             u'FIXIT' % (hitItem.title(), descr))
            return False
        else:
            return True

    def withClaimTest(self, hitItem, P, Q, descr, orNone=True):
        """
        Execute base test that an item contains a certain claim.

        param hitItem: item to check
        param P: the property to look for
        param Q: (list) of Q claim to look for
        param descr: a descriptive text
        param orNone: if complete absence of the Property is also ok
        return bool
        """
        P = u'P%s' % P.lstrip('P')
        Q = helpers.listify(Q)
        testItems = []
        for q in Q:
            testItems.append(self.wd.QtoItemPage(q))
        # check claims
        if P in hitItem.claims.keys():
            for testItem in testItems:
                if self.wd.has_claim(P, testItem, hitItem):
                    return True
            else:  # for-else: no matching claim was found
                pywikibot.output(u'%s is identified as something other '
                                 u'than a %s. Check!' %
                                 (hitItem.title(), descr))
                return False
        elif orNone:  # no P claim
            return True
        return False  # property absent and orNone not allowed

    @staticmethod
    def set_sameas_values(values):
        """Isolate external identifiers through sameAs and exactMatch.

        @param values: All extracted values
        @type values: dict
        """
        # merge sameAs and exactMatch
        match = helpers.bundle_values(
            [values[u'sameAs'], values[u'exactMatch']]) or []

        # dig into sameAs/exactMatch and seeAlso
        for sa in match:
            if u'wikidata' in sa:
                values[u'wikidata'] = sa.split('/')[-1]
            elif u'libris-id' in values.keys() and \
                    u'libris.kb.se/auth/' in sa:
                values[u'libris-id'] = sa.split('/')[-1]
            elif u'viaf-id' in values.keys() and \
                    u'viaf.org/viaf/' in sa:
                values[u'viaf-id'] = sa.split('/')[-1]
            elif u'getty_aat' in values.keys() and \
                    u'vocab.getty.edu/aat/' in sa:
                values[u'getty_aat'] = sa.split('/')[-1]
            elif u'ulan' in values.keys() and \
                    u'vocab.getty.edu/ulan/' in sa:
                values[u'ulan'] = sa.split('/')[-1]

    def make_base_protoclaims(self, values, protoclaims):
        """Construct the protoclaims common for all KulturnavBots.

        Adds the claim to the protoclaims dict.

        @param values: the values extracted using the rules
        @type values: dict
        @param protoclaims: the dict of claims to add
        @type protoclaims: dict
        """
        # kulturnav protoclaim incl. qualifier
        protoclaims[u'P%s' % self.KULTURNAV_ID_P] = \
            WD.Statement(values[u'identifier']).addQualifier(
                WD.Qualifier(
                    P=self.CATALOG_P,
                    itis=self.wd.QtoItemPage(self.DATASET_Q)),
                force=True)

        # authority control protoclaims
        if values.get(u'libris-id'):
            protoclaims[u'P906'] = WD.Statement(values[u'libris-id'])
        if values.get(u'viaf-id'):
            protoclaims[u'P214'] = WD.Statement(values[u'viaf-id'])
        if values.get(u'getty_aat'):
            protoclaims[u'P1014'] = WD.Statement(values[u'getty_aat'])
        if values.get(u'ulan'):
            protoclaims[u'P245'] = WD.Statement(values[u'ulan'])

    def wikidataMatch(self, values):
        """
        Find the matching wikidata item.

        Checks Wikidata first, then kulturNav.

        return ItemPage|None the matching item
        """
        if values[u'identifier'] in self.itemIds:
            hitItemTitle = u'Q%s' % \
                self.itemIds.get(values[u'identifier'])

            if not values[u'wikidata'] and not self.require_wikidata:
                # i.e. uuid has been supplied manually and exists on wikidata
                pass
            elif values[u'wikidata'] != hitItemTitle:
                # this may be caused by either being a redirect
                wd = self.wd.QtoItemPage(values[u'wikidata'])
                wi = self.wd.QtoItemPage(hitItemTitle)
                if wd.isRedirectPage() and wd.getRedirectTarget() == wi:
                    pass
                elif wi.isRedirectPage() and wi.getRedirectTarget() == wd:
                    pass
                else:
                    pywikibot.output(u'Identifier mismatch (skipping): '
                                     u'%s, %s, %s' %
                                     (values[u'identifier'],
                                      values[u'wikidata'], hitItemTitle))
                    return None
        elif values[u'wikidata']:
            hitItemTitle = values[u'wikidata']
        else:
            # no match found
            return None

        # create ItemPage, bypassing any redirect
        hitItem = self.wd.bypassRedirect(self.wd.QtoItemPage(hitItemTitle))
        # in case of redirect
        values[u'wikidata'] = hitItem.title()

        return hitItem

    def addNames(self, names, hitItem, shuffle=False):
        """
        Prepare a nameObj or a list of such for add_label_or_alias().

        param shuffle: bool; if the name order is "last, first" then this
                       creates a locally rearranged copy
        """
        if names:
            if shuffle:
                namelist = []
                if isinstance(names, dict):
                    s = KulturnavBot.shuffle_names(names)
                    if s is not None:
                        namelist.append(s)
                elif isinstance(names, list):
                    for n in names:
                        s = KulturnavBot.shuffle_names(n)
                        if s is not None:
                            namelist.append(s)
                else:
                    pywikibot.output(u'unexpectedly formatted name '
                                     u'object: %s' % names)
                if namelist:
                    self.add_label_or_alias(namelist, hitItem)
            else:
                self.add_label_or_alias(names, hitItem)

    def addProperties(self, protoclaims, hitItem, ref):
        """
        Add each property (if new) and source it.

        param protoclaims: a dict of claims with a
            key: Prop number
            val: Statement|list of Statements
        param hitItem: the target entity
        param ref: WD.Reference
        """
        for pcprop, pcvalue in protoclaims.iteritems():
            if pcvalue:
                if isinstance(pcvalue, list):
                    pcvalue = set(pcvalue)  # eliminate potential duplicates
                    for val in pcvalue:
                        # check if None or a Statement(None)
                        if (val is not None) and (not val.isNone()):
                            self.wd.addNewClaim(pcprop, val, hitItem, ref)
                            # reload item so that next call is aware of changes
                            hitItem = self.wd.QtoItemPage(hitItem.title())
                            hitItem.exists()
                elif not pcvalue.isNone():
                    self.wd.addNewClaim(pcprop, pcvalue, hitItem, ref)

    # KulturNav specific functions
    def dbpedia2Wikidata(self, item):
        """
        Convert dbpedia reference to the equivalent Wikidata item, if present.

        param item: dict with @language, @value keys
        return pywikibot.ItemPage|None
        """
        if KulturnavBot.foobar(item):
            return
        if not all(x in item.keys() for x in (u'@value', u'@language')):
            pywikibot.output(u'invalid dbpedia entry: %s' % item)
            exit(1)

        # any site will work, this is just an example
        site = pywikibot.Site(item[u'@language'], 'wikipedia')
        page = pywikibot.Page(site, item[u'@value'])
        if page.properties().get(u'wikibase_item'):
            qNo = page.properties()[u'wikibase_item']
            return self.wd.QtoItemPage(qNo)

    def db_gender(self, value):
        """Match gender values to items.

        Note that this returns a Statement, unlike most other functions.

        @param value: The gender value
        @type value: str
        @return: The gender item as a statement
        @rtype: WD.Statement or None
        """
        known = {
            u'male': u'Q6581097',
            u'female': u'Q6581072',
            u'unknown': u'somevalue'  # a special case
        }
        if value not in known.keys():
            pywikibot.output(u'invalid gender entry: %s' % value)
            return

        if known[value] in (u'somevalue', u'novalue'):
            return WD.Statement(known[value], special=True)
        else:
            return WD.Statement(self.wd.QtoItemPage(known[value]))

    def db_name(self, name_obj, typ, limit=75):
        """Check if there is an item matching the name.

        A wrapper for helpers.match_name() to send it the relevant part of a
        nameObj.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
        @type name_obj: dict
        @param typ: The name type (either 'lastName' or 'firstName')
        @type typ: str
        @param limit: Number of hits before skipping (defaults to 75,
            ignored if onLabs)
        @type limit: int
        @return: A matching item, if any
        @rtype: pywikibot.ItemPage, or None
        """
        return helpers.match_name(name_obj['@value'],
                                  typ,
                                  self.wd,
                                  limit=limit)

    def location2Wikidata(self, uuid):
        """
        Get location from kulturNav uuid.

        Given a kulturNav uuid or url this checks if that contains a
        GeoNames url and, if so, connects that to a Wikidata object
        using the GEONAMES_ID_P property (if any).

        NOTE that the WDQ results may be outdated
        return pywikibot.ItemPage|None
        """
        # Check if uuid
        if not self.is_uuid(uuid):
            return None
        # Convert url to uuid
        if uuid.startswith(u'http://kulturnav.org'):
            uuid = uuid.split('/')[-1]
        # Check if already stored
        if uuid in self.locations.keys():
            if self.locations[uuid] is None:
                return None
            else:
                qNo = u'Q%d' % self.locations[uuid]
                return self.wd.QtoItemPage(qNo)

        # retrieve various sources
        # @todo: this can be more streamlined by including wdq query for geonames
        #       in that method. Possibly sharing the same "look-up and filter"
        #       mechanism for both.
        #       and then using self.locations[uuid] = self.extract... (which
        #       returns qid or None) then (after both have been processed)
        #       checking self.locations.get(uuid) before
        #       making an ItemPage
        #
        # @todo: change self.locations and self.ADMIN_UNITS to include the
        #        Q prefix (and thus have the methods return that)
        geo_sources = self.get_geo_sources(uuid)
        kulturarvsdata = self.extract_kulturarvsdata_location(geo_sources)
        if kulturarvsdata:
            self.locations[uuid] = kulturarvsdata
            qNo = u'Q%d' % self.locations[uuid]
            return self.wd.QtoItemPage(qNo)

        # retrieve hit through geonames-lookup
        geonames = KulturnavBot.extract_geonames(geo_sources)
        if geonames:
            # store as a resolved hit, in case wdq yields nothing
            self.locations[uuid] = None
            wdqQuery = u'STRING[%s:"%s"]' % (self.GEONAMES_ID_P, geonames)
            wdqResult = wdqsLookup.wdq_to_wdqs(wdqQuery)
            if wdqResult and len(wdqResult) == 1:
                self.locations[uuid] = wdqResult[0]
                qNo = u'Q%d' % self.locations[uuid]
                return self.wd.QtoItemPage(qNo)
            # else:
            # go to geonames and find wikidata from there
            # add to self.locations[uuid]
            # add GEONAMES_ID_P to the identified wikidata

        # no (clean) hits
        return None

    def get_geo_sources(self, uuid):
        """Extract any geosources from a kulturNav uuid.

        Given a kulturNav uuid return the corresponding properties of
        that target which are likely to contain geosources.

        @param uuid: uuid to check
        @type uuid: str
        @return: the matching properties
        @rtype: list of dicts
        """
        # debugging
        if not self.is_uuid(uuid):
            return []

        query_url = 'http://kulturnav.org/api/%s'
        json_data = json.load(urllib2.urlopen(query_url % uuid))
        sources = []
        if json_data.get(u'properties'):
            same_as = json_data.get('properties').get('entity.sameAs')
            if same_as:
                sources += same_as
            source_uri = json_data.get('properties') \
                                  .get('superconcept.sourceUri')
            if source_uri:
                sources += source_uri
        return sources

    @staticmethod
    def extract_geonames(sources):
        """Return any geonames ID given a list of get_geo_sources().

        @param sources: output of get_geo_sources()
        @type sources: list of dicts
        @return: geonames id
        @rtype: str or None
        """
        needle = 'http://sws.geonames.org/'
        for s in sources:
            if s.get('value') and s.get('value').startswith(needle):
                return s.get('value').split('/')[-1]
        return None

    def extract_kulturarvsdata_location(self, sources):
        """Return any qids matching kulturarvsdata geo authorities.

        @param sources: output of get_geo_sources()
        @type sources: list of dicts
        @return: the matching qid (without Q-prefix)
        @rtype: str or None
        @raises pywikibot.Error
        """
        needle = u'http://kulturarvsdata.se/resurser/aukt/geo/'
        for s in sources:
            if s.get('value') and s.get('value').startswith(needle):
                s = s.get('value').split('/')[-1]
                wdq_query = None
                if s.startswith('municipality#'):
                    code = s.split('#')[-1]
                    wdq_query = u'STRING[%s:"%s"]' % (self.SWE_KOMMUNKOD_P,
                                                      code)
                elif s.startswith('county#'):
                    code = s.split('#')[-1]
                    wdq_query = u'STRING[%s:"%s"]' % (self.SWE_COUNTYKOD_P,
                                                      code)
                elif s.startswith('country#'):
                    pass  # handle via geonames instead
                elif s.startswith('parish#'):
                    pass  # no id's in wikidata
                else:
                    raise pywikibot.Error(u'Unhandled KulturarvsdataLocation '
                                          u'prefix: %s' % s)

                if wdq_query:
                    # only here if a municipality or county was found
                    wdq_result = wdqsLookup.wdq_to_wdqs(wdq_query)
                    if wdq_result and len(wdq_result) == 1:
                        self.ADMIN_UNITS.append(wdq_result[0])
                        return wdq_result[0]
        return None

    def getLocationProperty(self, item, strict=True):
        """
        Return appropriate location property for an item.

        Given an ItemPage this returns the suitable property which
        should be used to indicate its location.
        P17  - country
        P131 - within administrative unit
        P276 - place

        param item: pywikibot.ItemPage|None
        param strict: bool; if False, fall back to place when neither country
                      nor admin_unit matched
        return string|None
        """
        if item is not None:
            q = int(item.title()[1:])
            if q in self.COUNTRIES:
                return u'P17'
            elif q in self.ADMIN_UNITS:
                return u'P131'
            elif not strict:
                return u'P%s' % self.PLACE_P
            elif self.verbose:
                item.exists()
                pywikibot.output(u'Could not set location property for: '
                                 u'%s (%s)' %
                                 (item.title(), item.labels.get('sv')))
        return None

    def kulturnav2Wikidata(self, uuid):
        """Return Wikidata entity connected to a kulturNav uid or url.

        Relies on the KULTURNAV_ID_P property (if any) to get the connection.

        NOTE that the WDQ results may be outdated
        @param uuid: a kulturNav uuid or url
        @type uuid: str
        @return: the matching Wikidata item page
        @rtype: pywikibot.ItemPage or None
        """
        # debugging
        if not self.is_uuid(uuid):
            return None

        # Convert url to uuid
        if uuid.startswith(u'http://kulturnav.org'):
            uuid = uuid.split('/')[-1]

        if uuid in self.itemIds.keys():
            qNo = u'Q%d' % self.itemIds[uuid]
            return self.wd.QtoItemPage(qNo)
        else:
            return None

    def is_uuid(self, uuid):
        """Test if a string really is a uuid.

        @param uuid: uuid to test
        @type uuid: str
        @return: whether the test passed
        @rtype: bool
        """
        if not helpers.is_str(uuid):
            pywikibot.output(u'Not a uuid in %s: %s' %
                             (self.current_uuid, uuid))
            return False

        uuid = uuid.split('/')[-1]  # in case of url
        pattern = r'[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}' \
                  r'\-[0-9a-f]{4}\-[0-9a-f]{12}'
        m = re.search(pattern, uuid)
        if not m or m.group(0) != uuid:
            pywikibot.output(u'Not a uuid in %s: %s' %
                             (self.current_uuid, uuid))
            return False

        return True

    @staticmethod
    def shuffle_names(name_obj):
        """Detect a "Last, First" string and return as "First Last".

        A wrapper for helpers.reorder_names() to send it the relevant part of a
        name_obj.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
        @type name_obj: dict
        @return: the reordered name_obj or None if reorder_names failed
        @rtype: dict or None
        """
        name = helpers.reorder_names(name_obj['@value'])
        if name is None:
            return None
        name_obj = name_obj.copy()
        name_obj['@value'] = name
        return name_obj

    def make_ref(self, date):
        """Make a correctly formatted ref object for claims.

        Contains 4 parts:
        * P248: Stated in <the kulturnav dataset>
        * P577: Publication date <from the document>
        * P854: Reference url <using the current uuid>
        * P813: Retrieval date <current date>

        P854
        Should be in source_test (after retroactively fixing older references)
        but by being in source_notest we ensure that duplicate uuids don't
        source the statement twice.

        @param date: The "last modified" time of the document
        @type date: pywikibot.WbTime
        @return: the formatted reference
        @rtype: WD.Reference
        """
        reference_url = 'http://kulturnav.org/%s' % self.current_uuid
        ref = WD.Reference(
            source_test=self.wd.make_simple_claim(
                'P248', self.wd.QtoItemPage(self.DATASET_Q)),
            source_notest=[
                self.wd.make_simple_claim('P577', date),
                self.wd.make_simple_claim('P854', reference_url),
                self.wd.make_simple_claim('P813', helpers.today_as_WbTime())
            ])
        return ref

    def add_label_or_alias(self, name_obj, item, case_sensitive=False):
        """Add a name as either a label (if none already) or an alias.

        Essentially a filter for the more generic method in WikidataStuff.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
                        or a list of such
        @type name_obj: dict or list of dict
        @param item: the item to which the label/alias should be added
        @type item: pywikibot.ItemPage
        @param case_sensitive: whether the comparison is case sensitive
        @type case_sensitive: bool
        """
        # for a list of entries
        if isinstance(name_obj, list):
            for n in name_obj:
                self.add_label_or_alias(n, item, case_sensitive=case_sensitive)
                # reload item so that next call is aware of any changes
                item = self.wd.QtoItemPage(item.title())
                item.exists()
            return

        # for a single entry
        self.wd.addLabelOrAlias(name_obj['@language'],
                                name_obj['@value'],
                                item,
                                caseSensitive=case_sensitive)

    @staticmethod
    def get_kulturnav_generator(uuids, delay=0):
        """Generate KulturNav items from a list of uuids.

        @param uuids: uuids to request items for
        @type uuids: list of str
        @param delay: delay in seconds between each kulturnav request
        @type delay: int
        @yield: dict
        """
        for uuid in uuids:
            time.sleep(delay)
            try:
                json_data = KulturnavBot.get_single_entry(uuid)
            except pywikibot.Error as e:
                pywikibot.output(e)
            else:
                yield json_data

    @classmethod
    def get_search_results(cls, max_hits=250, require_wikidata=True):
        """Make a KulturNav search for all items of a given type in a dataset.

        @param max_hits: the maximum number of results to request at once
        @type max_hits: int
        @param require_wikidata: whether to filter results on having a wikidata
            url in sameAs
        @type require_wikidata: bool
        @return: the resulting uuids
        @rtype: list of str
        """
        search_url = 'http://kulturnav.org/api/search/' + \
                     'entityType:%s,' % cls.ENTITY_TYPE + \
                     'entity.dataset_r:%s' % cls.DATASET_ID
        q = None  # the map_tag query

        # only filter on MAP_TAG if filtering on wikidata
        if require_wikidata:
            search_url += ',%s' % cls.MAP_TAG + ':%s/%d/%d'
            q = '*%2F%2Fwww.wikidata.org%2Fentity%2FQ*'
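            # q is the url-encoded form of '*//www.wikidata.org/entity/Q*'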
        else:
            search_url += '/%d/%d'

        # start search
        results = []
        offset = 0
        overview_page = KulturnavBot.get_single_search_results(
            search_url, q, offset, max_hits)
        while overview_page:
            for item in overview_page:
                uuid = item[u'uuid']
                if not require_wikidata or \
                        KulturnavBot.has_wikidata_in_sameas(item, cls.MAP_TAG):
                    results.append(uuid)

            # continue
            offset += max_hits
            overview_page = KulturnavBot.get_single_search_results(
                search_url, q, offset, max_hits)

        # some feedback
        pywikibot.output(u'Found %d matching entries in Kulturnav' %
                         len(results))
        return results

    @staticmethod
    def has_wikidata_in_sameas(item, map_tag):
        """Check if a wikidata url is present in the sameAs property.

        @param item: the search item to check
        @type item: dict
        @param map_tag: the tag to use (concepts don't use sameAs)
        @type map_tag: str
        @rtype: bool
        """
        # The patterns used if we filter on wikidata
        patterns = (u'http://www.wikidata.org/entity/',
                    u'https://www.wikidata.org/entity/')

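        # trim the search-API type suffix off the tag to get the property
        # key, e.g. a hypothetical 'entity.sameAs_s' -> 'entity.sameAs'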
        same_as = item[u'properties'][map_tag[:map_tag.rfind('_')]]
        for s in same_as:
            if s[u'value'].startswith(patterns):
                return True
        return False

    @staticmethod
    def get_single_search_results(search_url, q, offset, max_hits):
        """Retrieve the results from a single API search.

        @param search_url: basic url from which to build the search
        @type search_url: str
        @param q: the map_tag query, if any
        @type q: str or None
        @param offset: the offset in search results
        @type offset: int
        @param max_hits: the maximum number of results to request at once
        @type max_hits: int
        @return: the search result object
        @rtype: dict
        """
        actual_url = ''
        if q is None:
            actual_url = search_url % (offset, max_hits)
        else:
            actual_url = search_url % (q, offset, max_hits)

        search_page = urllib2.urlopen(actual_url)
        return json.loads(search_page.read())

    @staticmethod
    def get_single_entry(uuid):
        """Retrieve the data on a single kulturnav entry.

        Raises a pywikibot.Error if:
        * @graph is not a key in the json response
        * a non-json response is received

        @param uuid: the uuid for the target item
        @type uuid: str
        @return: the entry object
        @rtype: dict
        @raise: pywikibot.Error
        """
        query_url = 'http://kulturnav.org/%s?format=application/ld%%2Bjson'
        item_url = query_url % uuid
        try:
            record_page = urllib2.urlopen(item_url)
            json_data = json.loads(record_page.read())
        except ValueError as e:
            raise pywikibot.Error('Error loading KulturNav item at '
                                  '%s with error %s' % (item_url, e))
        if json_data.get(u'@graph'):
            return json_data
        else:
            raise pywikibot.Error('No @graph in KulturNav reply at '
                                  '%s\n data: %s' % (item_url, json_data))

    @classmethod
    def main(cls, *args):
        """Start the bot from the command line."""
        options = cls.handle_args(args)

        search_results = cls.get_search_results(
            max_hits=options['max_hits'],
            require_wikidata=options['require_wikidata'])
        kulturnav_generator = cls.get_kulturnav_generator(
            search_results, delay=options['delay'])

        kulturnav_bot = cls(kulturnav_generator, options['cache_max_age'])
        kulturnav_bot.cutoff = options['cutoff']
        kulturnav_bot.require_wikidata = options['require_wikidata']
        kulturnav_bot.run()

    @classmethod
    def run_from_list(cls, uuids, *args):
        """Start the bot with a list of uuids."""
        options = cls.handle_args(args)

        kulturnav_generator = cls.get_kulturnav_generator(
            uuids, delay=options['delay'])
        kulturnav_bot = cls(kulturnav_generator, options['cache_max_age'])
        kulturnav_bot.cutoff = options['cutoff']
        kulturnav_bot.require_wikidata = False
        kulturnav_bot.run()

    @staticmethod
    def handle_args(args):
        """Parse and load all of the basic arguments.

        Also passes any needed arguments on to pywikibot and sets any defaults.

        @param args: arguments to be handled
        @type args: list of strings
        @return: the parsed options
        @rtype: dict
        """
        options = {
            'cutoff': None,
            'max_hits': 250,
            'delay': 0,
            'require_wikidata': True,
            'cache_max_age': 0,
        }

        for arg in pywikibot.handle_args(args):
            option, sep, value = arg.partition(':')
            if option == '-cutoff':
                options['cutoff'] = int(value)
            elif option == '-max_hits':
                options['max_hits'] = int(value)
            elif option == '-delay':
                options['delay'] = int(value)
            elif option == '-any_item':
                options['require_wikidata'] = False
            elif option == '-wdq_cache':
                options['cache_max_age'] = int(value)

        return options

    @staticmethod
    def foobar(item):
        """Badly named escape mechanism for list results."""
        if isinstance(item, list):
            pywikibot.output(FOO_BAR)
            return True
        return False
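
# A minimal subclass sketch (illustrative, not from the original source).
# A dataset-specific bot overrides the class variables via set_variables()
# and implements run(), typically by delegating to runLayout():
#
#     class MyDatasetBot(KulturnavBot):
#         def run(self):
#             self.runLayout(datasetRules={},
#                            datasetProtoclaims=lambda bot, values: {},
#                            datasetSanityTest=lambda bot, item: True,
#                            label=None,
#                            shuffle=False)
#
#     MyDatasetBot.set_variables(dataset_q='Q1234', dataset_id='a-uuid',
#                                entity_type='Person',
#                                map_tag='entity.sameAs_s')
#     MyDatasetBot.main()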
Example #3
class PaintingsBot:
    """Bot to enrich, and create, for items about paintings on Wikidata."""
    def __init__(self, dict_generator, painting_id_prop, cache_max_age=0):
        """Initiate the bot, loading files and querying WDQ.

        @param dict_generator: The generator for the Europeana painting objects
        @type dict_generator: generator (that yields Dict objects).
        @param painting_id_prop: the P-id of the painting-id property
        @type painting_id_prop: str
        @param cache_max_age: Max age of local wdq cache, defaults to 0
        @type cache_max_age: int
        """
        self.generator = dict_generator
        self.repo = pywikibot.Site().data_repository()
        self.commons = pywikibot.Site(u'commons', u'commons')
        self.wd = WD(self.repo)
        self.add_new = False  # If new objects should be created
        self.skip_miniatures = True  # If (new) miniatures should be skipped

        # Load prefixes and find allowed collections
        collections = set([INSTITUTION_Q])
        self.mappings = helpers.load_json_file('mappings.json',
                                               force_path=__file__)
        self.prefix_map = self.mappings['prefix_map']
        self.bad_prefix = self.mappings['bad_prefix']
        for p, k in self.prefix_map.iteritems():
            if k['subcol'] is not None:
                collections.add(k['subcol'].strip('Q'))
        self.collections = list(collections)

        # Set log file
        self.log = codecs.open(u'nationalmuseumSE.log', 'a', 'utf-8')

        # Load creator dump file
        self.creator_dump = helpers.load_json_file('Oku_NM_arbetskopia.json',
                                                   force_path=__file__)

        # hard-coded anons e.g. "unknown swedish 17th century"
        anons = helpers.load_json_file('anons.json', force_path=__file__)

        # prepare WDQ painting query
        query = u'CLAIM[195:%s] AND CLAIM[%s]' % \
                (',195:'.join(self.collections), painting_id_prop)
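        # e.g. (hypothetical) collections ['123', '456'] and painting_id_prop
        # '217' give u'CLAIM[195:123,195:456] AND CLAIM[217]'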
        self.painting_ids = helpers.fill_cache(painting_id_prop,
                                               queryoverride=query,
                                               cache_max_age=cache_max_age)

        # prepare WDQ artist query (nat_mus_id - Q_id pairs)
        self.artist_ids = helpers.fill_cache('P2538',
                                             cache_max_age=cache_max_age)
        # add anons
        for a in anons:
            self.artist_ids[a] = ANON_Q

        self.painting_id_prop = 'P%s' % painting_id_prop

    def run(self):
        """Start the robot."""
        self.creators = {}

        for painting in self.generator:
            # isolate ids
            ids = painting['object']['proxies'][0]['dcIdentifier']['def']
            painting_id = ids[0].replace('Inv Nr.:', '').strip('( )')
            obj_id = ids[1]

            # Museum contains several sub-collections. Only handle mapped ones
            if painting_id.split(' ')[0] in self.prefix_map.keys():
                self.process_painting(painting, painting_id, obj_id)
            elif painting_id.split(' ')[0] not in self.bad_prefix:
                pywikibot.output(u'Skipped due to unknown collection: %s' %
                                 painting_id)

    def process_painting(self, painting, painting_id, obj_id):
        """Process a single painting.

        This will also create it if self.add_new is True.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param obj_id: the internal id of the painting in the Nationalmuseum
            database.
        @type obj_id: str
        """
        uri = u'http://collection.nationalmuseum.se/eMuseumPlus?service=' \
              u'ExternalInterface&module=collection&objectId=%s&viewType=' \
              u'detailView' % obj_id
        europeana_url = u'http://europeana.eu/portal/record%s.html' % \
                        painting['object']['about']

        painting_item = None
        # newclaims = []
        if painting_id in self.painting_ids:
            painting_item = self.create_existing_painting(
                painting, painting_id)
        elif self.add_new and not (self.skip_miniatures
                                   and PaintingsBot.is_miniature(painting)):
            # i.e. only when creation of new items is allowed, and
            # unless it is a miniature and we are skipping those
            painting_item = self.create_new_painting(painting, painting_id,
                                                     europeana_url, uri)

        # add new claims
        if painting_item and painting_item.exists():
            data = painting_item.get(force=True)
            claims = data.get('claims')

            # add natmus id claim
            self.add_natmus_id(painting_item, obj_id, uri)

            # add inventory number with collection
            self.add_inventory_and_collection_claim(painting_item, painting_id,
                                                    painting, uri)

            # Instance_of
            if u'P31' not in claims:
                self.add_instanceof_claim(painting_item, painting_id, painting)

            # title (as claim)
            # commented out as the titles in Europeana are not reliable
            # if u'P1476' not in claims:
            #    self.add_title_claim(painting_item, painting)

            # Europeana_ID
            self.add_europeana_claim(painting_item, painting)

            # Check for potential images to add, if none is present
            if u'P18' not in claims:
                self.add_image_claim(painting_item, uri)

            # creator through Nat_mus_database dump
            self.add_natmus_creators(painting_item, obj_id, uri)
            # creator IFF through dbpedia
            # if u'P170' not in claims:
            #    self.add_dbpedia_creator(painting_item, painting)

    def add_title_claim(self, painting_item, painting):
        """Add a title/P1476 claim based on dcTitle.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        dc_title = painting['object']['proxies'][0]['dcTitle']
        titles = []
        for lang, title in dc_title.iteritems():
            titles.append(pywikibot.WbMonolingualText(title[0], lang))
        for title in titles:
            self.wd.addNewClaim(u'P1476', WD.Statement(title), painting_item,
                                self.make_europeana_reference(painting))

    def add_locatedin_claim(self, painting_item, painting_id, painting):
        """Add a located_in/P276 claim based on sub-collection.

        No longer used as sub-collection does not match actual placing.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        place = self.prefix_map[painting_id.split(' ')[0]]['place']
        place_item = self.wd.QtoItemPage(place)
        self.wd.addNewClaim(u'P276', WD.Statement(place_item), painting_item,
                            self.make_europeana_reference(painting))

    def add_dbpedia_creator(self, painting_item, painting):
        """Add a Creator/P170 claim through a dbpedia look-up.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        creator_id = None
        try:
            db_creator = painting['object']['proxies'][1]['dcCreator']['def']
            if len(db_creator) == 1:
                # skip anything more complex than one creator
                db_creator = db_creator[0].strip()
                if db_creator.startswith('http://dbpedia.org/resource/'):
                    if db_creator not in self.creators.keys():
                        self.creators[db_creator] = \
                            helpers.dbpedia_2_wikidata(db_creator)
                    creator_id = self.creators[db_creator]
        except KeyError:
            return

        if creator_id:
            self.set_creator(painting_item,
                             self.make_europeana_reference(painting),
                             creator_q=creator_id)

    def add_image_claim(self, painting_item, uri):
        """Add a image/P18 claim if exactly one image is found on Commons.

        Uses the nationalmuseum.se uri to search for matches on Commons. Adds a
        claim only if a unique hit is found.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        images = self.file_from_external_link(uri)
        if len(images) > 1:  # for now don't want to choose the appropriate one
            pywikibot.output('Found multiple matching images for %s' %
                             painting_item)
            for image in images:
                pywikibot.output(u'\t%s' % image)
        elif len(images) == 1:
            self.wd.addNewClaim(u'P18', WD.Statement(images[0]), painting_item,
                                self.make_commons_reference())

    def add_europeana_claim(self, painting_item, painting):
        """Add a Europeana ID/P727 claim.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        europeana_prop = u'P727'
        europeana_id = painting['object']['about'].lstrip('/')

        # abort if conflicting info
        if europeana_prop in painting_item.claims and \
                not self.wd.has_claim(europeana_prop, europeana_id,
                                      painting_item):
            pywikibot.output(u'%s has conflicting %s. Expected %s' %
                             (painting_item, europeana_prop, europeana_id))
            return

        self.wd.addNewClaim(europeana_prop, WD.Statement(europeana_id),
                            painting_item,
                            self.make_europeana_reference(painting))

    def add_instanceof_claim(self, painting_item, painting_id, painting):
        """Add an instance_of/P31 claim.

        Instance_of is always painting or icon while working on the paintings
        collection.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        dcformat_item = self.wd.QtoItemPage(PAINTING_Q)  # painting
        if painting_id.split(' ')[0] == 'NMI':
            dcformat_item = self.wd.QtoItemPage(ICON_Q)  # icon

        self.wd.addNewClaim(u'P31', WD.Statement(dcformat_item), painting_item,
                            self.make_europeana_reference(painting))
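        # Example of the prefix check above (hypothetical ids): 'NMI 123'
        # yields icon (ICON_Q) whereas e.g. 'NM 456' keeps the default
        # painting (PAINTING_Q).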

    @staticmethod
    def is_miniature(painting):
        """Determine if the painting is a miniature.

        @param painting: information object for the painting
        @type painting: dict
        @rtype bool
        """
        for concept in painting['object']['concepts']:
            if concept[u'about'] == MINIATURE_URL:
                # pywikibot.output(u'Skipping miniature')
                return True
        return False

    def create_existing_painting(self, painting, painting_id):
        """Add base info to an existing paining.

        Adds the same info as would have been added had it been created with
        create_new_painting()

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @return: the created painting item
        @rtype: pywikibot.ItemPage
        """
        painting_item = self.wd.QtoItemPage(self.painting_ids.get(painting_id))

        # check label
        data = painting_item.get()
        labels = make_labels(painting)
        new_labels = find_new_values(data, labels, 'labels')
        if new_labels:
            pywikibot.output('Adding label to %s' % painting_item.title())
            painting_item.editLabels(new_labels)

        # check description
        descriptions = make_descriptions(painting)
        if descriptions:
            new_descr = find_new_values(data, descriptions, 'descriptions')
            if new_descr:
                pywikibot.output('Adding description to %s' %
                                 painting_item.title())
                painting_item.editDescriptions(new_descr)

        return painting_item

    def create_new_painting(self, painting, painting_id, europeana_url, uri):
        """Create a new painting item and return it.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param europeana_url: reference url for Europeana
        @type europeana_url: str
        @param uri: reference uri at nationalmuseum.se
        @type uri: str
        @return: the created painting item
        @rtype: pywikibot.ItemPage
        """
        data = {
            'labels': make_labels(painting),
            'descriptions': make_descriptions(painting),
        }
        if not data['descriptions']:
            return

        # create new empty item and request Q-number
        summary = u'%s: Creating new item with data from %s' % (EDIT_SUMMARY,
                                                                europeana_url)
        painting_item = None
        try:
            painting_item = self.wd.make_new_item(data, summary)
        except pywikibot.data.api.APIError as e:
            if e.code == u'modification-failed':
                # disambiguate and try again
                for lang, content in data['descriptions'].iteritems():
                    disambiguation = content['value'] + u' (%s)' % painting_id
                    data['descriptions'][lang]['value'] = disambiguation
                try:
                    painting_item = self.wd.make_new_item(data, summary)
                except pywikibot.data.api.APIError as e:
                    if e.code == u'modification-failed':
                        pywikibot.output(u'modification-failed error: '
                                         u'skipping %s' % uri)
                        return
                    else:
                        raise pywikibot.Error(u'Error during item creation: '
                                              u'%s' % e)
            else:
                raise pywikibot.Error(u'Error during item creation: %s' % e)

        return painting_item
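        # Sketch of the disambiguation retry above (hypothetical values): a
        # clashing en description u'painting by Anonymous' is resubmitted as
        # u'painting by Anonymous (NM 123)'.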

    def add_natmus_id(self, painting_item, obj_id, uri):
        """Add a natmus_painting_id/P2539 claim.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        self.wd.addNewClaim(u'P2539', WD.Statement(obj_id), painting_item,
                            self.make_url_reference(uri))

    def add_natmus_creators(self, painting_item, obj_id, uri):
        """Add creator/P170 claim(s) based on the database dump info.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        if obj_id not in self.creator_dump:
            return

        # each artwork may have multiple artists,
        # which must all be on wikidata
        for artist_id in self.creator_dump[obj_id]:
            if artist_id not in self.artist_ids:
                self.logger('Artist not found on wikidata: %s' % artist_id)
                return

        dump_entry = self.creator_dump[obj_id]
        if len(dump_entry) == 1:
            artist_entry = dump_entry.iteritems().next()
            self.add_single_natmus_creator(painting_item, artist_entry, uri)
        elif len(dump_entry) == 2:
            # self.add_double_natmus_creator(painting_item, dump_entry, uri)
            # skipping until duplication issue has been solved
            pass
        else:
            # for now avoid any entries with more creators
            return

    def add_single_natmus_creator(self, painting_item, artist, uri):
        u"""Add a simple creator/P170 claim based on the database dump info.

        Handles cases with only a single identified creator. Either
        * Known creator
        * Unknown/uncertain creator somehow related to a known person
        where creator is someone whose function is in artist_labels.

        For "forgery after"/"after work by" cases the bot needs to be aware
        of both parties, and both must exist on Wikidata.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param artist: the dump entry for the artist
        @type artist: tuple (artist_id, artist_info)
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        anonymous_combos = {
            u'Tillskriven': 'P1773',    # attributed to
            u'Hennes ateljé': 'P1774',  # her workshop (workshop of)
            u'Hans ateljé': 'P1774',    # his workshop (workshop of)
            u'Hennes skola': 'P1780',   # her school (school of)
            u'Hans skola': 'P1780',     # his school (school of)
            u'Hennes art': 'P1777',     # her manner (manner of)
            u'Hans art': 'P1777',       # his manner (manner of)
        }
        # labels meaning artist, master and executed by
        artist_labels = (u'Konstnär', u'Mästare', u'Utförd av')

        artist_id, artist_info = artist
        artist_q = self.artist_ids[artist_id]

        if artist_info.get('OkuBeschreibungS') or \
                artist_info.get('OkuValidierungS'):
            # this always indicates some special case which we cannot handle
            # for now
            return

        if artist_info.get('OkuFunktionS') and \
                artist_info.get('OkuFunktionS') in artist_labels:
            if len(artist_info) == 1:  # i.e. all other fields are empty
                self.set_creator(painting_item,
                                 self.make_url_reference(uri),
                                 creator_q=artist_q)
            elif artist_info.get('OkuArtS') in anonymous_combos and \
                    len(artist_info) == 2:
                # anonymous but attributed to the artist
                related_info = {
                    'P': anonymous_combos[artist_info.get('OkuArtS')],
                    'itis': self.wd.QtoItemPage(artist_q)
                }
                self.set_creator(painting_item,
                                 self.make_url_reference(uri),
                                 related_info=related_info)
        elif not artist_info.get('OkuFunktionS') and artist_id == '1':
            # this is the special case of a completely unknown creator
            self.set_creator(painting_item, self.make_url_reference(uri))
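        # Sketch of the three branches above (hypothetical dump entries):
        #   {'OkuFunktionS': u'Konstnär'}       -> plain creator claim
        #   {'OkuFunktionS': u'Konstnär',
        #    'OkuArtS': u'Tillskriven'}         -> anonymous + P1773 qualifier
        #   artist_id == '1', no OkuFunktionS   -> completely unknown creator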

    def add_double_natmus_creator(self, painting_item, artists, uri):
        u"""Add a comlex creator/P170 claim based on the database dump info.

        Handles cases with two identified creators in a relation along the
        lines of "Painting/Forgery by X after a work by Y".

        The logic is:
        OkuFunktionS in derived_combos -> OkuKueID = creator of original
        OkuFunktionS in artist_labels -> OkuKueID = creator of derivative

        @param artists: the dump entries for the artists
        @type artists: dict of {artist_id: artist_info}
        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        derived_combos = {
            u'Kopia efter': 'P1877',         # copy after (after a work by)
            u'Efter': 'P1877',               # after (after a work by)
            u'Förfalskning efter': 'P1778',  # forgery after
        }
        # labels meaning artist and executed by
        artist_labels = (u'Konstnär', u'Utförd av')

        # set up targets
        original = None
        derivative = None
        relation = None

        for artist in artists.iteritems():
            artist_id, artist_info = artist
            if artist_info.get('OkuBeschreibungS') or \
                    artist_info.get('OkuValidierungS'):
                # this indicates some special case which we cannot handle
                # for now
                return

            if artist_info.get('OkuFunktionS') and \
                    len(artist_info) == 1:
                # cannot deal with OkuArtS
                if artist_info.get('OkuFunktionS') in artist_labels:
                    derivative = artist
                elif artist_info.get('OkuFunktionS') in derived_combos:
                    original = artist
                    relation = derived_combos[artist_info.get('OkuFunktionS')]

        # verify that both roles were filled
        if any(creator is None for creator in (original, derivative)):
            return

        # construct info and set
        original_q = self.artist_ids[original[0]]
        derivative_q = self.artist_ids[derivative[0]]
        related_info = {'P': relation, 'itis': self.wd.QtoItemPage(original_q)}
        self.set_creator(painting_item,
                         self.make_url_reference(uri),
                         creator_q=derivative_q,
                         related_info=related_info)
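        # Sketch (hypothetical dump entries): an artist with
        # {'OkuFunktionS': u'Efter'} becomes the creator of the original and
        # one with {'OkuFunktionS': u'Konstnär'} the creator of the
        # derivative, giving P170 = derivative painter qualified by
        # P1877 = original painter.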

    def set_creator(self,
                    target_item,
                    reference,
                    creator_q=None,
                    related_info=None):
        """Set a creator/P170 claim for a creator or creator combo.

        Allows for simple claims as well as more complex
        "in the manner of" etc.

        @param target_item: item to which claim is added
        @type target_item: pywikibot.ItemPage
        @param reference: the reference for the statement
        @type reference: WD.Reference
        @param creator_q: the Q-id of the creator
        @type creator_q: str
        @param related_info: related info as a dict with P/itis pairs
        @type related_info: dict
        """
        creator_q = creator_q or ANON_Q
        creator_statement = WD.Statement(self.wd.QtoItemPage(creator_q))

        # set any related qualifiers
        if related_info:
            creator_statement.addQualifier(
                WD.Qualifier(P=related_info['P'], itis=related_info['itis']))

        # set claim
        self.wd.addNewClaim(u'P170', creator_statement, target_item, reference)
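        # Usage sketches (hypothetical Q-ids):
        #   self.set_creator(item, ref, creator_q=u'Q123')  # known creator
        #   self.set_creator(item, ref)                     # fully anonymous
        #   self.set_creator(item, ref, related_info={      # workshop of Q123
        #       'P': 'P1774', 'itis': self.wd.QtoItemPage(u'Q123')})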

    def add_inventory_and_collection_claim(self, painting_item, painting_id,
                                           painting, uri):
        """Add an inventory_no, with qualifier, and a collection/P195 claim.

        This will add the collection qualifier to any matching
        claim missing it.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        nationalmuseum_item = self.wd.QtoItemPage(INSTITUTION_Q)
        collection_p = u'P195'

        # abort if conflicting info
        if self.painting_id_prop in painting_item.claims and \
                not self.wd.has_claim(self.painting_id_prop, painting_id,
                                      painting_item):
            pywikibot.output(
                u'%s has conflicting inv. no (%s). Expected %s' %
                (painting_item, self.painting_id_prop, painting_id))
            return

        # add inventory number with collection
        self.wd.addNewClaim(
            self.painting_id_prop,
            WD.Statement(painting_id).addQualifier(
                WD.Qualifier(P=collection_p, itis=nationalmuseum_item),
                force=True),
            painting_item,
            self.make_url_reference(uri))

        # add collection (or subcollection)
        subcol = self.prefix_map[painting_id.split(' ')[0]]['subcol']
        collection_item = nationalmuseum_item
        if subcol is not None:
            collection_item = self.wd.QtoItemPage(subcol)

        self.wd.addNewClaim(collection_p, WD.Statement(collection_item),
                            painting_item,
                            self.make_europeana_reference(painting))

    def make_europeana_reference(self, painting):
        """Make a Reference object with a Europeana retrieval url and today's date.

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        europeana_url = u'http://europeana.eu/portal/record%s.html' % \
                        painting['object']['about']
        return self.make_url_reference(europeana_url)

    def make_url_reference(self, uri):
        """Make a Reference object with a retrieval url and today's date.

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        date = helpers.today_as_WbTime()
        ref = WD.Reference(source_test=self.wd.make_simple_claim(u'P854', uri),
                           source_notest=self.wd.make_simple_claim(
                               u'P813', date))
        return ref
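        # The reference pairs reference_url/P854 (compared when checking for
        # pre-existing references) with retrieved/P813 set to today
        # (presumably ignored in that comparison).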

    def make_commons_reference(self):
        """Make a Reference object saying imported from Wikimedia Commons."""
        commons_item = self.wd.QtoItemPage(COMMONS_Q)
        ref = WD.Reference(source_test=self.wd.make_simple_claim(
            u'P143', commons_item))  # imported from
        return ref

    def file_from_external_link(self, uri):
        """Identify files from a Nationalmuseum uri.

        Hits are any files containing a link to the eMuseumPlus uri.

        @param uri: reference url on nationalmuseum.se
        @type uri: str
        @return: matching images
        @rtype: list
        """
        images = []
        uri = uri.split('://')[1]  # strip scheme for protocol-agnostic search
        objgen = pagegenerators.LinksearchPageGenerator(uri,
                                                        namespaces=[6],
                                                        site=self.commons)
        for page in objgen:
            images.append(pywikibot.FilePage(self.commons, page.title()))

        # the linksearch can yield the same page more than once, so dedupe
        images = list(set(images))

        return images

    def most_missed_creators(self, cache_max_age=0):
        """Produce list of most frequent, but unlinked, creators.

        Query WDQ for all objects in the collection missing an artist
        then put together a top-list for most desired creator
        """
        expected_items = []
        query = u'CLAIM[195:%s] AND NOCLAIM[170]' % \
                ',195:'.join(self.collections)  # collection
        wd_queryset = wdquery.QuerySet(query)

        wd_query = wdquery.WikidataQuery(cacheMaxAge=cache_max_age)
        data = wd_query.query(wd_queryset)

        if data.get('status').get('error') == 'OK':
            expected_items = data.get('items')

        creator_dict = {}
        counter = 0
        for q_val in expected_items:
            q_item = self.wd.QtoItemPage(q_val)
            data = q_item.get()
            claims = data.get('claims')
            if u'P170' in claims:
                continue
            descr = data.get('descriptions').get('en')
            if descr and descr.startswith(u'painting by '):
                creator = descr[len(u'painting by '):]
                if '(' in creator:  # to get rid of disambiguation addition
                    creator = creator[:creator.find('(')].strip()
                creator_dict[creator] = creator_dict.get(creator, 0) + 1
                counter += 1
        pywikibot.output(u'Found %d mentions of %d creators' %
                         (counter, len(creator_dict)))
        # output
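        # each line is written as 'count|creator name',
        # e.g. (hypothetical): 17|Rembrandt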
        f = codecs.open(u'creatorHitlist.csv', 'w', 'utf-8')
        for k, v in creator_dict.iteritems():
            f.write(u'%d|%s\n' % (v, k))
        f.close()

    def logger(self, text):
        """Append text to logfile.

        @param text: text to output
        @type text: str
        """
        self.log.write(u'%s\n' % text)
        self.log.flush()  # flush right away in case of a crash