class ImporterBot(object):
    """Bot to enrich/create info on Wikidata for Australian heritage items."""
    def __init__(self, base_path, new=False, cutoff=None, preview_file=None):
        """
        Set up the Wikidata connection, the run options and the lookup tables.

        :param base_path: path to the output directory
        :param new: whether to also create new items
        :param cutoff: the number of items to process before stopping. None
            being interpreted as all.
        :param preview_file: run in demo mode (create previews rather than
            live edits) and output the result to this file.
        """
        self.repo = pywikibot.Site().data_repository()
        self.wd = WdS(self.repo, EDIT_SUMMARY)
        self.new = new
        self.cutoff = cutoff

        # supplying a preview file name is what switches on demo mode;
        # self.preview_file is only defined in that mode
        self.demo = bool(preview_file)
        if self.demo:
            self.preview_file = path.join(base_path, preview_file)
        self.preview_data = []

        # references and static lookups needed when building claims
        self.set_references()
        self.place_id_p = 'P3008'  # unique identifier property
        self.country = self.wd.QtoItemPage('Q408')
        self.states = self.make_states_map()
        self.settlements = self.make_settlements_map()
        hectare_qid = helpers.get_unit_q('ha')
        self.hectares = self.wd.QtoItemPage(hectare_qid)
        self.make_status_and_instance_map()

        # place_id values already in use, used to look up existing items
        self.place_id_items = helpers.fill_cache_wdqs(
            self.place_id_p, no_strip=True)

    def set_references(self):
        """Set the three types of references needed."""
        self.ref = {
            'national':
            self.make_url_ref(
                'http://data.gov.au/dataset/2016-soe-her-aus-national-heritage',  # noqa
                '2017-07-21',
                '2017-06-07'),
            'commonwealth':
            self.make_url_ref(
                'http://data.gov.au/dataset/commonwealth-heritage-list',
                '2017-07-21', '2017-05-31')
        }
        self.coord_ref = {
            'national':
            self.make_url_ref(
                'http://www.environment.gov.au/heritage/places/national-heritage-list',  # noqa
                '2017-08-13'),
            'commonwealth':
            self.make_url_ref(
                'https://data.gov.au/dataset/57720684-4948-45db-a2c8-37259d531d87',  # noqa
                '2017-08-13',
                '2017-07-10')
        }

    def make_status_and_instance_map(self):
        """Construct mapping for cultural heritage status and instance type."""
        self.status = {
            'national': self.wd.QtoItemPage('Q20747146'),
            'commonwealth': self.wd.QtoItemPage('Q30108476')
        }
        self.instance_type = {
            'indigenous': self.wd.QtoItemPage('Q38048771'),
            'historic': self.wd.QtoItemPage('Q38048707'),
            'natural': self.wd.QtoItemPage('Q38048753')
        }

    def make_settlements_map(self):
        """
        Retrieve Australian settlements with state/territory connection.

        :return: dict mapping a settlement label to a list of
            {'state': <admin Q-id>, 'qid': <settlement Q-id>} entries, one
            per row returned by the query.
        """
        sparql = (
            "SELECT DISTINCT ?city ?cityLabel ?admin ?adminLabel "
            "WHERE "
            "{ "
            "?city wdt:P31/wdt:P279* wd:Q486972 . "
            "?city wdt:P17 wd:Q408 . "
            "?city wdt:P131* ?admin . "
            "{ ?admin wdt:P31 wd:Q5852411 . }"
            "UNION"
            "{ ?admin wdt:P31 wd:Q14192252 . }"
            "UNION"
            "{ ?admin wdt:P31 wd:Q14192199 . }"
            'SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }'  # noqa
            "}")

        settlements = dict()
        for row in wdqs.make_simple_wdqs_query(sparql):
            # the Q-id is the last path segment of each returned value
            entry = {
                'state': row['admin'].split('/')[-1],
                'qid': row['city'].split('/')[-1]
            }
            # several settlements may share a label, so collect them in a list
            settlements.setdefault(row['cityLabel'], []).append(entry)
        return settlements

    def make_states_map(self):
        """
        Retrieve the state/territory mappings from Wikidata.

        The queried entries are keyed by the ISO code with its 'AU-' prefix
        removed. Two hard-coded entries are added on top of these: 'EXT' and
        'OS', each a dict from a place-name key to an item. EXT keys may
        hold several '|'-separated names.

        :return: dict of state code to item (or to a nested dict for the
            'EXT' and 'OS' codes)
        """
        sparql = ("SELECT ?item ?iso "
                  "WHERE "
                  "{ "
                  "?item wdt:P300 ?value . "
                  "?item wdt:P17 wd:Q408 . "
                  "BIND(REPLACE(?value, 'AU-', '', 'i') AS ?iso) "
                  "}")
        to_item = self.wd.QtoItemPage
        queried = wdqs.make_select_wdqs_query(sparql, 'item', 'iso')
        states = {iso: to_item(qid) for qid, iso in queried.items()}

        # external territories (random hits mapped)
        external = (
            ('Ashmore and Cartier Islands', 'Q133888'),
            ("Australian Antarctic Territory|Dumont D'Urville Station|Mawson Station", 'Q178994'),  # noqa
            ('Christmas Island|Settlement|Drumsite|Poon Saan', 'Q31063'),
            ('Cocos (Keeling) Islands', 'Q36004'),
            ('Coral Sea Islands', 'Q172216'),
            ('Heard and McDonald Islands', 'Q131198'),
            ('Jervis Bay Territory', 'Q15577'),
            ('Norfolk Island|Kingston|Longridge|Burnt Pine|Middlegate', 'Q31057'),  # noqa
        )
        states['EXT'] = {names: to_item(qid) for names, qid in external}

        # OS other state?
        overseas = (
            ('United Kingdom', 'Q145'),
            ('USA', 'Q30'),
        )
        states['OS'] = {name: to_item(qid) for name, qid in overseas}

        return states

    def make_url_ref(self, url, fetch_date, publish_date=None):
        """Make a Reference object for a url.

        The reference is built from up to three claims:
        * P854: Reference url <using the input url>
        * P577: Publication date <only when publish_date is given>
        * P813: Retrieval date

        :param url: the source url
        :param fetch_date: the retrieval date (iso)
        :param publish_date: the publication date (iso), optional
        :return: WdS.Reference
        """
        dated = []
        if publish_date:
            published = helpers.iso_to_WbTime(publish_date)
            dated.append(self.wd.make_simple_claim('P577', published))
        retrieved = helpers.iso_to_WbTime(fetch_date)
        dated.append(self.wd.make_simple_claim('P813', retrieved))

        # the url goes in source_test, the date claims in source_notest
        url_claim = self.wd.make_simple_claim('P854', url)
        return WdS.Reference(source_test=[url_claim], source_notest=dated)

    def output_previews(self):
        """
        Write every collected preview to self.preview_file.

        Each preview page is followed by a separator line. The file is
        overwritten, and a confirmation message is output afterwards.
        """
        separator = '--------------\n\n'
        target = self.preview_file
        with open(target, 'w', encoding='utf-8') as handle:
            for entry in self.preview_data:
                handle.write(entry.make_preview_page())
                handle.write(separator)
        pywikibot.output('Created "{}" for previews'.format(target))

    def process_all_objects(self, data):
        """
        Handle all the Australian heritage objects.

        Only increments counter when an object is updated.

        :param data: dict of all the heritage objects.
        """
        count = 0
        for place_id, entry_data in data.items():
            if self.cutoff and count >= self.cutoff:
                break
            item = None
            if place_id in self.place_id_items:
                item = self.wd.QtoItemPage(self.place_id_items[place_id])

            if item or self.new:
                self.process_single_object(entry_data, item)
                count += 1

    def process_single_object(self, data, item):
        """
        Process a single Australian heritage object.

        :param data: dict of data for a single object
        :param item: Wikidata item associated with an object, or None if one
            should be created.
        """
        if not self.demo:
            item = item or self.create_new_place_id_item(data)
            item.exists()  # load the item contents

        # Determine claims
        labels = self.make_labels(data)
        descriptions = self.make_descriptions(data)
        protoclaims = self.make_protoclaims(data)
        ref = self.ref[self.get_heritage_type(data['type'])]

        # Upload claims
        if self.demo:
            self.preview_data.append(
                PreviewItem(labels, descriptions, protoclaims, item, ref))
        else:
            self.commit_labels(labels, item)
            self.commit_descriptions(descriptions, item)
            self.commit_claims(protoclaims, item, ref)

    def create_new_place_id_item(self, data):
        """
        Create a new place_id item with some basic info and return it.

        The new item gets the labels and descriptions built from data plus a
        single claim for the place_id property.

        :param data: dict of data for a single object
        :return: pywikibot.ItemPage
        :raises pywikibot.Error: if the API rejects the item creation
        """
        labels = helpers.convert_language_dict_to_json(self.make_labels(data),
                                                       typ='labels')
        desc = helpers.convert_language_dict_to_json(
            self.make_descriptions(data), typ='descriptions')
        id_claim = self.wd.make_simple_claim(self.place_id_p,
                                             data.get('place_id'))

        item_data = {
            "labels": labels,
            "descriptions": desc,
            "claims": [
                id_claim.toJSON(),
            ]
        }

        try:
            return self.wd.make_new_item(item_data, EDIT_SUMMARY)
        except pywikibot.data.api.APIError as e:
            # Plain '{}' is used on purpose: a ':s' format spec applied to an
            # exception object raises TypeError, which would mask the error.
            raise pywikibot.Error(
                'Error during item creation: {}'.format(e))

    def make_labels(self, data):
        """
        Make a label object from the available info.

        :param data: dict of data for a single object
        :return: label dict
        """
        labels = {}
        name = data.get('name')
        if name:
            labels['en'] = [
                name.replace('  ', ' ').strip(),
            ]
        return labels

    def make_descriptions(self, data):
        """
        Make a description object in English.

        Address is partitioned so as to include the place name and
        territory/state in case these are not included anywhere later.

        :param data: dict of data for a single object
        :return: description object
        """
        text = '{heritage_type} {list_type} heritage site in {address}'
        descriptions = {
            'en':
            text.format(heritage_type=data['class'].lower(),
                        list_type=self.get_heritage_type(data['type']),
                        address=data['address'].rpartition(',')[2].strip())
        }
        return descriptions

    def commit_labels(self, labels, item):
        """
        Add labels and aliases to item.

        :param labels: label object
        :param item: item to add labels to
        """
        if labels:
            self.wd.add_multiple_label_or_alias(labels,
                                                item,
                                                case_sensitive=False)

    def commit_descriptions(self, descriptions, item):
        """
        Add descriptions to item.

        :param descriptions: description object
        :param item: item to add descriptions to
        """
        if descriptions:
            self.wd.add_multiple_descriptions(descriptions, item)

    def commit_claims(self, protoclaims, item, default_ref):
        """
        Add each claim (if new) and source it.

        Empty values, None and Statement(None) are skipped. The item is
        re-fetched after every added claim.

        :param protoclaims: a dict of claims with
            key: Prop number
            val: Statement|list of Statements
        :param item: the target entity
        :param default_ref: main/default reference to use
        """
        for prop, statements in protoclaims.items():
            if not statements:
                continue

            # going through a set eliminates potential duplicates
            for statement in set(helpers.listify(statements)):
                if statement is None or statement.isNone():
                    continue

                # a reference attached to the statement wins over the general
                reference = statement.ref or default_ref
                self.wd.addNewClaim(prop, statement, item, reference)

                # reload item so that next call is aware of changes
                item = self.wd.QtoItemPage(item.title())
                item.exists()

    def make_protoclaims(self, data):
        """
        Construct potential claims for an entry.

        Country, heritage status, instance type, place_id, state and place
        are always included; area, address and coordinates only when the
        corresponding data is present.

        :param data: dict of data for a single heritage object
        :return: dict of property id to WdS.Statement
        """
        claims = dict()

        # P17: country
        claims['P17'] = WdS.Statement(self.country)

        # P1435: heritage status, qualified by start date when known
        heritage_type = self.get_heritage_type(data.get('type'))
        status = WdS.Statement(self.status[heritage_type])
        registered = data.get('register_date')
        if registered:
            start = WdS.Qualifier('P580', self.parse_date(registered))
            status.addQualifier(start)
        claims['P1435'] = status

        # P31: class
        class_key = data.get('class').lower()
        claims['P31'] = WdS.Statement(self.instance_type[class_key])

        # P3008: place_id
        claims[self.place_id_p] = WdS.Statement(data['place_id'])

        # P131: state
        state, address = data['state'], data['address']
        claims['P131'] = WdS.Statement(self.get_state(state, address))

        # P2046: area
        area = data.get('hectares')
        if area:
            quantity = pywikibot.WbQuantity(area,
                                            unit=self.hectares,
                                            site=self.wd.repo)
            claims['P2046'] = WdS.Statement(quantity)

        # P969: address, only when it has more than one comma-separated part
        if ',' in address:
            claims['P969'] = WdS.Statement(address)

        # P276: place
        claims['P276'] = WdS.Statement(self.get_place(state, address))

        # P625: coordinate
        lat, lon = data.get('lat'), data.get('lon')
        if lat and lon:
            claims['P625'] = self.get_coordinate_statement(
                lat, lon, heritage_type)

        return claims

    def get_coordinate_statement(self, lat, lon, heritage_type):
        """
        Construct a Statement for the provided coordinates.

        :param lat: latitude, anything float() accepts
        :param lon: longitude, anything float() accepts
        :param heritage_type: key into self.coord_ref selecting the reference
        :return: WdS.Statement with the coordinate reference attached
        """
        position = pywikibot.Coordinate(float(lat),
                                        float(lon),
                                        globe='earth',
                                        precision=DEFAULT_PREC)
        coord_statement = WdS.Statement(position)
        coord_statement.add_reference(self.coord_ref[heritage_type])
        return coord_statement

    def get_heritage_type(self, typ):
        """Determine which heritage type the object is."""
        heritage_type = None
        if typ.startswith('Q1116950'):
            heritage_type = 'commonwealth'
        elif typ.startswith('Q781601'):
            heritage_type = 'national'
        else:
            pywikibot.error('Unrecognized status: {0}'.format(typ))
        return heritage_type

    def parse_date(self, date):
        """
        Convert date in DD-MMM-YYYY format to WbTime.

        MMM must be an upper-case three letter English month abbreviation.

        :param date: the date string, e.g. '07-JUN-2017'
        :return: the result of helpers.iso_to_WbTime for the ISO date
        """
        abbreviations = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
                         'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')
        day_part, month_part, year_part = date.split('-')
        day = int(day_part)
        month = abbreviations.index(month_part) + 1  # index is 0-based
        iso = '{0}-{1:02d}-{2:02d}'.format(year_part, month, day)
        return helpers.iso_to_WbTime(iso)

    def get_place(self, state, address):
        """
        Determine which settlement the object is in.

        The format of address is "street, place STATE_ISO"
        """
        place = address.rpartition(',')[2][:-len(state)].strip()
        state_item = self.get_state(state, address)
        if place in self.settlements and state_item:
            hits = []
            for candidate in self.settlements[place]:
                if candidate['state'] == state_item.id:
                    hits.append(candidate['qid'])
            if len(set(hits)) == 1:
                return self.wd.QtoItemPage(hits[0])

    def get_state(self, state, address):
        """Determine which state/territory the object is in."""
        state_item = None
        if state not in self.states:
            pywikibot.error('Unrecognized state: {0}'.format(state))
        elif state == 'EXT':
            address = address[:-len('EXT')].strip()
            for key, v in self.states['EXT'].items():
                if any(address.endswith(k) for k in key.split('|')):
                    state_item = v
                    break
        elif state == 'OS':
            for k, v in self.states['OS'].items():
                if address.endswith(k):
                    state_item = v
                    break
        else:
            state_item = self.states[state]
        return state_item
# Example no. 2
# 0
class PaintingsBot:
    """Bot to enrich, and create, for items about paintings on Wikidata."""
    def __init__(self, dict_generator, painting_id_prop, cache_max_age=0):
        """Initiate the bot, loading files and querying WDQ.

        Note that the log file opened here is kept on self.log and is not
        closed by this method.

        @param dict_generator: The generator for the Europeana painting objects
        @type dict_generator: generator (that yields Dict objects).
        @param painting_id_prop: the P-id of the painting-id property
        @type painting_id_prop: str
        @param cache_max_age: Max age of local wdq cache, defaults to 0
        @type cache_max_age: int
        """
        self.generator = dict_generator
        self.repo = pywikibot.Site().data_repository()
        self.commons = pywikibot.Site(u'commons', u'commons')
        self.wd = WD(self.repo)
        self.add_new = False  # If new objects should be created
        self.skip_miniatures = True  # If (new) miniatures should be skipped

        # Load prefixes and find allowed collections
        collections = {INSTITUTION_Q}
        self.mappings = helpers.load_json_file('mappings.json',
                                               force_path=__file__)
        self.prefix_map = self.mappings['prefix_map']
        self.bad_prefix = self.mappings['bad_prefix']
        # values() rather than iteritems(): only the mapped entries are
        # needed and iteritems() does not exist on Python 3 dicts
        for mapping in self.prefix_map.values():
            if mapping['subcol'] is not None:
                collections.add(mapping['subcol'].strip('Q'))
        self.collections = list(collections)

        # Set log file
        self.log = codecs.open(u'nationalmuseumSE.log', 'a', 'utf-8')

        # Load creator dump file
        self.creator_dump = helpers.load_json_file('Oku_NM_arbetskopia.json',
                                                   force_path=__file__)

        # hard-coded anons e.g. "unknown swedish 17th century"
        anons = helpers.load_json_file('anons.json', force_path=__file__)

        # prepare WDQ painting query
        query = u'CLAIM[195:%s] AND CLAIM[%s]' % \
                (',195:'.join(self.collections), painting_id_prop)
        self.painting_ids = helpers.fill_cache(painting_id_prop,
                                               queryoverride=query,
                                               cache_max_age=cache_max_age)

        # prepare WDQ artist query (nat_mus_id - Q_id pairs)
        self.artist_ids = helpers.fill_cache('P2538',
                                             cache_max_age=cache_max_age)
        # add anons, all mapped to the same anonymous item
        for a in anons:
            self.artist_ids[a] = ANON_Q

        self.painting_id_prop = 'P%s' % painting_id_prop

    def run(self):
        """Start the robot."""
        self.creators = {}

        for painting in self.generator:
            # isolate ids
            ids = painting['object']['proxies'][0]['dcIdentifier']['def']
            painting_id = ids[0].replace('Inv Nr.:', '').strip('( )')
            obj_id = ids[1]

            # Museum contains several sub-collections. Only handle mapped ones
            if painting_id.split(' ')[0] in self.prefix_map.keys():
                self.process_painting(painting, painting_id, obj_id)
            elif painting_id.split(' ')[0] not in self.bad_prefix:
                pywikibot.output(u'Skipped due to unknown collection: %s' %
                                 painting_id)

    def process_painting(self, painting, painting_id, obj_id):
        """Process a single painting.

        This will also create it if self.add_new is True.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param obj_id: the internal id of the painting in the Nationalmuseum
            database.
        @type obj_id: str
        """
        uri = u'http://collection.nationalmuseum.se/eMuseumPlus?service=' \
              u'ExternalInterface&module=collection&objectId=%s&viewType=' \
              u'detailView' % obj_id
        europeana_url = u'http://europeana.eu/portal/record%s.html' % \
                        painting['object']['about']

        painting_item = None
        # newclaims = []
        if painting_id in self.painting_ids:
            painting_item = self.create_existing_painting(
                painting, painting_id)
        elif self.add_new and not (self.skip_miniatures
                                   and PaintingsBot.is_miniature(painting)):
            # if objection collection is allowed and
            # unless it is a miniature and we are skipping those
            painting_item = self.create_new_painting(painting, painting_id,
                                                     europeana_url, uri)

        # add new claims
        if painting_item and painting_item.exists():
            data = painting_item.get(force=True)
            claims = data.get('claims')

            # add natmus id claim
            self.add_natmus_id(painting_item, obj_id, uri)

            # add inventory number with collection
            self.add_inventory_and_collection_claim(painting_item, painting_id,
                                                    painting, uri)

            # Instance_of
            if u'P31' not in claims:
                self.add_instanceof_claim(painting_item, painting_id, painting)

            # title (as claim)
            # commented out as the titles in Europeana are not reliable
            # if u'P1476' not in claims:
            #    self.add_title_claim(painting_item, painting)

            # Europeana_ID
            self.add_europeana_claim(painting_item, painting)

            # Check for potential images to add, if none is present
            if u'P18' not in claims:
                self.add_image_claim(painting_item, uri)

            # creator through Nat_mus_database dump
            self.add_natmus_creators(painting_item, obj_id, uri)
            # creator IFF through dbpedia
            # if u'P170' not in claims:
            #    self.add_dbpedia_creator(painting_item, painting)

    def add_title_claim(self, painting_item, painting):
        """Add a title/P1476 claim based on dcTitle.

        One claim is added per language in dcTitle, using the first title
        listed for that language. All titles are constructed before any
        claim is added.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        dc_title = painting['object']['proxies'][0]['dcTitle']
        # items() instead of iteritems(), which Python 3 dicts do not have
        titles = [pywikibot.WbMonolingualText(title[0], lang)
                  for lang, title in dc_title.items()]
        for title in titles:
            self.wd.addNewClaim(u'P1476', WD.Statement(title), painting_item,
                                self.make_europeana_reference(painting))

    def add_locatedin_claim(self, painting_item, painting_id, painting):
        """Add a located_in/P276 claim based on sub-collection.

        No longer used as sub-collection does not match actual placing.

        The place is looked up in the prefix map using the first
        space-separated part of painting_id.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        prefix = painting_id.split(' ')[0]
        place_q = self.prefix_map[prefix]['place']
        located_in = WD.Statement(self.wd.QtoItemPage(place_q))
        self.wd.addNewClaim(u'P276', located_in, painting_item,
                            self.make_europeana_reference(painting))

    def add_dbpedia_creator(self, painting_item, painting):
        """Add a Creator/P170 claim through a dbpedia look-up.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        creator_id = None
        try:
            db_creator = painting['object']['proxies'][1]['dcCreator']['def']
            if len(db_creator) == 1:
                # skip anything more complex than one creator
                db_creator = db_creator[0].strip()
                if db_creator.startswith('http://dbpedia.org/resource/'):
                    if db_creator not in self.creators.keys():
                        self.creators[db_creator] = \
                            helpers.dbpedia_2_wikidata(db_creator)
                    creator_id = self.creators[db_creator]
        except KeyError:
            return

        if creator_id:
            self.set_creator(painting_item,
                             self.make_europeana_reference(painting),
                             creator_q=creator_id)

    def add_image_claim(self, painting_item, uri):
        """Add a image/P18 claim if exactly one image is found on Commons.

        Uses the nationalmuseum.se uri to search for matches on Commons. Adds a
        claim only if a unique hit is found.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        images = self.file_from_external_link(uri)
        if len(images) > 1:  # for now don't want to choose the appropriate one
            pywikibot.output('Found multiple matching images for %s' %
                             painting_item)
            for image in images:
                pywikibot.output(u'\t%s' % image)
        elif len(images) == 1:
            self.wd.addNewClaim(u'P18', WD.Statement(images[0]), painting_item,
                                self.make_commons_reference())

    def add_europeana_claim(self, painting_item, painting):
        """Add a Europeana ID/P727 claim.

        The id is the 'about' value of the painting with any leading slashes
        removed. Nothing is added, and a message is output, if the item
        already has P727 claims but none with this id.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        prop = u'P727'
        europeana_id = painting['object']['about'].lstrip('/')

        # abort if conflicting info
        conflicting = (prop in painting_item.claims and
                       not self.wd.has_claim(prop, europeana_id,
                                             painting_item))
        if conflicting:
            pywikibot.output(u'%s has conflicting %s. Expected %s' %
                             (painting_item, prop, europeana_id))
            return

        id_statement = WD.Statement(europeana_id)
        self.wd.addNewClaim(prop, id_statement, painting_item,
                            self.make_europeana_reference(painting))

    def add_instanceof_claim(self, painting_item, painting_id, painting):
        """Add an instance_of/P31 claim.

        The target is the painting item, except for ids with the 'NMI'
        prefix which get the icon item instead.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        type_item = self.wd.QtoItemPage(PAINTING_Q)  # painting
        prefix = painting_id.split(' ')[0]
        if prefix == 'NMI':
            type_item = self.wd.QtoItemPage(ICON_Q)  # icon

        instance_of = WD.Statement(type_item)
        self.wd.addNewClaim(u'P31', instance_of, painting_item,
                            self.make_europeana_reference(painting))

    @staticmethod
    def is_miniature(painting):
        """Determine if the painting is a miniature.

        @param painting: information object for the painting
        @type painting: dict
        @rtype bool
        """
        for concept in painting['object']['concepts']:
            if concept[u'about'] == MINIATURE_URL:
                # pywikibot.output(u'Skipping miniature')
                return True
        return False

    def create_existing_painting(self, painting, painting_id):
        """Add base info to an existing painting.

        Adds the same info as would have been added had it been created with
        create_new_painting(), i.e. any labels and descriptions that
        find_new_values() reports as new.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @return: the existing painting item
        @rtype: pywikibot.ItemPage
        """
        qid = self.painting_ids.get(painting_id)
        painting_item = self.wd.QtoItemPage(qid)
        current = painting_item.get()

        # check label
        missing_labels = find_new_values(
            current, make_labels(painting), 'labels')
        if missing_labels:
            pywikibot.output('Adding label to %s' % painting_item.title())
            painting_item.editLabels(missing_labels)

        # check description
        descriptions = make_descriptions(painting)
        if descriptions:
            missing_descriptions = find_new_values(
                current, descriptions, 'descriptions')
            if missing_descriptions:
                pywikibot.output('Adding description to %s' %
                                 painting_item.title())
                painting_item.editDescriptions(missing_descriptions)

        return painting_item

    def create_new_painting(self, painting, painting_id, europeana_url, uri):
        """Create a new painting item and return it.

        Nothing is created if no description can be made. If the first
        attempt fails with a 'modification-failed' API error, the painting
        id is appended to each description and creation is retried once.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param europeana_url: reference url for Europeana
        @type europeana_url: str
        @param uri: reference uri at nationalmuseum.se
        @type uri: str
        @return: the created painting item, or None if no item was created
        @rtype: pywikibot.ItemPage
        @raise pywikibot.Error: on any other API error during creation
        """
        data = {
            'labels': make_labels(painting),
            'descriptions': make_descriptions(painting),
        }
        if not data['descriptions']:
            return None

        # create new empty item and request Q-number
        summary = u'%s: Creating new item with data from %s' % (EDIT_SUMMARY,
                                                                europeana_url)
        try:
            return self.wd.make_new_item(data, summary)
        except pywikibot.data.api.APIError as e:
            if e.code != u'modification-failed':
                raise pywikibot.Error(u'Error during item creation: %s' % e)

        # disambiguate and try again
        # values() instead of iteritems(), which Python 3 dicts do not have;
        # each entry is updated in place
        for content in data['descriptions'].values():
            content['value'] = content['value'] + u' (%s)' % painting_id

        try:
            return self.wd.make_new_item(data, summary)
        except pywikibot.data.api.APIError as retry_error:
            if retry_error.code != u'modification-failed':
                raise pywikibot.Error(u'Error during item creation: '
                                      u'%s' % retry_error)
            pywikibot.output(u'modification-failed error: '
                             u'skipping %s' % uri)
            return None

    def add_natmus_id(self, painting_item, obj_id, uri):
        """Add a natmus_painting_id/P2539 claim to the painting item.

        The claim is referenced with a url reference built from the uri.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        id_statement = WD.Statement(obj_id)
        reference = self.make_url_reference(uri)
        self.wd.addNewClaim(u'P2539', id_statement, painting_item, reference)

    def add_natmus_creators(self, painting_item, obj_id, uri):
        """Add creator/P170 claim(s) based on the database dump info.

        Nothing is added if the object is missing from the creator dump or
        if any of its artists is missing from artist_ids (the first missing
        artist is logged). Only entries with exactly one creator are
        currently processed.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        if obj_id not in self.creator_dump:
            return
        dump_entry = self.creator_dump[obj_id]

        # each artwork may have multiple artists,
        # which must all be on wikidata
        for artist_id in dump_entry:
            if artist_id not in self.artist_ids:
                self.logger('Artist not found on wikidata: %s' % artist_id)
                return

        if len(dump_entry) == 1:
            # next(iter(...)) works on both Python 2 and Python 3,
            # unlike iteritems().next()
            artist_entry = next(iter(dump_entry.items()))
            self.add_singel_natmus_creator(painting_item, artist_entry, uri)
        # Entries with two creators (add_double_natmus_creator) are skipped
        # until the duplication issue has been solved; entries with more
        # creators are avoided for now.

    def add_singel_natmus_creator(self, painting_item, artist, uri):
        u"""Add a simple creator/P170 claim based on the database dump info.

        Handles entries with a single identified creator, which is either
        * a known creator (only OkuFunktionS is set), or
        * an unknown/uncertain creator somehow related to a known person
          (OkuFunktionS plus a recognised OkuArtS),
        where OkuFunktionS must be one of the accepted creator functions.
        An artist with id '1' and no OkuFunktionS is treated as a completely
        unknown creator.

        Entries with OkuBeschreibungS or OkuValidierungS set are skipped.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param artist: the dump entry for the artist
        @type artist: tuple (artist_id, artist_info)
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        # OkuArtS phrase -> property used as qualifier on the creator claim
        attribution_props = {
            u'Tillskriven': 'P1773',
            u'Hennes ateljé': 'P1774',
            u'Hans ateljé': 'P1774',
            u'Hennes skola': 'P1780',
            u'Hans skola': 'P1780',
            u'Hennes art': 'P1777',
            u'Hans art': 'P1777',
        }
        creator_functions = (u'Konstnär', u'Mästare', u'Utförd av')

        artist_id, artist_info = artist
        artist_q = self.artist_ids[artist_id]

        # either of these always indicates some special case which cannot
        # be handled for now
        if artist_info.get('OkuBeschreibungS'):
            return
        if artist_info.get('OkuValidierungS'):
            return

        function = artist_info.get('OkuFunktionS')
        if not function:
            if artist_id == '1':
                # the special case of a completely unknown creator
                self.set_creator(painting_item, self.make_url_reference(uri))
            return
        if function not in creator_functions:
            return

        field_count = len(artist_info)
        if field_count == 1:
            # only OkuFunktionS present, i.e. all other fields are empty
            self.set_creator(painting_item,
                             self.make_url_reference(uri),
                             creator_q=artist_q)
        elif field_count == 2 and \
                artist_info.get('OkuArtS') in attribution_props:
            # anonymous but attributed to the artist
            related_info = {
                'P': attribution_props[artist_info.get('OkuArtS')],
                'itis': self.wd.QtoItemPage(artist_q)
            }
            self.set_creator(painting_item,
                             self.make_url_reference(uri),
                             related_info=related_info)

    def add_double_natmus_creator(self, painting_item, artists, uri):
        u"""Add a complex creator/P170 claim based on the database dump info.

        Handles cases with two identified creators in a relation along the
        lines of "Painting/Forgery by X after a work by Y"

        The logic is:
        OkuFunktionS in derived_combos -> OkuKueID = creator of original
        OkuFunktionS in artist_labels -> OkuKueID = creator of derivative

        Nothing is added if any artist has OkuBeschreibungS or
        OkuValidierungS set, or if either of the two roles is left unfilled.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param artists: the dump entries for the artists
        @type artists: dict of {artist_id: artist_info}
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        derived_combos = {
            u'Kopia efter': 'P1877',
            u'Efter': 'P1877',
            u'Förfalskning efter': 'P1778',
        }
        artist_labels = (u'Konstnär', u'Utförd av')

        # set up targets
        original = None
        derivative = None
        relation = None

        # items() (not iteritems()) keeps this working on Python 2 and 3
        for artist in artists.items():
            artist_info = artist[1]
            if artist_info.get('OkuBeschreibungS') or \
                    artist_info.get('OkuValidierungS'):
                # this indicates some special case which we cannot handle
                # for now
                return

            function = artist_info.get('OkuFunktionS')
            if function and len(artist_info) == 1:
                # cannot deal with OkuArtS
                if function in artist_labels:
                    derivative = artist
                elif function in derived_combos:
                    original = artist
                    relation = derived_combos[function]

        # verify that both roles were filled
        if original is None or derivative is None:
            return

        # construct info and set
        original_q = self.artist_ids[original[0]]
        derivative_q = self.artist_ids[derivative[0]]
        related_info = {'P': relation, 'itis': self.wd.QtoItemPage(original_q)}
        self.set_creator(painting_item,
                         self.make_url_reference(uri),
                         creator_q=derivative_q,
                         related_info=related_info)

    def set_creator(self,
                    target_item,
                    reference,
                    creator_q=None,
                    related_info=None):
        """Set a creator/P170 claim for a creator or creator combo.

        Supports plain creator claims as well as qualified ones such as
        "in the manner of". When no creator is given ANON_Q is used as the
        claim value.

        @param target_item: item to which claim is added
        @type target_item: pywikibot.ItemPage
        @param reference: the reference for the statement
        @type reference: WD.Reference
        @param creator_q: the Q-id of the creator
        @type creator_q: str
        @param related_info: related info as a dict with P/itis pairs
        @type related_info: dict
        """
        if not creator_q:
            creator_q = ANON_Q
        creator_item = self.wd.QtoItemPage(creator_q)
        statement = WD.Statement(creator_item)

        # attach the relation to the known person, if any, as a qualifier
        if related_info:
            qualifier = WD.Qualifier(P=related_info['P'],
                                     itis=related_info['itis'])
            statement.addQualifier(qualifier)

        self.wd.addNewClaim(u'P170', statement, target_item, reference)

    def add_inventory_and_collection_claim(self, painting_item, painting_id,
                                           painting, uri):
        """Add an inventory_no, with qualifier, and a collection/P195 claim.

        The inventory number claim carries the institution as a collection
        qualifier, which is also added to any matching claim missing it.
        The separate collection claim points at the sub-collection mapped
        from the painting id prefix, or at the institution itself when no
        sub-collection is mapped.

        Nothing is added if the item already has an inventory number claim
        which does not match painting_id.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        nationalmuseum_item = self.wd.QtoItemPage(INSTITUTION_Q)
        collection_p = u'P195'
        id_prop = self.painting_id_prop

        # abort if conflicting info
        if id_prop in painting_item.claims:
            if not self.wd.has_claim(id_prop, painting_id, painting_item):
                pywikibot.output(
                    u'%s has conflicting inv. no (%s). Expected %s' %
                    (painting_item, id_prop, painting_id))
                return

        # add inventory number with collection
        base_statement = WD.Statement(painting_id)
        collection_qualifier = WD.Qualifier(P=collection_p,
                                            itis=nationalmuseum_item)
        inventory_statement = base_statement.addQualifier(
            collection_qualifier, force=True)
        self.wd.addNewClaim(id_prop, inventory_statement, painting_item,
                            self.make_url_reference(uri))

        # add collection (or subcollection)
        prefix = painting_id.split(' ')[0]
        subcol = self.prefix_map[prefix]['subcol']
        if subcol is None:
            collection_item = nationalmuseum_item
        else:
            collection_item = self.wd.QtoItemPage(subcol)

        collection_statement = WD.Statement(collection_item)
        self.wd.addNewClaim(collection_p, collection_statement,
                            painting_item,
                            self.make_europeana_reference(painting))

    def make_europeana_reference(self, painting):
        """Make a url Reference pointing at the painting's Europeana record.

        The record url is built from painting['object']['about'] and the
        Reference itself is produced by make_url_reference.

        @param painting: information object for the painting
        @type painting: dict
        @rtype: WD.Reference
        """
        record_path = painting['object']['about']
        return self.make_url_reference(
            u'http://europeana.eu/portal/record%s.html' % record_path)

    def make_url_reference(self, uri):
        """Make a Reference object from a retrieval url and today's date.

        The url is stored as a P854 claim (passed as source_test) and the
        date as a P813 claim (passed as source_notest).

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        retrieved = helpers.today_as_WbTime()
        url_claim = self.wd.make_simple_claim(u'P854', uri)
        date_claim = self.wd.make_simple_claim(u'P813', retrieved)
        return WD.Reference(source_test=url_claim, source_notest=date_claim)

    def make_commons_reference(self):
        """Make a Reference object saying imported from Wikimedia Commons.

        @rtype: WD.Reference
        """
        # P143: imported from
        imported_from = self.wd.make_simple_claim(
            u'P143', self.wd.QtoItemPage(COMMONS_Q))
        return WD.Reference(source_test=imported_from)

    def file_from_external_link(self, uri):
        """Identify files from a Nationalmuseum uri.

        Hits are any files containing a link to the eMuseumPlus uri.
        Duplicate hits are dropped and the remaining images are returned in
        the order in which they were first encountered.

        @param uri: reference url on nationalmuseum.se
        @type uri: str
        @return: matching images
        @rtype: list
        """
        # Strip the protocol, if any. Splitting only once keeps a uri that
        # contains a further '://' intact and taking the last part avoids
        # an IndexError for a uri without protocol.
        uri = uri.split('://', 1)[-1]
        objgen = pagegenerators.LinksearchPageGenerator(uri,
                                                        namespaces=[6],
                                                        site=self.commons)

        # The generator is known to yield duplicates, so filter these out
        # while preserving a deterministic order.
        images = []
        seen = set()
        for page in objgen:
            image = pywikibot.FilePage(self.commons, page.title())
            if image not in seen:
                seen.add(image)
                images.append(image)

        return images

    def most_missed_creators(self, cache_max_age=0):
        """Produce list of most frequent, but unlinked, creators.

        Query WDQ for all objects in the collection missing an artist
        then put together a top-list for most desired creator. The creator
        names are taken from English descriptions starting with
        'painting by ' and the result is written as 'count|creator' lines
        to creatorHitlist.csv.

        @param cache_max_age: passed to the WDQ query as cacheMaxAge
        @type cache_max_age: int
        """
        descr_prefix = u'painting by '
        query = u'CLAIM[195:%s] AND NOCLAIM[170]' % \
                ',195:'.join(self.collections)  # collection
        wd_queryset = wdquery.QuerySet(query)

        wd_query = wdquery.WikidataQuery(cacheMaxAge=cache_max_age)
        data = wd_query.query(wd_queryset)

        # tolerate a response lacking the status or items entries
        expected_items = []
        if (data.get('status') or {}).get('error') == 'OK':
            expected_items = data.get('items') or []

        creator_dict = {}
        counter = 0
        for q_val in expected_items:
            item_data = self.wd.QtoItemPage(q_val).get()
            if u'P170' in item_data.get('claims'):
                continue
            descr = item_data.get('descriptions').get('en')
            if not descr or not descr.startswith(descr_prefix):
                continue
            creator = descr[len(descr_prefix):]
            if '(' in creator:  # to get rid of disambiguation addition
                creator = creator[:creator.find('(')].strip()
            creator_dict[creator] = creator_dict.get(creator, 0) + 1
            counter += 1
        pywikibot.output(u'Found %d mentions of %d creators' %
                         (counter, len(creator_dict)))

        # output; the with-statement closes the file even if a write fails
        # and items() (not iteritems()) works on both Python 2 and 3
        with codecs.open(u'creatorHitlist.csv', 'w', 'utf-8') as f:
            for creator, count in creator_dict.items():
                f.write(u'%d|%s\n' % (count, creator))

    def logger(self, text):
        """Append a line of text to the logfile.

        The logfile is flushed after each write so that entries are not
        lost should the run crash.

        @param text: text to output
        @type text: str
        """
        line = u'%s\n' % text
        log = self.log
        log.write(line)
        log.flush()