Пример #1
0
 def template_processor(self, page, introtext):
     """Harvest dates from infoboxes, then strip every template.

     Each template found in ``introtext`` whose name matches the
     module-level ``infobox`` pattern has all of its parameter values
     scanned with ``self.dateregexwithyear``.  Every hit is appended to
     ``self.data['infobox']`` as a dict with the keys ``'page'``,
     ``'year'`` (the ``year`` group as an int) and ``'text'``
     (``"<parameter> = <matched text>"``).

     Afterwards everything matched by ``TEMP_REGEX`` is deleted from the
     text; templates that do not match ``infobox`` are only removed.

     :param page: stored unchanged in each result dict.
     :param introtext: wikitext of the introduction.
     :return: ``introtext`` with all ``TEMP_REGEX`` matches removed.
     """
     parsed = pywikibot.extract_templates_and_params(introtext)
     for name, params in parsed:
         if not re.search(infobox, name):
             continue
         for key, value in params.items():
             found = self.dateregexwithyear.search(value)
             if not found:
                 continue
             # We have just found the date we are looking for :-)
             self.data['infobox'].append({
                 'page': page,
                 'year': int(found.group('year')),
                 'text': key + ' = ' + found.group(),
             })
     # Removal (must be repeated for nested templates):
     while TEMP_REGEX.search(introtext):
         introtext = TEMP_REGEX.sub('', introtext)
     return introtext
Пример #2
0
    def procesPage(self, page):
        """Harvest claims for one page from the fields of matching templates.

        The Wikidata item connected to ``page`` is looked up.  For every
        template on the page whose normalised title is in
        ``self.templateTitles``, each field listed in ``self.fields`` becomes
        a claim on the item, unless the item already has a claim for that
        property.  Only the 'wikibase-item' and 'string' datatypes are
        handled; any other datatype is reported and skipped.

        :param page: page whose wikitext is harvested.
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            # TODO FIXME: We should provide an option to create the page
            return

        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            template = pywikibot.Page(page.site, template,
                                      ns=10).title(withNamespace=False)
            if template not in self.templateTitles:
                continue
            # We found the template we were looking for
            for field, value in fielddict.items():
                field = field.strip()
                value = value.strip()
                if field not in self.fields:
                    continue

                # Check if the property isn't already set
                claim = pywikibot.Claim(self.repo, self.fields[field])
                if claim.getID() in item.get().get('claims'):
                    pywikibot.output(
                        u'A claim for %s already exists. Skipping'
                        % claim.getID())
                    # TODO FIXME: This is a very crude way of dupe
                    # checking
                    continue

                if claim.getType() == 'wikibase-item':
                    # Try to extract a valid page
                    match = re.search(pywikibot.link_regex, value)
                    if not match:
                        # Without a wikilink there is no target; adding the
                        # claim anyway would submit a claim with no value.
                        pywikibot.output(
                            u'%s field %s value %s isnt a wikilink. Skipping'
                            % (claim.getID(), field, value))
                        continue
                    try:
                        link = pywikibot.Link(match.group(1))
                        linkedPage = pywikibot.Page(link)
                        if linkedPage.isRedirectPage():
                            linkedPage = linkedPage.getRedirectTarget()
                        linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                        claim.setTarget(linkedItem)
                    except pywikibot.exceptions.NoPage:
                        # linkedItem may never have been bound when the
                        # lookup itself raised, so report the link text.
                        pywikibot.output(
                            '[[%s]] doesn\'t exist so I can\'t link to it'
                            % (match.group(1),))
                        continue
                elif claim.getType() == 'string':
                    claim.setTarget(value)
                else:
                    pywikibot.output("%s is not a supported datatype."
                                     % claim.getType())
                    continue

                pywikibot.output('Adding %s --> %s'
                                 % (claim.getID(), claim.getTarget()))
                item.addClaim(claim)
                # A generator might yield pages from multiple sites
                source = self.getSource(page.site.language())
                if source:
                    claim.addSource(source, bot=True)
Пример #3
0
 def procesPage(self, page):
     """Harvest linked-item claims for one page from its template fields.

     For every template on ``page`` named ``self.templateTitle``
     (underscores treated as spaces), each field listed in ``self.fields``
     whose value contains a wikilink is written to the page's data item
     with ``editclaim``, unless the item already has a claim for that
     property.  The claim value is the ID of the data item of the linked
     page (redirects are followed).

     :param page: page whose wikitext is harvested.
     """
     item = pywikibot.DataPage(page)
     pywikibot.output('Processing %s' % page)
     if not item.exists():
         pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
         # TODO FIXME: We should provide an option to create the page
         return

     # Compiled once per call instead of once per harvested field.
     linkPattern = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
     pagetext = pywikibot.removeDisabledParts(page.get())
     templates = pywikibot.extract_templates_and_params(pagetext)
     for (template, fielddict) in templates:
         if template.replace(u'_', u' ') != self.templateTitle:
             continue
         # We found the template we were looking for
         for field, value in fielddict.items():
             if field not in self.fields:
                 continue
             # Check if the property isn't already set
             claim = self.fields[field]
             if claim in item.get().get('claims'):
                 pywikibot.output(
                     u'A claim for %s already exists. Skipping'
                     % (claim, ))
                 # TODO FIXME: This is a very crude way of dupe
                 # checking
                 continue

             # Try to extract a valid page
             match = linkPattern.search(value)
             if not match:
                 continue
             link = match.group(1)
             try:
                 # NOTE(review): self.site is used as an object here and
                 # called as self.site() below -- confirm which is right.
                 linkedPage = pywikibot.Page(self.site, link)
                 if linkedPage.isRedirectPage():
                     linkedPage = linkedPage.getRedirectTarget()
                 linkedItem = pywikibot.DataPage(linkedPage)
                 pywikibot.output('Adding %s --> %s'
                                  % (claim, linkedItem.getID()))
                 refs = self.setSource(self.site().language())
                 if refs:
                     item.editclaim(str(claim), linkedItem.getID(),
                                    refs=set(refs))
                 else:
                     item.editclaim(str(claim), linkedItem.getID())
             except pywikibot.NoPage:
                 # linkedItem may be unbound (or left over from an earlier
                 # field) when the exception fires, so report the link text.
                 pywikibot.output(
                     "[[%s]] doesn't exist so I can't link to it" % link)
Пример #4
0
 def procesPage(self, page):
     """Harvest linked-item claims for one page from its template fields.

     For every template on ``page`` named ``self.templateTitle``
     (underscores treated as spaces), each field listed in ``self.fields``
     whose value contains a wikilink is written to the page's data item
     with ``editclaim``, unless the item already has a claim for that
     property.  The claim value is the ID of the data item of the linked
     page (redirects are followed).

     :param page: page whose wikitext is harvested.
     """
     item = pywikibot.DataPage(page)
     pywikibot.output('Processing %s' % page)
     if not item.exists():
         pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
         # TODO FIXME: We should provide an option to create the page
         return

     # Compiled once per call instead of once per harvested field.
     linkPattern = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
     pagetext = pywikibot.removeDisabledParts(page.get())
     templates = pywikibot.extract_templates_and_params(pagetext)
     for (template, fielddict) in templates:
         if template.replace(u'_', u' ') != self.templateTitle:
             continue
         # We found the template we were looking for
         for field, value in fielddict.items():
             if field not in self.fields:
                 continue
             # Check if the property isn't already set
             claim = self.fields[field]
             if claim in item.get().get('claims'):
                 pywikibot.output(
                     u'A claim for %s already exists. Skipping'
                     % (claim,))
                 # TODO FIXME: This is a very crude way of dupe
                 # checking
                 continue

             # Try to extract a valid page
             match = linkPattern.search(value)
             if not match:
                 continue
             link = match.group(1)
             try:
                 # NOTE(review): self.site is used as an object here and
                 # called as self.site() below -- confirm which is right.
                 linkedPage = pywikibot.Page(self.site, link)
                 if linkedPage.isRedirectPage():
                     linkedPage = linkedPage.getRedirectTarget()
                 linkedItem = pywikibot.DataPage(linkedPage)
                 pywikibot.output('Adding %s --> %s'
                                  % (claim, linkedItem.getID()))
                 # Resolve the source once; the result is used both for the
                 # truth test and as the single reference.
                 source = self.setSource(self.site().language())
                 if source:
                     item.editclaim(str(claim), linkedItem.getID(),
                                    refs={source})
                 else:
                     item.editclaim(str(claim), linkedItem.getID())
             except pywikibot.NoPage:
                 # linkedItem may be unbound (or left over from an earlier
                 # field) when the exception fires, so report the link text.
                 pywikibot.output(
                     "[[%s]] doesn't exist so I can't link to it" % link)
Пример #5
0
 def removeTopTemplates(self, text):
     """Cut one template occurrence off the start of ``text``.

     The text is stripped, then each template found in it is turned into a
     regex by ``self.getRegexForTemplate``.  The first regex match that the
     stripped text starts with is removed and the stripped remainder is
     returned.  If no match sits at the start, the stripped text is
     returned unchanged.
     """
     stripped = text.strip()
     for tplName, tplParams in pywikibot.extract_templates_and_params(stripped):
         pattern = self.getRegexForTemplate(tplName, tplParams)
         for occurrence in re.findall(pattern, stripped):
             if not stripped.startswith(occurrence):
                 continue
             remainder = stripped[len(occurrence):]
             return remainder.strip()
     return stripped
Пример #6
0
 def removeTopTemplates(self, text):
     """Return ``text`` stripped, minus one template at its very start.

     Every template found in the stripped text is converted to a regex via
     ``self.getRegexForTemplate``; as soon as one of that regex's matches
     is a prefix of the text, the prefix is dropped and the rest is
     returned stripped.  Without such a prefix the stripped text comes
     back as is.
     """
     body = text.strip()
     found = pywikibot.extract_templates_and_params(body)
     for templateName, templateParams in found:
         regex = self.getRegexForTemplate(templateName, templateParams)
         for hit in re.findall(regex, body):
             if body.startswith(hit):
                 tail = body[len(hit):]
                 return tail.strip()
     return body
Пример #7
0
 def cosmeticChanges(self, sectionText):
     """Rewrite selected templates in a section as ``{{subst:...``.

     The section is stripped first.  For each template in it whose name is
     listed in ``archiveConfig.templatesToSubstitute``, every match of the
     regex from ``self.getRegexForTemplate`` is replaced by the same text
     with its first two characters (presumably the opening ``{{``) swapped
     for ``{{subst:``.

     :return: the stripped, rewritten section text.
     """
     sectionText = sectionText.strip()
     for tplName, tplParams in pywikibot.extract_templates_and_params(sectionText):
         if tplName not in archiveConfig.templatesToSubstitute:
             continue
         pattern = self.getRegexForTemplate(tplName, tplParams)
         for found in re.findall(pattern, sectionText):
             sectionText = sectionText.replace(found, u'{{subst:' + found[2:])
     return sectionText
Пример #8
0
 def cosmeticChanges(self, sectionText):
     """Substitute the configured templates inside one section.

     Works on the stripped section text.  Templates whose name appears in
     ``archiveConfig.templatesToSubstitute`` are located with the regex
     returned by ``self.getRegexForTemplate``; each located piece of text
     is replaced by ``{{subst:`` followed by that text minus its first two
     characters (presumably the opening ``{{``).

     :return: the stripped section text after all replacements.
     """
     result = sectionText.strip()
     parsed = pywikibot.extract_templates_and_params(result)
     for templateName, templateParams in parsed:
         if templateName not in archiveConfig.templatesToSubstitute:
             continue
         regex = self.getRegexForTemplate(templateName, templateParams)
         for original in re.findall(regex, result):
             substituted = u'{{subst:' + original[2:]
             result = result.replace(original, substituted)
     return result
Пример #9
0
def processCreatorTemplate(name, conf):
    """Build an author credit from the ``Creator`` template on a page.

    The page called ``name`` is loaded (redirects are followed) and its
    templates are scanned for ones named ``Creator``.  If the
    ``Occupation`` parameter of such a template contains one of the keys of
    ``conf['validOccupations']``, the result is
    ``formatAuthor(name) + " (" + <mapped value> + ")"``.

    :param name: title of the creator page.
    :param conf: mapping with a ``'validOccupations'`` dict from occupation
        substring to the label put in parentheses.
    :return: the formatted credit, or ``u""`` when the page does not exist
        or no ``Creator`` template has a valid occupation.
    """
    # Unused below but kept: presumably a scope tracer whose construction
    # has side effects -- confirm before removing.
    trace = Trace(sys._getframe().f_code.co_name)
    site = pywikibot.Site()
    creator = pywikibot.Page(site, name)
    if not creator.exists():
        return u""
    while creator.isRedirectPage():
        creator = creator.getRedirectTarget()
    tls = pywikibot.extract_templates_and_params(creator.get())
    for (template, params) in tls:
        if template != u"Creator":
            continue
        # A Creator template without an Occupation field is treated as
        # having no occupation instead of raising KeyError.
        occupation = params.get(u"Occupation", u"")
        for valid in conf['validOccupations']:
            if valid in occupation:
                return (formatAuthor(name) + u" ("
                        + conf['validOccupations'][valid] + u")")
    return u""
 def procesPage(self, page):
     """Harvest linked-item claims for one page from its template fields.

     For every template on ``page`` named ``self.templateTitle``
     (underscores treated as spaces), each field listed in ``self.fields``
     whose value contains a wikilink becomes a claim on the page's item,
     targeting the item of the linked page (redirects are followed).
     Properties the item already has a claim for are skipped.  When
     ``self.source`` is set it is attached to the new claim as a source.

     :param page: page whose wikitext is harvested.
     """
     item = pywikibot.ItemPage.fromPage(page)
     pywikibot.output('Processing %s' % page)
     if not item.exists():
         pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
         # TODO FIXME: We should provide an option to create the page
         return

     pagetext = page.get()
     templates = pywikibot.extract_templates_and_params(pagetext)
     for (template, fielddict) in templates:
         if template.replace(u'_', u' ') != self.templateTitle:
             continue
         # We found the template we were looking for
         for field, value in fielddict.items():
             if field not in self.fields:
                 continue
             # Check if the property isn't already set
             claim = pywikibot.Claim(self.repo, self.fields[field])
             if claim.getID() in item.get().get('claims'):
                 pywikibot.output(
                     u'A claim for %s already exists. Skipping'
                     % (claim.getID(),))
                 # TODO FIXME: This is a very crude way of dupe checking
                 continue

             # Try to extract a valid page
             match = re.search(pywikibot.link_regex, value)
             if not match:
                 continue
             linkText = match.group(1)
             try:
                 link = pywikibot.Link(linkText)
                 linkedPage = pywikibot.Page(link)
                 if linkedPage.isRedirectPage():
                     linkedPage = linkedPage.getRedirectTarget()
                 linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                 claim.setTarget(linkedItem)
                 pywikibot.output(
                     'Adding %s --> %s'
                     % (claim.getID(), claim.getTarget().getID()))
                 item.addClaim(claim)
                 if self.source:
                     claim.addSource(self.source, bot=True)
             except pywikibot.exceptions.NoPage:
                 # linkedItem may be unbound (or left over from an earlier
                 # field) when the lookup raised, so report the link text.
                 pywikibot.output(
                     '[[%s]] doesn\'t exist so I can\'t link to it'
                     % (linkText,))
Пример #11
0
    def procesPage(self, site, page):
        """Harvest template fields of one page into Wikidata claims.

        For every template on ``page`` whose name (after following a
        template redirect) equals ``self.templateTitle``, each non-empty
        field listed in ``self.fields`` is added to the page's item as a
        claim, unless the item already has a claim for that property.
        ``self.fields`` maps field names to properties written as
        ``"[[Pnnn]]"``; the surrounding brackets are sliced off before the
        property ID is used.

        * ``[[P585]]`` values are parsed as dates and stored as ``WbTime``.
        * Any other value is taken as a page title (optionally wrapped in
          ``[[...]]``) and the claim targets that page's item.  The target
          must satisfy ``self.valuesConstraints`` and, for ``[[P541]]``,
          its first ``P31`` claim must be ``Q4164871``.

        When ``self.source`` is set it is attached to each new claim.

        :param site: site whose data repository receives the new claims.
        :param page: page whose wikitext is harvested.
        """
        pywikibot.output('Processing %s' % page)
        try:
            item = pywikibot.ItemPage.fromPage(page)
        except pywikibot.exceptions.NoPage:
            pywikibot.output(u'No wikidata for: %s ' % page)
            return

        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :' % page)
            # TODO FIXME: We should provide an option to create the page
            return

        pagetext = pywikibot.removeDisabledParts(page.get())
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            linkedTemplate = pywikibot.Page(self.site, template, ns=10)
            try:
                if linkedTemplate.isRedirectPage():
                    target = linkedTemplate.getRedirectTarget()
                    pywikibot.output(
                        u'Template redirection from %s to %s'
                        % (template, target.title()))
                    # Let the API drop the namespace: slicing off a fixed
                    # 9 characters is only right for the "Template:" prefix
                    # and corrupts names on wikis with another prefix.
                    template = target.title(withNamespace=False)
            except pywikibot.exceptions.InvalidTitle:
                pywikibot.output("[[%s]]  contains illegal char(s)"
                                 % template)

            if template.replace(u'_', u' ') != self.templateTitle:
                continue

            # We found the template we were looking for
            for field, value in fielddict.items():
                field = field.strip()
                # This field contains something useful for us
                if value == "" or field not in self.fields:
                    continue

                # Check if the property isn't already set
                claim = self.fields[field]
                propertyId = claim[2:-2]
                if propertyId in item.get().get('claims'):
                    pywikibot.output(
                        u'A claim for %s already exists. Skipping'
                        % (claim,))
                    # TODO FIXME: This is a very crude way of dupe
                    # checking
                    continue

                value = value.strip()
                # Pre-bind the names the handlers below print, so an
                # exception raised early cannot turn into a NameError.
                link = linkedPage = value
                try:
                    # Date treatment
                    if claim == "[[P585]]" and value != "":
                        try:
                            pywikibot.output(u'      Date: <%s> ' % value)
                            laDate = parser.parse(value)
                            pywikibot.output(u'      Date: <%s> ' % laDate)
                            repo = site.data_repository()
                            theClaim = pywikibot.Claim(repo, propertyId)
                            pywikibot.output('Adding %s --> %s'
                                             % (claim, laDate))
                            laDate = pywikibot.WbTime(year=laDate.year,
                                                      month=laDate.month,
                                                      day=laDate.day)
                            theClaim.setTarget(laDate)
                            item.addClaim(theClaim)
                            if self.source:
                                theClaim.addSource(self.source, bot=True)
                        except ValueError:
                            pywikibot.output(
                                u'      Impossible to parse this date : %s '
                                % value)
                        # A date field is never treated as a page link.
                        continue

                    if value[:2] == "[[" and value[-2:] == "]]":
                        link = value[2:-2]
                    else:
                        link = value
                    if link == "":
                        continue

                    linkedPage = pywikibot.Page(self.site, link)
                    if linkedPage.isRedirectPage():
                        linkedPage = linkedPage.getRedirectTarget()
                    linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                    # Must be called before any item data is accessed.
                    linkedItem.get()
                    if not linkedItem.exists():
                        pywikibot.output(
                            '%s doesn\'t have a wikidata item :'
                            % linkedPage)
                        continue

                    # Value constraints treatment
                    if (claim in self.valuesConstraints
                            and linkedItem.getID()
                            not in self.valuesConstraints[claim]):
                        pywikibot.output(
                            u'The value of the property %s is %s does not respect the constraint %s'
                            % (claim,
                               linkedItem.title(),
                               self.valuesConstraints[claim]))
                        continue

                    # Instance-of constraint treatment
                    if claim == "[[P541]]":
                        if linkedItem.claims and ('P31' in linkedItem.claims):
                            p31Target = linkedItem.claims['P31'][0].getTarget()
                            if p31Target.title(withNamespace=False) != "Q4164871":
                                pywikibot.output(
                                    u'The P31 value is not Q4164871 but %s '
                                    % p31Target.title(withNamespace=True))
                                continue
                        else:
                            pywikibot.output(u'The P31 value is missing ')
                            continue

                    pywikibot.output('Adding %s --> %s'
                                     % (claim, linkedItem.getID()))
                    repo = site.data_repository()
                    theClaim = pywikibot.Claim(repo, propertyId)
                    theClaim.setTarget(linkedItem)
                    item.addClaim(theClaim)
                    if self.source:
                        theClaim.addSource(self.source, bot=True)
                except pywikibot.NoPage:
                    pywikibot.output(
                        "[[%s]] doesn't exist so I can't link to it"
                        % linkedPage)
                except pywikibot.exceptions.InvalidTitle:
                    pywikibot.output(
                        "[[%s]] is an invalid title"
                        % link)
Пример #12
0
    def processPage(self, page):
        """Harvest claims for one page from the fields of matching templates.

        Nothing is done when the page has no item, or when the item already
        has claims for every property in ``self.fields``.  Otherwise each
        template whose normalised title is in ``self.templateTitles`` is
        scanned: every non-empty field listed in ``self.fields`` becomes a
        claim on the item unless a claim for that property already exists.

        Handled datatypes are 'wikibase-item' (value must contain a wikilink,
        resolved through ``self._template_link_target``), 'string' and
        'commonsMedia' (value is looked up as a file on Commons).  Anything
        else is reported and skipped.  A source from ``self.getSource`` is
        attached to each new claim when available.

        :param page: page whose wikitext is harvested.
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            #TODO FIXME: We should provide an option to create the page
            return
        item.get()
        if set(self.fields.values()) <= set(item.claims.keys()):
            pywikibot.output(u'%s item %s has claims for all properties. Skipping' % (page, item.title()))
            return

        wikitext = page.get()
        for (template, fielddict) in pywikibot.extract_templates_and_params(wikitext):
            # Clean up template
            try:
                template = pywikibot.Page(page.site, template,
                                          ns=10).title(withNamespace=False)
            except pywikibot.exceptions.InvalidTitle:
                pywikibot.error(u"Failed parsing template; '%s' should be the template name." % template)
                continue
            if template not in self.templateTitles:
                continue

            # We found the template we were looking for
            for rawField, rawValue in fielddict.items():
                field = rawField.strip()
                value = rawValue.strip()
                if not (field and value):
                    continue
                # Only fields we were asked to harvest are of interest
                if field not in self.fields:
                    continue

                # Check if the property isn't already set
                claim = pywikibot.Claim(self.repo, self.fields[field])
                if claim.getID() in item.get().get('claims'):
                    pywikibot.output(
                        u'A claim for %s already exists. Skipping'
                        % claim.getID())
                    # TODO: Implement smarter approach to merging
                    # harvested values with existing claims esp.
                    # without overwriting humans unintentionally.
                    continue

                if claim.type == 'wikibase-item':
                    # Try to extract a valid page
                    wikilink = re.search(pywikibot.link_regex, value)
                    if not wikilink:
                        pywikibot.output(u'%s field %s value %s isnt a wikilink. Skipping' % (claim.getID(), field, value))
                        continue
                    target = self._template_link_target(item, wikilink.group(1))
                    if not target:
                        continue
                    claim.setTarget(target)
                elif claim.type == 'string':
                    claim.setTarget(value.strip())
                elif claim.type == 'commonsMedia':
                    commons = pywikibot.Site("commons", "commons")
                    fileLink = pywikibot.Link(value, source=commons, defaultNamespace=6)
                    mediaPage = pywikibot.ImagePage(fileLink)
                    if mediaPage.isRedirectPage():
                        mediaPage = pywikibot.ImagePage(mediaPage.getRedirectTarget())
                    if not mediaPage.exists():
                        pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it' % (mediaPage.title(),))
                        continue
                    claim.setTarget(mediaPage)
                else:
                    pywikibot.output("%s is not a supported datatype." % claim.type)
                    continue

                pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
                item.addClaim(claim)
                # A generator might yield pages from multiple sites
                reference = self.getSource(page.site)
                if reference:
                    claim.addSource(reference, bot=True)
Пример #13
0
    def procesPage(self, site, page):
        """
        Proces a single page
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :' % page)
            #TODO FIXME: We should provide an option to create the page
        else:
            pagetext = page.get()
            pagetext = pywikibot.removeDisabledParts(pagetext)
            templates = pywikibot.extract_templates_and_params(pagetext)
            for (template, fielddict) in templates:
                # We found the template we were looking for
                linkedTemplate = pywikibot.Page(self.site, template, ns=10)
                if linkedTemplate.isRedirectPage():
                   template2 = linkedTemplate.getRedirectTarget().title()
                   pywikibot.output(
                                    u'Template redirection from %s to %s'
                                    % (template, template2))
                   template = template2[9:]
                if template.replace(u'_', u' ') == self.templateTitle:
                     for field, value in fielddict.items():
                        # This field contains something useful for us
                        # pywikibot.output('    attribut %s' % field)
                        if (value != "") and (field in self.fields):
                            # Check if the property isn't already set
                            claim = self.fields[field]
                            if claim[2:-2] in item.get().get('claims'):
                                pywikibot.output(
                                    u'A claim for %s already exists. Skipping'
                                    % (claim,))
                                # TODO FIXME: This is a very crude way of dupe
                                # checking
                            else:
                                # Try to extract a valid page
                                match = re.search(re.compile(
                                    r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]'),
                                                  value)
                                pywikibot.output(u'      cherche %s ' % value)
                                if True:
                                    try:
                                        #Date treatement
                                        #if claim == "[[P585]]":
                                            #try:
                                                #pywikibot.output(u'      Date: %s ' % value)
                                                #pywikibot.output(u'      Date: %s ' % parser.parse(value))
                                                #theClaim = pywikibot.Claim(repo, claim[2:-2])
                                                #theClaim.setTarget(parser.parse(value))
                                                #item.addClaim(theClaim)           
                                            #except ValueError
                                                #pywikibot.output(u'      Impossible to parse this date : %s ' % value)
                                                #continue
					      
                                        #continue
                                        if value[:2] == "[[" and value[-2:] == "]]":
					     link = value[2:-2]
                                        else:
                                            link = value
                                        #link = match.group(1)
                                        linkedPage = pywikibot.Page(self.site, link)
                                        if linkedPage.isRedirectPage():
                                            linkedPage = linkedPage.getRedirectTarget()
                                        pywikibot.output(u'      linkedPage %s ' % linkedPage)
                                        linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                                        linkedItem.get()
                                        if not linkedItem.exists():
					   pywikibot.output('%s doesn\'t have a wikidata item :' % linkedPage)
					   continue
                                        
                                        #value constraints treatement
                                        if (claim in self.valuesConstraints) and (linkedItem.getID() not in  self.valuesConstraints[claim]):
                                             pywikibot.output(u'The value of the property %s is %s does not respect the constraint %s' % 
                                                                       (claim,
                                                                       linkedItem.title(),
                                                                       self.valuesConstraints[claim]))
                                             continue
                                        
                                        #pywikibot.output(u'      linkedItem %s ' % linkedItem)
                                        pywikibot.output(u'      linkedItem.getID() %s ' % linkedItem.title()[1:])
                                        pywikibot.output('Adding %s --> %s'
                                                         % (claim,
                                                            linkedItem.getID()))
                                        repo = site.data_repository() # utile self.repo existe ?
                    
                                        theClaim = pywikibot.Claim(repo, claim[2:-2])
                                        theClaim.setTarget(linkedItem)
                                        item.addClaim(theClaim)
                                        if self.source:
                                           theClaim.addSource(self.source, bot=True)
                                    except pywikibot.NoPage:
                                        pywikibot.output(
                                            "[[%s]] doesn't exist so I can't link to it"
                                            % linkedItem.title())
                                    except pywikibot.exceptions.InvalidTitle:
                                        pywikibot.output(
                                            "[[%s]] is an invalid title"
                                            % link)
Пример #14
0
    def procesPage(self, page):
        """
        Process a single page.

        Looks up the Wikidata item connected to ``page``, parses the page
        text for the template named ``self.templateTitle`` and, for every
        template parameter listed in ``self.fields``, adds a claim to the
        item unless the item already has a claim for that property.

        ``self.fields`` maps template parameter names to the property
        identifiers handed to ``pywikibot.Claim``.  Only the datatypes
        ``wikibase-item`` and ``string`` are handled; anything else is
        reported and skipped.  When ``self.source`` is truthy it is attached
        to every claim that gets added.

        @param page: the page to harvest template parameters from
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            #TODO FIXME: We should provide an option to create the page
            return

        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Only the configured template is of interest
            if template.replace(u'_', u' ') != self.templateTitle:
                continue
            for field, value in fielddict.items():
                # Skip parameters that are not mapped to a property
                if field not in self.fields:
                    continue

                claim = pywikibot.Claim(self.repo, self.fields[field])
                # Check if the property isn't already set
                if claim.getID() in item.get().get('claims'):
                    pywikibot.output(
                        u'A claim for %s already exists. Skipping'
                        % claim.getID())
                    # TODO FIXME: This is a very crude way of dupe
                    # checking
                    continue

                if claim.getType() == 'wikibase-item':
                    # Try to extract a valid page
                    match = re.search(pywikibot.link_regex, value)
                    if not match:
                        # Without a wikilink there is no target; adding the
                        # claim anyway would submit a claim with no value.
                        pywikibot.output(
                            u'%s field %s value %s isnt a wikilink. Skipping'
                            % (claim.getID(), field, value))
                        continue
                    link_text = match.group(1)
                    try:
                        link = pywikibot.Link(link_text)
                        linkedPage = pywikibot.Page(link)
                        if linkedPage.isRedirectPage():
                            linkedPage = linkedPage.getRedirectTarget()
                        linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                        claim.setTarget(linkedItem)
                    except pywikibot.exceptions.NoPage:
                        # Report the link text: linkedItem may not have been
                        # assigned yet when the exception is raised.
                        pywikibot.output(
                            '[[%s]] doesn\'t exist so I can\'t link to it'
                            % (link_text, ))
                        continue
                elif claim.getType() == 'string':
                    claim.setTarget(value.strip())
                else:
                    pywikibot.output(
                        "%s is not a supported datatype." % claim.getType())
                    continue

                pywikibot.output(
                    'Adding %s --> %s' %
                    (claim.getID(), claim.getTarget()))
                item.addClaim(claim)
                if self.source:
                    claim.addSource(self.source, bot=True)
Пример #15
0
    def procesPage(self, site, page):
        """
        Process a single page.

        Looks up the Wikidata item connected to ``page``, parses the page
        text (with disabled parts removed) for the template named
        ``self.templateTitle`` and, for every non-empty template parameter
        listed in ``self.fields``, adds an item-valued claim unless the item
        already has a claim for that property.  Template redirects are
        followed before the template name is compared.

        ``self.fields`` maps template parameter names to property references;
        the first and last two characters of each reference are stripped to
        obtain the property id (presumably the form is ``[[P123]]`` - verify
        against the caller).  ``self.valuesConstraints`` may map such a
        reference to the collection of item ids allowed as its value.  When
        ``self.source`` is truthy it is attached to every claim that is added.

        @param site: site whose data repository receives the new claims
        @param page: the page to harvest template parameters from
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :' % page)
            #TODO FIXME: We should provide an option to create the page
            return

        pagetext = page.get()
        pagetext = pywikibot.removeDisabledParts(pagetext)
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Follow template redirects so aliases of the wanted template
            # are recognised as well.
            linkedTemplate = pywikibot.Page(self.site, template, ns=10)
            if linkedTemplate.isRedirectPage():
                redirectTarget = linkedTemplate.getRedirectTarget()
                pywikibot.output(u'Template redirection from %s to %s' %
                                 (template, redirectTarget.title()))
                # Let pywikibot drop the namespace prefix: its length
                # depends on the wiki language, so slicing a fixed number
                # of characters only works where the prefix is "Template:".
                template = redirectTarget.title(withNamespace=False)

            # Only the configured template is of interest
            if template.replace(u'_', u' ') != self.templateTitle:
                continue

            for field, value in fielddict.items():
                # Skip empty values and parameters not mapped to a property
                if value == "" or field not in self.fields:
                    continue

                claim = self.fields[field]
                # Check if the property isn't already set
                if claim[2:-2] in item.get().get('claims'):
                    pywikibot.output(
                        u'A claim for %s already exists. Skipping'
                        % (claim, ))
                    # TODO FIXME: This is a very crude way of dupe
                    # checking
                    continue

                pywikibot.output(u'      cherche %s ' % value)
                # TODO: date-valued properties (e.g. P585) are not handled;
                # every value is treated as a link to another page.

                # Accept both "[[Title]]" and a bare "Title"
                if value[:2] == "[[" and value[-2:] == "]]":
                    link = value[2:-2]
                else:
                    link = value

                try:
                    linkedPage = pywikibot.Page(self.site, link)
                    if linkedPage.isRedirectPage():
                        linkedPage = linkedPage.getRedirectTarget()
                    pywikibot.output(u'      linkedPage %s ' % linkedPage)
                    linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                    linkedItem.get()
                    if not linkedItem.exists():
                        pywikibot.output(
                            '%s doesn\'t have a wikidata item :'
                            % linkedPage)
                        continue

                    # Value constraints: skip targets outside the allowed set
                    if (claim in self.valuesConstraints and
                            linkedItem.getID()
                            not in self.valuesConstraints[claim]):
                        pywikibot.output(
                            u'The value of the property %s is %s does not respect the constraint %s'
                            % (claim, linkedItem.title(),
                               self.valuesConstraints[claim]))
                        continue

                    pywikibot.output(
                        u'      linkedItem.getID() %s ' %
                        linkedItem.title()[1:])
                    pywikibot.output(
                        'Adding %s --> %s' %
                        (claim, linkedItem.getID()))
                    # NOTE(review): self.repo may already provide this
                    # repository - confirm before replacing.
                    repo = site.data_repository()

                    theClaim = pywikibot.Claim(repo, claim[2:-2])
                    theClaim.setTarget(linkedItem)
                    item.addClaim(theClaim)
                    if self.source:
                        theClaim.addSource(self.source, bot=True)
                except pywikibot.NoPage:
                    # Report the link text: linkedItem may not have been
                    # assigned yet when the exception is raised.
                    pywikibot.output(
                        "[[%s]] doesn't exist so I can't link to it"
                        % link)
                except pywikibot.exceptions.InvalidTitle:
                    pywikibot.output(
                        "[[%s]] is an invalid title" %
                        link)
Пример #16
0
    def processPage(self, page):
        """
        Harvest template parameters of one page into its Wikidata item.

        Nothing is done when the page has no Wikidata item, or when the item
        already holds claims for every property listed in ``self.fields``.
        Otherwise each template on the page whose normalised name is in
        ``self.templateTitles`` is scanned, and every non-empty parameter
        mapped in ``self.fields`` becomes a new claim, provided the item has
        no claim for that property yet.  Handled datatypes are
        ``wikibase-item``, ``string`` and ``commonsMedia``.  The source
        returned by ``self.getSource(page.site)`` is attached when truthy.

        @param page: the page to harvest template parameters from
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            #TODO FIXME: We should provide an option to create the page
            return

        item.get()
        if set(self.fields.values()) <= set(item.claims.keys()):
            pywikibot.output(
                u'%s item %s has claims for all properties. Skipping' %
                (page, item.title()))
            return

        wikitext = page.get()
        parsed = pywikibot.extract_templates_and_params(wikitext)
        for (raw_name, params) in parsed:
            # Normalise the template name through a Page in namespace 10
            try:
                template_page = pywikibot.Page(page.site, raw_name, ns=10)
                template = template_page.title(withNamespace=False)
            except pywikibot.exceptions.InvalidTitle:
                pywikibot.error(
                    u"Failed parsing template; '%s' should be the template name."
                    % raw_name)
                continue

            # Not one of the templates we were looking for
            if template not in self.templateTitles:
                continue

            for field, value in params.items():
                field = field.strip()
                value = value.strip()
                if not (field and value):
                    continue

                # Parameter is not mapped to any property
                if field not in self.fields:
                    continue

                claim = pywikibot.Claim(self.repo, self.fields[field])
                # Leave properties alone that already carry a claim
                if claim.getID() in item.get().get('claims'):
                    pywikibot.output(
                        u'A claim for %s already exists. Skipping'
                        % claim.getID())
                    # TODO: Implement smarter approach to merging
                    # harvested values with existing claims esp.
                    # without overwriting humans unintentionally.
                    continue

                if claim.type == 'wikibase-item':
                    # The value has to contain a wikilink to resolve
                    found = re.search(pywikibot.link_regex, value)
                    if not found:
                        pywikibot.output(
                            u'%s field %s value %s isnt a wikilink. Skipping'
                            % (claim.getID(), field, value))
                        continue

                    target_item = self._template_link_target(
                        item, found.group(1))
                    if not target_item:
                        continue

                    claim.setTarget(target_item)
                elif claim.type == 'string':
                    claim.setTarget(value.strip())
                elif claim.type == 'commonsMedia':
                    # Resolve the value as a file (namespace 6) on Commons
                    commons = pywikibot.Site("commons", "commons")
                    media_link = pywikibot.Link(
                        value, source=commons, defaultNamespace=6)
                    media = pywikibot.ImagePage(media_link)
                    if media.isRedirectPage():
                        media = pywikibot.ImagePage(
                            media.getRedirectTarget())
                    if not media.exists():
                        pywikibot.output(
                            '[[%s]] doesn\'t exist so I can\'t link to it'
                            % (media.title(), ))
                        continue
                    claim.setTarget(media)
                else:
                    pywikibot.output(
                        "%s is not a supported datatype." % claim.type)
                    continue

                pywikibot.output(
                    'Adding %s --> %s' %
                    (claim.getID(), claim.getTarget()))
                item.addClaim(claim)
                # A generator might yield pages from multiple sites
                source = self.getSource(page.site)
                if source:
                    claim.addSource(source, bot=True)