Exemplo n.º 1
0
 def date_from_match(self, offset, match):
     """Build a sling.Date from a regex match holding one of two layouts.

     Group numbers are shifted by `offset`. When group 1 matched, groups
     1-3 are read as day, month name and year ("DD MONTH YYYY"); otherwise
     groups 4-6 are read as month name, day and year ("MONTH DD, YYYY").
     The month name is translated through self.months.
     """
     def group(index):
         return match.group(index + offset)

     if group(1):
         # DD MONTH YYYY format
         day, month_name, year = group(1), group(2), group(3)
     else:
         # MONTH DD, YYYY format
         month_name, day, year = group(4), group(5), group(6)
     return sling.Date(int(year), self.months[month_name], int(day))
Exemplo n.º 2
0
    def for_item(self, item, prop, value, store=None):
        """Classify a proposed fact against the facts `item` already has.

        `prop` is either a single sling.Frame or an iterable of them (a
        property path); `value` must be a sling.Frame. When `store` is
        omitted, a fresh local store over self.kb is created.

        Returns a (FactMatchType, payload) tuple. The payload is `item` for
        NEW, EXACT and SUBSUMES_EXISTING; (item, existing fact) for
        SUBSUMED_BY_EXISTING; (item, first existing fact) for CONFLICT; and
        (item, number of existing facts) for ADDITIONAL.
        """
        assert isinstance(value, sling.Frame)
        path = [prop] if isinstance(prop, sling.Frame) else list(prop)
        last_prop = path[-1]

        if store is None:
            store = sling.Store(self.kb)

        # Facts currently on the item, looked up without any backoff.
        current = self._existing_facts(store, item, path, False)
        if len(current) == 0:
            return (FactMatchType.NEW, item)
        if value in current:
            return (FactMatchType.EXACT, item)

        # Existing dates may be stored as int or string and so never compare
        # equal to the sling.Frame in 'value'; compare their date values.
        if self._date_valued(last_prop):
            proposed_date = sling.Date(value)
            existing_dates = [sling.Date(fact) for fact in current]
            wanted = None
            for known in existing_dates:
                wanted = proposed_date.value()
                if known.value() == wanted:
                    return (FactMatchType.EXACT, item)

        # Does the proposed fact subsume one that is already there?
        with_closure = self._existing_facts(store, item, path, True)
        if value in with_closure:
            return (FactMatchType.SUBSUMES_EXISTING, item)

        # Is the proposed fact subsumed by an existing one? Dates again need
        # their own comparison.
        if self._date_valued(last_prop):
            for known in existing_dates:
                if self._finer_date(proposed_date, known):
                    return (FactMatchType.SUBSUMED_BY_EXISTING, (item, known))
        else:
            for fact in current:
                if isinstance(fact, sling.Frame) and \
                   self.subsumes(store, last_prop, fact, value):
                    return (FactMatchType.SUBSUMED_BY_EXISTING, (item, fact))

        # A single unique-valued property that already has a value conflicts.
        if len(path) == 1 and path[0] in self.unique_properties:
            return (FactMatchType.CONFLICT, (item, current[0]))

        # Otherwise the fact is an extra value; report the existing fanout.
        return (FactMatchType.ADDITIONAL, (item, len(current)))
Exemplo n.º 3
0
 def precise_date(self, dates):
     """Tell whether `dates` should be left alone.

     Returns False when `dates` is None or empty. Returns True as soon as a
     second entry is reached, or when the first entry is not None and its
     sling.Date precision is greater than sling.YEAR. Otherwise False.
     """
     if dates is None:
         return False
     for position, candidate in enumerate(dates):
         if position > 0:
             # more than one date - don't try to fix
             return True
         if candidate is None:
             continue
         if sling.Date(candidate).precision > sling.YEAR:
             return True
     return False
Exemplo n.º 4
0
 def store_records(self, records, batch_size=3):
   """Add the facts of each record to Wikidata as new claims.

   `records` yields (item_str, record) pairs; each record is parsed with
   self.rs and is expected to hold an item, its facts and a provenance
   frame. Records numbered below flags.arg.first are skipped and processing
   stops after flags.arg.last. A fact is only stored when the item neither
   has nor (according to self.ever_had_prop) ever had the property.

   `batch_size` is compared against the number of stored claims at the start
   of each record, so a record with several facts can push the total past it.
   Outcomes are reported through self.log_status_skip / log_status_stored.
   """
   updated = 0
   recno = 0
   for item_str, record in records:
     recno += 1
     # Honor the [first, last] record window given on the command line.
     if recno < flags.arg.first:
       print "Skipping record number", recno
       continue
     if recno > flags.arg.last: break
     if updated >= batch_size:
       print "Hit batch size of", batch_size
       break
     print "Processing", item_str
     fact_record = self.rs.parse(record)
     item = fact_record[self.n_item]
     facts = fact_record[self.n_facts]
     provenance = fact_record[self.n_provenance]
     # The record key must name the same item as the record body.
     if self.rs[item_str] != item:
       self.log_status_skip(item, facts, "inconsistent input")
       continue # read next record in the file
     wd_item = pywikibot.ItemPage(self.repo, item_str)
     if wd_item.isRedirectPage():
       self.log_status_skip(item, facts, "redirect page")
       continue
     # Current claims of the live item, keyed by property id string.
     wd_claims = wd_item.get().get('claims')
     # Process facts / claims
     for prop, val in facts:
       prop_str = str(prop)
       # Single-fact frame used only for logging this property/value pair.
       fact = self.rs.frame({prop: val})
       if prop_str in wd_claims:
         self.log_status_skip(item, fact, "already has property")
         continue
       if self.ever_had_prop(wd_item, prop_str):
         self.log_status_skip(item, fact, "already had property")
         continue
       claim = pywikibot.Claim(self.repo, prop_str)
       if claim.type == "time":
         date = sling.Date(val) # parse date from record
         precision = precision_map[date.precision] # sling to wikidata
         # NOTE(review): only the year is passed on, whatever the precision.
         target = pywikibot.WbTime(year=date.year, precision=precision)
       elif claim.type == 'wikibase-item':
         target = pywikibot.ItemPage(self.repo, val)
       else:
         # TODO add location and possibly other types
         print "Error: Unknown claim type", claim.type
         continue
       claim.setTarget(target)
       cat_str = str(provenance[self.n_category])
       # Edit summary is the provenance method followed by the category.
       summary = provenance[self.n_method] + " " + cat_str
       wd_item.addClaim(claim, summary=summary)
       # Revision id is read right after the claim was added, before sources.
       rev_id = str(wd_item.latest_revision_id)
       claim.addSources(self.get_sources(cat_str))
       self.log_status_stored(item, fact, rev_id)
       updated += 1
     print item, recno
   print "Last record:", recno, "Total:", updated, "records updated."
Exemplo n.º 5
0
 def get_name(self, x):
     """Return a display name for `x`, or None when there is none.

     None yields None. An int is treated as a date and rendered through
     self.cal. Any other value must be a string starting with "Q" (else None
     is returned); its name is read from self.kb and decoded from UTF-8,
     ignoring errors, when it is stored as bytes.
     """
     if x is None:
         return None
     if isinstance(x, int):
         return self.cal.str(sling.Date(x))
     if not x.startswith("Q"):
         return None
     label = self.kb[x].name
     if isinstance(label, bytes):
         return label.decode("utf-8", errors="ignore")
     return label
Exemplo n.º 6
0
 def process_log_data(self, files):
   no_of_files = len(files)
   file_no = 0
   rs = sling.Store(self.store)
   skipped = 0
   updated = 0
   errors = 0
   deleted = 0
   changed = 0
   for r_file in files:
     file_no += 1
     print "Processing file {:4d} of {} ({})".format(file_no,
                                                     no_of_files,
                                                     r_file)
     reader = sling.RecordReader(r_file)
     for item_str, record in reader:
       rec = rs.parse(record)
       status = rec[self.n_status]
       if self.n_skipped in status:
         skipped += 1
         continue
       elif self.n_revision not in status:
         print "ERROR - unknown status"
         errors += 1
         continue
       updated += 1
       wd_item = pywikibot.ItemPage(self.repo, item_str)
       wd_claims = wd_item.get().get('claims')
       facts = rec[self.n_facts]
       for prop, val in facts:
         p_claims =  wd_claims.get(str(prop), [])
         if not p_claims:
           deleted += 1
           continue
         for wd_claim in p_claims:
           if wd_claim.type == "time":
             date = sling.Date(val) # parse date from record
             precision = precision_map[date.precision] # sling to wikidata
             target = pywikibot.WbTime(year=date.year, precision=precision)
           elif wd_claim.type == 'wikibase-item':
             target = pywikibot.ItemPage(self.repo, val)
           else:
             # TODO add location and possibly other types
             print "Error: Unknown claim type", claim.type
             continue
           if not wd_claim.target_equals(target):
             changed += 1
     reader.close()
   print skipped, "skipped,", updated, "updated,", deleted, "deleted,", \
     changed, "changed,", errors, "error records in file"
   print "Done processing last file"
Exemplo n.º 7
0
    def match_type(self, store, prop, existing, proposed):
        """Classify `proposed` against the `existing` values of `prop`.

        All values are first resolved through self.kb. With no existing
        values the result is NEW. Otherwise every existing value is compared
        with the proposed one and the strongest relation found wins, in the
        order EXACT, SUBSUMES_EXISTING, SUBSUMED_BY_EXISTING. If none holds,
        the result is CONFLICT for properties in self.unique_properties and
        ADDITIONAL for all others.
        """
        resolved = [self.kb.resolve(value) for value in existing]
        candidate = self.kb.resolve(proposed)

        if len(resolved) == 0:
            return FactMatchType.NEW

        is_exact = False
        covers_existing = False
        covered_by_existing = False

        if self._date_valued(prop):
            # Existing dates may be int or string and would not compare equal
            # to a sling.Frame, so go through sling.Date for both sides.
            known_dates = [sling.Date(value) for value in resolved]
            candidate_date = sling.Date(candidate)
            for known in known_dates:
                is_exact |= known.value() == candidate_date.value()
                covers_existing |= self._finer_date(known, candidate_date)
                covered_by_existing |= self._finer_date(candidate_date, known)
        else:
            closure = self.closure_properties.get(prop, None)
            for known in resolved:
                is_exact |= known == candidate
                if not isinstance(known, sling.Frame):
                    continue
                covers_existing |= self.subsumes(store, closure, candidate,
                                                 known)
                covered_by_existing |= self.subsumes(store, closure, known,
                                                     candidate)

        # Strongest relation first.
        ranking = (
            (is_exact, FactMatchType.EXACT),
            (covers_existing, FactMatchType.SUBSUMES_EXISTING),
            (covered_by_existing, FactMatchType.SUBSUMED_BY_EXISTING),
        )
        for holds, outcome in ranking:
            if holds:
                return outcome

        if prop in self.unique_properties:
            return FactMatchType.CONFLICT
        return FactMatchType.ADDITIONAL
Exemplo n.º 8
0
    def find_inceptions(self, inc_cats):
        self.out_file = "data/e/wikibot/inc-dates.rec"
        record_file = sling.RecordWriter(self.out_file)
        records = 0
        store = sling.Store(self.kb)
        types = {}

        for item in self.kb:
            if self.wikimedia_category in item(self.instanceof): continue
            if self.human in item(self.instanceof): continue
            if not self.is_org(item): continue
            name = item.name
            if name is not None and name.startswith("Category:"): continue
            if item[self.inception] is not None: continue
            cat_dates = []
            # Collect all the item's inception categories in cat_dates
            for cat in item(self.item_category):
                cat_inc_date = inc_cats.get(cat)
                if cat_inc_date is None: continue
                cat_dates.append((cat, cat_inc_date))
            if not cat_dates:
                continue  # no inception categories found for item
            msd = self.most_specific_date(cat_dates)
            if msd is None: continue
            (inc_cat, inc_date) = msd
            records += 1

            facts = store.frame({self.inception: sling.Date(inc_date).value()})
            provenance = store.frame({
                self.category:
                inc_cat,
                self.method:
                "Member of an inception category, '" + inc_cat.name + "'"
            })
            fact = store.frame({
                self.item: item,
                self.facts: facts,
                self.provenance: provenance
            })
            record_file.write(item.id, fact.data(binary=True))

        record_file.close()
        print records, "inception date records written to file:", self.out_file
        print self.conflicts, "conflicts encountered"
Exemplo n.º 9
0
    def find_births(self, birth_cats):
        self.out_file = "local/data/e/wikibot/birth-dates.rec"
        record_file = sling.RecordWriter(self.out_file)
        records = 0

        for item in self.kb:
            if self.human not in item(self.instanceof): continue
            if item[self.date_of_birth] is not None: continue
            cat_dates = []
            # Collect all the item's birth categories in cat_dates
            for cat in item(self.item_category):
                cat_birth_date = birth_cats.get(cat)
                if cat_birth_date is None: continue
                cat_dates.append((cat, cat_birth_date))
            if not cat_dates: continue  # no birth categories found for item
            msd = self.most_specific_date(cat_dates)
            if msd is None: continue
            (birth_cat, birth_date) = msd
            records += 1
            store = sling.Store(self.kb)
            facts = store.frame({
                self.date_of_birth:
                self.calendar.value(sling.Date(birth_date))
            })
            provenance = store.frame({
                self.category:
                birth_cat,
                self.method:
                "Member of a birth category, '" + birth_cat.name + "'"
            })
            fact = store.frame({
                self.item: item,
                self.facts: facts,
                self.provenance: provenance
            })
            record_file.write(item.id, fact.data(binary=True))

        record_file.close()
        print records, "birth date records written to file:", self.out_file
        print self.conflicts, "conflicts encountered"
Exemplo n.º 10
0
 def same_year(self, year, dates):
     """Tell whether any truthy entry of `dates` has the given `year`.

     Returns False when `dates` is None. Falsy entries are ignored; every
     other entry is parsed with sling.Date and its year compared to `year`.
     """
     if dates is None:
         return False
     return any(
         entry and year == sling.Date(entry).year for entry in dates)
Exemplo n.º 11
0
  relationship = rec[x_relationship]
  start = relationship[x_start_node]
  end = relationship[x_end_node]
  start_lei = start[x_node_id]
  end_lei = end[x_node_id]
  reltype = relationship[x_relationship_type]
  starttime = None
  endtime = None

  periods = relationship[x_relationship_periods]
  if periods != None:
    for period in periods(x_relationship_period):
      if period[x_period_type] == "RELATIONSHIP_PERIOD":
        period_start = period[x_start_date]
        period_end = period[x_end_date]
        if period_start: starttime = sling.Date(period_start).value()
        if period_end: endtime = sling.Date(period_end).value()

  # Determine relationship type.
  if reltype == "IS_ULTIMATELY_CONSOLIDATED_BY":
    parent_rel = n_owned_by
    child_rel = n_owner_of
  elif reltype == "IS_DIRECTLY_CONSOLIDATED_BY":
    parent_rel = n_parent
    child_rel = n_subsidiary
  else:
    continue

  # Get related organizations.
  subsidiary = store["P1278/" + start_lei]
  if subsidiary.isglobal():
Exemplo n.º 12
0
 def store_records(self, records, batch_size=3):
     """Upload the facts of each record to Wikidata as claims.

     `records` yields (key bytes, record) pairs. Each record is parsed with
     self.rs and is expected to hold an item, its facts and a provenance
     frame. Records numbered below flags.arg.first are skipped and
     processing stops after flags.arg.last.

     For properties in self.uniq_prop:
       - "time" claims may replace a single existing, less precise date of
         the same year (and month, if present) whose sources pass
         self.all_WP; any other existing date causes a skip.
       - "wikibase-item" claims are only added when the property is absent.
     For other properties the claim is added unless an equal target is
     already present.

     A fact is only written when the provenance names a category or a URL.
     The claim being replaced is removed only after that check, so an item
     never loses a date without receiving the new one.

     `batch_size` is compared against the number of stored claims at the
     start of each record, so one record with several facts can exceed it.
     Outcomes are reported via self.log_status_skip / log_status_stored.
     """
     updated = 0
     recno = 0
     for item_bytes, record in records:
         item_str = item_bytes.decode()
         recno += 1
         if recno < flags.arg.first:
             print("Skipping record number", recno)
             continue
         if recno > flags.arg.last: break
         if updated >= batch_size:
             print("Hit batch size of", batch_size)
             break
         print("Processing https://www.wikidata.org/wiki/" + item_str)
         fact_record = self.rs.parse(record)
         item = fact_record[self.n_item]
         facts = fact_record[self.n_facts]
         provenance = fact_record[self.n_provenance]
         # The record key must name the same item as the record body.
         if self.rs[item_str] != item:
             self.log_status_skip(item, facts, "inconsistent input")
             continue  # read next record in the file
         wd_item = pywikibot.ItemPage(self.repo, item_str)
         if not wd_item.exists():
             self.log_status_skip(item, facts, "page does not exist")
             continue
         if wd_item.isRedirectPage():
             self.log_status_skip(item, facts, "redirect page")
             continue
         try:
             wd_item.get()
             wd_claims = wd_item.claims
         except Exception:
             # Best effort: log and move on, but let KeyboardInterrupt and
             # SystemExit propagate so the run can still be aborted.
             self.log_status_skip(item, facts, "exception getting claims")
             continue
         # Process facts / claims
         for prop, val in facts:
             prop_str = str(prop)
             fact = self.rs.frame({prop: val})
             claim = pywikibot.Claim(self.repo, prop_str)
             # Existing claims to replace; removed only right before the new
             # claim is added.
             stale_claims = None
             if prop in self.uniq_prop:
                 if prop_str not in wd_claims:
                     if self.ever_had_prop(wd_item, prop_str):
                         self.log_status_skip(item, fact,
                                              "already had property")
                         continue
                 if claim.type == "time":
                     date = sling.Date(val)  # parse date from val
                     target = self.get_wbtime(date)
                     if target is None:
                         self.log_status_skip(item, facts,
                                              "date precision exception")
                         continue
                     if prop_str in wd_claims:
                         if len(wd_claims[prop_str]
                                ) > 1:  # more than one property already
                             self.log_status_skip(
                                 item, fact, "has property more than once")
                             continue
                         old = wd_claims[prop_str][0].getTarget()
                         if old is not None:
                             if old.precision >= target.precision:
                                 err_str = "precise date already exists"
                                 self.log_status_skip(item, fact, err_str)
                                 continue
                             if old.year != date.year:
                                 self.log_status_skip(
                                     item, fact, "conflicting year in date")
                                 continue
                             if old.precision >= pywikibot.WbTime.PRECISION['month'] and \
                                old.month != date.month:
                                 self.log_status_skip(
                                     item, fact,
                                     "conflicting month in date")
                                 continue
                             # Item already has property with a same year less precise date.
                             # Ensure sources are all WP or empty
                             if not self.all_WP(
                                     wd_claims[prop_str][0].getSources()):
                                 self.log_status_skip(
                                     item, fact,
                                     "date with non-WP source(s)")
                                 continue
                         stale_claims = wd_claims[prop_str]
                 elif claim.type == 'wikibase-item':
                     if prop_str in wd_claims:
                         self.log_status_skip(item, fact,
                                              "already has property")
                         continue
                     target = pywikibot.ItemPage(self.repo, val)
                 else:
                     # TODO add location and possibly other types
                     print("Error: Unknown claim type", claim.type)
                     continue
             else:  # property not unique
                 if claim.type == 'wikibase-item':
                     target = pywikibot.ItemPage(self.repo, val.id)
                 elif claim.type == "time":
                     target = self.get_wbtime(val)
                     if target is None:
                         self.log_status_skip(item, facts,
                                              "date precision exception")
                         continue
                 else:
                     # TODO add location and possibly other types
                     print("Error: Unknown claim type", claim.type)
                     continue
                 if prop_str in wd_claims:
                     old_fact = False
                     for clm in wd_claims[prop_str]:
                         if clm.target_equals(target):
                             self.log_status_skip(item, fact,
                                                  "already has fact")
                             old_fact = True
                     if old_fact: continue
             if provenance[self.n_category]:
                 s = str(provenance[self.n_category])
                 sources = self.get_sources(item, s)
             elif provenance[self.n_url]:
                 s = str(provenance[self.n_url])
                 sources = self.get_wp_sources()
             else:
                 # No usable provenance: write nothing and remove nothing.
                 continue
             summary = provenance[self.n_method] + " " + s
             if stale_claims is not None:
                 wd_item.removeClaims(stale_claims)
             claim.setTarget(target)
             wd_item.addClaim(claim, summary=summary)
             # Revision id is read right after the claim was added.
             rev_id = str(wd_item.latest_revision_id)
             if len(sources) > 0:
                 claim.addSources(sources)
             self.log_status_stored(item, fact, rev_id)
             updated += 1
         print(item, recno)
     print("Last record:", recno, "Total:", updated, "records updated.")
Exemplo n.º 13
0
 def store_records(self, records, batch_size=3):
     updated = 0
     recno = 0
     for item_str, record in records:
         recno += 1
         if recno < flags.arg.first:
             print "Skipping record number", recno
             continue
         if recno > flags.arg.last: break
         if updated >= batch_size:
             print "Hit batch size of", batch_size
             break
         print "Processing https://www.wikidata.org/wiki/" + item_str
         fact_record = self.rs.parse(record)
         item = fact_record[self.n_item]
         facts = fact_record[self.n_facts]
         provenance = fact_record[self.n_provenance]
         if self.rs[item_str] != item:
             self.log_status_skip(item, facts, "inconsistent input")
             continue  # read next record in the file
         wd_item = pywikibot.ItemPage(self.repo, item_str)
         if not wd_item.exists():
             self.log_status_skip(item, facts, "page does not exist")
             continue
         if wd_item.isRedirectPage():
             self.log_status_skip(item, facts, "redirect page")
             continue
         try:
             wd_item.get()
             wd_claims = wd_item.claims
         except:
             self.log_status_skip(item, facts, "exception getting claims")
             continue
         # Process facts / claims
         for prop, val in facts:
             prop_str = str(prop)
             fact = self.rs.frame({prop: val})
             if prop_str not in wd_claims and self.ever_had_prop(
                     wd_item, prop_str):
                 self.log_status_skip(item, fact, "already had property")
                 continue
             claim = pywikibot.Claim(self.repo, prop_str)
             if claim.type == "time":
                 date = sling.Date(val)  # parse date from record
                 precision = precision_map[
                     date.precision]  # sling to wikidata
                 if date.precision <= sling.YEAR:
                     target = pywikibot.WbTime(year=date.year,
                                               precision=precision)
                 elif date.precision == sling.MONTH:
                     target = pywikibot.WbTime(year=date.year,
                                               month=date.month,
                                               precision=precision)
                 elif date.precision == sling.DAY:
                     target = pywikibot.WbTime(year=date.year,
                                               month=date.month,
                                               day=date.day,
                                               precision=precision)
                 else:
                     self.log_status_skip(item, facts,
                                          "date precision exception")
                     continue
                 if prop_str in wd_claims:
                     if len(wd_claims[prop_str]
                            ) > 1:  # more than one property already
                         self.log_status_skip(
                             item, fact, "has property more than once")
                         continue
                     old = wd_claims[prop_str][0].getTarget()
                     if old is not None:
                         if old.precision >= precision:
                             self.log_status_skip(
                                 item, fact, "precise date already exists")
                             continue
                         if old.year != date.year:
                             self.log_status_skip(
                                 item, fact, "conflicting year in date")
                             continue
                         if old.precision >= pywikibot.WbTime.PRECISION['month'] and \
                            old.month != date.month:
                             self.log_status_skip(
                                 item, fact, "conflicting month in date")
                             continue
                         # item already has property with a same year less precise date
                         # check that sources are all WP or empty
                         if not self.all_WP(
                                 wd_claims[prop_str][0].getSources()):
                             self.log_status_skip(
                                 item, fact, "date with non-WP source(s)")
                             continue
                     wd_item.removeClaims(wd_claims[prop_str])
             elif claim.type == 'wikibase-item':
                 if prop_str in wd_claims:
                     self.log_status_skip(item, fact,
                                          "already has property")
                     continue
                 target = pywikibot.ItemPage(self.repo, val)
             else:
                 # TODO add location and possibly other types
                 print "Error: Unknown claim type", claim.type
                 continue
             if provenance[self.n_category]:
                 s = str(provenance[self.n_category])
                 sources = self.get_sources(s)
             elif provenance[self.n_url]:
                 s = str(provenance[self.n_url])
                 sources = self.get_wp_sources()
             else:
                 continue
             summary = provenance[self.n_method] + " " + s
             claim.setTarget(target)
             wd_item.addClaim(claim, summary=summary)
             rev_id = str(wd_item.latest_revision_id)
             if len(sources) > 0: claim.addSources(sources)
             self.log_status_stored(item, fact, rev_id)
             updated += 1
         print item, recno
     print "Last record:", recno, "Total:", updated, "records updated."
Exemplo n.º 14
0
 def process_log_data(self, files):
   no_of_files = len(files)
   file_no = 0
   rs = sling.Store(self.store)
   skipped = 0
   updated = 0
   errors = 0
   deleted = 0
   changed = 0
   redirected = 0
   updates = {}
   for r_file in files:
     file_no += 1
     print "Processing file {:4d} of {} ({})".format(file_no,
                                                     no_of_files,
                                                     r_file)
     print r_file
     reader = sling.RecordReader(r_file)
     last_updated = updated
     for item_str, record in reader:
       rec = rs.parse(record)
       status = rec[self.n_status]
       if self.n_skipped in status:
         skipped += 1
         continue
       elif self.n_revision not in status:
         print "ERROR - unknown status"
         errors += 1
         continue
       updated += 1
       wd_item = pywikibot.ItemPage(self.repo, item_str)
       if wd_item.isRedirectPage():
         redirected += 1
         continue
       wd_claims = wd_item.get().get('claims')
       facts = rec[self.n_facts]
       for prop, val in facts:
         p_claims =  wd_claims.get(str(prop), [])
         if not p_claims:
           deleted += 1
           continue
         for wd_claim in p_claims:
           if wd_claim.type == "time":
             date = sling.Date(val) # parse date from record
             precision = precision_map[date.precision] # sling to wikidata
             target = pywikibot.WbTime(year=date.year, precision=precision)
           elif wd_claim.type == 'wikibase-item':
             target = pywikibot.ItemPage(self.repo, val)
           else:
             # TODO add location and possibly other types
             print "Error: Unknown claim type", claim.type
             continue
           if not wd_claim.target_equals(target):
             print item_str, target, wd_claim.target
             changed += 1
     reader.close()
     print updated - last_updated
     f = r_file.split("-")
     date = int(f[1] + f[2] + f[3])
     if date not in updates: updates[date] = 0
     updates[date] += (updated - last_updated)
   print skipped, "skipped,", updated, "updated,", deleted, "deleted,", \
     changed, "changed,", errors, "error records in file"
   print "Done processing last file"
   # Print number of accumulated updates over time
   first = min(updates)
   acc_upd = 0
   d = datetime.date(first / 10000, (first % 10000) / 100, first % 100)
   while d <= datetime.date.today():
     num = d.year * 10000 + d.month * 100 + d.day
     if num in updates: acc_upd += updates[num]
     print d.strftime("%Y-%m-%d") + "," + str(acc_upd)
     d += datetime.timedelta(days = 1)