def __init__(self):
  self.site = pywikibot.Site("wikidata", "wikidata")
  self.repo = self.site.data_repository()
  time_str = datetime.datetime.now().isoformat("-")[:19].replace(":", "-")
  if flags.arg.test:
    record_file_name = "local/data/e/wikibot/test-birth-dates.rec"
    time_str = "test-" + time_str
  else:
    record_file_name = "local/data/e/wikibot/birth-dates.rec"
  status_file_name = "local/logs/wikibotlog-" + time_str + ".rec"
  self.record_file = sling.RecordReader(record_file_name)
  self.status_file = sling.RecordWriter(status_file_name)

  self.store = sling.Store()
  self.n_item = self.store["item"]
  self.n_facts = self.store["facts"]
  self.n_provenance = self.store["provenance"]
  self.n_category = self.store["category"]
  self.n_method = self.store["method"]
  self.n_status = self.store["status"]
  self.n_revision = self.store["revision"]
  self.n_url = self.store["url"]
  self.n_skipped = self.store["skipped"]
  self.store.freeze()
  self.rs = sling.Store(self.store)

  self.source_claim = pywikibot.Claim(self.repo, "P3452")  # inferred from
  self.time_claim = pywikibot.Claim(self.repo, "P813")     # referenced (on)
  today = datetime.date.today()
  time_target = pywikibot.WbTime(year=today.year,
                                 month=today.month,
                                 day=today.day)
  self.time_claim.setTarget(time_target)

def train(args):
  check_present(args, ["train_corpus", "output_folder",
                       "dev_corpus", "train_shuffle_seed"])

  train_corpus_path = args.train_corpus
  if args.train_shuffle_seed > 0:
    reader = sling.RecordReader(args.train_corpus)
    items = [(key, value) for key, value in reader]
    reader.close()
    r = random.Random(args.train_shuffle_seed)
    r.shuffle(items)
    train_corpus_path = os.path.join(args.output_folder, "train_shuffled.rec")
    writer = sling.RecordWriter(train_corpus_path)
    for key, value in items:
      writer.write(key, value)
    writer.close()
    print("Wrote shuffled train corpus to %s using seed %d" %
          (train_corpus_path, args.train_shuffle_seed))

  # Setting an explicit seed for the sake of determinism.
  torch.manual_seed(1)

  # Make commons store if needed.
  if args.commons == '' or not os.path.exists(args.commons):
    if args.commons == '':
      fname = os.path.join(args.output_folder, "commons")
      print("Will create a commons store at", fname)
      args.commons = fname
    else:
      print("No commons found at", args.commons, ", creating it...")
    _, symbols = commons_builder.build(
        [train_corpus_path, args.dev_corpus], args.commons)
    print("Commons created at", args.commons, "with", len(symbols),
          "symbols besides the usual ones.")

  # Make the training spec.
  spec = Spec()
  spec.build(args.commons, train_corpus_path)

  # Initialize the model with the spec and any word embeddings.
  caspar = Caspar(spec)
  embeddings_file = args.word_embeddings
  if embeddings_file == '':
    embeddings_file = None
  caspar.initialize(embeddings_file)

  tmp_folder = os.path.join(args.output_folder, "tmp")
  if not os.path.exists(tmp_folder):
    os.makedirs(tmp_folder)

  evaluator = partial(dev_accuracy, args.dev_corpus, tmp_folder)
  output_file_prefix = os.path.join(args.output_folder, "caspar")
  hyperparams = Hyperparams(args)
  print("Using hyperparameters:", hyperparams)

  trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix)
  train = Corpora(train_corpus_path, spec.commons, gold=True)
  trainer.train(train)

def init(self, parses_filename, output_dir):
  self.output_dir = output_dir
  reader = sling.RecordReader(parses_filename)
  self.category_name_to_qid = {}                 # category name -> qid
  self.category_frame = {}                       # category qid -> frame
  self.category_parses = {}                      # category qid -> parses
  self.signature_to_parse = defaultdict(list)    # signature -> parse
  self.store = sling.Store()
  self.num_parses = 0
  for index, (qid, value) in enumerate(reader):
    if (index + 1) % 20000 == 0:
      log.info("%d categories read" % index)
    qid = qid.decode('utf-8')
    frame = self.store.parse(value)
    self.category_name_to_qid[frame.name] = qid
    self.category_frame[qid] = frame
    self.category_parses[qid] = []
    for parse in frame("parse"):
      element = Parse(self.num_parses, qid, frame, parse)
      signature = util.full_parse_signature(parse)
      self.signature_to_parse[signature].append(element)
      self.category_parses[qid].append(element)
      self.num_parses += 1
  self.store.lockgc()
  self.store.freeze()
  self.store.unlockgc()

def fetch_aliases(self, alias_file_patterns):
  print("Pre-fetching all raw aliases...")
  all_aliases = {}
  for ii in range(10):
    fname = alias_file_patterns % ii
    print("reading from %s..." % fname)
    db = sling.RecordReader(fname)
    for aid, als in db:
      all_aliases[aid.decode("utf-8", errors="ignore")] = als
  return all_aliases

def process_log_data(self, files):
  no_of_files = len(files)
  file_no = 0
  rs = sling.Store(self.store)
  skipped = 0
  updated = 0
  errors = 0
  deleted = 0
  changed = 0
  for r_file in files:
    file_no += 1
    print "Processing file {:4d} of {} ({})".format(file_no, no_of_files, r_file)
    reader = sling.RecordReader(r_file)
    for item_str, record in reader:
      rec = rs.parse(record)
      status = rec[self.n_status]
      if self.n_skipped in status:
        skipped += 1
        continue
      elif self.n_revision not in status:
        print "ERROR - unknown status"
        errors += 1
        continue
      updated += 1
      wd_item = pywikibot.ItemPage(self.repo, item_str)
      wd_claims = wd_item.get().get('claims')
      facts = rec[self.n_facts]
      for prop, val in facts:
        p_claims = wd_claims.get(str(prop), [])
        if not p_claims:
          deleted += 1
          continue
        for wd_claim in p_claims:
          if wd_claim.type == "time":
            date = sling.Date(val)                      # parse date from record
            precision = precision_map[date.precision]   # sling to wikidata
            target = pywikibot.WbTime(year=date.year, precision=precision)
          elif wd_claim.type == 'wikibase-item':
            target = pywikibot.ItemPage(self.repo, val)
          else:
            # TODO add location and possibly other types
            print "Error: Unknown claim type", wd_claim.type
            continue
          if not wd_claim.target_equals(target):
            changed += 1
    reader.close()
    print skipped, "skipped,", updated, "updated,", deleted, "deleted,", \
      changed, "changed,", errors, "error records in file"
  print "Done processing last file"

def run(self, task):
  self.init(task)

  max_parses = int(task.param("max_parses"))
  reader = sling.RecordReader(task.input("input").name)
  writer = sling.RecordWriter(task.output("output").name)
  for index, (key, value) in enumerate(reader):
    store = sling.Store(self.kb)
    category = store.parse(value)
    document = sling.Document(category.document)

    # Score each parse.
    parse_with_score = self.score(category)

    # Keep only the top-k parses.
    ranked_parses = sorted(parse_with_score, key=lambda x: -x[1])
    if len(ranked_parses) > max_parses:
      dropped = len(ranked_parses) - max_parses
      ranked_parses = ranked_parses[0:max_parses]
      task.increment("parses-dropped", dropped)
      task.increment("categories-with-too-many-parses")

    # Compute signature for each parse and store it in the parse.
    for parse, _ in ranked_parses:
      tokens, span_signature = self.signature(document, parse)
      parse["signature"] = tokens
      for span in parse.spans:
        if span in span_signature:
          span["signature"] = span_signature[span]

      # Also compute the coarse signature.
      tokens, span_signature = self.signature(document, parse, coarse=True)
      parse["coarse_signature"] = tokens
      for span in parse.spans:
        if span in span_signature:
          span["coarse_signature"] = span_signature[span]

    # Replace the current set of parses with the ranked list.
    del category["parse"]
    for parse, _ in ranked_parses:
      category.append("parse", parse)
    task.increment("parses-kept", len(ranked_parses))
    writer.write(key, category.data(binary=True))
  reader.close()
  writer.close()

def extract_entity_mentions(nq_data, labelled_record):
  """Parse output corpus and create a map from tokens to entity ids.

  Args:
    nq_data: A python dictionary containing NQ data of 1 train/dev shard
    labelled_record: Sling output document with labelled paragraphs

  Returns:
    nq_data: Original object augmented with entity maps
  """
  recin = sling.RecordReader(labelled_record)
  commons = sling.Store()
  docschema = sling.DocumentSchema(commons)
  commons.freeze()
  cnt = 1
  for key, value in recin:
    store = sling.Store(commons)
    doc = sling.Document(store.parse(value), store, docschema)
    index, ans_type, idx, ans_id = key.decode("utf-8").split("|")
    cnt += 1
    entity_map = {}
    # Parse entity mentions labelled by sling
    for m in doc.mentions:
      e = [i["is"] for i in m.evokes()]
      if not e:
        continue
      if is_sling_entity(e):
        e_val = e[0]["id"]
        if m.begin in entity_map:
          entity_map[m.begin].append((m.end, e_val))
        else:
          entity_map[m.begin] = [(m.end, e_val)]
    if ans_type == "annotated_long_answer":
      nq_data[index]["annotations"][int(
          idx)]["long_answer"]["entity_map"] = entity_map
    elif ans_type == "question":
      nq_data[index]["question_entity_map"] = entity_map
    elif ans_type == "annotated_short_answer":
      nq_data[index]["annotations"][int(idx)]["short_answers"][int(
          ans_id)]["entity_map"] = entity_map
    else:
      nq_data[index]["long_answer_candidates"][int(
          idx)]["entity_map"] = entity_map
  return nq_data

def create_train_dev_split(input_file: str, train_file: str, dev_file: str,
                           ratio: float):
  reader = sling.RecordReader(input_file)
  frames: List[Tuple[str, str]] = []
  for key, value in reader:
    frames.append((key, value))
  shuffle(frames)
  total_count = len(frames)
  train_count = int(round((1 - ratio) * total_count))
  files = {'train': train_file, 'dev': dev_file}
  result_frames = {'train': frames[:train_count], 'dev': frames[train_count:]}
  for split in result_frames:
    writer = sling.RecordWriter(files[split])
    for (key, value) in result_frames[split]:
      writer.write(key, value)
    writer.close()

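# Usage sketch for the split helper above. The file paths are hypothetical;
# as written, `ratio` is the fraction of records held out for the dev split.
create_train_dev_split("local/data/annotated.rec",
                       "local/data/train.rec",
                       "local/data/dev.rec",
                       ratio=0.1)
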
def build(recordio_filenames, output_filename, text=False):
  commons = sling.Store()
  schema = sling.DocumentSchema(commons)
  commons.freeze()

  symbol_names = {}
  symbol_names["thing"] = 1

  # Adds handle's id to 'symbol_names' if it is not already in 'commons'.
  def add(handle):
    if type(handle) is not sling.Frame or handle.id is None:
      return
    id_str = str(handle.id)
    if commons[id_str] is not None:
      return
    if id_str not in symbol_names:
      symbol_names[id_str] = 0
    symbol_names[id_str] += 1

  for filename in recordio_filenames:
    reader = sling.RecordReader(filename)
    for key, value in reader:
      store = sling.Store(commons)
      document = sling.Document(store.parse(value), schema=schema)
      for mention in document.mentions:
        for frame in mention.evokes():
          for slot_role, slot_value in frame:
            add(slot_role)
            add(slot_value)
      for theme in document.themes:
        for slot_role, slot_value in theme:
          add(slot_role)
          add(slot_value)

  output = sling.Store()
  schema = sling.DocumentSchema(output)
  for name, count in symbol_names.iteritems():
    output.frame({"id": name})
  output.freeze()
  output.save(output_filename, binary=not text)
  return output, symbol_names

def read(self, parses_filename):
  reader = sling.RecordReader(parses_filename)
  self.category_name_to_qid = {}                       # category name -> qid
  self.category_frame = {}                             # category qid -> frame
  self.full_signature_to_parse = defaultdict(list)     # signature -> parse
  self.coarse_signature_to_parse = defaultdict(list)   # signature -> parse
  store = sling.Store()
  for index, (qid, value) in enumerate(reader):
    if index > 0 and index % 20000 == 0:
      log.info("%d categories read" % index)
    frame = store.parse(value)
    self.category_name_to_qid[frame.name] = qid
    self.category_frame[qid] = frame
    for parse in frame("parse"):
      element = (qid, frame, parse)
      full_signature = util.full_parse_signature(parse)
      self.full_signature_to_parse[full_signature].append(element)
      coarse_signature = util.coarse_parse_signature(parse)
      self.coarse_signature_to_parse[coarse_signature].append(element)

def run(self, task):
  self.init(task)

  reader = sling.RecordReader(task.input("parses").name)
  writer = sling.RecordWriter(task.output("output").name)
  for key, value in reader:
    store = sling.Store(self.kb)
    category = store.parse(value)
    matches = self.matcher.for_parses(category, store, max_evidences=-1)
    frame_cache = {}  # (pid, qid) -> frame containing their match statistics
    for parse, parse_match in zip(category("parse"), matches):
      for span, span_match in zip(parse.spans, parse_match):
        span_key = (span.pids, span.qid)
        if span_key not in frame_cache:
          match_frame = span_match.as_frame(store)
          frame_cache[span_key] = match_frame
        span["fact_matches"] = frame_cache[span_key]
    writer.write(key, category.data(binary=True))
    task.increment("fact-matcher/categories-processed")
  reader.close()
  writer.close()

def load(
    record: str,
    load_tokens: bool = True,
    load_mentions: bool = True
) -> Iterable[Tuple[sling.nlp.document.Document, Tuple[int, str, str]]]:
  """Load documents from a .rec file.

  Warning: this may take a good amount of RAM (each *.rec file is 5.3GB).
  """
  for k, rec in sling.RecordReader(record):
    store = sling.Store(commons)
    # parse record into frame
    doc_frame = store.parse(rec)
    # instantiate a document
    #parsed_doc = sling.Document(doc_frame, store, DOCSCHEMA)
    parsed_doc = MyDocument(doc_frame, store, DOCSCHEMA,
                            load_tokens=load_tokens,
                            load_mentions=load_mentions)
    metadata = get_metadata(doc_frame)
    yield parsed_doc, metadata

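# Usage sketch for the generator above. The record file name is hypothetical,
# and the shape of `metadata` is whatever get_metadata() returns per document.
for doc, metadata in load("documents.rec", load_mentions=False):
  # process one parsed document at a time to keep memory bounded
  print(metadata)
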
def read_corpus(file_pattern):
  docs = []
  if file_pattern.endswith(".zip"):
    with gfile.GFile(file_pattern, 'r') as f:
      buf = io.BytesIO(f.read())
      with zipfile.ZipFile(buf, 'r') as zipreader:
        docs = [None] * len(zipreader.namelist())
        for index, fname in enumerate(zipreader.namelist()):
          docs[index] = zipreader.read(fname)
  elif file_pattern.endswith(".rec"):
    reader = sling.RecordReader(file_pattern)
    for _, value in reader:
      docs.append(value)
    reader.close()
  else:
    filenames = gfile.Glob(file_pattern)
    docs = [None] * len(filenames)
    for index, name in enumerate(filenames):
      with gfile.GFile(name, 'r') as f:
        docs[index] = f.read()
  print len(docs), "files in", file_pattern
  return docs

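# Usage sketch (hypothetical path): read_corpus dispatches on the suffix, so
# zip archives, SLING .rec files and plain-file globs are all supported.
docs = read_corpus("local/data/documents.rec")
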
def __init__(self, recordio, commons, schema=None, gold=False, loop=False):
  self.filename = recordio
  self.commons_owned = False
  if isinstance(commons, str):
    self.commons = sling.Store()
    self.commons.load(commons)
    self.commons_owned = True
  else:
    assert isinstance(commons, sling.Store)
    self.commons = commons
  if schema is None or self.commons_owned:
    schema = sling.DocumentSchema(self.commons)
    if self.commons_owned:
      self.commons.freeze()
  assert schema is not None
  self.schema = schema
  self.reader = sling.RecordReader(recordio)
  self.loop = loop
  self.generator = None
  self.set_gold(gold)

def compare(arg):
  base_reader = sling.RecordReader(arg.base)
  expt_reader = sling.RecordReader(arg.expt)

  commons = sling.Store()
  commons.load(arg.commons)
  schema = sling.DocumentSchema(commons)
  commons.freeze()

  store = sling.Store(commons)
  index = -1
  for (_, base_val), (_, expt_val) in zip(base_reader, expt_reader):
    index += 1
    base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
    expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)
    checker = Checker(index, base_doc, expt_doc, arg.diff)

    # Basic checks.
    base = base_doc.frame["trace"]
    expt = expt_doc.frame["trace"]
    if base is None and expt is not None:
      checker.error('No trace in base document at index %d' % index)
    elif base is not None and expt is None:
      checker.error('No trace in expt document at index %d' % index)
    if base is None or expt is None:
      continue

    # Traces should be over the same token range.
    checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
    checker.check_eq(base["end"], expt["end"], "Trace End")

    # Check LSTM features.
    base_lstm = base["/trace/lstm_features"]
    expt_lstm = expt["/trace/lstm_features"]
    checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
    for i in range(len(base_lstm)):
      checker.frame_eq(base_lstm[i], expt_lstm[i], \
        "LSTM features for token %d (%s)" % (i, base_doc.tokens[i].word))

    # Check steps.
    base_steps = base["/trace/steps"]
    expt_steps = expt["/trace/steps"]
    min_steps = min(len(base_steps), len(expt_steps))
    for i in range(min_steps):
      message = "Step %d's current token index" % i
      checker.check_eq(base_steps[i]["/trace/current"], \
                       expt_steps[i]["/trace/current"], message)

      # Check FF features for the step.
      base_ff = base_steps[i]["/trace/ff_features"]
      expt_ff = expt_steps[i]["/trace/ff_features"]
      checker.check_eq(len(base_ff), len(expt_ff), \
                       "# of FF features for step %d" % i)
      base_dict = {f["/trace/feature"]: f["/trace/values"] for f in base_ff}
      expt_dict = {f["/trace/feature"]: f["/trace/values"] for f in expt_ff}
      for k, v in base_dict.items():
        checker.check_eq(k in expt_dict, True, \
          "Step %d: FF feature %s not in expt" % (i, k))
        checker.check_eq(v, expt_dict[k], \
          "Step %d: FF feature %s has a different value in expt" % (i, k))
      for k, v in expt_dict.items():
        checker.check_eq(k in base_dict, True, \
          "Step %d: FF feature %s not in base" % (i, k))

      # Check action(s) in the step.
      base_actions = base_steps[i]["/trace/actions"]
      expt_actions = expt_steps[i]["/trace/actions"]
      for idx in range(min(len(base_actions), len(expt_actions))):
        checker.frame_eq(base_actions[idx]["/trace/predicted"], \
                         expt_actions[idx]["/trace/predicted"],
                         "Step %d, predicted action %d" % (i, idx),
                         ["/trace/_str"])
        checker.frame_eq(base_actions[idx]["/trace/final"], \
                         expt_actions[idx]["/trace/final"],
                         "Step %d, final action %d" % (i, idx),
                         ["/trace/_str"])

      # There should be the same number of actions in the step.
      checker.check_eq(len(base_actions), len(expt_actions), \
                       "Step %d: # of actions" % i)

    # There should be the same number of steps.
    checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

  base_reader.close()
  expt_reader.close()

def run(self, task):
  # Get parameters.
  language = task.param("language")

  # Load knowledge base.
  log.info("Load knowledge base")
  kb = sling.Store()
  kb.load(task.input("kb").name)

  n_infobox = kb["/wp/infobox"]
  n_page_item = kb["/wp/page/item"]
  n_file = kb["/wp/info/file"]
  n_media = kb["/wp/media"]

  image_fields = [
    (kb["/wp/info/image"], kb["/wp/info/caption"]),
    (kb["/wp/info/cover"], kb["/wp/info/caption"]),
    (kb["/wp/info/logo"], kb["/wp/info/logo_caption"]),
    (kb["/wp/info/photo"], kb["/wp/info/photo_caption"]),
    (kb["/wp/info/flag_image"], kb["/wp/info/flag_caption"]),
  ]

  p_media = kb["media"]
  p_id = kb["id"]
  p_is = kb["is"]
  p_imported_from = kb["P143"]
  p_media_legend = kb["P2096"]

  image_properties = [
    kb["P18"],   # image
    kb["P154"],  # logo image
    kb["P41"],   # flag image
  ]

  lang = kb["/lang/" + language]
  wikipedia_item = lang["/lang/wikilang/wikipedia"]

  docschema = sling.DocumentSchema(kb)

  kb.freeze()

  # Fetch media titles for Wikipedia from yesterday.
  log.info("Fetch local media titles")
  yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
  mediaurl = "https://dumps.wikimedia.org/other/mediatitles/%s/" \
             "%swiki-%s-all-media-titles.gz" % (yesterday, language, yesterday)
  r = urllib.request.urlopen(mediaurl)
  mediatitles = set(gzip.decompress(r.read()).decode().split('\n'))
  task.increment("local_media_files", len(mediatitles))

  # Open output file.
  fout = open(task.output("output").name, "w")

  # Process input articles.
  for res in task.inputs("input"):
    log.info("Extract media files from", res.name)
    for _, data in sling.RecordReader(res.name):
      # Read article into store.
      store = sling.Store(kb)
      doc = store.parse(data)
      task.increment("documents")

      # Find first infobox.
      infobox = None
      for theme in doc(docschema.document_theme):
        if theme.isa(n_infobox):
          infobox = theme
          break
      if infobox is None: continue
      task.increment("infoboxes")

      # Find images in infobox.
      imagelist = []
      for n_image, n_caption in image_fields:
        image = infobox[n_image]
        caption = infobox[n_caption]
        if image is None: continue

        # Get image for repeated image field.
        if type(image) is sling.Frame:
          group = image
          image = group[n_file]
          caption = group[n_caption]
          if image is None: continue

        if "{" in image or "[" in image:
          # Structured annotations.
          annotations = sling.lex(image, store=store, schema=docschema)
          for theme in annotations.themes:
            if theme.isa(n_media):
              image = theme[p_is]
              if image is not None:
                imagelist.append((image, None))
                task.increment("structured_annotations")
        else:
          # Image filename.
          imagelist.append((image, caption))
      if len(imagelist) == 0: continue

      # Process list of images for item.
      known_images = 0
      image_frames = []
      item = doc[n_page_item]
      if item is None: continue
      for image, caption in imagelist:
        # Disregard direct URLs for now.
        if image.startswith("http://") or \
           image.startswith("https://") or \
           image.startswith("//"):
          task.increment("url_images")
          continue

        # Trim image name. Remove File: prefix.
        colon = image.find(':')
        if colon > 0 and colon < 10: image = image[colon + 1:]
        image = titlecase(image.strip()).replace('_', ' ')
        if len(image) == 0 or image in default_images:
          task.increment("empty_images")
          continue
        # Remove trailing left-to-right mark entity.
        if image.endswith("&lrm;"): image = image[:-5]
        frag = image.find('#')
        if frag > 0: image = image[:frag]
        image = html.unescape(image)
        image = urllib.parse.unquote(image)

        # Discard media files with unknown or ignored extensions.
        dot = image.rfind('.')
        ext = image[dot:].lower() if dot > 0 else None
        if ext in ignored_extensions:
          task.increment("ignored_image_format")
          continue
        if ext not in known_extensions:
          log.info("unknown format:", item.id, image)
          task.increment("unknown_image_format")
          continue

        # Get item from KB and check if image is already known.
        task.increment("images")
        known = False
        for prop in image_properties:
          for img in item(prop):
            img = kb.resolve(img)
            if img == image:
              known = True
              known_images += 1
        if known:
          task.increment("known_images")
          continue
        task.increment("new_images")

        # Check if image is in local Wikipedia or Wikimedia Commons.
        fn = image.replace(' ', '_')
        if fn in mediatitles:
          urlbase = "https://upload.wikimedia.org/wikipedia/" + language
          task.increment("local_images")
        else:
          urlbase = "https://upload.wikimedia.org/wikipedia/commons"
          task.increment("commons_images")
          if known_images == 0: task.increment("commons_imaged_items")

        # Compute URL for image.
        md5 = md5hash(fn)
        fn = fn.replace("?", "%3F")
        fn = fn.replace("+", "%2B")
        fn = fn.replace("&", "%26")
        url = "%s/%s/%s/%s" % (urlbase, md5[0], md5[0:2], fn)

        # Create frame for item with media image.
        slots = [
          (p_is, url),
          (p_imported_from, wikipedia_item),
        ]
        if caption != None:
          capdoc = sling.lex(caption, store=store, schema=docschema)
          captxt = capdoc.phrase(0, len(capdoc.tokens))
          slots.append((p_media_legend, captxt))
        image_frames.append(store.frame(slots))

      # Create item frame with extra image info.
      if len(image_frames) == 0: continue
      slots = [(p_id, item.id)]
      for image_frame in image_frames:
        slots.append((p_media, image_frame))
      frame = store.frame(slots)
      fout.write(frame.data(utf8=True))
      fout.write("\n")
      if known_images == 0: task.increment("imaged_items")

  fout.close()

import random

import sling
import sling.flags as flags

flags.define('--input',
             help='input file with documents')

flags.define('--output',
             help='output for shuffled documents')

flags.define('--seed',
             help='seed for shuffling the corpus',
             default="314159",
             type=int,
             metavar='NUM')

if __name__ == '__main__':
  flags.parse()

  # Read input corpus.
  reader = sling.RecordReader(flags.arg.input)
  records = [(key, value) for key, value in reader]
  reader.close()

  # Shuffle documents.
  r = random.Random(flags.arg.seed)
  r.shuffle(records)

  # Write shuffled documents to output.
  writer = sling.RecordWriter(flags.arg.output)
  for key, value in records:
    writer.write(key, value)
  writer.close()

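# Example invocation, assuming the script above is saved as shuffle-corpus.py
# (script and record file names are hypothetical):
#
#   python shuffle-corpus.py --input documents.rec \
#       --output documents-shuffled.rec --seed 42
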
def run(self, task):
  self.init(task)

  writer = sling.RecordWriter(task.output("output").name)
  rejected = sling.RecordWriter(task.output("rejected").name)
  inputs = [t.name for t in task.inputs("items")]

  for filename in inputs:
    reader = sling.RecordReader(filename)
    for index, (key, value) in enumerate(reader):
      store = sling.Store(self.kb)
      frame = store.parse(value)

      # Only process category items.
      if not self.is_category(frame):
        rejected.write(key, "not_category")
        continue

      # See if the category should be skipped.
      members = self.get_members(frame)
      reject, reason = self.reject(key, frame, members)
      if reject:
        task.increment("skipped_categories/" + reason)
        rejected.write(key, reason)
        continue

      # First, collect the targets of all facts of all category members.
      qp_counts = self.qid_pid_counts(store, members)

      # Next, tokenize the category title.
      title = self.get_title(frame)
      colon = title.find(':')
      title = title[colon + 1:]
      document = sling.tokenize(title, store)

      # Next, find matches for all spans. These are reported as a list,
      # where ith item = spans that begin at token i (possibly an empty list).
      begin_to_spans = self.compute_spans(document, qp_counts)

      # Construct maximal parses with non-overlapping spans.
      parses = self.construct_parses(begin_to_spans)

      # Post-process parses.
      parses = self.post_process(parses)
      if len(parses) == 0 or (len(parses) == 1 and len(parses[0]) == 0):
        task.increment("skipped_categories/no_parses")
        rejected.write(key, "no_parses")
        continue

      # Write parses as frames.
      frame = store.frame({"name": title, "members": members})
      frame["document"] = document.frame
      for parse in parses:
        span_array = store.array(len(parse))
        for i, span in enumerate(parse):
          span_array[i] = store.frame({
            "begin": span.begin,
            "end": span.end,
            "qid": span.qid,
            "prior": span.prior,
            "pids": list(span.pids),
            "count": span.count
          })
        parse_frame = store.frame({"spans": span_array})
        frame.append("parse", parse_frame)
      writer.write(key, frame.data(binary=True))
      task.increment("categories_accepted")

      # Compute histogram over number of parses.
      for b in self.num_parses_bins:
        if len(parses) <= b:
          task.increment("#parses <= %d" % b)
      if self.num_parses_bins[-1] < len(parses):
        task.increment("#parses > %d" % self.num_parses_bins[-1])
    reader.close()
  writer.close()
  rejected.close()

def __init__(self):
  self.site = pywikibot.Site("wikidata", "wikidata")
  self.repo = self.site.data_repository()
  time_str = datetime.datetime.now().isoformat("-")[:19].replace(":", "-")
  if flags.arg.test:
    record_file_name = "local/data/e/wikibot/test-" + flags.arg.input + ".rec"
    time_str = "test-" + time_str
  else:
    record_file_name = "local/data/e/wikibot/" + flags.arg.input + ".rec"
  status_file_name = "local/logs/wikibotlog-" + time_str + ".rec"
  self.record_file = sling.RecordReader(record_file_name)
  self.status_file = sling.RecordWriter(status_file_name)

  self.store = sling.Store()
  self.store.lockgc()
  print("loading kb")
  self.store.load("local/data/e/wiki/kb.sling")
  print("kb loaded")
  self.page_cat = self.store["/wp/page/category"]
  self.date_of_birth = self.store['P569']
  self.date_of_death = self.store['P570']
  self.n_item = self.store["item"]
  self.n_facts = self.store["facts"]
  self.n_provenance = self.store["provenance"]
  self.n_category = self.store["category"]
  self.n_url = self.store["url"]
  self.n_method = self.store["method"]
  self.n_status = self.store["status"]
  self.n_revision = self.store["revision"]
  self.n_skipped = self.store["skipped"]
  self.store.freeze()
  self.rs = sling.Store(self.store)

  self.wiki = {'fr': 'Q8447', 'en': 'Q328', 'da': 'Q181163',
               'pt': 'Q11921', 'fi': 'Q175482', 'es': 'Q8449',
               'pl': 'Q1551807', 'de': 'Q48183', 'nl': 'Q10000',
               'sv': 'Q169514', 'it': 'Q11920', 'no': 'Q191769'}
  self.languages = self.wiki.keys()
  self.wiki_sources = {}
  for lang, wp in self.wiki.items():
    # P143 means 'imported from Wikimedia project'
    source_claim = pywikibot.Claim(self.repo, "P143")
    target = pywikibot.ItemPage(self.repo, wp)
    source_claim.setTarget(target)
    self.wiki_sources[lang] = source_claim

  self.record_db = {}
  fname = "local/data/e/wiki/{}/documents@10.rec"
  for lang in self.languages:
    self.record_db[lang] = sling.RecordDatabase(fname.format(lang))

  # inferred from
  self.source_claim = pywikibot.Claim(self.repo, "P3452")
  # Wikimedia import URL
  self.url_source_claim = pywikibot.Claim(self.repo, "P4656")
  # imported from Wikimedia project
  self.wp_source_claim = pywikibot.Claim(self.repo, "P143")
  self.en_wp = pywikibot.ItemPage(self.repo, "Q328")
  self.wp_source_claim.setTarget(self.en_wp)
  # referenced (on)
  self.time_claim = pywikibot.Claim(self.repo, "P813")
  today = datetime.date.today()
  time_target = pywikibot.WbTime(year=today.year,
                                 month=today.month,
                                 day=today.day)
  self.time_claim.setTarget(time_target)

  self.uniq_prop = {self.date_of_birth, self.date_of_death}
  kb = self.store

  # Collect unique-valued properties.
  # They will be used to update claims in Wikidata accordingly.
  constraint_role = kb["P2302"]
  unique = kb["Q19474404"]  # single-value constraint
  for prop in kb["/w/entity"]("role"):
    for constraint_type in prop(constraint_role):
      if kb.resolve(constraint_type) == unique:
        self.uniq_prop.add(prop)

def process_log_data(self, files):
  no_of_files = len(files)
  file_no = 0
  rs = sling.Store(self.store)
  skipped = 0
  updated = 0
  errors = 0
  deleted = 0
  changed = 0
  redirected = 0
  updates = {}
  for r_file in files:
    file_no += 1
    print "Processing file {:4d} of {} ({})".format(file_no, no_of_files, r_file)
    print r_file
    reader = sling.RecordReader(r_file)
    last_updated = updated
    for item_str, record in reader:
      rec = rs.parse(record)
      status = rec[self.n_status]
      if self.n_skipped in status:
        skipped += 1
        continue
      elif self.n_revision not in status:
        print "ERROR - unknown status"
        errors += 1
        continue
      updated += 1
      wd_item = pywikibot.ItemPage(self.repo, item_str)
      if wd_item.isRedirectPage():
        redirected += 1
        continue
      wd_claims = wd_item.get().get('claims')
      facts = rec[self.n_facts]
      for prop, val in facts:
        p_claims = wd_claims.get(str(prop), [])
        if not p_claims:
          deleted += 1
          continue
        for wd_claim in p_claims:
          if wd_claim.type == "time":
            date = sling.Date(val)                      # parse date from record
            precision = precision_map[date.precision]   # sling to wikidata
            target = pywikibot.WbTime(year=date.year, precision=precision)
          elif wd_claim.type == 'wikibase-item':
            target = pywikibot.ItemPage(self.repo, val)
          else:
            # TODO add location and possibly other types
            print "Error: Unknown claim type", wd_claim.type
            continue
          if not wd_claim.target_equals(target):
            print item_str, target, wd_claim.target
            changed += 1
    reader.close()
    print updated - last_updated
    f = r_file.split("-")
    date = int(f[1] + f[2] + f[3])
    if date not in updates:
      updates[date] = 0
    updates[date] += (updated - last_updated)
    print skipped, "skipped,", updated, "updated,", deleted, "deleted,", \
      changed, "changed,", errors, "error records in file"
  print "Done processing last file"

  # Print number of accumulated updates over time
  first = min(updates)
  acc_upd = 0
  d = datetime.date(first / 10000, (first % 10000) / 100, first % 100)
  while d <= datetime.date.today():
    num = d.year * 10000 + d.month * 100 + d.day
    if num in updates:
      acc_upd += updates[num]
    print d.strftime("%Y-%m-%d") + "," + str(acc_upd)
    d += datetime.timedelta(days = 1)

def __init__(self, filename, commons=None):
  self.input = sling.RecordReader(filename)
  self.iter = iter(self.input)
  self.commons = sling.Store() if commons == None else commons
  self.docschema = sling.DocumentSchema(self.commons)
  if commons == None:
    self.commons.freeze()

def run(self):
  month = "(" + "|".join(self.months.keys()) + ")"
  day = "(\d{1,2})"
  year = "(\d{4})"
  date = "(?:(?:" + day + " " + month + " " + year + ")|"
  date += "(?:" + month + " " + day + ", " + year + "))"
  date += "(?:[^)]+?)?"
  dates = date + u"\s*-+\s*" + date
  dates = u"(?:(?:(?:born|b\.|n\xe9e),? ([^0-9)]*?)" + date + \
          "(?:(?:died|d\.),? [^0-9)]*?" + date + ")?)|(?:" + dates + "))"
  pat = "(?:[^(]|\([^0-9]*\))*?\([^0-9)]*?" + dates + "\s*\)"
  rec = re.compile(pat)

  self.out_file = "local/data/e/wikibot/birth-death-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  store = sling.Store(self.kb)
  for i in range(10):
    i_file = "local/data/e/wiki/en/documents-0000" + str(i) + "-of-00010.rec"
    print i_file, records
    for (item_id, record) in sling.RecordReader(i_file):
      item = self.kb[item_id]
      if self.human not in item(self.instanceof): continue
      if self.precise_date(item(self.date_of_birth)) and \
         self.precise_date(item(self.date_of_death)): continue
      parsed_record = sling.Store().parse(record)
      doc = sling.Document(parsed_record)
      raw_text = parsed_record['text']
      if len(raw_text) == 0: continue
      start_index = raw_text.find("<b>") + len("<b>")
      first = 1
      while first < len(doc.tokens) and \
            doc.tokens[first].start <= start_index:
        first += 1
      last = first
      while last < len(doc.tokens) and doc.tokens[last].brk < 3:
        last += 1
      text = doc.phrase(max(0, first - 1), min(len(doc.tokens), last + 15))
      m = rec.match(text)
      if m is None: continue
      if text.find("(baptised") >= 0 or text.find("throne") >= 0: continue
      if text.find("(baptized") >= 0 or text.find("partner") >= 0: continue
      if m.group(2) or m.group(5):
        first = self.date_from_match(1, m)
        if first.year < 1753: continue  # possibly Julian calendar date
        if m.group(8) or m.group(11):
          second = self.date_from_match(7, m)
          if second.year < 1753: continue  # possibly Julian calendar date
          facts = store.frame({
            self.date_of_birth: first.value(),
            self.date_of_death: second.value()
          })
        else:
          # Only one date match
          mg1 = m.group(1)
          dob = item(self.date_of_birth)
          dod = item(self.date_of_death)
          if mg1 and max(mg1.find("died"), mg1.find("d.")) >= 0:
            # death date only
            if self.precise_date(dod): continue
            if self.same_year(first.year, dob): continue  # b&d too close
            facts = store.frame({
              self.date_of_death: first.value(),
            })
          else:
            # birth date only
            if self.precise_date(dob): continue
            if self.same_year(first.year, dod): continue  # b&d too close
            facts = store.frame({
              self.date_of_birth: first.value(),
            })
      else:
        first = self.date_from_match(13, m)
        second = self.date_from_match(19, m)
        if min(first.year, second.year) < 1753: continue  # possibly Julian
        facts = store.frame({
          self.date_of_birth: first.value(),
          self.date_of_death: second.value()
        })
      records += 1
      provenance = store.frame({
        self.url: parsed_record['url'],
        self.method: "English Wikipedia dates for '" + str(item.name) + "'"
      })
      fact = store.frame({
        self.item: item,
        self.facts: facts,
        self.provenance: provenance
      })
      record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print records, "birth/death date records written to file:", self.out_file

  {=text =/s/document/text}
  {=tokens =/s/document/tokens}
  {=mention =/s/document/mention}
  {=theme =/s/document/theme}
  {=token =/s/token}
  {=index =/s/token/index}
  {=start =/s/token/start}
  {=size =/s/token/length}
  {=break =/s/token/break}
  {=word =/s/token/text}
  {=phrase =/s/phrase}
  {=begin =/s/phrase/begin}
  {=length =/s/phrase/length}
  {=evokes =/s/phrase/evokes}
""")
commons.freeze()

# Convert documents.
num_docs = 0
fin = sling.RecordReader(sys.argv[1])
fout = sling.RecordWriter(sys.argv[2])
for key, value in fin:
  store = sling.Store(commons)
  f = store.parse(value)
  fout.write(key, f.data(binary=True))
  num_docs += 1
fin.close()
fout.close()

print num_docs, "documents converted"