frame["document"] = document.frame for parse in parses: span_array = store.array(len(parse)) for i, span in enumerate(parse): span_array[i] = store.frame({ "begin": span.begin, "end": span.end, "qid": span.qid, "prior": span.prior, "pids": list(span.pids), "count": span.count }) parse_frame = store.frame({"spans": span_array}) frame.append("parse", parse_frame) writer.write(key, frame.data(binary=True)) task.increment("categories_accepted") # Compute histogram over number of parses. for b in self.num_parses_bins: if len(parses) <= b: task.increment("#parses <= %d" % b) if self.num_parses_bins[-1] < len(parses): task.increment("#parses > %d" % self.num_parses_bins[-1]) reader.close() writer.close() rejected.close() register_task("category-parse-generator", CategoryParseGenerator)
      frame_cache = {}   # (pids, qid) -> frame containing their match statistics
      for parse, parse_match in zip(category("parse"), matches):
        for span, span_match in zip(parse.spans, parse_match):
          span_key = (span.pids, span.qid)
          if span_key not in frame_cache:
            match_frame = span_match.as_frame(store)
            frame_cache[span_key] = match_frame
          span["fact_matches"] = frame_cache[span_key]
      writer.write(key, category.data(binary=True))
      task.increment("fact-matcher/categories-processed")

    reader.close()
    writer.close()


register_task("category-parse-fact-matcher", FactMatcherTask)


# Loads a KB and brings up a shell to compute and debug match statistics.
def shell():
  kb = load_kb("local/data/e/wiki/kb.sling")
  extractor = sling.api.FactExtractor(kb)
  matcher = FactMatcher(kb, extractor)

  parses = "local/data/e/wikicat/filtered-parses.rec"
  db = sling.RecordDatabase(parses)

  while True:
    item = raw_input("Enter item or category QID:")
    # See if a category QID was entered, if so, compute and output match
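    # (What follows is an illustrative sketch only; the original handling of
    # the entered QID is elided above. It assumes RecordDatabase.lookup()
    # returns the raw record bytes for a key, and it uses a hypothetical
    # matcher method name, for_parse(), to stand in for whatever call
    # actually computes the match statistics.)
    data = db.lookup(item)
    if data is not None:
      store = sling.Store(kb)
      category = store.parse(data)
      for parse in category("parse"):
        print(matcher.for_parse(parse))  # hypothetical method name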
      # Score each parse.
      parse_with_score = self.score(category)

      # Keep only the top-k parses.
      ranked_parses = sorted(parse_with_score, key=lambda x: -x[1])
      if len(ranked_parses) > max_parses:
        dropped = len(ranked_parses) - max_parses
        ranked_parses = ranked_parses[0:max_parses]
        task.increment("parses-dropped", dropped)
        task.increment("categories-with-too-many-parses")

      # Compute signature for each parse and store it in the parse.
      for parse, _ in ranked_parses:
        tokens, span_signature = self.signature(document, parse)
        parse["signature"] = tokens
        for span in parse.spans:
          if span in span_signature:
            span["signature"] = span_signature[span]

      # Replace the current set of parses with the ranked list.
      del category["parse"]
      for parse, _ in ranked_parses:
        category.append("parse", parse)
      task.increment("parses-kept", len(ranked_parses))

      writer.write(key, category.data(binary=True))

    reader.close()
    writer.close()


register_task("prelim-category-parse-ranker", PrelimCategoryParseRanker)
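
# Minimal self-contained illustration (not pipeline code) of the top-k
# selection performed by PrelimCategoryParseRanker above, using made-up
# (parse, score) tuples and a made-up max_parses value.
def _toy_topk_example():
  parse_with_score = [("p1", 0.2), ("p2", 0.9), ("p3", 0.5)]
  max_parses = 2
  ranked = sorted(parse_with_score, key=lambda x: -x[1])  # highest score first
  dropped = max(0, len(ranked) - max_parses)              # -> 1, i.e. "parses-dropped"
  ranked = ranked[0:max_parses]                           # -> [("p2", 0.9), ("p3", 0.5)]
  return ranked, dropped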