예제 #1
0
  def handle_category(self, qid, form):
    """Writes the parses of one category as an HTML table, one row per parse.

    Rows are ordered by decreasing value of the sort metric chosen in the
    form. Every row has the parse signature, the parse's spans laid out over
    the token columns of the category document, and the metric value.
    Checkboxes in the form add optional columns: preliminary parse scores,
    fact-matching counts, and other categories with the same signature.

    Args:
      qid: key of the category in browser_globals.category_frame.
      form: submitted form; read via getvalue() on its "main_form_*" fields.
    """
    def is_on(name):
      # True iff the checkbox "main_form_<name>" was submitted as "on".
      return form.getvalue("main_form_" + name) == "on"

    # Various options.
    show_span_qid = is_on("show_span_qid")
    show_prelim_parse_scores = is_on("show_prelim_parse_scores")
    show_span_scores = is_on("show_span_scores")
    show_fact_matches = is_on("show_fact_matching_statistics")
    show_span_fact_matches = is_on("show_span_fact_match_stats")
    show_similar_categories = is_on("show_similar_categories")
    signature_type = form.getvalue("main_form_signature_type")
    metric = form.getvalue("main_form_sort_metric")
    fact_weights = self.fact_match_weights(form)

    frame = browser_globals.category_frame[qid]
    document = sling.Document(frame=frame.document)

    # Materialize the parses once: they are both counted and rendered below.
    category_parses = list(frame("parse"))
    self._tag("div", "<b>%s = %s</b>: %d members, %d parses" %
              (qid, frame.name, len(frame.members), len(category_parses)))
    self._br()

    # Write the parses in a tabular format. The preliminary-score column is
    # dropped when the preliminary parse score is itself the sort metric.
    show_prelim_parse_scores = \
      show_prelim_parse_scores and metric != "prelim_parse_score"
    self.write_main_table_header(
      "Signature",
      [t.word for t in document.tokens],
      "Metric",
      "Prelim Scores" if show_prelim_parse_scores else None,
      [t.name for t in FactMatchType] if show_fact_matches else None,
      "Matching Categories" if show_similar_categories else None)

    # Signature -> [(qid, category, parse)] lookup for the optional
    # "Matching Categories" column; it does not change from row to row.
    signature_mapping = browser_globals.full_signature_to_parse
    if signature_type == "coarse":
      signature_mapping = browser_globals.coarse_signature_to_parse
    similar_limit = 5

    # Each parse is written as one row, highest metric value first.
    parses = [(parse, self.parse_score(frame, parse, metric, fact_weights))
              for parse in category_parses]
    parses.sort(key=lambda x: -x[1])
    for parse, metric_value in parses:
      signature = util.parse_signature(parse, signature_type)

      self._begin("tr")
      self._begin("td")
      self._form_anchor(signature, signature)
      self._end("td")
      self._separator(header=False)

      # Spans are placed under the tokens they cover; tokens not covered by
      # any span get an empty cell.
      prev_span_end = -1
      for span in parse.spans:
        for _ in xrange(prev_span_end + 1, span.begin):
          self._empty_cell()

        self._begin("td", colspan=span.end-span.begin, align='middle')
        text = util.span_signature(span, signature_type)
        if show_span_qid:
          text += " (" + str(span.qid) + ")"
        title = '.'.join([str(p) for p in span.pids]) + ' = ' + str(span.qid)
        if "name" in span.qid:
          # The key must be the string "name"; a bare `name` is undefined in
          # this scope and raised NameError.
          title += " (" + span.qid["name"] + ")"
        self._tag("span", text, title=title)

        if show_span_scores and "prior" in span:
          self._br()
          self._text("%s = %0.4f" % ("prior", span.prior))

        if show_span_fact_matches:
          local_counts = util.fact_matches_for_span(span)
          self._br()
          self._begin("table class='span_fact_match'")
          self._begin("thead")
          for t in FactMatchType:
            self._tag("th", t.name)
          self._end("thead")
          self._begin("tr")
          self.write_fact_match_counts(local_counts)
          self._end(["tr", "table"])

        self._end("td")
        prev_span_end = span.end - 1

      for _ in xrange(prev_span_end + 1, len(document.tokens)):
        self._empty_cell()

      # Integer metrics are shown verbatim, everything else with 4 decimals.
      self._separator(header=False)
      if type(metric_value) is int:
        self._cell(metric_value)
      else:
        self._cell("%.4f" % metric_value)

      if show_prelim_parse_scores:
        self._separator(header=False)
        self._begin("td class='numeric'")
        for score_type in ["prior", "member_score", "cover"]:
          if score_type in parse:
            self._text("%s = %0.4f" % (score_type, parse[score_type]))
            self._br()
        if "score" in parse:
          self._color_text("Overall = %0.4f" % parse.score, "blue")
        self._end("td")

      if show_fact_matches:
        self._separator(header=False)
        total_fact_counts = util.fact_matches_for_parse(parse)
        self.write_fact_match_counts(total_fact_counts)

      if show_similar_categories:
        # List up to `similar_limit` distinct other categories that have a
        # parse with the same signature.
        self._separator(header=False)
        self._begin("td")
        seen = set()
        for (other_qid, other_category, other_parse) in \
          signature_mapping[signature]:
          if len(seen) >= similar_limit:
            break
          if other_qid != qid and other_qid not in seen:
            seen.add(other_qid)
            self._text(other_category.name)
            self._form_anchor(" (= %s)" % other_qid, other_qid)
            self._text(" (%0.4f)" % other_parse.score)
            self._br()
        self._end("td")
      self._end("tr")
    self._end("table")
예제 #2
0
  def handle_signature(self, signature, categories, form):
    """Writes the HTML report for all parses that share `signature`.

    The report has a summary line, a table of span-level fact-matching
    statistics, a form for generating a recordio file (only when the form's
    signature type is "full"), and a table with one row per category.

    Args:
      signature: signature string being browsed.
      categories: iterable of (qid, category, parse) tuples for the signature.
      form: submitted form; read via getvalue() on its "main_form_*" fields.
    """
    score_type = form.getvalue("main_form_sort_metric")
    fact_weights = self.fact_match_weights(form)
    signature_type = form.getvalue("main_form_signature_type")

    # Score every (category, parse) pair and rank them, best score first.
    ranked = [
      (qid, category, parse,
       self.parse_score(category, parse, score_type, fact_weights))
      for (qid, category, parse) in categories]
    ranked.sort(key=lambda entry: -entry[3])

    # Accumulate fact-matching statistics. When a category has several
    # parses with this signature, only its top-ranked parse contributes.
    parses_per_category = defaultdict(int)
    overall_matches = util.MatchCounts()
    matches_by_span = defaultdict(util.MatchCounts)
    total_members = 0
    for qid, category, parse, _ in ranked:
      parses_per_category[qid] += 1
      if parses_per_category[qid] == 1:
        total_members += len(category.members)
        util.fact_matches_for_parse(parse, overall_matches)
        for span in parse.spans:
          span_key = util.span_signature(span, signature_type)
          util.fact_matches_for_span(span, matches_by_span[span_key])

    # Summary.
    summary = "<b>%s</b>: in %d (category, parse) pairs across %d categories" \
      % (signature, len(categories), len(parses_per_category))
    self._tag("div", summary)
    self._tag("b", "#Items across categories: ")
    self._text("%d (incl. possible duplicates)" % total_members)

    # Span-level statistics, one row per span signature plus a totals row.
    self._br()
    self._tag("b", "Span-level fact-matching statistics")
    self.write_main_table_header(
      ["Span Signature"], [t.name for t in FactMatchType])
    for span_key, span_stats in matches_by_span.iteritems():
      self._begin("tr")
      self._cell(span_key)
      self._separator(header=False)
      self.write_fact_match_counts(span_stats)
      self._end("tr")
    self._begin("tr")
    self._cell("All")
    self._separator(header=False)
    self.write_fact_match_counts(overall_matches)
    self._end("tr")
    self._end("table")

    # Form for generating a recordio file; offered for full signatures only.
    if signature_type == "full":
      self._br()
      self._begin("table class='main_table'")
      self._begin("tr")
      self._begin("td")
      self._begin("form", id="recordio_form", method="POST", action="",
                  target="_blank")
      self._begin_end("input", type="hidden", name="main_form_functionality",
                      value="recordio")
      self._begin_end("input", type="hidden", name="recordio_signature",
                      id="recordio_signature", value=signature)
      self._tag("b", "Generate recordio for this signature")
      self._br()
      self._br()
      self._text("Filename: ")
      default_path = \
        "local/data/e/wikicat/" + signature.replace("$", "_") + ".rec"
      self._begin_end("input", type="text", size=100,
                      name="recordio_filename", value=default_path)
      self._br()
      self._text("Category QIDs ('ALL' for all): ")
      self._begin_end("input", type="text", size=100, value="ALL",
                      name="recordio_categories")
      self._br()
      self._text("Generate facts for these types:")
      self._begin_end("input", type="text", size=100,
                      value="NEW,ADDITIONAL,SUBSUMED_BY_EXISTING",
                      name="recordio_match_types")
      self._br()
      self._text("Generate the following facts:")
      self._br()

      # One pre-checked checkbox per signature token that starts with '$'
      # and contains a further '$'.
      span_tokens = [token for token in signature.split()
                     if token.startswith("$") and "$" in token[1:]]
      for index, token in enumerate(span_tokens):
        field = "recordio_span%d" % index
        self._text("&nbsp;&nbsp;" + token + " ")
        self._begin_end("input", type="checkbox", name=field, id=field,
                        checked="on")
        self._br()
      self._begin_end("input", type="hidden", name="recordio_num_spans",
                      id="recordio_total_spans", value=len(span_tokens))
      self._begin_end("input", type="submit")
      self._end(["form", "td", "tr", "table"])

    # Per-category table: one row per category (its top-ranked parse),
    # capped at max_rows rows.
    self._br()
    self._tag("b", "Categories with parses matching '" + signature + "'")
    max_rows = 200
    self.write_main_table_header(
      ["Category", "Prelim parse score", "#Members", "Fact-matching score"],
      [t.name for t in FactMatchType])
    shown = set()
    for qid, category, parse, _ in ranked:
      if len(shown) >= max_rows:
        break
      if qid in shown:
        continue
      shown.add(qid)

      self._begin("tr")
      self._begin("td")
      self._form_anchor(qid + ": " + category.name, qid)
      extra = parses_per_category[qid] - 1
      if extra > 0:
        plural = "" if extra == 1 else "s"
        self._text(" (%d more parse%s)" % (extra, plural))
      self._end("td")
      self._cell(parse.score, numeric=True)
      self._cell(len(category.members), numeric=True)
      fact_score = self.parse_fact_score(parse, fact_weights)
      self._cell("%.4f" % fact_score, True)

      self._separator(header=False)
      self.write_fact_match_counts(util.fact_matches_for_parse(parse))
      self._end("tr")
    self._end("table")
예제 #3
0
    def handle_signature(self, signature, categories, form):
        """Writes the HTML report for all parses that share `signature`.

        The report has a summary line, a table of span-level fact-matching
        statistics, and a table with one row per category.

        Args:
          signature: signature string being browsed.
          categories: iterable of (qid, category, parse) tuples for the
            signature.
          form: submitted form; read via getvalue() on its "main_form_*"
            fields.
        """
        score_type = form.getvalue("main_form_sort_metric")
        fact_weights = self.fact_match_weights(form)
        signature_type = form.getvalue("main_form_signature_type")

        # Score every (category, parse) pair and rank them, best score first.
        ranked = [
            (qid, category, parse,
             self.parse_score(category, parse, score_type, fact_weights))
            for (qid, category, parse) in categories
        ]
        ranked.sort(key=lambda entry: -entry[3])

        # Accumulate fact-matching statistics. When a category has several
        # parses with this signature, only its top-ranked parse contributes.
        parses_per_category = defaultdict(int)
        overall_matches = defaultdict(int)
        matches_by_span = defaultdict(lambda: defaultdict(int))
        total_members = 0
        for qid, category, parse, _ in ranked:
            parses_per_category[qid] += 1
            if parses_per_category[qid] == 1:
                total_members += len(category.members)
                util.fact_matches_for_parse(parse, overall_matches)
                for span in parse.spans:
                    span_key = util.span_signature(span, signature_type)
                    util.fact_matches_for_span(span, matches_by_span[span_key])

        # Summary.
        summary = \
            "<b>%s</b>: in %d (category, parse) pairs across %d categories" \
            % (signature, len(categories), len(parses_per_category))
        self._tag("div", summary)
        self._tag("b", "#Items across categories: ")
        self._text("%d (incl. possible duplicates)" % total_members)

        # Span-level statistics, one row per span signature plus a totals row.
        self._br()
        self._tag("b", "Span-level fact-matching statistics")
        self.write_main_table_header(["Span Signature"],
                                     [t.name for t in FactMatchType])
        for span_key, span_stats in matches_by_span.iteritems():
            self._begin("tr")
            self._cell(span_key)
            self._separator(header=False)
            self.write_fact_match_counts(span_stats)
            self._end("tr")
        self._begin("tr")
        self._cell("All")
        self._separator(header=False)
        self.write_fact_match_counts(overall_matches)
        self._end("tr")
        self._end("table")

        # Per-category table: one row per category (its top-ranked parse),
        # capped at max_rows rows.
        self._br()
        self._tag("b", "Categories with parses matching '" + signature + "'")
        max_rows = 200
        self.write_main_table_header(
            ["Category", "Prelim parse score", "#Members",
             "Fact-matching score"],
            [t.name for t in FactMatchType])
        shown = set()
        for qid, category, parse, _ in ranked:
            if len(shown) >= max_rows:
                break
            if qid in shown:
                continue
            shown.add(qid)

            self._begin("tr")
            self._begin("td")
            self._form_anchor(qid + ": " + category.name, qid)
            extra = parses_per_category[qid] - 1
            if extra > 0:
                plural = "" if extra == 1 else "s"
                self._text(" (%d more parse%s)" % (extra, plural))
            self._end("td")
            self._cell(parse.score, numeric=True)
            self._cell(len(category.members), numeric=True)
            fact_score = self.parse_fact_score(parse, fact_weights)
            self._cell("%.4f" % fact_score, True)

            self._separator(header=False)
            self.write_fact_match_counts(util.fact_matches_for_parse(parse))
            self._end("tr")
        self._end("table")
예제 #4
0
  def write_recordio(self, form):
    """Writes facts implied by the parses of a signature to a recordio file.

    For every selected category that has a parse for the signature, and for
    every selected span of that parse, one record is written per member that
    util.fact_matches_for_span() lists under an allowed match type. Only the
    first listed parse of each category is used, and spans with more than one
    pid are skipped. A short report is written to the page afterwards.

    Args:
      form: submitted form; read via getvalue() on its "recordio_*" fields:
        recordio_filename, recordio_signature, recordio_categories
        (comma-separated QIDs, or 'ALL'), recordio_num_spans, the
        recordio_span<i> checkboxes, and recordio_match_types
        (comma-separated).
    """
    def field_text(name):
      # A missing field is treated as empty text rather than failing on None.
      return (form.getvalue(name) or "").strip()

    def comma_separated(text):
      # Tolerates spaces around the commas, e.g. "Q1, Q2".
      return set(part.strip() for part in text.split(","))

    filename = form.getvalue("recordio_filename")
    signature = form.getvalue("recordio_signature")
    categories_text = field_text("recordio_categories")
    all_chosen = categories_text == 'ALL'
    chosen_categories = comma_separated(categories_text)

    # See which spans in the signature should be focused on.
    num_spans = int(form.getvalue("recordio_num_spans"))
    chosen_spans = [index for index in xrange(num_spans)
                    if form.getvalue("recordio_span%d" % index) == "on"]

    # The signature may be a coarse or a full one; coarse is tried first.
    if signature in browser_globals.coarse_signature_to_parse:
      parses = browser_globals.coarse_signature_to_parse[signature]
    else:
      parses = browser_globals.full_signature_to_parse[signature]

    allowed_match_types = comma_separated(field_text("recordio_match_types"))
    store = browser_globals.store
    counts = defaultdict(int)
    categories_seen = set()

    writer = sling.RecordWriter(filename)
    try:
      for (category_qid, category_frame, parse) in parses:
        # Skip category if we shouldn't extract facts for its members.
        if not all_chosen and category_qid not in chosen_categories:
          continue

        # Only the first parse listed for a category is used.
        if category_qid in categories_seen:
          continue
        categories_seen.add(category_qid)

        for index in chosen_spans:
          span = parse.spans[index]
          pids = span.pids
          qid = span.qid

          # Can't upload multi-hop facts yet.
          if len(pids) > 1:
            continue

          pid = pids[0]
          matches = util.fact_matches_for_span(span, max_examples=-1)
          for match_type, examples in matches.examples.iteritems():
            if match_type not in allowed_match_types:
              continue

            for member in examples:
              fact = store.frame([("item", member)])
              fact.facts = store.frame([(pid, qid)])
              fact.provenance = store.frame([
                  ("category", category_qid),
                  ("method", "Member of Category:%s" % category_frame.name)
              ])
              fact.comment = "%s : %s = %s" % (
                  member.name if member.name is not None else member,
                  span.signature,
                  qid.name if qid.name is not None else qid)
              counts[(pid, match_type)] += 1
              writer.write(member.id, fact.data(binary=True))
    finally:
      # Close the writer even if building or writing a record fails.
      writer.close()

    self._text("Wrote recordio to: " + str(filename))
    self._br()
    self._text("Fact counts by property: %s" % dict(counts))
예제 #5
0
 def compute_span_counts(self):
     """Fills self.span_counts with one fact-match result per parse span.

     Does nothing when self.span_counts is already non-empty. Otherwise the
     result of util.fact_matches_for_span() is appended for every span of
     self.parse, in span order.
     """
     if len(self.span_counts) > 0:
         return
     for span in self.parse.spans:
         self.span_counts.append(util.fact_matches_for_span(span))