Example #1
 def find_date_mentions(self) -> Iterable:
     for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
         doc_wid = str(doc_wid, 'utf-8')
         store = sling.Store(self.commons)
         frame = store.parse(doc_raw)
         document = sling.Document(frame, store, self.docschema)
         doc_title = get_metadata(frame)[1]
         if len(document.tokens) == 0:
             continue
         sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
         postion2time: Dict[int, int] = {}
         for ii, mention in enumerate(sorted_mentions):
             linked_entity = self.get_linked_entity(mention)
             if type(linked_entity) is not int:
                 continue
             for i in range(mention.begin, mention.end):
                 postion2time[i] = linked_entity
         colored_tokens: List[str] = []
         for i, tok in enumerate(document.tokens):
             if i in postion2time:
                 colored_tokens.append(
                     colored('{}:{}'.format(tok.word, postion2time[i]),
                             'green'))
             else:
                 if tok.word.isnumeric():
                     colored_tokens.append(colored(tok.word, 'red'))
                 else:
                     colored_tokens.append(tok.word)
         colored_text = ' '.join(colored_tokens)
         yield doc_wid, doc_title, colored_text
Example #2
    def find_all_mentions(self):
        for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
            doc_wid = str(doc_wid, 'utf-8')
            store = sling.Store(self.commons)
            frame = store.parse(doc_raw)
            document = sling.Document(frame, store, self.docschema)
            doc_title = get_metadata(frame)[1]
            if len(document.tokens) == 0:
                continue
            sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
            all_mentions: List[Tuple[int, int, Any]] = []
            for ii, mention in enumerate(sorted_mentions):
                linked_entity = self.get_linked_entity(mention)
                all_mentions.append(
                    (mention.begin, mention.end, linked_entity))

            tokens = [t.word for t in document.tokens]
            prev_e = 0
            colored_tokens: List[str] = []
            for s, e, wid in all_mentions:
                colored_tokens.append(' '.join(tokens[prev_e:s]))
                colored_tokens.append(
                    colored('{}||{}'.format(' '.join(tokens[s:e]), wid),
                            'green'))
                prev_e = e
            colored_text = ' '.join(colored_tokens)
            yield doc_wid, doc_title, colored_text
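Both snippets above are methods of a corpus-reader class and rely on module-level imports that are not shown. A plausible set, with termcolor supplying the colored() helper used for highlighting, is:

from typing import Any, Dict, Iterable, List, Tuple

import sling
from termcolor import colored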
Example #3
    def _end_document(self, callback=None):
        if len(self.tokens.spans) == 0: return

        # Save SRL annotations for the last sentence.
        self._save_srl_annotations()

        # Add tokens as leaf constituents.
        self._add_token_constituents()

        # Find heads of all constituents.
        for node in self.constituents.spans:
            self.converter.head_finder.find(node)

        # Generate input statistics.
        self._summarize_input()

        # Sanity check: all annotations should be complete.
        assert len(self.current_srl) == 0, self.current_srl
        assert self.ner.all_ended()
        assert self.coref.all_ended()
        assert self.constituents.all_ended()
        for s in self.srl:
            assert s.all_ended()

        # Write the SLING document and invoke the callback.
        if callback is not None:
            store = sling.Store(self.converter.commons)
            document = sling.Document(None, store, self.converter.schema)
            self.write(document)
            callback(document)
Example #4
def end_document():
  global document, brk, begin
  if document is not None:
    end_span()
    document.update()
    fout.write(None, document.frame.data(binary=True))
  document = sling.Document(store=store)
  brk = sling.NO_BREAK
  begin = None
  kind = None
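This helper writes out the previous document and starts a new one, relying on module-level state and an end_span() helper that are outside the snippet. A minimal sketch of the assumed globals (the output path is hypothetical):

import sling

store = sling.Store()
fout = sling.RecordWriter("/tmp/documents.rec")  # hypothetical output file
document = None        # sling.Document currently being built, if any
brk = sling.NO_BREAK   # break level to attach to the next token
begin = None           # start token of the currently open span, if any

Note that kind is assigned at the end of end_document() without appearing in the global statement, so that assignment only creates a local variable.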
Example #5
def lex(text, store=None, schema=None):
  # Initialize tokenizer if needed.
  global tokenizer
  if tokenizer == None: tokenizer = sling.api.Tokenizer()

  # Create store for document if needed.
  if store == None: store = sling.Store()

  # Parse LEX-encoded text.
  frame = tokenizer.lex(store, text)

  # Return document with annotations.
  return sling.Document(frame, store, schema)
Example #6
def tokenize(text, store=None, schema=None):
    # Initialize tokenizer if needed.
    global tokenizer
    if tokenizer == None: tokenizer = sling.api.Tokenizer()

    # Create store for document if needed.
    if store == None: store = sling.Store()

    # Tokenize text.
    frame = tokenizer.tokenize(store, text)

    # Return document with tokens.
    return sling.Document(frame, store, schema)
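A minimal usage sketch for the two helpers above, assuming a module-level tokenizer = None as in the original files and a standard SLING installation:

import sling

tokenizer = None  # initialized lazily by lex() / tokenize()

doc = tokenize("John lives in Paris.")
print([t.word for t in doc.tokens])  # e.g. ['John', 'lives', 'in', 'Paris', '.']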
Example #7
def extract_entity_mentions(nq_data, labelled_record):
    """Parse ourput corpus and create map from tokens to entity ids.

  Args:
    nq_data: A python dictionary containint NQ data of 1 train/dev shard
    labelled_record: Sling output document with labelled paragraphs

  Returns:
    nq_data: Original object augmented with entity maps
  """
    recin = sling.RecordReader(labelled_record)
    commons = sling.Store()
    docschema = sling.DocumentSchema(commons)
    commons.freeze()
    cnt = 1

    for key, value in recin:
        store = sling.Store(commons)
        doc = sling.Document(store.parse(value), store, docschema)
        index, ans_type, idx, ans_id = key.decode("utf-8").split("|")
        cnt += 1
        entity_map = {}

        # Parse entity mentions labelled by sling
        for m in doc.mentions:
            e = [i["is"] for i in m.evokes()]
            if not e:
                continue
            if is_sling_entity(e):
                e_val = e[0]["id"]
                if m.begin in entity_map:
                    entity_map[m.begin].append((m.end, e_val))
                else:
                    entity_map[m.begin] = [(m.end, e_val)]

        if ans_type == "annotated_long_answer":
            nq_data[index]["annotations"][int(
                idx)]["long_answer"]["entity_map"] = entity_map
        elif ans_type == "question":
            nq_data[index]["question_entity_map"] = entity_map
        elif ans_type == "annotated_short_answer":
            nq_data[index]["annotations"][int(idx)]["short_answers"][int(
                ans_id)]["entity_map"] = entity_map
        else:
            nq_data[index]["long_answer_candidates"][int(
                idx)]["entity_map"] = entity_map
    return nq_data
Example #8
    def run(self, task):
        self.init(task)

        max_parses = int(task.param("max_parses"))
        reader = sling.RecordReader(task.input("input").name)
        writer = sling.RecordWriter(task.output("output").name)
        for index, (key, value) in enumerate(reader):
            store = sling.Store(self.kb)
            category = store.parse(value)
            document = sling.Document(category.document)

            # Score each parse.
            parse_with_score = self.score(category)

            # Keep only the top-k parses.
            ranked_parses = sorted(parse_with_score, key=lambda x: -x[1])
            if len(ranked_parses) > max_parses:
                dropped = len(ranked_parses) - max_parses
                ranked_parses = ranked_parses[0:max_parses]
                task.increment("parses-dropped", dropped)
                task.increment("categories-with-too-many-parses")

            # Compute signature for each parse and store it in the parse.
            for parse, _ in ranked_parses:
                tokens, span_signature = self.signature(document, parse)
                parse["signature"] = tokens
                for span in parse.spans:
                    if span in span_signature:
                        span["signature"] = span_signature[span]

                # Also compute the coarse signature.
                tokens, span_signature = self.signature(document,
                                                        parse,
                                                        coarse=True)
                parse["coarse_signature"] = tokens
                for span in parse.spans:
                    if span in span_signature:
                        span["coarse_signature"] = span_signature[span]

            # Replace the current set of parses with the ranked list.
            del category["parse"]
            for parse, _ in ranked_parses:
                category.append("parse", parse)
            task.increment("parses-kept", len(ranked_parses))
            writer.write(key, category.data(binary=True))
        reader.close()
        writer.close()
Example #9
 def iter_mentions_position(
     self,
     wid_set: Set[str] = None
 ) -> Iterable[Tuple[str, Dict[str, List[int]]]]:
     for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
         doc_wid = str(doc_wid, 'utf-8')
         if wid_set is not None and doc_wid not in wid_set:
             continue
         store = sling.Store(self.commons)
         frame = store.parse(doc_raw)
         document = sling.Document(frame, store, self.docschema)
         sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
         m2pos: Dict[str, List[int]] = defaultdict(list)
         for ii, mention in enumerate(sorted_mentions):
             linked_entity = self.get_linked_entity(mention)
             m2pos[linked_entity].append(mention.begin)
         yield (doc_wid, m2pos)
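A usage sketch, assuming this method lives on the same corpus-reader object as the other snippets (here called reader, a hypothetical name):

from collections import Counter

# Count how often each linked entity is mentioned in the selected documents.
counts = Counter()
for doc_wid, m2pos in reader.iter_mentions_position(wid_set={'Q42', 'Q5'}):
    for entity, positions in m2pos.items():
        counts[entity] += len(positions)
print(counts.most_common(10))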
Example #10
 def iter_mentions(self, wid_set: Set[str]=None, only_entity: bool=False, split_by: str=None) -> \
         Iterable[Tuple[str, sling.Document, List[Tuple[str, int, int]]]]:
     assert split_by in {'sentence', None}, 'not supported split_by'
     split_by = {'sentence': 3, None: None}[split_by]
     for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
         doc_wid = str(doc_wid, 'utf-8')
         if wid_set is not None and doc_wid not in wid_set:
             continue
         store = sling.Store(self.commons)
         frame = store.parse(doc_raw)
         document = sling.Document(frame, store, self.docschema)
         sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
         tokens = [t.word for t in document.tokens]
         split_start = [0] + [
             i for i, t in enumerate(document.tokens) if t.brk == split_by
         ]
         split_ind = 0
         mentions: List[Tuple[str, int, int]] = []
         for mention in sorted_mentions:
             while (len(split_start) > split_ind + 1
                    and mention.begin >= split_start[split_ind + 1]):
                 if len(mentions) > 0:
                     yield (doc_wid,
                            tokens[split_start[split_ind]:split_start[split_ind + 1]],
                            mentions)
                     mentions = []
                 split_ind += 1
             if (len(split_start) > split_ind + 1
                     and mention.end > split_start[split_ind + 1]):
                 # Skip mentions that cross the split boundary.
                 continue
             linked_entity = self.get_linked_entity(mention)
             if only_entity and (type(linked_entity) is not str
                                 or not linked_entity.startswith('Q')):
                 continue
             mentions.append(
                 (linked_entity, mention.begin - split_start[split_ind],
                  mention.end - split_start[split_ind]))
         if len(mentions) > 0:
             yield (doc_wid, tokens[split_start[split_ind]:], mentions)
Example #11
def build(recordio_filenames, output_filename, text=False):
    commons = sling.Store()
    schema = sling.DocumentSchema(commons)
    commons.freeze()

    symbol_names = {}
    symbol_names["thing"] = 1

    # Adds handle's id to 'symbol_names' if it is already not in 'commons'.
    def add(handle):
        if type(handle) is not sling.Frame or handle.id is None: return

        id_str = str(handle.id)
        if commons[id_str] is not None: return

        if id_str not in symbol_names: symbol_names[id_str] = 0
        symbol_names[id_str] += 1

    for filename in recordio_filenames:
        reader = sling.RecordReader(filename)
        for key, value in reader:
            store = sling.Store(commons)
            document = sling.Document(store.parse(value), schema=schema)

            for mention in document.mentions:
                for frame in mention.evokes():
                    for slot_role, slot_value in frame:
                        add(slot_role)
                        add(slot_value)

            for theme in document.themes:
                for slot_role, slot_value in theme:
                    add(slot_role)
                    add(slot_value)

    output = sling.Store()
    schema = sling.DocumentSchema(output)

    for name, count in symbol_names.items():
        output.frame({"id": name})
    output.freeze()
    output.save(output_filename, binary=not text)
    return output, symbol_names
Example #12
    def parse(self, obj):
        if type(obj) is sling.Document:
            # Parse document in place.
            obj.update()
            self.parser.parse(obj.frame)
            obj.refresh_annotations()
            return obj
        elif type(obj) is sling.Frame:
            # Parse document frame and return parsed document.
            self.parser.parse(obj)
            return sling.Document(obj)
        else:
            # Create local store for new document.
            store = sling.Store(self.commons)

            # Tokenize text.
            doc = tokenize(str(obj), store=store, schema=self.schema)

            # Parse document.
            self.parser.parse(doc.frame)
            doc.refresh_annotations()
            return doc
Example #13
    def link_documents(self,
                       N=None,
                       out_file="/tmp/linked.rec",
                       add_negatives=False,
                       filter_subjects=None):
        """ Load n documents and link them to facts """
        start = time.time()
        fout = open(out_file, "w")
        for n, (doc_id, doc_raw) in enumerate(self.corpus.input):
            if n == N:
                break
            if n % 1000 == 0:
                print("processed", n, "items in %.1f" % (time.time() - start),
                      "seconds")
            # get kb items
            doc_id = str(doc_id, "utf-8")
            if filter_subjects is not None and doc_id not in filter_subjects:
                continue
            kb_item = self.kb[doc_id]
            tail_entities = {}
            all_properties = []
            for prop, tail in kb_item:
                tup = self.get_canonical_property(prop, tail)
                if tup is None:
                    tup = self.get_date_property(prop, tail)
                    if tup is None:
                        continue
                tail_entities[tup[1]] = tup[0]
                all_properties.append(tup[0])
            store = sling.Store(self.commons)
            document = sling.Document(store.parse(doc_raw), store,
                                      self.docschema)
            if len(document.tokens) == 0:
                print("Skipping %s No tokens." % (doc_id))
                continue
            # build token maps
            tok_to_sent_id, tok_to_para_id, sent_to_span, para_to_span = {}, {}, {}, {}
            tok_to_char_offset = {}
            offset = 0
            sent_begin = para_begin = 0
            sent_id = para_id = 0
            for ii, token in enumerate(document.tokens):
                if ii > 0 and token.brk == 4:
                    para_to_span[para_id] = (para_begin, ii)
                    sent_to_span[sent_id] = (sent_begin, ii)
                    para_id += 1
                    sent_id += 1
                    sent_begin = para_begin = ii
                elif ii > 0 and token.brk == 3:
                    sent_to_span[sent_id] = (sent_begin, ii)
                    sent_id += 1
                    sent_begin = ii
                tok_to_sent_id[ii] = sent_id
                tok_to_para_id[ii] = para_id
                tok_to_char_offset[ii] = offset
                offset += len(token.word) + 1
            para_to_span[para_id] = (para_begin, ii + 1)
            sent_to_span[sent_id] = (sent_begin, ii + 1)
            # find subjects
            sent_to_subj, para_to_subj = defaultdict(list), defaultdict(list)
            mentid_to_linked_entity = {}
            sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
            for ii, mention in enumerate(sorted_mentions):
                if tok_to_sent_id[mention.begin] != tok_to_sent_id[mention.end
                                                                   - 1]:
                    continue
                linked_entity = self.get_linked_entity(mention)
                mentid_to_linked_entity[ii] = linked_entity
                if linked_entity == doc_id:
                    sent_id = tok_to_sent_id[mention.begin]
                    sent_to_subj[sent_id].append(mention)
                    para_id = tok_to_para_id[mention.begin]
                    para_to_subj[para_id].append(mention)

            # find tails
            relations = []
            seen_properties = {}
            for ii, mention in enumerate(sorted_mentions):
                # first look for sentence matches
                linked_entity = mentid_to_linked_entity[ii]
                if linked_entity == doc_id:
                    continue
                if linked_entity in tail_entities:
                    if tail_entities[linked_entity] in seen_properties:
                        continue
                    my_sent = tok_to_sent_id[mention.begin]
                    if my_sent in sent_to_subj:
                        my_para = tok_to_para_id[mention.begin]
                        para_span = para_to_span[my_para]
                        #sent_span = sent_to_span[my_sent]
                        fout.write(
                            self.serialize_relation(
                                document, tok_to_char_offset, para_span,
                                doc_id, sent_to_subj[my_sent],
                                tail_entities[linked_entity], linked_entity,
                                mention, "sentence") + "\n")
                        seen_properties[tail_entities[linked_entity]] = my_para
                        self.relation_stats["sentences"][
                            tail_entities[linked_entity]] += 1

            for ii, mention in enumerate(sorted_mentions):
                # next look for paragraph matches
                linked_entity = mentid_to_linked_entity[ii]
                if linked_entity == doc_id:
                    continue
                if linked_entity in tail_entities:
                    if tail_entities[linked_entity] in seen_properties:
                        continue
                    my_para = tok_to_para_id[mention.begin]
                    if my_para in para_to_subj:
                        para_span = para_to_span[my_para]
                        fout.write(
                            self.serialize_relation(
                                document, tok_to_char_offset, para_span,
                                doc_id, para_to_subj[my_para],
                                tail_entities[linked_entity], linked_entity,
                                mention, "paragraph") + "\n")
                        seen_properties[tail_entities[linked_entity]] = my_para
                        self.relation_stats["paragraphs"][
                            tail_entities[linked_entity]] += 1

            # add negatives
            if add_negatives:
                max_neg = len(seen_properties)
                num_neg = 0
                all_para_id = list(para_to_subj.keys())
                if not all_para_id:
                    continue
                for tail, prop in tail_entities.items():
                    if num_neg == max_neg:
                        break
                    if prop in seen_properties:
                        continue
                    random_para_id = random.choice(all_para_id)
                    random_para_span = para_to_span[random_para_id]
                    fout.write(
                        self.serialize_relation(
                            document, tok_to_char_offset, random_para_span,
                            doc_id, para_to_subj[random_para_id], prop, None,
                            None, "entity negative") + "\n")
                    num_neg += 1
                    seen_properties[prop] = None
                    self.relation_stats["entity negatives"][prop] += 1

        fout.close()
        print("Sentences -- Total ",
              sum(self.relation_stats["sentences"].values()))
        print(" :: ".join(
            "%s:%d" % (k, v)
            for k, v in self.relation_stats["sentences"].items()))
        print("Paragraphs -- Total ",
              sum(self.relation_stats["paragraphs"].values()))
        print(" :: ".join(
            "%s:%d" % (k, v)
            for k, v in self.relation_stats["paragraphs"].items()))
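The token-map loop above keys off hard-coded break levels: brk == 3 marks a sentence break and brk == 4 a paragraph break (Example #21 uses the named constant sling.SENTENCE_BREAK for the former, and sling.PARAGRAPH_BREAK is assumed to be the paragraph-level counterpart). A short sketch of the same sentence segmentation written against the named constants:

import sling

def sentence_spans(document):
    # Yield (begin, end) token ranges for each sentence in a sling.Document.
    begin = 0
    for i, token in enumerate(document.tokens):
        if i > 0 and token.brk >= sling.SENTENCE_BREAK:
            yield begin, i
            begin = i
    if begin < len(document.tokens):
        yield begin, len(document.tokens)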
Example #14
    def run(self):
        month = "(" + "|".join(self.months.keys()) + ")"
        day = "(\d{1,2})"
        year = "(\d{4})"
        date = "(?:(?:" + day + " " + month + " " + year + ")|"
        date += "(?:" + month + " " + day + ", " + year + "))"
        date += "(?:[^)]+?)?"
        dates = date + u"\s*-+\s*" + date
        dates = u"(?:(?:(?:born|b\.|n\xe9e),? ([^0-9)]*?)" + date + \
          "(?:(?:died|d\.),? [^0-9)]*?" + date + ")?)|(?:" + dates + "))"
        pat = "(?:[^(]|\([^0-9]*\))*?\([^0-9)]*?" + dates + "\s*\)"
        rec = re.compile(pat)

        self.out_file = "local/data/e/wikibot/birth-death-dates.rec"
        record_file = sling.RecordWriter(self.out_file)
        records = 0
        store = sling.Store(self.kb)

        for i in range(10):
            i_file = "local/data/e/wiki/en/documents-0000" + str(
                i) + "-of-00010.rec"
            print i_file, records
            for (item_id, record) in sling.RecordReader(i_file):
                item = self.kb[item_id]
                if self.human not in item(self.instanceof): continue
                if self.precise_date(item(self.date_of_birth)) and \
                   self.precise_date(item(self.date_of_death)):
                    continue
                parsed_record = sling.Store().parse(record)
                doc = sling.Document(parsed_record)
                raw_text = parsed_record['text']
                if len(raw_text) == 0: continue
                start_index = raw_text.find("<b>") + len("<b>")
                first = 1
                while first < len(doc.tokens) and \
                      doc.tokens[first].start <= start_index:
                    first += 1
                last = first
                while last < len(doc.tokens) and doc.tokens[last].brk < 3:
                    last += 1
                text = doc.phrase(max(0, first - 1),
                                  min(len(doc.tokens), last + 15))
                m = rec.match(text)
                if m is None: continue
                if text.find("(baptised") >= 0 or text.find("throne") >= 0:
                    continue
                if text.find("(baptized") >= 0 or text.find("partner") >= 0:
                    continue
                if m.group(2) or m.group(5):
                    first = self.date_from_match(1, m)
                    if first.year < 1753:
                        continue  # possibly Julian calendar date
                    if m.group(8) or m.group(11):
                        second = self.date_from_match(7, m)
                        if second.year < 1753:
                            continue  # possibly Julian calendar date
                        facts = store.frame({
                            self.date_of_birth: first.value(),
                            self.date_of_death: second.value()
                        })
                    else:
                        # Only one date match
                        mg1 = m.group(1)
                        dob = item(self.date_of_birth)
                        dod = item(self.date_of_death)
                        if mg1 and max(mg1.find("died"), mg1.find("d.")) >= 0:
                            # death date only
                            if self.precise_date(dod): continue
                            if self.same_year(first.year, dob):
                                continue  # b&d too close
                            facts = store.frame({
                                self.date_of_death:
                                first.value(),
                            })
                        else:
                            # birth date only
                            if self.precise_date(dob): continue
                            if self.same_year(first.year, dod):
                                continue  # b&d too close
                            facts = store.frame({
                                self.date_of_birth:
                                first.value(),
                            })
                else:
                    first = self.date_from_match(13, m)
                    second = self.date_from_match(19, m)
                    if min(first.year, second.year) < 1753:
                        continue  # possibly Julian
                    facts = store.frame({
                        self.date_of_birth: first.value(),
                        self.date_of_death: second.value()
                    })
                records += 1
                provenance = store.frame({
                    self.url:
                    parsed_record['url'],
                    self.method:
                    "English Wikipedia dates for '" + str(item.name) + "'"
                })
                fact = store.frame({
                    self.item: item,
                    self.facts: facts,
                    self.provenance: provenance
                })
                record_file.write(item.id, fact.data(binary=True))
        record_file.close()
        print records, "birth/death date records written to file:", self.out_file
Example #15
  def compare(arg):
    base_reader = sling.RecordReader(arg.base)
    expt_reader = sling.RecordReader(arg.expt)

    commons = sling.Store()
    commons.load(arg.commons)
    schema = sling.DocumentSchema(commons)
    commons.freeze()
    
    store = sling.Store(commons)
    index = -1
    for (_, base_val), (_, expt_val) in zip(base_reader, expt_reader):
      index += 1
      base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
      expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)

      checker = Checker(index, base_doc, expt_doc, arg.diff)

      # Basic checks.
      base = base_doc.frame["trace"]
      expt = expt_doc.frame["trace"]
      if base is None and expt is not None:
        checker.error('No trace in base document at index %d' % index)
      elif base is not None and expt is None:
        checker.error('No trace in expt document at index %d' % index)
      if base is None:
        continue

      # Traces should be over the same token range.
      checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
      checker.check_eq(base["end"], expt["end"], "Trace End")

      # Check LSTM features.
      base_lstm = base["/trace/lstm_features"]
      expt_lstm = expt["/trace/lstm_features"]
      checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
      for i in range(len(base_lstm)):
        checker.frame_eq(base_lstm[i], expt_lstm[i], \
          "LSTM features for token %d (%s)" % (i, base_doc.tokens[i].word))

      # Check steps.
      base_steps = base["/trace/steps"]
      expt_steps = expt["/trace/steps"]
      min_steps = min(len(base_steps), len(expt_steps))
      for i in range(min_steps):
        message = "Step %d's current token index" % i
        checker.check_eq(base_steps[i]["/trace/current"], \
          expt_steps[i]["/trace/current"], message)

        # Check FF features for the step.
        base_ff = base_steps[i]["/trace/ff_features"]
        expt_ff = expt_steps[i]["/trace/ff_features"]
        checker.check_eq(len(base_ff), len(expt_ff), \
          "# of FF features for step %d" % i)

        base_dict = {f["/trace/feature"] : f["/trace/values"] for f in base_ff}
        expt_dict = {f["/trace/feature"] : f["/trace/values"] for f in expt_ff}
        for k, v in base_dict.items():
          checker.check_eq(k in expt_dict, True, \
            "Step %d: FF feature %s not in expt" % (i, k))
          checker.check_eq(v, expt_dict[k], \
            "Step %d: FF feature %s has a different value in expt" % (i, k))
        for k, v in expt_dict.items():
          checker.check_eq(k in base_dict, True, \
            "Step %d: FF feature %s not in base" % (i, k))

        # Check action(s) in the step.
        base_actions = base_steps[i]["/trace/actions"]
        expt_actions = expt_steps[i]["/trace/actions"]
        for idx in range(min(len(base_actions), len(expt_actions))):
          checker.frame_eq(base_actions[idx]["/trace/predicted"], \
            expt_actions[idx]["/trace/predicted"],
            "Step %d, predicted action %d" % (i, idx),
            ["/trace/_str"])
          checker.frame_eq(base_actions[idx]["/trace/final"], \
            expt_actions[idx]["/trace/final"],
            "Step %d, final action %d" % (i, idx),
            ["/trace/_str"])

        # There should be the same number of actions in the step.
        checker.check_eq(len(base_actions), len(expt_actions), \
          "Step %d: # of actions" % i)

      # There should be the same number of steps.
      checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

    base_reader.close()
    expt_reader.close()
Example #16
  def handle_category(self, qid, form):
    def is_on(name):
      return form.getvalue("main_form_" + name) == "on"

    # Various options.
    show_span_qid = is_on("show_span_qid")
    show_prelim_parse_scores = is_on("show_prelim_parse_scores")
    show_span_scores = is_on("show_span_scores")
    show_fact_matches = is_on("show_fact_matching_statistics")
    show_span_fact_matches = is_on("show_span_fact_match_stats")
    show_similar_categories = is_on("show_similar_categories")
    signature_type = form.getvalue("main_form_signature_type")
    metric = form.getvalue("main_form_sort_metric")
    fact_weights = self.fact_match_weights(form)

    frame = browser_globals.category_frame[qid]
    document = sling.Document(frame=frame.document)

    num = len([p for p in frame("parse")])
    self._tag("div", "<b>%s = %s</b>: %d members, %d parses" % \
              (qid, frame.name, len(frame.members), num))
    self._br()

    # Write the parses in a tabular format.
    show_prelim_parse_scores &= metric != "prelim_parse_score"
    self.write_main_table_header(
      "Signature",
      [t.word for t in document.tokens],
      "Metric",
      "Prelim Scores" if show_prelim_parse_scores else None,
      [t.name for t in FactMatchType] if show_fact_matches else None,
      "Matching Categories" if show_similar_categories else None)

    # Each parse is written as one row.
    parses = [(parse, self.parse_score(frame, parse, metric, fact_weights)) \
      for parse in frame("parse")]
    parses.sort(key=lambda x: -x[1])
    for parse, metric_value in parses:
      signature = util.parse_signature(parse, signature_type)

      self._begin("tr")
      self._begin("td")
      self._form_anchor(signature, signature)
      self._end("td")
      self._separator(header=False)
      prev_span_end = -1
      for span in parse.spans:
        for index in xrange(prev_span_end + 1, span.begin):
          self._empty_cell()

        self._begin("td", colspan=span.end-span.begin, align='middle')
        text = util.span_signature(span, signature_type)
        if show_span_qid:
          text += " (" + str(span.qid) + ")"
        title = '.'.join([str(p) for p in span.pids]) + ' = ' + str(span.qid)
        if "name" in span.qid:
          title += " (" + span.qid[name] + ")"
        self._tag("span", text, title=title)

        if show_span_scores and "prior" in span:
          self._br()
          self._text("%s = %0.4f" % ("prior", span.prior))

        if show_span_fact_matches:
          local_counts = util.fact_matches_for_span(span)
          self._br()
          self._begin("table class='span_fact_match'")
          self._begin("thead")
          for t in FactMatchType:
            self._tag("th", t.name)
          self._end("thead")
          self._begin("tr")
          self.write_fact_match_counts(local_counts)
          self._end(["tr", "table"])

        self._end("td")
        prev_span_end = span.end - 1

      for index in xrange(prev_span_end + 1, len(document.tokens)):
        self._empty_cell()

      self._separator(header=False)
      if type(metric_value) is int:
        self._cell(metric_value)
      else:
        self._cell("%.4f" % metric_value)

      if show_prelim_parse_scores:
        self._separator(header=False)
        self._begin("td class='numeric'")
        for score_type in ["prior", "member_score", "cover"]:
          if score_type in parse:
            self._text("%s = %0.4f" % (score_type, parse[score_type]))
            self._br()
        if "score" in parse:
          self._color_text("Overall = %0.4f" % parse.score, "blue")
        self._end("td")

      if show_fact_matches:
        self._separator(header=False)
        total_fact_counts = util.fact_matches_for_parse(parse)
        self.write_fact_match_counts(total_fact_counts)

      if show_similar_categories:
        self._separator(header=False)
        self._begin("td")
        limit = 5
        signature_mapping = browser_globals.full_signature_to_parse
        if signature_type == "coarse":
          signature_mapping = browser_globals.coarse_signature_to_parse
        seen = set()
        for (other_qid, other_category, other_parse) in \
          signature_mapping[signature]:
          if len(seen) >= limit:
            break
          if other_qid != qid and other_qid not in seen:
            seen.add(other_qid)
            self._text(other_category.name)
            self._form_anchor(" (= %s)" % other_qid, other_qid)
            self._text(" (%0.4f)" % other_parse.score)
            self._br()
        self._end("td")
      self._end("tr")
    self._end("table")
Example #17
 def next(self):
     _, data = self.input.next()
     f = sling.Store(self.commons).parse(data)
     return sling.Document(f, schema=self.docschema)
Example #18
 def __getitem__(self, key):
     data = self.input.lookup(key)
     f = sling.Store(self.commons).parse(data)
     return sling.Document(f, schema=self.docschema)
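Both accessors assume a frozen commons store with a document schema plus an input record source; __getitem__ additionally needs keyed lookup, which sling.RecordDatabase provides (see Example #21). A minimal sketch of that setup (the record path and key are hypothetical):

import sling

commons = sling.Store()
docschema = sling.DocumentSchema(commons)
commons.freeze()

corpus = sling.RecordDatabase("/tmp/documents.rec")  # hypothetical path
raw = corpus.lookup("Q42")                           # raw frame data for one record
doc = sling.Document(sling.Store(commons).parse(raw), schema=docschema)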
Example #19
 def __init__(self, commons_store: sling.Store, schema: sling.DocumentSchema, doc_name: str):
     self.store = sling.Store(commons_store)
     self.schema = schema
     self.doc_name = doc_name
     self.doc = sling.Document(None, self.store, self.schema)
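A usage sketch for this constructor, assuming the enclosing class is named DocumentBuilder (the real class name is not shown in the snippet):

import sling

commons = sling.Store()
schema = sling.DocumentSchema(commons)
commons.freeze()

builder = DocumentBuilder(commons, schema, "doc-001")
# builder.doc is an empty sling.Document backed by a fresh local store.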
Example #20
    def link_documents(self,
                       max_n=None,
                       fact_out_file="/tmp/facts.json",
                       qry_out_file="/tmp/queries.json",
                       para_out_file="/tmp/paragraphs.json",
                       filter_subjects=None,
                       exclude_subjects=None):
        """Load n documents and link them to facts."""
        start = time.time()
        fout = open(fact_out_file, "w")
        fq_out, fp_out = open(qry_out_file, "w"), open(para_out_file, "w")
        seen_articles = set()
        total_paras = 0
        for n, (doc_id, doc_raw) in enumerate(self.corpus.input):
            doc_id = str(doc_id, "utf-8")
            if n % 1000 == 0:
                print("processed", n, "items in %.1f" % (time.time() - start),
                      "sec")
            if max_n is not None and random.uniform(
                    0, 1) > (float(max_n) / 550000):
                continue
            if filter_subjects is not None and doc_id not in filter_subjects:
                continue
            if exclude_subjects is not None and doc_id in exclude_subjects:
                continue
            # get kb items
            seen_articles.add(doc_id)
            kb_item = self.kb[doc_id]
            tail_entities = {}
            all_properties = []
            for prop, tail in kb_item:
                tup = self.get_canonical_property(prop, tail)
                if tup is None:
                    tup = self.get_date_property(prop, tail)
                    if tup is None:
                        continue
                tail_entities[tup[1]] = tup[0]
                all_properties.append(tup[0])
            store = sling.Store(self.commons)
            document = sling.Document(store.parse(doc_raw), store,
                                      self.docschema)
            if not document.tokens:
                print("Skipping %s No tokens." % (doc_id))
                continue
            # build token maps
            tok_to_sent_id, tok_to_para_id, sent_to_span, para_to_span = {}, {}, {}, {}
            tok_to_char_offset = {}
            offset = 0
            sent_begin = para_begin = 0
            sent_id = para_id = 0
            for ii, token in enumerate(document.tokens):
                if ii > 0 and token.brk == 4:
                    para_to_span[para_id] = (para_begin, ii)
                    sent_to_span[sent_id] = (sent_begin, ii)
                    para_id += 1
                    sent_id += 1
                    sent_begin = para_begin = ii
                elif ii > 0 and token.brk == 3:
                    sent_to_span[sent_id] = (sent_begin, ii)
                    sent_id += 1
                    sent_begin = ii
                tok_to_sent_id[ii] = sent_id
                tok_to_para_id[ii] = para_id
                tok_to_char_offset[ii] = offset
                offset += len(token.word) + 1
            para_to_span[para_id] = (para_begin, len(document.tokens))
            sent_to_span[sent_id] = (sent_begin, len(document.tokens))
            # find subjects
            sent_to_subj, para_to_subj = (collections.defaultdict(list),
                                          collections.defaultdict(list))
            para_to_ment = collections.defaultdict(list)
            ment_to_para_index = {}
            mentid_to_linked_entity = {}
            sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
            for ii, mention in enumerate(sorted_mentions):
                if tok_to_sent_id[mention.begin] != tok_to_sent_id[mention.end
                                                                   - 1]:
                    continue
                linked_entity = self.get_linked_entity(mention)
                mentid_to_linked_entity[ii] = linked_entity
                para_id = tok_to_para_id[mention.begin]
                para_to_ment[para_id].append((mention, linked_entity))
                ment_to_para_index[ii] = len(para_to_ment[para_id]) - 1
                if linked_entity == doc_id:
                    sent_id = tok_to_sent_id[mention.begin]
                    sent_to_subj[sent_id].append(ii)
                    para_to_subj[para_id].append(ii)

            # save paragraphs
            local_to_global_para = {}
            for para_id, para_span in para_to_span.items():
                if para_span[1] - para_span[0] < MIN_LEN:
                    continue
                if len(para_to_ment[para_id]) <= 1:
                    continue
                local_to_global_para[para_id] = total_paras
                fp_out.write(
                    self.serialize_para(document, tok_to_char_offset,
                                        para_span, para_to_ment[para_id],
                                        doc_id, total_paras) + "\n")
                total_paras += 1

            # find tails
            seen_properties = {}
            for ii, mention in enumerate(sorted_mentions):
                # first look for sentence matches
                linked_entity = mentid_to_linked_entity[ii]
                if linked_entity == doc_id:
                    continue
                if linked_entity in tail_entities:
                    if tail_entities[linked_entity] in seen_properties:
                        continue
                    my_sent = tok_to_sent_id[mention.begin]
                    my_para = tok_to_para_id[mention.begin]
                    para_span = para_to_span[my_para]
                    if my_para not in local_to_global_para:
                        continue
                    if my_sent in sent_to_subj:
                        # sent_span = sent_to_span[my_sent]
                        fq_out.write(
                            self.serialize_query(local_to_global_para[my_para],
                                                 ment_to_para_index, doc_id,
                                                 sent_to_subj[my_sent],
                                                 tail_entities[linked_entity],
                                                 linked_entity, ii, "sentence")
                            + "\n")
                        subj_mentions = [
                            para_to_ment[my_para][ment_to_para_index[mm]][0]
                            for mm in sent_to_subj[my_sent]
                        ]
                        fout.write(
                            self.serialize_fact(
                                document, tok_to_char_offset, para_span,
                                doc_id, subj_mentions,
                                tail_entities[linked_entity], linked_entity,
                                mention, "sentence") + "\n")
                        seen_properties[tail_entities[linked_entity]] = my_para
                        self.relation_stats["sentences"][
                            tail_entities[linked_entity]] += 1

            for ii, mention in enumerate(sorted_mentions):
                # next look for paragraph matches
                linked_entity = mentid_to_linked_entity[ii]
                if linked_entity == doc_id:
                    continue
                if linked_entity in tail_entities:
                    if tail_entities[linked_entity] in seen_properties:
                        continue
                    my_para = tok_to_para_id[mention.begin]
                    para_span = para_to_span[my_para]
                    if my_para not in local_to_global_para:
                        continue
                    if my_para in para_to_subj:
                        fq_out.write(
                            self.serialize_query(local_to_global_para[my_para],
                                                 ment_to_para_index, doc_id,
                                                 para_to_subj[my_para],
                                                 tail_entities[linked_entity],
                                                 linked_entity, ii,
                                                 "paragraph") + "\n")
                        subj_mentions = [
                            para_to_ment[my_para][ment_to_para_index[mm]][0]
                            for mm in para_to_subj[my_para]
                        ]
                        fout.write(
                            self.serialize_fact(
                                document, tok_to_char_offset, para_span,
                                doc_id, subj_mentions,
                                tail_entities[linked_entity], linked_entity,
                                mention, "paragraph") + "\n")
                        seen_properties[tail_entities[linked_entity]] = my_para
                        self.relation_stats["paragraphs"][
                            tail_entities[linked_entity]] += 1

            # add negatives
            max_neg = len(seen_properties)
            num_neg = 0
            all_para_id = list(para_to_subj.keys())
            if not all_para_id:
                continue
            for tail, prop in tail_entities.items():
                if num_neg == max_neg:
                    break
                if prop in seen_properties:
                    continue
                random_para_id = random.choice(all_para_id)
                random_para_span = para_to_span[random_para_id]
                subj_mentions = [
                    para_to_ment[random_para_id][ment_to_para_index[mm]][0]
                    for mm in para_to_subj[random_para_id]
                ]
                fout.write(
                    self.serialize_fact(
                        document, tok_to_char_offset, random_para_span, doc_id,
                        subj_mentions, prop, None, None, "entity negative") +
                    "\n")
                num_neg += 1
                seen_properties[prop] = None
                self.relation_stats["entity negatives"][prop] += 1

        fout.close()
        fq_out.close()
        fp_out.close()
        print("Sentences -- ", sum(self.relation_stats["sentences"].values()))
        print(" :: ".join(
            "%s:%d" % (k, v)
            for k, v in self.relation_stats["sentences"].items()))
        print("Paragraphs -- ",
              sum(self.relation_stats["paragraphs"].values()))
        print(" :: ".join(
            "%s:%d" % (k, v)
            for k, v in self.relation_stats["paragraphs"].items()))
Example #21
    'Q57652',  # Helle Thorning-Schmidt
    'Q1636974',  # Danske Bank
    'Q186285',  # University of Copenhagen
    'Q1687170',  # Jens Christian Skou
]

articles = sling.RecordDatabase("data/e/wiki/en/documents@10.rec")
output = sling.RecordWriter("/tmp/chunked.rec")

for docid in documentids:
    # Read document from article database.
    store = sling.Store(commons)
    if docid.startswith("Q"):
        record = articles.lookup(docid)
        article = store.parse(record)
        document = sling.Document(article, schema=docschema)
        document.remove_annotations()
        document.update()
    else:
        document = sling.tokenize(docid, store=store, schema=docschema)

    print document.frame["title"]

    begin = 0
    while begin < len(document.tokens):
        # Find next sentence.
        end = begin + 1
        while end < len(document.tokens) and \
              document.tokens[end].brk < sling.SENTENCE_BREAK:
            end += 1
        print "s:", document.phrase(begin, end)