Пример #1
0
    def silver_annotation(self, indocs=None, outdocs=None, language=None):
        """Build the silver annotation stage of the workflow.

        Reads documents from `indocs`, runs them through a "labeler" task
        configured with the mentions, anaphora, phrase-structure and
        relations annotators, and writes the labeled documents to `outdocs`.

        Args:
          indocs: input documents; defaults to
            self.wiki.wikipedia_documents(language).
          outdocs: output documents; defaults to
            self.silver_documents(language).
          language: language code; defaults to flags.arg.language.

        Returns:
          The result of self.wf.write() for the labeled output channel.
        """
        # Resolve the language first so that the document defaults below are
        # computed for the effective language instead of for None.
        if language is None:
            language = flags.arg.language
        if indocs is None:
            indocs = self.wiki.wikipedia_documents(language)
        if outdocs is None:
            outdocs = self.silver_documents(language)
        phrases = corpora.repository("data/wiki/" + language) + "/phrases.txt"

        with self.wf.namespace(language + "-silver"):
            mapper = self.wf.task("document-processor", "labeler")

            # Annotators are added in pipeline order.
            for annotator in ("mentions", "anaphora", "phrase-structure",
                              "relations"):
                mapper.add_annotator(annotator)

            mapper.add_param("resolve", True)
            mapper.add_param("language", language)
            mapper.attach_input("commons", self.wiki.knowledge_base())
            mapper.attach_input("aliases", self.wiki.phrase_table(language))
            mapper.attach_input("dictionary", self.idftable(language))
            # The phrase list is optional; attach it only if the file exists.
            if os.path.isfile(phrases):
                mapper.attach_input("phrases",
                                    self.wf.resource(phrases, format="lex"))

            self.wf.connect(self.wf.read(indocs), mapper)
            output = self.wf.channel(mapper, format="message/document")
            return self.wf.write(output, outdocs)
Пример #2
0
 def template_defs(self, language=None):
     """Resource for template definitions.

     The "templates.sling" frame store is looked up in the repository
     directory "data/wiki/<language>". If no language is given,
     flags.arg.language is used.
     """
     if language is None:
         language = flags.arg.language
     template_dir = corpora.repository("data/wiki/" + language)
     return self.wf.resource("templates.sling",
                             dir=template_dir,
                             format="store/frame")
Пример #3
0
 def alias_corrections(self):
     """Return the "aliases.sling" frame store resource (alias corrections)
     located in the "data/wiki" repository directory."""
     wiki_dir = corpora.repository("data/wiki")
     return self.wf.resource("aliases.sling", dir=wiki_dir,
                             format="store/frame")
Пример #4
0
 def wikipedia_defs(self):
     """Return the "wikipedia.sling" frame store resource (Wikipedia schema
     definitions) located in the "data/wiki" repository directory."""
     wiki_dir = corpora.repository("data/wiki")
     return self.wf.resource("wikipedia.sling", dir=wiki_dir,
                             format="store/frame")
Пример #5
0
 def unit_defs(self):
     """Return the "units.sling" frame store resource (unit definitions)
     located in the "data/wiki" repository directory."""
     wiki_dir = corpora.repository("data/wiki")
     return self.wf.resource("units.sling", dir=wiki_dir,
                             format="store/frame")
Пример #6
0
 def country_defs(self):
     """Return the "countries.sling" frame store resource (country
     definitions) located in the "data/wiki" repository directory."""
     wiki_dir = corpora.repository("data/wiki")
     return self.wf.resource("countries.sling", dir=wiki_dir,
                             format="store/frame")
Пример #7
0
 def language_defs(self):
     """Return the "languages.sling" frame store resource.

     Per the original description, this store defines the /lang/<lang>
     symbols and has meta information for each language.
     """
     wiki_dir = corpora.repository("data/wiki")
     return self.wf.resource("languages.sling", dir=wiki_dir,
                             format="store/frame")
Пример #8
0
  def silver_annotation(self, docs=None, language=None):
    """Build the silver annotation workflow and split the corpus.

    Documents are read from `docs` and sent to a "corpus-split" task named
    "labeler" that is configured with the silver annotators. The task's
    "train" channel is shuffled and written to the training documents; its
    "eval" channel is written directly to the evaluation documents.

    Args:
      docs: input documents; defaults to
        self.data.wikipedia_documents(language).
      language: language code; defaults to flags.arg.language.

    Returns:
      Tuple (train_docs, eval_docs) with the output document resources.
    """
    if language is None:
      language = flags.arg.language
    if docs is None:
      docs = self.data.wikipedia_documents(language)
    train_docs = self.training_documents(language)
    eval_docs = self.evaluation_documents(language)

    # Read the corpus size flag once; it drives both the split ratio and the
    # per-input reader limit below.
    corpus_size = flags.arg.silver_corpus_size
    split_ratio = int(corpus_size / 100) if corpus_size else 5000

    with self.wf.namespace(language + "-silver"):
      # Map document through silver annotation pipeline and split corpus.
      mapper = self.wf.task("corpus-split", "labeler")

      mapper.add_annotator("mentions")
      mapper.add_annotator("anaphora")
      #mapper.add_annotator("phrase-structure")
      mapper.add_annotator("relations")
      mapper.add_annotator("types")
      mapper.add_annotator("clear-references")

      mapper.add_param("resolve", True)
      mapper.add_param("language", language)
      mapper.add_param("initial_reference", False)
      mapper.add_param("definite_reference", False)
      mapper.add_param("split_ratio", split_ratio)

      mapper.attach_input("commons", self.data.knowledge_base())
      mapper.attach_input("aliases", self.data.phrase_table(language))
      mapper.attach_input("dictionary", self.idftable(language))

      # An optional per-language silver.sling file is attached as an
      # additional "commons" input when it exists.
      config = corpora.repository("data/wiki/" + language + "/silver.sling")
      if os.path.isfile(config):
        mapper.attach_input("commons", self.wf.resource(config,
                                                        format="store/frame"))

      # When a corpus size is requested, limit each reader to its share of
      # the total, based on length_of(docs).
      reader_params = None
      if corpus_size:
        reader_params = {
          "limit": int(corpus_size / length_of(docs))
        }

      self.wf.connect(self.wf.read(docs, params=reader_params),
                      mapper, name="docs")

      train_channel = self.wf.channel(mapper, name="train",
                                      format="message/document")
      eval_channel = self.wf.channel(mapper, name="eval",
                                     format="message/document")

      # Write shuffled training documents.
      train_shards = length_of(train_docs)
      train_shuffled = self.wf.shuffle(train_channel,
                                       shards=train_shards,
                                       bufsize=256 * 1024 * 1024)
      self.wf.write(train_shuffled, train_docs, name="train")

      # Write evaluation documents.
      self.wf.write(eval_channel, eval_docs, name="eval")

    return train_docs, eval_docs
Пример #9
0
 def custom_properties(self):
     """Return the "custom-properties.sling" frame store resource (custom
     SLING knowledge base properties) from "data/nlp/schemas"."""
     schema_dir = corpora.repository("data/nlp/schemas")
     return self.wf.resource("custom-properties.sling", dir=schema_dir,
                             format="store/frame")
Пример #10
0
 def dataset(self, path):
     """Return a file resource for a dataset path.

     A path starting with "repo/" is resolved through corpora.repository()
     with that prefix dropped; any other path is resolved relative to
     flags.arg.workdir.
     """
     if not path.startswith("repo/"):
         return self.wf.resource(path, dir=flags.arg.workdir, format="file")

     # Drop the 5-character "repo/" prefix before the repository lookup.
     repo_path = corpora.repository(path[5:])
     return self.wf.resource(repo_path, format="file")
Пример #11
0
 def recon_config(self):
   """Return the "recon.sling" frame store resource (reconciler
   configuration) located in the "data/wiki" repository directory."""
   wiki_dir = corpora.repository("data/wiki")
   return self.wf.resource("recon.sling", dir=wiki_dir,
                           format="store/frame")
Пример #12
0
 def xref_config(self):
   """Return the "xrefs.sling" frame store resource (cross-references
   configuration) located in the "data/wiki" repository directory."""
   wiki_dir = corpora.repository("data/wiki")
   return self.wf.resource("xrefs.sling", dir=wiki_dir,
                           format="store/frame")
Пример #13
0
 def document_schema_defs(self):
     """Return the "document-schema.sling" frame store resource (document
     schema definitions) from "data/nlp/schemas"."""
     schema_dir = corpora.repository("data/nlp/schemas")
     return self.wf.resource("document-schema.sling", dir=schema_dir,
                             format="store/frame")
Пример #14
0
 def catalog_defs(self):
     """Return the "catalog.sling" frame store resource (global catalog
     definitions) from "data/nlp/schemas"."""
     schema_dir = corpora.repository("data/nlp/schemas")
     return self.wf.resource("catalog.sling", dir=schema_dir,
                             format="store/frame")
Пример #15
0
 def xref_properties(self):
   """Return the "xrefs.sling" frame store resource (properties tracked
   for cross-references) located in the "data/wiki" repository directory."""
   wiki_dir = corpora.repository("data/wiki")
   return self.wf.resource("xrefs.sling", dir=wiki_dir,
                           format="store/frame")