Пример #1
0
 def fused_items(self):
     """Resource for merged items.

     This is a set of record files where each item is represented as a frame.

     Returns:
       The result of self.wf.resource() for the fused items files in
       corpora.wikidir(), with format "records/frame".
     """
     # NOTE(review): the resource name below looks masked in this copy of the
     # file; confirm the real file name pattern before relying on it.
     new_resource = self.wf.resource
     wiki_dir = corpora.wikidir()
     return new_resource("*****@*****.**",
                         dir=wiki_dir,
                         format="records/frame")
Пример #2
0
 def name_table(self, language=None):
     """Resource for the item name table.

     This is a repository with all the names and the items they are aliases
     for.

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() for "name-table.repo" with format
       "repository".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("name-table.repo",
                             dir=corpora.wikidir(language),
                             format="repository")
Пример #3
0
    def wikidata_items(self):
        """Resource for Wikidata items.

        This is a set of record files where each Wikidata item is represented
        as a frame:
          <qid>: {
            =<qid>
            :/w/item
            name: "..."
            description: "..."
            alias: {
              name: "..."
              lang: /lang/<lang>
              sources: ...
            }
            ...
            /w/wikipedia: {
              /lang/<lang>: <wid>
              ...
            }
            ... properties
          }

          <qid>: Wikidata item id (Q<item number>, e.g. Q35)
          <pid>: Wikidata property id (P<property number>, e.g. P31)
          <wid>: Wikipedia page id (/wp/<lang>/<pageid>, /wp/en/76972)

        Returns:
          The result of self.wf.resource() for the item files in
          corpora.wikidir(), with format "records/frame".
        """
        # NOTE(review): the resource name below looks masked in this copy of
        # the file; confirm the real file name pattern before relying on it.
        new_resource = self.wf.resource
        wiki_dir = corpora.wikidir()
        return new_resource("*****@*****.**",
                            dir=wiki_dir,
                            format="records/frame")
Пример #4
0
 def wikipedia_documents(self, language=None):
     """Resource for parsed Wikipedia documents.

     This is a set of record files with one record per article, where the
     text has been extracted from the wiki markup and tokenized. The
     documents also contain additional structured information (e.g.
     categories) and mentions for links to other Wikipedia pages:
       <wid>: {
         =<wid>
         :/wp/page
         /wp/page/pageid: ...
         /wp/page/title: "..."
         lang: /lang/<lang>
         /wp/page/text: "<Wikipedia page in wiki markup format>"
         /wp/page/qid: <qid>
         :document
         url: "http://<lang>.wikipedia.org/wiki/<name>"
         title: "..."
         text: "<clear text extracted from wiki markup>"
         tokens: [...]
         mention: {
           :/wp/link
           begin: ...
           length: ...
           evokes: <qid>
         }
         ...
         /wp/page/category: <qid>
         ...
       }

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() with format "records/document".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the resource name below looks masked in this copy of the
     # file; confirm the real file name pattern before relying on it.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/document")
Пример #5
0
 def wikipedia_items(self):
     """Resource for item data from Wikipedia.

     This merges the item categories from all Wikipedias.

     Returns:
       The result of self.wf.resource() for "wikipedia-items.rec" in
       corpora.wikidir(), with format "records/frame".
     """
     make_resource = self.wf.resource
     base_dir = corpora.wikidir()
     return make_resource("wikipedia-items.rec",
                          dir=base_dir,
                          format="records/frame")
Пример #6
0
 def wikipedia_category_documents(self, language=None):
     """Resource for parsed Wikipedia category documents.

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() with format "records/document".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the resource name below looks masked in this copy of the
     # file; confirm the real file name pattern before relying on it.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/document")
Пример #7
0
 def phrase_table(self, language=None):
     """Resource for the item name phrase table.

     This is a repository with phrase fingerprints of the item names.

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() for "phrase-table.repo" with format
       "repository".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("phrase-table.repo",
                             dir=corpora.wikidir(language),
                             format="repository")
Пример #8
0
 def knowledge_base(self):
     """Resource for the knowledge base.

     This is a SLING frame store with frames for each Wikidata item and
     property plus additional schema information.

     Returns:
       The result of self.wf.resource() for "kb.sling" in corpora.wikidir(),
       with format "store/frame".
     """
     make_resource = self.wf.resource
     store_dir = corpora.wikidir()
     return make_resource("kb.sling", dir=store_dir, format="store/frame")
Пример #9
0
 def vocabulary(self, language=None):
     """Resource for the word embedding vocabulary.

     This is a text map with (normalized) words and counts.

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() for "word-vocabulary.map" with format
       "textmap/word".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("word-vocabulary.map",
                             dir=corpora.wikidir(language),
                             format="textmap/word")
Пример #10
0
 def wikipedia_categories(self, language=None):
     """Resource for Wikipedia categories.

     This is a set of record files where each Wikipedia article is encoded as
     a SLING document.

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() with format "records/frame".
     """
     # NOTE(review): the description above speaks of "articles" and matches
     # the wording used for article resources; presumably the records are
     # category pages -- confirm and reword.
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the resource name below looks masked in this copy of the
     # file; confirm the real file name pattern before relying on it.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/frame")
Пример #11
0
 def wikipedia_mapping(self, language=None):
     """Resource for the Wikipedia to Wikidata mapping.

     This is a SLING frame store with one frame per Wikipedia article with
     information for mapping it to Wikidata.
       {
         =<wid>
         /w/item/qid: <qid>
         /w/item/kind: /w/item/kind/...
       }

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() for "mapping.sling" with format
       "store/frame".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("mapping.sling",
                             dir=corpora.wikidir(language),
                             format="store/frame")
Пример #12
0
 def wikipedia_redirects(self, language=None):
     """Resource for Wikipedia redirects.

     This is encoded as a SLING frame store where each redirect is a SLING
     frame.
       {
         =<wid for redirect page>
         :/wp/redirect
         /wp/redirect/pageid: ...
         /wp/redirect/title: "..."
         /wp/redirect/link: <wid for target page>
       }

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() for "redirects.sling" with format
       "store/frame".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("redirects.sling",
                             dir=corpora.wikidir(language),
                             format="store/frame")
Пример #13
0
 def wikidata_properties(self):
     """Resource for Wikidata properties.

     This is a record file where each Wikidata property is represented as a
     frame.
       <pid>: {
         =<pid>
         :/w/property
         name: "..."
         description: "..."
         /w/datatype: ...
         ... properties ...
       }

     Returns:
       The result of self.wf.resource() for "properties.rec" in
       corpora.wikidir(), with format "records/frame".
     """
     make_resource = self.wf.resource
     wiki_dir = corpora.wikidir()
     return make_resource("properties.rec",
                          dir=wiki_dir,
                          format="records/frame")
Пример #14
0
 def wikipedia_articles(self, language=None):
     """Resource for Wikipedia articles.

     This is a set of record files where each Wikipedia article is encoded as
     a SLING document.
       <wid>: {
         =<wid>
         :/wp/page
         /wp/page/pageid: ...
         /wp/page/title: "..."
         lang: /lang/<lang>
         /wp/page/text: "<Wikipedia page in Wiki markup format>"
       }

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() with format "records/frame".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the resource name below looks masked in this copy of the
     # file; confirm the real file name pattern before relying on it.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/frame")
Пример #15
0
 def item_names(self, language=None):
     """Resource for item names in a language.

     This is a set of record files with one SLING frame per item.
       <qid>: {
         alias: {
           name: "<alias>"
           lang: /lang/<lang>
           sources: ...
           count: ...
         }
         ...
       }

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() with format "records/alias".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the resource name below looks masked in this copy of the
     # file; confirm the real file name pattern before relying on it.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/alias")
Пример #16
0
 def wikipedia_aliases(self, language=None):
     """Resource for Wikipedia aliases.

     The aliases are extracted from the Wikipedia pages from anchors,
     redirects, disambiguation pages etc. This is a set of record files with
     a SLING frame record for each item:
       <qid>: {
         alias: {
           name: "<alias>"
           lang: /lang/<lang>
           sources: ...
           count: ...
         }
         ...
       }

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() with format "records/alias".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the resource name below looks masked in this copy of the
     # file; confirm the real file name pattern before relying on it.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/alias")
Пример #17
0
 def wikipedia_members(self):
     """Resource for members of categories.

     Returns:
       The result of self.wf.resource() for "wikipedia-members.rec" in
       corpora.wikidir(), with format "records/frame".
     """
     make_resource = self.wf.resource
     members_dir = corpora.wikidir()
     return make_resource("wikipedia-members.rec",
                          dir=members_dir,
                          format="records/frame")
Пример #18
0
 def word_embeddings(self, language=None):
     """Resource for word embeddings in word2vec embedding format.

     Args:
       language: Language passed to corpora.wikidir() to select the
         directory. When None, flags.arg.language is used.

     Returns:
       The result of self.wf.resource() for "word-embeddings.vec" with format
       "embeddings".
     """
     # None is a singleton: test identity, since "==" can be overridden by a
     # custom __eq__ on the argument.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("word-embeddings.vec",
                             dir=corpora.wikidir(language),
                             format="embeddings")
Пример #19
0
 def item_popularity(self):
     """Resource for item popularity.

     Returns:
       The result of self.wf.resource() for "item-popularity.rec" in
       corpora.wikidir(), with format "records/frame".
     """
     make_resource = self.wf.resource
     popularity_dir = corpora.wikidir()
     return make_resource("item-popularity.rec",
                          dir=popularity_dir,
                          format="records/frame")