def knowledge_base(self):
    """Return the resource for the knowledge base.

    The knowledge base is a SLING frame store holding a frame for every
    Wikidata item and property, together with additional schema
    information.
    """
    wikidir = corpora.wikidir()
    return self.wf.resource("kb.sling", dir=wikidir, format="store/frame")
def wikipedia_items(self):
    """Return the resource for item data derived from Wikipedia.

    The record file merges the item categories from all Wikipedias.
    """
    # NOTE(review): another `wikipedia_items` definition returning the same
    # resource appears later in this source; if both belong to the same
    # class the later one overrides this one -- confirm and keep only one.
    wikidir = corpora.wikidir()
    return self.wf.resource(
        "wikipedia-items.rec",
        dir=wikidir,
        format="records/frame",
    )
def fused_items(self):
    """Return the resource for merged (fused) items.

    The resource is a set of record files in which every item is stored
    as a frame.
    """
    location = corpora.wikidir()
    return self.wf.resource(
        "*****@*****.**",
        dir=location,
        format="records/frame",
    )
def kb_input(self, task, kb_dir=None):
    """Attach the knowledge base store to a task as its "kb" input.

    Args:
      task: task receiving the input through `attach_input`.
      kb_dir: directory that holds "kb.sling". When None, the directory
        returned by `corpora.wikidir()` is used instead.
    """
    directory = corpora.wikidir() if kb_dir is None else kb_dir
    store = self.wf.resource(
        file="kb.sling",
        dir=directory,
        format="store/frame",
    )
    task.attach_input("kb", store)
def wikidata_items(self):
    """Return the resource for Wikidata items.

    The resource is a set of record files in which every Wikidata item is
    stored as a frame:

      <qid>: {
        =<qid>
        :/w/item
        name: "..."
        description: "..."
        alias: {
          name: "..."
          lang: /lang/<lang>
          sources: ...
        }
        ...
        /w/wikipedia: {
          /lang/<lang>: <wid>
          ...
        }
        ... properties
      }

      <qid>: Wikidata item id (Q<item number>, e.g. Q35)
      <pid>: Wikidata property id (P<property number>, e.g. P31)
      <wid>: Wikipedia page id (/wp/<lang>/<pageid>, /wp/en/76972)
    """
    wikidir = corpora.wikidir()
    return self.wf.resource(
        "*****@*****.**",
        dir=wikidir,
        format="records/frame",
    )
def generate_parses(self, language, min_members):
    """Set up the task that generates category parses.

    A "category-parse-generator" task is created inside the
    "generate-parses" namespace. It is wired to the knowledge base, the
    item records and the phrase table of `language` as inputs, and to the
    generated parses plus a file of rejected categories as outputs.

    Args:
      language: passed to the task as its "language" parameter and used as
        the sub-directory (under the wiki directory) of the phrase table.
      min_members: passed to the task as its "min_members" parameter.

    Returns:
      The resource obtained from `self.generated_parses_resource()`, which
      is attached as the task's "output".
    """
    with self.wf.namespace("generate-parses"):
        task = self.wf.task("category-parse-generator")
        task.add_params(dict(language=language, min_members=min_members))

        # Inputs: knowledge base and item records share the wiki directory.
        base_dir = corpora.wikidir()
        self.kb_input(task, kb_dir=base_dir)
        item_records = self.wf.resource(
            file="*****@*****.**",
            dir=base_dir,
            format="records/frame",
        )
        task.attach_input("items", item_records)

        # Input: the phrase table lives in the per-language sub-directory.
        language_dir = base_dir + "/" + language
        phrases = self.wf.resource(
            "phrase-table.repo",
            dir=language_dir,
            format="text/frame",
        )
        task.attach_input("phrase-table", phrases)

        # Outputs: generated parses and the categories that were rejected.
        parses = self.generated_parses_resource()
        task.attach_output("output", parses)
        rejects = self.wf.resource(
            "rejected-categories.rec",
            dir=self.outdir,
            format="records/text",
        )
        task.attach_output("rejected", rejects)
    return parses
def name_table(self, language=None):
    """Return the resource for the item name table.

    The name table is a repository with all the names and the items they
    are aliases for.

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "name-table.repo",
        dir=corpora.wikidir(language),
        format="repository",
    )
def phrase_table(self, language=None):
    """Return the resource for the item name phrase table.

    The phrase table is a repository with phrase fingerprints of the item
    names.

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "phrase-table.repo",
        dir=corpora.wikidir(language),
        format="repository",
    )
def wikipedia_category_documents(self, language=None):
    """Return the resource for parsed Wikipedia category documents.

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "*****@*****.**",
        dir=corpora.wikidir(language),
        format="records/document",
    )
def wikipedia_documents(self, language=None):
    """Return the resource for parsed Wikipedia documents.

    This is a set of record files with one record per article, where the
    text has been extracted from the wiki markup and tokenized. The
    documents also contain additional structured information (e.g.
    categories) and mentions for links to other Wikipedia pages:

      <wid>: {
        =<wid>
        :/wp/page
        /wp/page/pageid: ...
        /wp/page/title: "..."
        lang: /lang/<lang>
        /wp/page/text: "<Wikipedia page in wiki markup format>"
        /wp/page/qid: <qid>
        :document
        url: "http://<lang>.wikipedia.org/wiki/<name>"
        title: "..."
        text: "<clear text extracted from wiki markup>"
        tokens: [...]
        mention: {
          :/wp/link
          begin: ...
          length: ...
          evokes: <qid>
        }
        ...
        /wp/page/category: <qid>
        ...
      }

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "*****@*****.**",
        dir=corpora.wikidir(language),
        format="records/document",
    )
def wikipedia_categories(self, language=None):
    """Return the resource for Wikipedia categories.

    This is a set of record files in "records/frame" format.

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    # NOTE(review): the earlier docstring said each Wikipedia *article* is
    # encoded as a SLING *document*, which looks copied from
    # `wikipedia_articles`. Presumably each record is a category page
    # stored as a frame -- confirm against the producer of these files.
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "*****@*****.**",
        dir=corpora.wikidir(language),
        format="records/frame",
    )
def vocabulary(self, language=None):
    """Return the resource for the word embedding vocabulary.

    The vocabulary is a text map with (normalized) words and counts.

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "word-vocabulary.map",
        dir=corpora.wikidir(language),
        format="textmap/word",
    )
def wikidata_redirects(self):
    """Return the resource for Wikidata redirects.

    The resource is a set of record files in which every Wikidata redirect
    is stored as a frame:

      <qid>: {
        =<qid>
        +<redirect>
      }
    """
    wikidir = corpora.wikidir()
    return self.wf.resource(
        "wikidata-redirects.rec",
        dir=wikidir,
        format="records/frame",
    )
def wikipedia_mapping(self, language=None):
    """Return the resource for the Wikipedia to Wikidata mapping.

    This is a SLING frame store with one frame per Wikipedia article with
    information for mapping it to Wikidata:

      {
        =<wid>
        /w/item/qid: <qid>
        /w/item/kind: /w/item/kind/...
      }

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "mapping.sling",
        dir=corpora.wikidir(language),
        format="store/frame",
    )
def wikipedia_redirects(self, language=None):
    """Return the resource for Wikipedia redirects.

    This is encoded as a SLING frame store where each redirect is a SLING
    frame:

      {
        =<wid for redirect page>
        :/wp/redirect
        /wp/redirect/pageid: ...
        /wp/redirect/title: "..."
        /wp/redirect/link: <wid for target page>
      }

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "redirects.sling",
        dir=corpora.wikidir(language),
        format="store/frame",
    )
def wikipedia_articles(self, language=None):
    """Return the resource for Wikipedia articles.

    This is a set of record files where each Wikipedia article is encoded
    as a SLING document:

      <wikipedia article title>: {
        :/wp/page
        /wp/page/pageid: ...
        /wp/page/title: "..."
        lang: /lang/<lang>
        /wp/page/text: "<Wikipedia page in Wiki markup format>"
      }

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "*****@*****.**",
        dir=corpora.wikidir(language),
        format="records/frame",
    )
def wikidata_properties(self):
    """Return the resource for Wikidata properties.

    The resource is a record file in which every Wikidata property is
    stored as a frame:

      <pid>: {
        =<pid>
        :/w/property
        name: "..."
        description: "..."
        /w/datatype: ...
        ... properties ...
      }
    """
    location = corpora.wikidir()
    return self.wf.resource(
        "properties.rec",
        dir=location,
        format="records/frame",
    )
def wikipedia_aliases(self, language=None):
    """Return the resource for Wikipedia aliases.

    The aliases are extracted from the Wikipedia pages from anchors,
    redirects, disambiguation pages etc. This is a set of record files
    with a SLING frame record for each item:

      <qid>: {
        alias: {+"<alias>"@/lang/xx
          sources: ...
          count: ...
        }
        ...
      }

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "*****@*****.**",
        dir=corpora.wikidir(language),
        format="records/alias",
    )
def item_names(self, language=None):
    """Return the resource for item names in a language.

    This is a set of record files with one SLING frame per item:

      <qid>: {
        alias: {
          name: "<alias>"
          lang: /lang/<lang>
          sources: ...
          count: ...
          form: ...
        }
        ...
      }

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "*****@*****.**",
        dir=corpora.wikidir(language),
        format="records/alias",
    )
def word_embeddings(self, language=None):
    """Return the resource for word embeddings in word2vec embedding format.

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "word-embeddings.vec",
        dir=corpora.wikidir(language),
        format="embeddings",
    )
def xrefs(self):
    """Return the resource for the store of cross-reference items."""
    wikidir = corpora.wikidir()
    return self.wf.resource("xrefs.sling", dir=wikidir, format="store/frame")
def wikidata_latest(self):
    """Return the resource for the latest Wikidata update.

    The text file contains the QID and revision of the latest update.
    """
    wikidir = corpora.wikidir()
    return self.wf.resource("latest", dir=wikidir, format="text")
def wikipedia_items(self):
    """Return the resource for item data from Wikipedias."""
    # NOTE(review): an earlier `wikipedia_items` definition returning the
    # same resource appears in this source; if both belong to the same
    # class this later one is the one in effect -- confirm and keep one.
    location = corpora.wikidir()
    return self.wf.resource(
        "wikipedia-items.rec",
        dir=location,
        format="records/frame",
    )
def fanin(self):
    """Return the resource for item fan-in.

    Fan-in is the number of times an item is a target in a relation.
    """
    # NOTE(review): another `fanin` definition returning the same resource
    # appears later in this source; if both belong to the same class the
    # later one overrides this one -- confirm and keep only one.
    wikidir = corpora.wikidir()
    return self.wf.resource("fanin.rec", dir=wikidir, format="records/frame")
def item_popularity(self):
    """Return the resource for item popularity."""
    wikidir = corpora.wikidir()
    return self.wf.resource(
        "item-popularity.rec",
        dir=wikidir,
        format="records/frame",
    )
def fanin(self):
    """Return the resource for link fan-in."""
    # NOTE(review): an earlier `fanin` definition returning the same
    # resource appears in this source; if both belong to the same class
    # this later one is the one in effect -- confirm and keep one.
    location = corpora.wikidir()
    return self.wf.resource("fanin.rec", dir=location, format="records/frame")
def wikilinks(self):
    """Return the resource for the link graph."""
    wikidir = corpora.wikidir()
    return self.wf.resource(
        "*****@*****.**",
        dir=wikidir,
        format="records/frame",
    )
def wikipedia_members(self):
    """Return the resource for the members of categories."""
    wikidir = corpora.wikidir()
    return self.wf.resource(
        "wikipedia-members.rec",
        dir=wikidir,
        format="records/frame",
    )
def wikipedia_summaries(self, language=None):
    """Return the resource for Wikipedia document summaries.

    Args:
      language: language used to select the wiki directory. When None,
        `flags.arg.language` is used.
    """
    # Fall back to the configured language, as every other per-language
    # resource method in this source does; previously a None language was
    # passed straight through to `corpora.wikidir`.
    # NOTE(review): confirm no caller relied on the old None pass-through.
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "summaries.rec",
        dir=corpora.wikidir(language),
        format="records/document",
    )