def build_knowledge_base():
  """Run the flag-selected knowledge-base construction stages in order."""
  # Each stage: (enabled flag, log message, workflow name, WikiWorkflow method).
  stages = [
      (flags.arg.merge_categories, "Merge wikipedia categories",
       "category-merging", "merge_wikipedia_categories"),
      (flags.arg.invert_categories, "Invert categories",
       "category-inversion", "invert_wikipedia_categories"),
      (flags.arg.extract_wikilinks, "Extract link graph",
       "link-graph", "extract_links"),
      (flags.arg.fuse_items, "Fuse items",
       "fuse-items", "fuse_items"),
      (flags.arg.build_kb, "Build knowledge base repository",
       "knowledge-base", "build_knowledge_base"),
  ]
  for enabled, message, name, method in stages:
    if not enabled:
      continue
    log.info(message)
    flow = wiki.WikiWorkflow(name)
    getattr(flow, method)()
    workflow.run(flow.wf)
def parse_wikipedia():
  """Convert each configured language's wikipedia pages to SLING documents."""
  if not flags.arg.parse_wikipedia:
    return
  for lang in flags.arg.languages:
    log.info("Parse " + lang + " wikipedia")
    flow = wiki.WikiWorkflow(lang + "-wikipedia-parsing")
    flow.parse_wikipedia(language=lang)
    workflow.run(flow.wf)
def sling_entity_link(sling_input_corpus, sling_output_corpus):
  """Run SLING entity linking and write the linked output corpus.

  Args:
    sling_input_corpus: path to the unannotated document record file.
    sling_output_corpus: path where annotated documents are written.
  """
  flow = entity.EntityWorkflow("wiki-label")
  source = flow.wf.resource(sling_input_corpus, format="records/document")
  target = flow.wf.resource(sling_output_corpus, format="records/document")
  flow.label_documents(indocs=source, outdocs=target)
  workflow.run(flow.wf)
def annotate_corpus(self, unannotated_file, annotated_file):
  """Run SLING silver annotation over a document corpus.

  Args:
    unannotated_file: input record file of unannotated SLING documents.
    annotated_file: output record file; if it already exists the run is
      skipped, making repeated calls cheap.
  """
  # Skip the expensive labeling run if the output was already produced.
  if os.path.exists(annotated_file):
    return
  labeler = silver.SilverWorkflow("silver")
  unannotated = labeler.wf.resource(unannotated_file, format="records/document")
  annotated = labeler.wf.resource(annotated_file, format="records/document")
  labeler.silver_annotation(indocs=unannotated, outdocs=annotated)
  workflow.run(labeler.wf)
def annotate_corpus(
    self,
    unannotated_file="local/data/e/wiki/en/documents-00000-of-00010.rec",
    annotated_file="/tmp/labeled.rec"):
  """Entity-label a document corpus unless the output file already exists."""
  # Existing output means a previous run completed; do nothing.
  if os.path.exists(annotated_file):
    return
  flow = entity.EntityWorkflow("wiki-label")
  source = flow.wf.resource(unannotated_file, format="records/document")
  target = flow.wf.resource(annotated_file, format="records/document")
  flow.label_documents(indocs=source, outdocs=target)
  workflow.run(flow.wf)
def build_knowledge_base():
  """Build the KB repository, then per-language name and phrase tables."""
  if flags.arg.build_kb:
    log.info("Build knowledge base repository")
    flow = wiki.WikiWorkflow("knowledge-base")
    flow.build_knowledge_base()
    workflow.run(flow.wf)

  # Per-language stages: (flag, log verb, log object, workflow suffix, method).
  language_stages = [
      (flags.arg.extract_names, "Extract ", " names",
       "-name-extraction", "extract_names"),
      (flags.arg.build_nametab, "Build ", " name table",
       "-name-table", "build_name_table"),
      (flags.arg.build_phrasetab, "Build ", " phrase table",
       "-phrase-table", "build_phrase_table"),
  ]
  for enabled, verb, what, suffix, method in language_stages:
    if not enabled:
      continue
    for language in flags.arg.languages:
      log.info(verb + language + what)
      flow = wiki.WikiWorkflow(language + suffix)
      getattr(flow, method)(language=language)
      workflow.run(flow.wf)
def fuse_items():
  """Run the flag-selected item-fusing pipeline stages in order."""
  # Each stage: (enabled flag, log message, workflow name, WikiWorkflow method).
  stages = [
      (flags.arg.merge_categories, "Merge wikipedia categories",
       "category-merging", "merge_wikipedia_categories"),
      (flags.arg.invert_categories, "Invert categories",
       "category-inversion", "invert_wikipedia_categories"),
      (flags.arg.compute_item_popularity, "Compute item popularity",
       "item-popularity", "compute_item_popularity"),
      (flags.arg.fuse_items, "Fuse items",
       "fuse-items", "fuse_items"),
  ]
  for enabled, message, name, method in stages:
    if not enabled:
      continue
    log.info(message)
    flow = wiki.WikiWorkflow(name)
    getattr(flow, method)()
    workflow.run(flow.wf)
def download_corpora():
  """Download the wikidata and/or wikipedia dumps selected by flags."""
  if not (flags.arg.download_wikidata or flags.arg.download_wikipedia):
    return
  flow = download.DownloadWorkflow("wiki-download")
  # Single wikidata dump.
  if flags.arg.download_wikidata:
    flow.download_wikidata()
  # One wikipedia dump per configured language.
  if flags.arg.download_wikipedia:
    for language in flags.arg.languages:
      flow.download_wikipedia(language=language)
  # Both downloads run under the one workflow.
  workflow.run(flow.wf)
def extract_named_entities():
  """Run the flag-selected NER pipeline stages."""
  if flags.arg.extract_wikilinks:
    log.info("Extract Wikipedia link graph")
    flow = entity.EntityWorkflow("wiki-links")
    flow.extract_wikilinks()
    workflow.run(flow.wf)

  if flags.arg.build_idf:
    # All languages share one workflow, which is run once after setup.
    flow = entity.EntityWorkflow("idf-table")
    for language in flags.arg.languages:
      log.info("Build " + language + " IDF table")
      flow.build_idf(language=language)
    workflow.run(flow.wf)

  if flags.arg.fuse_ner_items:
    log.info("Fuse NER items")
    flow = entity.EntityWorkflow("fuse-ner-items")
    flow.fuse_items()
    workflow.run(flow.wf)

  if flags.arg.build_ner_kb:
    log.info("Build NER knowledge base")
    flow = entity.EntityWorkflow("ner-knowledge-base")
    flow.build_knowledge_base()
    workflow.run(flow.wf)
def import_wiki():
  """Import wikidata and/or wikipedia dumps under a single workflow."""
  if not (flags.arg.import_wikidata or flags.arg.import_wikipedia):
    return
  flow = wiki.WikiWorkflow("wiki-import")
  if flags.arg.import_wikidata:
    log.info("Import wikidata")
    flow.wikidata()
  if flags.arg.import_wikipedia:
    # One import per configured language.
    for language in flags.arg.languages:
      log.info("Import " + language + " wikipedia")
      flow.wikipedia(language=language)
  workflow.run(flow.wf)
def silver_annotation():
  """Build IDF tables and silver-label wikipedia documents, per flags."""
  if flags.arg.build_idf:
    # One shared workflow builds the IDF table for every language.
    flow = silver.SilverWorkflow("idf-table")
    for language in flags.arg.languages:
      log.info("Build " + language + " IDF table")
      flow.build_idf(language=language)
    workflow.run(flow.wf)

  if flags.arg.silver_annotation:
    # Labeling runs as a separate workflow per language.
    for language in flags.arg.languages:
      log.info("Silver-label " + language + " wikipedia")
      flow = silver.SilverWorkflow(language + "-silver")
      flow.silver_annotation(language=language)
      workflow.run(flow.wf)
def train_embeddings():
  """Train word, fact, and category embeddings as selected by flags."""
  # Per-language word-embedding stages.
  if flags.arg.extract_vocabulary:
    for language in flags.arg.languages:
      log.info("Extract " + language + " vocabulary")
      flow = embedding.EmbeddingWorkflow(language + "-vocabulary")
      flow.extract_vocabulary(language=language)
      workflow.run(flow.wf)

  if flags.arg.train_word_embeddings:
    for language in flags.arg.languages:
      log.info("Train " + language + " word embeddings")
      flow = embedding.EmbeddingWorkflow(language + "-word-embeddings")
      flow.train_word_embeddings(language=language)
      workflow.run(flow.wf)

  # Language-independent fact/category stages, run in pipeline order.
  global_stages = [
      (flags.arg.extract_fact_lexicon, "Extract fact and category lexicons",
       "fact-lexicon", "extract_fact_lexicon"),
      (flags.arg.extract_facts, "Extract facts from knowledge base",
       "fact-extraction", "extract_facts"),
      (flags.arg.train_fact_embeddings, "Train fact and category embeddings",
       "fact-embeddings", "train_fact_embeddings"),
  ]
  for enabled, message, name, method in global_stages:
    if not enabled:
      continue
    log.info(message)
    flow = embedding.EmbeddingWorkflow(name)
    getattr(flow, method)()
    workflow.run(flow.wf)
def build_alias_tables():
  """Extract item names and build name/phrase tables for each language."""
  # Each stage: (flag, log verb, log object, workflow suffix, method name).
  stages = [
      (flags.arg.extract_names, "Extract ", " names",
       "-name-extraction", "extract_names"),
      (flags.arg.build_nametab, "Build ", " name table",
       "-name-table", "build_name_table"),
      (flags.arg.build_phrasetab, "Build ", " phrase table",
       "-phrase-table", "build_phrase_table"),
  ]
  for enabled, verb, what, suffix, method in stages:
    if not enabled:
      continue
    for language in flags.arg.languages:
      log.info(verb + language + what)
      flow = wiki.WikiWorkflow(language + suffix)
      getattr(flow, method)(language=language)
      workflow.run(flow.wf)
# Group aliases by alias fingerprint. aliases_by_fp = wf.shuffle(item_aliases) # Merge all aliases for fingerprint. output = res(fgdir + "/" + language + "/aliases.rec", "records/alias") merger = wf.reduce(aliases_by_fp, output, "alias-merger") return output # Build name table. def build_name_table(aliases, language): builder = wf.task("name-table-builder") wf.connect(wf.read(aliases, name="alias-reader"), builder) repo = res(fgdir + "/" + language + "/name-table.repo", "repository") builder.attach_output("repository", repo) # Run tasks. compute_fanin() collect_xrefs() reconcile_items() build_kb() for language in ["en", "de", "fr"]: aliases = extract_aliases(language) build_name_table(aliases, language) workflow.run(wf) # Shut down. workflow.shutdown()