예제 #1
0
def build_knowledge_base():
    """Run every knowledge-base build stage whose command-line flag is set.

    Stages are considered in a fixed order: merging wikipedia categories,
    inverting categories, extracting the link graph, fusing items, and
    building the knowledge base repository. For each enabled stage a
    message is logged, a dedicated WikiWorkflow is created and configured,
    and the workflow is run before the next flag is inspected.
    """
    # Each row: (flag on flags.arg, log message, workflow name,
    #            name of the WikiWorkflow method that sets the stage up).
    stages = (
        ("merge_categories", "Merge wikipedia categories",
         "category-merging", "merge_wikipedia_categories"),
        ("invert_categories", "Invert categories",
         "category-inversion", "invert_wikipedia_categories"),
        ("extract_wikilinks", "Extract link graph",
         "link-graph", "extract_links"),
        ("fuse_items", "Fuse items",
         "fuse-items", "fuse_items"),
        ("build_kb", "Build knowledge base repository",
         "knowledge-base", "build_knowledge_base"),
    )

    for flag, message, name, method in stages:
        if not getattr(flags.arg, flag):
            continue
        log.info(message)
        pipeline = wiki.WikiWorkflow(name)
        getattr(pipeline, method)()
        workflow.run(pipeline.wf)
예제 #2
0
def parse_wikipedia():
    """Convert wikipedia pages to SLING documents, one workflow per language.

    Nothing happens unless the parse_wikipedia flag is set. Otherwise, for
    every language in flags.arg.languages, a message is logged and a
    separate "<language>-wikipedia-parsing" WikiWorkflow is set up and run.
    """
    if not flags.arg.parse_wikipedia:
        return

    for lang in flags.arg.languages:
        log.info("Parse " + lang + " wikipedia")
        parser = wiki.WikiWorkflow(lang + "-wikipedia-parsing")
        parser.parse_wikipedia(language=lang)
        workflow.run(parser.wf)
예제 #3
0
def sling_entity_link(sling_input_corpus, sling_output_corpus):
    """Run SLING entity linking and create the linked output corpus.

    A "wiki-label" EntityWorkflow is given `sling_input_corpus` as its
    input documents and `sling_output_corpus` as its output documents
    (both registered as "records/document" resources), and the resulting
    workflow is then run.
    """
    doc_format = "records/document"
    linker = entity.EntityWorkflow("wiki-label")
    source_docs = linker.wf.resource(sling_input_corpus, format=doc_format)
    linked_docs = linker.wf.resource(sling_output_corpus, format=doc_format)
    linker.label_documents(indocs=source_docs, outdocs=linked_docs)
    workflow.run(linker.wf)
예제 #4
0
 def annotate_corpus(self, unannotated_file, annotated_file):
     """Run silver annotations from SLING.

     The work is skipped entirely when `annotated_file` already exists on
     disk. Otherwise a "silver" SilverWorkflow is created with
     `unannotated_file` as its input documents and `annotated_file` as
     its output documents (both "records/document" resources), and the
     workflow is run.
     """
     if not os.path.exists(annotated_file):
         doc_format = "records/document"
         annotator = silver.SilverWorkflow("silver")
         source_docs = annotator.wf.resource(unannotated_file,
                                             format=doc_format)
         target_docs = annotator.wf.resource(annotated_file,
                                             format=doc_format)
         annotator.silver_annotation(indocs=source_docs,
                                     outdocs=target_docs)
         workflow.run(annotator.wf)
예제 #5
0
 def annotate_corpus(
         self,
         unannotated_file="local/data/e/wiki/en/documents-00000-of-00010.rec",
         annotated_file="/tmp/labeled.rec"):
     """Label a document record file with the "wiki-label" entity workflow.

     Returns immediately when `annotated_file` already exists. Otherwise
     an EntityWorkflow is created with `unannotated_file` as its input
     documents and `annotated_file` as its output documents (both
     "records/document" resources), and the workflow is run.
     """
     already_labeled = os.path.exists(annotated_file)
     if already_labeled:
         return

     doc_format = "records/document"
     linker = entity.EntityWorkflow("wiki-label")
     source_docs = linker.wf.resource(unannotated_file, format=doc_format)
     target_docs = linker.wf.resource(annotated_file, format=doc_format)
     linker.label_documents(indocs=source_docs, outdocs=target_docs)
     workflow.run(linker.wf)
예제 #6
0
def build_knowledge_base():
    """Build the knowledge base repository and the per-language name data.

    Four flag-controlled stages are considered in order: building the
    knowledge base repository (one workflow), then extracting item names,
    building the name table, and building the phrase table (each with one
    workflow per language in flags.arg.languages). Every workflow is run
    as soon as it has been configured.
    """
    if flags.arg.build_kb:
        log.info("Build knowledge base repository")
        repository = wiki.WikiWorkflow("knowledge-base")
        repository.build_knowledge_base()
        workflow.run(repository.wf)

    # Each row: (flag on flags.arg, log prefix, log suffix,
    #            workflow-name suffix, WikiWorkflow method name).
    per_language_stages = (
        ("extract_names", "Extract ", " names",
         "-name-extraction", "extract_names"),
        ("build_nametab", "Build ", " name table",
         "-name-table", "build_name_table"),
        ("build_phrasetab", "Build ", " phrase table",
         "-phrase-table", "build_phrase_table"),
    )

    for flag, prefix, suffix, name_suffix, method in per_language_stages:
        if not getattr(flags.arg, flag):
            continue
        for lang in flags.arg.languages:
            log.info(prefix + lang + suffix)
            pipeline = wiki.WikiWorkflow(lang + name_suffix)
            getattr(pipeline, method)(language=lang)
            workflow.run(pipeline.wf)
예제 #7
0
def fuse_items():
    """Run the category, popularity and item-fusing stages that are enabled.

    The flags merge_categories, invert_categories, compute_item_popularity
    and fuse_items are checked in that order. Each enabled stage logs a
    message, configures its own WikiWorkflow, and runs it before the next
    flag is checked.
    """
    def run_stage(message, name, configure):
        # Log, create the named workflow, let `configure` set it up, run it.
        log.info(message)
        pipeline = wiki.WikiWorkflow(name)
        configure(pipeline)
        workflow.run(pipeline.wf)

    if flags.arg.merge_categories:
        run_stage("Merge wikipedia categories", "category-merging",
                  lambda p: p.merge_wikipedia_categories())

    if flags.arg.invert_categories:
        run_stage("Invert categories", "category-inversion",
                  lambda p: p.invert_wikipedia_categories())

    if flags.arg.compute_item_popularity:
        run_stage("Compute item popularity", "item-popularity",
                  lambda p: p.compute_item_popularity())

    if flags.arg.fuse_items:
        run_stage("Fuse items", "fuse-items",
                  lambda p: p.fuse_items())
예제 #8
0
def download_corpora():
    """Download the wikidata and/or wikipedia dumps in one workflow.

    Returns without doing anything when neither download flag is set.
    Otherwise a single "wiki-download" DownloadWorkflow is configured with
    the wikidata dump (if requested) and one wikipedia dump per language
    in flags.arg.languages (if requested), and is then run once.
    """
    if not flags.arg.download_wikidata and not flags.arg.download_wikipedia:
        return

    downloader = download.DownloadWorkflow("wiki-download")

    if flags.arg.download_wikidata:
        downloader.download_wikidata()

    if flags.arg.download_wikipedia:
        for lang in flags.arg.languages:
            downloader.download_wikipedia(language=lang)

    workflow.run(downloader.wf)
예제 #9
0
def extract_named_entities():
  """Run the enabled named-entity preparation stages in order.

  The stages are: extracting the Wikipedia link graph, building the IDF
  table (one shared workflow configured for every language and run once),
  fusing NER items, and building the NER knowledge base. Each stage is
  controlled by its own flag on flags.arg.
  """
  def run_stage(message, name, method):
    # Log, create the named EntityWorkflow, call its setup method, run it.
    log.info(message)
    pipeline = entity.EntityWorkflow(name)
    getattr(pipeline, method)()
    workflow.run(pipeline.wf)

  if flags.arg.extract_wikilinks:
    run_stage("Extract Wikipedia link graph", "wiki-links",
              "extract_wikilinks")

  if flags.arg.build_idf:
    # All languages share one workflow, which is run after the loop.
    idf_job = entity.EntityWorkflow("idf-table")
    for lang in flags.arg.languages:
      log.info("Build " + lang + " IDF table")
      idf_job.build_idf(language=lang)
    workflow.run(idf_job.wf)

  if flags.arg.fuse_ner_items:
    run_stage("Fuse NER items", "fuse-ner-items", "fuse_items")

  if flags.arg.build_ner_kb:
    run_stage("Build NER knowledge base", "ner-knowledge-base",
              "build_knowledge_base")
예제 #10
0
def import_wiki():
    """Import wikidata and/or wikipedia(s) in a single workflow.

    Returns without doing anything when neither import flag is set.
    Otherwise one "wiki-import" WikiWorkflow is configured with the
    wikidata import (if requested) and one wikipedia import per language
    in flags.arg.languages (if requested), logging each step, and the
    workflow is run once at the end.
    """
    if not flags.arg.import_wikidata and not flags.arg.import_wikipedia:
        return

    importer = wiki.WikiWorkflow("wiki-import")

    if flags.arg.import_wikidata:
        log.info("Import wikidata")
        importer.wikidata()

    if flags.arg.import_wikipedia:
        for lang in flags.arg.languages:
            log.info("Import " + lang + " wikipedia")
            importer.wikipedia(language=lang)

    workflow.run(importer.wf)
예제 #11
0
def silver_annotation():
    """Build the IDF table and silver-label Wikipedia documents.

    When build_idf is set, a single "idf-table" SilverWorkflow is
    configured for every language in flags.arg.languages and run once.
    When silver_annotation is set, a separate "<language>-silver"
    workflow is configured and run for each language.
    """
    if flags.arg.build_idf:
        # All languages share one workflow, which is run after the loop.
        idf_job = silver.SilverWorkflow("idf-table")
        for lang in flags.arg.languages:
            log.info("Build " + lang + " IDF table")
            idf_job.build_idf(language=lang)
        workflow.run(idf_job.wf)

    if not flags.arg.silver_annotation:
        return

    for lang in flags.arg.languages:
        log.info("Silver-label " + lang + " wikipedia")
        labeler = silver.SilverWorkflow(lang + "-silver")
        labeler.silver_annotation(language=lang)
        workflow.run(labeler.wf)
예제 #12
0
파일: run.py 프로젝트: yespon/sling
def train_embeddings():
    """Run the enabled embedding stages in order.

    Vocabulary extraction and word-embedding training run one workflow
    per language in flags.arg.languages. Fact-lexicon extraction, fact
    extraction and fact-embedding training each run a single workflow.
    Every stage is controlled by its own flag on flags.arg.
    """
    def run_stage(message, name, method, **options):
        # Log, create the named EmbeddingWorkflow, call its setup method
        # with any keyword options, then run it.
        log.info(message)
        job = embedding.EmbeddingWorkflow(name)
        getattr(job, method)(**options)
        workflow.run(job.wf)

    if flags.arg.extract_vocabulary:
        for lang in flags.arg.languages:
            run_stage("Extract " + lang + " vocabulary",
                      lang + "-vocabulary",
                      "extract_vocabulary", language=lang)

    if flags.arg.train_word_embeddings:
        for lang in flags.arg.languages:
            run_stage("Train " + lang + " word embeddings",
                      lang + "-word-embeddings",
                      "train_word_embeddings", language=lang)

    if flags.arg.extract_fact_lexicon:
        run_stage("Extract fact and category lexicons", "fact-lexicon",
                  "extract_fact_lexicon")

    if flags.arg.extract_facts:
        run_stage("Extract facts from knowledge base", "fact-extraction",
                  "extract_facts")

    if flags.arg.train_fact_embeddings:
        run_stage("Train fact and category embeddings", "fact-embeddings",
                  "train_fact_embeddings")
예제 #13
0
def build_alias_tables():
    """Extract item names and build the name and phrase tables.

    The flags extract_names, build_nametab and build_phrasetab are checked
    in that order. Each enabled stage runs one WikiWorkflow per language
    in flags.arg.languages, logging a message before each workflow.
    """
    def run_per_language(prefix, suffix, name_suffix, configure):
        # One workflow per language: log, create, configure, run.
        for lang in flags.arg.languages:
            log.info(prefix + lang + suffix)
            pipeline = wiki.WikiWorkflow(lang + name_suffix)
            configure(pipeline, lang)
            workflow.run(pipeline.wf)

    if flags.arg.extract_names:
        run_per_language(
            "Extract ", " names", "-name-extraction",
            lambda p, lang: p.extract_names(language=lang))

    if flags.arg.build_nametab:
        run_per_language(
            "Build ", " name table", "-name-table",
            lambda p, lang: p.build_name_table(language=lang))

    if flags.arg.build_phrasetab:
        run_per_language(
            "Build ", " phrase table", "-phrase-table",
            lambda p, lang: p.build_phrase_table(language=lang))
예제 #14
0
  # Group aliases by alias fingerprint.
  aliases_by_fp = wf.shuffle(item_aliases)

  # Merge all aliases for fingerprint.
  output = res(fgdir + "/" + language + "/aliases.rec", "records/alias")
  merger = wf.reduce(aliases_by_fp, output, "alias-merger")
  return output

# Build name table.
def build_name_table(aliases, language):
  """Add a name-table builder for `language` to the module-level workflow.

  A "name-table-builder" task is created, fed by a reader over `aliases`,
  and its "repository" output is attached to the resource
  <fgdir>/<language>/name-table.repo. Nothing is returned.
  """
  table_builder = wf.task("name-table-builder")
  alias_reader = wf.read(aliases, name="alias-reader")
  wf.connect(alias_reader, table_builder)
  repo_path = fgdir + "/" + language + "/name-table.repo"
  table_builder.attach_output("repository", res(repo_path, "repository"))

# Run tasks.
# NOTE(review): compute_fanin, collect_xrefs, reconcile_items, build_kb and
# extract_aliases are defined outside this excerpt; presumably they add their
# tasks to the shared workflow `wf` the way build_name_table does -- confirm.
compute_fanin()
collect_xrefs()
reconcile_items()
build_kb()
# Set up alias extraction and a name table for each of these languages.
for language in ["en", "de", "fr"]:
  aliases = extract_aliases(language)
  build_name_table(aliases, language)

# Run the workflow once everything above has been set up.
workflow.run(wf)

# Shut down.
workflow.shutdown()