--archives ../karma.zip \
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaContextWorkflow.py \
      ../sample-data/sample-unicode-jsonld.json text \
      https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
      ../sample-data/output
'''

if __name__ == "__main__":
    # Script entry point: load the input file, apply a context to it through
    # the Karma workflow, and write the result out as text.
    sc = SparkContext(appName="karmaContextWorkflow")

    # Import the edu.isi.karma Java package into this context's JVM view.
    java_import(sc._jvm, "edu.isi.karma")

    # Positional arguments: input path, input file type, context URL and
    # output path (same order as the usage example in the module docstring).
    input_path, input_file_type, context_url, output_path = (
        argv[1], argv[2], argv[3], argv[4])

    file_util = FileUtil(sc)
    karma_workflow = Workflow(sc)

    # Step 1: read the input as JSON data.
    source_rdd = file_util.load_file(input_path, input_file_type, "json")

    # Step 2: apply the context fetched from the given URL.
    contextualized_rdd = karma_workflow.apply_context(source_rdd, context_url)

    # Step 3: write the result as a text file holding JSON data.
    file_util.save_file(contextualized_rdd, output_path, "text", "json")
Пример #2
0
            "rdf.generation.disable.nesting": "false"
        })
    # Run the Karma country model over the raw country input RDD.
    # NOTE(review): the positional arguments look like (input RDD, model URL,
    # base URI, root class URI, context) -- confirm against Workflow.run_karma.
    countryRDD1 = workflow.run_karma(
        input_country_rdd,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/countries/country-model.ttl",
        "http://dig.isi.edu/geonames",
        "http://schema.org/Country1",
        country_context,
        data_type="csv",
        additional_settings={
            # The input is declared as CSV but is tab-delimited.
            "karma.input.delimiter": "\t",
            "rdf.generation.disable.nesting": "false"
        })

    # Apply the matching context to each Karma output, then persist the
    # result with the MEMORY_AND_DISK storage level and give the RDD a name.
    # NOTE(review): persisting suggests these RDDs are reused further down,
    # outside the visible part of this script -- confirm.
    cityRDD = workflow.apply_context(cityRDD1, city_context)
    cityRDD = cityRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityRDD.setName("cityRDD")

    stateRDD = workflow.apply_context(stateRDD1, state_context)
    stateRDD = stateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    stateRDD.setName("stateRDD")

    countryRDD = workflow.apply_context(countryRDD1, country_context)
    countryRDD = countryRDD.persist(StorageLevel.MEMORY_AND_DISK)
    countryRDD.setName("countryRDD")

    # The alternate-names RDD is given the city context as well.
    # NOTE(review): no run_karma call for it is visible in this section;
    # check how city_alternate_names_rdd is produced before this point.
    cityAlternateRDD = workflow.apply_context(city_alternate_names_rdd,
                                              city_context)
    cityAlternateRDD = cityAlternateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityAlternateRDD.setName("cityAlternateRDD")
Пример #3
0
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_model.ttl",
                                   "http://dig.isi.edu/geonames",
                                   "http://schema.org/State1",
                                   state_context,
                                   data_type="csv",
                                   additional_settings={"karma.input.delimiter":"\t", "rdf.generation.disable.nesting":"false"})
    # Run the Karma country model over the raw country input RDD. The input
    # is declared as CSV with a tab delimiter in additional_settings.
    # NOTE(review): the positional arguments look like (input RDD, model URL,
    # base URI, root class URI, context) -- confirm against Workflow.run_karma.
    countryRDD1 = workflow.run_karma(input_country_rdd,
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/countries/country-model.ttl",
                                   "http://dig.isi.edu/geonames",
                                   "http://schema.org/Country1",
                                   country_context,
                                   data_type="csv",
                                   additional_settings={"karma.input.delimiter":"\t", "rdf.generation.disable.nesting":"false"})

    # Apply the matching context to each Karma output, then persist the
    # result with the MEMORY_AND_DISK storage level and give the RDD a name.
    # NOTE(review): persisting suggests these RDDs are reused further down,
    # outside the visible part of this script -- confirm.
    cityRDD = workflow.apply_context(cityRDD1, city_context)
    cityRDD = cityRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityRDD.setName("cityRDD")

    stateRDD = workflow.apply_context(stateRDD1, state_context)
    stateRDD = stateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    stateRDD.setName("stateRDD")


    countryRDD = workflow.apply_context(countryRDD1, country_context)
    countryRDD = countryRDD.persist(StorageLevel.MEMORY_AND_DISK)
    countryRDD.setName("countryRDD")

    # The alternate-names RDD is given the city context as well.
    # NOTE(review): no run_karma call for it is visible in this section;
    # check how city_alternate_names_rdd is produced before this point.
    cityAlternateRDD = workflow.apply_context(city_alternate_names_rdd, city_context)
    cityAlternateRDD = cityAlternateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityAlternateRDD.setName("cityAlternateRDD")
Пример #4
0
    # Positional command-line arguments: input path, input file type, input
    # data type, number of partitions (converted to int) and output path.
    inputFilename = argv[1]
    inputFileType = argv[2]
    inputDataType = argv[3]
    numPartitions = int(argv[4])
    outputFilename = argv[5]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    # The loaded RDD is redistributed into numPartitions partitions.
    # NOTE(review): partitionBy is a pair-RDD operation, so this assumes
    # load_file returns (key, value) records -- confirm.
    inputRDD = fileUtil.load_file(inputFilename, inputFileType,
                                  inputDataType).partitionBy(numPartitions)

    #2. Apply the karma Model
    # The same context URL is passed to run_karma and to apply_context below.
    contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    # NOTE(review): the positional arguments look like (input RDD, model URL,
    # base URI, root class URI, context) -- confirm against Workflow.run_karma.
    outputRDD_karma = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        contextUrl,
        num_partitions=numPartitions,
        data_type=inputDataType)

    #3. Apply the context
    outputRDD = workflow.apply_context(outputRDD_karma, contextUrl)

    #4. Save the output
    # The result is always written as a text file holding JSON data,
    # whatever the input file type and data type were.
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
    # Terminate the driver explicitly with exit status 0.
    sys.exit(0)