--archives ../karma.zip \
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaContextWorkflow.py \
      ../sample-data/sample-unicode-jsonld.json text \
      https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
      ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaContextWorkflow")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    contextUrl = argv[3]
    outputFilename = argv[4]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # Apply the context
    outputRDD = workflow.apply_context(inputRDD, contextUrl)

    # Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
Example #2
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaWorkflowCSV.py ../sample-data/sample-unicode.txt ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaCSV")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    #2. Apply the karma Model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/data",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={
            "karma.input.delimiter": "\t",
    --conf "spark.driver.extraClassPath=/home/hadoop/effect-workflows/lib/karma-spark-0.0.1-SNAPSHOT-shaded.jar" \
    --py-files /home/hadoop/effect-workflows/lib/python-lib.zip \
    --archives /home/hadoop/effect-workflows/karma.zip \
    /home/hadoop/effect-workflows/effectWorkflow.py \
    cdr hdfs://ip-172-31-19-102/user/effect/data/cdr-framed sequence 10
'''

context_url = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json"
base_uri = "http://effect.isi.edu/data/"

if __name__ == "__main__":
    sc = SparkContext()
    conf = SparkConf()

    java_import(sc._jvm, "edu.isi.karma")
    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    parser = ArgumentParser()
    parser.add_argument("-i", "--input", help="input folder", required=True)
    parser.add_argument("-o", "--output", help="input folder", required=True)
    parser.add_argument("-n",
                        "--partitions",
                        help="Number of partitions",
                        required=False,
                        default=20)
    parser.add_argument("-t",
                        "--host",
                        help="ES hostname",
                        default="localhost",
                        required=False)
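    # --- Sketch: not part of the original excerpt, which is truncated here. ---
    # The remaining arguments and the workflow body are cut off; this only
    # parses the options defined above. The other examples on this page suggest
    # the parsed options then feed load_file / run_karma / save_rdd_to_es calls.
    args = parser.parse_args()
    print "Got options:", args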
Example #4
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    java_import(sc._jvm, "edu.isi.karma")
    inputFilename = args[0]
    input_country = args[1]
    outputFilename = args[2]
    city_alternate_name_input = args[3]
    city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json"
    state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json"
    country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json"

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd = workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(city_alternate_name_input)
    print input_alternate_city_rdd.first()

    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    cityRDD1 = workflow.run_karma(inputRDD_partitioned,
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
                                   "http://dig.isi.edu/geonames",
                                   "http://schema.org/City1",
                                   city_context,
Example #5
                      default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    java_import(sc._jvm, "edu.isi.karma")
    inputFilename = args[0]
    input_country = args[1]
    outputFilename = args[2]
    city_alternate_name_input = args[3]
    city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json"
    state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json"
    country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json"

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd = workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(
        city_alternate_name_input)
    print input_alternate_city_rdd.first()

    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    cityRDD1 = workflow.run_karma(
        inputRDD_partitioned,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
        "http://dig.isi.edu/geonames",
Example #6
    numPartitions = int(argv[3])

    outputFilename = argv[4]
    loadelasticsearch = argv[5] == "True"

    es_server = argv[6]
    es_port = argv[7]
    es_index = argv[8]

    #After applying karma, we would like to reduce the number of partitions
    numFramerPartitions = max(10, numPartitions / 10)

    github_base = 'https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0'
    context_url = github_base + '/karma/karma-context.json'

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    rdd_list = list()

    #Read the input data
    escorts_rdd = inputRDD = fileUtil.load_file(
        inputFilename, inputType, "json").partitionBy(numPartitions)
    escorts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

    # Apply the main model
    main_rdd = workflow.run_karma(
        escorts_rdd, github_base + '/datasets/ht/CDRv2/main/ht-main-model.ttl',
        "http://dig.isi.edu/ht/data/", "http://schema.org/Offer1", context_url,
        numFramerPartitions
    )  # Partition the output data by numFramerPartitions; the rest of the workflow reuses these partitions
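    # --- Sketch: not part of the original excerpt, which is truncated here. ---
    # The remaining models are cut off. Assuming each run_karma output is
    # appended to rdd_list and the combined result is written out (and
    # optionally indexed) the way the other examples on this page do:
    rdd_list.append(main_rdd)

    all_rdd = sc.union(rdd_list).coalesce(numFramerPartitions)
    fileUtil.save_file(all_rdd, outputFilename, "text", "json")

    if loadelasticsearch:
        workflow.save_rdd_to_es(all_rdd, es_server, es_port, es_index)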
Example #7
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaReduceWorkflow.py \
      ../sample-data/sample-unicode.json text ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaReduceWorkflow")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    outputFilename = argv[3]
    numPartitions = 1

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    #1. Apply the first karma Model
    outputRDD1 = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions)

    #2. Apply the second Karma Model
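    # --- Sketch: not part of the original excerpt, which is truncated here. ---
    # A second run_karma call mirrors the first, typically with a different
    # model; the model2 URL below is a placeholder, not a file known to exist.
    # The two outputs are then combined with a plain RDD union before saving:
    outputRDD2 = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model2.ttl",  # placeholder
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions)

    reducedRDD = sc.union([outputRDD1, outputRDD2])
    fileUtil.save_file(reducedRDD, outputFilename, "text", "json")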
Example #8
    #                                     {"metadata.broker.list": brokers})

    # This reads everything on all partitions of the queue.
    # auto.offset.reset: smallest -> read all data currently on the queue
    #                  : largest  -> start reading from now, i.e. the largest offset.
    #                   You can also omit auto.offset.reset, which defaults to
    #                   starting at the largest offset.
    kvs = KafkaUtils.createDirectStream(ssc, topics, {
        "metadata.broker.list": brokers,
        "auto.offset.reset": "smallest"
    })

    kvs.pprint()

    # Apply the karma Model
    workflow = Workflow(sc)

    inputDStream = kvs.map(lambda x: ("karma", json.loads(x[1])))
    outputDStream = inputDStream.transform(lambda rdd: workflow.run_karma(
        rdd,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/", "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    ))
    # outputDStream = inputDStream
    final = outputDStream.transform(lambda rdd: workflow.save_rdd_to_es(
        rdd, "localhost", "9200", "karma/Webpage"))
    final.pprint()

    # Start streaming
    ssc.start()
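    # --- Sketch: not part of the original excerpt. A streaming job normally
    # blocks after start(); in PySpark that is done with awaitTermination():
    ssc.awaitTermination()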
Example #9
      karmaWorkflow.py ../sample-data/part-00002-seq sequence json 1 ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karma")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputFileType = argv[2]
    inputDataType = argv[3]
    numPartitions = int(argv[4])
    outputFilename = argv[5]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = fileUtil.load_file(inputFilename, inputFileType,
                                  inputDataType).partitionBy(numPartitions)

    #2. Apply the karma Model
    contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    outputRDD_karma = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        contextUrl,
        num_partitions=numPartitions,
        data_type=inputDataType)
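    # --- Sketch: not part of the original excerpt, which ends here. Assuming
    # the output is written out with the same save_file signature used in the
    # JSON example at the top of this page:
    fileUtil.save_file(outputRDD_karma, outputFilename, "text", "json")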