Example #1
    sc = SparkContext(appName="karmaCSV")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    #2. Apply the karma Model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/data",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={
            "karma.input.delimiter": "\t",
            "karma.output.format": "n3"
        })

    #3. Save the output
    outputRDD.map(lambda x: x[1]).saveAsTextFile(outputFilename)
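A minimal driver sketch (an assumption, not part of the original example): the snippet above reads like the body of a main(argv) function whose argv follows sys.argv indexing, so a wrapper along these lines would let it run under spark-submit. The function name main is hypothetical.

# Hypothetical wrapper: assumes the code above is the body of main(argv) and
# that Workflow/FileUtil are importable from the Karma workflow library used
# throughout these examples.
if __name__ == "__main__":
    import sys
    main(sys.argv)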
    print("Got arguments:", args)

    input = args.input.strip()
    numPartitions = int(args.partitions)
    doc_type = args.doctype.strip()
    outputFilename = args.output.strip()
    numFramerPartitions = numPartitions // 2  # integer division: partition counts must be ints
    inputRDD = workflow.batch_read_csv(input)
    outputFileType = "sequence"

    #2. Apply the karma Model
    reduced_rdd = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/effect-alignment/master/models/ransonware/ransomware-model.ttl",
        "http://effect.isi.edu/data/",
        "http://schema.dig.isi.edu/ontology/Malware1",
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={"karma.input.delimiter": ","})
    if reduced_rdd is not None:
        reduced_rdd = reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
        fileUtil.save_file(reduced_rdd, outputFilename + '/reduced_rdd',
                           "sequence", "json")
        reduced_rdd.setName("karma_out_reduced")

        types = [{
            "name": "AttackEvent",
            "uri": "http://schema.dig.isi.edu/ontology/AttackEvent"
        }, {
            "name": "EmailAddress",
Example #3
    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd = workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(city_alternate_name_input)
    print(input_alternate_city_rdd.first())

    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    cityRDD1 = workflow.run_karma(inputRDD_partitioned,
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
                                   "http://dig.isi.edu/geonames",
                                   "http://schema.org/City1",
                                   city_context,
                                   data_type="csv",
                                   additional_settings={"karma.input.delimiter":"\t", "rdf.generation.disable.nesting":"false"})

    city_alternate_names_rdd = workflow.run_karma(input_alternate_city_rdd,
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/city-alternatenames/city-alternate-names-model.ttl",
                                   "http://dig.isi.edu/geonames",
                                   "http://schema.org/City1",
                                   city_context,
                                   data_type="csv",
                                   additional_settings={"karma.input.delimiter":"\t", "rdf.generation.disable.nesting":"false"})

    stateRDD1 = workflow.run_karma(inputRDD_partitioned,
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_model.ttl",
                                   "http://dig.isi.edu/geonames",
Example #4
    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd = workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(
        city_alternate_name_input)
    print(input_alternate_city_rdd.first())

    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    cityRDD1 = workflow.run_karma(
        inputRDD_partitioned,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
        "http://dig.isi.edu/geonames",
        "http://schema.org/City1",
        city_context,
        data_type="csv",
        additional_settings={
            "karma.input.delimiter": "\t",
            "rdf.generation.disable.nesting": "false"
        })

    city_alternate_names_rdd = workflow.run_karma(
        input_alternate_city_rdd,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/city-alternatenames/city-alternate-names-model.ttl",
        "http://dig.isi.edu/geonames",
        "http://schema.org/City1",
        city_context,
        data_type="csv",
        additional_settings={
            "karma.input.delimiter": "\t",
Example #5
    context_url = github_base + '/karma/karma-context.json'

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    rdd_list = list()

    #Read the input data
    escorts_rdd = inputRDD = fileUtil.load_file(
        inputFilename, inputType, "json").partitionBy(numPartitions)
    escorts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

    # Apply the main model
    main_rdd = workflow.run_karma(
        escorts_rdd, github_base + '/datasets/ht/CDRv2/main/ht-main-model.ttl',
        "http://dig.isi.edu/ht/data/", "http://schema.org/Offer1", context_url,
        numFramerPartitions
    )  # Partition the output data by numFramerPartitions; the rest of the
       # workflow works with the same number of partitions.
    main_rdd.persist(StorageLevel.MEMORY_AND_DISK)
    rdd_list.append(main_rdd)
    print "main model done"

    # Apply the AdultService Model
    adultservice_rdd = workflow.run_karma(
        escorts_rdd, github_base +
        '/datasets/ht/CDRv2/adultservice/ht-adultservice-model.ttl',
        "http://dig.isi.edu/ht/data/",
        "http://schema.dig.isi.edu/ontology/AdultService1", context_url,
        numFramerPartitions)
    adultservice_rdd.persist(StorageLevel.MEMORY_AND_DISK)
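The excerpt stops after persisting the AdultService output. A plausible continuation, sketched by mirroring the reduce_rdds and save_file calls from Examples #6 and #2 (the variadic use of reduce_rdds and the outputFilename variable are assumptions, not taken from this excerpt):

    # Sketch only -- not in the original excerpt.
    rdd_list.append(adultservice_rdd)
    print("adultservice model done")

    # Reduce the per-model outputs into one JSON RDD and save it.
    reduced_rdd = workflow.reduce_rdds(numFramerPartitions, *rdd_list)
    fileUtil.save_file(reduced_rdd, outputFilename, "sequence", "json")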
Example #6
    inputFilename = argv[1]
    inputType = argv[2]
    outputFilename = argv[3]
    numPartitions = 1

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    #1. Apply the first karma Model
    outputRDD1 = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions)

    #2. Apply the second Karma Model
    outputRDD2 = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model2.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions)

    #3. Combine the data, then apply the Karma JSON reducer
    reducedRDD = workflow.reduce_rdds(numPartitions, outputRDD1, outputRDD2)
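The excerpt ends after the reduce step. A plausible final step, mirroring the save_file usage in Examples #2 and #8 (a sketch, not the original code), would write the reduced RDD to the output path from argv[3]:

    #4. Save the output (sketch; save_file signature mirrored from the other examples)
    fileUtil.save_file(reducedRDD, outputFilename, "text", "json")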
Example #7
    # auto.offset.reset: smallest -> read all data already on the queue
    #                    largest  -> start reading from now, i.e. at the largest offset.
    #                    You can also omit auto.offset.reset; that starts at the
    #                    largest offset as well.
    kvs = KafkaUtils.createDirectStream(ssc, topics, {
        "metadata.broker.list": brokers,
        "auto.offset.reset": "smallest"
    })

    kvs.pprint()

    # Apply the karma Model
    workflow = Workflow(sc)

    inputDStream = kvs.map(lambda x: ("karma", json.loads(x[1])))
    outputDStream = inputDStream.transform(lambda rdd: workflow.run_karma(
        rdd,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/", "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    ))
    # outputDStream = inputDStream
    final = outputDStream.transform(lambda rdd: workflow.save_rdd_to_es(
        rdd, "localhost", "9200", "karma/Webpage"))
    final.pprint()

    # Start streaming
    ssc.start()
    ssc.awaitTermination()
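The streaming excerpt uses ssc, brokers, topics and json without showing where they are defined. The setup presumably looked roughly like the sketch below (Spark 1.x/2.x with the spark-streaming-kafka package; the app name, batch interval, broker address and topic name are all assumptions):

    # Assumed setup for the streaming example above (sketch only).
    import json
    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext
    from pyspark.streaming.kafka import KafkaUtils

    sc = SparkContext(appName="karmaKafkaStream")  # hypothetical app name
    ssc = StreamingContext(sc, 10)                 # assumed 10-second micro-batches
    brokers = "localhost:9092"                     # hypothetical Kafka broker list
    topics = ["karma-input"]                       # hypothetical topic name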
Example #8
    inputFilename = argv[1]
    inputFileType = argv[2]
    inputDataType = argv[3]
    numPartitions = int(argv[4])
    outputFilename = argv[5]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = fileUtil.load_file(inputFilename, inputFileType,
                                  inputDataType).partitionBy(numPartitions)

    #2. Apply the karma Model
    contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    outputRDD_karma = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        contextUrl,
        num_partitions=numPartitions,
        data_type=inputDataType)

    #3. Apply the context
    outputRDD = workflow.apply_context(outputRDD_karma, contextUrl)

    #4. Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
    sys.exit(0)