sc = SparkContext(appName="karmaCSV")
java_import(sc._jvm, "edu.isi.karma")

inputFilename = argv[1]
outputFilename = argv[2]
numPartitions = 1

fileUtil = FileUtil(sc)
workflow = Workflow(sc)

# 1. Read the input
inputRDD = workflow.batch_read_csv(inputFilename)

# 2. Apply the karma Model
outputRDD = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
    "http://dig.isi.edu/data",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
    num_partitions=numPartitions,
    data_type="csv",
    additional_settings={
        "karma.input.delimiter": "\t",
        "karma.output.format": "n3"
    })

# 3. Save the output
outputRDD.map(lambda x: x[1]).saveAsTextFile(outputFilename)
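The snippet above assumes the imports, the Karma jar registration, and the command-line handling are already in place. A minimal sketch of that setup follows; the module paths for FileUtil and Workflow are assumptions based on the DIG Spark utility packages and may need to be adjusted to your installation.

import sys
from pyspark import SparkContext
from py4j.java_gateway import java_import

# Assumed module paths for the DIG Spark utilities; adjust to match
# wherever FileUtil and Workflow live in your deployment.
from digSparkUtil.fileUtil import FileUtil
from digWorkflow.workflow import Workflow

argv = sys.argv
if len(argv) < 3:
    print("Usage: karma_csv.py <inputFilename> <outputFilename>")
    sys.exit(1)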
print("Got arguments:", args) input = args.input.strip() numPartitions = int(args.partitions) doc_type = args.doctype.strip() outputFilename = args.output.strip() numFramerPartitions = numPartitions / 2 inputRDD = workflow.batch_read_csv(input) outputFileType = "sequence" #2. Apply the karma Model reduced_rdd = workflow.run_karma( inputRDD, "https://raw.githubusercontent.com/usc-isi-i2/effect-alignment/master/models/ransonware/ransomware-model.ttl", "http://effect.isi.edu/data/", "http://schema.dig.isi.edu/ontology/Malware1", "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json", num_partitions=numPartitions, data_type="csv", additional_settings={"karma.input.delimiter": ","}) if reduced_rdd is not None: reduced_rdd = reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK) fileUtil.save_file(reduced_rdd, outputFilename + '/reduced_rdd', "sequence", "json") reduced_rdd.setName("karma_out_reduced") types = [{ "name": "AttackEvent", "uri": "http://schema.dig.isi.edu/ontology/AttackEvent" }, { "name": "EmailAddress",
fileUtil = FileUtil(sc)
workflow = Workflow(sc)

# 1. Read the input
inputRDD = workflow.batch_read_csv(inputFilename)
input_country_rdd = workflow.batch_read_csv(input_country)
input_alternate_city_rdd = workflow.batch_read_csv(city_alternate_name_input)
print input_alternate_city_rdd.first()
inputRDD_partitioned = inputRDD.partitionBy(10)

# 2. Apply the karma Model
cityRDD1 = workflow.run_karma(
    inputRDD_partitioned,
    "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
    "http://dig.isi.edu/geonames",
    "http://schema.org/City1",
    city_context,
    data_type="csv",
    additional_settings={"karma.input.delimiter": "\t",
                         "rdf.generation.disable.nesting": "false"})

city_alternate_names_rdd = workflow.run_karma(
    input_alternate_city_rdd,
    "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/city-alternatenames/city-alternate-names-model.ttl",
    "http://dig.isi.edu/geonames",
    "http://schema.org/City1",
    city_context,
    data_type="csv",
    additional_settings={"karma.input.delimiter": "\t",
                         "rdf.generation.disable.nesting": "false"})

stateRDD1 = workflow.run_karma(
    inputRDD_partitioned,
    "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_model.ttl",
    "http://dig.isi.edu/geonames",
context_url = github_base + '/karma/karma-context.json'
workflow = Workflow(sc)
fileUtil = FileUtil(sc)
rdd_list = list()

# Read the input data
escorts_rdd = inputRDD = fileUtil.load_file(
    inputFilename, inputType, "json").partitionBy(numPartitions)
escorts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

# Apply the main model
main_rdd = workflow.run_karma(
    escorts_rdd,
    github_base + '/datasets/ht/CDRv2/main/ht-main-model.ttl',
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/Offer1",
    context_url,
    numFramerPartitions)

# Partition the output data by numFramerPartitions and the rest
# of the workflow works with the same number of partitions
main_rdd.persist(StorageLevel.MEMORY_AND_DISK)
rdd_list.append(main_rdd)
print "main model done"

# Apply the AdultService Model
adultservice_rdd = workflow.run_karma(
    escorts_rdd,
    github_base + '/datasets/ht/CDRv2/adultservice/ht-adultservice-model.ttl',
    "http://dig.isi.edu/ht/data/",
    "http://schema.dig.isi.edu/ontology/AdultService1",
    context_url,
    numFramerPartitions)
adultservice_rdd.persist(StorageLevel.MEMORY_AND_DISK)
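The remaining CDRv2 models are applied the same way, with each output appended to rdd_list. The combine-and-save step is not shown in this excerpt; a sketch of one plausible continuation, reusing reduce_rdds and save_file the way the other examples on this page do (and assuming reduce_rdds accepts the RDDs as additional positional arguments and that an outputFilename argument exists in the full script), is:

# Sketch only: merge the per-model outputs and write them out.
reduced_rdd = workflow.reduce_rdds(numFramerPartitions, *rdd_list)
reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
fileUtil.save_file(reduced_rdd, outputFilename + '/reduced_rdd', "sequence", "json")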
inputFilename = argv[1]
inputType = argv[2]
outputFilename = argv[3]
numPartitions = 1

workflow = Workflow(sc)
fileUtil = FileUtil(sc)

# Read input
inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

# 1. Apply the first karma Model
outputRDD1 = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
    num_partitions=numPartitions)

# 2. Apply the second Karma Model
outputRDD2 = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model2.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
    num_partitions=numPartitions)

# 3. Combine the data and then apply the Karma JSON Reducer
reducedRDD = workflow.reduce_rdds(numPartitions, outputRDD1, outputRDD2)
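The excerpt stops at the reduce step. Saving reducedRDD would follow the same save_file pattern as the other scripts on this page; the "text"/"json" choice below is illustrative, not part of the original:

# 4. Save the reduced output (illustrative file type and data type)
fileUtil.save_file(reducedRDD, outputFilename, "text", "json")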
# auto.offset.reset: smallest -> read all data already on the queue
#                    largest  -> start reading from now (the largest offset)
# You can also omit auto.offset.reset; that starts at the largest offset.
kvs = KafkaUtils.createDirectStream(ssc, topics, {
    "metadata.broker.list": brokers,
    "auto.offset.reset": "smallest"
})
kvs.pprint()

# Apply the karma Model
workflow = Workflow(sc)
inputDStream = kvs.map(lambda x: ("karma", json.loads(x[1])))
outputDStream = inputDStream.transform(lambda rdd: workflow.run_karma(
    rdd,
    "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"))
# outputDStream = inputDStream

# Save the karma output to Elasticsearch
final = outputDStream.transform(lambda rdd: workflow.save_rdd_to_es(
    rdd, "localhost", "9200", "karma/Webpage"))
final.pprint()

# Start streaming
ssc.start()
ssc.awaitTermination()
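The streaming example assumes that ssc, topics, and brokers already exist. A minimal sketch of that setup, using the standard Spark Streaming Kafka API of that era (the batch interval and argument handling here are assumptions), is:

import json
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="karmaKafkaStreaming")
ssc = StreamingContext(sc, 10)   # 10-second micro-batches (assumed)

brokers = sys.argv[1]            # e.g. "localhost:9092" (assumed)
topics = [sys.argv[2]]           # list of Kafka topics to read (assumed)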
inputFilename = argv[1]
inputFileType = argv[2]
inputDataType = argv[3]
numPartitions = int(argv[4])
outputFilename = argv[5]

fileUtil = FileUtil(sc)
workflow = Workflow(sc)

# 1. Read the input
inputRDD = fileUtil.load_file(inputFilename, inputFileType,
                              inputDataType).partitionBy(numPartitions)

# 2. Apply the karma Model
contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
outputRDD_karma = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    contextUrl,
    num_partitions=numPartitions,
    data_type=inputDataType)

# 3. Apply the context
outputRDD = workflow.apply_context(outputRDD_karma, contextUrl)

# 4. Save the output
fileUtil.save_file(outputRDD, outputFilename, "text", "json")
sys.exit(0)