    --archives ../karma.zip \
    --py-files ../lib/python-lib.zip \
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaContextWorkflow.py \
    ../sample-data/sample-unicode-jsonld.json text \
    https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
    ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaContextWorkflow")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    contextUrl = argv[3]
    outputFilename = argv[4]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # Apply the context
    outputRDD = workflow.apply_context(inputRDD, contextUrl)

    # Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
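    # Optional sanity check (a sketch, not part of the original workflow): peek at
    # a couple of contextualized records; this assumes the RDD holds
    # (key, JSON document) pairs as produced upstream.
    for key, doc in outputRDD.take(2):
        print key, doc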
    --py-files ../lib/python-lib.zip \
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaWorkflowCSV.py ../sample-data/sample-unicode.txt ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaCSV")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    # 2. Apply the Karma model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/data",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={
            "karma.input.delimiter": "\t",
    --conf "spark.driver.extraClassPath=/home/hadoop/effect-workflows/lib/karma-spark-0.0.1-SNAPSHOT-shaded.jar" \
    --py-files /home/hadoop/effect-workflows/lib/python-lib.zip \
    --archives /home/hadoop/effect-workflows/karma.zip \
    /home/hadoop/effect-workflows/effectWorkflow.py \
    cdr hdfs://ip-172-31-19-102/user/effect/data/cdr-framed sequence 10
'''

context_url = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json"
base_uri = "http://effect.isi.edu/data/"

if __name__ == "__main__":
    sc = SparkContext()
    conf = SparkConf()
    java_import(sc._jvm, "edu.isi.karma")

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    parser = ArgumentParser()
    parser.add_argument("-i", "--input", help="input folder", required=True)
    parser.add_argument("-o", "--output", help="output folder", required=True)
    parser.add_argument("-n", "--partitions", help="number of partitions", required=False, default=20)
    parser.add_argument("-t", "--host", help="ES hostname", default="localhost", required=False)
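    # Sketch (not from the original file): once all options are defined, argparse
    # results are typically consumed like this; the local variable names below are
    # illustrative, and args.partitions is cast because command-line values arrive
    # as strings.
    args = parser.parse_args()
    input_path = args.input
    output_path = args.output
    num_partitions = int(args.partitions)
    es_host = args.host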
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = args[0]
    input_country = args[1]
    outputFilename = args[2]
    city_alternate_name_input = args[3]

    city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json"
    state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json"
    country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json"

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd = workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(city_alternate_name_input)
    print input_alternate_city_rdd.first()

    inputRDD_partitioned = inputRDD.partitionBy(10)

    # 2. Apply the Karma model
    cityRDD1 = workflow.run_karma(
        inputRDD_partitioned,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
        "http://dig.isi.edu/geonames",
        "http://schema.org/City1",
        city_context,
    numPartitions = int(argv[3])
    outputFilename = argv[4]
    loadelasticsearch = argv[5] == "True"
    es_server = argv[6]
    es_port = argv[7]
    es_index = argv[8]

    # After applying karma, we would like to reduce the number of partitions
    numFramerPartitions = max(10, numPartitions / 10)

    github_base = 'https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0'
    context_url = github_base + '/karma/karma-context.json'

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)
    rdd_list = list()

    # Read the input data
    escorts_rdd = inputRDD = fileUtil.load_file(
        inputFilename, inputType, "json").partitionBy(numPartitions)
    escorts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

    # Apply the main model
    main_rdd = workflow.run_karma(
        escorts_rdd,
        github_base + '/datasets/ht/CDRv2/main/ht-main-model.ttl',
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/Offer1",
        context_url,
        numFramerPartitions
    )

    # Partition the output data by numFramerPartitions and the rest
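    # Sketch: once the final RDD is assembled, the Elasticsearch settings parsed
    # above could be used to index it; save_rdd_to_es and its (host, port,
    # "index/doc_type") argument order are assumed from the Kafka streaming
    # workflow elsewhere in this repo, and the "WebPage" doc type is illustrative.
    if loadelasticsearch:
        workflow.save_rdd_to_es(main_rdd, es_server, es_port, es_index + "/WebPage")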
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaReduceWorkflow.py \
    ../sample-data/sample-unicode.json text ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaReduceWorkflow")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    outputFilename = argv[3]
    numPartitions = 1

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # 1. Apply the first Karma model
    outputRDD1 = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions)

    # 2. Apply the second Karma model
    #                                         {"metadata.broker.list": brokers})
    # This reads everything on all partitions of the queue.
    # auto.offset.reset: smallest -> read all data already on the queue
    #                    largest  -> start reading from now, i.e. from the largest offset.
    # Omitting auto.offset.reset also starts at the largest offset.
    kvs = KafkaUtils.createDirectStream(ssc, topics, {
        "metadata.broker.list": brokers,
        "auto.offset.reset": "smallest"
    })
    kvs.pprint()

    # Apply the Karma model
    workflow = Workflow(sc)
    inputDStream = kvs.map(lambda x: ("karma", json.loads(x[1])))
    outputDStream = inputDStream.transform(lambda rdd: workflow.run_karma(
        rdd,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    ))
    # outputDStream = inputDStream

    final = outputDStream.transform(lambda rdd: workflow.save_rdd_to_es(
        rdd, "localhost", "9200", "karma/Webpage"))
    final.pprint()

    # Start streaming
    ssc.start()
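    # Block until the streaming context is stopped, so the job keeps consuming
    # batches; this is the standard Spark Streaming pattern (a sketch, not taken
    # from the original file).
    ssc.awaitTermination()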
    karmaWorkflow.py ../sample-data/part-00002-seq sequence json 1 ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karma")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputFileType = argv[2]
    inputDataType = argv[3]
    numPartitions = int(argv[4])
    outputFilename = argv[5]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = fileUtil.load_file(inputFilename, inputFileType, inputDataType).partitionBy(numPartitions)

    # 2. Apply the Karma model
    contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    outputRDD_karma = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        contextUrl,
        num_partitions=numPartitions,
        data_type=inputDataType)
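    # 3. Save the output (a sketch; assumes FileUtil.save_file takes the same
    #    arguments as in karmaContextWorkflow.py above).
    fileUtil.save_file(outputRDD_karma, outputFilename, "text", "json")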