def testTokenizer(sc, input_dir, output_dir, config, limit=None,
                  input_file_format="sequence", input_data_type="json",
                  output_file_format="sequence", output_data_type="json",
                  **kwargs):
    print(limit)
    futil = FileUtil(sc)

    # LOAD DATA
    # Fall back to input_file_format when no file_format is passed through kwargs.
    rdd_ingest = futil.load_file(input_dir,
                                 file_format=kwargs.get("file_format", input_file_format),
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    ## TOKENIZE
    # tokOptions = {"file_format": input_file_format,
    #               "data_type": input_data_type}
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized, output_dir,
                    file_format=output_file_format,
                    data_type=output_data_type,
                    **outOptions)
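# --- Hypothetical usage sketch for testTokenizer. The paths and config file name are
# --- illustrative assumptions, not part of the original code; it assumes a live
# --- SparkContext and passes file_format through kwargs so load_file picks it up.
from pyspark import SparkContext

sc = SparkContext(appName="testTokenizerExample")
testTokenizer(sc, "input/webpages_seq", "output/webpages_tokens", "tokenizer_config.json",
              file_format="sequence", data_type="json")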
def preprocess(sc, inputDir, file_format, outputDir):
    """
    Reads the offer file and creates the vertex RDD and edge RDD required for GraphX.
    vertexrdd: node uri and type
    edgerdd: node a, node b, edge type
    :param inputDir:
    :param file_format:
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir, file_format=file_format, data_type='json')

    vertexrdd = inputrdd.flatMapValues(lambda x: nodes_mapper(x))
    # rdd = vertexrdd.foreach(lambda (x, y): f(x, y))
    edgerdd = inputrdd.flatMapValues(lambda x: edges_mapper(x))

    fileUtil.save_file(vertexrdd, outputDir + 'vertex', file_format='text', data_type='json')
    fileUtil.save_file(edgerdd, outputDir + 'edges', file_format='text', data_type='json')
def testLSH(sc, inputFilename, outputFilename, configFilename, **kwargs):
    '''
    kwargs is a dictionary of inputs. A sample input would look like:
    options = {
        "file_format": "sequence",
        "data_type": "json",
        "numHashes": 100,
        "numItemsInBand": 10,
        "computeSimilarity": True,
        "threshold": 0.8,
        "base": "saam-city.json",
        "topk": 3,
        "candidatesName": "candidates"
    }
    '''
    futil = FileUtil(sc)

    # Tokenize
    rdd_input = futil.load_file(inputFilename, file_format=kwargs['file_format'], data_type="json")
    rdd_input.setName('rdd_input')
    tokenizer = Tokenizer(configFilename, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_input)

    outOptions = {}
    # The tokens can be saved here with:
    # futil.save_file(rdd_tokenized, outputFilename, file_format='sequence', data_type='json', **outOptions)

    # Hashing
    hasher = Hasher(**kwargs)
    rdd_minHashes = hasher.perform(rdd_tokenized)
    rdd_minHashes.setName('rdd_minhashes')
    # futil.save_file(rdd_minHashes, outputFilename, file_format='sequence', data_type='json', **outOptions)

    # Clustering
    clusterer = Clusterer(**kwargs)
    rdd_clusters = clusterer.perform(rdd_minHashes)
    # futil.save_file(rdd_clusters, outputFilename, file_format='text', data_type='json', **outOptions)

    # Union-find
    unionFind = UnionFind(**kwargs)
    rdd_unionfind = unionFind.perform(rdd_clusters)

    # SAVE DATA
    futil.save_file(rdd_unionfind, outputFilename, file_format='text', data_type='json', **outOptions)
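# --- Hypothetical usage sketch for testLSH. The options dict mirrors the sample in the
# --- docstring above; the input/output paths and config file name are illustrative assumptions.
from pyspark import SparkContext

sc = SparkContext(appName="testLSHExample")
options = {
    "file_format": "sequence",
    "data_type": "json",
    "numHashes": 100,
    "numItemsInBand": 10,
    "computeSimilarity": True,
    "threshold": 0.8,
    "base": "saam-city.json",
    "topk": 3,
    "candidatesName": "candidates"
}
testLSH(sc, "input/saam-city_seq", "output/lsh_clusters", "tokenizer_config.json", **options)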
def testSignature(sc, inputFileName, outputFileName, file_format):
    config = {"runlimit": 5, "field": "title_signature"}
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName, file_format=file_format, data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)
    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
def testSignature(sc, inputFileName, outputFileName, file_format):
    """
    :param sc: SparkContext needed for Spark
    :param inputFileName: input dir to which cluster_id needs to be added
    :param outputFileName: output dir
    :param file_format: text/seq
    """
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName, file_format=file_format, data_type="json")
    addClusterId = AddClusterId()
    rdd = addClusterId.perform(rdd)
    fUtil.save_file(rdd, outputFileName, file_format='text', data_type='json')
def save_jsonlines(sc, rdd, output_dir, file_format='sequence', data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd, output_dir, file_format=file_format, data_type=data_type, separator=separator)
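# --- Hypothetical usage sketch for save_jsonlines. The (key, value) pairs and output path
# --- are illustrative; it assumes an RDD of JSON-serializable values keyed by URI, as in
# --- the other snippets here.
from pyspark import SparkContext

sc = SparkContext(appName="saveJsonlinesExample")
docs = sc.parallelize([
    ("http://dig.isi.edu/ht/data/1", {"title": "first doc"}),
    ("http://dig.isi.edu/ht/data/2", {"title": "second doc"}),
])
save_jsonlines(sc, docs, "output/jsonlines", file_format="text", data_type="json")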
        return decoder.line_to_predictions(ner_fea, Decoder(params), data,
                                           attribute_name, content_type)
    return data

    cdr_extractions_rdd = cdr_extractions_isi_rdd \
        .mapValues(lambda x: apply_bbn_extractor(x)) \
        .repartition(numPartitions) \
        .persist(StorageLevel.MEMORY_AND_DISK)
    cdr_extractions_rdd.setName("cdr_extractions")

    if args.incremental is True:
        if len(since) > 0:
            fileUtil.save_file(
                cdr_extractions_rdd,
                outputFilename + '/cdr_extractions/' + since,
                outputFileType, "json")
        else:
            fileUtil.save_file(
                cdr_extractions_rdd,
                outputFilename + '/cdr_extractions/initial',
                outputFileType, "json")
    else:
        fileUtil.save_file(cdr_extractions_rdd,
                           outputFilename + '/cdr_extractions',
                           outputFileType, "json")
else:
    if args.incremental is True:
        if len(since) > 0:
            cdr_extractions_rdd = sc.sequenceFile(
                outputFilename + '/cdr_extractions/' +
workflow = Workflow(sc)
fileUtil = FileUtil(sc)

# Read input
inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

#1. Apply the first Karma model
outputRDD1 = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
    num_partitions=numPartitions)

#2. Apply the second Karma model
outputRDD2 = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model2.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
    num_partitions=numPartitions)

#3. Combine the data and then apply the Karma JSON reducer
reducedRDD = workflow.reduce_rdds(numPartitions, outputRDD1, outputRDD2)

#4. Save the output
fileUtil.save_file(reducedRDD, outputFilename, "text", "json")
"name": "E82_Actor_Appellation", "uri": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation" }, { "name": "E67_Birth", "uri": "http://www.cidoc-crm.org/cidoc-crm/E67_Birth" }, { "name": "E69_Death", "uri": "http://www.cidoc-crm.org/cidoc-crm/E69_Death" }, { "name": "E52_Time-Span", "uri": "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span" }] frames = [{ "name": "npgConstituents", "url": "https://raw.githubusercontent.com/american-art/aac-alignment/master/frames/npgConsitituents.json-ld" }] framer_output = workflow.apply_framer(reducedRDD, types, frames, 5, 2) for frame_name in framer_output: outputRDD = workflow.apply_context(framer_output[frame_name], contextUrl) outputRDD_after = outputRDD.mapValues(mapFunc) if not outputRDD_after.isEmpty(): fileUtil.save_file(outputRDD_after, outputFilename + "/" + frame_name, 'text', 'json') print "Save to:", ("---" + frame_name) # workflow.save_rdd_to_es(outputRDD, es_server, es_port, es_index + "/" + frame_name)
def convert(sc, inputFileName, outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName, file_format="text", data_type="json")
    fileUtil.save_file(rdd, outputFileName)
dest="threshold", default=0.0, help="similarity threshold") parser.add_option("-e", "--base", dest="base", type="string", help="base file", default="") parser.add_option("-o", "--outputformat", dest="outputformat", type="string", help="output file format: text/sequence", default="text") parser.add_option("-y", "--outputtype", dest="outputtype", type="string", help="output type: csv/json", default="json") parser.add_option("-k", "--topk", dest="topk", type="int", help="top n matches", default=3) parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=10) parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string", help="name for json element for matching candidates", default="candidates") (c_options, args) = parser.parse_args() print "Got options:", c_options inputFilename = args[0] outputFilename = args[1] print "Save to:", outputFilename kwargs = as_dict(c_options) clusterer = Clusterer(**kwargs) fileUtil = FileUtil(sc) rdd = fileUtil.load_file(inputFilename,file_format='text') cluster_rdd = clusterer.compute_clusters(rdd) fileUtil.save_file(cluster_rdd,outputFilename,file_format='text')
#3. Save the output
# fileUtil.save_file(outputRDD, outputFilename, "text", "json")

#4. Reduce rdds
reducedRDD = workflow.reduce_rdds(numFramerPartitions, outputRDD)
reducedRDD.persist()

types = [
    {"name": "E39_Actor", "uri": "http://www.cidoc-crm.org/cidoc-crm/E39_Actor"},
    {"name": "E82_Actor_Appellation", "uri": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation"},
    {"name": "E67_Birth", "uri": "http://www.cidoc-crm.org/cidoc-crm/E67_Birth"},
    {"name": "E69_Death", "uri": "http://www.cidoc-crm.org/cidoc-crm/E69_Death"},
    {"name": "E52_Time-Span", "uri": "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span"}
]
frames = [
    {"name": "npgConstituents", "url": "https://raw.githubusercontent.com/american-art/aac-alignment/master/frames/npgConsitituents.json-ld"}
]

type_to_rdd_json = workflow.apply_partition_on_types(reducedRDD, types)

#5. Apply framer
framer_output = workflow.apply_framer(reducedRDD, type_to_rdd_json, frames, numFramerPartitions, 10)

for frame_name in framer_output:
    #6. Map function
    framer_output[frame_name] = framer_output[frame_name].mapValues(mapFunc)
    fileUtil.save_file(framer_output[frame_name], outputFilename + "/" + frame_name, 'text', 'json')
    print "Save to:", ("---" + frame_name)
default="sequence") parser.add_argument("-q", "--query", help="HIVE query to get data", default="", required=False) args = parser.parse_args() print("Got arguments:", args) inputTable = args.inputTable.strip() outputFilename = args.output.strip() outputFileType = args.outputtype.strip() hiveQuery = args.query.strip() numPartitions = int(args.partitions) numFramerPartitions = numPartitions / 2 if len(hiveQuery) > 0: cdr_data = workflow.load_cdr_from_hive_query(hiveQuery)\ .partitionBy(numPartitions) \ .persist(StorageLevel.MEMORY_AND_DISK) else: cdr_data = workflow.load_cdr_from_hive_table(inputTable) \ .partitionBy(numPartitions) \ .persist(StorageLevel.MEMORY_AND_DISK) cdr_data.setName("cdr_data") fileUtil.save_file(cdr_data, outputFilename, outputFileType, "json")
cityRDD = cityRDD.persist(StorageLevel.MEMORY_AND_DISK)
cityRDD.setName("cityRDD")

stateRDD = workflow.apply_context(stateRDD1, state_context)
stateRDD = stateRDD.persist(StorageLevel.MEMORY_AND_DISK)
stateRDD.setName("stateRDD")

countryRDD = workflow.apply_context(countryRDD1, country_context)
countryRDD = countryRDD.persist(StorageLevel.MEMORY_AND_DISK)
countryRDD.setName("countryRDD")

cityAlternateRDD = workflow.apply_context(city_alternate_names_rdd, city_context)
cityAlternateRDD = cityAlternateRDD.persist(StorageLevel.MEMORY_AND_DISK)
cityAlternateRDD.setName("cityAlternateRDD")
fileUtil.save_file(cityAlternateRDD, outputFilename + "_cityalternate", "text", "json")

city_reduced_rdd = workflow.reduce_rdds(10, cityRDD, cityAlternateRDD)
city_reduced_rdd = city_reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
city_reduced_rdd.setName("city_reduced_rdd")

# fileUtil.save_file(countryRDD, outputFilename+"_Country", "text", "json")
# fileUtil.save_file(city_reduced_rdd, outputFilename+"_City", "text", "json")
# fileUtil.save_file(stateRDD, outputFilename+"_State", "text", "json")

mergeRDD1 = EntityMerger.merge_rdds(city_reduced_rdd, "address.addressCountry", countryRDD, 10)
# fileUtil.save_file(mergeRDD1, outputFilename+"_State_Country", "text", "json")
mergeRDD2 = EntityMerger.merge_rdds(mergeRDD1, "address.addressRegion", stateRDD, 10)
inputRDD = workflow.batch_read_csv(input)
outputFileType = "sequence"

#2. Apply the Karma model
reduced_rdd = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/usc-isi-i2/effect-alignment/master/models/ransonware/ransomware-model.ttl",
    "http://effect.isi.edu/data/",
    "http://schema.dig.isi.edu/ontology/Malware1",
    "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json",
    num_partitions=numPartitions,
    data_type="csv",
    additional_settings={"karma.input.delimiter": ","})

if reduced_rdd is not None:
    reduced_rdd = reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
    fileUtil.save_file(reduced_rdd, outputFilename + '/reduced_rdd', "sequence", "json")
    reduced_rdd.setName("karma_out_reduced")

    types = [{
        "name": "AttackEvent",
        "uri": "http://schema.dig.isi.edu/ontology/AttackEvent"
    }, {
        "name": "EmailAddress",
        "uri": "http://schema.dig.isi.edu/ontology/EmailAddress"
    }, {
        "name": "GeoCoordinates",
        "uri": "http://schema.org/GeoCoordinates"
    }, {
        "name": "Organization",
        "uri": "http://schema.org/Organization"
    }, {
"url": github_base + "/frames/email.json" }] type_to_rdd_json = workflow.apply_partition_on_types(reduced_rdd, types) for type_name in type_to_rdd_json: type_to_rdd_json[type_name]["rdd"] = type_to_rdd_json[type_name][ "rdd"].persist(StorageLevel.MEMORY_AND_DISK) type_to_rdd_json[type_name]["rdd"].setName(type_name) framer_output = workflow.apply_framer(reduced_rdd, type_to_rdd_json, frames, numFramerPartitions, 10) # We have the framer output. Now we can save it to disk and load it into Elastic Search for frame_name in framer_output: framer_output[frame_name] = framer_output[frame_name].coalesce(numFramerPartitions)\ .persist(StorageLevel.MEMORY_AND_DISK) fileUtil.save_file(framer_output[frame_name], outputFilename + "/" + frame_name, "text", "json") if not framer_output[frame_name].isEmpty(): if loadelasticsearch: workflow.save_rdd_to_es(framer_output[frame_name], es_server, es_port, es_index + "/" + frame_name) reduced_rdd.unpersist() for type_name in type_to_rdd_json: type_to_rdd_json[type_name]["rdd"] = type_to_rdd_json[type_name][ "rdd"].unpersist() for frame_name in framer_output: framer_output[frame_name].unpersist()
    --archives ../karma.zip \
    --py-files ../lib/python-lib.zip \
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaContextWorkflow.py \
    ../sample-data/sample-unicode-jsonld.json text \
    https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
    ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaContextWorkflow")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    contextUrl = argv[3]
    outputFilename = argv[4]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # Apply the context
    outputRDD = workflow.apply_context(inputRDD, contextUrl)

    # Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
"http://www.cidoc-crm.org/cidoc-crm/E22_Man-Made_Object1", "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json", data_type="csv", additional_settings={"karma.input.delimiter":","}) #3. Save the output # fileUtil.save_file(outputRDD, outputFilename, "text", "json") reducedRDD = workflow.reduce_rdds(outputRDD) reducedRDD.persist() types = [ {"name": "E82_Actor_Appellation", "uri": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation"} ] frames = [ {"name": "AutryMakers", "url": "https://raw.githubusercontent.com/american-art/aac-alignment/master/frames/autryMakers.json-ld"} ] context = workflow.read_json_file(contextUrl) framer_output = workflow.apply_framer(reducedRDD, types, frames) for frame_name in framer_output: outputRDD = workflow.apply_context(framer_output[frame_name], context, contextUrl) #apply mapValues function outputRDD_after = outputRDD.mapValues(mapFunc) if not outputRDD_after.isEmpty(): fileUtil.save_file(outputRDD_after, outputFilename + "/" + frame_name, 'text', 'json') print "Save to:", ("---" + frame_name) # workflow.save_rdd_to_es(outputRDD, es_server, es_port, es_index + "/" + frame_name)
from pyspark import SparkContext, SparkConf, StorageLevel
from optparse import OptionParser
from digSparkUtil.fileUtil import FileUtil

if __name__ == '__main__':
    parser = OptionParser()
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    filename = args[0]
    file_format = args[1]
    out_filename = args[2]
    out_format = args[3]
    uris = args[4].split(",")

    print "Filename", filename, file_format
    print "Output:", out_filename, out_format
    print "Filter:", args[4]

    sc = SparkContext(appName="DIG-FILTER")
    conf = SparkConf()
    fileUtil = FileUtil(sc)

    input_rdd = fileUtil.load_file(filename, file_format, "json")
    output_rdd = input_rdd.filter(lambda x: x[0] in uris).coalesce(1)
    fileUtil.save_file(output_rdd, out_filename, out_format, "json")