usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath" \ "baseDataset baseDatasetFormat" \ "outputFilename outoutFileFormat" parser = OptionParser() parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t") parser.add_option("-p", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=10) (c_options, args) = parser.parse_args() inputFilename1 = args[0] inputFileFormat1 = args[1] inputPath = args[2] baseFilename = args[3] baseFormat = args[4] outputFilename = args[5] outputFileFormat = args[6] print "Got options:", c_options, ", " \ "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \ ", base:", baseFilename, ",", baseFormat print "Write output to:", outputFilename fileUtil = FileUtil(sc) input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options).partitionBy(c_options.numPartitions) base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options) result_rdd = EntityCleaner.clean_rdds(input_rdd1, inputPath, base_rdd, c_options.numPartitions) fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
# then pass in result to merge-rdds along with input-rdd or output-rdd if defined, #set output-rdd as result from merge # return output if __name__ == "__main__": sc = SparkContext(appName="DIG-FRAMER") parser = OptionParser() parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t") parser.add_option("-n", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=5) (c_options, args) = parser.parse_args() frameFilename = args[0] rddFilename = args[1] outputFilename = args[2] if len(args) > 3: outputFileFormat = args[3] else: outputFileFormat = "text" type_to_rdd_json_input = open(rddFilename) type_to_rdd_json = json.load(type_to_rdd_json_input) type_to_rdd_json_input.close() frame_input = open(frameFilename) frame = json.load(frame_input) frame_input.close() fileUtil = FileUtil(sc) for key, val in type_to_rdd_json.items(): val["rdd"] = fileUtil.load_json_file(val["path"], val["format"], c_options) output_rdd = frame_json(frame, type_to_rdd_json) print "Write output to:", outputFilename fileUtil.save_json_file(output_rdd, outputFilename, outputFileFormat, c_options)
inputFilename1 = args[0] inputFileFormat1 = args[1] inputPath = args[2] baseFilename = args[3] baseFormat = args[4] joinResultFilename = args[5] joinFormat = args[6] outputFilename = args[7] outputFileFormat = args[8] removeElementsStr = c_options.remove removeElements = [] if len(removeElementsStr) > 0: removeElements = removeElementsStr.split(",") print "Got options:", c_options, ", " \ "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \ ", base:", baseFilename, ",", baseFormat, ", join:", joinResultFilename print "Write output to:", outputFilename fileUtil = FileUtil(sc) input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options) base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options) join_rdd = fileUtil.load_json_file(joinResultFilename, joinFormat, c_options) result_rdd = EntityMerger.merge_rdds(input_rdd1, inputPath, base_rdd, join_rdd, removeElements, c_options.numPartitions) fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
#!/usr/bin/env python from pyspark import SparkContext from optparse import OptionParser from fileUtil import FileUtil if __name__ == "__main__": sc = SparkContext(appName="DIG-TEXT-TO-SEQ") usage = "usage: %prog [options] inputDataset outputFilename" parser = OptionParser() parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t") (c_options, args) = parser.parse_args() print "Got options:", c_options inputFilename1 = args[0] outputFilename = args[1] print "Write output to:", outputFilename fileUtil = FileUtil(sc) input_rdd = fileUtil.load_json_file(inputFilename1, "text", c_options) print "Write output to:", outputFilename fileUtil.save_json_file(input_rdd, outputFilename, "sequence", c_options)
else: seen_objs.add(json.dumps(part)) return input_json if __name__ == "__main__": sc = SparkContext(appName="DIG-ENTITY_DEDUPLICATOR") usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath " \ "outputFilename outoutFileFormat" parser = OptionParser() parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t") (c_options, args) = parser.parse_args() print "Got options:", c_options inputFilename = args[0] inputFileFormat = args[1] inputPath = args[2] print "Read ", inputFileFormat, " file from ", inputFilename, " with path:", inputPath outputFilename = args[3] outputFileFormat = args[4] print "Write output to:", outputFilename fileUtil = FileUtil(sc) input_rdd = fileUtil.load_json_file(inputFilename, inputFileFormat, c_options) result_rdd = input_rdd.mapValues(lambda x: EntityDeduplicator().deduplicate(x, inputPath)) fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
from optparse import OptionParser from fileUtil import FileUtil import json if __name__ == "__main__": sc = SparkContext(appName="DIG-ENTITY_MERGER") usage = "usage: %prog [options] inputDataset inputDatasetFormat" \ "outputFilename" parser = OptionParser() parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t") (c_options, args) = parser.parse_args() inputFilename1 = args[0] inputFileFormat1 = args[1] outputFilename = args[2] print "Got options:", c_options, ",input:", inputFilename1 + ", output:", outputFilename fileUtil = FileUtil(sc) input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options) def write_result(x): key = x[0] #print "Got key:", key return json.dumps({"uri":key, "matches":[{"uri": key}]}) result = input_rdd1.map(write_result) result.saveAsTextFile(outputFilename)