示例#1
0
'''

if __name__ == "__main__":
    # Driver script: start a Spark application named "karmaCSV", read a CSV
    # input and run it through a Karma model. Two positional command-line
    # arguments are read: the input path (argv[1]) and the output path
    # (argv[2]). The output path is not used in the lines visible here.
    sc = SparkContext(appName="karmaCSV")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename, outputFilename = argv[1], argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Step 1: load the input.
    inputRDD = workflow.batch_read_csv(inputFilename)

    # Step 2: apply the Karma model.
    # NOTE(review): the roles suggested by the names below (model, base URI,
    # root, context) are inferred from the literals themselves -- confirm
    # against the signature of Workflow.run_karma.
    model_url = "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl"
    base_uri = "http://dig.isi.edu/data"
    root_uri = "http://schema.org/WebPage1"
    context_url = "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json"
    karma_settings = {
        "karma.input.delimiter": "\t",  # input fields are tab-separated
        "karma.output.format": "n3",
    }
    outputRDD = workflow.run_karma(
        inputRDD,
        model_url,
        base_uri,
        root_uri,
        context_url,
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings=karma_settings,
    )
                        default="9200",
                        required=False)
    # Fragment of a command-line driver. The parser object, the options behind
    # args.input / args.partitions / args.output, and `workflow` are all
    # created above the visible lines.
    # "ES" in the help strings presumably means Elasticsearch -- confirm.
    parser.add_argument("-x", "--index", help="ES Index name", required=True)
    parser.add_argument("-d",
                        "--doctype",
                        help="ES Document types",
                        required=True)
    args = parser.parse_args()
    print("Got arguments:", args)

    # Pull the parsed options into locals; string options are stripped of
    # surrounding whitespace.
    # NOTE(review): `input` shadows the builtin of the same name.
    input = args.input.strip()
    numPartitions = int(args.partitions)
    doc_type = args.doctype.strip()
    outputFilename = args.output.strip()
    # NOTE(review): under Python 3 `/` yields a float here; if this value is
    # later used as a partition count it likely needs `//` -- confirm which
    # interpreter this script targets. It is not used in the visible lines.
    numFramerPartitions = numPartitions / 2
    # 1. Read the input
    inputRDD = workflow.batch_read_csv(input)
    outputFileType = "sequence"

    #2. Apply the karma Model
    # The input is treated as comma-delimited CSV (see additional_settings).
    # The four positional strings look like model URL, base URI, root type and
    # context URL -- inferred from the literals; confirm against run_karma.
    reduced_rdd = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/effect-alignment/master/models/ransonware/ransomware-model.ttl",
        "http://effect.isi.edu/data/",
        "http://schema.dig.isi.edu/ontology/Malware1",
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={"karma.input.delimiter": ","})
    if reduced_rdd is not None:
        reduced_rdd = reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
        fileUtil.save_file(reduced_rdd, outputFilename + '/reduced_rdd',
    # Fragment of a Python 2 driver (it uses the `print` statement). `sc`,
    # `args` and `c_options` are defined above the visible lines.
    print "Got options:", c_options

    java_import(sc._jvm, "edu.isi.karma")
    # Positional arguments, in order: main input, country input, output
    # location, city alternate-name input.
    inputFilename = args[0]
    input_country = args[1]
    outputFilename = args[2]
    city_alternate_name_input = args[3]
    # Context URLs; only city_context is used in the visible lines.
    city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json"
    state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json"
    country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json"

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd =  workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(city_alternate_name_input)
    # Debug output: show the first record of the alternate-name input.
    print input_alternate_city_rdd.first()

    # Redistribute the main input across 10 partitions.
    # NOTE(review): assumes batch_read_csv returns a key/value pair RDD, which
    # partitionBy requires -- confirm.
    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    # The input is treated as tab-delimited CSV; the
    # "rdf.generation.disable.nesting" setting is passed as the string "false".
    cityRDD1 = workflow.run_karma(inputRDD_partitioned,
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
                                   "http://dig.isi.edu/geonames",
                                   "http://schema.org/City1",
                                   city_context,
                                   data_type="csv",
                                   additional_settings={"karma.input.delimiter":"\t", "rdf.generation.disable.nesting":"false"})
    # Fragment of a Python 2 driver (it uses the `print` statement). `sc`,
    # `args` and `c_options` are defined above the visible lines.
    print "Got options:", c_options

    java_import(sc._jvm, "edu.isi.karma")
    # Positional arguments, in order: main input, country input, output
    # location, city alternate-name input.
    inputFilename = args[0]
    input_country = args[1]
    outputFilename = args[2]
    city_alternate_name_input = args[3]
    # Context URLs for the city, state and country models.
    city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json"
    state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json"
    country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json"

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd = workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(
        city_alternate_name_input)
    # Debug output: show the first record of the alternate-name input.
    print input_alternate_city_rdd.first()

    # Redistribute the main input across 10 partitions.
    # NOTE(review): assumes batch_read_csv returns a key/value pair RDD, which
    # partitionBy requires -- confirm.
    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    cityRDD1 = workflow.run_karma(
        inputRDD_partitioned,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
        "http://dig.isi.edu/geonames",
        "http://schema.org/City1",
        city_context,
        data_type="csv",