Example #1
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    # allocate output_path, and clean it
    output_path = get_s3_working_dir(settings, "output_path")
    s3_delete(output_path, settings)

    # Prepare working directory
    hr.hdfs_clean_working_dir()
    temp_path = hr.get_hdfs_working_dir("temp")

    # build parameters for hadoop job
    jar_file = "mahout-core-1.0-SNAPSHOT-job.jar"
    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params_str = " ".join(
        ["%s=%s" % (k, v) for k, v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["fs.s3n.awsAccessKeyId"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_ID
    jar_defs["fs.s3n.awsSecretAccessKey"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_SECRET
    jar_defs["fs.s3.awsAccessKeyId"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_ID
    jar_defs["fs.s3.awsSecretAccessKey"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_SECRET
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs["mapreduce.output.fileoutputformat.compress"] = "false"
    jar_defs_str = " ".join(["-D %s=%s" % (k, v) for k, v in jar_defs.items()])

    other_args = OrderedDict()
    other_args["similarityClassname"] = "SIMILARITY_EUCLIDEAN_DISTANCE"
    other_args["input"] = settings.Input.ratings.as_datasource['URL']
    other_args["usersFile"] = settings.Input.usersFile.as_datasource['URL']
    other_args["output"] = output_path
    other_args["tempDir"] = temp_path
    other_args_str = " ".join(
        ["--%s %s" % (k, v) for k, v in other_args.items()])

    cmd_str = "%s hadoop jar %s org.apache.mahout.cf.taste.hadoop.item.RecommenderJob %s %s" % (
        hadoop_params_str, jar_file, jar_defs_str, other_args_str)
    print("Executing:")
    print(cmd_str)
    ret = cmd(cmd_str)
    if ret != 0:
        print("Job failed")
        sys.exit(ret)

    settings.Output.output_path.val = output_path
    print("Done")
Example #2
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    # allocate output_path, and clean it
    output_path = get_s3_working_dir(settings, "output_path")
    s3_delete(output_path, settings)

    # Prepare working directory
    hr.hdfs_clean_working_dir()
    temp_path = hr.get_hdfs_working_dir("temp")

    # build parameters for hadoop job
    jar_file = "mahout-core-1.0-SNAPSHOT-job.jar"
    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params_str = " ".join(["%s=%s" % (k, v) for k, v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["fs.s3n.awsAccessKeyId"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_ID
    jar_defs["fs.s3n.awsSecretAccessKey"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_SECRET
    jar_defs["fs.s3.awsAccessKeyId"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_ID
    jar_defs["fs.s3.awsSecretAccessKey"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_SECRET
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs["mapreduce.output.fileoutputformat.compress"] = "false"
    jar_defs_str = " ".join(["-D %s=%s" % (k, v) for k, v in jar_defs.items()])

    other_args = OrderedDict()
    other_args["similarityClassname"] = "SIMILARITY_EUCLIDEAN_DISTANCE"
    other_args["input"] = settings.Input.ratings.as_datasource["URL"]
    other_args["usersFile"] = settings.Input.usersFile.as_datasource["URL"]
    other_args["output"] = output_path
    other_args["tempDir"] = temp_path
    other_args_str = " ".join(["--%s %s" % (k, v) for k, v in other_args.items()])

    cmd_str = "%s hadoop jar %s org.apache.mahout.cf.taste.hadoop.item.RecommenderJob %s %s" % (
        hadoop_params_str,
        jar_file,
        jar_defs_str,
        other_args_str,
    )
    print("Executing:")
    print(cmd_str)
    ret = cmd(cmd_str)
    if ret != 0:
        print("Job failed")
        sys.exit(ret)

    settings.Output.output_path.val = output_path
    print("Done")
Example #3
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    # Prepare working directory
    hr.hdfs_clean_working_dir()
    # allocate temp_path
    temp_path = hr.get_hdfs_working_dir("temp")
    # allocate output_path
    output_path = hr.get_hdfs_working_dir("output_path")

    # build parameters for hadoop job
    jar_file = "./mahout-core-1.0-SNAPSHOT-job.jar"
    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params_str = " ".join(
        ["%s=%s" % (k, v) for k, v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs["mapreduce.output.fileoutputformat.compress"] = "false"
    jar_defs_str = " ".join(["-D %s=%s" % (k, v) for k, v in jar_defs.items()])

    other_args = OrderedDict()
    other_args["similarityClassname"] = "SIMILARITY_EUCLIDEAN_DISTANCE"
    other_args["input"] = settings.Input.ratings.val
    other_args["usersFile"] = settings.Input.usersFile.val
    other_args["output"] = output_path
    other_args["tempDir"] = temp_path
    other_args_str = " ".join(
        ["--%s %s" % (k, v) for k, v in other_args.items()])

    line_num = get_the_line_of_transaction(settings.Input.ratings.val)

    if line_num > 0:
        cmd_str = "%s hadoop jar %s org.apache.mahout.cf.taste.hadoop.item.RecommenderJob %s %s" % (
            hadoop_params_str, jar_file, jar_defs_str, other_args_str)
        print("Executing:")
        print(cmd_str)
        ret = cmd(cmd_str)
        if ret != 0:
            print("Job failed")
            sys.exit(ret)
    else:
        print "Collaborative Input Transaction Matrix is empty. Skip the calcuating."
    settings.Output.cl_result.val = output_path

    print("Done")
Example #4
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    # Prepare working directory
    hr.hdfs_clean_working_dir()
    # allocate temp_path
    temp_path = hr.get_hdfs_working_dir("temp")
    # allocate output_path
    output_path = hr.get_hdfs_working_dir("output_path")
    
    # build parameters for hadoop job
    jar_file = "./mahout-core-1.0-SNAPSHOT-job.jar"
    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params_str = " ".join(["%s=%s" % (k,v) for k,v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs["mapreduce.output.fileoutputformat.compress"] = "false"
    jar_defs_str = " ".join(["-D %s=%s" % (k,v) for k,v in jar_defs.items()])

    other_args = OrderedDict()
    other_args["similarityClassname"] = "SIMILARITY_EUCLIDEAN_DISTANCE"
    other_args["input"] = settings.Input.ratings.val
    other_args["usersFile"] = settings.Input.usersFile.val
    other_args["output"] = output_path
    other_args["tempDir"] = temp_path
    other_args_str = " ".join(["--%s %s" % (k,v) for k,v in other_args.items()])
    
    line_num = get_the_line_of_transaction(settings.Input.ratings.val)

    if line_num > 0:
        cmd_str = "%s hadoop jar %s org.apache.mahout.cf.taste.hadoop.item.RecommenderJob %s %s" % (
            hadoop_params_str, jar_file, jar_defs_str, other_args_str)
        print("Executing:")
        print(cmd_str)
        ret = cmd(cmd_str)
        if ret != 0:
            print("Job failed")
            sys.exit(ret)
    else:
        print "Collaborative Input Transaction Matrix is empty. Skip the calcuating."   
    settings.Output.cl_result.val = output_path

    print("Done")
Example #5
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    ds = json.load(open(settings.Input.DS))

    if ds['Type'] != "AWS_S3":
        raise ValueError("Invalid data_source type: '%s'" % ds['Type'])

    # Prepare working directory
    hr.hdfs_clean_working_dir()
    output_dir = hr.get_hdfs_working_dir("sentiment_result")
    settings.Output.sentiment_result.val = output_dir

    AWS_ACCESS_KEY_ID = ds['Meta']['key']
    AWS_SECRET_ACCESS_KEY = ds['Meta']['token']

    # Execute "hadoop jar"
    jar_file = "HelloAvro-1.1-jar-with-dependencies.jar"
    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params["AWS_ACCESS_KEY_ID"] = ds['Meta']['key']
    hadoop_params["AWS_SECRET_ACCESS_KEY"] = ds['Meta']['token']
    hadoop_params_str = " ".join(
        ["%s=%s" % (k, v) for k, v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["fs.s3n.awsAccessKeyId"] = '"%s"' % AWS_ACCESS_KEY_ID
    jar_defs["fs.s3n.awsSecretAccessKey"] = '"%s"' % AWS_SECRET_ACCESS_KEY
    jar_defs["fs.s3.awsAccessKeyId"] = '"%s"' % AWS_ACCESS_KEY_ID
    jar_defs["fs.s3.awsSecretAccessKey"] = '"%s"' % AWS_SECRET_ACCESS_KEY
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs_str = " ".join(["-D %s=%s" % (k, v) for k, v in jar_defs.items()])

    cmd_str = '%s hadoop jar %s %s %s %s' % (
        hadoop_params_str, jar_file, jar_defs_str, ds['URL'], output_dir)
    print("Executing:")
    print(cmd_str)
    ret = cmd(cmd_str)
    print("exit code = %d" % ret)
    sys.exit(ret)
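
Examples #5 and #6 read the datasource description from the JSON file named by settings.Input.DS. From the fields the code accesses (Type, URL, Meta.key, Meta.token), the parsed structure presumably looks like the following; every value here is a placeholder, not a real credential:

# Shape of the parsed datasource implied by the reads above.
ds = {
    "Type": "AWS_S3",
    "URL": "s3n://example-bucket/input/",
    "Meta": {
        "key": "<AWS access key id>",
        "token": "<AWS secret access key>",
    },
}

Note that, unlike the earlier examples, these two propagate the job's exit code unconditionally via sys.exit(ret) rather than exiting only on failure.
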
Example #6
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    ds = json.load(open(settings.Input.DS))

    if ds['Type'] != "AWS_S3":
        raise ValueError("Invalid data_source type: '%s'" % ds['Type'])

    # Prepare working directory
    hr.hdfs_clean_working_dir()
    output_dir = hr.get_hdfs_working_dir("sentiment_result")
    settings.Output.sentiment_result.val = output_dir

    AWS_ACCESS_KEY_ID = ds['Meta']['key']
    AWS_SECRET_ACCESS_KEY = ds['Meta']['token']

    # Execute "hadoop jar"
    jar_file = "HelloAvro-1.1-jar-with-dependencies.jar"
    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params["AWS_ACCESS_KEY_ID"] = ds['Meta']['key']
    hadoop_params["AWS_SECRET_ACCESS_KEY"] = ds['Meta']['token']
    hadoop_params_str = " ".join(["%s=%s" % (k,v) for k,v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["fs.s3n.awsAccessKeyId"] = '"%s"' % AWS_ACCESS_KEY_ID
    jar_defs["fs.s3n.awsSecretAccessKey"] = '"%s"' % AWS_SECRET_ACCESS_KEY
    jar_defs["fs.s3.awsAccessKeyId"] = '"%s"' % AWS_ACCESS_KEY_ID
    jar_defs["fs.s3.awsSecretAccessKey"] = '"%s"' % AWS_SECRET_ACCESS_KEY
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs_str = " ".join(["-D %s=%s" % (k,v) for k,v in jar_defs.items()])

    cmd_str = '%s hadoop jar %s %s %s %s' % (hadoop_params_str, jar_file, jar_defs_str, ds['URL'], output_dir)
    print("Executing:")
    print(cmd_str)
    ret = cmd(cmd_str)
    print("exit code = %d" % ret)
    sys.exit(ret)