import sys
from collections import OrderedDict

# HadoopRuntime and the cmd()/S3 helpers are imported from the project's runtime
# module in the original scripts (not shown here).


def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    # Allocate output_path on S3, and clean it
    output_path = get_s3_working_dir(settings, "output_path")
    s3_delete(output_path, settings)

    # Prepare the HDFS working directory
    hr.hdfs_clean_working_dir()
    temp_path = hr.get_hdfs_working_dir("temp")

    # Build parameters for the Hadoop job
    jar_file = "mahout-core-1.0-SNAPSHOT-job.jar"

    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params_str = " ".join(
        ["%s=%s" % (k, v) for k, v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["fs.s3n.awsAccessKeyId"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_ID
    jar_defs["fs.s3n.awsSecretAccessKey"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_SECRET
    jar_defs["fs.s3.awsAccessKeyId"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_ID
    jar_defs["fs.s3.awsSecretAccessKey"] = '"%s"' % settings.Param.AWS_ACCESS_KEY_SECRET
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs["mapreduce.output.fileoutputformat.compress"] = "false"
    jar_defs_str = " ".join(["-D %s=%s" % (k, v) for k, v in jar_defs.items()])

    other_args = OrderedDict()
    other_args["similarityClassname"] = "SIMILARITY_EUCLIDEAN_DISTANCE"
    other_args["input"] = settings.Input.ratings.as_datasource["URL"]
    other_args["usersFile"] = settings.Input.usersFile.as_datasource["URL"]
    other_args["output"] = output_path
    other_args["tempDir"] = temp_path
    other_args_str = " ".join(
        ["--%s %s" % (k, v) for k, v in other_args.items()])

    cmd_str = "%s hadoop jar %s org.apache.mahout.cf.taste.hadoop.item.RecommenderJob %s %s" % (
        hadoop_params_str, jar_file, jar_defs_str, other_args_str)
    print("Executing:")
    print(cmd_str)
    ret = cmd(cmd_str)
    if ret != 0:
        print("Job failed")
        sys.exit(ret)

    settings.Output.output_path.val = output_path
    print("Done")
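# --- Hedged sketch, not the project's actual helpers ---
# get_s3_working_dir() and s3_delete() are used by the variant above but are not
# defined in these excerpts.  The sketch below is one plausible implementation
# using boto3; the "s3_working_root" parameter name is an assumption for
# illustration, not something the original spec.json is known to contain.
import boto3  # assumption: boto3 is available on the driver machine
from urllib.parse import urlparse


def get_s3_working_dir(settings, name):
    # Join an (assumed) configured S3 working root with a per-output subdirectory.
    return "%s/%s" % (settings.Param.s3_working_root.rstrip("/"), name)


def s3_delete(url, settings):
    # Remove every object under the given s3:// prefix so the job can recreate it.
    parsed = urlparse(url)
    s3 = boto3.resource(
        "s3",
        aws_access_key_id=settings.Param.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.Param.AWS_ACCESS_KEY_SECRET,
    )
    bucket = s3.Bucket(parsed.netloc)
    bucket.objects.filter(Prefix=parsed.path.lstrip("/")).delete()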
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    # Prepare the HDFS working directory
    hr.hdfs_clean_working_dir()
    # Allocate temp_path
    temp_path = hr.get_hdfs_working_dir("temp")
    # Allocate output_path
    output_path = hr.get_hdfs_working_dir("output_path")

    # Build parameters for the Hadoop job
    jar_file = "./mahout-core-1.0-SNAPSHOT-job.jar"

    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params_str = " ".join(
        ["%s=%s" % (k, v) for k, v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs["mapreduce.output.fileoutputformat.compress"] = "false"
    jar_defs_str = " ".join(["-D %s=%s" % (k, v) for k, v in jar_defs.items()])

    other_args = OrderedDict()
    other_args["similarityClassname"] = "SIMILARITY_EUCLIDEAN_DISTANCE"
    other_args["input"] = settings.Input.ratings.val
    other_args["usersFile"] = settings.Input.usersFile.val
    other_args["output"] = output_path
    other_args["tempDir"] = temp_path
    other_args_str = " ".join(
        ["--%s %s" % (k, v) for k, v in other_args.items()])

    # Only run the Mahout job when the transaction matrix has at least one record
    line_num = get_the_line_of_transaction(settings.Input.ratings.val)
    if line_num > 0:
        cmd_str = "%s hadoop jar %s org.apache.mahout.cf.taste.hadoop.item.RecommenderJob %s %s" % (
            hadoop_params_str, jar_file, jar_defs_str, other_args_str)
        print("Executing:")
        print(cmd_str)
        ret = cmd(cmd_str)
        if ret != 0:
            print("Job failed")
            sys.exit(ret)
    else:
        print("Collaborative Input Transaction Matrix is empty. Skipping the calculation.")

    settings.Output.cl_result.val = output_path
    print("Done")
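# --- Hedged sketch, not the project's actual helper ---
# get_the_line_of_transaction() gates the Mahout job above: when the ratings
# input is empty, the job is skipped.  One plausible implementation counts the
# records of the HDFS file by shelling out to `hadoop fs -cat`; the project's
# real helper may read the data differently.
import subprocess


def get_the_line_of_transaction(hdfs_path):
    # Count the lines of the transaction (ratings) file; 0 means nothing to compute.
    p = subprocess.Popen(
        "hadoop fs -cat %s | wc -l" % hdfs_path,
        shell=True,
        stdout=subprocess.PIPE,
    )
    out, _ = p.communicate()
    if p.returncode != 0:
        return 0
    return int(out.strip() or 0)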
def main():
    hr = HadoopRuntime("spec.json")
    settings = hr.settings
    print(settings)

    # Load and validate the data source descriptor
    ds = json.load(open(settings.Input.DS))
    if ds["Type"] != "AWS_S3":
        raise ValueError("Invalid data_source type: '%s'" % ds["Type"])

    # Prepare the HDFS working directory
    hr.hdfs_clean_working_dir()
    output_dir = hr.get_hdfs_working_dir("sentiment_result")
    settings.Output.sentiment_result.val = output_dir

    AWS_ACCESS_KEY_ID = ds["Meta"]["key"]
    AWS_SECRET_ACCESS_KEY = ds["Meta"]["token"]

    # Execute "hadoop jar"
    jar_file = "HelloAvro-1.1-jar-with-dependencies.jar"

    hadoop_params = {}
    hadoop_params["HADOOP_MAPRED_HOME"] = "/usr/lib/hadoop-mapreduce"
    hadoop_params["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
    hadoop_params["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY
    hadoop_params_str = " ".join(
        ["%s=%s" % (k, v) for k, v in hadoop_params.items()])

    jar_defs = {}
    jar_defs["fs.s3n.awsAccessKeyId"] = '"%s"' % AWS_ACCESS_KEY_ID
    jar_defs["fs.s3n.awsSecretAccessKey"] = '"%s"' % AWS_SECRET_ACCESS_KEY
    jar_defs["fs.s3.awsAccessKeyId"] = '"%s"' % AWS_ACCESS_KEY_ID
    jar_defs["fs.s3.awsSecretAccessKey"] = '"%s"' % AWS_SECRET_ACCESS_KEY
    jar_defs["mapreduce.framework.name"] = "yarn"
    jar_defs["yarn.resourcemanager.address"] = settings.Param.yarn_resourcemanager
    jar_defs["yarn.resourcemanager.scheduler.address"] = settings.Param.yarn_resourcemanager_scheduler
    jar_defs["fs.defaultFS"] = settings.Param.hdfs_root
    jar_defs_str = " ".join(["-D %s=%s" % (k, v) for k, v in jar_defs.items()])

    cmd_str = "%s hadoop jar %s %s %s %s" % (
        hadoop_params_str, jar_file, jar_defs_str, ds["URL"], output_dir)
    print("Executing:")
    print(cmd_str)
    ret = cmd(cmd_str)
    print("exit code = %d" % ret)
    sys.exit(ret)
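# --- Hedged sketch, not the project's actual helper ---
# Every main() above delegates to a cmd() helper that runs the assembled
# "hadoop jar" command line and returns its exit code.  A minimal stand-in:
import subprocess


def cmd(cmd_str):
    # Run through a shell so the VAR=value prefixes and -D flags are interpreted
    # exactly as printed above; return the process exit status.
    return subprocess.call(cmd_str, shell=True)


# In the original repository each variant lives in its own script and is invoked
# through the usual `if __name__ == "__main__": main()` guard.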