import CONSTANTS
import shutil
import os
import bigdataUtilities as util

if __name__ == "__main__":
    util.obnoxiousPrint("Cleaning up HDFS data.")
    util.subprocessCall(
        ["hadoop", "fs", "-rm", CONSTANTS.CASCALOG_HDFS_PATH + "/*"], False)

    # Dump the Hive table to a local temporary file.
    fobj = open("__temp_file", "w")
    util.hiveScript("select * from vast_time_series_data", stdout=fobj)
    fobj.close()

    util.obnoxiousPrint("Moving data to Cascalog readable location.")
    util.subprocessCall(["hadoop", "fs", "-copyFromLocal",
                         "__temp_file", "vast_time_series_data.tsv"])

    util.obnoxiousPrint("Checking project dependencies.")
    util.subprocessCall(["lein", "deps"])

    util.obnoxiousPrint("Creating uberjar.")
    util.subprocessCall(["lein", "uberjar"])

    util.obnoxiousPrint("Running Cascalog correlation.")
    util.subprocessCall(["hadoop", "jar",
                         "target/cascalog-corr-0.1.0-SNAPSHOT-standalone.jar",
                         "clojure.main"])

    util.obnoxiousPrint("Copying results from HDFS to local output directory.")
    try:
        shutil.rmtree("output")
    except OSError:
        pass
    os.mkdir("output")
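Both Cascalog drivers read their HDFS output location from a shared CONSTANTS module that is not shown in this section. A minimal sketch of what it might contain follows; the actual path value is an assumption and will differ per deployment.

# Hypothetical CONSTANTS module; only CASCALOG_HDFS_PATH is inferred from the
# drivers above, and its value here is a placeholder, not the real location.
CASCALOG_HDFS_PATH = "/user/placeholder/cascalog-out"  # assumed HDFS output directory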
import CONSTANTS
import shutil
import os
import bigdataUtilities as util

if __name__ == "__main__":
    util.obnoxiousPrint("Ensure the file 'final_output.csv' is in the user's HDFS directory.")
    util.obnoxiousPrint("See the README.md file for details.")

    util.obnoxiousPrint("Checking project dependencies.")
    util.subprocessCall(["lein", "deps"])

    util.obnoxiousPrint("Creating uberjar.")
    util.subprocessCall(["lein", "uberjar"])

    util.obnoxiousPrint("Running Cascalog dwell job.")
    util.subprocessCall(["hadoop", "jar",
                         "target/cascalog-dwell-0.1.0-SNAPSHOT-standalone.jar",
                         "clojure.main"])

    util.obnoxiousPrint("Copying results from HDFS to local output directory.")
    try:
        shutil.rmtree("output")
    except OSError:
        pass
    os.mkdir("output")

    # Concatenate every dwell output part file from HDFS into one local file.
    paths = util.lsrHdfsDir(CONSTANTS.CASCALOG_HDFS_PATH)
    fobj = open("output/sorted_out", "w")
    for path in paths:
        if "ais-dwell-out/part-" in path:
            util.subprocessCall(["hadoop", "fs", "-cat", path], stdout=fobj)
    fobj.close()
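The drivers above all call into a shared bigdataUtilities module that is not included in this section. The sketch below is only an assumption inferred from the call sites (obnoxiousPrint, subprocessCall, lsrHdfsDir, hiveScript); the real helpers may behave differently.

# Hypothetical sketch of the bigdataUtilities helpers used by the drivers above.
# Signatures are inferred from the call sites only; the real module may differ.
import subprocess

def obnoxiousPrint(msg):
    # Print a message inside a loud banner so it stands out in long job logs.
    print("=" * 60)
    print(msg)
    print("=" * 60)

def subprocessCall(args, check=True, stdout=None):
    # Run a command; when check is False, ignore a non-zero exit status
    # (used above for best-effort HDFS cleanup).
    ret = subprocess.call(args, stdout=stdout)
    if check and ret != 0:
        raise RuntimeError("Command failed: %s" % " ".join(args))
    return ret

def lsrHdfsDir(hdfs_path):
    # Recursively list an HDFS directory and return the file paths
    # (last column of 'hadoop fs -lsr' output).
    out = subprocess.check_output(["hadoop", "fs", "-lsr", hdfs_path]).decode("utf-8")
    return [line.split()[-1] for line in out.splitlines() if line.strip()]

def hiveScript(query, stdout=None):
    # Run a Hive query and write its rows to the given file object.
    return subprocess.call(["hive", "-e", query], stdout=stdout)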
import sys
import bigdataUtilities as util

if __name__ == "__main__":
    # Base directory in HDFS. Make sure you are able to write/delete in this directory.
    base_dir = "/analytics/leaf-compression"
    version = "-0.1-SNAPSHOT"
    # Comma-separated list of ZooKeeper nodes in ip:port format. 2181 is the default port.
    zkList = "localhost:2181"

    # Number of Giraph workers; can be overridden on the command line.
    slaves = 1
    if len(sys.argv) > 1:
        slaves = int(sys.argv[1])

    util.obnoxiousPrint("Uploading data to HDFS")
    util.subprocessCall(["hadoop", "fs", "-rmr", base_dir + "/input"], False)
    util.subprocessCall(["hadoop", "fs", "-rmr", base_dir + "/output/leaf"], False)
    util.subprocessCall(["hadoop", "fs", "-mkdir", "-p", base_dir + "/input"], False)
    util.subprocessCall(["hadoop", "fs", "-copyFromLocal",
                         "example_data.csv", base_dir + "/input/small_graph"])

    util.obnoxiousPrint("Running leafcompress job")
    util.subprocessCall([
        "hadoop", "jar",
        "target/leaf-compression" + version + "-jar-with-dependencies.jar",
        "org.apache.giraph.GiraphRunner",
        "-Dgiraph.zkList=" + zkList,
        "com.soteradefense.bsp.KeyDataVertex",
        "-w", str(slaves),
        "-vif", "com.soteradefense.bsp.KeyDataVertexInputFormat",
        "-of", "com.soteradefense.bsp.KeyDataVertexOutputFormat",
        "-vip", base_dir + "/input/small_graph",
        "-op", base_dir + "/output/leaf"
    ])
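Assuming the Giraph driver above is saved as run_leaf_compression.py (the filename is an assumption), the optional command-line argument sets the number of Giraph workers passed to GiraphRunner via -w:

# Hypothetical invocation; script name is assumed, the argument is the worker count.
python run_leaf_compression.py 4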