def run(self, params, script_name, script_file, elements=[]):
    ''' Execute pig. '''
    pig = Pig.compileFromFile(script_name, script_file)
    bound = pig.bind(params)
    futures = bound.run() if isinstance(params, list) else bound.runSingle()
    self.handle_future(futures, elements)
    self.complete()
def runbidi(src, fdest):
    P = Pig.compileFromFile('src/main/pig/bidi.pig')
    cntsbase = 'counts'
    Pig.fs('rmr ' + cntsbase)

    for count in range(10):
        dest = fdest + 'gm%04d' % count
        Pig.fs('rmr ' + dest)
        cnts = cntsbase
        params = {'src': src, 'dest': dest, 'cnts': cnts}
        bound = P.bind(params)
        job = bound.runSingle()
        if not job.isSuccessful():
            raise Exception('failed')
        src = dest
        iter = job.result('S').iterator()
        if iter.hasNext():
            Pig.fs('rmr ' + cnts)
        else:
            Pig.fs('mv ' + dest + ' ' + fdest)
            print 'ALL DONE!'
            break
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    P = Pig.compileFromFile("../pigscripts/#{script_name}.pig")
    bound = P.bind()
    bound.runSingle()
def main(argv=None):
    # Ideally I want to use arguments, i.e. 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily',
    # however it just doesn't work. I'm not sure why; the code has been applied in my version, and I can get it to
    # work with a test .py that only has two lines: import sys, and print sys.argv. Here is the case:
    # https://issues.apache.org/jira/browse/PIG-2548
    # if argv is None:
    #     argv = sys.argv
    # if len(argv) != 3:
    #     print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
    #     return 1
    #
    # profile_file = argv[1]
    # timeframe = argv[2]
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']
    if timeframe not in ('daily', 'weekly', 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    # Load the config
    profile = {}
    execfile(profile_file, {'timeframe': timeframe}, profile)

    # Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    # Start pig processing
    pig_init()
    if timeframe == 'daily':
        # Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])

    # The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
def import_logs(profile):
    """
    Import all the log files for a given day and process them, putting each in a log dir.
    If the profile is a list there are multiple files, otherwise only a single one.
    The files are combined when running web_load.pig
    """
    # Clean up any left over files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))

    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()

    # Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    for i in range(10):
        print 'Run %s started!' % i
        P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")
        bound = P.bind({"ITERATION_NUM": i})
        ps = bound.runSingle()
        print 'Run %s done!' % i

        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print r

        if int(r.get(1).toString()) >= 5:
            print 'Good enough! Quitting time!'
            break
if __name__ == '__main__':
    from org.apache.pig.scripting import Pig
    import sys

    P = Pig.compileFromFile('/home/course/lian9478/task3.pig')
    params = {}
    for i in range(int(sys.argv[1])):
        if i == 0:
            out = '/home/course/lian9478/HW4-old_twitter_account_rank.csv'
        else:
            out = "out/pagerank_data_" + str(i + 1)
        params['doc_in'] = out
        params['doc_out'] = "out/pagerank_data_" + str(i + 2)
        bound = P.bind(params)
        bound.runSingle()  # this is to do it one by one instead of parallel
    # so you can call this driver like this:
    # pig -x local -embedded jython driver.py 20
def __init__(self, script_name, description, script_path, script_params, checkpoint_path):
    self.script_name = script_name
    self.description = description
    self.bound_script = Pig.compileFromFile(script_path).bind(script_params)
    self.flag_file_path = "%s/%s.success" % (checkpoint_path, os.path.splitext(script_name)[0])
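# A minimal companion sketch (not part of the original class): use the checkpoint flag file
# set up above to skip scripts that already completed, and touch the flag after a successful
# run. Assumes checkpoint_path is on the local filesystem; the method name run_if_needed
# is hypothetical.
def run_if_needed(self):
    if os.path.isfile(self.flag_file_path):
        print "%s already completed, skipping." % self.script_name
        return True
    stats = self.bound_script.runSingle()
    if stats.isSuccessful():
        open(self.flag_file_path, 'w').close()  # touch the success flag
        return True
    return False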
with open('tenant.json') as tenantdata_file:
    tenantData = json.loads(tenantdata_file.read())
globalVars.init(tenantCode, tenantData)

callResult = call("mongo " + mongo1 + " " + mongo2 + " " + mongo3 +
                  " var param1='" + tenantCode + "' " +
                  ''' " ../list_mongo_dataset_fields.js > dataset.json''', shell=True)
if callResult == 0:
    with open('dataset.json') as metadata_file:
        metadata = json.loads(metadata_file.read())

    syncJob = Pig.compileFromFile("""sync_solr2phoenix.pig""")
    for m in metadata:
        subtype = m['_id']['subtype']
        phoenixColumns = phoenixDynamicColumns = phoenixUpsertColumns = ''
        for field in m['_id']['fields']:
            name = field['fieldName'].strip()
            dataType = field['dataType']
            if subtype == 'binaryDataset' and (name == 'urlDownloadBinary' or name == 'idBinary'):
                continue
def iterate_until_convergence(script_path,
                              iteration_dir,
                              param_generator_func,
                              metric_name,
                              metric_type,
                              metric_alias,
                              metric_threshold,
                              max_num_iterations):
    """
    Utility for iteratively running a pigscript which outputs data in the same schema as its input,
    with the output of the previous run being the input of the next run.
    Stops when some convergence metric has been reached
    or if a maximum number of iterations has been reached.

    Example usage:

    iteration_result = IterationUtils.iterate_until_convergence(
        "../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
        iteration_dir,                        # temporary iteration outputs will be stored here
        iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
        "Sum of ordering-rank changes",       # name of the convergence metric
        int,                                  # Python type of the convergence metric
        "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
        convergence_threshold,                # stop when metric less than this
        max_num_iterations                    # or if this many iterations have been performed
    )

    iteration_result is a dict with the number of iterations performed ("num_iterations")
    and the reason iteration stopped ("stop_reason").

    Example iteration_param_func:

    def iteration_param_func(it_num, it_dir):
        if it_num == 1:
            iteration_input = preprocess_dir + "/pageranks"
        else:
            iteration_input = it_dir + "/" + str(it_num-1) + "/pageranks"

        return {
            "INPUT_PATH"                  : iteration_input,
            "DAMPING_FACTOR"              : damping_factor,
            "NUM_NODES"                   : num_nodes,
            "PAGERANKS_OUTPUT_PATH"       : it_dir + "/" + str(it_num) + "/pageranks",
            "AGG_RANK_CHANGE_OUTPUT_PATH" : it_dir + "/" + str(it_num) + "/rank_changes"
        }
    """

    script = Pig.compileFromFile(script_path)

    for i in range(1, max_num_iterations + 1):
        print "Starting iteration step: %d" % i
        iteration = script.bind(param_generator_func(i, iteration_dir)).runSingle()

        metric_value = metric_type(str(iteration.result(metric_alias).iterator().next().get(0)))
        if metric_value < metric_threshold:
            print "%s %s under convergence threshold %s. Stopping." \
                  % (metric_name, str(metric_value), str(metric_threshold))
            return { "num_iterations": i, "stop_reason": "CONVERGED" }
        elif i == max_num_iterations:
            print "%s %s above convergence threshold %s but hit max number of iterations. Stopping." \
                  % (metric_name, str(metric_value), str(metric_threshold))
            return { "num_iterations": i, "stop_reason": "MAX_ITERATIONS" }
        else:
            print "%s %s above convergence threshold %s. Continuing." \
                  % (metric_name, str(metric_value), str(metric_threshold))
if __name__ == "__main__": params = Pig.getParameters() graph = params["GRAPH"] seed_vertices = params["SEED_VERTICES"] tmp_dir = params["TMP_DIR"] output_path = params["OUTPUT_PATH"] nhood_size = int(params["NEIGHBORHOOD_SIZE"]) preprocess_graph = "%s/preprocess/graph" % tmp_dir preprocess_num_vertices = "%s/preprocess/num_vertices" % tmp_dir iteration_verts_prefix = "%s/iteration/vertices_" % tmp_dir print "Graph Sampler: starting preprocessing step." preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({ "GRAPH_INPUT_PATH" : graph, "GRAPH_OUTPUT_PATH" : preprocess_graph, "NUM_VERTICES_OUTPUT_PATH" : preprocess_num_vertices }).runSingle() iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig") num_iterations = nhood_size - 1 num_vertices = long(str(preprocessing.result("num_vertices").iterator().next().get(0))) print "Graph Sampler: scheduling %d iterations" % num_iterations for i in range(num_iterations): print "Graph Sampler: starting iteration step %d" % (i+1) iteration = iteration_script.bind({ "VERTICES_INPUT_PATH" : seed_vertices if i == 0 else (iteration_verts_prefix + str(i-1)), "GRAPH_INPUT_PATH" : preprocess_graph, "VERTICES_OUTPUT_PATH" : iteration_verts_prefix + str(i) }).runSingle()
import sys
from org.apache.pig.scripting import Pig

load = Pig.compileFromFile(sys.argv[1])
iteration = Pig.compileFromFile('iteration.pig')
store = Pig.compileFromFile('store.pig')

print '*** Loading input ***'
load_stats = load.bind({'EDGES_OUT': 'edges0.tmp'}).runSingle()
if not load_stats.isSuccessful():
    raise Exception('Load failed')

i = 1
stable_iterations = 0
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''
while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({'EDGES_IN': edges_in,
                                      'EDGES_OUT': edges_out,
                                      'CONVERGENCE_OUT': 'convergence.tmp'})
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')

    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))

    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
def run_pagerank():
    """
    Calculates pageranks for Twitter users.

    Three main steps:
        1. Preprocessing: Process input data to:
             a) Count the total number of users.
             b) Prepare initial pagerank values for all users.
        2. Iterative: Calculate new pageranks for each user based on the previous pageranks
           of the users' followers.
        3. Postprocessing: Find the top pagerank users and join to a separate dataset to find their names.
    """
    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(PREPROCESS_SCRIPT)
    preprocess_bound = preprocess.bind({
        "INPUT_PATH": FOLLOWER_GRAPH_INPUT,
        "PAGERANKS_OUTPUT_PATH": PREPROCESS_PAGERANKS,
        "NUM_USERS_OUTPUT_PATH": PREPROCESS_NUM_USERS
    })
    preprocess_stats = preprocess_bound.runSingle()
    num_users = int(str(preprocess_stats.result("num_users").iterator().next().get(0)))
    convergence_threshold = CONVERGENCE_THRESHOLD / num_users

    # Iteration step:
    iteration = Pig.compileFromFile(PAGERANK_ITERATE_SCRIPT)
    for i in range(MAX_NUM_ITERATIONS):
        print "Starting iteration step: %s" % str(i + 1)

        # Append the iteration number to the input/output stems
        iteration_input = PREPROCESS_PAGERANKS if i == 0 else (ITERATION_PAGERANKS_PREFIX + str(i - 1))
        iteration_pageranks_output = ITERATION_PAGERANKS_PREFIX + str(i)
        iteration_max_diff_output = ITERATION_MAX_DIFF_PREFIX + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": DAMPING_FACTOR,
            "NUM_USERS": num_users,
            "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
            "MAX_DIFF_OUTPUT_PATH": iteration_max_diff_output
        })
        iteration_stats = iteration_bound.runSingle()

        # If we're below the convergence threshold, break out of the loop.
        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            print "Max diff %s under convergence threshold. Stopping." % max_diff
            break
        elif i == MAX_NUM_ITERATIONS - 1:
            print "Max diff %s above convergence threshold but hit max number of iterations. Stopping." \
                  % max_diff
        else:
            print "Max diff %s above convergence threshold. Continuing." % max_diff

    iteration_pagerank_result = ITERATION_PAGERANKS_PREFIX + str(i)

    # Postprocessing step:
    print "Starting postprocessing step."
    postprocess = Pig.compileFromFile(POSTPROCESS_SCRIPT)
    postprocess_bound = postprocess.bind({
        "PAGERANKS_INPUT_PATH": iteration_pagerank_result,
        "USERNAMES_INPUT_PATH": USERNAMES_INPUT,
        "TOP_N": NUM_TOP_USERS,
        "OUTPUT_BUCKET": OUTPUT_BUCKET
    })
    postprocess_stats = postprocess_bound.runSingle()
# Passing a Pig script to Python and running it
#!/usr/bin/python
from org.apache.pig.scripting import Pig

P = Pig.compileFromFile("""myscript.pig""")

input = "original"
output = "output"

result = P.bind({'in': input, 'out': output}).runSingle()

if result.isSuccessful():
    print "Pig job succeeded"
else:
    raise Exception("Pig job failed")
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # Specify where the data will come from,
    # and where output data will go after each step
    data_stem = "s3n://jpacker-dev/amazon_products/books_graph/"
    num_vertices_input = data_stem + "num_vertices"
    nodes_input = data_stem + "nodes"
    edges_input = data_stem + "edges"

    output_stem = data_stem + "clustering/"
    preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
    preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
    iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
    postprocess_clusters_output = output_stem + "postprocess/clusters"
    postprocess_stats_output = output_stem + "postprocess/stats"

    """
    data_stem = "../fake-fixtures/"
    num_vertices_input = data_stem + "cathedral-num-vertices"
    nodes_input = data_stem + "cathedral-nodes"
    edges_input = data_stem + "cathedral-edges"

    output_stem = data_stem + "cathedral_clustering/"
    preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
    preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
    iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
    postprocess_clusters_output = output_stem + "postprocess/clusters"
    postprocess_stats_output = output_stem + "postprocess/stats"
    """

    # Preprocessing step:
    #
    # (1) Generate a transition matrix from the internal edges
    # (2) Copy precomputed count of # vertices
    # No computation is being done here; this just lets us use Pig to access the data
    # instead of configuring S3 access manually with boto
    #
    preprocess = Pig.compileFromFile("../pigscripts/clustering_preprocess.pig")
    preprocess_bound = preprocess.bind({
        "NUM_VERTICES_INPUT_PATH": num_vertices_input,
        "EDGES_INPUT_PATH": edges_input,
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
        "TRANS_MAT_OUTPUT_PATH": preprocess_trans_mat_output
    })
    preprocess_stats = preprocess_bound.runSingle()

    # Extract the number of vertices, which we will pass into each iteration as a parameter
    num_vertices = long(str(preprocess_stats.result("num_verts").iterator().next().get(0)))

    # Extract the number of edges (including inserted self-loops).
    # We will use this in our convergence metric
    initial_num_edges = long(str(preprocess_stats.getNumberRecords(preprocess_trans_mat_output)))

    # Iteration step applying the Markov Clustering operations:
    #
    # (1) Expansion: square the transition matrix ~= take a step in a random walk
    # (2) Inflation: take an elementwise power of the matrix ~= strengthen strong connections, weaken weak ones
    # (3) Pruning: set small matrix values to zero (since the matrix impl is sparse, this greatly speeds things up)
    # (4) Normalization: renormalize the matrix columnwise to keep it a valid transition matrix
    #
    # I tested several mathematically sensible convergence metrics
    # (max of max residual for each col, avg of max residual for each col, col kurtosis)
    # but none worked very well. So I'm currently just breaking when the number of edges
    # in an iteration's transition matrix is less than the number of edges in
    # the initial transition matrix times a constant multiple, which seems to indicate
    # that things are settling down.
    #
    # The algorithm has two parameters:
    # (1) The inflation parameter is an exponential factor which determines the cluster size:
    #     higher inflation => smaller clusters
    # (2) Epsilon is a minimum threshold for values in the transition matrix; anything smaller will be pruned
    #     (set to zero). I am not sure how high epsilon can safely be set without significantly degrading
    #     the quality of the algorithm. If you run into performance problems though,
    #     raising epsilon will dramatically reduce execution time
    #
    iteration = Pig.compileFromFile("../pigscripts/clustering_iterate.pig")
    max_num_iterations = 7  # most graphs should converge after 4-10 iterations
    num_iterations = 0

    for i in range(1, max_num_iterations + 1):
        iteration_input = preprocess_trans_mat_output if i == 1 else (iteration_trans_mat_output_stem + str(i - 1))
        iteration_output = iteration_trans_mat_output_stem + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "ITERATION_OUTPUT_PATH": iteration_output,
            "NUM_VERTICES": num_vertices,
            "INFLATION_PARAMETER": 1.5,
            "EPSILON": 0.01
        })
        iteration_stats = iteration_bound.runSingle()
        num_iterations += 1

        num_edges = long(str(iteration_stats.getNumberRecords(iteration_output)))
        if num_iterations >= 3 and num_edges < (initial_num_edges * 1.05):
            break

    # Postprocessing step:
    #
    # Interpret the transition matrix outputted by the iterations to find clusters.
    # Each row represents a cluster: the column id's of its non-zero elements are its constituents.
    #
    # There will be many duplicate clusters (N rows for a cluster of N elements),
    # so we filter those out. We also filter out very small clusters.
    #
    mcl_result_path = iteration_trans_mat_output_stem + str(num_iterations)
    postprocess = Pig.compileFromFile("../pigscripts/clustering_postprocess.pig")
    postprocess_bound = postprocess.bind({
        "NODES_INPUT_PATH": nodes_input,
        "MCL_RESULT_PATH": mcl_result_path,
        "CLUSTERS_OUTPUT_PATH": postprocess_clusters_output,
        "STATS_OUTPUT_PATH": postprocess_stats_output,
        "MIN_ACCEPTABLE_CLUSTER_SIZE": 3
    })
    postprocess_stats = postprocess_bound.runSingle()
def run(self):
    print project_name + ": " + self.action
    compiled = Pig.compileFromFile(self.script)
    bound = compiled.bind(self.params)
    return bound.runSingle()
w0_schema_file = open("%s/.pig_schema" % path, 'w')
ObjectMapper().writeValue(w0_schema_file, w0_schema)
w0_schema_file.close()

#
# Copy initial weights to fs
#
copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
Pig.fs(copyFromLocal)

#
# Iterate until converged
#
features = "%s/%s" % (data_dir, features)
script = Pig.compileFromFile(pig_script)
weight_queue = Queue.Queue(25)  # for moving average
avg_weight = [0.0 for i in xrange(int(num_features))]
converged = False
prev = 0
weight_dir = tempfile.mkdtemp()
while not converged:
    input_weights = "%s/weight-%s" % (data_dir, prev)
    output_weights = "%s/weight-%s" % (data_dir, prev + 1)
    bound = script.bind({'input_weights': input_weights, 'output_weights': output_weights, 'data': features})
    bound.runSingle()
    #
with open('tenant.json') as tenantdata_file:
    tenantData = json.loads(tenantdata_file.read())
globalVars.init(tenantCode, tenantData)

callResult = call("mongo " + mongo1 + " " + mongo2 + " " + mongo3 +
                  " var param1='" + tenantCode + "' " +
                  ''' " ../list_mongo_dataset_fields.js > dataset.json''', shell=True)
if callResult == 0:
    with open('dataset.json') as metadata_file:
        metadata = json.loads(metadata_file.read())

    importJob = Pig.compileFromFile("""copy_mongo2phoenix_solr.pig""")
    for m in metadata:
        subtype = m['_id']['subtype']
        dynamicMongoFields = ''
        dynamicPhoenixColumns = ''
        for field in m['_id']['fields']:
            name = field['fieldName'].strip()
            dataType = field['dataType'].strip()
            if subtype == 'binaryDataset' and (name == 'urlDownloadBinary' or name == 'idBinary'):
                continue
def run_pagerank(edges_input,
                 output_path,
                 tmp_output_dir,
                 damping_factor=0.85,
                 convergence_threshold=0.0001,
                 max_num_iterations=10,
                 id_name_map=None,
                 preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                 iteration_script="../pigscripts/pagerank_iterate.pig"):
    """
    Calculates pageranks for directed graph of nodes and edges.

    Three main steps:
        1. Preprocessing: Process input data to:
             a) Count the total number of nodes.
             b) Prepare initial pagerank values for all nodes.
        2. Iteration: Calculate new pageranks for each node based on the previous pageranks
           of the nodes with edges going into the given node.
        3. Postprocessing: Order nodes by pagerank.
           Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
           to get human-readable names.
    """

    preprocess_dir = "%s/preprocess" % tmp_output_dir
    iteration_dir = "%s/iteration" % tmp_output_dir

    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(preprocessing_script).bind({
        "INPUT_PATH"            : edges_input,
        "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
        "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
    }).runSingle()

    # Update convergence threshold based on the size of the graph (number of nodes)
    num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
    print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)

    # Iteration step:
    def iteration_param_func(it_num, it_dir):
        if it_num == 1:
            iteration_input = "%s/pageranks" % preprocess_dir
        else:
            iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

        return {
            "INPUT_PATH"                  : iteration_input,
            "DAMPING_FACTOR"              : damping_factor,
            "NUM_NODES"                   : num_nodes,
            "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks" % (it_dir, it_num),
            "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
        }

    iteration_result = IterationUtils.iterate_until_convergence(
        iteration_script,               # the pigscript to iterate
        iteration_dir,                  # temporary iteration outputs will be stored here
        iteration_param_func,           # takes iteration #, returns Pig parameter dictionary
        "Sum of ordering-rank changes", # name of the convergence metric
        int,                            # Python type of the convergence metric
        "aggregate_rank_change",        # alias in the pigscript where the metric is stored to
        convergence_threshold,          # stop when metric less than this
        max_num_iterations              # or if this many iterations have been performed
    )

    # Postprocessing step:
    print "Starting postprocessing step."

    postprocess_script = """
        pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double);
        pageranks = FILTER pageranks BY pagerank IS NOT NULL;
    """

    if id_name_map:
        postprocess_script += """
            id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
            with_names  = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
            ordered     = ORDER with_names BY pagerank DESC;
            rmf $OUTPUT_PATH;
            STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
        """

        postprocess = Pig.compile(postprocess_script).bind({
            "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
            "ID_NAME_MAP_INPUT_PATH" : id_name_map,
            "OUTPUT_PATH"            : output_path
        }).runSingle()
    else:
        postprocess_script += """
            ordered = ORDER pageranks BY pagerank DESC;
            rmf $OUTPUT_PATH;
            STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
        """

        postprocess = Pig.compile(postprocess_script).bind({
            "PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
            "OUTPUT_PATH"          : output_path
        }).runSingle()

    Pig.fs("rmr %s" % preprocess_dir)
    Pig.fs("rmr %s" % iteration_dir)
                  notEmptyDatasets + ''''" list_dataset_conv.js > ''' + outDir + '''/lista_dataset.''' + str(pid) + ".json")
callResult = call("mongo " + mongoConnectString + " " + ''' --eval "''' +
                  " var param1='" + tenantCode + "';var param2='" + notEmptyDatasets +
                  ''''" list_dataset_conv.js > ''' + outDir + '''/lista_dataset.''' + str(pid) + ".json",
                  shell=True)
if callResult == 0:
    with open(outDir + "/lista_dataset." + str(pid) + ".json") as metadata_file:
        metadata = json.loads(metadata_file.read())

    exportJob = Pig.compileFromFile("""export_single_dataset.pig""")
    for m in metadata:
        subtype = m['configData']['subtype']
        dynamicPhoenixColumns = phoenixColumns = metadataFields = csvHeader = metadataHeader = ''
        for field in m['info']['fields']:
            name = field['fieldName'].strip()
            dataType = field['dataType'].lower()
            dynamicPhoenixColumns += '\\\"' + name + globalVars.dataTypeSuffixes[dataType] + '\\\"\ ' + \
                                     globalVars.dataType2Phoenix[dataType] + ','
            csvHeader += name + ','
from org.apache.pig.scripting import Pig
import os

if __name__ == "__main__":
    params = Pig.getParameters()
    loader = params["LOADER"]
    input_source = params["INPUT_SRC"]
    output_path = params["OUTPUT_PATH"]
    infer_types = params["INFER_TYPES"]

    Pig.compileFromFile("../pigscripts/characterize.pig").bind({
        "LOADER"      : loader,
        "INPUT_SRC"   : input_source,
        "OUTPUT_PATH" : output_path,
        "INFER_TYPES" : infer_types
    }).runSingle()

    for root, _, files in os.walk("../%s" % output_path):
        for f in files:
            if f[0] != '.':
                fullpath = os.path.join(root, f)
                copypath = os.path.join(root, f + '.csv')
                os.system("cp %s %s" % (fullpath, copypath))
def run_pagerank(self):
    """
    Calculates pageranks for directed graph of nodes and edges.

    Three main steps:
        1. Preprocessing: Process input data to:
             a) Count the total number of nodes.
             b) Prepare initial pagerank values for all nodes.
        2. Iteration: Calculate new pageranks for each node based on the previous pageranks
           of the nodes with edges going into the given node.
        3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names.
    """
    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(self.preprocessing_script)
    preprocess_params = {
        "INPUT_PATH": self.edges_input,
        "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks,
        "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes
    }
    preprocess_bound = preprocess.bind(preprocess_params)
    preprocess_stats = preprocess_bound.runSingle()

    # Update convergence threshold based on the size of the graph (number of nodes)
    num_nodes = long(str(preprocess_stats.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(self.convergence_threshold * num_nodes * num_nodes)
    print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)

    # Iteration step:
    iteration = Pig.compileFromFile(self.iteration_script)
    for i in range(self.max_num_iterations):
        print "Starting iteration step: %s" % str(i + 1)

        # Append the iteration number to the input/output stems
        iteration_input = self.preprocess_pageranks if i == 0 else (self.iteration_pageranks_prefix + str(i - 1))
        iteration_pageranks_output = self.iteration_pageranks_prefix + str(i)
        iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": self.damping_factor,
            "NUM_NODES": num_nodes,
            "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
            "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output
        })
        iteration_stats = iteration_bound.runSingle()

        # If we're below the convergence threshold, break out of the loop.
        aggregate_rank_change = long(str(iteration_stats.result("aggregate_rank_change").iterator().next().get(0)))
        if aggregate_rank_change < convergence_threshold:
            print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \
                  % (aggregate_rank_change, convergence_threshold)
            break
        elif i == self.max_num_iterations - 1:
            print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \
                  ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \
                  "Stopping."
        else:
            print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \
                  % (aggregate_rank_change, convergence_threshold)

    iteration_pagerank_result = self.iteration_pageranks_prefix + str(i)

    # Postprocessing step:
    print "Starting postprocessing step."
    postprocess = Pig.compileFromFile(self.postprocessing_script)
    postprocess_params = {
        "PAGERANKS_INPUT_PATH": iteration_pagerank_result
    }
    if self.output_path is not None:
        # otherwise, the script outputs to the default location,
        # which is a special directory in s3://mortar-example-output-data
        # permissioned for your Mortar account.
        postprocess_params["OUTPUT_PATH"] = self.output_path

    postprocess_bound = postprocess.bind(postprocess_params)
    postprocess_stats = postprocess_bound.runSingle()
    tenantData = json.loads(tenantdata_file.read())
globalVars.init(tenantCode, tenantData)

# newLastId = 0
# read from metadata source (mongoDB) all datasets with
#   availableSpeed = true
#   availableHive = true (getting also dbHiveSchema and dbHiveTable)
callResult = call("mongo " + mongoConn + mongoParam +
                  " ../list_mongo_hive_dataset.js > dataset." + str(pid) + ".json", shell=True)
if callResult == 0:
    print "Dataset list read"
    copyHive2MongoJob = Pig.compileFromFile("""copy_hive2phoenix.pig""")
    with open('dataset.' + str(pid) + '.json') as metadata_file:
        metadata = json.loads(metadata_file.read())

    for m in metadata:
        subtype = m['configData']['subtype']
        fields = m['info']['fields']
        idDataset = str(m['idDataset'])
        datasetVersion = str(m['datasetVersion'])
        dynamicPhoenixColumns = ''
        aliasString = ('bda_id\ as\ bda_id:chararray,\ ' +
                       idDataset + '\ as\ idDataset:int,\ ' +
                       datasetVersion + '\ as\ datasetVersion:int,\ ')
print 'LOG: Elapsed %f' % (endTime - startTime)

# Remove the guardFile
fs.delete(guardFile, True)

System.exit(0)

if fs.exists(parsedDir):
    # parsed-captures
    if (not fs.exists(parsedCaptures) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(parsedCaptures).getModificationTime()):
        print 'LOG: Graph parsed-captures create'
        fs.delete(parsedCaptures, True)
        params = {
            'INPUT'  : str(parsedDir),
            'OUTPUT' : str(parsedCaptures),
            'JOBNAME': str(collection) + ' parsed-captures'
        }
        job = Pig.compileFromFile('pig/parsed-captures.pig').bind(params)
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if (not fs.exists(linkGraph) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(linkGraph).getModificationTime()):
        print 'LOG: Graph link-graph create'
        fs.delete(linkGraph, True)
        params = {
            'INPUT'  : str(parsedDir),
            'OUTPUT' : str(linkGraph),
            'JOBNAME': str(collection) + ' link-graph'
        }
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    nodes_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-nodes"
    edges_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-edges"
    preprocess_vector_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/vector"
    preprocess_matrix_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/matrix"
    preprocess_num_vertices_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/num_vertices"
    iteration_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/iteration_"
    max_diff_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/max_diff_"
    postprocess_pageranks_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/pageranks"

    damping_factor = 0.85

    preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig")
    preprocess_bound = preprocess.bind({
        "EDGES_INPUT_PATH": edges_input,
        "VECTOR_OUTPUT_PATH": preprocess_vector_output,
        "MATRIX_OUTPUT_PATH": preprocess_matrix_output,
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
        "DAMPING_FACTOR": damping_factor
    })
    preprocess_stats = preprocess_bound.runSingle()
    num_vertices = int(str(preprocess_stats.result("num_vertices_copy").iterator().next().get(0)))

    iteration = Pig.compileFromFile("../pigscripts/pagerank_iterate.pig")
    max_num_iterations = 7
    num_iterations = 0
    convergence_threshold = 0.15 / float(num_vertices)

    for i in range(1, max_num_iterations + 1):
        iteration_vector_input = preprocess_vector_output if i == 1 else (iteration_output_stem + str(i - 1))
        iteration_matrix_input = preprocess_matrix_output
        iteration_output = iteration_output_stem + str(i)
        max_diff_output = max_diff_output_stem + str(i)

        iteration_bound = iteration.bind({
            "VECTOR_INPUT_PATH": iteration_vector_input,
            "MATRIX_INPUT_PATH": iteration_matrix_input,
            "ITERATION_OUTPUT_PATH": iteration_output,
            "MAX_DIFF_OUTPUT_PATH": max_diff_output,
            "NUM_VERTICES": num_vertices,
            "DAMPING_FACTOR": damping_factor
        })
        iteration_stats = iteration_bound.runSingle()
        num_iterations += 1

        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            break

    result_vector = iteration_output_stem + str(num_iterations)
    postprocess = Pig.compileFromFile("../pigscripts/pagerank_postprocess.pig")
    postprocess_bound = postprocess.bind({
        "NODES_INPUT_PATH": nodes_input,
        "RESULT_VECTOR": result_vector,
        "OUTPUT_PATH": postprocess_pageranks_output
    })
    postprocess_bound.runSingle()
#!/usr/bin/python
import sys

from org.apache.pig.scripting import Pig
from bidipig import runbidi

# make minhash clusters
minhash = Pig.compileFromFile('src/main/pig/minhash.pig')

osrc = src = sys.argv[1]
destminhash = sys.argv[2] + '-minhash'
dest = sys.argv[2] + '-jaccard'
minjaccard = 80
bound = minhash.bind()
job = bound.runSingle()
if not job.isSuccessful():
    raise Exception('failed in minhash')

# output is pairs and scores

# make transitive closure of clusters
src = dest
dest = sys.argv[2] + '-bidi'
runbidi(src, dest)

# join with original data
join = Pig.compileFromFile('src/main/pig/join.pig')
src = osrc
keys = dest
#!/usr/bin/python
import time
import sys

from org.apache.pig.scripting import Pig

if __name__ == '__main__':
    P = Pig.compileFromFile("""calvisit.pig""")

    defaulttime = time.time()
    deadlinesec = defaulttime - 1800
    deadline = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(deadlinesec))
    if len(sys.argv) > 1:
        deadline = sys.argv[1]

    Q = P.bind({'deadline': deadline, 'input': 'input', 'result': 'result', 'inputtmp': 'inputtmp'})
    results = Q.runSingle()

    if not results.isSuccessful():
        raise Exception("Pig job failed")
    else:
        print results
def runPigScript(pigScript, params):
    P = Pig.compileFromFile(pigScript)
    bound = P.bind(params)
    stat = bound.runSingle()
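# The helper above discards the PigStats it collects, so callers cannot tell whether the
# script succeeded. A hedged variant (the name runPigScriptChecked and the fail-loudly
# policy are assumptions, not part of the original helper):
from org.apache.pig.scripting import Pig

def runPigScriptChecked(pigScript, params):
    P = Pig.compileFromFile(pigScript)
    bound = P.bind(params)
    stats = bound.runSingle()
    if not stats.isSuccessful():
        raise Exception("Pig script %s failed: %s" % (pigScript, stats.getErrorMessage()))
    return stats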
          'mongoPort') + "/DB_SUPPORT"
mongo2 = " -u " + props.getProperty('mongoUsr')
mongo3 = " -p " + props.getProperty('mongoPwd') + ''' --authenticationDatabase admin --quiet --eval "'''
# var param1=438; var param2=1; var param3='datalake'" delete_dataset.js

Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar")
Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar")
Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar")
Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
#Pig.registerJar("../lib/yucca-phoenix-pig.jar")

if mode in ["APPEND", "append"]:
    # read from metadata source (mongoDB) lastIdDatalake2Speed for tenant
    readLastIdJob = Pig.compileFromFile("""read_mongo_lastIdDatalake2Speed.pig""")
    results = readLastIdJob.bind({'tenantCode': tenantCode}).runSingle()
    if results.isSuccessful():
        print "Pig job succeeded"
        iter = results.result("lastId").iterator()
        if iter.hasNext():
            lastId = iter.next()
            print "lastId: " + str(lastId)
    else:
        raise Exception("Pig job failed")

# read from metadata source (mongoDB) all datasets with
#   availableSpeed = true
#   availableHive = true (getting also dbHiveSchema and dbHiveTable)
readDatasetListJob = Pig.compileFromFile("""../read_mongo_dataset.pig""")
readDatasetParams = {