def __init__(self, params):
    # BIND and RUN
    self.params = params
    self.set_param_defaults()
    Pig.fs("rmr " + self.params['output_name'])

    generator = PigScriptGenerator.PigScriptGenerator(self.params)
    full_script = generator.generate()

    P = Pig.compile(full_script)

    results = P.bind({
        'output': self.params['output_name'],
    }).runSingle()

    if results.isSuccessful():
        print 'Pig job succeeded'
    else:
        raise Exception('Pig job failed')

    result_iter = results.result("final_set").iterator()

    # This takes care of turning our iter into something we can use.
    self.make_dict_from_results(result_iter)

    send_to_grapht = raw_input('do you want to send this data to grapht?')
    if send_to_grapht not in ('y', 'yes', '1'):
        sys.exit()

    connector = GraphtConnector('grapht.shuttercorp.net')
    metric = self.params['output_name']
    connector.record_data_points(metric, self.result)
def __init__(self, jars=[], properties={}):
    ''' Initialize Pig. '''
    for jar in jars:
        logger.debug(" >>> register jar: %s", jar)
        Pig.registerJar(jar)
    for key in properties:
        logger.debug(" >>> set property: %s => %s", key, properties[key])
        Pig.set(key, properties[key])
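# Equivalent direct calls to the Pig scripting API that the initializer above wraps;
# a minimal sketch, the jar path and property values here are illustrative only.
from org.apache.pig.scripting import Pig

Pig.registerJar('../lib/piggybank.jar')
Pig.set('default_parallel', '10')
Pig.set('pig.tmpfilecompression', 'true')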
def run(self):
    print "%s: %s" % (self.script_name, self.description)
    stats = self.bound_script.runSingle()
    if stats.isSuccessful():
        Pig.fs("touchz %s" % self.flag_file_path)
    else:
        raise Exception("\nScript %s failed! Error should be logged above.\n" % self.script_name +
                        "Once you have fixed the problem, you can restart the workflow at this step " +
                        "using the argument \"-p CHECKPOINT=%s\"" % self.script_name)
def runbidi(src, fdest):
    P = Pig.compileFromFile('src/main/pig/bidi.pig')

    cntsbase = 'counts'
    Pig.fs('rmr ' + cntsbase)

    for count in range(10):
        dest = fdest + 'gm%04d' % count
        Pig.fs('rmr ' + dest)
        cnts = cntsbase
        params = {'src': src, 'dest': dest, 'cnts': cnts}
        bound = P.bind(params)
        job = bound.runSingle()
        if not job.isSuccessful():
            raise Exception('failed')
        src = dest
        iter = job.result('S').iterator()
        if iter.hasNext():
            Pig.fs('rmr ' + cnts)
        else:
            Pig.fs('mv ' + dest + ' ' + fdest)
            print 'ALL DONE!'
            break
def run(self, params, script_name, script_file, elements=[]):
    ''' Execute pig. '''
    pig = Pig.compileFromFile(script_name, script_file)
    bound = pig.bind(params)
    futures = bound.run() if isinstance(params, list) else bound.runSingle()
    self.handle_future(futures, elements)
    self.complete()
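# A minimal sketch of the run()/runSingle() distinction used above, assuming a
# parameterized script at the hypothetical path 'pig/daily_report.pig':
from org.apache.pig.scripting import Pig

compiled = Pig.compileFromFile('pig/daily_report.pig')
# binding a single parameter dict -> one execution via runSingle()
single_stats = compiled.bind({'DAY': '2013-08-01'}).runSingle()
# binding a list of dicts -> parallel executions via run(), which returns a list of stats
batch_stats = compiled.bind([{'DAY': '2013-08-01'}, {'DAY': '2013-08-02'}]).run()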
def run_script(): import os from org.apache.pig.scripting import Pig # compile the pig code P = Pig.compileFromFile("../pigscripts/#{script_name}.pig") bound = P.bind() bound.runSingle()
def main(argv=None):
    # Ideally I want to use arguments, i.e. 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily',
    # however it just doesn't work and I'm not sure whether the fix has been applied in my version.
    # I can get it to work with a test .py that only has two lines: import sys, and print sys.argv.
    # Here is the case: https://issues.apache.org/jira/browse/PIG-2548
    # if argv is None:
    #     argv = sys.argv
    # if len(argv) != 3:
    #     print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
    #     return 1
    #
    # profile_file = argv[1]
    # timeframe = argv[2]
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']
    if timeframe not in ('daily', 'weekly', 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    # Load the config
    profile = {}
    execfile(profile_file, {'timeframe': timeframe}, profile)

    # Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    # Start pig processing
    pig_init()

    if timeframe == 'daily':
        # Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])

    # The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
def import_logs(profile):
    """
    Import all the log files for a given day and process them, putting each in a log dir.
    If the profile is a list there are multiple files, otherwise only a single one.
    The files are combined when running web_load.pig.
    """
    # Clean up any left over files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))

    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()

    # Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    for i in range(10):
        print 'Run %s started!' % i
        P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")
        bound = P.bind({"ITERATION_NUM": i})
        ps = bound.runSingle()
        print 'Run %s done!' % i

        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print r

        if int(r.get(1).toString()) >= 5:
            print 'Good enough! Quitting time!'
            break
print 'LOG: Elapsed %f' % (endTime - startTime)

# Remove the guardFile
fs.delete(guardFile, True)

System.exit(0)

if fs.exists(parsedDir):
    # parsed-captures
    if (not fs.exists(parsedCaptures) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(parsedCaptures).getModificationTime()):
        print 'LOG: Graph parsed-captures create'
        fs.delete(parsedCaptures, True)
        params = {
            'INPUT': str(parsedDir),
            'OUTPUT': str(parsedCaptures),
            'JOBNAME': str(collection) + ' parsed-captures'
        }
        job = Pig.compileFromFile('pig/parsed-captures.pig').bind(params)
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if (not fs.exists(linkGraph) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(linkGraph).getModificationTime()):
        print 'LOG: Graph link-graph create'
        fs.delete(linkGraph, True)
        params = {
            'INPUT': str(parsedDir),
            'OUTPUT': str(linkGraph),
            'JOBNAME': str(collection) + ' link-graph'
        }
try:
    props = util.Properties()
    propertiesfis = javaio.FileInputStream(paramFile)
    props.load(propertiesfis)
except:
    print "Error reading " + paramFile + ": ", sys.exc_info()[0]
    sys.exit(1)

mongoConn = (props.getProperty('mongoHost') + ":" + props.getProperty('mongoPort') + "/DB_SUPPORT"
             + " -u " + props.getProperty('mongoUsr')
             + " -p " + props.getProperty('mongoPwd')
             + " --authenticationDatabase admin --quiet ")
mongoParam = ''' --eval "''' + "var param1='" + tenantCode + "' " + ''' " '''

Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/yucca-phoenix-pig.jar")

#lastId = '000000000000000000000000'
#if mode in ["APPEND", "append"]:
#    # read from metadata source (mongoDB) lastIdDatalake2Speed for tenant
#    callResult, output = getstatusoutput('mongo ' + mongoConn + mongoParam + ' read_mongo_lastIdDatalake2Speed.js')
#    if callResult == 0:
#        print "Last id read successfully"
#        lastId = output
#    else:
#        print "Error while reading last id"
#        sys.exit(1)

print("mongo " + mongoConn + mongoParam + " ../list_tenant_defaults.js > tenant." + str(pid) + ".json")
callResult = call("mongo " + mongoConn + mongoParam +
    par = {}
    par['DATA_DATE'] = day
    par['REPORT_DATE'] = "2013/07/" + str(i)
    params.append(par)

prefix = "2013080"
for i in range(1, 10):
    day = prefix + str(i)
    par = {}
    par['DATA_DATE'] = day
    par['REPORT_DATE'] = "2013/08/0" + str(i)
    params.append(par)

Pig.registerUDF("attribute_click.py", "myfuncs")

# ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');
script = """
%declare OUTPUT '/user/haliu'

member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();

events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');

job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;

job_views = FOREACH job_view_events GENERATE (int)header.memberId AS memberId, (long)header.time AS time, trackingCode, (int)trackingInfo#'0' AS jobId;
vertica.accert_table_exists(table_name) table_size = vertica.get_table_size(table_name) logger.info(table_name + " table size is " + str(table_size) + " bytes") output_dir = "/user/mykhail.martsynyuk/vertica/export/" + table_name #prepare hdfs structure logger.info("Move folder " + output_dir + " to backup") hdfs.move_folder_to_backup(output_dir) logger.info("Remove " + output_dir) hdfs.remove_folder(output_dir) params.append({'out': output_dir, 'table': table_name}) P = Pig.compile(""" register /usr/lib/pig/lib/pig-vertica.jar register /usr/lib/pig/lib/vertica-jdbc-7.0.1-0.jar A = LOAD 'sql://{SELECT * FROM $table WHERE 1 = ?};{1}' USING com.vertica.pig.VerticaLoader('10.104.5.29','verticadst','5433','alfxplsit','xpl123'); STORE A INTO '$out'; """) bound = P.bind(params) stats_list = bound.run() i = 0 for stats in stats_list: if stats.isSuccessful(): logger.info("SUCCESS: Table: " + params[i]["table"] + "; Number jobs: " + str(stats.getNumberJobs()) + "; Time to run: " + str(stats.getDuration()) + "; Files written: " + str(stats.getOutputLocations())) else: logger.info("FAIL: Table: " + params[i]["table"] + "; ERRORS: " +
""" if aggregateMethod == "avg": pigScript += """ rankedTriples = FOREACH objGroup GENERATE $0,$1,$2, AVG({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;""" elif aggregateMethod == "max": pigScript += """ rankedTriples = FOREACH objGroup GENERATE $0,$1,$2, MAX({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;""" elif aggregateMethod == "min": pigScript += """ rankedTriples = FOREACH objGroup GENERATE $0,$1,$2, MIN({($4 is null? 1F: $4),($6 is null? 1F: $6)}) AS ranking;""" else: pigScript += """ WRONGGGG. how to aggregate?!""" pigScript += """ rmf $outputFile STORE rankedTriples INTO '$outputFile' USING PigStorage(); """ P = Pig.compile(pigScript) stats = P.bind().runSingle()
vertica.accert_table_exists(table_name) table_size = vertica.get_table_size(table_name) logger.info(table_name + " table size is " + str(table_size) + " bytes") output_dir = "/user/mykhail.martsynyuk/vertica/export/"+table_name #prepare hdfs structure logger.info("Move folder "+output_dir+" to backup") hdfs.move_folder_to_backup(output_dir) logger.info("Remove "+output_dir) hdfs.remove_folder(output_dir) params.append({'out':output_dir, 'table':table_name}) P = Pig.compile(""" register /usr/lib/pig/lib/pig-vertica.jar register /usr/lib/pig/lib/vertica-jdbc-7.0.1-0.jar A = LOAD 'sql://{SELECT * FROM $table WHERE 1 = ?};{1}' USING com.vertica.pig.VerticaLoader('10.104.5.29','verticadst','5433','alfxplsit','xpl123'); STORE A INTO '$out'; """) bound = P.bind(params) stats_list = bound.run() i = 0 for stats in stats_list: if stats.isSuccessful(): logger.info("SUCCESS: Table: "+params[i]["table"]+"; Number jobs: "+str(stats.getNumberJobs())+ "; Time to run: "+str(stats.getDuration())+"; Files written: "+str(stats.getOutputLocations())) else: logger.info("FAIL: Table: "+params[i]["table"]+"; ERRORS: "+stats.getAllErrorMessages()) i+=1 # Next is example of how to get script output:
def runPigScript(pigScript, params):
    P = Pig.compileFromFile(pigScript)
    bound = P.bind(params)
    stat = bound.runSingle()
    return stat
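# Hypothetical usage sketch for the helper above; the script path and parameter
# names are illustrative, not taken from the original project.
stats = runPigScript('../pigscripts/daily_report.pig',
                     {'INPUT': '/data/events', 'OUTPUT': '/reports/daily'})
if not stats.isSuccessful():
    raise Exception('daily_report.pig failed')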
postString += "Result = FOREACH Result GENERATE * AS (" + fsDic[ 'genSchema'] + ");\n" # postString += "Result = FOREACH Result GENERATE " + currentAction+ "Result::UserId AS UserId, *;\n" # postString += "DESCRIBE Result;\n" # A1BResult = JOIN BResult BY UserId, CResult By UserId; pigString += postString pigString += """ DUMP Result; DESCRIBE Result; """ print(pigString) # with open('cyygeneratedPig.pig','w') as outFile: # outFile.write(pigString) if USE_PIG: P = Pig.compile(pigString) # P = Pig.compileFromFile('pig_bcd_bc.pig') # run the pig script if True: result = P.bind().runSingle() if result.isSuccessful(): print 'run success' else: raise 'run failed'
MIN_SCORE = 0 MAX_ITERATION = 100 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i != k - 1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register 'centroid.py' using jython as centroid; raw = load 'student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, centroid.get_closest_centroid(gpa, '$centroids') as centroid; grouped = group centroided by centroid parallel 2; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'kmoutput'; """) converged = False iter_num = 0 while iter_num < MAX_ITERATION: Q = P.bind({'centroids': initial_centroids}) results = Q.runSingle() if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration
#!/usr/bin/python
# -*- coding: utf-8 -*-

# explicitly import Pig class
from org.apache.pig.scripting import Pig

# COMPILE: compile method returns a Pig object that represents the pipeline
P = Pig.compile('''Arcs = LOAD '$docs_in' USING PigStorage('\t') AS (url: chararray, pagerank: float, links:{ link: ( url: chararray ) } );
outlinkPageRank = FOREACH Arcs GENERATE pagerank / COUNT ( links ) AS pagerank, FLATTEN ( links ) AS to_url;
newPageRank = FOREACH ( COGROUP outlinkPageRank BY to_url, Arcs BY url INNER ) GENERATE FLATTEN (Arcs.url), ( 1.0 - 0.85 ) + 0.85 * SUM ( outlinkPageRank.pagerank ) AS pagerank, FLATTEN (Arcs.links) AS links;
dump newPageRank;
STORE newPageRank INTO '$docs_out';''')

params = {'docs_in': 'urls2.txt'}

for i in range(1):
    out = "out/pagerank_data_" + str(i + 1)
    params["docs_out"] = out
    Pig.fs("rmr " + out)
    stats = P.bind(params).runSingle()
    if not stats.isSuccessful():
        raise Exception('failed')
    params["docs_in"] = out
if i != k - 1: initial_centroids = initial_centroids + ":" # initial_centroids = "37.475097, -122.155599:37.486098,-122.195388:37.4985769, -122.2195727:37.4608874, -122.143838:37.453407, -122.182255" # initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0" # last_centroids = [(-120.0,-120.0),(-60.0, -60.0),(0.0, 0.0),(60.0, 60.0),(120.0,120.0)] print last_centroids print initial_centroids P = Pig.compile( """register Find.jar DEFINE find_centroid FindCentroid('$centroids'); raw_data = load 'MP_match.txt' as (latitude:double, longitude:double, status:chararray); centroided = foreach raw_data generate status, latitude, longitude, find_centroid(latitude, longitude) as centroid; grouped = group centroided by centroid; store grouped into 'grouped'; result = foreach grouped generate group, AVG(centroided.latitude), AVG(centroided.longitude); store result into 'output'; """ ) converged = False iter_num = 0 while iter_num < MAX_ITERATION: Q = P.bind({"centroids": initial_centroids}) results = Q.runSingle() if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = []
def run_pagerank(): """ Calculates pageranks for Twitter users. Three main steps: 1. Preprocessing: Process input data to: a) Count the total number of users. b) Prepare initial pagerank values for all users. 2. Iterative: Calculate new pageranks for each user based on the previous pageranks of the users' followers. 3. Postprocesing: Find the top pagerank users and join to a separate dataset to find their names. """ # Preprocessing step: print "Starting preprocessing step." preprocess = Pig.compileFromFile(PREPROCESS_SCRIPT) preprocess_bound = preprocess.bind({ "INPUT_PATH": FOLLOWER_GRAPH_INPUT, "PAGERANKS_OUTPUT_PATH": PREPROCESS_PAGERANKS, "NUM_USERS_OUTPUT_PATH": PREPROCESS_NUM_USERS }) preprocess_stats = preprocess_bound.runSingle() num_users = int(str(preprocess_stats.result("num_users").iterator().next().get(0))) convergence_threshold = CONVERGENCE_THRESHOLD / num_users # Iteration step: iteration = Pig.compileFromFile(PAGERANK_ITERATE_SCRIPT) for i in range(MAX_NUM_ITERATIONS): print "Starting iteration step: %s" % str(i + 1) # Append the iteration number to the input/output stems iteration_input = PREPROCESS_PAGERANKS if i == 0 else (ITERATION_PAGERANKS_PREFIX + str(i-1)) iteration_pageranks_output = ITERATION_PAGERANKS_PREFIX + str(i) iteration_max_diff_output = ITERATION_MAX_DIFF_PREFIX + str(i) iteration_bound = iteration.bind({ "INPUT_PATH": iteration_input, "DAMPING_FACTOR": DAMPING_FACTOR, "NUM_USERS": num_users, "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output, "MAX_DIFF_OUTPUT_PATH": iteration_max_diff_output }) iteration_stats = iteration_bound.runSingle() # If we're below the convergence_threshold break out of the loop. max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0))) if max_diff < CONVERGENCE_THRESHOLD: print "Max diff %s under convergence threshold. Stopping." % max_diff break elif i == MAX_NUM_ITERATIONS-1: print "Max diff %s above convergence threshold but hit max number of iterations. Stopping." \ % max_diff else: print "Max diff %s above convergence threshold. Continuing." % max_diff iteration_pagerank_result = ITERATION_PAGERANKS_PREFIX + str(i) # Postprocesing step: print "Starting postprocessing step." postprocess = Pig.compileFromFile(POSTPROCESS_SCRIPT) postprocess_bound = postprocess.bind({ "PAGERANKS_INPUT_PATH": iteration_pagerank_result, "USERNAMES_INPUT_PATH": USERNAMES_INPUT, "TOP_N": NUM_TOP_USERS, "OUTPUT_BUCKET": OUTPUT_BUCKET }) postprocess_stats = postprocess_bound.runSingle()
#!/usr/bin/python

import sys

from org.apache.pig.scripting import Pig
from bidipig import runbidi

# make minhash clusters
minhash = Pig.compileFromFile('src/main/pig/minhash.pig')

osrc = src = sys.argv[1]
destminhash = sys.argv[2] + '-minhash'
dest = sys.argv[2] + '-jaccard'
minjaccard = 80
bound = minhash.bind()
job = bound.runSingle()
if not job.isSuccessful():
    raise Exception('failed in minhash')

# output is pairs and scores

# make transitive closure of clusters
src = dest
dest = sys.argv[2] + '-bidi'
runbidi(src, dest)

# join with original data
join = Pig.compileFromFile('src/main/pig/join.pig')
src = osrc
keys = dest
import sys
import random
import tempfile

from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]    # pig script to run iteratively
data_dir = sys.argv[2]      # directory where intermediate weights will be written
features = sys.argv[3]      # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))
    w0_fields.append({"name": "w%s" % i, "type": 25, "schema": None})  # See Pig's DataType.java

path = tempfile.mkdtemp()
w0 = open("%s/part-r-00000" % path, 'w')
w0.write("\t".join(weights) + "\n")
w0.close()
#!/usr/bin/python

from org.apache.pig.scripting import Pig
import time

P = Pig.compile("""
InOut = LOAD '$in_links' using PigStorage('\t') as (home_url: chararray, links:{ link: ( url: chararray ) } );
InPagerank = LOAD '$in_pagerank' using PigStorage('\t') as (home_url: chararray, rank : float);
InData = JOIN InPagerank by home_url, InOut by home_url;
Data = FOREACH InData GENERATE InOut::home_url as url, InPagerank::rank as rank, InOut::links as links;
outbound_pagerank = FOREACH Data GENERATE rank/COUNT(links) AS pagerank_transfer, FLATTEN (links) AS outbound_links;
new_pagerank = FOREACH (GROUP outbound_pagerank BY outbound_links) GENERATE group AS url, 0.15 + 0.85 * SUM(outbound_pagerank.pagerank_transfer) AS pagerank;
STORE new_pagerank INTO '$out' USING PigStorage('\t');
""")

params = {
    'in_links': './data/output_links.txt',
    'in_pagerank': './data/output_rank.txt'
}

times = []
for i in range(30):
    print("Iteration " + str(i))
def run_pagerank(self): """ Calculates pageranks for directed graph of nodes and edges. Three main steps: 1. Preprocessing: Process input data to: a) Count the total number of nodes. b) Prepare initial pagerank values for all nodes. 2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the nodes with edges going into the given node. 3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names. """ # Preprocessing step: print "Starting preprocessing step." preprocess = Pig.compileFromFile(self.preprocessing_script) preprocess_params = { "INPUT_PATH": self.edges_input, "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks, "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes } preprocess_bound = preprocess.bind(preprocess_params) preprocess_stats = preprocess_bound.runSingle() # Update convergence threshold based on the size of the graph (number of nodes) num_nodes = long( str(preprocess_stats.result("num_nodes").iterator().next().get(0))) convergence_threshold = long(self.convergence_threshold * num_nodes * num_nodes) print "Calculated convergence threshold for %d nodes: %d" % ( num_nodes, convergence_threshold) # Iteration step: iteration = Pig.compileFromFile(self.iteration_script) for i in range(self.max_num_iterations): print "Starting iteration step: %s" % str(i + 1) # Append the iteration number to the input/output stems iteration_input = self.preprocess_pageranks if i == 0 else ( self.iteration_pageranks_prefix + str(i - 1)) iteration_pageranks_output = self.iteration_pageranks_prefix + str( i) iteration_rank_changes_output = self.iteration_rank_changes_prefix + str( i) iteration_bound = iteration.bind({ "INPUT_PATH": iteration_input, "DAMPING_FACTOR": self.damping_factor, "NUM_NODES": num_nodes, "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output, "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output }) iteration_stats = iteration_bound.runSingle() # If we're below the convergence threshold break out of the loop. aggregate_rank_change = long( str( iteration_stats.result( "aggregate_rank_change").iterator().next().get(0))) if aggregate_rank_change < convergence_threshold: print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \ % (aggregate_rank_change, convergence_threshold) break elif i == self.max_num_iterations - 1: print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \ ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \ "Stopping." else: print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \ % (aggregate_rank_change, convergence_threshold) iteration_pagerank_result = self.iteration_pageranks_prefix + str(i) # Postprocesing step: print "Starting postprocessing step." postprocess = Pig.compileFromFile(self.postprocessing_script) postprocess_params = { "PAGERANKS_INPUT_PATH": iteration_pagerank_result } if self.output_path is not None: # otherwise, the script outputs to the default location, # which is a special directory in s3://mortar-example-output-data # permissioned for your Mortar account. postprocess_params["OUTPUT_PATH"] = self.output_path postprocess_bound = postprocess.bind(postprocess_params) postprocess_stats = postprocess_bound.runSingle()
from org.apache.pig.scripting import Pig
from pagerank_lib import Pagerank

if __name__ == "__main__":
    params = Pig.getParameters()
    try:
        input_path = params["INPUT_PATH"]
        output_path = params["OUTPUT_PATH"]
        tmp_output_dir = params["TMP_OUTPUT_DIR"]
    except:
        print "Usage: mortar baconbits:[local_]run pagerank " + \
              "-p INPUT_PATH=<...> -p OUTPUT_PATH=<...> -p TMP_OUTPUT_DIR=<...> "

    damping_factor = 0.85
    if "DAMPING_FACTOR" in params:
        damping_factor = float(params["DAMPING_FACTOR"])

    convergence_threshold = 0.001
    if "CONVERGENCE_THRESHOLD" in params:
        convergence_threshold = float(params["CONVERGENCE_THRESHOLD"])

    max_num_iterations = 10
    if "MAX_NUM_ITERATIONS" in params:
        max_num_iterations = int(params["MAX_NUM_ITERATIONS"])

    id_name_map = None
    if "ID_NAME_MAP" in params:
        id_name_map = params["ID_NAME_MAP"]

    Pagerank.run_pagerank(
        input_path,
for i in range(10, 32):
    day = prefix + str(i)
    par = {}
    par['DATA_DATE'] = day
    par['REPORT_DATE'] = "2013/07/" + str(i)
    params.append(par)

prefix = "2013080"
for i in range(1, 10):
    day = prefix + str(i)
    par = {}
    par['DATA_DATE'] = day
    par['REPORT_DATE'] = "2013/08/0" + str(i)
    params.append(par)

Pig.registerUDF("attribute_click.py", "myfuncs")

script = """
%declare OUTPUT '/user/haliu'

applypair = LOAD '/data/tracking/JobApplyClickEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');
applypair = foreach applypair generate header.memberId as memberId, jobId, header.time as time;

member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();

applypair = join applypair by memberId, member by memberId parallel 2000;
applypair = foreach applypair generate applypair::memberId as memberId, applypair::jobId as jobId, applypair::time as time;
applypair = distinct applypair parallel 1;

store applypair into '$OUTPUT/JYMBII-batch/history/positive/$REPORT_DATE' USING BinaryJSON('memberId');
"""
if __name__ == '__main__':
    from org.apache.pig.scripting import Pig
    import sys

    P = Pig.compileFromFile('/home/course/lian9478/task3.pig')
    params = {}
    for i in range(int(sys.argv[1])):
        if i == 0:
            out = '/home/course/lian9478/HW4-old_twitter_account_rank.csv'
        else:
            out = "out/pagerank_data_" + str(i + 1)
        params['doc_in'] = out
        params['doc_out'] = "out/pagerank_data_" + str(i + 2)
        bound = P.bind(params)
        bound.runSingle()  # this is to do it one by one instead of parallel

# so you can call this driver like this:
# pig -x local -embedded jython driver.py 20
_out = _in + '_counts_m' + _min_count
_out_nc = _out + '/count'
_out_v = _out + '/vocab'
_out_nf = _out + '/nfollow'
_out_np = _out + '/nprecede'
_out_nfp = _out + '/nfollowerprecede'
_out_njc = _out + '/countsjoined'

##
# start actual pig jobs
#
from org.apache.pig.scripting import Pig

# if output path does not exist, create it
if Pig.fs('-test -d ' + _out):
    Pig.fs('mkdir ' + _out)

##
# CountJob
#
# if output path of countjob already exists, skip it; otherwise run the job
##
if not Pig.fs('-test -d ' + _out_nc):
    print '\nPath ("%s") already exists, skipping job.\n' % _out_nc
else:
    result = Pig.compile(_header + """
        count_ngrams( '${in}', '${out}', '${min_count}' );
    """).bind({'in': _in, 'out': _out_nc, 'min_count': _min_count, 'n': 'count-ngrams'}).runSingle()

    # check the result
    if not result.isSuccessful():
#!/usr/bin/python
from __future__ import with_statement

import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
import globalVars

from org.apache.pig.scripting import Pig

sys.path.append('../lib/jyson-1.0.2.jar')
from com.xhaus.jyson import JysonCodec as json

from subprocess import call
import java.util as util
import java.io as javaio
import csv

Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/piggybankExtended.jar")

# Jar to use AVRO
#Pig.registerJar("/usr/hdp/current/pig-client/lib/avro-1.7.5.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/json-simple-1.1.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-core-asl-1.9.13.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-mapper-asl-1.9.13.jar")

if len(sys.argv) != 2:
    print "Usage: " + sys.argv[0] + " parameters-file"
    sys.exit(1)

paramFile = sys.argv[1]

try:
    props = util.Properties()
import sys
from org.apache.pig.scripting import Pig

load = Pig.compileFromFile(sys.argv[1])
iteration = Pig.compileFromFile('iteration.pig')
store = Pig.compileFromFile('store.pig')

print '*** Loading input ***'
load_stats = load.bind({'EDGES_OUT': 'edges0.tmp'}).runSingle()
if not load_stats.isSuccessful():
    raise Exception('Load failed')

i = 1
stable_inerations = 0
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''
while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({
        'EDGES_IN': edges_in,
        'EDGES_OUT': edges_out,
        'CONVERGENCE_OUT': 'convergence.tmp'
    })
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')

    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
postString += "Result = FOREACH Result GENERATE " + fsDic['genFields'] + ";\n" postString += "Result = FOREACH Result GENERATE * AS (" + fsDic['genSchema'] + ");\n" # postString += "Result = FOREACH Result GENERATE " + currentAction+ "Result::UserId AS UserId, *;\n" # postString += "DESCRIBE Result;\n" # A1BResult = JOIN BResult BY UserId, CResult By UserId; pigString += postString pigString += """ DUMP Result; DESCRIBE Result; """ print(pigString) # with open('cyygeneratedPig.pig','w') as outFile: # outFile.write(pigString) if USE_PIG: P = Pig.compile(pigString) # P = Pig.compileFromFile('pig_bcd_bc.pig') # run the pig script if True: result = P.bind().runSingle() if result.isSuccessful(): print 'run success' else: raise 'run failed'
System.exit(0)

if fs.exists(parsedDir):
    # parsed-captures
    if (not fs.exists(parsedCaptures) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(parsedCaptures).getModificationTime()):
        print 'LOG: Graph parsed-captures create'
        fs.delete(parsedCaptures, True)
        params = {
            'INPUT': str(parsedDir),
            'OUTPUT': str(parsedCaptures),
            'JOBNAME': str(collection) + ' parsed-captures'
        }
        job = Pig.compileFromFile('pig/parsed-captures.pig').bind(params)
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if (not fs.exists(linkGraph) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(linkGraph).getModificationTime()):
        print 'LOG: Graph link-graph create'
        fs.delete(linkGraph, True)
        params = {
            'INPUT': str(parsedDir),
def run_pagerank(edges_input,
                 output_path,
                 tmp_output_dir,
                 damping_factor=0.85,
                 convergence_threshold=0.0001,
                 max_num_iterations=10,
                 id_name_map=None,
                 preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                 iteration_script="../pigscripts/pagerank_iterate.pig"):
    """
    Calculates pageranks for directed graph of nodes and edges.

    Three main steps:
        1. Preprocessing: Process input data to:
             a) Count the total number of nodes.
             b) Prepare initial pagerank values for all nodes.
        2. Iteration: Calculate new pageranks for each node based on the previous pageranks
           of the nodes with edges going into the given node.
        3. Postprocessing: Order nodes by pagerank.
           Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
           to get human-readable names.
    """
    preprocess_dir = "%s/preprocess" % tmp_output_dir
    iteration_dir = "%s/iteration" % tmp_output_dir

    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig").bind({
        "INPUT_PATH": edges_input,
        "PAGERANKS_OUTPUT_PATH": "%s/pageranks" % preprocess_dir,
        "NUM_NODES_OUTPUT_PATH": "%s/num_nodes" % preprocess_dir
    }).runSingle()

    # Update convergence threshold based on the size of the graph (number of nodes)
    num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
    print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)

    # Iteration step:
    def iteration_param_func(it_num, it_dir):
        if it_num == 1:
            iteration_input = "%s/pageranks" % preprocess_dir
        else:
            iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

        return {
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": damping_factor,
            "NUM_NODES": num_nodes,
            "PAGERANKS_OUTPUT_PATH": "%s/%d/pageranks" % (it_dir, it_num),
            "AGG_RANK_CHANGE_OUTPUT_PATH": "%s/%d/rank_changes" % (it_dir, it_num)
        }

    iteration_result = IterationUtils.iterate_until_convergence(
        "../pigscripts/pagerank_iterate.pig",  # the pigscript to iterate
        iteration_dir,                         # temporary iteration outputs will be stored here
        iteration_param_func,                  # takes iteration #, returns Pig parameter dictionary
        "Sum of ordering-rank changes",        # name of the convergence metric
        int,                                   # Python type of the convergence metric
        "aggregate_rank_change",               # alias in the pigscript where the metric is stored to
        convergence_threshold,                 # stop when metric less than this
        max_num_iterations                     # or if this many iterations have been performed
    )

    # Postprocessing step:
    print "Starting postprocessing step."
postprocess_script = """ pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double); pageranks = FILTER pageranks BY pagerank IS NOT NULL; """ if id_name_map: postprocess_script += """ id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray); with_names = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank; ordered = ORDER with_names BY pagerank DESC; rmf $OUTPUT_PATH; STORE ordered INTO '$OUTPUT_PATH' USING PigStorage(); """ postprocess = Pig.compile(postprocess_script).bind({ "PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]), "ID_NAME_MAP_INPUT_PATH" : id_name_map, "OUTPUT_PATH" : output_path }).runSingle() else: postprocess_script += """ ordered = ORDER pageranks BY pagerank DESC; rmf $OUTPUT_PATH; STORE ordered INTO '$OUTPUT_PATH' USING PigStorage(); """ postprocess = Pig.compile(postprocess_script).bind({ "PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]), "OUTPUT_PATH" : output_path }).runSingle() Pig.fs("rmr %s" % preprocess_dir) Pig.fs("rmr %s" % iteration_dir)
params = []
for i in range(bucket):
    par = {}
    par['INDEX'] = str(i)
    params.append(par)

script = """
data = load '/user/hgui/JYMBII-batch/TMP/history' USING BinaryJSON();
mem = foreach data generate memberId;
mem = distinct mem parallel 100;
mem = sample mem 0.5;

data = join data by memberId, mem by memberId;
data = distinct data parallel 1;
data = foreach data generate data::label as label, data::class as class, data::memberId as memberId, data::jobId as jobId, data::score as score;

store data into '/user/hgui/JYMBII-batch/TMP/bucket-$INDEX';
"""

prog = Pig.compile(script)

for para in params:
    bound = prog.bind(para)
    stats = bound.runSingle()
def main(): filename = "studenttab10k" k = 4 tolerance = 0.01 MAX_SCORE = 4 MIN_SCORE = 0 MAX_ITERATION = 100 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i != k - 1: initial_centroids = initial_centroids + ":" # Compile Pig script. Register the same script since it contains the Jython UDF. # $centroids is the only binding parameter. It will be bound to different parameter with the # estimation for centroid from the last round. Then we calculate the average of the new clusters # to get the new centroid estimation, and store into "output" P = Pig.compile("""register 'kmeans.py' using jython as util; raw = load 'studenttab10k' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, util.findCentroid('$centroids', gpa) as centroid; grouped = group centroided by centroid; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'output'; """) converged = False iter_num = 0 while iter_num < MAX_ITERATION: # Binding parameter centroids to current centroids Q = P.bind({'centroids': initial_centroids}) # Run Pig script results = Q.runSingle() # Check the result of the Pig script if results.isSuccessful() == "FAILED": raise "Pig job failed" # Get the new centroids from the output iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # Calculate the moving distance with last iteration for i in range(k): tuple = iter.next() centroids[i] = float(str(tuple.get(1))) distance_move = distance_move + fabs(last_centroids[i] - centroids[i]) distance_move = distance_move / k Pig.fs("rmr output") print("iteration " + str(iter_num)) print("average distance moved: " + str(distance_move)) # Converge if distance_move < tolerance: sys.stdout.write("k-means converged at centroids: [") sys.stdout.write(",".join(str(v) for v in centroids)) sys.stdout.write("]\n") converged = True break # Not converge, use the new centroids as the initial centroids for next iteration last_centroids = centroids[:] initial_centroids = "" for i in range(k): initial_centroids = initial_centroids + str(last_centroids[i]) if i != k - 1: initial_centroids = initial_centroids + ":" iter_num += 1 # Not converge after MAX_ITERATION if not converged: print("not converge after " + str(iter_num) + " iterations") sys.stdout.write("last centroids: [") sys.stdout.write(",".join(str(v) for v in last_centroids)) sys.stdout.write("]\n")
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    nodes_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-nodes"
    edges_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-edges"
    preprocess_vector_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/vector"
    preprocess_matrix_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/matrix"
    preprocess_num_vertices_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/num_vertices"
    iteration_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/iteration_"
    max_diff_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/max_diff_"
    postprocess_pageranks_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/pageranks"

    damping_factor = 0.85

    preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig")
    preprocess_bound = preprocess.bind({
        "EDGES_INPUT_PATH": edges_input,
        "VECTOR_OUTPUT_PATH": preprocess_vector_output,
        "MATRIX_OUTPUT_PATH": preprocess_matrix_output,
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
        "DAMPING_FACTOR": damping_factor
    })
    preprocess_stats = preprocess_bound.runSingle()

    num_vertices = int(str(preprocess_stats.result("num_vertices_copy").iterator().next().get(0)))

    iteration = Pig.compileFromFile("../pigscripts/pagerank_iterate.pig")
    max_num_iterations = 7
    num_iterations = 0
    convergence_threshold = 0.15 / float(num_vertices)

    for i in range(1, max_num_iterations + 1):
        iteration_vector_input = preprocess_vector_output if i == 1 else (iteration_output_stem + str(i - 1))
        iteration_matrix_input = preprocess_matrix_output
        iteration_output = iteration_output_stem + str(i)
        max_diff_output = max_diff_output_stem + str(i)

        iteration_bound = iteration.bind({
            "VECTOR_INPUT_PATH": iteration_vector_input,
            "MATRIX_INPUT_PATH": iteration_matrix_input,
            "ITERATION_OUTPUT_PATH": iteration_output,
            "MAX_DIFF_OUTPUT_PATH": max_diff_output,
            "NUM_VERTICES": num_vertices,
            "DAMPING_FACTOR": damping_factor
        })
        iteration_stats = iteration_bound.runSingle()

        num_iterations += 1

        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            break

    result_vector = iteration_output_stem + str(num_iterations)

    postprocess = Pig.compileFromFile("../pigscripts/pagerank_postprocess.pig")
    postprocess_bound = postprocess.bind({
        "NODES_INPUT_PATH": nodes_input,
        "RESULT_VECTOR": result_vector,
        "OUTPUT_PATH": postprocess_pageranks_output
    })
    postprocess_bound.runSingle()
MIN_SCORE = 0 MAX_ITERATION = 100 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register udf.jar; DEFINE find_centroid FindCentroid('$centroids'); raw = load 'student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, find_centroid(gpa) as centroid; grouped = group centroided by centroid parallel 2; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'kmoutput'; """) converged = False iter_num = 0 while iter_num<MAX_ITERATION: Q = P.bind({'centroids':initial_centroids}) results = Q.runSingle() if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration
#!/usr/bin/python

import time
import sys
from org.apache.pig.scripting import Pig

if __name__ == '__main__':
    P = Pig.compileFromFile("""calvisit.pig""")
    defaulttime = time.time()
    deadlinesec = defaulttime - 1800
    deadline = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(deadlinesec))
    if len(sys.argv) > 1:
        deadline = sys.argv[1]
    Q = P.bind({'deadline': deadline, 'input': 'input', 'result': 'result', 'inputtmp': 'inputtmp'})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    else:
        print results
from math import ceil, log

from org.apache.pig.scripting import Pig

if __name__ == "__main__":
    params = Pig.getParameters()
    graph = params["GRAPH"]
    seed_vertices = params["SEED_VERTICES"]
    tmp_dir = params["TMP_DIR"]
    output_path = params["OUTPUT_PATH"]
    nhood_size = int(params["NEIGHBORHOOD_SIZE"])

    preprocess_graph = "%s/preprocess/graph" % tmp_dir
    preprocess_num_vertices = "%s/preprocess/num_vertices" % tmp_dir
    iteration_verts_prefix = "%s/iteration/vertices_" % tmp_dir

    print "Graph Sampler: starting preprocessing step."
    preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({
        "GRAPH_INPUT_PATH": graph,
        "GRAPH_OUTPUT_PATH": preprocess_graph,
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices
    }).runSingle()

    iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig")
    num_iterations = nhood_size - 1
    num_vertices = long(str(preprocessing.result("num_vertices").iterator().next().get(0)))

    print "Graph Sampler: scheduling %d iterations" % num_iterations
    for i in range(num_iterations):
        print "Graph Sampler: starting iteration step %d" % (i + 1)
        iteration = iteration_script.bind({
            "VERTICES_INPUT_PATH": seed_vertices if i == 0 else (iteration_verts_prefix + str(i - 1)),
def run_pagerank(self): """ Calculates pageranks for directed graph of nodes and edges. Three main steps: 1. Preprocessing: Process input data to: a) Count the total number of nodes. b) Prepare initial pagerank values for all nodes. 2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the nodes with edges going into the given node. 3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names. """ # Preprocessing step: print "Starting preprocessing step." preprocess = Pig.compileFromFile(self.preprocessing_script) preprocess_params = { "INPUT_PATH": self.edges_input, "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks, "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes } preprocess_bound = preprocess.bind(preprocess_params) preprocess_stats = preprocess_bound.runSingle() # Update convergence threshold based on the size of the graph (number of nodes) num_nodes = long(str(preprocess_stats.result("num_nodes").iterator().next().get(0))) convergence_threshold = long(self.convergence_threshold * num_nodes * num_nodes) print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) # Iteration step: iteration = Pig.compileFromFile(self.iteration_script) for i in range(self.max_num_iterations): print "Starting iteration step: %s" % str(i + 1) # Append the iteration number to the input/output stems iteration_input = self.preprocess_pageranks if i == 0 else (self.iteration_pageranks_prefix + str(i-1)) iteration_pageranks_output = self.iteration_pageranks_prefix + str(i) iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(i) iteration_bound = iteration.bind({ "INPUT_PATH": iteration_input, "DAMPING_FACTOR": self.damping_factor, "NUM_NODES": num_nodes, "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output, "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output }) iteration_stats = iteration_bound.runSingle() # If we're below the convergence threshold break out of the loop. aggregate_rank_change = long(str(iteration_stats.result("aggregate_rank_change").iterator().next().get(0))) if aggregate_rank_change < convergence_threshold: print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \ % (aggregate_rank_change, convergence_threshold) break elif i == self.max_num_iterations-1: print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \ ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \ "Stopping." else: print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \ % (aggregate_rank_change, convergence_threshold) iteration_pagerank_result = self.iteration_pageranks_prefix + str(i) # Postprocesing step: print "Starting postprocessing step." postprocess = Pig.compileFromFile(self.postprocessing_script) postprocess_params = { "PAGERANKS_INPUT_PATH": iteration_pagerank_result } if self.output_path is not None: # otherwise, the script outputs to the default location, # which is a special directory in s3://mortar-example-output-data # permissioned for your Mortar account. postprocess_params["OUTPUT_PATH"] = self.output_path postprocess_bound = postprocess.bind(postprocess_params) postprocess_stats = postprocess_bound.runSingle()
MIN_SCORE = 0 MAX_ITERATION = 5 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register udf.jar DEFINE find_centroid FindCentroid('$centroids'); raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, find_centroid(gpa) as centroid; grouped = group centroided by centroid; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'output'; """) converged = False iter_num = 0 while iter_num<MAX_ITERATION: Q = P.bind({'centroids':initial_centroids}) results = Q.runSingle() if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration
MIN_SCORE = 0 MAX_ITERATION = 100 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i != k - 1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register udf.jar; DEFINE find_centroid FindCentroid('$centroids'); raw = load 'student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, find_centroid(gpa) as centroid; grouped = group centroided by centroid parallel 2; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'kmoutput'; """) converged = False iter_num = 0 while iter_num < MAX_ITERATION: Q = P.bind({'centroids': initial_centroids}) results = Q.runSingle() if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration
def iterate_until_convergence(script_path,
                              iteration_dir,
                              param_generator_func,
                              metric_name,
                              metric_type,
                              metric_alias,
                              metric_threshold,
                              max_num_iterations):
    """
    Utility for running a pigscript which outputs data in the same schema as its input
    iteratively, with the output of the previous run being the input of the next run.
    Stops when some convergence metric has been reached or if a maximum number of
    iterations has been reached.

    Example usage:

        iteration_result = IterationUtils.iterate_until_convergence(
            "../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations                    # or if this many iterations have been performed
        )

    iteration_result is a dictionary with the number of iterations performed
    ("num_iterations") and the reason iteration stopped ("stop_reason").

    Example iteration_param_func:

        def iteration_param_func(it_num, it_dir):
            if it_num == 1:
                iteration_input = preprocess_dir + "/pageranks"
            else:
                iteration_input = it_dir + "/" + str(it_num-1) + "/pageranks"

            return {
                "INPUT_PATH": iteration_input,
                "DAMPING_FACTOR": damping_factor,
                "NUM_NODES": num_nodes,
                "PAGERANKS_OUTPUT_PATH": it_dir + "/" + str(it_num) + "/pageranks",
                "AGG_RANK_CHANGE_OUTPUT_PATH": it_dir + "/" + str(it_num) + "/rank_changes"
            }
    """
    script = Pig.compileFromFile(script_path)

    for i in range(1, max_num_iterations + 1):
        print "Starting iteration step: %d" % i
        iteration = script.bind(param_generator_func(i, iteration_dir)).runSingle()

        metric_value = metric_type(str(iteration.result(metric_alias).iterator().next().get(0)))
        if metric_value < metric_threshold:
            print "%s %s under convergence threshold %s. Stopping." \
                  % (metric_name, str(metric_value), str(metric_threshold))
            return {"num_iterations": i, "stop_reason": "CONVERGED"}
        elif i == max_num_iterations:
            print "%s %s above convergence threshold %s but hit max number of iterations. Stopping." \
                  % (metric_name, str(metric_value), str(metric_threshold))
            return {"num_iterations": i, "stop_reason": "MAX_ITERATIONS"}
        else:
            print "%s %s above convergence threshold %s. Continuing." \
                  % (metric_name, str(metric_value), str(metric_threshold))
%declare DIR 'impression-inter-neg'

data = load '/user/hgui/JYMBII-batch/history/$DIR/2013/$MM/$DD' USING BinaryJSON();
data = join score by (memberId, jobId), data by (memberId, jobId) parallel 500;
data = foreach data generate score::memberId as memberId, score::jobId as jobId, score::score as score;
data = distinct data parallel 1;
store data into '/user/hgui/JYMBII-batch/history/$DIR/tmp-2013/$MM/$DD' USING BinaryJSON('memberId');

---------------------------------------------------------------------------------------------------------

%declare DIR 'positive'

data = load '/user/hgui/JYMBII-batch/history/$DIR/2013/$MM/$DD' USING BinaryJSON();
data = join score by (memberId, jobId), data by (memberId, jobId) parallel 500;
data = foreach data generate score::memberId as memberId, score::jobId as jobId, score::score as score;
data = distinct data parallel 1;
store data into '/user/hgui/JYMBII-batch/history/$DIR/tmp-2013/$MM/$DD' USING BinaryJSON('memberId');

---------------------------------------------------------------------------------------------------------

%declare DIR 'view'

data = load '/user/hgui/JYMBII-batch/history/$DIR/2013/$MM/$DD' USING BinaryJSON();
data = join score by (memberId, jobId), data by (memberId, jobId) parallel 500;
data = foreach data generate score::memberId as memberId, data::time as time, score::jobId as jobId, score::score as score;
data = distinct data parallel 1;
store data into '/user/hgui/JYMBII-batch/history/$DIR/tmp-2013/$MM/$DD' USING BinaryJSON('memberId');
"""

prog = Pig.compile(script)

for para in params:
    bound = prog.bind(para)
    stats = bound.runSingle()
if len(sys.argv) != 5: print "Usage: " + sys.argv[ 0] + " tenantCode start-date end-date parameters-file" print "Data format: yyyy/MM/dd" sys.exit() tenantCode = sys.argv[1] startDate = sys.argv[2] endDate = sys.argv[3] paramFile = sys.argv[4] minObjectId = globalVars.dateToObjectId(startDate) maxObjectId = globalVars.dateToObjectId(endDate) Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar") Pig.registerJar("../lib/yucca-phoenix-pig.jar") try: props = util.Properties() propertiesfis = javaio.FileInputStream("mongo_parameters_prod.txt") props.load(propertiesfis) except: print "Errore leggendo mongo_parameters_prod.txt: ", sys.exc_info()[0] sys.exit(1) mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty( 'mongoPort') + "/DB_SUPPORT" mongo2 = " -u " + props.getProperty('mongoUsr') mongo3 = " -p " + props.getProperty( 'mongoPwd') + ''' --authenticationDatabase admin --quiet --eval "'''
MIN_SCORE = 0 MAX_ITERATION = 5 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i != k - 1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register udf.jar DEFINE find_centroid FindCentroid('$centroids'); raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, find_centroid(gpa) as centroid; grouped = group centroided by centroid; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'output'; """) converged = False iter_num = 0 while iter_num < MAX_ITERATION: Q = P.bind({'centroids': initial_centroids}) results = Q.runSingle() if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration
def __init__(self, script_name, description, script_path, script_params, checkpoint_path):
    self.script_name = script_name
    self.description = description
    self.bound_script = Pig.compileFromFile(script_path).bind(script_params)
    self.flag_file_path = "%s/%s.success" % (checkpoint_path, os.path.splitext(script_name)[0])
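# Hypothetical usage sketch: assuming the enclosing checkpoint-step class is named
# PigWorkflowStep, this pairs the constructor above with the run() method shown in
# an earlier snippet, which touches the .success flag file when the script succeeds.
step = PigWorkflowStep(
    script_name='clean_events.pig',
    description='Remove malformed events before aggregation',
    script_path='pigscripts/clean_events.pig',
    script_params={'INPUT': '/data/raw', 'OUTPUT': '/data/clean'},
    checkpoint_path='/tmp/checkpoints',
)
step.run()  # creates /tmp/checkpoints/clean_events.success on success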
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};

jsonified = FOREACH limited GENERATE rowkey, colkey, com.reddit.pig.TO_JSON(value);
STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs

# register the reddit udfs
Pig.registerJar(SCRIPT_ROOT + "reddit-pig-udfs.jar")

# process rels
for rel, (cf, thing2_type) in relations.iteritems():
    # build source for a script
    script = "SET default_parallel 10;"
    script += load_rels

    if "inbox" in rel:
        script += load_and_map_data
        script += add_unread
    else:
        script += add_relname
        script += load_things

    script += generate_rel_items
    script += store_top_1000_per_rowkey
import sys
import random
import tempfile

from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]    # pig script to run iteratively
data_dir = sys.argv[2]      # directory where intermediate weights will be written
features = sys.argv[3]      # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))
    w0_fields.append({
        "name": "w%s" % i,
        "type": 25,
        "schema": None
    })  # See Pig's DataType.java

path = tempfile.mkdtemp()
def run(self):
    print project_name + ": " + self.action
    compiled = Pig.compileFromFile(self.script)
    bound = compiled.bind(self.params)
    return bound.runSingle()
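# Hypothetical usage sketch: assumes the enclosing class stores `script` (a .pig
# file path), `params` (a parameter dict), and `action` (a label), and that
# `project_name` is defined at module level; the class name PigTask is illustrative.
task = PigTask(script='pigscripts/export.pig',
               params={'OUTPUT': '/tmp/export'},
               action='export')
stats = task.run()
if not stats.isSuccessful():
    raise Exception('Pig task "export" failed')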
script = get_script(aggregateby=aggregateby)

if mode == 'master':
    logging.warning('unsubstituted script:\n')
    logging.warning(script)
    logging.warning('substituted script:\n')
    logging.warning(PrintScript(args=args, sub=True))
    args['mode'] = 'pigmode'
    run_bash('pig %s %s' % (self_name(), ' '.join('='.join(map(str, [k, v])) for k, v in args.items())))
    #run_bash('pig %s mode=pigmode output=%s dryrun=%s ldadata=%s titledata=%s aggregateby=%s'%(self_name(), output, str(dryrun), ldadata, titledata, aggvalue))
elif mode == 'pigmode':
    if not dryrun:
        from org.apache.pig.scripting import Pig
        P = Pig.compile(script)
        result = P.bind({
            'output': output,
            'selfname': self_name(),
            'ldadata': ldadata,
            'titledata': titledata,
            'mylibloc': mylibloc
        }).runSingle()
        if result.isSuccessful():
            print 'Pig job succeeded!'
        else:
            raise Exception('Pig job failed!')
    else:
        print 'unsubstituted:'
        print PrintScript(args=args, sub=False)
#     last_centroids[i] = (MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE), MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE))
#     initial_centroids = initial_centroids + str(last_centroids[i])
#     if i != k-1:
#         initial_centroids = initial_centroids + ":"
initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
last_centroids = [(-120.0, -120.0), (-60.0, -60.0), (0.0, 0.0), (60.0, 60.0), (120.0, 120.0)]
print initial_centroids

P = Pig.compile("""register /Users/yun_shen/Desktop/spams/pigudf.jar
DEFINE find_centroid FindCentroid('$centroids');
raw_data = load '1.log' as (spam_id:chararray, longitude:double, latitude:double);
raw = filter raw_data by longitude is not null and latitude is not null;
centroided = foreach raw generate spam_id, longitude, latitude, find_centroid(longitude, latitude) as centroid;
grouped = group centroided by centroid parallel 4;
store grouped into 'grouped';
result = foreach grouped generate group, AVG(centroided.longitude), AVG(centroided.latitude);
store result into 'output';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = []
    x = 0.0
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};

jsonified = FOREACH limited GENERATE rowkey, colkey, com.verbify.pig.TO_JSON(value);
STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs

# register the verbify udfs
Pig.registerJar(SCRIPT_ROOT + "verbify-pig-udfs.jar")

# process rels
for rel, (cf, thing2_type) in relations.iteritems():
    # build source for a script
    script = "SET default_parallel 10;"
    script += load_rels

    if "inbox" in rel:
        script += load_and_map_data
        script += add_unread
    else:
        script += add_relname
        script += load_things

    script += generate_rel_items
    script += store_top_1000_per_rowkey
if len(sys.argv) != 5: print "Usage: " + sys.argv[ 0] + " tenantCode start-date end-date parameters-file" print "Data format: yyyy/MM/dd" sys.exit() tenantCode = sys.argv[1] startDate = sys.argv[2] endDate = sys.argv[3] paramFile = sys.argv[4] minObjectId = globalVars.dateToObjectId(startDate) maxObjectId = globalVars.dateToObjectId(endDate) Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar") Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar") Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar") Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar") Pig.registerJar("../lib/yucca-phoenix-pig.jar") Pig.registerJar("/usr/hdp/current/pig-client/piggybank.jar") try: props = util.Properties() propertiesfis = javaio.FileInputStream(paramFile) props.load(propertiesfis) except: print "Errore leggendo mongo_parameters_prod.txt: ", sys.exc_info()[0] sys.exit(1) mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty(
#!/usr/bin/python

# explicitly import Pig class
from org.apache.pig.scripting import Pig

# COMPILE: compile method returns a Pig object that represents the pipeline
P = Pig.compile("""a = load '$input' using PigStorage() as (name:chararray, age:int, gpa:double);
a1 = filter a by age > 18;
a2 = foreach a1 generate name, ROUND(gpa) as gpa;
b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration:chararray, contributions:double);
c = join a2 by name, b by name;
d = group c by registration;
e = foreach d generate group, AVG(c.gpa) as gpa;
f = order e by gpa desc;
store f into '$output';
""")

results = P.bind({'input': 'studenttab10k', 'output': 'output'}).runSingle()
if not results.isSuccessful():
    raise Exception("Pig job failed")

iter = results.result("f").iterator()
while iter.hasNext():
    tuple = iter.next()
    print tuple
import java.io as javaio

props = util.Properties()
# add try catch for this
propertiesfis = javaio.FileInputStream("mongo_parameters_prod.txt")
props.load(propertiesfis)

mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty('mongoPort') + "/DB_SUPPORT"
mongo2 = " -u " + props.getProperty('mongoUsr')
mongo3 = " -p " + props.getProperty('mongoPwd') + ''' --authenticationDatabase admin --quiet --eval "'''
# var param1=438; var param2=1; var param3='datalake'" delete_dataset.js

Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar")
Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar")
Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar")
Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
#Pig.registerJar("../lib/yucca-phoenix-pig.jar")

if mode in ["APPEND", "append"]:
    # read from metadata source (mongoDB) lastIdDatalake2Speed for tenant
    readLastIdJob = Pig.compileFromFile("""read_mongo_lastIdDatalake2Speed.pig""")
    results = readLastIdJob.bind({'tenantCode': tenantCode}).runSingle()
    if results.isSuccessful():
        print "Pig job succeeded"
        iter = results.result("lastId").iterator()
        if iter.hasNext():
            lastId = iter.next()