Example #1
 def run(self, params, script_name, script_file, elements=[]):
     ''' Execute pig. '''
     pig = Pig.compileFromFile(script_name, script_file)
     bound = pig.bind(params)
     futures = bound.run() if isinstance(params, list) else bound.runSingle()
     self.handle_future(futures, elements)
     self.complete()
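Note: handle_future is not shown in this snippet. A minimal sketch of what such a helper might do (name and behavior assumed, not taken from the original) is:

 def handle_future(self, futures, elements=None):
     ''' Sketch only: check each PigStats result.
         futures is a list of PigStats when bound.run() was used with a list of
         parameter dicts, or a single PigStats from runSingle(); the elements
         argument from the original snippet is ignored here. '''
     stats_list = futures if isinstance(futures, list) else [futures]
     for stats in stats_list:
         if not stats.isSuccessful():
             raise Exception('Pig job failed: %s' % stats.getErrorMessage())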
Example #2
def runbidi(src, fdest):
	P = Pig.compileFromFile('src/main/pig/bidi.pig')

	cntsbase = 'counts'
	Pig.fs('rmr ' + cntsbase)

	for count in range(10):
		dest = fdest + 'gm%04d' % count
		Pig.fs('rmr ' + dest)
		cnts = cntsbase
		params = {'src':src, 'dest':dest, 'cnts':cnts}
		bound = P.bind(params)
		job = bound.runSingle()

		if not job.isSuccessful():
			raise Exception('failed')

		src = dest

		iter = job.result('S').iterator()
		if iter.hasNext():
			Pig.fs('rmr ' + cnts)
		else:
			Pig.fs('mv ' + dest + ' ' + fdest)
			print 'ALL DONE!'
			break
Example #3
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    P = Pig.compileFromFile("../pigscripts/#{script_name}.pig")
    bound = P.bind()
    bound.runSingle()
Example #4
def main(argv=None):
#Ideally I want to use arguments, ie 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily'
#however it just doesn't work, I'm not sure why the code has been applied in my version, and I can get it to
#work with a test .py that only has two lines, import sys, and print sys.argv. Here is the case
#https://issues.apache.org/jira/browse/PIG-2548
#    if argv is None:
#        argv = sys.argv
#    if len(argv) != 3:
#        print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
#        return 1
#
#    profile_file = argv[1]
#    timeframe = argv[2]
    
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']

    if not (timeframe == 'daily' or timeframe == 'weekly' or timeframe == 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    #Load the config
    profile = {}
    execfile(profile_file, {'timeframe':timeframe}, profile)

    #Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    #Start pig processing
    pig_init()
    if timeframe == 'daily':
        #Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])
    #The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
Example #5
def import_logs(profile):
    """ Import all the log files for a given day and process them, putting each in a log dir.
        If the profile is a list there are multiple files, otherwise only a single one.
        The files are combined when running web_load.pig.
    """
    #Clean up any left over files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))
    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()
    #Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
Example #6
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    for i in range(10):
        print 'Run %s started!' % i
        P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")

        bound = P.bind({"ITERATION_NUM":i})

        ps = bound.runSingle()
        print 'Run %s done!' % i

        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print r

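        # r now holds the last tuple printed above; field 1 is assumed to be the
        # computed average, so stop iterating once it reaches the target value.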
        if int(r.get(1).toString()) >= 5:
            print 'Good enough! Quitting time!'
            break
Example #7
if __name__ == '__main__':
    from org.apache.pig.scripting import Pig
    import sys
    P = Pig.compileFromFile('/home/course/lian9478/task3.pig')
    params = {}
    for i in range(int(sys.argv[1])):
        if i == 0:
            out = '/home/course/lian9478/HW4-old_twitter_account_rank.csv'
        else:
            out = "out/pagerank_data_" + str(i + 1)
        params['doc_in'] = out
        params['doc_out'] = "out/pagerank_data_" + str(i + 2)
        bound = P.bind(params)
        bound.runSingle()
        #this is to do it one by one instead of parallel
        #so you can call this driver like this
        #pig -x local -embedded jython driver.py 20
Example #8
 def __init__(self, script_name, description, script_path, script_params, checkpoint_path):
     self.script_name = script_name
     self.description = description
     self.bound_script = Pig.compileFromFile(script_path).bind(script_params)
     self.flag_file_path = "%s/%s.success" % (checkpoint_path, os.path.splitext(script_name)[0])
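Only the constructor is shown above. A hypothetical companion run method for this checkpoint pattern (the method name, flag handling, and the touchz call are assumptions, not from the original) might look like:

 def run(self):
     ''' Sketch only: run the bound script once and record success with a flag file. '''
     stats = self.bound_script.runSingle()
     if not stats.isSuccessful():
         raise Exception("%s failed: %s" % (self.script_name, stats.getErrorMessage()))
     # hadoop fs -touchz creates an empty marker file at the checkpoint path
     Pig.fs("touchz %s" % self.flag_file_path)
     return stats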
Example #9
    with open('tenant.json') as tenantdata_file:
        tenantData = json.loads(tenantdata_file.read())

    globalVars.init(tenantCode, tenantData)

    callResult = call("mongo " + mongo1 + " " + mongo2 + " " + mongo3 +
                      " var param1='" + tenantCode + "' " +
                      ''' "  ../list_mongo_dataset_fields.js > dataset.json''',
                      shell=True)
    if callResult == 0:

        with open('dataset.json') as metadata_file:
            metadata = json.loads(metadata_file.read())

        syncJob = Pig.compileFromFile("""sync_solr2phoenix.pig""")

        for m in metadata:

            subtype = m['_id']['subtype']
            phoenixColumns = phoenixDynamicColumns = phoenixUpsertColumns = ''

            for field in m['_id']['fields']:

                name = field['fieldName'].strip()
                dataType = field['dataType']

                if subtype == 'binaryDataset' and (name == 'urlDownloadBinary'
                                                   or name == 'idBinary'):
                    continue
Example #10
    def iterate_until_convergence(script_path,
                                  iteration_dir,
                                  param_generator_func,
                                  metric_name,
                                  metric_type,
                                  metric_alias,
                                  metric_threshold,
                                  max_num_iterations):

        """
        Utility for running a pigscript which outputs data in the same schema as its input iteratively,
        with the output of the previous run being the input of the next run. Stops when the convergence
        metric falls below a threshold or when a maximum number of iterations has been reached.

        Example usage:

        iteration_result = IterationUtils.iterate_until_convergence(
                "../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
                iteration_dir,                        # temporary iteration outputs will be stored here
                iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
                "Sum of ordering-rank changes",       # name of the convergence metric
                int,                                  # Python type of the convergence metric
                "aggregate_rank_change",              # alias in the pigscript where the metric is stored
                convergence_threshold,                # stop when metric less than this
                max_num_iterations                    # or if this many iterations have been performed
        )

        iteration_result is a dict with the number of iterations run ("num_iterations")
        and the reason for stopping ("stop_reason": "CONVERGED" or "MAX_ITERATIONS").

        Example iteration_param_func:

        def iteration_param_func(it_num, it_dir):
            if it_num == 1:
                iteration_input = preprocess_dir + "/pageranks"
            else:
                iteration_input = it_dir + "/" + str(it_num-1) + "/pageranks"

            return {
                "INPUT_PATH"                  : iteration_input,
                "DAMPING_FACTOR"              : damping_factor,
                "NUM_NODES"                   : num_nodes,
                "PAGERANKS_OUTPUT_PATH"       : it_dir + "/" + str(it_num) + "/pageranks",
                "AGG_RANK_CHANGE_OUTPUT_PATH" : it_dir + "/" + str(it_num) + "/rank_changes"
            }
        """

        script = Pig.compileFromFile(script_path)
        for i in range(1, max_num_iterations+1):
            print "Starting iteration step: %d" % i

            iteration    = script.bind(param_generator_func(i, iteration_dir)).runSingle()
            metric_value = metric_type(str(iteration.result(metric_alias).iterator().next().get(0)))

            if metric_value < metric_threshold:
                print "%s %s under convergence threshold %s. Stopping." \
                       % (metric_name, str(metric_value), str(metric_threshold))
                return { "num_iterations": i, "stop_reason": "CONVERGED" }
            elif i == max_num_iterations:
                print "%s %s above convergence threshold %s but hit max number of iterations. Stopping." \
                       % (metric_name, str(metric_value), str(metric_threshold))
                return { "num_iterations": i, "stop_reason": "MAX_ITERATIONS" }
            else:
                print "%s %s above convergence threshold %s. Continuing." \
                       % (metric_name, str(metric_value), str(metric_threshold))
Example #11
if __name__ == "__main__":
    params        = Pig.getParameters()
    graph         = params["GRAPH"]
    seed_vertices = params["SEED_VERTICES"]
    tmp_dir       = params["TMP_DIR"]
    output_path   = params["OUTPUT_PATH"]
    nhood_size    = int(params["NEIGHBORHOOD_SIZE"])

    preprocess_graph        = "%s/preprocess/graph"        % tmp_dir
    preprocess_num_vertices = "%s/preprocess/num_vertices" % tmp_dir
    iteration_verts_prefix  = "%s/iteration/vertices_"     % tmp_dir

    print "Graph Sampler: starting preprocessing step."
    preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({
        "GRAPH_INPUT_PATH"         : graph,
        "GRAPH_OUTPUT_PATH"        : preprocess_graph,
        "NUM_VERTICES_OUTPUT_PATH" : preprocess_num_vertices
    }).runSingle()

    iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig")
    num_iterations   = nhood_size - 1
    num_vertices     = long(str(preprocessing.result("num_vertices").iterator().next().get(0)))

    print "Graph Sampler: scheduling %d iterations" % num_iterations
    for i in range(num_iterations):
        print "Graph Sampler: starting iteration step %d" % (i+1)
        iteration = iteration_script.bind({
            "VERTICES_INPUT_PATH"  : seed_vertices if i == 0 else (iteration_verts_prefix + str(i-1)),
            "GRAPH_INPUT_PATH"     : preprocess_graph,
            "VERTICES_OUTPUT_PATH" : iteration_verts_prefix + str(i)
        }).runSingle()
Example #12
import sys
from org.apache.pig.scripting import Pig

load = Pig.compileFromFile(sys.argv[1])
iteration = Pig.compileFromFile('iteration.pig')
store = Pig.compileFromFile('store.pig')
 
print '*** Loading input ***' 
load_stats = load.bind({'EDGES_OUT': 'edges0.tmp'}).runSingle()
if not load_stats.isSuccessful():
    raise Exception('Load failed')

i = 1
stable_iterations = 0
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''

while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({'EDGES_IN': edges_in, 'EDGES_OUT': edges_out, 
        'CONVERGENCE_OUT': 'convergence.tmp'})
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))
    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
Example #13
def run_pagerank():
    """
    Calculates pageranks for Twitter users.

    Three main steps:
        1. Preprocessing: Process input data to:
             a) Count the total number of users.
             b) Prepare initial pagerank values for all users.
        2. Iterative: Calculate new pageranks for each user based on the previous pageranks of the
                      users' followers.
        3. Postprocessing: Find the top pagerank users and join to a separate dataset to find their names.
    """
    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(PREPROCESS_SCRIPT)
    preprocess_bound = preprocess.bind({
        "INPUT_PATH": FOLLOWER_GRAPH_INPUT,
        "PAGERANKS_OUTPUT_PATH": PREPROCESS_PAGERANKS,
        "NUM_USERS_OUTPUT_PATH": PREPROCESS_NUM_USERS
    })
    preprocess_stats = preprocess_bound.runSingle()
    num_users = int(str(preprocess_stats.result("num_users").iterator().next().get(0)))
    convergence_threshold = CONVERGENCE_THRESHOLD / num_users


    # Iteration step:
    iteration = Pig.compileFromFile(PAGERANK_ITERATE_SCRIPT)
    for i in range(MAX_NUM_ITERATIONS):
        print "Starting iteration step: %s" % str(i + 1)

        # Append the iteration number to the input/output stems
        iteration_input = PREPROCESS_PAGERANKS if i == 0 else (ITERATION_PAGERANKS_PREFIX + str(i-1))
        iteration_pageranks_output = ITERATION_PAGERANKS_PREFIX + str(i)
        iteration_max_diff_output = ITERATION_MAX_DIFF_PREFIX + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": DAMPING_FACTOR,
            "NUM_USERS": num_users,
            "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
            "MAX_DIFF_OUTPUT_PATH": iteration_max_diff_output
        })
        iteration_stats = iteration_bound.runSingle()

        # If we're below the convergence_threshold break out of the loop.
        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            print "Max diff %s under convergence threshold. Stopping." % max_diff
            break
        elif i == MAX_NUM_ITERATIONS-1:
            print "Max diff %s above convergence threshold but hit max number of iterations.  Stopping." \
                    % max_diff
        else:
            print "Max diff %s above convergence threshold. Continuing." % max_diff

    iteration_pagerank_result = ITERATION_PAGERANKS_PREFIX + str(i)

    # Postprocessing step:
    print "Starting postprocessing step."
    postprocess = Pig.compileFromFile(POSTPROCESS_SCRIPT)
    postprocess_bound = postprocess.bind({
        "PAGERANKS_INPUT_PATH": iteration_pagerank_result,
        "USERNAMES_INPUT_PATH": USERNAMES_INPUT,
        "TOP_N": NUM_TOP_USERS,
        "OUTPUT_BUCKET": OUTPUT_BUCKET
    })
    postprocess_stats = postprocess_bound.runSingle()
Example #14
if __name__=='__main__':
   from org.apache.pig.scripting import Pig
   import sys
   P = Pig.compileFromFile('/home/course/lian9478/task3.pig')
   params = {}
   for i in range(int(sys.argv[1])):
       if i == 0:
          out = '/home/course/lian9478/HW4-old_twitter_account_rank.csv'
       else:
          out = "out/pagerank_data_" + str(i + 1)
       params['doc_in'] = out
       params['doc_out'] = "out/pagerank_data_" + str(i + 2)
       bound = P.bind(params)
       bound.runSingle()  # this is to do it one by one instead of parallel
       #so you can call this driver like this
       #pig -x local -embedded jython driver.py 20
Example #15
#! /usr/bin/python
# Passing a Pig script to Python and running it

from org.apache.pig.scripting import Pig

P = Pig.compileFromFile("""myscript.pig""")

input = "original"
output = "output"

result = P.bind({'in':input, 'out':output}).runSingle()
if result.isSuccessful():
	print "Pig job succeeded"
else:
	raise Exception("Pig job failed")
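For reference, the 'in' and 'out' keys passed to bind() fill the $in and $out parameters inside myscript.pig. A self-contained equivalent using an inline script (the LOAD/STORE statements below are made up for illustration) would be:

from org.apache.pig.scripting import Pig

# inline stand-in for myscript.pig; $in and $out are substituted by bind()
P = Pig.compile("""
data = LOAD '$in' USING PigStorage();
STORE data INTO '$out' USING PigStorage();
""")
result = P.bind({'in': 'original', 'out': 'output'}).runSingle()
if not result.isSuccessful():
    raise Exception("Pig job failed")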
Example #16
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # Specify where the data will come from,
    # and where output data will go after each step

    data_stem = "s3n://jpacker-dev/amazon_products/books_graph/"
    num_vertices_input = data_stem + "num_vertices"
    nodes_input = data_stem + "nodes"
    edges_input = data_stem + "edges"

    output_stem = data_stem + "clustering/"
    preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
    preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
    iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
    postprocess_clusters_output = output_stem + "postprocess/clusters"
    postprocess_stats_output = output_stem + "postprocess/stats"

    """
    data_stem = "../fake-fixtures/"
    num_vertices_input = data_stem + "cathedral-num-vertices"
    nodes_input = data_stem + "cathedral-nodes"
    edges_input = data_stem + "cathedral-edges"

    output_stem = data_stem + "cathedral_clustering/"
    preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
    preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
    iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
    postprocess_clusters_output = output_stem + "postprocess/clusters"
    postprocess_stats_output = output_stem + "postprocess/stats"
    """

    # Preprocessing step:
    #
    # (1) Generate a transition matrix from the internal edges
    # (2) Copy precomputed count of # vertices
    #     No computation is being done here; this just lets us use Pig to access the data
    #     instead of configuring S3 access manually with boto
    #
    preprocess = Pig.compileFromFile("../pigscripts/clustering_preprocess.pig")
    preprocess_bound = preprocess.bind({ 
        "NUM_VERTICES_INPUT_PATH": num_vertices_input, 
        "EDGES_INPUT_PATH": edges_input, 
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
        "TRANS_MAT_OUTPUT_PATH": preprocess_trans_mat_output
    })
    preprocess_stats = preprocess_bound.runSingle()

    # Extract the number of vertices, which we will pass into each iteration as a parameter
    num_vertices = long(str(preprocess_stats.result("num_verts").iterator().next().get(0)))
    
    # Extract the number of edges (including inserted self-loops)
    # We will use this in our convergence metric
    initial_num_edges = long(str(preprocess_stats.getNumberRecords(preprocess_trans_mat_output)))
    
    # Iteration step applying the Markov Clustering operations:
    #
    # (1) Expansion: square the transition matrix ~= take a step in a random walk
    # (2) Inflation: take an elementwise power of the matrix ~= strengthen strong connections, weaken weak ones
    # (3) Pruning: set small matrix values to zero (since the matrix impl is sparse, this greatly speeds things up)
    # (4) Normalization: renormalize the matrix columnwise to keep it a valid transition matrix
    #
    # I tested several mathematically sensible convergence metrics 
    # (max of max residual for each col, avg of max residual for each col, col kurtosis)
    # but none worked very well. So I'm currently just breaking when the number of edges
    # in an iteration's transition matrix is less than the number of edges in 
    # the initial transition matrix times a constant multiple, which seems to indicate
    # that things are settling down.
    #
    # The algorithm has two parameters:
    # (1) The inflation parameter is an exponential factor which determines the cluster size. higher inflation => smaller clusters
    # (2) Epsilon is a minimum threshold for values in the transition matrix; anything smaller will be pruned (set to zero)
    #     I am not sure how high epsilon can safely be set without significantly degrading the quality of the algorithm
    #     If you run into performance problems though, raising epsilon will dramatically reduce execution time
    #
    iteration = Pig.compileFromFile("../pigscripts/clustering_iterate.pig")
    max_num_iterations = 7  # most graphs should converge after 4-10 iterations
    num_iterations = 0

    for i in range(1, max_num_iterations + 1):
        iteration_input = preprocess_trans_mat_output if i == 1 else (iteration_trans_mat_output_stem + str(i-1))
        iteration_output = iteration_trans_mat_output_stem + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "ITERATION_OUTPUT_PATH": iteration_output,
            "NUM_VERTICES": num_vertices, 
            "INFLATION_PARAMETER": 1.5,
            "EPSILON": 0.01
        })
        iteration_stats = iteration_bound.runSingle()

        num_iterations += 1
        num_edges = long(str(iteration_stats.getNumberRecords(iteration_output)))
        if num_iterations >= 3 and num_edges < (initial_num_edges * 1.05):
            break

    # Postprocessing step:
    #
    # Interpret the transition matrix outputted by the iterations to find clusters.
    # Each row represents a cluster: the column id's of its non-zero elements are its constituents.
    #
    # There will be many duplicate clusters (N rows for a cluster of N elements),
    # so we filter those out. We also filter out very small clusters.
    #
    mcl_result_path = iteration_trans_mat_output_stem + str(num_iterations)
    postprocess = Pig.compileFromFile("../pigscripts/clustering_postprocess.pig")
    postprocess_bound = postprocess.bind({
        "NODES_INPUT_PATH": nodes_input,
        "MCL_RESULT_PATH": mcl_result_path, 
        "CLUSTERS_OUTPUT_PATH": postprocess_clusters_output,
        "STATS_OUTPUT_PATH": postprocess_stats_output,
        "MIN_ACCEPTABLE_CLUSTER_SIZE": 3
    })
    postprocess_stats = postprocess_bound.runSingle()
Example #17
 def run(self):
     print project_name + ": " + self.action
     compiled = Pig.compileFromFile(self.script)
     bound = compiled.bind(self.params)
     return bound.runSingle()
Example #18
w0_schema_file = open("%s/.pig_schema" % path, 'w')
ObjectMapper().writeValue(w0_schema_file, w0_schema)
w0_schema_file.close()

#
# Copy initial weights to fs
#
copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
Pig.fs(copyFromLocal)


#
# Iterate until converged
#
features     = "%s/%s" % (data_dir,features)
script       = Pig.compileFromFile(pig_script)
weight_queue = Queue.Queue(25) # for moving average
avg_weight   = [0.0 for i in xrange(int(num_features))]
converged    = False
prev         = 0
weight_dir   = tempfile.mkdtemp()

while not converged:
    input_weights  = "%s/weight-%s" % (data_dir,prev)
    output_weights = "%s/weight-%s" % (data_dir,prev+1)

    bound = script.bind({'input_weights':input_weights,'output_weights':output_weights,'data':features})
    bound.runSingle()
    
    
    #
Example #19
    with open('tenant.json') as tenantdata_file:
        tenantData = json.loads(tenantdata_file.read())

    globalVars.init(tenantCode, tenantData)

    callResult = call("mongo " + mongo1 + " " + mongo2 + " " + mongo3 +
                      " var param1='" + tenantCode + "' " +
                      ''' "  ../list_mongo_dataset_fields.js > dataset.json''',
                      shell=True)
    if callResult == 0:

        with open('dataset.json') as metadata_file:
            metadata = json.loads(metadata_file.read())

        importJob = Pig.compileFromFile("""copy_mongo2phoenix_solr.pig""")

        for m in metadata:

            subtype = m['_id']['subtype']
            dynamicMongoFields = ''
            dynamicPhoenixColumns = ''

            for field in m['_id']['fields']:

                name = field['fieldName'].strip()
                dataType = field['dataType'].strip()

                if subtype == 'binaryDataset' and (name == 'urlDownloadBinary'
                                                   or name == 'idBinary'):
                    continue
Example #20
    def run_pagerank(edges_input,
                     output_path,
                     tmp_output_dir,
                     damping_factor=0.85,
                     convergence_threshold=0.0001,
                     max_num_iterations=10,
                     id_name_map=None,
                     preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                     iteration_script="../pigscripts/pagerank_iterate.pig"
                    ):

        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Order nodes by pagerank
                               Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
                               to get human-readable names
        """

        preprocess_dir = "%s/preprocess" % tmp_output_dir
        iteration_dir  = "%s/iteration"  % tmp_output_dir

        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile(preprocessing_script).bind({
            "INPUT_PATH"            : edges_input,
            "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
            "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
        }).runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes             = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        def iteration_param_func(it_num, it_dir):
            if it_num == 1:
                iteration_input = "%s/pageranks" % preprocess_dir
            else:
                iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

            return {
                "INPUT_PATH"                  : iteration_input,
                "DAMPING_FACTOR"              : damping_factor,
                "NUM_NODES"                   : num_nodes,
                "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks"    % (it_dir, it_num),
                "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
            }

        iteration_result = IterationUtils.iterate_until_convergence(
            iteration_script,                     # the pigscript to iterate
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations                    # or if this many iterations have been performed
        )

        # Postprocesing step:
        print "Starting postprocessing step."

        postprocess_script = """
            pageranks   =   LOAD '$PAGERANKS_INPUT_PATH'   USING PigStorage() AS (id: int, pagerank: double);
            pageranks   =   FILTER pageranks BY pagerank IS NOT NULL;
        """

        if id_name_map:
            postprocess_script += """
                id_name_map =   LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
                with_names  =   FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
                ordered     =   ORDER with_names BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "ID_NAME_MAP_INPUT_PATH" : id_name_map,
                "OUTPUT_PATH"            : output_path
            }).runSingle()
        else:
            postprocess_script += """
                ordered     =   ORDER pageranks BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "OUTPUT_PATH"            : output_path
            }).runSingle()

        Pig.fs("rmr %s" % preprocess_dir)
        Pig.fs("rmr %s" % iteration_dir)
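A hypothetical call to run_pagerank (every path here is invented for illustration, and the function is assumed to be callable directly) might be:

run_pagerank("s3n://my-bucket/graph/edges",
             "s3n://my-bucket/graph/pageranks",
             "s3n://my-bucket/graph/tmp",
             id_name_map="s3n://my-bucket/graph/id_name_map",
             max_num_iterations=20)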
Example #21
              notEmptyDatasets + ''''"  list_dataset_conv.js > ''' + outDir +
              '''/lista_dataset.''' + str(pid) + ".json")

        callResult = call("mongo " + mongoConnectString + " " +
                          ''' --eval "''' + " var param1='" + tenantCode +
                          "';var param2='" + notEmptyDatasets +
                          ''''"  list_dataset_conv.js > ''' + outDir +
                          '''/lista_dataset.''' + str(pid) + ".json",
                          shell=True)
        if callResult == 0:

            with open(outDir + "/lista_dataset." + str(pid) +
                      ".json") as metadata_file:
                metadata = json.loads(metadata_file.read())

            exportJob = Pig.compileFromFile("""export_single_dataset.pig""")

            for m in metadata:

                subtype = m['configData']['subtype']
                dynamicPhoenixColumns = phoenixColumns = metadataFields = csvHeader = metadataHeader = ''

                for field in m['info']['fields']:

                    name = field['fieldName'].strip()
                    dataType = field['dataType'].lower()

                    dynamicPhoenixColumns += '\\\"' + name + globalVars.dataTypeSuffixes[
                        dataType] + '\\\"\ ' + globalVars.dataType2Phoenix[
                            dataType] + ','
                    csvHeader += name + ','
Example #22
from org.apache.pig.scripting import Pig
import os

if __name__ == "__main__":
    params       = Pig.getParameters()
    loader       = params["LOADER"]
    input_source = params["INPUT_SRC"]
    output_path  = params["OUTPUT_PATH"]
    infer_types  = params["INFER_TYPES"]

    Pig.compileFromFile("../pigscripts/characterize.pig").bind({
        "LOADER"      : loader,
        "INPUT_SRC"   : input_source,
        "OUTPUT_PATH" : output_path,
        "INFER_TYPES" : infer_types
    }).runSingle()

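    # Give each output part file a .csv twin (skipping hidden files such as .pig_schema),
    # presumably so the results can be opened directly as CSV.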
    for root, _, files in os.walk("../%s" % output_path):
        for f in files:
            if f[0] != '.':
                fullpath = os.path.join(root, f)
                copypath = os.path.join(root, f + '.csv')
                os.system ("cp %s %s" % (fullpath, copypath))
Example #23
    def run_pagerank(self):
        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names.
        """
        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile(self.preprocessing_script)
        preprocess_params = {
            "INPUT_PATH": self.edges_input,
            "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks,
            "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes
        }
        preprocess_bound = preprocess.bind(preprocess_params)
        preprocess_stats = preprocess_bound.runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes = long(
            str(preprocess_stats.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(self.convergence_threshold * num_nodes *
                                     num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (
            num_nodes, convergence_threshold)

        # Iteration step:
        iteration = Pig.compileFromFile(self.iteration_script)
        for i in range(self.max_num_iterations):
            print "Starting iteration step: %s" % str(i + 1)

            # Append the iteration number to the input/output stems
            iteration_input = self.preprocess_pageranks if i == 0 else (
                self.iteration_pageranks_prefix + str(i - 1))
            iteration_pageranks_output = self.iteration_pageranks_prefix + str(
                i)
            iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(
                i)

            iteration_bound = iteration.bind({
                "INPUT_PATH": iteration_input,
                "DAMPING_FACTOR": self.damping_factor,
                "NUM_NODES": num_nodes,
                "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
                "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output
            })
            iteration_stats = iteration_bound.runSingle()

            # If we're below the convergence threshold break out of the loop.
            aggregate_rank_change = long(
                str(
                    iteration_stats.result(
                        "aggregate_rank_change").iterator().next().get(0)))
            if aggregate_rank_change < convergence_threshold:
                print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \
                       % (aggregate_rank_change, convergence_threshold)
                break
            elif i == self.max_num_iterations - 1:
                print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \
                      ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \
                       "Stopping."
            else:
                print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \
                       % (aggregate_rank_change, convergence_threshold)

        iteration_pagerank_result = self.iteration_pageranks_prefix + str(i)

        # Postprocessing step:
        print "Starting postprocessing step."
        postprocess = Pig.compileFromFile(self.postprocessing_script)
        postprocess_params = {
            "PAGERANKS_INPUT_PATH": iteration_pagerank_result
        }
        if self.output_path is not None:  # otherwise, the script outputs to the default location,
            # which is a special directory in s3://mortar-example-output-data
            # permissioned for your Mortar account.
            postprocess_params["OUTPUT_PATH"] = self.output_path
        postprocess_bound = postprocess.bind(postprocess_params)
        postprocess_stats = postprocess_bound.runSingle()
Example #24
import sys
from org.apache.pig.scripting import Pig

load = Pig.compileFromFile(sys.argv[1])
iteration = Pig.compileFromFile('iteration.pig')
store = Pig.compileFromFile('store.pig')

print '*** Loading input ***'
load_stats = load.bind({'EDGES_OUT': 'edges0.tmp'}).runSingle()
if not load_stats.isSuccessful():
    raise Exception('Load failed')

i = 1
stable_iterations = 0
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''

while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({
        'EDGES_IN': edges_in,
        'EDGES_OUT': edges_out,
        'CONVERGENCE_OUT': 'convergence.tmp'
    })
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
Example #25
        tenantData = json.loads(tenantdata_file.read())

    globalVars.init(tenantCode, tenantData)
    #    newLastId = 0

    # read from metadata source (mongoDB) all datasets with
    # availableSpeed = true
    # availableHive = true (getting also dbHiveSchema and dbHiveTable)
    callResult = call("mongo " + mongoConn + mongoParam +
                      " ../list_mongo_hive_dataset.js > dataset." + str(pid) +
                      ".json",
                      shell=True)
    if callResult == 0:

        print "Dataset list read"
        copyHive2MongoJob = Pig.compileFromFile("""copy_hive2phoenix.pig""")

        with open('dataset.' + str(pid) + '.json') as metadata_file:
            metadata = json.loads(metadata_file.read())

        for m in metadata:

            subtype = m['configData']['subtype']
            fields = m['info']['fields']
            idDataset = str(m['idDataset'])
            datasetVersion = str(m['datasetVersion'])
            dynamicPhoenixColumns = ''
            aliasString = ('bda_id\ as\ bda_id:chararray,\ ' + idDataset +
                           '\ as\ idDataset:int,\ ' + datasetVersion +
                           '\ as\ datasetVersion:int,\ ')
Example #26
    System.exit(0)

if fs.exists(parsedDir):

    # parsed-captures
    if (not fs.exists(parsedCaptures)
            or fs.getFileStatus(parsedDir).getModificationTime() >
            fs.getFileStatus(parsedCaptures).getModificationTime()):
        print 'LOG: Graph parsed-captures create'
        fs.delete(parsedCaptures, True)
        params = {
            'INPUT': str(parsedDir),
            'OUTPUT': str(parsedCaptures),
            'JOBNAME': str(collection) + ' parsed-captures'
        }
        job = Pig.compileFromFile('pig/parsed-captures.pig').bind(params)
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if (not fs.exists(linkGraph)
            or fs.getFileStatus(parsedDir).getModificationTime() >
            fs.getFileStatus(linkGraph).getModificationTime()):
        print 'LOG: Graph link-graph create'
        fs.delete(linkGraph, True)
        params = {
            'INPUT': str(parsedDir),
Example #27
    print 'LOG: Elapsed %f' % (endTime - startTime)
    # Remove the guardFile
    fs.delete( guardFile, True )
    System.exit(0)

if fs.exists( parsedDir ):

    # parsed-captures
    if ( not fs.exists( parsedCaptures) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( parsedCaptures ).getModificationTime() ):
        print 'LOG: Graph parsed-captures create'
        fs.delete( parsedCaptures, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(parsedCaptures),
                   'JOBNAME': str(collection) + ' parsed-captures' }
        job = Pig.compileFromFile( 'pig/parsed-captures.pig' ).bind( params )
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if ( not fs.exists( linkGraph ) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( linkGraph ).getModificationTime() ):
        print 'LOG: Graph link-graph create'
        fs.delete( linkGraph, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(linkGraph),
                   'JOBNAME': str(collection) + ' link-graph' }
Example #28
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    nodes_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-nodes"
    edges_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-edges"

    preprocess_vector_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/vector"
    preprocess_matrix_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/matrix"
    preprocess_num_vertices_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/num_vertices"
    iteration_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/iteration_"
    max_diff_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/max_diff_"
    postprocess_pageranks_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/pageranks"

    damping_factor = 0.85

    preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig")
    preprocess_bound = preprocess.bind({ 
        "EDGES_INPUT_PATH": edges_input,
        "VECTOR_OUTPUT_PATH": preprocess_vector_output,
        "MATRIX_OUTPUT_PATH": preprocess_matrix_output,
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
        "DAMPING_FACTOR": damping_factor
    })
    preprocess_stats = preprocess_bound.runSingle()

    num_vertices = int(str(preprocess_stats.result("num_vertices_copy").iterator().next().get(0)))

    iteration = Pig.compileFromFile("../pigscripts/pagerank_iterate.pig")
    max_num_iterations = 7
    num_iterations = 0
    convergence_threshold = 0.15 / float(num_vertices)

    for i in range(1, max_num_iterations + 1):
        iteration_vector_input = preprocess_vector_output if i == 1 else (iteration_output_stem + str(i-1))
        iteration_matrix_input = preprocess_matrix_output

        iteration_output = iteration_output_stem + str(i)
        max_diff_output = max_diff_output_stem + str(i)

        iteration_bound = iteration.bind({
            "VECTOR_INPUT_PATH": iteration_vector_input,
            "MATRIX_INPUT_PATH": iteration_matrix_input,
            "ITERATION_OUTPUT_PATH": iteration_output,
            "MAX_DIFF_OUTPUT_PATH": max_diff_output,
            "NUM_VERTICES": num_vertices,
            "DAMPING_FACTOR": damping_factor
        })
        iteration_stats = iteration_bound.runSingle()

        num_iterations += 1
        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            break

    result_vector = iteration_output_stem + str(num_iterations)

    postprocess = Pig.compileFromFile("../pigscripts/pagerank_postprocess.pig")
    postprocess_bound = postprocess.bind({
        "NODES_INPUT_PATH": nodes_input,
        "RESULT_VECTOR": result_vector,
        "OUTPUT_PATH": postprocess_pageranks_output
    })
    postprocess_bound.runSingle()
Example #29
#!/usr/bin/python
import sys
from org.apache.pig.scripting import Pig
from bidipig import runbidi

# make minhash clusters
minhash = Pig.compileFromFile('src/main/pig/minhash.pig')

osrc = src = sys.argv[1]
destminhash = sys.argv[2] + '-minhash'
dest = sys.argv[2] + '-jaccard'
minjaccard = 80

bound = minhash.bind()

job = bound.runSingle()

if not job.isSuccessful():
	raise Exception('failed in minhash')
# output is pairs and scores

# make transitive closure of clusters
src = dest
dest = sys.argv[2] + '-bidi'
runbidi(src, dest)

# join with original data
join = Pig.compileFromFile('src/main/pig/join.pig')

src = osrc
keys = dest
Example #30
#!/usr/bin/python
import time
import sys 
from org.apache.pig.scripting import Pig

if __name__ == '__main__':
    P = Pig.compileFromFile("""calvisit.pig""")

    defaulttime = time.time()
    deadlinesec = defaulttime - 1800
    deadline = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(deadlinesec))
    
    if len(sys.argv) > 1:
        deadline = sys.argv[1]

    Q = P.bind({'deadline':deadline, 'input':'input', 'result':'result', 'inputtmp':'inputtmp'})
    results = Q.runSingle()
    if not results.isSuccessful():
       raise Exception("Pig job failed")
    else:
       print results
Example #31
def runPigScript(pigScript,params):
    P = Pig.compileFromFile(pigScript)
    bound = P.bind(params)
    stat = bound.runSingle()
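The helper above captures the PigStats object but never inspects it. A slightly safer variant (a sketch, not part of the original; the name runPigScriptChecked is made up) would return it and fail fast on errors:

def runPigScriptChecked(pigScript, params):
    P = Pig.compileFromFile(pigScript)
    stat = P.bind(params).runSingle()
    if not stat.isSuccessful():
        raise Exception("%s failed: %s" % (pigScript, stat.getErrorMessage()))
    return stat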
Example #32
    def run_pagerank(self):
        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names.
        """
        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile(self.preprocessing_script)
        preprocess_params = {
            "INPUT_PATH": self.edges_input,
            "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks,
            "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes
        }
        preprocess_bound = preprocess.bind(preprocess_params)
        preprocess_stats = preprocess_bound.runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes = long(str(preprocess_stats.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(self.convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        iteration = Pig.compileFromFile(self.iteration_script)
        for i in range(self.max_num_iterations):
            print "Starting iteration step: %s" % str(i + 1)

            # Append the iteration number to the input/output stems
            iteration_input = self.preprocess_pageranks if i == 0 else (self.iteration_pageranks_prefix + str(i-1))
            iteration_pageranks_output = self.iteration_pageranks_prefix + str(i)
            iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(i)

            iteration_bound = iteration.bind({
                "INPUT_PATH": iteration_input,
                "DAMPING_FACTOR": self.damping_factor,
                "NUM_NODES": num_nodes,
                "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
                "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output
            })
            iteration_stats = iteration_bound.runSingle()

            # If we're below the convergence threshold break out of the loop.
            aggregate_rank_change = long(str(iteration_stats.result("aggregate_rank_change").iterator().next().get(0)))
            if aggregate_rank_change < convergence_threshold:
                print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \
                       % (aggregate_rank_change, convergence_threshold)
                break
            elif i == self.max_num_iterations-1:
                print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \
                      ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \
                       "Stopping."
            else:
                print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \
                       % (aggregate_rank_change, convergence_threshold)

        iteration_pagerank_result = self.iteration_pageranks_prefix + str(i)

        # Postprocessing step:
        print "Starting postprocessing step."
        postprocess = Pig.compileFromFile(self.postprocessing_script)
        postprocess_params = { "PAGERANKS_INPUT_PATH": iteration_pagerank_result }
        if self.output_path is not None: # otherwise, the script outputs to the default location,
                                         # which is a special directory in s3://mortar-example-output-data
                                         # permissioned for your Mortar account.
            postprocess_params["OUTPUT_PATH"] = self.output_path
        postprocess_bound = postprocess.bind(postprocess_params)
        postprocess_stats = postprocess_bound.runSingle()
Example #33
}
w0_schema_file = open("%s/.pig_schema" % path, 'w')
ObjectMapper().writeValue(w0_schema_file, w0_schema)
w0_schema_file.close()

#
# Copy initial weights to fs
#
copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
Pig.fs(copyFromLocal)

#
# Iterate until converged
#
features = "%s/%s" % (data_dir, features)
script = Pig.compileFromFile(pig_script)
weight_queue = Queue.Queue(25)  # for moving average
avg_weight = [0.0 for i in xrange(int(num_features))]
converged = False
prev = 0
weight_dir = tempfile.mkdtemp()

while not converged:
    input_weights = "%s/weight-%s" % (data_dir, prev)
    output_weights = "%s/weight-%s" % (data_dir, prev + 1)

    bound = script.bind({
        'input_weights': input_weights,
        'output_weights': output_weights,
        'data': features
    })
Example #34
    'mongoPort') + "/DB_SUPPORT"
mongo2 = " -u " + props.getProperty('mongoUsr')
mongo3 = " -p " + props.getProperty(
    'mongoPwd') + ''' --authenticationDatabase admin  --quiet --eval "'''

# var param1=438; var param2=1; var param3='datalake'" delete_dataset.js

Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar")
Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar")
Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar")
Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
#Pig.registerJar("../lib/yucca-phoenix-pig.jar")

if mode in ["APPEND", "append"]:
    # read from metadata source (mongoDB) lastIdDatalake2Speed for tenant
    readLastIdJob = Pig.compileFromFile(
        """read_mongo_lastIdDatalake2Speed.pig""")
    results = readLastIdJob.bind({'tenantCode': tenantCode}).runSingle()
    if results.isSuccessful():
        print "Pig job succeeded"
        iter = results.result("lastId").iterator()
        if iter.hasNext():
            lastId = iter.next()
            print "lastId: " + str(lastId)
    else:
        raise Exception("Pig job failed")

# read from metadata source (mongoDB) all datasets with
# availableSpeed = true
# availableHive = true (getting also dbHiveSchema and dbHiveTable)
readDatasetListJob = Pig.compileFromFile("""../read_mongo_dataset.pig""")
readDatasetParams = {