def count_items(input, output):
    """Sum all the items in the input data set"""
    # Intermediate file name
    inter = output + '_inter'
    # Run the task with specified mapper and reducer methods
    prince.run(count_mapper, count_reducer, input, inter,
               inputformat='text', outputformat='text', files=__file__)
    prince.run(sum_mapper, count_reducer, inter + '/part*', output,
               inputformat='text', outputformat='text', files=__file__)
    # Read the output file and return the total
    file = prince.dfs.read(output + '/part*', first=1)
    return int(file.split()[1])
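
The count_mapper, count_reducer and sum_mapper methods used by count_items are not part of this excerpt. Below is a minimal sketch of one possible implementation, assuming the same generator-based interface as the other examples (mappers take a key and a value, reducers a key and a generator of values, and both yield (key, value) tuples):

# Hypothetical sketch -- these methods are defined elsewhere in the program
def count_mapper(key, value):
    # Emit one occurrence per record, keyed by the record itself
    yield value, 1

def count_reducer(key, values):
    # Sum the partial counts gathered for each key
    try:
        yield key, sum(int(v) for v in values)
    except ValueError:
        pass  # discard non-numerical values

def sum_mapper(key, value):
    # Re-key every partial count under a single key, so that reusing
    # count_reducer in the second pass produces one global total
    yield 'total', value

Under this reading, the second pass produces a single line whose second field is the grand total, which is consistent with the int(file.split()[1]) returned at the end of count_items.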
part = '/part-00000'
options = {'damping': damping, 'precision': precision, 'nb_nodes': len(graph)}

# Create the initial values
pagerank_current = pagerank % iteration_start
if iteration_start == 1:
    pagerank_values = [(n, make_value(pr_init, pr_init, n_adjacent))
                       for n, n_adjacent in graph.items()]
    prince.dfs.write(pagerank_current + part, pagerank_values)
    iteration_start += 1

stop = False
iteration = iteration_start
while not stop and iteration < iteration_max:
    # Update file names
    pagerank_previous = pagerank_current
    pagerank_current = pagerank % iteration
    term_current = term % iteration
    # Compute the new PageRank values
    prince.run(pagerank_mapper, pagerank_reducer, pagerank_previous + suffix,
               pagerank_current, [], options, 'text', 'text')
    # Termination: check if all PageRank values are stable
    prince.run(term_mapper, term_reducer, pagerank_current + suffix,
               term_current, [], options, 'text', 'text')
    term_value = prince.dfs.read(term_current + suffix)
    stop = int(term_value.split()[1])
    # Get ready for the next iteration
    iteration += 1
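
The make_value, pagerank_mapper and pagerank_reducer methods are defined elsewhere. The sketch below only illustrates one possible implementation of a single PageRank step; the value layout ("<previous PR> <current PR> <adjacent nodes...>") and the hard-coded damping and nb_nodes constants are assumptions, the latter normally coming from the 'options' dictionary passed to prince.run:

# Hypothetical sketch of one PageRank iteration
def make_value(pr_previous, pr_current, adjacent):
    # Assumed layout of a node value
    return '%.10f %.10f %s' % (pr_previous, pr_current, ' '.join(adjacent))

def pagerank_mapper(key, value):
    fields = value.split()
    pr_current = float(fields[1])
    adjacent = fields[2:]
    # Forward the adjacency list; the current value becomes the 'previous'
    # value used by the termination check of the next iteration
    yield key, 'S %.10f %s' % (pr_current, ' '.join(adjacent))
    # Distribute the current PageRank evenly over the outgoing links
    if adjacent:
        share = pr_current / len(adjacent)
        for node in adjacent:
            yield node, 'P %.10f' % share

def pagerank_reducer(key, values):
    damping, nb_nodes = 0.85, 1000   # assumed; normally read from 'options'
    pr_previous, adjacent, total = 0.0, [], 0.0
    for v in values:
        fields = v.split()
        if fields[0] == 'S':
            pr_previous = float(fields[1])
            adjacent = fields[2:]
        else:
            total += float(fields[1])
    pr_new = (1.0 - damping) / nb_nodes + damping * total
    yield key, make_value(pr_previous, pr_new, adjacent)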
import sys
import prince
# Methods from wordcount are now in the local name space
from wordcount import *

def display_usage():
    print 'usage: %s input output' % sys.argv[0]
    print '  input: input file on the DFS'
    print '  output: output file on the DFS'

if __name__ == "__main__":
    # Always call prince.init() at the beginning of the program
    prince.init()

    if len(sys.argv) != 3:
        display_usage()
        sys.exit(0)
    input = sys.argv[1]
    output = sys.argv[2]

    # Run the task with the mapper and reducer methods from the wordcount.py file
    # Note that the file wordcount.py is added to the 'files' argument
    prince.run(wc_mapper, wc_reducer, input, output,
               inputformat='text', outputformat='text', files='wordcount.py')

    # Read the output file and print it
    file = prince.dfs.read(output + '/part*')
    print file
if __name__ == "__main__":
    # Always call prince.init() at the beginning of the program
    prince.init()

    if len(sys.argv) != 3:
        display_usage()
        sys.exit(0)
    input = sys.argv[1]
    output = sys.argv[2] + '%04d'
    sorted = sys.argv[2] + '_sorted'
    suffix = '/part*'

    # Create the initial buckets from the data
    prince.run(init_mapper, init_reducer, input, output % 0,
               inputformat='text', outputformat='text')

    stop = False
    iteration = 1
    while not stop:
        # Merge current buckets
        previous = output % (iteration - 1)
        current = output % iteration
        prince.run(merge_mapper, merge_reducer, previous + suffix, current,
                   inputformat='text', outputformat='text')
        # Check if sort is done
        state = prince.dfs.read(current + suffix, last=1)
        if int(state.split()[0]) == 0:
            stop = True
        iteration += 1
def wc_reducer(key, values):
    """Reducer method with 'key' a string and 'values' a generator of strings"""
    try:
        yield key, sum([int(v) for v in values])
    except ValueError:
        pass  # discard non-numerical values

def display_usage():
    print 'usage: ./%s input output' % sys.argv[0]
    print '  input: input file on the DFS'
    print '  output: output file on the DFS'

if __name__ == "__main__":
    # Always call prince.init() at the beginning of the program
    prince.init()

    if len(sys.argv) != 3:
        display_usage()
        sys.exit(0)
    input = sys.argv[1]
    output = sys.argv[2]

    # Run the task with specified mapper and reducer methods
    prince.run(wc_mapper, wc_reducer, input, output,
               inputformat='text', outputformat='text')

    # Read the output file and print it
    file = prince.dfs.read(output + '/part*')
    print file
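
The wc_mapper method invoked by prince.run is not shown in this excerpt. A minimal word-count mapper consistent with the wc_reducer above could look as follows (assuming, as for the reducer, that mappers are generators receiving a key and a value):

# Hypothetical sketch -- wc_mapper is defined elsewhere in the program
def wc_mapper(key, value):
    """Mapper method; 'value' is assumed to be one line of the input text"""
    for word in value.split():
        yield word, 1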
options = {'graph': filename_graph, 'source': source_node}

# Create the initial frontier with the tuple (source, 0)
frontier_current = frontier % iteration_start
if iteration_start == 1:
    prince.dfs.write(frontier_current + part,
                     (source_node, '%d %d' % (sys.maxint, 0)))
    iteration_start += 1

stop = False
iteration = iteration_start
while not stop and iteration < iteration_max:
    # Update file names
    frontier_previous = frontier_current
    frontier_current = frontier % iteration
    term_current = term % iteration
    # Compute the new frontier
    prince.run(frontier_mapper, frontier_reducer, frontier_previous + suffix,
               frontier_current, filename_graph, options, 'text', 'text')
    print prince.dfs.read(frontier_current + suffix)
    # Termination: check if all distances are stable
    prince.run(term_mapper, term_reducer, frontier_current + suffix,
               term_current, filename_graph, options, 'text', 'text')
    print prince.dfs.read(term_current + suffix)
    term_value = prince.dfs.read(term_current + suffix)
    stop = int(term_value.split()[1])
    # Get ready for the next iteration
    iteration += 1
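
The frontier_mapper and frontier_reducer methods are defined elsewhere. The sketch below only illustrates one possible breadth-first expansion step; the value layout ("<previous distance> <current distance>") and the load_adjacent() helper, which would return the neighbours of a node from the graph file shipped with the job, are assumptions introduced for illustration:

# Hypothetical sketch of one frontier-expansion iteration
import sys

def frontier_mapper(key, value):
    previous, current = [int(d) for d in value.split()]
    # Re-emit the node itself; its current distance becomes the 'previous'
    # distance checked by the termination job
    yield key, '%d %d' % (current, current)
    # Offer a tentative distance of current + 1 to every neighbour
    for node in load_adjacent(key):
        yield node, '%d %d' % (sys.maxint, current + 1)

def frontier_reducer(key, values):
    # Keep the shortest proposed distance; on ties prefer the entry whose
    # previous distance already equals it (i.e. the stable one)
    best_current, best_previous = sys.maxint, sys.maxint
    for v in values:
        previous, current = [int(d) for d in v.split()]
        if (current, previous) < (best_current, best_previous):
            best_current, best_previous = current, previous
    yield key, '%d %d' % (best_previous, best_current)

The termination pair (term_mapper and term_reducer) would then only have to check whether the two stored distances are equal for every node and emit a single flag, which the driver reads back through term_value.split()[1] above.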