def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path, hdfs_output, 'WordCount.py', files=['../stop_words.txt'])
def launch(in_name, out_name, script_path, hbase_in=True, hbase_out=False,
           columns=(), start_row=None, stop_row=None, single_value=None, **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row,
                 single_value, kw)
    hadoopy.launch(in_name, out_name, script_path, **kw)
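A minimal sketch of how such an HBase-aware wrapper might be called. The table name 'images', the column 'data:image', the row keys, and the job script 'feature_job.py' are invented placeholders, not taken from the original source.

# Hypothetical invocation: scan one column over a row range of an HBase
# table and forward the remaining options to hadoopy.launch via **kw.
launch('images', 'hdfs://namenode/output/features', 'feature_job.py',
       hbase_in=True,              # input comes from an HBase scan
       columns=['data:image'],     # column(s) to include in the scan
       start_row='row0000',        # scan start key
       stop_row='row9999',         # scan stop key
       num_reducers=4)             # passed through to hadoopy.launch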
def random_cluster(input_path, output_path, num_clusters, cluster_path, num_reducers):
    def inc_path():
        # Advance the global iteration counter and return the next output dir.
        global iter_cnt
        iter_cnt += 1
        return '%s/%d' % (output_path, iter_cnt)
    # Package the job script and its dependencies before launching.
    hadoopy.freeze(script_path='random_cluster.py', shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms'], remove_dir=True)
    hadoopy.launch(in_name=input_path, out_name=inc_path(),
                   cmdenvs=['NUM_CLUSTERS=%d' % num_clusters],
                   script_path='random_cluster.py',
                   #combiner=True,
                   frozen_path='frozen')
def canopy(input_path, output_path, num_clusters, cluster_path, num_reducers):
    def inc_path():
        # Advance the global iteration counter and return the next output dir.
        global iter_cnt
        iter_cnt += 1
        return '%s/%d' % (output_path, iter_cnt)

    def prev_path():
        # Output dir of the most recently launched job.
        return '%s/%d' % (output_path, iter_cnt)

    soft = str(4000.)  # soft (loose) canopy distance threshold
    hard = str(250.)   # hard (tight) canopy distance threshold
    # Stage 1: find canopy centers.
    hadoopy.freeze(script_path='canopy_cluster.py', shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c'], remove_dir=True)
    hadoopy.launch(in_name=input_path, out_name=inc_path(),
                   script_path='canopy_cluster.py',
                   files=['nn_l2sqr.py'],
                   cmdenvs=['NN_MODULE=nn_l2sqr_c',
                            'CANOPY_SOFT_DIST=%s' % soft,
                            'CANOPY_HARD_DIST=%s' % hard],
                   frozen_path='frozen')
    consolidate_clusters(prev_path(), 'canopies.pkl')
    # Stage 2: assign data points and initial clusters to canopies (map-only).
    hadoopy.freeze(script_path='canopy_cluster_assign.py', remove_dir=True)
    hadoopy.launch(in_name=input_path, out_name=inc_path(),
                   script_path='canopy_cluster_assign.py',
                   cmdenvs=['CANOPY_SOFT_DIST=%s' % soft,
                            'CANOPIES_PKL=canopies.pkl'],
                   files=['canopies.pkl'], reducer=None, frozen_path='frozen')
    input_path = prev_path()
    hadoopy.launch(in_name=cluster_path, out_name=inc_path(),
                   script_path='canopy_cluster_assign.py',
                   cmdenvs=['CANOPY_SOFT_DIST=%s' % soft,
                            'CANOPIES_PKL=canopies.pkl'],
                   files=['canopies.pkl'], reducer=None, frozen_path='frozen')
    consolidate_canopy_clusters(prev_path(), 'clusters.pkl')
    # Stage 3: canopy-accelerated k-means.
    hadoopy.freeze(script_path='kmeans_canopy_cluster.py', shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c'], remove_dir=True)
    hadoopy.launch(in_name=input_path, out_name=inc_path(),
                   script_path='kmeans_canopy_cluster.py',
                   cmdenvs=['CLUSTERS_PKL=clusters.pkl',
                            'CANOPY_SOFT_DIST=%s' % soft,
                            'NN_MODULE=nn_l2sqr_c'],
                   files=['nn_l2sqr_c.py', 'clusters.pkl'],
                   frozen_path='frozen')
def launch_map_update(nodes, job_id, redis_host, jobconfs=None):
    # Disable speculative execution and task timeouts: these long-running
    # streaming tasks must not be duplicated or killed mid-run.
    jobconfs_base = {'mapred.map.tasks.speculative.execution': 'false',
                     'mapred.reduce.tasks.speculative.execution': 'false',
                     'mapred.task.timeout': '0'}
    if jobconfs:
        jobconfs_base.update(jobconfs)
    cmdenvs = {'job_id': job_id, 'hadoopy_rt_redis': redis_host}
    with hadoopy_helper.hdfs_temp() as input_path:
        # Serialize each node's script (and support files) into the job input.
        for node in nodes:
            print(node)
            v = {'script_name': os.path.basename(node['script_path']),
                 'script_data': open(node['script_path']).read()}
            if 'cmdenvs' in node and node['cmdenvs'] is not None:
                v['cmdenvs'] = node['cmdenvs']
            if 'files' in node and node['files'] is not None:
                v['files'] = dict((os.path.basename(f), open(f).read())
                                  for f in node['files'])
            if 'outputs' in node and node['outputs']:
                v['outputs'] = node['outputs']
            hadoopy.writetb('%s/input/%d' % (input_path, node['name']),
                            [(node['name'], v)])
        hadoopy.launch(input_path + '/input', input_path + '/output_path_empty',
                       _lf('hadoopy_rt_job.py'), cmdenvs=cmdenvs,
                       jobconfs=jobconfs_base)
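A minimal, hypothetical call sketch for the function above. The node dicts below (script paths, cmdenvs, files) are invented for illustration; the loop only requires integer 'name' and 'script_path' keys, the rest are optional.

# Hypothetical worker definitions for launch_map_update.
nodes = [{'name': 0,
          'script_path': 'workers/ingest.py',
          'cmdenvs': {'BATCH_SIZE': '128'}},
         {'name': 1,
          'script_path': 'workers/aggregate.py',
          'files': ['workers/model.pkl']}]
launch_map_update(nodes, job_id='job42', redis_host='localhost')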
#!/usr/bin/env python
import hadoopy

input_path = "/alice.txt"
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.launch(input_path, output_path, 'WordCount.py')

word_counts = dict(hadoopy.readtb(output_path))
for word in word_counts:
    print "%s: %d" % (word, word_counts[word])
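The driver above launches a separate job script, WordCount.py, which is not shown. A minimal sketch of what such a script could look like using hadoopy's mapper/reducer protocol; the exact contents of the original WordCount.py are an assumption.

# Hypothetical WordCount.py: hadoopy streams (key, value) pairs through
# generator-style mapper and reducer functions, as in Hadoop Streaming.
import hadoopy

def mapper(key, value):
    # For text input: key is the byte offset, value is one line of text.
    for word in value.split():
        yield word, 1

def reducer(word, counts):
    # counts iterates over every value emitted for this word.
    yield word, sum(counts)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)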
if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s" % input_path)
os.system('hdfs dfs -cp ' + edge_path + ' ' + input_path)
if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.writetb(output_path, read_vector(r0))
if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s" % temp_path)

iteration = 0
while diff > 0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s" % temp_path)
    hadoopy.launch(input_path, temp_path, 'PageRank.py', files=[])
    # Previous rank vector r_k.
    rk = {}
    for url, r in hadoopy.readtb(output_path):
        rk[url] = r
    # Updated rank vector r_{k+1} from this iteration's output,
    # where each record's key is a (url, rank) tuple.
    rkpp = {}
    for score in hadoopy.readtb(temp_path + "/part-00000"):
        url = score[0][0]
        r = score[0][1]
        rkpp[url] = r
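PageRank.py itself is not shown. A rough sketch of one power-iteration step, under the assumption that each input record is (url, (rank, outlinks)) and that the damping factor is 0.85; both details are guesses, not taken from the original.

# Hypothetical PageRank.py: one power-iteration step.
import hadoopy

DAMPING = 0.85  # assumed damping factor

def mapper(url, value):
    rank, outlinks = value
    # Re-emit the graph structure so the reducer can preserve it.
    yield url, ('graph', outlinks)
    # Spread this page's rank evenly over its outlinks.
    if outlinks:
        share = rank / len(outlinks)
        for dest in outlinks:
            yield dest, ('rank', share)

def reducer(url, values):
    outlinks, incoming = [], 0.0
    for tag, v in values:
        if tag == 'graph':
            outlinks = v
        else:
            incoming += v
    yield url, ((1 - DAMPING) + DAMPING * incoming, outlinks)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)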
import hadoopy

hadoopy.launch('/data/MSCDirect', '/data/MSCDirectBrandMPN', 'brandMPN.py',
               remove_output=True)
from hadoopy import launch

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy'

launch(input_path, output_path, 'ngrams.py',
       use_seqoutput=False, num_reducers=10,
       hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')
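ngrams.py is not reproduced here. A plausible sketch assuming tab-separated Google Books n-gram records of the form (ngram, year, match_count, ...); the field layout is an assumption, not confirmed by the original.

# Hypothetical ngrams.py: sum occurrence counts per (ngram, year).
import hadoopy

def mapper(key, line):
    # Assumed record layout: ngram \t year \t match_count \t ...
    fields = line.split('\t')
    ngram, year, count = fields[0], fields[1], int(fields[2])
    yield (ngram, year), count

def reducer(key, counts):
    yield key, sum(counts)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)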
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 9 16:35:12 2015

@author: user
"""
import hadoopy

input_path = 'wiki_index.tb'
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.launch(input_path, output_path, 'map_red_01.py')

word_urls = dict(hadoopy.readtb(output_path))
for word in word_urls:
    print "%s: %s, %s" % (word, word_urls[word][0], word_urls[word][1])
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 1 16:09:56 2015

@author: user
"""
import hadoopy

input_path = "wiki.tb"
output_path = "index_wiki"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.launch(input_path, output_path, "q_04_mapred.py")

# Testing ...
word_urls = dict(hadoopy.readtb(output_path))
for word, urls in word_urls.iteritems():
    print "%s: %s" % (word, urls)
    break
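The two wiki drivers above launch indexing scripts (map_red_01.py, q_04_mapred.py) whose bodies are not shown. A guess at the general shape of such an inverted-index job, assuming the input records are (url, page_text) pairs, which the original does not confirm.

# Hypothetical inverted-index job: map each word to the URLs containing it.
import hadoopy

def mapper(url, text):
    # Assumed input record: (url, page_text); emit each distinct word once.
    for word in set(text.split()):
        yield word, url

def reducer(word, urls):
    yield word, list(urls)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)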
    elif os.environ['stage'] == 'stage4':
        for key, value in stage4Map(key, value):
            yield key, value

def reducer(key, values):
    # Dispatch to the reducer for the stage selected via the environment.
    if os.environ['stage'] == 'stage1':
        for key, value in stage1Reduce(key, values):
            yield key, value
    elif os.environ['stage'] == 'stage2':
        for key, value in stage2Reduce(key, values):
            yield key, value
    elif os.environ['stage'] == 'stage3':
        for key, value in stage3Reduce(key, values):
            yield key, value
    elif os.environ['stage'] == 'stage4':
        for key, value in stage4Reduce(key, values):
            yield key, value

if __name__ == '__main__':
    if sys.argv[1] == "launch":
        # Earlier stages, kept for reference:
        # hadoopy.launch('/data/MSCDirect', '/data/MSCDirectBrandMPN',
        #                '/home/hadoop/MSCDirect/brandMPN/brandMPN.py',
        #                remove_output=True, jobconfs={'stage': 'stage1'})
        # hadoopy.launch('/data/MSCDirectBrandMPN', '/data/StoreCounts',
        #                '/home/hadoop/MSCDirect/brandMPN/brandMPN.py',
        #                remove_output=True, jobconfs={'stage': 'stage2'})
        # hadoopy.launch('/data/MSCDirect', '/data/MSCUrlUpc',
        #                '/home/hadoop/MSCDirect/brandMPN/brandMPN.py',
        #                remove_output=True, jobconfs={'stage': 'stage3'})
        hadoopy.launch('/data/MSCDirect', '/data/MSCUpcMatches',
                       '/home/hadoop/MSCDirect/brandMPN/brandMPN.py',
                       remove_output=True, jobconfs={'stage': 'stage4'})
        data = hadoopy.readtb('/data/MSCUpcMatches')
        for i, j in data:
            print i, j
    else:
        hadoopy.run(mapper, reducer)
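The __main__ block above (and in the refactored script below) doubles as driver and task body. The invocation pattern is not stated in the source; the following comment sketch describes the assumed flow.

# Assumed invocation pattern for a self-submitting script like the one above:
#
#   $ python brandMPN.py launch    # driver process: submits via hadoopy.launch
#
# The streaming tasks Hadoop spawns re-execute this same file without the
# "launch" argument, so they fall through to hadoopy.run(mapper, reducer).
# The {'stage': ...} jobconf is visible in os.environ because Hadoop Streaming
# exports job configuration values to the task environment.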
#executionPlan.append(('/data/MSCDirectBrands', extractBrandIDMap, extractBrandIDReduce, '/data/MSCDirectExtractedBrands'))
executionPlan.append(('/data/finalmatches/finalmatches', mpidPairsMap, mpidPairsReduce, '/data/finalmatchesMSCfiltered'))
inputs, mapperStages, reducerStages, outputs = zip(*executionPlan)
thisFilename = "/home/hadoop/MSCDirect/brandMPN/mscDirectProcessingRefactored.py"

def mapper(key, value):
    # Run only the mapper whose stage number matches the 'stage' jobconf.
    stageCounter = 0
    for mapperStage in mapperStages:
        stageCounter += 1
        if os.environ["stage"] == "stage" + str(stageCounter):
            for key, value in mapperStage(key, value):
                yield key, value

def reducer(key, values):
    stageCounter = 0
    for reducerStage in reducerStages:
        stageCounter += 1
        if os.environ["stage"] == "stage" + str(stageCounter):
            for key, value in reducerStage(key, values):
                yield key, value

if __name__ == '__main__':
    if sys.argv[1] == "launch":
        # Launch every stage of the plan in order, one MapReduce job per stage.
        for i in xrange(len(mapperStages)):
            jobconfs = {"stage": "stage" + str(i + 1)}
            hadoopy.launch(inputs[i], outputs[i], thisFilename,
                           remove_output=True, jobconfs=jobconfs)
    else:
        hadoopy.run(mapper, reducer)