def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path,
                   hdfs_output,
                   'WordCount.py',
                   files=['../stop_words.txt'])
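The WordCount.py script that this driver submits is not part of the listing; below is a minimal sketch of what it might contain, assuming the hadoopy.run(mapper, reducer) pattern used in the later examples and assuming the stop-word list shipped via files=['../stop_words.txt'] is readable from the task's working directory.

import hadoopy

# stop_words.txt is assumed to arrive in the task's working directory because the
# driver ships it with files=['../stop_words.txt'].
with open('stop_words.txt') as fp:
    STOP_WORDS = set(line.strip() for line in fp)

def mapper(key, value):
    # value is one line of input text; emit (word, 1) for every non-stop word
    for word in value.split():
        word = word.lower()
        if word not in STOP_WORDS:
            yield word, 1

def reducer(key, values):
    # sum the per-word counts
    yield key, sum(values)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)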
Example #2
def launch(in_name,
           out_name,
           script_path,
           hbase_in=True,
           hbase_out=False,
           columns=(),
           start_row=None,
           stop_row=None,
           single_value=None,
           **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row,
                 single_value, kw)
    hadoopy.launch(in_name, out_name, script_path, **kw)
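A hypothetical call to this HBase-aware wrapper; the table name, column, row keys, and script below are illustrative only, and the extra keyword is simply forwarded to hadoopy.launch through **kw.

launch('images',                             # assumed: HBase table scanned when hbase_in=True
       'hdfs://namenode/user/out/features',  # HDFS output path (hypothetical)
       'feature_extractor.py',               # hypothetical mapper/reducer script
       columns=['data:image'],               # hypothetical column family:qualifier
       start_row='00000000',
       stop_row='0000ffff',
       num_reducers=10)                      # passed through **kw to hadoopy.launch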
Example #3
def random_cluster(input_path, output_path, num_clusters, cluster_path, num_reducers):
    def inc_path():
        global iter_cnt
        iter_cnt +=1
        return '%s/%d' % (output_path, iter_cnt)
    hadoopy.freeze(script_path='random_cluster.py',
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms'],
                   remove_dir=True)
    hadoopy.launch(in_name=input_path,
                   out_name=inc_path(),
                   cmdenvs=['NUM_CLUSTERS=%d' % (num_clusters)],
                   script_path='random_cluster.py',
                   #combiner=True,
                   frozen_path='frozen')
Example #4
def canopy(input_path, output_path, num_clusters, cluster_path, num_reducers):
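    # Canopy-based clustering driver, chaining four hadoopy jobs:
    #   1. canopy_cluster.py picks canopy centers; consolidate_clusters() gathers them into canopies.pkl
    #   2. canopy_cluster_assign.py assigns the input points to those canopies
    #   3. the same script assigns the seed clusters read from cluster_path
    #   4. kmeans_canopy_cluster.py runs k-means constrained to the canopies, with
    #      clusters.pkl and the nn_l2sqr_c module shipped to every task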
    def inc_path():
        global iter_cnt
        iter_cnt +=1
        return '%s/%d' % (output_path, iter_cnt)
    def prev_path():
        return '%s/%d' % (output_path, iter_cnt)
    soft = str(4000.)
    hard = str(250.)

    hadoopy.freeze(script_path='canopy_cluster.py',
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c'],
                   remove_dir=True)
    hadoopy.launch(in_name=input_path,
                   out_name=inc_path(),
                   script_path='canopy_cluster.py',
                   files='nn_l2sqr.py',
                   cmdenvs=['NN_MODULE=nn_l2sqr_c',
                            'CANOPY_SOFT_DIST=%s' % (soft),
                            'CANOPY_HARD_DIST=%s' % (hard)],
                   frozen_path='frozen')
    consolidate_clusters(prev_path(), 'canopies.pkl')

    hadoopy.freeze(script_path='canopy_cluster_assign.py',
                   remove_dir=True)
    hadoopy.launch(in_name=input_path,
                   out_name=inc_path(),
                   script_path='canopy_cluster_assign.py',
                   cmdenvs=['CANOPY_SOFT_DIST=%s' % (soft),
                            'CANOPIES_PKL=' + 'canopies.pkl'],
                   files='canopies.pkl',
                   reducer=None,
                   frozen_path='frozen')
    input_path = prev_path()

    hadoopy.launch(in_name=cluster_path,
                   out_name=inc_path(),
                   script_path='canopy_cluster_assign.py',
                   cmdenvs=['CANOPY_SOFT_DIST=%s' % (soft),
                            'CANOPIES_PKL=' + 'canopies.pkl'],
                   files='canopies.pkl',
                   reducer=None,
                   frozen_path='frozen')
    consolidate_canopy_clusters(prev_path(), 'clusters.pkl')

    hadoopy.freeze(script_path='kmeans_canopy_cluster.py',
                   shared_libs=SHARED_LIBS,
                   modules=['vitrieve_algorithms', 'nn_l2sqr_c',],
                   remove_dir=True)
    hadoopy.launch(in_name=input_path,
                   out_name=inc_path(),
                   script_path='kmeans_canopy_cluster.py',
                   cmdenvs=['CLUSTERS_PKL=%s' % ('clusters.pkl'),
                            'CANOPY_SOFT_DIST=%s' % (soft),
                            'NN_MODULE=nn_l2sqr_c'],
                   files=['nn_l2sqr_c.py', 'clusters.pkl'],
                   frozen_path='frozen')
Example #5
def launch_map_update(nodes, job_id, redis_host, jobconfs=None):
    # Write one typedbytes record per node (its script source plus optional
    # cmdenvs/files/outputs) under a temporary HDFS input directory, then launch
    # hadoopy_rt_job.py over those records with speculative execution and the
    # task timeout disabled.
    jobconfs_base = {'mapred.map.tasks.speculative.execution': 'false',
                     'mapred.reduce.tasks.speculative.execution': 'false',
                     'mapred.task.timeout': '0'}
    if jobconfs:
        jobconfs_base.update(jobconfs)
    cmdenvs = {'job_id': job_id,
               'hadoopy_rt_redis': redis_host}
    with hadoopy_helper.hdfs_temp() as input_path:
        for node in nodes:
            print(node)
            v = {'script_name': os.path.basename(node['script_path']),
                 'script_data': open(node['script_path']).read()}
            if 'cmdenvs' in node and node['cmdenvs'] is not None:
                v['cmdenvs'] = node['cmdenvs']
            if 'files' in node and node['files'] is not None:
                v['files'] = dict((os.path.basename(f), open(f).read()) for f in node['files'])
            if 'outputs' in node and node['outputs']:
                v['outputs'] = node['outputs']
            hadoopy.writetb('%s/input/%d' % (input_path, node['name']), [(node['name'], v)])
        hadoopy.launch(input_path + '/input', input_path + '/output_path_empty',
                       _lf('hadoopy_rt_job.py'), cmdenvs=cmdenvs,
                       jobconfs=jobconfs_base)
Example #6
def launch(in_name, out_name, script_path, hbase_in=True, hbase_out=False, columns=(), start_row=None, stop_row=None, single_value=None, **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw)
    hadoopy.launch(in_name, out_name, script_path, **kw)
def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path, hdfs_output, "WordCount.py", files=["../stop_words.txt"])
Example #8
#!/usr/bin/env python

import hadoopy

input_path = "/alice.txt"
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)

hadoopy.launch(input_path, output_path, 'WordCount.py')

word_counts = dict(hadoopy.readtb(output_path))

for word in word_counts:
    print "%s: %d" % (word, word_counts[word])
Example #10
File: driver.py  Project: karthiknrao/gunda
import hadoopy

hadoopy.launch('/data/MSCDirect', '/data/MSCDirectBrandMPN', 'brandMPN.py', remove_output = True)
Example #11
from hadoopy import launch

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy'

launch(input_path,
       output_path,
       'ngrams.py',
       use_seqoutput=False,
       num_reducers=10,
       hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')
Example #12
# -*- coding: utf-8 -*-
"""
Created on Mon Nov  9 16:35:12 2015

@author: user
"""

import hadoopy

input_path = 'wiki_index.tb'
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s"%output_path)

hadoopy.launch(input_path, output_path, 'map_red_01.py')
word_urls = dict(hadoopy.readtb(output_path))

for word in word_urls:
    print "%s: %s, %s" % (word, word_urls[word][0], word_urls[word][1])
Example #13
# -*- coding: utf-8 -*-
"""
Created on Tue Dec  1 16:09:56 2015

@author: user
"""

import hadoopy

input_path = "wiki.tb"
output_path = "index_wiki"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)

hadoopy.launch(input_path, output_path, "q_04_mapred.py")

# Testing ...
word_urls = dict(hadoopy.readtb(output_path))
for word, urls in word_urls.iteritems():
    print "%s: %s" % (word, urls)
    break
Example #14
    elif os.environ['stage'] == 'stage4':
        for key, value in stage4Map(key, value):
            yield key, value

def reducer(key, values):
    if os.environ['stage'] == 'stage1':
        for key, value in stage1Reduce(key, values):
            yield key, value
    elif os.environ['stage'] == 'stage2':
        for key, value in stage2Reduce(key, values):
            yield key, value
    elif os.environ['stage'] == 'stage3':
        for key, value in stage3Reduce(key, values):
            yield key, value
    elif os.environ['stage'] == 'stage4':
        for key, value in stage4Reduce(key, values):
            yield key, value

if __name__ == '__main__':
    if sys.argv[1] == "launch":
#        hadoopy.launch('/data/MSCDirect', '/data/MSCDirectBrandMPN', '/home/hadoop/MSCDirect/brandMPN/brandMPN.py', remove_output = True, jobconfs={'stage':'stage1'})
#        hadoopy.launch('/data/MSCDirectBrandMPN', '/data/StoreCounts', '/home/hadoop/MSCDirect/brandMPN/brandMPN.py', remove_output = True, jobconfs={'stage':'stage2'})
        #hadoopy.launch('/data/MSCDirect', '/data/MSCUrlUpc', '/home/hadoop/MSCDirect/brandMPN/brandMPN.py', remove_output = True, jobconfs={'stage':'stage3'})
        hadoopy.launch('/data/MSCDirect', '/data/MSCUpcMatches', '/home/hadoop/MSCDirect/brandMPN/brandMPN.py', remove_output = True, jobconfs={'stage':'stage4'})
        data = hadoopy.readtb('/data/MSCUpcMatches')
        for i, j in data:
            print i, j
    else:
        hadoopy.run(mapper, reducer)

Example #15
if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s" % input_path)
os.system('hdfs dfs -cp ' + edge_path + ' ' + input_path)

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.writetb(output_path, read_vector(r0))

if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s" % temp_path)

iteration = 0
while diff > 0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s" % temp_path)
    hadoopy.launch(input_path, temp_path, 'PageRank.py', files=[])

    generator_vector = hadoopy.readtb(output_path)
    rk = {}
    for score in generator_vector:
        url = score[0]
        r = score[1]
        rk[url] = r

    generator_vector = hadoopy.readtb(temp_path + "/part-00000")
    rkpp = {}
    for i, score in enumerate(generator_vector):
        url = score[0][0]
        r = score[0][1]
        rkpp[url] = r
#executionPlan.append(('/data/MSCDirectBrands', extractBrandIDMap, extractBrandIDReduce, '/data/MSCDirectExtractedBrands'))
executionPlan.append(('/data/finalmatches/finalmatches', mpidPairsMap, mpidPairsReduce, '/data/finalmatchesMSCfiltered'))

inputs,mapperStages,reducerStages,outputs = zip(*executionPlan)
thisFilename = "/home/hadoop/MSCDirect/brandMPN/mscDirectProcessingRefactored.py"

def mapper(key, value):
    stageCounter = 0
    for mapperStage in mapperStages:
        stageCounter += 1
        if os.environ["stage"] == "stage" + str(stageCounter):
            for key, value in mapperStage(key, value):
                yield key, value

def reducer(key, value):
    stageCounter = 0
    for reducerStage in reducerStages:
        stageCounter += 1
        if os.environ["stage"] == "stage" + str(stageCounter):
            for key, value in reducerStage(key, value):
                yield key, value

if __name__ == '__main__':
    if sys.argv[1] == "launch":
        for i in xrange(0, len(mapperStages)):
            jobconfs = {}
            jobconfs["stage"] = "stage" + str(i + 1)
            hadoopy.launch(inputs[i], outputs[i], thisFilename, remove_output = True, jobconfs=jobconfs)
    else:
        hadoopy.run(mapper, reducer)
Example #17
from hadoopy import launch

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy'

launch(
    input_path,
    output_path,
    'ngrams.py',
    use_seqoutput=False,
    num_reducers=10,
    hstreaming=
    '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar'
)