def runner(job):
    """Start the dumbo job with ``MarginalMap`` as mapper and ``sum_reduce`` as reducer.

    ``job`` is accepted but not used by this function.  dumbo is imported
    inside the function body, so it is only required once ``runner`` is
    actually called.
    """
    from dumbo import run as run_job

    run_job(MarginalMap, sum_reduce)
#!/usr/bin/python
"""
select name,occupation from incomeTable;
"""


def mapper(key, value):
    """Project the name and occupation columns out of one CSV record.

    ``value`` must hold exactly five comma-separated fields, in the order
    name, age, sex, occupation, incomelevel; any other field count raises
    ``ValueError`` from the tuple unpacking.  The incoming ``key`` is ignored.

    Yields a single ``("<name>-<occupation>", 1)`` pair.
    """
    fields = value.split(",")
    name, _age, _sex, occupation, _incomelevel = fields
    yield "-".join((name, occupation)), 1


if __name__ == '__main__':
    import dumbo

    dumbo.run(mapper)
def runner(job):
    """Start the dumbo job with ``Mapper`` as mapper and ``reduce`` as reducer.

    ``job`` is accepted but not used by this function.  dumbo is imported
    inside the function body, so it is only required once ``runner`` is
    actually called.
    """
    from dumbo import run as run_job

    run_job(Mapper, reduce)
import json

# When true, messages sent to the 'commits' stream contribute a count of 0.
EXCLUDE_COMMITS = True
# When true, messages sent to the 'test-stream' stream contribute a count of 0.
EXCLUDE_TESTS = True


def mapper(key, value):
    """Count the exclamation marks in one JSON-encoded message.

    ``value`` is a JSON object with the keys ``sender_id``, ``content`` and
    ``display_recipient``.  Yields ``(sender_id, count)`` where ``count`` is
    the number of ``'!'`` characters in ``content``, or 0 when the message
    belongs to a stream excluded by the module-level flags.  The incoming
    ``key`` is ignored.
    """
    message = json.loads(value)
    sender = message['sender_id']
    exclamations = message['content'].count('!')
    stream = message['display_recipient']

    ignored = (EXCLUDE_COMMITS and stream == 'commits') or (
        EXCLUDE_TESTS and stream == 'test-stream')
    yield sender, (0 if ignored else exclamations)


def reducer(key, values):
    """Yield ``(key, total)`` where ``total`` is the sum of all ``values``."""
    total = sum(values)
    yield key, total


if __name__ == '__main__':
    import dumbo

    dumbo.run(mapper, reducer)
def blur_image(im, n, ny=None):
    """Blur ``im`` by convolving it with a gaussian kernel of typical size ``n``.

    The optional keyword argument ``ny`` allows for a different kernel size
    in the y direction.  The kernel comes from ``gauss_kern`` and the
    convolution is performed by ``signal.convolve`` with ``mode='valid'``.
    """
    kernel = gauss_kern(n, sizey=ny)
    return signal.convolve(im, kernel, mode='valid')


def mapper(key, value):
    """Load the image named by ``value``, gray-scale it, blur it and emit it as JPEG.

    Yields ``(value, jpeg_bytes)``.  The incoming ``key`` is ignored.
    """
    pixels = asarray(Image.open(value))
    # Channels 0, 1, 2 of the last axis are combined with fixed weights.
    red, green, blue = pixels[:, :, 0], pixels[:, :, 1], pixels[:, :, 2]
    gray = red*.222 + green*.7067 + blue*.0713

    blurred = blur_image(gray, 20)

    # Serialise the blurred image into an in-memory buffer rather than a file.
    jpeg_buffer = StringIO.StringIO()
    Image.fromarray(uint8(blurred)).save(jpeg_buffer, format="JPEG")
    yield value, jpeg_buffer.getvalue()


@opt("getpath","yes")
def reducer(key, values):
    """Re-emit every value under the pair ``(key, key)``."""
    # NOTE(review): this function is also passed as the combiner below, so
    # keys may already be pairs when the reduce phase sees them - confirm
    # that this is intended.
    doubled_key = (key, key)
    for item in values:
        yield doubled_key, item


if __name__ == "__main__":
    import dumbo

    dumbo.run(mapper, reducer, combiner=reducer)
yield TRUE_T_STR, true_cnt_t yield TRUE_F_STR, true_cnt_f yield FALSE_T_STR, false_cnt_t yield FALSE_F_STR, false_cnt_f class Reducer(): def __call__(self, key, values): """ Reducer Program: statistical for elsvm Inputs: key: true_label or false_label values: cnt for label Outputs: the statistical result """ if str(key) == TRUE_T_STR: yield TRUE_T_STR, sum(values) elif str(key) == TRUE_F_STR: yield TRUE_F_STR, sum(values) elif str(key) == FALSE_T_STR: yield FALSE_T_STR, sum(values) elif str(key) == FALSE_F_STR: yield FALSE_F_STR, sum(values) if __name__ == "__main__": dumbo.run(Mapper, Reducer)
def main():
    """Run the dumbo job; ``reducer`` serves as both reducer and combiner."""
    job_options = {'combiner': reducer}
    dumbo.run(mapper, reducer, **job_options)
class Solve_Reducer:
    """
    Solve the subproblem
    """

    def __init__(self):
        # One solve is performed per entry of tau_vec.
        self.tau_vec = [0.5, 0.75, 0.95]
        self.ntau = len(self.tau_vec)

    def __call__(self, key, values):
        """Stack all incoming row chunks and solve once per value in ``tau_vec``.

        Every element of ``values`` is a sequence of rows; the rows are
        concatenated into one 2-D array whose last column is passed to
        ``quantreg_ipm`` as the second argument and whose remaining columns
        are passed as the first.  Yields ``([key, m], solutions)`` where ``m``
        is the number of stacked rows and ``solutions`` holds one list per
        tau.
        """
        rows = []
        for chunk in values:
            rows.extend(chunk)

        SAb = np.array(rows)
        m, n = SAb.shape
        lhs = SAb[:, :n - 1]
        rhs = SAb[:, n - 1]

        x = np.zeros((n - 1, self.ntau))
        for col in range(self.ntau):
            x[:, col] = quantreg_ipm(lhs, rhs, self.tau_vec[col])

        yield [key, m], x.T.tolist()


if __name__ == '__main__':
    import dumbo

    dumbo.run(Unif_Samp_Mapper, Solve_Reducer)
#!/usr/bin/env dumbo


def mapper(key, value):
    """
    Each record is a line of text.
    key=<byte that the line starts in the file>
    value=<line of text>

    Yields (key, total), where total is the sum of the whitespace-separated
    numbers on the line.
    """
    total = sum(float(token) for token in value.split())
    yield key, total


if __name__ == '__main__':
    import dumbo
    import dumbo.lib

    dumbo.run(mapper, dumbo.lib.identityreducer)
"""
Map lines of a matrix to a sequence file:
Key=<lineno>, Value=[row_i]
"""
import sys


def mapper(key, value):
    """Parse one text line into a row of floats.

    Yields ``(key, row)`` where ``row`` is the list of whitespace-separated
    numbers in ``value``.  Lines that contain no tokens yield nothing.
    """
    row = [float(token) for token in value.split()]
    if not row:
        return
    yield key, row


class Converter:
    """Write ``(key, row)`` records to standard output, one text line per row."""

    def __init__(self, opts):
        pass

    def __call__(self, data):
        """Print every row of ``data`` as space-separated ``%18.16e`` numbers.

        ``data`` is an iterable of ``(key, row)`` pairs; the keys are ignored.
        An empty row produces an empty line.
        """
        for _key, row in data:
            # sys.stdout.write instead of the Python 2 print statement: the
            # bytes written are the same, and the module stays importable
            # under Python 3.
            line = " ".join("%18.16e" % (entry,) for entry in row)
            sys.stdout.write(line + "\n")


if __name__ == '__main__':
    import dumbo
    import dumbo.lib

    dumbo.run(mapper, dumbo.lib.identityreducer)
def run_task(self):
    """Run the dumbo job described by ``self.args`` and ``self.kwargs``.

    Before the call, ``self.index`` is stored in ``self.kwargs`` under the
    key ``'iter'``, so it is passed to ``dumbo.run`` as the ``iter`` keyword.
    """
    options = self.kwargs
    options['iter'] = self.index
    dumbo.run(*self.args, **options)
def run(self):
    """Run a dumbo job with this object's ``mapper`` and ``reducer``.

    dumbo is imported inside the method, so it is only required once
    ``run`` is actually called.
    """
    from dumbo import run as run_job

    run_job(self.mapper, self.reducer)
def main():
    """Run the dumbo job; ``reducer`` serves as both reducer and combiner."""
    job_options = {'combiner': reducer}
    dumbo.run(Mapper, reducer, **job_options)
#!/usr/bin/python
"""
select name,occupation from incomeTable;
"""


def mapper(key, value):
    """Emit ``("<name>-<occupation>", 1)`` for one comma-separated record.

    The record must consist of exactly five fields (name, age, sex,
    occupation, incomelevel); otherwise the unpacking raises ``ValueError``.
    The incoming ``key`` is not used.
    """
    name, _, _, occupation, _ = value.split(",")
    projected = "-".join([name, occupation])
    yield projected, 1


if __name__ == '__main__':
    import dumbo

    dumbo.run(mapper)
Creation: 2014-4-17 Revision: 2014-4-17 """ import json import sys from dumbo import run reload(sys) sys.setdefaultencoding('utf8') class Mapper(object): def __init__(self): pass def __call__(self, key, value): record = json.loads(value.strip()) if "city" in record: yield record['city'], 1 class Reducer(object): def __init__(self): pass def __call__(self, key, values): yield key, sum(values) if __name__ == '__main__': run(Mapper, Reducer)
import os
import math


class Mapper:
    """Turn ``person -> friend1,friend2,...`` lines into per-pair friend lists."""

    def __init__(self):
        pass

    def __call__(self, key, value):
        """Emit, for every friend of a person, the person's other friends.

        ``value`` has the form ``"<person> -> <friend>,<friend>,..."``; all
        spaces are removed before parsing.  For each friend a pair is yielded:
        the key is the alphabetically sorted ``"<a>,<b>"`` of person and
        friend, the value is the comma-joined list of the person's remaining
        friends (an empty string when there are none).

        Lines without a ``->`` separator and empty friend names are skipped.
        The incoming ``key`` is ignored.
        """
        parts = value.replace(' ', '').split('->')
        if len(parts) < 2:
            # Blank or malformed line: nothing to emit.
            return
        person = parts[0]
        friends = [name for name in parts[1].split(',') if name]

        for friend in friends:
            pair = sorted([person, friend])
            # Filter on whole names.  Substring replacement on the joined
            # string would also strip the tail of names that merely end
            # with this friend's name (e.g. removing "A" from "A,BA,C").
            others = [name for name in friends if name != friend]
            yield ','.join(pair), ','.join(others)


def reducer(key, values):
    """Collect the names that occur more than once across ``values``.

    Every element of ``values`` is a comma-separated list of names.  A name
    is appended to the result each time it is seen again after its first
    occurrence; empty names are ignored.  Yields ``(key, common_names)``.
    """
    seen = set()
    common = []
    for friend_list in values:
        for name in friend_list.split(","):
            if not name:
                continue
            if name in seen:
                common.append(name)
            else:
                seen.add(name)
    yield key, common


if __name__ == "__main__":
    import dumbo

    dumbo.run(Mapper, reducer, combiner=None)
#print numpy.sum(var_beta[:, edges_indices_list].T) #print numpy.dot((self._edge_prior[:, edges_indices_list] - 1), var_beta[:, edges_indices_list].T) # term 6 #corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1))) #corpus_level_log_likelihood += numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list])); corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1)) + numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list]), axis=1)); corpus_level_log_likelihood += numpy.sum( - (var_beta[:, edges_indices_list]-1) * self.compute_dirichlet_expectation(var_beta[:, edges_indices_list])); assert numpy.min(var_beta)>=0; # TODO: add in alpha updating # compute the sufficient statistics for alpha and update #alpha_sufficient_statistics = scipy.special.psi(self._gamma) - scipy.special.psi(numpy.sum(self._gamma, axis=1)[:, numpy.newaxis]); #alpha_sufficient_statistics = numpy.sum(alpha_sufficient_statistics, axis=0)[numpy.newaxis, :]; #self.update_alpha(alpha_sufficient_statistics) #print numpy.sum(numpy.exp(self.E_log_beta), axis=1); return corpus_level_log_likelihood ''' yield current_topic_index, " ".join( ["%f" % (item) for item in E_log_beta[0, :]]) if __name__ == '__main__': import dumbo dumbo.run(Mapper, Reducer, combiner=Combiner)
yield TRUE_F_STR, true_cnt_f yield FALSE_T_STR, false_cnt_t yield FALSE_F_STR, false_cnt_f class Reducer(): def __call__(self, key, values): """ Reducer Program: statistical for elsvm Inputs: key: true_label or false_label values: cnt for label Outputs: the statistical result """ if str(key) == TRUE_T_STR: yield TRUE_T_STR, sum(values) elif str(key) == TRUE_F_STR: yield TRUE_F_STR, sum(values) elif str(key) == FALSE_T_STR: yield FALSE_T_STR, sum(values) elif str(key) == FALSE_F_STR: yield FALSE_F_STR, sum(values) if __name__ == "__main__": dumbo.run(Mapper, Reducer)
if len(data) < 3: return ngram = data[0].split() year = data[1] count = int(data[2]) if len(ngram) != self.expected_tokens: return pair = sorted([ngram[0], ngram[self.expected_tokens - 1]]) k = pair + [year] yield (k, count) def combiner(key, values): yield (key, sum(values)) def reducer(key, values): yield "%s\t%s\t%s" % tuple(key), str(sum(values)) if __name__ == '__main__': import dumbo # import pdb # pdb.set_trace() # dumbo.run(NgramMapper, reducer, combiner=combiner) dumbo.run(NgramMapper, reducer)
data = value.split('\t') if len(data) < 3: return ngram = data[0].split() year = data[1] count = int(data[2]) if len(ngram) != self.expected_tokens: return pair = sorted([ngram[0], ngram[self.expected_tokens - 1]]) k = pair + [year] yield (k, count) def combiner(key, values): yield (key, sum(values)) def reducer(key, values): yield "%s\t%s\t%s" % tuple(key), str(sum(values)) if __name__ == '__main__': import dumbo # import pdb # pdb.set_trace() # dumbo.run(NgramMapper, reducer, combiner=combiner) dumbo.run(NgramMapper, reducer)
#!/usr/bin/env python
import re

# One access-log line: origin tag, bracketed timestamp, quoted request and
# two trailing integer fields.
logline = re.compile(r'^(local|remote) - - \[(.*)\] "(.*)" (\d+) (\d+)$', re.I)


def mapper(key, value):
    """Emit ``(field, 1)`` for every log line that matches ``logline``.

    ``field`` is the second ``/``-separated component of the bracketed
    timestamp.  Lines that do not match the pattern yield nothing.  The
    incoming ``key`` is ignored.
    """
    match = logline.match(value)
    if not match:
        return
    timestamp = match.group(2)
    yield timestamp.split('/')[1], 1


if __name__ == '__main__':
    import dumbo

    dumbo.run(mapper, dumbo.sumreducer, dumbo.sumreducer)
def extend_point(self, point): """ Extent a new value into point array """ point = np.resize(point, len(point) + 1) point[-1] = 1 return point def __call__(self, data): """ Mapper Program It will output the modified single line """ for docID, doc in data: for term in doc.split("\n"): self.SEP = self.SEP if self.SEP is not None else get_sep(term) point = np.fromstring(term, dtype=np.float64, sep=self.SEP) label = int(point[-1]) last_value = self.getDValue(point) point = self.extend_point(point) point[-1] = last_value point[-2] = float(label) output = ",".join([str(i) for i in point]) yield output, "\t" if __name__ == "__main__": dumbo.run(Mapper)
# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #!/usr/bin/env python # the script we tell dumbo to run. from zohmg.mapper import Mapper from zohmg.reducer import Reducer from zohmg.combiner import Combiner from usermapper import map # !@# import dumbo dumbo.run(Mapper(map), Reducer(), Combiner())
""" Counts how many times each word occurs. """ def mapper(key,value): for word in value.split(): yield word,1 def reducer(key,values): yield key,sum(values) if __name__ == "__main__": import dumbo dumbo.run(mapper,reducer,reducer)
corpus_level_log_likelihood += (scipy.special.gammaln(numpy.sum(self._edge_prior[:, edges_indices_list])) - numpy.sum(scipy.special.gammaln(self._edge_prior[:, edges_indices_list]))) * self._number_of_topics; corpus_level_log_likelihood += numpy.sum(numpy.dot((self._edge_prior[:, edges_indices_list] - 1), var_beta[:, edges_indices_list].T)); #print numpy.sum(var_beta[:, edges_indices_list].T) #print numpy.dot((self._edge_prior[:, edges_indices_list] - 1), var_beta[:, edges_indices_list].T) # term 6 #corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1))) #corpus_level_log_likelihood += numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list])); corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1)) + numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list]), axis=1)); corpus_level_log_likelihood += numpy.sum( - (var_beta[:, edges_indices_list]-1) * self.compute_dirichlet_expectation(var_beta[:, edges_indices_list])); assert numpy.min(var_beta)>=0; # TODO: add in alpha updating # compute the sufficient statistics for alpha and update #alpha_sufficient_statistics = scipy.special.psi(self._gamma) - scipy.special.psi(numpy.sum(self._gamma, axis=1)[:, numpy.newaxis]); #alpha_sufficient_statistics = numpy.sum(alpha_sufficient_statistics, axis=0)[numpy.newaxis, :]; #self.update_alpha(alpha_sufficient_statistics) #print numpy.sum(numpy.exp(self.E_log_beta), axis=1); return corpus_level_log_likelihood ''' yield current_topic_index, " ".join(["%f" % (item) for item in E_log_beta[0, :]]); if __name__ == '__main__': import dumbo; dumbo.run(Mapper, Reducer, combiner=Combiner);
import json


def mapper(key, value):
    """Count the words in one JSON-encoded message.

    ``value`` is a JSON object with the keys ``sender_id``,
    ``sender_full_name`` and ``content``.  Yields
    ``((sender_id, sender_full_name), word_count)`` where ``word_count`` is
    the number of whitespace-separated tokens in ``content``.  The incoming
    ``key`` is ignored.
    """
    message = json.loads(value)
    sender = (message['sender_id'], message['sender_full_name'])
    words = message['content'].split()
    yield sender, len(words)


def reducer(key, values):
    """Yield ``(key, total)`` where ``total`` is the sum of ``values``."""
    total = sum(values)
    yield key, total


if __name__ == '__main__':
    import dumbo

    dumbo.run(mapper, reducer, combiner=reducer)
import sys


class Mapper:
    """Tag every (row index, element) record with the column taken from its file name."""

    opts = [('addpath','yes')]

    def __call__(self, key, value):
        """Yield ``(row_index, (column, element))`` for one record.

        ``key`` is a ``(path, original_key)`` pair.  The column is the single
        character that sits immediately before the last ``.`` of ``path``, so
        this only works for one-digit column indices.  ``value[0]`` is the
        row index and ``value[1]`` the element, which is converted to float.
        """
        path, _original_key = key
        dot = path.rfind('.')
        column = path[dot - 1]
        row_index, element = value[0], value[1]
        yield row_index, (column, float(element))


def reducer(key, values):
    """Assemble one row: order the ``(column, element)`` pairs by column.

    Yields ``(key, row)`` where ``row`` is the list of elements as floats in
    ascending column order.
    """
    ordered = list(values)
    ordered.sort(key=lambda pair: pair[0])
    yield key, [float(pair[1]) for pair in ordered]


if __name__ == '__main__':
    import dumbo
    import dumbo.lib

    dumbo.run(Mapper, reducer)