def tf_mapper(self, _, line):
    """Emit ((word, input_file), 1) per token, plus one doc-counter record per line."""
    input_file = jobconf_from_env('map.input.file')
    cleaned = line
    for punct in string.punctuation:
        cleaned = cleaned.replace(punct, ' ')
    for token in cleaned.split():
        yield (token.lower(), input_file), 1
    # Marker record used downstream to count documents.
    yield ('__doc_counter__', None), input_file
def mapper(self, _, line):
    """Per word: ((word, file), 1); plus '.total.' char count and '.doc.' line marker."""
    src = jobconf_from_env('map.input.file')
    cleaned = line
    for ch in string.punctuation:
        cleaned = cleaned.replace(ch, ' ')
    for token in cleaned.split():
        yield (token.lower(), src), 1
    # Character count of the punctuation-stripped line (same length as input).
    yield ('.total.', src), len(cleaned)
    yield ('.doc.', src), 1
def mapper(self, _, line):
    """Emit ((word, input_file), 1) per token and a '__file_name__' marker per line."""
    src = jobconf_from_env('map.input.file')
    text = line
    for ch in string.punctuation:
        text = text.replace(ch, ' ')
    for token in text.split():
        yield (token.lower(), src), 1
    yield '__file_name__', src
def tf_mapper(self, _, line):
    """Term-frequency mapper: one count per (word, file), plus a doc-counter record."""
    current_file = jobconf_from_env('map.input.file')
    text = line
    for symbol in string.punctuation:
        text = text.replace(symbol, ' ')
    words = text.split()
    for w in words:
        yield (w.lower(), current_file), 1
    yield ('__doc_counter__', None), current_file
def mapper_init(self):
    """Load public/private college counts from the job configuration as floats."""
    self.numPubColleges = float(
        jobconf_from_env("my.job.settings.numPubColleges"))
    self.numPrivColleges = float(
        jobconf_from_env("my.job.settings.numPrivColleges"))
def reducer_init(self):
    """Read the electricity-price, population, and area means from jobconf as floats."""
    self.priceMean = float(jobconf_from_env("my.job.settings.elecMean"))
    self.popMean = float(jobconf_from_env("my.job.settings.popMean"))
    self.areaMean = float(jobconf_from_env("my.job.settings.areaMean"))
def mapper(self, key, value):
    """Count rows whose height is within the configured band and whose zip matches."""
    lo = int(jobconf_from_env('my.job.lower_height'))
    hi = int(jobconf_from_env('my.job.upper_height'))
    target_zip = int(jobconf_from_env('my.job.zipcode'))
    row_zip, height = value.split(',')
    if lo <= int(height) <= hi and int(row_zip) == target_zip:
        yield "Number of people", 1
def reducer_init(self):
    """Load regression parameters (means, intercepts, slopes) from jobconf as floats."""
    self.priceMean = float(jobconf_from_env("my.job.settings.elecMean"))
    self.areaIntercept = float(jobconf_from_env("my.job.settings.areaIntercept"))
    self.popIntercept = float(jobconf_from_env("my.job.settings.popIntercept"))
    self.areaSlope = float(jobconf_from_env("my.job.settings.areaSlope"))
    self.popSlope = float(jobconf_from_env("my.job.settings.popSlope"))
def mapper(self, _, line):
    """Emit rows whose height lies in the configured band and whose zip code matches.

    The bounds are normalized first, so a reversed (lower > upper) pair still works.
    """
    lo = int(jobconf_from_env('my.job.lower_height'))
    hi = int(jobconf_from_env('my.job.upper_height'))
    target = int(jobconf_from_env('my.job.zip_code'))
    if lo > hi:
        lo, hi = hi, lo
    name, zip_code, cm = line.split(',')
    if lo <= int(cm) <= hi and int(zip_code) == target:
        yield "row", (name, zip_code, cm)
def mapper(self, _, line):
    """Emit ((cleaned_word, input_file), 1) for each non-empty cleaned token."""
    input_file = jobconf_from_env('mapreduce.map.input.file')
    for raw in line.decode('utf-8', 'ignore').split():
        # limpiar() is a project helper; presumably normalizes the token — verify.
        word = limpiar(raw)
        if word and word != ',':
            yield (word, input_file), 1
def decide_matrix(self):
    """Return 1 when the current input file belongs to matrix A, else 2 (matrix B)."""
    current_file = jobconf_from_env("map.input.file")
    return 1 if self.options.Matrix_A in current_file else 2
def mapper(self, _, line):
    """Yield ((word, input_file), 1) for each word after stripping punctuation.

    NOTE: the two-argument str.translate/string.maketrans form is Python 2 only.
    """
    # Fix: removed the unused local `num_words` (computed but never read).
    line_stripped = line.translate(string.maketrans("", ""), string.punctuation)
    for word in line_stripped.split():
        yield (word.lower(), jobconf_from_env('map.input.file')), 1
def reducer_init(self):
    """Set up per-reducer tracking state before the first reduce call."""
    # Reference timestamp for the first record.
    self.dt_initial = datetime(2008, 1, 1, 0, 0, 0)
    self.x_initial = self.y_initial = 0.0
    self.user = ''
    self.dist = {'all': 0.0}
    # Configurable cutoff (seconds) between consecutive points.
    self.max_interval_sec = int(jobconf_from_env('max_interval_sec'))
def mapper(self, key, line):
    """Yield word -> (input_file, 1), keeping only letters, blanks and apostrophes."""
    # Drop special characters except spaces and single quotes.
    kept = [ch for ch in line if ch.isalpha() or ch in (' ', "'")]
    cleaned = ''.join(kept)
    for word in cleaned.lower().split():
        yield word, (jobconf_from_env('mapreduce.map.input.file'), 1)
def mapper2(self, key, value):
    # Train-step mapper (Python 2 / theano): treat the incoming key as one sample
    # (features + trailing class index), load it into the shared theano variables,
    # compute cost/gradients, accumulate them on self.gradients, and emit
    # (partition_id, cost) so the driver can track per-mapper loss.
    #print key,value
    # Features are everything but the last element; the last element is the label.
    x_train = np.array(key[:-1])
    y_train = np.zeros(out_size)
    y_train[key[-1]] = 1
    y_train = np.matrix(y_train)
    # NOTE(review): the line above is executed twice — the second conversion is a
    # no-op; looks like leftover duplication, confirm before removing.
    y_train = np.matrix(y_train)
    x_train = np.matrix(x_train)
    # Push the sample into the shared variables read by the theano graph.
    self.x.set_value(x_train.astype('float32'))
    self.y.set_value(y_train.astype('float32'))
    #predict = theano.function([],pred)
    # Partition id identifies which mapper instance this is.
    b = jobconf_from_env('mapreduce.task.partition')
    grads = self.compute_cost()
    if self.count % 50 == 0:
        #b = jobconf_from_env('mapreduce.task.partition')
        print 'cost is ',float(grads[0]),' mapper',b,' iteration :: ',self.count
    # Accumulate gradients across samples: grads[0] is the cost, 1..4 are
    # presumably w1/w2/b1/b2 gradients — verify against compute_cost().
    if len(self.gradients) == 0:
        self.gradients = grads
    else:
        for i in range(0,5):
            self.gradients[i] += grads[i]
    self.count+=1
    yield b,float(grads[0])
def mapper(self, _, line):
    """Tag each line with its source file and emit (join_key, tagged_line).

    Records the first two distinct input file names in the module-level
    R1_NAME / R2_NAME globals so later steps can tell the two tables apart.
    Exits the task on any processing error.
    """
    global R1_NAME
    global R2_NAME
    try:
        input_file_name = jobconf_from_env('map.input.file')
        # Prefix the line with its file of origin, '*'-delimited.
        line = input_file_name + "*" + line
        if input_file_name is not None:
            if R1_NAME is None:
                R1_NAME = input_file_name
            elif R1_NAME != input_file_name:
                R2_NAME = input_file_name
        # Join column (1-based index held in self.column).
        key = line.split(self.table_delim_map)[self.column - 1]
        yield key, line
    except Exception as e:
        # Fix: was a bare `except:` printing a generic message, which hid the
        # actual failure (e.g. TypeError when the jobconf is missing, or an
        # IndexError from a malformed line). Report the real error, then abort
        # as before.
        print("mapper failed: %s" % e)
        sys.exit(-1)
def mapper1(self, key, value): value = map(float,value.split(',')) #print 'mapper ',' ',key,' ',value x_train = np.array(value[:-1]) y_train = np.zeros(out_size) y_train[value[-1]] = 1 #print x_train #print y_train y_train = np.matrix(y_train) x_train = np.matrix(x_train) self.x.set_value(x_train.astype('float32')) self.y.set_value(y_train.astype('float32')) grads = self.compute_cost() #print 'here' b = jobconf_from_env('mapreduce.task.partition') #a = np.asarray(grads[3]) #print a if self.count % 50 == 0: #b = jobconf_from_env('mapreduce.task.partition') print 'cost is ',float(grads[0]),' mapper',b,' iteration :: ', self.count #dic[1] = grads #cost_all.append((b,cost)) if len(self.gradients) == 0: self.gradients = grads else: for i in range(0,5): self.gradients[i] += grads[i] #c = np.matrix(np.zeros((32,25))) #c = range(1,500) self.count+=1 yield b, float(grads[0])
def mapper_init(self):
    """Build the face detector/recognizer from job settings; ensure output dir exists."""
    self.dataset_dir = 'dataset_dir'
    task_output = jobconf_from_env('mapreduce.task.output.dir')
    self.output_dir = os.path.join(task_output, 'faces')
    settings = {name: jobconf_from_env('job.settings.' + name)
                for name in ('cascade_cpu', 'cascade_gpu',
                             'colorferet', 'gpu_or_cpu')}
    self.detector = create_detector(settings['gpu_or_cpu'],
                                    settings['cascade_cpu'],
                                    settings['cascade_gpu'])
    self.recognizer = create_recognizer(settings['colorferet'])
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    self.write_results = False
def decide_input_file(self):
    """Return 1 when this mapper is reading the test-feature file, else 2 (.model)."""
    # jobconf reports the file this mapper is currently reading from.
    current = jobconf_from_env("map.input.file")
    return 1 if self.options.test in current else 2
def mapper(self, _, line):
    """Emit partial-product pairs for sparse matrix multiply A (tag '0') x B (tag '1').

    Input line: tag, row-or-col index, then (index, value) pairs for the
    non-zero entries of that row/column.
    """
    v = line.split(',')
    # Fix: use floor division so `n` stays an int under Python 3 as well;
    # plain `/` would yield a float there and break range(n).
    n = (len(v) - 2) // 2  # number of non-zero (index, value) pairs
    i = int(jobconf_from_env("row.num.A"))  # number of rows of A
    j = int(jobconf_from_env("col.num.B"))  # number of columns of B
    if v[0] == '0':
        # A entry: replicate across every output column q of the product.
        for p in range(n):
            for q in range(j):
                yield (int(v[1]), q), (int(v[p * 2 + 2]), float(v[p * 2 + 3]))
    elif v[0] == '1':
        # B entry: replicate across every output row q of the product.
        for p in range(n):
            for q in range(i):
                yield (q, int(v[p * 2 + 2])), (int(v[1]), float(v[p * 2 + 3]))
def mapper(self, _, line):
    """Normalize a UTF-8 line to ASCII, strip non-word chars, emit ((word, file), 1).

    Python 2 only: relies on the builtin `unicode` and on str.decode().
    """
    # Fix: removed the unused alias `u` (was `data = u = unicode(...)`).
    data = unicode(line, "utf-8")
    # Decompose accents, drop the combining marks, then collapse any remaining
    # non-word characters to single spaces.
    normal = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore')
    new_line = re.sub('\W+', ' ', normal.lower())
    filename = jobconf_from_env('mapreduce.map.input.file')
    for w in new_line.decode('utf-8', 'ignore').split():
        yield (w, filename), 1
def mapper(self, _, line):
    """Yield (lowercased_word, input_file) for every word on the line."""
    text = line
    for punct in string.punctuation:
        text = text.replace(punct, ' ')
    lowered = [w.lower() for w in text.split()]
    for word in lowered:
        # jobconf_from_env(...) tells us which input file this mapper is reading.
        yield word, jobconf_from_env('map.input.file')
def final_mapper(self):
    # End-of-mapper hook (Python 2 / theano): record the average cost, apply one
    # gradient-descent update to copies of the shared weights (w1, w2, b1, b2),
    # and stash the result in the module-level `parent` dict keyed by this
    # mapper's partition id. Reads module globals: final_cost, w1, w2, b1, b2,
    # alpha, parent.
    print '........in final mapper...............',(self.gradients[0] / self.count)
    # Average cost over all samples this mapper processed.
    final_cost.append(self.gradients[0] / self.count)
    #print 'cost after iteration ',self.count,' is ',(self.gradients[0] / self.count)
    temp_w1 = w1.get_value()
    temp_w2 = w2.get_value()
    temp_b1 = b1.get_value()
    temp_b2 = b2.get_value()
    #dic.append(1)
    #print 'dic is ',dic[0]
    b = jobconf_from_env('mapreduce.task.partition')
    print 'now ',b
    # SGD step: param - alpha * accumulated gradient (alpha is a module global).
    self.gradients[1] = temp_w1 - (alpha*self.gradients[1])
    self.gradients[2] = temp_w2 - (alpha*self.gradients[2])
    self.gradients[3] = temp_b1 - (alpha*np.asarray(self.gradients[3]))
    self.gradients[4] = temp_b2 - (alpha*np.asarray(self.gradients[4]))
    # Hand the updated parameters to the driver, keyed by partition.
    parent[b] = self.gradients
    '''w1.set_value(self.gradients[1].astype('float32')) w2.set_value(self.gradients[2].astype('float32')) #print 'here',dic['b1'].tolist()[0],b1.get_value() b1.set_value((np.asarray(self.gradients[3])).astype('float32')) b2.set_value((np.asarray(self.gradients[4])).astype('float32'))'''
    # Reset the accumulator for the next round.
    self.gradients = []
def mapper(self, _, line):
    """Emit (input_file, 1) for every input line; summing gives lines per file."""
    input_file = jobconf_from_env('mapreduce.map.input.file')
    yield input_file, 1
def mapper(self, _, line):
    """Emit ((word, input_file), 1) per word, with punctuation deleted first.

    NOTE: the two-argument str.translate/string.maketrans form is Python 2 only.
    """
    # Fix: dropped the dead local `num_words` (assigned but never used).
    line_stripped = line.translate(string.maketrans("",""), string.punctuation)
    sentence = line_stripped.split()
    for word in sentence:
        yield (word.lower(), jobconf_from_env('map.input.file')), 1
def test_get_new_hadoop_jobconf(self):
    """A new-style env var should resolve under both old and new jobconf names."""
    expected = "Edsger W. Dijkstra"
    os.environ["mapreduce_job_user_name"] = expected
    self.assertEqual(jobconf_from_env("user.name"), expected)
    self.assertEqual(jobconf_from_env("mapreduce.job.user.name"), expected)
def test_get_missing_jobconf_not_in_table(self):
    """Defaults must work for jobconf names absent from the translation table.

    Regression test: defaults used to be ignored for unknown jobconf variables.
    """
    self.assertIsNone(jobconf_from_env('user.defined'))
    self.assertEqual(jobconf_from_env('user.defined', 'beauty'), 'beauty')
def mapper(self, _, line):
    """Count characters per line, bump a Hadoop counter, and key by input file."""
    length = len(line)
    self.increment_counter('group', 'total_chars', length)
    yield jobconf_from_env('map.input.file'), length
def mapper_init(self):
    """Emit each name in JOBCONF_LIST paired with its runtime jobconf value."""
    for name in JOBCONF_LIST:
        yield name, jobconf_from_env(name)
def mapper_init(self):
    """One-time mapper setup: train an LBPH face recognizer on the small
    colorferet dataset and initialize a cascade face detector (GPU if CUDA is
    available, else CPU). All paths come from jobconf; the output directory is
    created if missing.
    """

    def load_from_small_dataset(colorferet_small_dir):
        # Load training images: one subdirectory per label, .png files only,
        # grayscale, resized to 256x256. Returns (images, numeric labels).
        face_labels_str = {
            'Black-or-African-American': 0,
            'Asian': 1,
            'Asian-Middle-Eastern': 2,
            'Hispanic': 3,
            'Native-American': 4,
            'Other': 5,
            'Pacific-Islander': 6,
            'White': 7
        }
        images = []
        labels = []
        for face_label_str in face_labels_str:
            face_label_num = face_labels_str[face_label_str]
            image_dir = os.path.join(colorferet_small_dir, face_label_str)
            image_files = [
                os.path.join(image_dir, image_file)
                for image_file in os.listdir(image_dir)
            ]
            # cv2.imread(..., 0) loads grayscale; keep only .png files.
            images_tmp = [
                cv2.resize(cv2.imread(image_file, 0), (256, 256))
                for image_file in image_files
                if image_file.split('.')[-1] == 'png'
            ]
            labels_tmp = [face_label_num] * len(images_tmp)
            images.extend(images_tmp)
            labels.extend(labels_tmp)
        return images, labels

    self.video_dir = jobconf_from_env('job.settings.video_dir')
    self.output_dir = os.path.join(
        jobconf_from_env('mapreduce.task.output.dir'), 'faces')
    # The face-recognizer factory moved into the cv2.face module in OpenCV 3.
    self.opencv_version = int(cv2.__version__.split('.')[0])
    if self.opencv_version == 2:
        self.recognizer = cv2.createLBPHFaceRecognizer()
        # self.recognizer = cv2.createFisherFaceRecognizer()
        # self.recognizer = cv2.createEigenFaceRecognizer()
    elif self.opencv_version == 3:
        self.recognizer = cv2.face.createLBPHFaceRecognizer()
        # self.recognizer = cv2.face.createFisherFaceRecognizer()
        # self.recognizer = cv2.face.createEigenFaceRecognizer()
    # Prefer the CUDA cascade detector when the host supports it.
    if cv2gpu.is_cuda_compatible():
        sys.stderr.write('Using GPU CascadeClassifier')
        cv2gpu.init_gpu_detector(
            jobconf_from_env('job.settings.cascade_gpu'))
    else:
        sys.stderr.write('Using CPU CascadeClassifier')
        cv2gpu.init_cpu_detector(
            jobconf_from_env('job.settings.cascade_cpu'))
    images, labels = load_from_small_dataset(
        jobconf_from_env('job.settings.colorferet'))
    self.recognizer.train(images, numpy.array(labels))
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    # Inverse mapping: numeric prediction -> human-readable label.
    self.face_labels_num = {
        0: 'Black-or-African-American',
        1: 'Asian',
        2: 'Asian-Middle-Eastern',
        3: 'Hispanic',
        4: 'Native-American',
        5: 'Other',
        6: 'Pacific-Islander',
        7: 'White'
    }
    self.write_results = False
def mapper1_init(self):
    """Read the 'interval' jobconf value and store it as an int."""
    raw_interval = jobconf_from_env('interval')
    self.interval = int(raw_interval)
def test_get_new_hadoop_jobconf(self):
    """Both the legacy and the new dotted name should see the new-style env var."""
    name = 'Edsger W. Dijkstra'
    os.environ['mapreduce_job_user_name'] = name
    for jobconf_name in ('user.name', 'mapreduce.job.user.name'):
        self.assertEqual(jobconf_from_env(jobconf_name), name)
def test_default(self):
    """Unset jobconf values come back as None unless a default is supplied."""
    self.assertIsNone(jobconf_from_env('user.name'))
    self.assertEqual(jobconf_from_env('user.name', 'dave'), 'dave')
def mapper(self, _, line):
    """Key each line's character count by the file it came from."""
    source_file = jobconf_from_env('map.input.file')
    yield source_file, len(line)
def get_terms(self, _, line):
    """Emit ((term, docname), 1) for every WORD_RE match on the line."""
    docname = jobconf_from_env('map.input.file')
    for match in WORD_RE.findall(line):
        yield (match.lower(), docname), 1
def mapper(self, _, line):
    """Emit the current input file under a single 'total' key for every line."""
    current_file = jobconf_from_env('map.input.file')
    yield 'total', current_file
def mapper_init(self):
    """Bump an init counter, then emit ((step_num, name), value) for each jobconf."""
    self.increment_counter("count", "mapper_init", 1)
    step = self.options.step_num
    for name in JOBCONF_LIST:
        yield (step, name), jobconf_from_env(name, None)
def mapper(self, _, line):
    """Yield (word, input_file) after converting punctuation to spaces."""
    text = line
    for punct in string.punctuation:
        text = text.replace(punct, ' ')
    for token in text.split():
        yield token, jobconf_from_env('map.input.file')
def test_default(self):
    """With no env var set, the lookup yields None or the supplied default."""
    self.assertIsNone(jobconf_from_env("user.name"))
    self.assertEqual(jobconf_from_env("user.name", "dave"), "dave")
def mapper(self, _, line):
    """Emit a per-file word count plus a global '.total_counter.' word count.

    NOTE: the two-argument translate/maketrans form is Python 2 only.
    """
    stripped = line.translate(string.maketrans("",""), string.punctuation)
    word_count = len(stripped.split())
    yield jobconf_from_env('map.input.file'), word_count
    yield '.total_counter.', word_count
def mapper(self, _, line):
    """Emit (input_file, 1) once per WORD_RE match on the line."""
    source = jobconf_from_env("mapreduce.map.input.file")
    for _token in WORD_RE.findall(line):
        yield source, 1
def mapper_init(self):
    """One-time mapper setup: train an LBPH face recognizer on the small
    colorferet dataset and initialize a cascade face detector (GPU when CUDA
    is available, otherwise CPU). Paths come from jobconf; the output
    directory is created if missing.
    """

    def load_from_small_dataset(colorferet_small_dir):
        # Load training data: one subdirectory per label, .png files only,
        # grayscale, resized to 256x256. Returns (images, numeric labels).
        face_labels_str = {
            'Black-or-African-American': 0,
            'Asian': 1,
            'Asian-Middle-Eastern': 2,
            'Hispanic': 3,
            'Native-American': 4,
            'Other': 5,
            'Pacific-Islander': 6,
            'White': 7
        }
        images = []
        labels = []
        for face_label_str in face_labels_str:
            face_label_num = face_labels_str[face_label_str]
            image_dir = os.path.join(colorferet_small_dir, face_label_str)
            image_files = [os.path.join(image_dir, image_file)
                           for image_file in os.listdir(image_dir)]
            # cv2.imread(..., 0) loads grayscale; keep only .png files.
            images_tmp = [cv2.resize(cv2.imread(image_file, 0), (256, 256))
                          for image_file in image_files
                          if image_file.split('.')[-1] == 'png']
            labels_tmp = [face_label_num] * len(images_tmp)
            images.extend(images_tmp)
            labels.extend(labels_tmp)
        return images, labels

    self.video_dir = jobconf_from_env('job.settings.video_dir')
    self.output_dir = os.path.join(
        jobconf_from_env('mapreduce.task.output.dir'), 'faces')
    # The face-recognizer factory moved into the cv2.face module in OpenCV 3.
    self.opencv_version = int(cv2.__version__.split('.')[0])
    if self.opencv_version == 2:
        self.recognizer = cv2.createLBPHFaceRecognizer()
        # self.recognizer = cv2.createFisherFaceRecognizer()
        # self.recognizer = cv2.createEigenFaceRecognizer()
    elif self.opencv_version == 3:
        self.recognizer = cv2.face.createLBPHFaceRecognizer()
        # self.recognizer = cv2.face.createFisherFaceRecognizer()
        # self.recognizer = cv2.face.createEigenFaceRecognizer()
    # Prefer the CUDA cascade detector when the host supports it.
    if cv2gpu.is_cuda_compatible():
        sys.stderr.write('Using GPU CascadeClassifier')
        cv2gpu.init_gpu_detector(jobconf_from_env('job.settings.cascade_gpu'))
    else:
        sys.stderr.write('Using CPU CascadeClassifier')
        cv2gpu.init_cpu_detector(jobconf_from_env('job.settings.cascade_cpu'))
    images, labels = load_from_small_dataset(
        jobconf_from_env('job.settings.colorferet'))
    self.recognizer.train(images, numpy.array(labels))
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    # Inverse mapping: numeric prediction -> human-readable label.
    self.face_labels_num = {
        0: 'Black-or-African-American',
        1: 'Asian',
        2: 'Asian-Middle-Eastern',
        3: 'Hispanic',
        4: 'Native-American',
        5: 'Other',
        6: 'Pacific-Islander',
        7: 'White'
    }
    self.write_results = False