示例#1
0
 def tf_mapper(self, _, line):
     """Tokenize *line* and emit ((word, input_file), 1) per word.

     Also emits a ('__doc_counter__', None) record carrying the input
     file name so a later step can count documents.
     """
     # Replace punctuation with spaces so split() yields clean tokens.
     for x in string.punctuation:
         line = line.replace(x, ' ')
     # Loop-invariant: look up the input file name once, not per word.
     input_file = jobconf_from_env('map.input.file')
     for word in line.split():
         yield (word.lower(), input_file), 1
     yield ('__doc_counter__', None), input_file
示例#2
0
 def mapper(self, _, line):
     """Emit ((word, file), 1) per word plus per-line length and doc counters."""
     for x in string.punctuation:
         line = line.replace(x, ' ')
     # Loop-invariant env lookup: fetch the input file name once per line.
     input_file = jobconf_from_env('map.input.file')
     for word in line.split():
         yield (word.lower(), input_file), 1
     # NOTE: len(line) counts characters of the punctuation-stripped line.
     yield ('.total.', input_file), len(line)
     yield ('.doc.', input_file), 1
 def mapper(self, _, line):
     """Emit ((word, input file), 1) per token and a filename marker per line."""
     # Map every punctuation character to a space before tokenizing.
     cleaned = ''.join(' ' if ch in string.punctuation else ch for ch in line)
     for token in cleaned.split():
         yield (token.lower(), jobconf_from_env('map.input.file')), 1
     yield '__file_name__', jobconf_from_env('map.input.file')
示例#4
0
 def tf_mapper(self, _, line):
     """Tokenize *line*, emitting ((word, input_file), 1) and a doc counter."""
     for x in string.punctuation:
         line = line.replace(x, ' ')
     # Hoisted out of the loop: the input file name does not change per word.
     input_file = jobconf_from_env('map.input.file')
     for word in line.split():
         yield (word.lower(), input_file), 1
     yield ('__doc_counter__', None), input_file
示例#5
0
    def mapper_init(self):
        """Read the public/private college counts from job configuration."""
        # jobconf values arrive as strings; convert to float on assignment.
        self.numPubColleges = float(
            jobconf_from_env("my.job.settings.numPubColleges"))
        self.numPrivColleges = float(
            jobconf_from_env("my.job.settings.numPrivColleges"))
 def reducer_init(self):
     """Load the electricity, population, and area means from jobconf."""
     # Each setting arrives as a string; parse it straight to float.
     self.priceMean = float(jobconf_from_env("my.job.settings.elecMean"))
     self.popMean = float(jobconf_from_env("my.job.settings.popMean"))
     self.areaMean = float(jobconf_from_env("my.job.settings.areaMean"))
示例#7
0
 def mapper(self, key, value):
     """Count people whose height is in range and whose zipcode matches.

     *value* is a 'zipcode,height' CSV pair; the bounds and the target
     zipcode come from job configuration.
     """
     lower_height = int(jobconf_from_env('my.job.lower_height'))
     upper_height = int(jobconf_from_env('my.job.upper_height'))
     zipcode = int(jobconf_from_env('my.job.zipcode'))
     (Zipcode, Height) = value.split(',')
     # zipcode is already an int -- the original re-converted it needlessly.
     if lower_height <= int(Height) <= upper_height and int(Zipcode) == zipcode:
         yield "Number of people", 1
    def reducer_init(self):
        """Pull regression parameters (mean, intercepts, slopes) from jobconf."""
        # All jobconf settings are strings; coerce each to float on read.
        settings = (
            ('priceMean', 'my.job.settings.elecMean'),
            ('areaIntercept', 'my.job.settings.areaIntercept'),
            ('popIntercept', 'my.job.settings.popIntercept'),
            ('areaSlope', 'my.job.settings.areaSlope'),
            ('popSlope', 'my.job.settings.popSlope'),
        )
        for attr, key in settings:
            setattr(self, attr, float(jobconf_from_env(key)))
示例#9
0
    def mapper(self, _, line):
        """Yield rows whose height is within the configured bounds and zipcode."""
        lower = int(jobconf_from_env('my.job.lower_height'))
        upper = int(jobconf_from_env('my.job.upper_height'))
        zipc = int(jobconf_from_env('my.job.zip_code'))

        # Tolerate reversed bounds by normalizing the interval.
        if lower > upper:
            lower, upper = upper, lower

        GivenName, ZipCode, Centimeters = line.split(',')

        # Emit only records matching both the height range and the zipcode.
        if lower <= int(Centimeters) <= upper and int(ZipCode) == zipc:
            yield "row", (GivenName, ZipCode, Centimeters)
示例#10
0
    def mapper(self, _, line):
        """Emit ((cleaned word, input file), 1) for each word in the line."""
        # The input file name is loop-invariant; look it up once per line
        # instead of once per word.
        filename = jobconf_from_env('mapreduce.map.input.file')
        for w in line.decode('utf-8', 'ignore').split():
            w = limpiar(w)
            # Skip empty results and bare commas left by the cleaner.
            if w != ',' and w:
                yield (w, filename), 1
 def decide_matrix(self):
     """Return 1 when the current input file belongs to matrix A, else 2."""
     filename = jobconf_from_env("map.input.file")
     # The configured A-matrix path is a substring of the A input file name.
     return 1 if self.options.Matrix_A in filename else 2
示例#12
0
 def mapper(self, _, line):
     """Strip punctuation and emit ((word, input file), 1) per word.

     Uses the Python 2 str.translate(deletechars) idiom, matching the
     rest of this file.
     """
     line_stripped = line.translate(string.maketrans("", ""),
                                    string.punctuation)
     # Hoisted: the input file name does not vary within a line.
     # (Also dropped the unused num_words local from the original.)
     input_file = jobconf_from_env('map.input.file')
     for word in line_stripped.split():
         yield (word.lower(), input_file), 1
示例#13
0
 def reducer_init(self):
     """Initialize per-user trajectory state and the interval threshold."""
     # Reference epoch used for the first timestamp comparison.
     self.dt_initial = datetime(2008, 1, 1, 0, 0, 0)
     self.x_initial = 0.0
     self.y_initial = 0.0
     self.user = ''
     # Distances accumulated per key; 'all' holds the grand total.
     self.dist = {'all': 0.0}
     # Maximum allowed gap (seconds) between points, from job configuration.
     self.max_interval_sec = int(jobconf_from_env('max_interval_sec'))
示例#14
0
文件: 3.py 项目: wennyHou/big-data
    def mapper(self, key, line):
        """Emit word -> (input file, 1) for every word in the line."""
        # Drop every character that is not a letter, a space, or an
        # apostrophe (translated from the original Spanish comment).
        formatLine = ''.join(e for e in line
                             if e.isalpha() or e == ' ' or e == "'")
        # Loop-invariant: resolve the input file name once per line.
        input_file = jobconf_from_env('mapreduce.map.input.file')
        for word in formatLine.lower().split():
            yield word, (input_file, 1)
示例#15
0
 def mapper2(self, key, value):
     """Run one training example through the model and accumulate gradients.

     NOTE(review): key appears to be a feature vector whose last element
     is the integer class label -- confirm against the upstream step.
     Emits (task partition id, cost) per example.
     """
     #print key,value
     # Features are everything but the last element of the key.
     x_train = np.array(key[:-1])
     # One-hot encode the label into a vector of length out_size.
     y_train = np.zeros(out_size)
     y_train[key[-1]]  = 1 
     
     # NOTE(review): y_train is wrapped in np.matrix twice; the second
     # call is a no-op and looks redundant.
     y_train = np.matrix(y_train)
     y_train = np.matrix(y_train)
     x_train = np.matrix(x_train)
     
     # Load the example into the model's shared variables (self.x / self.y).
     self.x.set_value(x_train.astype('float32'))
     self.y.set_value(y_train.astype('float32'))
     #predict = theano.function([],pred)
     # Hadoop task partition id identifies this mapper instance.
     b = jobconf_from_env('mapreduce.task.partition')
     grads = self.compute_cost()
     # Log the cost every 50 examples for progress monitoring.
     if self.count % 50 == 0:
         #b = jobconf_from_env('mapreduce.task.partition')
         print 'cost is ',float(grads[0]),'  mapper',b,' iteration :: ',self.count
     # Accumulate the five gradient components element-wise across examples.
     if len(self.gradients) == 0:
         self.gradients = grads
     else:
         for i in range(0,5):
             self.gradients[i] += grads[i]
     self.count+=1
     yield b,float(grads[0])
    def mapper(self, _, line):
        """Tag each input line with its source table and key it by join column.

        Records the first (and, if different, the second) table name seen
        in the module-level globals R1_NAME / R2_NAME.
        """
        try:
            # Make the table-name registry writable from this function.
            global R1_NAME
            global R2_NAME

            # Name of the file this mapper is currently reading.
            input_file_name = jobconf_from_env('map.input.file')

            # Prefix the line with its source file so reducers can tell
            # the two relations apart.
            line = input_file_name + "*" + line

            # Remember which table(s) we have seen so far.
            if input_file_name is not None:
                if R1_NAME is None:
                    R1_NAME = input_file_name
                else:
                    if R1_NAME != input_file_name:
                        R2_NAME = input_file_name

            # Extract the join column as the shuffle key.
            key = line.split(self.table_delim_map)[self.column - 1]
            yield key, line

        # A bare ``except:`` would also swallow SystemExit and
        # KeyboardInterrupt; catch Exception so interrupts still propagate.
        except Exception:
            print("Something is not right.")
            sys.exit(-1)
示例#17
0
    def mapper1(self, key, value):
        """Train on one CSV example: parse features and label, compute the
        gradients, accumulate them, and emit (partition id, cost)."""

        # CSV record -> list of floats; the last field is the class label.
        value = map(float,value.split(','))
        #print 'mapper  ','  ',key,'    ',value
        x_train = np.array(value[:-1])
        # One-hot target vector of length out_size.
        y_train = np.zeros(out_size)
        y_train[value[-1]]  = 1      
        #print x_train
        #print y_train
        y_train = np.matrix(y_train)
        
        x_train = np.matrix(x_train)
        # Push the example into the model's shared variables.
        self.x.set_value(x_train.astype('float32'))
        self.y.set_value(y_train.astype('float32'))
        
        grads = self.compute_cost()
        #print 'here'
        # Hadoop task partition id distinguishes concurrent mappers.
        b = jobconf_from_env('mapreduce.task.partition')
        #a = np.asarray(grads[3])
        #print a
        # Log the cost every 50 examples for progress monitoring.
        if self.count % 50 == 0:
            #b = jobconf_from_env('mapreduce.task.partition')
            print 'cost is ',float(grads[0]),'  mapper',b,' iteration :: ', self.count
            #dic[1] = grads
            #cost_all.append((b,cost))
        # Element-wise accumulation of the five gradient components.
        if len(self.gradients) == 0:
            self.gradients = grads
        else:
            for i in range(0,5):
                self.gradients[i] += grads[i]
        #c = np.matrix(np.zeros((32,25)))
        #c = range(1,500)

        self.count+=1
        yield b, float(grads[0])
示例#18
0
    def mapper_init(self):
        """Set up the face detector/recognizer and the output directory."""
        self.dataset_dir = 'dataset_dir'
        self.output_dir = os.path.join(
            jobconf_from_env('mapreduce.task.output.dir'), 'faces')

        # Detector and recognizer settings delivered via job configuration.
        cpu_cascade = jobconf_from_env('job.settings.cascade_cpu')
        gpu_cascade = jobconf_from_env('job.settings.cascade_gpu')
        colorferet_path = jobconf_from_env('job.settings.colorferet')
        device_choice = jobconf_from_env('job.settings.gpu_or_cpu')

        self.detector = create_detector(device_choice, cpu_cascade, gpu_cascade)
        self.recognizer = create_recognizer(colorferet_path)

        # Make sure the per-task output directory exists before any writes.
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        self.write_results = False
示例#19
0
 def decide_input_file(self):
     """Return 1 when reading the test-feature file, 2 for the .model file."""
     # jobconf exposes the name of the file this mapper is reading from.
     current_file = jobconf_from_env("map.input.file")
     return 1 if self.options.test in current_file else 2
示例#20
0
    def mapper(self, _, line):
        """Map one sparse-matrix line into partial products for C = A * B.

        Input line format: 'tag,index,c1,v1,c2,v2,...' where tag '0' marks
        an A entry and tag '1' a B entry; the remaining fields come in
        (index, value) pairs.
        """
        v = line.split(',')
        # Floor division: the original used '/', which returns a float under
        # Python 3 and would break range(n) below.
        n = (len(v) - 2) // 2  # number of non-zero pairs on this line
        i = int(jobconf_from_env(
            "row.num.A"))  # we need to know how many rows of A
        j = int(jobconf_from_env(
            "col.num.B"))  # we need to know how many columns of B

        if v[0] == '0':
            # A entry: replicate it across every output column q.
            for p in range(n):
                for q in range(j):
                    yield (int(v[1]), q), (int(v[p * 2 + 2]),
                                           float(v[p * 2 + 3]))

        elif v[0] == '1':
            # B entry: replicate it across every output row q.
            for p in range(n):
                for q in range(i):
                    yield (q, int(v[p * 2 + 2])), (int(v[1]),
                                                   float(v[p * 2 + 3]))
示例#21
0
    def mapper(self, _, line):
        """Normalize a line to ASCII words and emit ((word, file), 1).

        Python 2 only: relies on the ``unicode`` builtin and str/bytes
        round-tripping.
        """

        # Decode the raw bytes, then strip accents via NFKD decomposition
        # and drop anything outside ASCII.
        data = u = unicode(line, "utf-8")
        normal = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore')
        # Collapse every run of non-word characters into a single space.
        new_line = re.sub('\W+', ' ', normal.lower())

        filename = jobconf_from_env('mapreduce.map.input.file')

        for w in new_line.decode('utf-8', 'ignore').split():
            yield (w, filename), 1
示例#22
0
 def mapper(self, _, line):
     """Emit (word, input file) for every punctuation-stripped word."""
     for x in string.punctuation:
         line = line.replace(x, ' ')
     # jobconf_from_env names the file this mapper is currently working on
     # (translated from the original Spanish comment); it is loop-invariant,
     # so resolve it once instead of once per word.
     input_file = jobconf_from_env('map.input.file')
     # Yield directly instead of building an intermediate lowercased list.
     for word in line.split():
         yield word.lower(), input_file
示例#23
0
 def final_mapper(self):
     """Average the accumulated cost, apply one gradient-descent update to
     the weights, and publish the updated parameters via ``parent``.

     NOTE(review): relies on module-level globals (w1, w2, b1, b2, alpha,
     final_cost, parent) defined elsewhere in this file.
     """
     print '........in final mapper...............',(self.gradients[0] / self.count)
     # Mean cost over all examples seen by this mapper.
     final_cost.append(self.gradients[0] / self.count)
     #print 'cost after iteration ',self.count,' is ',(self.gradients[0] / self.count)
     temp_w1 = w1.get_value()
     temp_w2 = w2.get_value()
     temp_b1 = b1.get_value()
     temp_b2 = b2.get_value()
     #dic.append(1)
     #print 'dic is ',dic[0]
     # Partition id of this task, used as the key into ``parent``.
     b = jobconf_from_env('mapreduce.task.partition')
     print 'now ',b
     # SGD step: parameter - alpha * accumulated gradient.
     self.gradients[1] = temp_w1 - (alpha*self.gradients[1])
     self.gradients[2] = temp_w2 - (alpha*self.gradients[2])
     self.gradients[3] = temp_b1 - (alpha*np.asarray(self.gradients[3]))
     self.gradients[4] = temp_b2 - (alpha*np.asarray(self.gradients[4]))
     parent[b] = self.gradients
     '''w1.set_value(self.gradients[1].astype('float32'))
     w2.set_value(self.gradients[2].astype('float32'))
     #print 'here',dic['b1'].tolist()[0],b1.get_value()
     b1.set_value((np.asarray(self.gradients[3])).astype('float32'))
     b2.set_value((np.asarray(self.gradients[4])).astype('float32'))'''
     # Reset the accumulator for the next iteration.
     self.gradients = []
示例#24
0
 def mapper(self, _, line):
     """Emit (current input file name, 1) for every input line."""
     source_file = jobconf_from_env('mapreduce.map.input.file')
     yield source_file, 1
示例#25
0
 def mapper(self, _, line):
     """Strip punctuation (Python 2 translate idiom) and count words per file."""
     line_stripped = line.translate(string.maketrans("",""), string.punctuation)
     # The input file name is constant for the whole line; fetch it once.
     # (Also removed the unused num_words local from the original.)
     input_file = jobconf_from_env('map.input.file')
     for word in line_stripped.split():
         yield (word.lower(), input_file), 1
示例#26
0
 def test_get_new_hadoop_jobconf(self):
     """The underscored env var resolves under both jobconf spellings."""
     os.environ["mapreduce_job_user_name"] = "Edsger W. Dijkstra"
     for name in ("user.name", "mapreduce.job.user.name"):
         self.assertEqual(jobconf_from_env(name), "Edsger W. Dijkstra")
示例#27
0
 def test_get_missing_jobconf_not_in_table(self):
     """Defaults must work even for jobconf variables unknown to the table."""
     # Regression test: defaults used to be ignored for unknown variables.
     self.assertIsNone(jobconf_from_env('user.defined'))
     self.assertEqual(jobconf_from_env('user.defined', 'beauty'), 'beauty')
示例#28
0
 def mapper(self, _, line):
     """Count characters per input file, also bumping a Hadoop counter."""
     char_count = len(line)
     self.increment_counter('group', 'total_chars', char_count)
     yield jobconf_from_env('map.input.file'), char_count
示例#29
0
 def mapper(self, _, line):
     """Emit a single (current input file, 1) pair per line."""
     input_file = jobconf_from_env('mapreduce.map.input.file')
     yield input_file, 1
示例#30
0
 def mapper_init(self):
     """Emit (name, value) for every jobconf variable in JOBCONF_LIST."""
     for name in JOBCONF_LIST:
         yield name, jobconf_from_env(name)
示例#31
0
    def mapper_init(self):
        """Train the face recognizer and set up the detector before mapping.

        Loads labeled training images from the small colorferet dataset,
        trains an LBPH recognizer on them, initializes a GPU or CPU cascade
        detector depending on CUDA availability, and creates the per-task
        output directory. All paths come from job configuration.
        """
        def load_from_small_dataset(colorferet_small_dir):
            """Return (images, labels) loaded from per-label subdirectories."""

            # Directory name -> numeric class label.
            face_labels_str = {
                'Black-or-African-American': 0,
                'Asian': 1,
                'Asian-Middle-Eastern': 2,
                'Hispanic': 3,
                'Native-American': 4,
                'Other': 5,
                'Pacific-Islander': 6,
                'White': 7
            }

            images = []
            labels = []

            for face_label_str in face_labels_str:
                face_label_num = face_labels_str[face_label_str]
                image_dir = os.path.join(colorferet_small_dir, face_label_str)
                image_files = [
                    os.path.join(image_dir, image_file)
                    for image_file in os.listdir(image_dir)
                ]
                # Keep only .png files; load grayscale and resize to 256x256.
                images_tmp = [
                    cv2.resize(cv2.imread(image_file, 0), (256, 256))
                    for image_file in image_files
                    if image_file.split('.')[-1] == 'png'
                ]
                labels_tmp = [face_label_num] * len(images_tmp)
                images.extend(images_tmp)
                labels.extend(labels_tmp)

            return images, labels

        self.video_dir = jobconf_from_env('job.settings.video_dir')
        self.output_dir = os.path.join(
            jobconf_from_env('mapreduce.task.output.dir'), 'faces')
        # The recognizer factory moved into cv2.face between OpenCV 2 and 3.
        self.opencv_version = int(cv2.__version__.split('.')[0])

        if self.opencv_version == 2:
            self.recognizer = cv2.createLBPHFaceRecognizer()
            # self.recognizer = cv2.createFisherFaceRecognizer()
            # self.recognizer = cv2.createEigenFaceRecognizer()
        elif self.opencv_version == 3:
            self.recognizer = cv2.face.createLBPHFaceRecognizer()
            # self.recognizer = cv2.face.createFisherFaceRecognizer()
            # self.recognizer = cv2.face.createEigenFaceRecognizer()

        # Prefer the CUDA cascade when the hardware supports it.
        if cv2gpu.is_cuda_compatible():
            sys.stderr.write('Using GPU CascadeClassifier')
            cv2gpu.init_gpu_detector(
                jobconf_from_env('job.settings.cascade_gpu'))
        else:
            sys.stderr.write('Using CPU CascadeClassifier')
            cv2gpu.init_cpu_detector(
                jobconf_from_env('job.settings.cascade_cpu'))

        images, labels = load_from_small_dataset(
            jobconf_from_env('job.settings.colorferet'))

        self.recognizer.train(images, numpy.array(labels))
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # Reverse map: numeric label -> human-readable class name.
        self.face_labels_num = {
            0: 'Black-or-African-American',
            1: 'Asian',
            2: 'Asian-Middle-Eastern',
            3: 'Hispanic',
            4: 'Native-American',
            5: 'Other',
            6: 'Pacific-Islander',
            7: 'White'
        }

        self.write_results = False
示例#32
0
 def mapper1_init(self):
     """Cache the 'interval' jobconf setting as an int for the mapper."""
     raw_interval = jobconf_from_env('interval')
     self.interval = int(raw_interval)
示例#33
0
 def test_get_new_hadoop_jobconf(self):
     """The underscored env var is visible under old and new jobconf names."""
     expected = 'Edsger W. Dijkstra'
     os.environ['mapreduce_job_user_name'] = expected
     self.assertEqual(jobconf_from_env('user.name'), expected)
     self.assertEqual(jobconf_from_env('mapreduce.job.user.name'), expected)
示例#34
0
 def test_default(self):
     """An unset jobconf returns None, or the caller-supplied default."""
     self.assertIsNone(jobconf_from_env('user.name'))
     self.assertEqual(jobconf_from_env('user.name', 'dave'), 'dave')
 def mapper(self, _, line):
     """Emit (input file, line length) for each input line."""
     input_file = jobconf_from_env('map.input.file')
     yield input_file, len(line)
示例#36
0
 def get_terms(self, _, line):
     """Emit ((term, document), 1) for every word-regex match in the line."""
     docname = jobconf_from_env('map.input.file')
     for match in WORD_RE.findall(line):
         yield (match.lower(), docname), 1
示例#37
0
 def mapper(self, _, line):
     """Emit ('total', current input file) once per line."""
     current_file = jobconf_from_env('map.input.file')
     yield 'total', current_file
示例#38
0
 def mapper_init(self):
     """Report ((step, jobconf name), value) for every variable of interest."""
     self.increment_counter("count", "mapper_init", 1)
     step = self.options.step_num
     for name in JOBCONF_LIST:
         yield (step, name), jobconf_from_env(name, None)
示例#39
0
 def test_default(self):
     """A missing jobconf yields None, or the default when one is given."""
     missing = jobconf_from_env('user.name')
     self.assertEqual(missing, None)
     self.assertEqual(jobconf_from_env('user.name', 'dave'), 'dave')
示例#40
0
 def mapper(self, _, line):
     """Emit (word, input file) after stripping punctuation from the line."""
     # Turn each punctuation character into a space before splitting.
     cleaned = line
     for mark in string.punctuation:
         cleaned = cleaned.replace(mark, ' ')
     for word in cleaned.split():
         yield word, jobconf_from_env('map.input.file')
示例#41
0
 def test_default(self):
     """An unset jobconf returns None unless a default is provided."""
     self.assertIsNone(jobconf_from_env("user.name"))
     self.assertEqual(jobconf_from_env("user.name", "dave"), "dave")
示例#42
0
 def test_get_new_hadoop_jobconf(self):
     """Setting the underscored env var satisfies both jobconf spellings."""
     name = 'Edsger W. Dijkstra'
     os.environ['mapreduce_job_user_name'] = name
     self.assertEqual(jobconf_from_env('user.name'), name)
     self.assertEqual(jobconf_from_env('mapreduce.job.user.name'), name)
示例#43
0
 def mapper(self, _, line):
     """Emit per-file and global word counts for one line (Python 2)."""
     # translate() with deletechars removes all punctuation in one pass.
     stripped = line.translate(string.maketrans("",""), string.punctuation)
     word_count = len(stripped.split())
     yield jobconf_from_env('map.input.file'), word_count
     yield '.total_counter.', word_count
示例#44
0
 def test_get_missing_jobconf_not_in_table(self):
     """Unknown jobconf names return None or the supplied default."""
     # Regression: defaults once failed for variables missing from the table.
     self.assertIsNone(jobconf_from_env('user.defined'))
     self.assertEqual(jobconf_from_env('user.defined', 'beauty'), 'beauty')
 def mapper(self, _, line):
     """Track total characters per file via a counter and emitted pairs."""
     num_chars = len(line)
     self.increment_counter('group', 'total_chars', num_chars)
     yield jobconf_from_env('map.input.file'), num_chars
示例#46
0
 def mapper(self, _, line):
     """Emit (input file, 1) once per word-regex match in the line."""
     for _match in WORD_RE.findall(line):
         yield jobconf_from_env("mapreduce.map.input.file"), 1
示例#47
0
    def mapper_init(self):
        """Train the face recognizer and set up the detector before mapping.

        Loads labeled training images from the small colorferet dataset,
        trains an LBPH recognizer, initializes a GPU or CPU cascade detector
        depending on CUDA availability, and prepares the output directory.
        All paths and settings come from job configuration.
        """

        def load_from_small_dataset(colorferet_small_dir):
            """Return (images, labels) loaded from per-label subdirectories."""

            # Directory name -> numeric class label.
            face_labels_str = {
                'Black-or-African-American': 0,
                'Asian': 1,
                'Asian-Middle-Eastern': 2,
                'Hispanic': 3,
                'Native-American': 4,
                'Other': 5,
                'Pacific-Islander': 6,
                'White': 7
            }

            images = []
            labels = []

            for face_label_str in face_labels_str:
                face_label_num = face_labels_str[face_label_str]
                image_dir = os.path.join(colorferet_small_dir, face_label_str)
                image_files = [os.path.join(image_dir, image_file) for image_file in os.listdir(image_dir)]
                # Keep only .png files; load grayscale and resize to 256x256.
                images_tmp = [cv2.resize(cv2.imread(image_file, 0), (256, 256)) for image_file in image_files if image_file.split('.')[-1] == 'png']
                labels_tmp = [face_label_num] * len(images_tmp)
                images.extend(images_tmp)
                labels.extend(labels_tmp)

            return images, labels

        self.video_dir = jobconf_from_env('job.settings.video_dir')
        self.output_dir = os.path.join(jobconf_from_env('mapreduce.task.output.dir'), 'faces')
        # The recognizer factory moved into cv2.face between OpenCV 2 and 3.
        self.opencv_version = int(cv2.__version__.split('.')[0])

        if self.opencv_version == 2:
            self.recognizer = cv2.createLBPHFaceRecognizer()
            # self.recognizer = cv2.createFisherFaceRecognizer()
            # self.recognizer = cv2.createEigenFaceRecognizer()
        elif self.opencv_version == 3:
            self.recognizer = cv2.face.createLBPHFaceRecognizer()
            # self.recognizer = cv2.face.createFisherFaceRecognizer()
            # self.recognizer = cv2.face.createEigenFaceRecognizer()

        # Prefer the CUDA cascade when the hardware supports it.
        if cv2gpu.is_cuda_compatible():
            sys.stderr.write('Using GPU CascadeClassifier')
            cv2gpu.init_gpu_detector(jobconf_from_env('job.settings.cascade_gpu'))
        else:
            sys.stderr.write('Using CPU CascadeClassifier')
            cv2gpu.init_cpu_detector(jobconf_from_env('job.settings.cascade_cpu'))

        images, labels = load_from_small_dataset(jobconf_from_env('job.settings.colorferet'))

        self.recognizer.train(images, numpy.array(labels))
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # Reverse map: numeric label -> human-readable class name.
        self.face_labels_num = {
            0: 'Black-or-African-American',
            1: 'Asian',
            2: 'Asian-Middle-Eastern',
            3: 'Hispanic',
            4: 'Native-American',
            5: 'Other',
            6: 'Pacific-Islander',
            7: 'White'
        }

        self.write_results = False