Example #1
 def mapper_dangling(self, key, value):
     # Topic of Current Node
     topic = get_jobconf_value('topic')
     # Number of Nodes in same Topic as current Node
     n_nodes_topic = self.topicCounts.get(topic, 0)
     
     #sys.stderr.write('[M_D] {0}, {1}, {2} \n'.format(key, topic, n_nodes_topic)) 
     
     i = int(get_jobconf_value('iteration'))
     key = key.replace("\"","")
     key = key.replace("\\","")
     adj_list = ast.literal_eval(str(value))
     
     nodes = int(get_jobconf_value('nodes'))
     teleportation = float(get_jobconf_value('teleportation'))
     topic_bias = float(get_jobconf_value('topic_bias'))
     
     score = adj_list['score']
     
     '''
         Adjust for Topic Bias
         Random Surfer selects Nodes in same Topic as current node using a Topic Bias (> 0.5: Topic Sensitive)
     '''
     if topic != '0':
         random_topic_jump = teleportation * ((topic_bias/n_nodes_topic) + ((1 - topic_bias)/ (nodes - n_nodes_topic)))
         modified_score = random_topic_jump + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
     else:
         modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
     
     #modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
     #modified_score = sum_log((teleportation / nodes), (1 - teleportation)*(self.dangling_mass / nodes))
     #modified_score = sum_log(modified_score, (1 - teleportation)*score)
     adj_list['score'] = modified_score
         
     yield key, adj_list
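The mapper above only consumes jobconf properties ('topic', 'iteration', 'nodes', 'teleportation', 'topic_bias'); they have to be supplied when the job is launched. A minimal driver sketch, assuming a hypothetical module and class name and illustrative values (only mrjob's runner API and the --jobconf option are taken as given):

 # Hypothetical driver; the module, class name, and values are assumptions.
 from topic_pagerank import MRTopicPageRank

 job = MRTopicPageRank(args=[
     'graph.txt',
     '--jobconf', 'topic=5',
     '--jobconf', 'iteration=1',
     '--jobconf', 'nodes=100',
     '--jobconf', 'teleportation=0.15',
     '--jobconf', 'topic_bias=0.99',
 ])
 with job.make_runner() as runner:
     runner.run()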
Example #2
 def mapper(self, key, value):
     nodes = int(get_jobconf_value('nodes'))
     i = int(get_jobconf_value('iteration'))
     #sys.stderr.write('[M] {0}, {1} \n'.format(key, value))
     key = key.replace("\"","")
     key = key.replace("\\","")
     adj_list = ast.literal_eval(value)
   
     score = 0
     l = 0
     
     if 'score' in adj_list.keys():
         # Previous Mass/Page Rank
         score = adj_list['score']
         l = len(adj_list) - 1
     else: # First iteration ('score' not yet part of the adjacency list!)
         # Start with uniform probability distribution
         score = 1.0 / nodes
         l = len(adj_list)
         adj_list['score'] = score
         
     if l == 0: # Only 'score' & no out links [Dangling!]
         sys.stderr.write('[{0}][M] "DANGLING MASS" | {1} | {2}\n'.format(i, key, score))
         # Emit using a special key; accumulate in the reducer; distribute in the next MRJob
         yield 'DANGLING', ('SCORE', score)
    
     # Emit the Graph Structure
     yield key, ('GRAPH', adj_list)
                 
     # Emit the new Mass/Page Rank
     for n in adj_list:
         if n != 'score':
             yield n, ('SCORE', score/l)
Example #3
    def mapper_MulMat(self, key, value):
        if 'GModMat' == key:

            tmpRow = row = value[0]
            tmpCol = col = value[1]
            val = value[2]

            GroupID_row, tmp = tmpRow.split("_")
            GroupID_col, tmp = tmpCol.split("_")

            assert GroupID_row == GroupID_col, "GroupID_row and GroupID_col must be same"

            if row == col and 1 == self.options.iteration:
                maxColumnSum = float(get_jobconf_value("MAXSUM_" + GroupID_row))
                val = val + maxColumnSum

            #yield col, ('A', row, val)   # For A matrix
            #yield row, ('B', col, val)   # For B matrix

            #yield col + "|" + 'A' + "|" + row, val   # For A matrix
            #yield row + "|" + 'B' + "|" + col, val   # For B matrix

            matSize = int(get_jobconf_value("matSize_" + GroupID_row))

            # For A
            for i in range(matSize):
                midKey = get_jobconf_value("matIdx_" + GroupID_row + "_" + str(i))      # matIdx_0_0=0_20   ...

                yield row + "|" + midKey + "|" + col, val   # row, i, col
                yield midKey + "|" + col + "|" + row, val   # i, col, row
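The midKey lookups above expect jobconf entries of the form matSize_<GroupID> and matIdx_<GroupID>_<i> (the inline comment hints at values such as matIdx_0_0=0_20). A hedged sketch of how a driver might assemble that dictionary before launching the step; every name and value here is an assumption, not taken from the original project:

 # Hypothetical driver-side jobconf assembly for one group of matrices.
 group_id = '0'
 mat_indices = ['0_20', '1_20', '2_20']   # illustrative matIdx values ("<GroupID>_<n>")

 jobconf = {'matSize_' + group_id: str(len(mat_indices))}
 for i, idx in enumerate(mat_indices):
     jobconf['matIdx_{0}_{1}'.format(group_id, i)] = idx
 # jobconf would then be passed to the job, e.g. as --jobconf key=value pairs.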
Example #4
    def reducer(self, key, values):     
        i = int(get_jobconf_value('iteration'))
        teleportation = float(get_jobconf_value('teleportation'))
        nodes = int(get_jobconf_value('nodes'))
        
        adj_list = None
        total_score = 0

        for value_type, value in values:
            if value_type == 'GRAPH':
                adj_list = value
            else:
                assert value_type == 'SCORE'
                total_score += value
                #total_score = sum_log(total_score, value)
                
        # Special Key
        if key == 'DANGLING':
            # Write accumulated Dangling Score in a file
            with open('/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt', 'w') as f:
                f.write('DANGLING\t{0}\n'.format(total_score))
        else:
            #total_score = (teleportation / nodes) + ((1 - teleportation) * total_score)
            #total_score = sum_log((teleportation / nodes), ((1 - teleportation) * total_score))
            if adj_list:
                adj_list['score'] = total_score
            else:
                adj_list = {'score': total_score}
    
            #sys.stderr.write('[R2] {0} | {1} | {2}\n\n'.format(key, total_score, adj_list))
            yield key, adj_list
Example #5
    def mapper_CalcPij(self, key, value):
        """
        Bij = Aij - ( Ki * Kj ) / M
            = ( ( Aij * M ) - ( Ki * Kj ) ) 
              -----------------------------
                            M
        """
        
        #self.startDate = get_jobconf_value("maxNodeID")
        if key.isdigit():
            """
            Input:
                "1"     "2"
                "2"     "1"
                "3"     "4"
                "3"     "5"
                "4"     "3"
                "4"     "5"
                "5"     "3"
                "5"     "4"
            """
            row = key
            col = value

            Aij = 1.0
            yield row + "_" + col, ('A', Aij)

        elif 'k' == key:
            """
                "k"     ["7", 4]
            """

            row = value[0]
            minNodeID = int(get_jobconf_value("minNodeID"))
            maxNodeID = int(get_jobconf_value("maxNodeID"))
            k_row = float(get_jobconf_value("k" + row))
            M     = float(get_jobconf_value("M"))

            for col in range(minNodeID, maxNodeID + 1):
                if 'local' == RUN_TYPE:
                    #sys.stderr.write("col %d \t |" % col)
                    pass

                k_col = float(get_jobconf_value("k" + str(col)))
                Pij = (k_row * k_col) / M
                yield row + "_" + str(col), ('P', Pij)

        elif 'm' == key:
            return
        elif 'max' == key:
            return
        elif 'min' == key:
            return
        elif 'x' == key:
            return
        else:
            assert False, "In genModularityMat_mapper(), wrong key type. key : %s" % key
Example #6
 def mapper(self, _, line):
     v = line.split(',')
     n = (len(v)-2)/2 # number of non-zero (index, value) pairs in this record
     i = int(get_jobconf_value("row.num.A")) # we need to know how many rows of A
     j = int(get_jobconf_value("col.num.B")) # we need to know how many columns of B
     
     if v[0]=='0':
         for p in range(n):
             for q in range(j):
                 yield (int(v[1]),q), (int(v[p*2+2]),float(v[p*2+3]))
         
     elif v[0]=='1':
         for p in range(n):
             for q in range(i):
                 yield (q,int(v[p*2+2])), (int(v[1]),float(v[p*2+3]))
Example #7
 def mapper(self, _, line):
     # step 0: strip off unexpected characters
     line = line.split('\t')[1]
     
     # step 1: fetch the exodus file from Hadoop cluster
     file = os.path.basename(line)
     if os.path.isfile(os.path.join('./', file)):
         call(['rm', os.path.join('./', file)])
     check_call(['hadoop', 'fs', '-copyToLocal', line, os.path.join('./', file)])
     outdir = os.path.basename(line)
     ind = outdir.rfind('.')
     outdir = outdir[0:ind]
     if os.path.isdir(os.path.join('./', outdir)):
         call(['rm', '-r', os.path.join('./', outdir)])
     call(['mkdir', os.path.join('./', outdir)])
     
     # step 2: do our local processing
     result = convert(os.path.join('./', file), self.timesteps, os.path.join('./', outdir), self.variables)
     
     # step 3: write back to the Hadoop cluster
     user = get_jobconf_value('mapreduce.job.user.name')
    
     for fname in os.listdir(os.path.join('./', outdir)):
         if call(['hadoop', 'fs', '-test', '-e', os.path.join(self.outdir,outdir,fname)]) == 0:
             call(['hadoop', 'fs', '-rm', os.path.join(self.outdir,outdir,fname)])
         call(['hadoop', 'fs', '-copyFromLocal', os.path.join('./',outdir,fname),os.path.join(self.outdir,outdir,fname)])
         call(['hadoop', 'fs', '-chown', '-R', user, os.path.join(self.outdir)])
     call(['rm', os.path.join('./', file)])
     call(['rm', '-r', os.path.join('./', outdir)])
     
     # step 4: yield output key/value
     if result == True:
         yield (line, 0)
     else:
         yield (line, 1)
Example #8
 def mapper_init(self):
     self.frontier_node = get_jobconf_value('frontier_node')
     if not self.frontier_node:
         # Save a list of visited nodes
         self.visited = [s.strip() for s in 
                           open('visited.txt').readlines()]
         open('visited.txt', 'w').close()
Example #9
 def parsemat(self):
     """ Return 1 if this is the A matrix, otherwise return 2"""
     fn = get_jobconf_value('map.input.file')
     if self.options.Amatname in fn: 
         return 1
     else:
         return 2
Example #10
    def mapper_CalcPij(self, key, value):
        """
        Bij = Aij - ( Ki * Kj ) / M
            = ( ( Aij * M ) - ( Ki * Kj ) ) 
              -----------------------------
                            M
        """
        
        if key.isdigit():
            """
            Input:
                "1"     "2"
                "2"     "1"
                "3"     "4"
                "3"     "5"
                "4"     "3"
                "4"     "5"
                "5"     "3"
                "5"     "4"
            """
            row = key
            col = value

            Aij = 1.0
            yield row + "_" + col + "_" + "A",  (Aij)

        elif 'k' == key:
            """
                "k"     ["7", 4]
            """

            row = value[0]
            k_row = float(get_jobconf_value("k" + row))

            for col in range(self.options.minNodeID, self.options.maxNodeID + 1):
                if 'local' == RUN_TYPE:
                    #sys.stderr.write("col %d \t |" % col)
                    pass

                k_col = float(get_jobconf_value("k" + str(col)))
                Pij = (k_row * k_col) / self.options.M
                yield row + "_" + str(col) + "_" + "P", (Pij)

        elif key in ['m', 'max', 'min', 'x']:
            return
        else:
            assert False, "In genModularityMat_mapper(), wrong key type. key : %s" % key
Example #11
 def mapper_dangling(self, key, value):
     #sys.stderr.write('[M_D] {0}, {1} \n'.format(key, value))
     i = int(get_jobconf_value('iteration'))
     key = key.replace("\"","")
     key = key.replace("\\","")
     adj_list = ast.literal_eval(str(value))
     
     if self.dangling_mass > 0:
         nodes = int(get_jobconf_value('nodes'))
         teleportation = float(get_jobconf_value('teleportation'))
         score = adj_list['score']
         modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
         #modified_score = sum_log((teleportation / nodes), (1 - teleportation)*(self.dangling_mass / nodes))
         #modified_score = sum_log(modified_score, (1 - teleportation)*score)
         adj_list['score'] = modified_score
         
     yield key, adj_list
Example #12
 def mapper(self, key, value):
     nodes = int(get_jobconf_value('nodes'))
     dangling_mass = float(get_jobconf_value('dangling_mass'))
     teleportation = float(get_jobconf_value('teleportation'))
     #sys.stderr.write('[M] {0}, {1} \n'.format(key, value))
     key = key.replace("\"","")
     key = key.replace("\\","")
     neighbors = ast.literal_eval(value)
     
     score = float(neighbors['score'])
     
     modified_score = teleportation / nodes + (1 - teleportation) * ( (dangling_mass / nodes) + score)
     
     sys.stderr.write('{0}, {1}, {2}\n'.format(score, modified_score, dangling_mass))
     
     neighbors['score'] = modified_score
     
     yield key, neighbors
Example #13
	def mapper(self, _, l):
		t = l.strip('\n').split('\t')
		text = t[1]
		i = int(t[0])
		n = int(get_jobconf_value("total"))
		for j in range(1, i):
			yield(("%d,%d" % (i, j)), text)
		for j in range(i + 1, n + 1):
			yield(("%d,%d" % (j, i)), text)
Example #14
 def mapper_dangling_init(self):
     i = int(get_jobconf_value('iteration'))
     aws_access_key_id = get_jobconf_value('aws_access_key_id')
     aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
     
     self.dangling_mass = 0
     
     # Read Dangling Mass from S3 Bucket
     try:
         conn = boto.connect_s3()
         bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
         k = Key(bucket)
         k.key = 'hw93/dangling_mass/{0}'.format(i) # Same as iteration
         self.dangling_mass = float(k.get_contents_as_string())
     except boto.exception.S3ResponseError as err:
         sys.stderr.write(str(err))
         sys.exit(1)
     
     sys.stderr.write('[{0}][M_D] DANGLING MASS: {1}\n'.format(i, self.dangling_mass))
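Note that the aws_access_key_id and aws_secret_access_key values fetched from jobconf above are never handed to boto, so connect_s3() falls back to whatever ambient credentials exist on the task node. If the intent was to use the jobconf-supplied keys, they would need to be passed explicitly; a one-line sketch under that assumption:

 conn = boto.connect_s3(aws_access_key_id, aws_secret_access_key)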
Example #15
    def reducer(self, key, values):
        i = int(get_jobconf_value('iteration'))
        teleportation = float(get_jobconf_value('teleportation'))
        nodes = int(get_jobconf_value('nodes'))
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        
        adj_list = None
        total_score = 0

        for value_type, value in values:
            if value_type == 'GRAPH':
                adj_list = value
            else:
                assert value_type == 'SCORE'
                total_score += value
                #total_score = sum_log(total_score, value)
                
        # Write Special Key to S3
        if key == 'DANGLING':
            # Write accumulated Dangling Score in a S3 Key
            try:
                conn = boto.connect_s3()
                bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
                k = Key(bucket)
                k.key = 'hw93/dangling_mass/{0}'.format(i) # Same as iteration
                k.set_contents_from_string(str(total_score))
            except boto.exception.S3ResponseError as err:
                sys.stderr.write(str(err))
                sys.exit(1)
        else:
            #total_score = (teleportation / nodes) + ((1 - teleportation) * total_score)
            #total_score = sum_log((teleportation / nodes), ((1 - teleportation) * total_score))
            if adj_list:
                adj_list['score'] = total_score
            else:
                adj_list = {'score': total_score}
    
            #sys.stderr.write('[R2] {0} | {1} | {2}\n\n'.format(key, total_score, adj_list))
            yield key, adj_list
Example #16
 def mapper_dangling_init(self):
     i = int(get_jobconf_value('iteration'))
     self.dangling_mass = 0
     f_dangling = '/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt'
     try:
         with open(f_dangling, 'r') as f:
             l = f.readlines()
             if l:
                 self.dangling_mass = float(l[0].split('\t')[1])
         open(f_dangling, 'w').close()
     except Exception as e:
         pass
     sys.stderr.write('[{0}][M_D] DANGLING MASS: {1}\n'.format(i, self.dangling_mass))
Example #17
 def reducer(self, idx, inputdata): 
     centroids = []
     k = int(get_jobconf_value('k'))
     num = [0] * k
     for i in range(k):
         centroids.append([0] * 1000)
     for d, n in inputdata:
         num[idx] = num[idx] + n
         for i in xrange(1000):
             centroids[idx][i] = centroids[idx][i] + d[i]
     for i in xrange(1000):
         centroids[idx][i] = centroids[idx][i]/num[idx]
    
     with open('Centroids.txt', 'a') as f:
         f.writelines(",".join(str(i) for i in centroids[idx]) + '\n')
     yield idx,(centroids[idx], num)
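The reducer above assumes a 'k' jobconf property is always present. Besides passing --jobconf k=... on every run, a job can pin such properties on the class itself via mrjob's JOBCONF attribute; a minimal sketch with a hypothetical class name and value:

 from mrjob.job import MRJob

 class MRKMeans(MRJob):            # hypothetical job class
     # Equivalent to passing --jobconf k=4 on every invocation
     JOBCONF = {'k': 4}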
Example #18
    def mapper(self, _, line):
        dist_type = get_jobconf_value('dist_type')
        tokens = line.strip().split('\t')
        
        key = tokens[0].replace("\"","")
        dict_pairs = ast.literal_eval(tokens[1])
        
        for n_key, n_dict_pairs in self.stripes.iteritems():
            # TODO distance calc for only (a,b) but not (b,a) --> Redundant
            if key > n_key:
                continue
            
            self.counter += 1   
            if self.counter % 1000 == 0:
                self.set_status('# of Distances Calculated: {0}'.format(self.counter))
                
            distance = None
            
            if dist_type == 'euclid':

                # Calculate Euclidean Distance
                squared_distance = 0
                for k in n_dict_pairs.keys():
                    squared_distance += (dict_pairs.get(k, 0) - n_dict_pairs.get(k, 0)) ** 2
                    
                distance = math.sqrt(squared_distance)
                
            if dist_type == 'cosine':
                
                # Calculate cosine similarity (emitted below as the 'distance' value)
                # Accumulate norms and the dot product over the full key range
                norm_x = 0
                norm_y = 0
                dot_x_y = 0
                for k in self.stripes.keys(): # Iterate through entire key range once
                    norm_x += dict_pairs.get(k,0) * dict_pairs.get(k,0)
                    norm_y += n_dict_pairs.get(k,0) * n_dict_pairs.get(k,0)
                    dot_x_y += dict_pairs.get(k,0) * n_dict_pairs.get(k,0)
                    
                distance = float(dot_x_y) / (math.sqrt(norm_x) * math.sqrt(norm_y))
          
            self.increment_counter('distance', 'num_{0}_distances'.format(dist_type), amount=1)
            yield (distance), (key, n_key)
Example #19
 def mapper_dangling_init(self):
     i = int(get_jobconf_value('iteration'))
     
     # Page/Topic Mapping & Topic Counts for each Topic.
     self.topics = {}
     self.topicCounts = {}
     with open('randNet_topics.txt') as f:
         for l in f:
             t = l.split('\t')
             self.topics[t[0].strip()] = t[1].strip()
             
     for k,v in self.topics.iteritems():
         self.topicCounts[v] = self.topicCounts.get(v, 0) + 1
     
     self.dangling_mass = 0
     f_dangling = '/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt'
     try:
         with open(f_dangling, 'r') as f:
             l = f.readlines()
             if l:
                 self.dangling_mass = float(l[0].split('\t')[1])
         open(f_dangling, 'w').close()
     except Exception as e:
         pass
Example #20
 def mapper(self, _, line):
     for word in WORD_RE.findall(line):
         yield (get_jobconf_value("mapreduce.map.input.file"), 1)
Example #21
File: j02-grep.py  Project: jjo/mapreduce
 def mapper(self, _, line):
     for word in line.split():
         yield word, get_jobconf_value('map.input.file')
Example #22
 def mapper_init(self):
   self.cui_idx = int(get_jobconf_value("cui_idx"))
Example #23
 def mapper_init(self):
     self.start_node = get_jobconf_value('start_node')
     self.stop_node = get_jobconf_value('stop_node')
     sys.stderr.write('### Start/Frontier Node: {0}\n'.format(self.start_node))
     sys.stderr.write('### Stop: {0}\n'.format(self.stop_node))
Example #24
 def test_get_jobconf_value_2(self):
     os.environ['mapreduce_job_user_name'] = 'Edsger W. Dijkstra'
     self.assertEqual(get_jobconf_value('user.name'),
                      'Edsger W. Dijkstra')
     self.assertEqual(get_jobconf_value('mapreduce.job.user.name'),
                      'Edsger W. Dijkstra')
Example #25
 def test_get_jobconf_value_1(self):
     os.environ['user_name'] = 'Edsger W. Dijkstra'
     assert_equal(get_jobconf_value('user.name'),
                  'Edsger W. Dijkstra')
     assert_equal(get_jobconf_value('mapreduce.job.user.name'),
                  'Edsger W. Dijkstra')
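Both tests exercise the mechanism that makes the mapper examples above work inside Hadoop Streaming: each jobconf property is exposed to the task as an environment variable with dots replaced by underscores, and get_jobconf_value also translates between the old and new property names ('user.name' vs. 'mapreduce.job.user.name'). A minimal sketch of that lookup, assuming the same Python 2 / mrjob.compat import these examples rely on:

 import os
 from mrjob.compat import get_jobconf_value

 # Simulate what Hadoop Streaming would set inside a task
 os.environ['mapreduce_job_user_name'] = 'Edsger W. Dijkstra'

 # Either spelling resolves to the same environment variable
 print get_jobconf_value('user.name')                # 'Edsger W. Dijkstra'
 print get_jobconf_value('mapreduce.job.user.name')  # 'Edsger W. Dijkstra'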
Example #26
 def mapper(self, _, line):
     for jobconf in JOBCONF_LIST:
         yield (jobconf, get_jobconf_value(jobconf))
Example #27
    def reducer(self, key, values):
        """
        input: -1, (index, valarray)
        output: global variance exodus file
        """
        
        val_order = {}
        
        for i, value in enumerate(values):
            val_order[value[0]]=value[1]
            
            
        val = [ ]  
        for k,value in sorted(val_order.iteritems()):
            val.extend(value)
        val2 = np.array(val)
        
        # grab template exodus file from HDFS
        
        tmpstr = self.indir[7:]
        index = tmpstr.find('/')
        prefix = 'hdfs://'+tmpstr[0:index]
        
        cmd = 'hadoop fs -ls '+ self.indir
        p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        content = p.stdout.read()
        files = content.split('\n')
        
        flag = True

        for file in files:
            file = file.split(' ')
            fname = file[len(file)-1]
            if fname.endswith('.e'):
                fname = prefix + fname
                if flag:
                    check_call(['hadoop', 'fs', '-copyToLocal', fname, 'template.e'])
                    flag = False
                    break
        
        template = 'template.e'
        
        # create new interpolation exodus file
        
        if call(['test', '-e', template]) != 0:
            print >>sys.stderr, "The template file does not exist!"
            yield key,1
        else: 
        
            print >>sys.stderr,  "Reading templatefile %s"%(template)
            templatefile = ep.ExoFile(template,'r')
            
            outfile = self.outputname+'.e'
            print >>sys.stderr, "Writing outputfile %s"%(os.path.join(outfile))
            newfile = ep.ExoFile(os.path.join(outfile),'w')  
            
            time_steps = np.array([0.0])
            templatefile.change_nodal_vars2(newfile, time_steps, [self.variable], [val2], ['d'])

            newfile.src.sync()
            newfile.close()

            print >>sys.stderr, "Finished writing data, copying to Hadoop"
            
            user = get_jobconf_value('mapreduce.job.user.name')
            call(['hadoop', 'fs', '-copyFromLocal', outfile, os.path.join(self.outdir,outfile)])
            call(['hadoop', 'fs', '-chown', '-R', user, os.path.join(self.outdir)])
            
            print >>sys.stderr, "Copied to Hadoop, removing ..."
            
            call(['rm', template])
            call(['rm', outfile])
            yield key,0
            
            print >>sys.stderr, "Done"