def main(self):
    """Validate the configured paths, then hand control to ``mrs.main``.

    Reads ``self.args.file`` and ``self.args.output``.  The process is
    terminated through ``sys.exit`` with a descriptive message when the
    input path does not exist or when the output path is not an existing
    directory.  Otherwise ``self.MAPREDUCE_CLASS`` is started with the job
    arguments stored in ``self._parsed_etl_args.job_args``.
    """
    input_path = self.args.file
    if not os.path.exists(input_path):
        sys.exit('Input file %s not found' % input_path)

    output_dir = self.args.output
    # The output location has to be a directory; ``os.path.exists`` alone
    # would also accept a regular file with that name.
    if not os.path.isdir(output_dir):
        sys.exit('Output directory %s not found' % output_dir)

    mrs.main(self.MAPREDUCE_CLASS, args=self._parsed_etl_args.job_args)
yield (ds,self.callback) itr = itr + 1 while True: # iteratively map reduce (count counts) ds = job.map_data(ds,mapper=self.map_counts) # key=int, val=int ds = job.reduce_data(ds,self.reduce,outdir="%s/counts_of_counts%d"%(self.args[-1],itr),format=mrs.fileformats.TextWriter) itr = itr + 1 yield (ds, self.callback) # count the counts def map_counts(self, key, count): yield (count, 1) # count the words def map_words(self, line_num, line_text): for word in line_text.split(): word = word.strip(string.punctuation).lower() if word: yield (word, 1) # aggregate the counts def reduce(self, key, counts): yield sum(counts) if __name__ == '__main__': mrs.main(IterativeWordCount) # vim: et sw=4 sts=4
class invertedIndexing(mrs.MapReduce):
    """Build an inverted index: each word mapped to the lines it occurs on."""

    # Translation table that turns every punctuation character into a space.
    # Built once for the class instead of once per word.
    _PUNCT_TO_SPACE = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))

    # Upper bound on how many line numbers are recorded for a single word.
    _MAX_LINES = 50

    def map(self, indexNumber, line):
        """Emit ``(word, indexNumber + 1)`` for every indexable word in ``line``.

        The line is lowercased and every punctuation character is replaced
        by a space *before* it is split, so punctuation acts as a word
        separator ("hello," yields "hello") rather than disqualifying the
        whole word.  A word is emitted only when it is purely alphabetic,
        longer than three characters and not contained in the module-level
        ``stopWords`` collection (defined outside this block).

        Args:
            indexNumber: key supplied for the line; emitted as
                ``indexNumber + 1`` (presumably to make numbering 1-based).
            line: text of the line.
        """
        cleaned = line.lower().translate(self._PUNCT_TO_SPACE)
        for word in cleaned.split():
            if word.isalpha() and len(word) > 3 and word not in stopWords:
                yield (word, indexNumber + 1)

    def reduce(self, word, indexNumber):
        """Yield the list of distinct line numbers collected for ``word``.

        Line numbers are kept in arrival order, a number that occurs more
        than once is recorded only once, and consumption stops as soon as
        ``_MAX_LINES`` (50) distinct numbers have been gathered.

        Args:
            word: the word acting as key.
            indexNumber: iterable of line numbers emitted by ``map``.
        """
        indexNumbers = []
        for currentIndex in indexNumber:
            # Stop once the cap is reached, even if more values remain.
            if len(indexNumbers) >= self._MAX_LINES:
                break
            if currentIndex not in indexNumbers:
                indexNumbers.append(currentIndex)
        yield indexNumbers


if __name__ == '__main__':
    mrs.main(invertedIndexing)
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import mrs
import string


class WordCount(mrs.MapReduce):
    """Count the number of occurrences of each word in a set of documents.

    Word Count is the classic "hello world" MapReduce program.  This is a
    working example and is provided to demonstrate a simple Mrs program.  It
    is further explained in the tutorials provided in the docs directory.
    """

    def map(self, line_num, line_text):
        """Emit ``(word, 1)`` for each non-empty word found in ``line_text``.

        Words are the whitespace-separated tokens of the line with leading
        and trailing punctuation removed and letters lowercased.
        """
        for token in line_text.split():
            normalized = token.strip(string.punctuation).lower()
            if not normalized:
                # The token consisted solely of punctuation.
                continue
            yield (normalized, 1)

    def reduce(self, word, counts):
        """Emit the total of all ``counts`` gathered for ``word``."""
        total = sum(counts)
        yield total


if __name__ == '__main__':
    mrs.main(WordCount)

# vim: et sw=4 sts=4
self.items.append(node) def add_edge(self, edge): self.items.append(edge) def get_path_string(self, remove_cycles=True, lexicalize=False, max_length=5): to_output = [] i = len(self.items) - 1 while i >= 0: item = self.items[i] if lexicalize or not isinstance(item, int): to_output.append(item) if isinstance(item, int): j = i - 1 while j >= 0: if self.items[j] == self.items[i]: i = j j -= 1 i -= 1 if len(to_output) > max_length or not to_output: return None to_output.reverse() return '-'.join(to_output) if __name__ == '__main__': mrs.main(RandomWalkAnalyzer) # vim: et sw=4 sts=4
def reduce(self, key, jobs):
    """Run every command string in ``jobs`` through the system shell.

    Args:
        key: number identifying the command; it is formatted with ``%d``
            in the banner printed before each command.
        jobs: iterable of command strings, each handed to ``os.system``.

    Yields:
        The single value ``["Done"]`` once all commands have been run.
    """
    for job in jobs:
        # Execute the job string.  The single-argument call form of print
        # behaves identically on Python 2 and Python 3.
        print("\nRunning cmd %d\n\t%s" % (key, job))
        # NOTE: the exit status returned by os.system is not inspected.
        os.system(job)
    print("============= Done! ===================")
    yield ["Done"]


@classmethod
def update_parser(cls, parser):
    """Define a set of command line parameters whose values will be passed
    to your program in the opts parameter of the __init__ method.

    Args:
        parser: option parser to extend (must provide ``add_option``).

    Returns:
        The same ``parser`` object, with ``--myopt`` registered.
    """
    # TODO: add your option(s) here
    parser.add_option(
        '--myopt',
        type='int',
        dest='myopt',
        default=1,
        help='Myopt determines blah blah blah...',
    )
    return parser


if __name__ == '__main__':
    mrs.main(JobLauncher)

# vim: et sw=4 sts=4
for k, v in zip(tempK, i[1]): # print(k) matrixC[k[0]][k[1]] = v print(len(matrixA)) # for i in matrixC: # print(i) sys.stdout.flush() return 0 @classmethod def update_parser(cls, parser): parser.add_option('-P', '--num_processes', dest='num_processes', type='int', help='Number of points for each map task', default=2) parser.add_option('-N', '--row_size', dest='row_size', type='int', help='Number of map tasks to use', default=16) return parser if __name__ == '__main__': mrs.main(MatrixMultiplication)
print(self.args[0]) direc = self.args[0] fileList = glob.glob(direc + '/*.txt') print(fileList) print("----------------------------------") return job.file_data(fileList) def map(self, key, value): # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") # print(value) splited = re.split('[\W0-9]', value, flags=re.UNICODE) for i in splited: if (i != ''): i = i.lower() # print(i) yield i, 1 def reduce(self, key, values): length = len(tuple(values)) # print length yield length if __name__ == '__main__': mrs.main(WordCount) # j = mrs.job.Job # wc = WordCount # wc.input_data(j) # wc.map("AAA 2.1 2 , bc2 3z bbb",1)
""" @mrs.output_serializers(key=mrs.MapReduce.str_serializer, value=mrs.MapReduce.int_serializer) def map(self, key, value): for word in value.split(): word = word.strip(string.punctuation).lower() if word: yield (word, 1) def reduce(self, key, values): yield sum(values) combine = reduce def input_data(self, job): if len(self.args) < 2: print("Requires input(s) and an output.", file=sys.stderr) return None inputs = [] for filename in self.args[:-1]: with open(filename) as f: for line in f: inputs.append(line.strip()) return job.file_data(inputs) if __name__ == '__main__': mrs.main(WordCount2) # vim: et sw=4 sts=4
job.wait(output) output.fetchall() for key, value in output.data(): if key == True: inside = value else: outside = value pi = 4 * inside / (inside + outside) print(pi) sys.stdout.flush() return 0 @classmethod def update_parser(cls, parser): parser.add_option('-p', '--points', dest='num_points', help='Number of points for each map task', default=1000) parser.add_option('-t', '--tasks', dest='num_tasks', type='int', help='Number of map tasks to use', default=40) return parser if __name__ == '__main__': mrs.main(SamplePi)
dest='pruner', action='extend', search=['specmethod'], help='Pruning method for generating speculative children', default='OneCompleteIteration', ) parser.add_option('','--total-tokens', dest='tokens', type='int', help='Number of tokens to use (only for the TokenPruner). This is' ' the difference between the number of desired particles and the' ' number of available processors.', default=0, ) parser.add_option('','--min-tokens', dest='min_tokens', type='int', help='The minimum number of tokens that each particle can have. ' 'This cannot be greater than the total number of tokens available.', default=0, ) # There are some sticky issues involved with doing this speculatively # that I haven't worried about. If we ever feel like we should do this, # we need make some changes to the code. Until then, disabling it is # better than leaving it in and having it not work. parser.remove_option('--transitive-best') return parser if __name__ == '__main__': mrs.main(SpecExPSO, update_parser=update_parser) # vim: et sw=4 sts=4
Q, anchors = self.get_qank() V, K = Q.shape[0], len(anchors) P_w = np.diag(Q.sum(axis=1)) for word in range(V): if np.isnan(P_w[word, word]): P_w[word, word] = 1e-16 C = np.zeros((V, K)) for part in values: C += np.loads(part) A = np.dot(P_w, C) for k in range(K): A[:, k] = A[:, k] / A[:, k].sum() yield C.dumps() @classmethod def update_parser(cls, parser): parser.add_option('-t', '--tasks', dest='num_tasks', type=int, help='Number of map tasks to use', default=20) return parser if __name__ == '__main__': mrs.main(TopicRecover)
def reduce(self, key, values):
    """Fold every value received for ``key`` into one ``TopicInfo`` and emit it."""
    combined = TopicInfo()
    for partial in values:
        # Merge everything seen so far into the running aggregate.
        combined.aggregate(partial)
    # The aggregate object itself is emitted; per the original note it is
    # written out as a pickle for easy analysis later.
    yield combined


@classmethod
def update_parser(cls, parser):
    """Register the dataset, analysis and output-directory options.

    Returns the ``parser`` that was passed in.
    """
    option_table = (
        ('-d', '--dataset', 'dataset', 'Database name of the dataset to use'),
        ('-a', '--analysis', 'analysis', 'Database name of the analysis to use'),
        ('-o', '--outdir', 'outdir', 'Directory to store the output'),
    )
    for short_flag, long_flag, dest_name, help_text in option_table:
        parser.add_option(short_flag, long_flag, dest=dest_name, help=help_text)
    return parser


if __name__ == '__main__':
    mrs.main(DependencyParse)

# vim: et sw=4 sts=4
continue Q[w_i.token, w_j.token] += norm yield '', pickle.dumps(scipy.sparse.coo_matrix(Q)) @mrs.output_serializers(key=mrs.str_serializer, value=mrs.raw_serializer) def reduce(self, key, values): corpus = self.get_corpus() V = len(corpus.vocabulary) Q = np.zeros((V, V)) for Q_part in values: Q += pickle.loads(Q_part) Q /= Q.sum() yield Q.dumps() @classmethod def update_parser(cls, parser): parser.add_option('-t', '--tasks', dest='num_tasks', type=int, help='Number of map tasks to use', default=20) return parser if __name__ == '__main__': mrs.main(ConstructQ)
word = word.split() for w in word: w = w.strip(string.punctuation).lower() if w and w not in stopWords and not w[0].isdigit(): yield (w, line_num + 1) #Start counting at 1 else: word = word.strip(string.punctuation).lower() if word and word not in stopWords and not word[0].isdigit(): yield (word, line_num + 1) #Start counting at 1 def reduce(self, word, line_num): lineNumbers = [] for i in line_num: #At most 50 lines if len(lineNumbers) >= 50: break #If a word repeats on a line, write the line only once if i not in lineNumbers: lineNumbers.append(i) yield lineNumbers if __name__ == '__main__': mrs.main(WordIndex)
if int(j) < 10: j = '0' + j if int(i) < 10: i = '0' + i if line: if MatrixMultiply.CurrentFile == '1': for k in range(MatrixMultiply.Col): if k < 10: yield ('0'+str(k)+' '+j, i+' '+val) else: yield (str(k)+' '+j, i+' '+val) if MatrixMultiply.CurrentFile == '2': for k in range(MatrixMultiply.Row): if k < 10: yield (i+' '+'0'+str(k), j+' '+val) else: yield (i+' '+str(k), j+' '+val) def reduce(self, key, values): val = [] for k in range(MatrixMultiply.Col): indexTemp, valTemp = values.next().split(' ') val.append(int(valTemp)) for k in range(MatrixMultiply.Col): indexTemp, valTemp = values.next().split(' ') val[k] *= int(valTemp) yield sum(val) if __name__ == '__main__': mrs.main(MatrixMultiply)
default=0.4, ) parser.add_option('-f', '--func', metavar='FUNCTION', dest='func', action='extend', search=['amlpso.functions'], help='Function to optimize', default='sphere.Sphere', ) parser.add_option('-t', '--top', metavar='TOPOLOGY', dest='top', action='extend', search=['amlpso.topology'], help='Initialization parameters', default='Isolated', ) parser.add_option('-o', '--out', metavar='OUTPUTTER', dest='out', action='extend', search=['amlpso.output'], help='Style of output', default='Basic', ) parser.add_option('--hey-im-testing', dest='hey_im_testing', action='store_true', help='Ignore errors from uncommitted changes (for testing only!)', default=False, ) return parser if __name__ == '__main__': mrs.main(PI, update_parser) # vim: et sw=4 sts=4
# Then output it in as a pickle, for easy analysis later. yield topic_info @classmethod def update_parser(cls, parser): parser.add_option( '-d', '--dataset', dest='dataset', help='Database name of the dataset to use', ) parser.add_option( '-a', '--analysis', dest='analysis', help='Database name of the analysis to use', ) parser.add_option( '-o', '--outdir', dest='outdir', help='Directory to store the output', ) return parser if __name__ == '__main__': mrs.main(DependencyParse) # vim: et sw=4 sts=4
inside = value else: outside = value pi = 4 * inside / (inside + outside) print(pi) sys.stdout.flush() return 0 @classmethod def update_parser(cls, parser): parser.add_option('-p', '--points', dest='num_points', help='Number of points for each map task', default=1000) parser.add_option('-t', '--tasks', dest='num_tasks', type='int', help='Number of map tasks to use', default=40) return parser if __name__ == '__main__': mrs.main(SamplePi)
for job in jobs: # execute the job string print "\nRunning cmd %d\n\t%s" % (key, job) os.system(job) print "============= Done! ===================" yield ["Done"] ''' Define a set of command line parameters whose values will be passed to your program in the opts parameter of the __init__ method. ''' @classmethod def update_parser(cls, parser): # TODO: add your option(s) here parser.add_option( '--myopt', type='int', dest='myopt', default=1, help='Myopt determines blah blah blah...', ) return parser if __name__ == '__main__': mrs.main(JobLauncher) # vim: et sw=4 sts=4
def node_pair_map(self, key, value):
    """Emit an entry for each pair of nodes in the walks.

    For every position in the sequence ``value`` a ``(start_node, end_node)``
    pair is yielded for each node that appears later in the same sequence.
    ``key`` is not used.
    """
    for i, start_node in enumerate(value):
        for end_node in value[i + 1:]:
            yield (start_node, end_node)


def normalize_reduce(self, key, values):
    """Make a conditional probability distribution given the node `key`.

    Counts how often each node occurs in ``values``, discards nodes seen
    fewer than ``MIN_COUNT`` times (a module-level constant defined outside
    this block) and divides the remaining counts by their sum.

    Yields:
        One dict mapping node -> probability, or nothing at all when no
        node reaches ``MIN_COUNT``.
    """
    counts = defaultdict(int)
    for v in values:
        counts[v] += 1

    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    distribution = {}
    for node, count in counts.items():
        if count >= MIN_COUNT:
            distribution[node] = count

    if distribution:
        # A float total forces true division even where ``/`` on two ints
        # would floor (Python 2 without ``from __future__ import division``).
        total = float(sum(distribution.values()))
        for node in distribution:
            distribution[node] /= total
        yield distribution


if __name__ == '__main__':
    mrs.main(RandomWalkAnalyzer)

# vim: et sw=4 sts=4
"they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ] WORD_RE = re.compile( r"[\w']+") #looks for words such as "word", word's "word's", word STOP_WORDS = getStopWords() class MRSInvertedIndex(mrs.MapReduce): """Count the number of occurrences of each word in a set of documents. """ def map(self, line_num, line_text): for word in WORD_RE.findall(line_text): word = word.strip(string.punctuation).lower() if word.lower() not in STOP_WORDS: if not word.isdigit(): yield (word, line_num) def reduce(self, word, line_num_list): yield word, list(line_num_list) if __name__ == '__main__': mrs.main(MRSInvertedIndex)