Example #1
0
from MapReduce import MapReduce

mr = MapReduce()


def mapper(record):
    """Emit one (word, document-name) pair per word of the document.

    record[0] is the document name, record[1] its full text.
    """
    doc_name = record[0]
    body = record[1]
    for word in body.split():
        mr.emit_intermediate(word, doc_name)


def reducer(key, values):
    """Emit the word together with its de-duplicated list of documents."""
    unique_docs = set(values)
    mr.emit((key, list(unique_docs)))


if __name__ == '__main__':
    import sys
    import json

    # `with` guarantees the input file is closed even if execute() raises;
    # the original opened it and never closed the handle.
    with open(sys.argv[1]) as inputdata:
        mr.execute(inputdata, mapper, reducer)

    # Persist the inverted index built up in mr.result by the reducers.
    with open('inverted_index.json', 'w') as outfile:
        json.dump(mr.result, outfile)
Example #2
0
    order_id = record[1]

    #Mapper will be called on all the orders first(As the data file contains the records of orders before line_items)
    #As the output of mapper is fed to reducer,all the output to be displayed need to be present in the output of the Mapper.
    #So emit_intermdiate all the records with order_id as the key
    #So for every record the id,record is emmited
    if record_type == "order":
        mr.emit_intermediate(order_id,record)
    elif record_type == "line_item":
        mr.emit_intermediate(order_id,record)

# Reducer is called once per key of the map phase's output (the global
# dictionary mr.intermediate); here each key is an order_id.
def reducer(key, list_of_values):
    """Join an order record with each of its line_item records.

    list_of_values[0] is the order record; the remaining entries are the
    line_item records sharing the same order_id.  One (order, line_item)
    pair is emitted per line_item.
    """
    order = list_of_values[0]
    # Idiomatic slice iteration replaces the original manual while-loop
    # with an index counter (and its stray semicolons).
    for line_item in list_of_values[1:]:
        mr.emit((order, line_item))

#####################################################
if __name__ == '__main__':
    # NOTE(review): `sys` and `mr` must be provided by the part of the file
    # above this chunk; they are not imported here — confirm against the
    # full script.
    # `with` closes the input file even if execute() raises; the original
    # leaked the handle.
    with open(sys.argv[1]) as inputdata:
        # execute() runs mapper for every record (filling the intermediate
        # dictionary), then reducer for every key (filling mr.result), and
        # finally prints the result array.
        mr.execute(inputdata, mapper, reducer)
Example #3
0
from MapReduce import MapReduce
map_reduce_obj = MapReduce()


def mapper(record):
    """Count, per candidate itemset, the baskets of this chunk containing it.

    record is a list of baskets (each basket an iterable of items).  The
    candidate itemsets are read — one JSON document per line — from the
    file named by sys.argv[2].  For each candidate this emits
    (candidate_tuple, (support_count, baskets_in_chunk)).
    """
    number_of_baskets = len(record)
    # `with` guarantees the candidates file is closed; the original opened
    # a new handle on every mapper call and never closed any of them.
    with open(sys.argv[2]) as candidate_file:
        for line in candidate_file:
            candidate = json.loads(line.strip())
            candidate_set = set(candidate)
            # A basket supports the candidate iff every candidate item
            # appears in it (subset test, same as the original
            # empty-set-difference check).
            count = sum(
                1 for basket in record if candidate_set <= set(basket))
            map_reduce_obj.emit_intermediate(
                tuple(candidate), (count, number_of_baskets))


def reduce(key, list_of_value):
    """Sum per-chunk supports and emit the itemset if it is frequent.

    Each value is a (chunk_support, chunk_basket_count) pair; the itemset
    is emitted when its total support reaches 30% of all baskets.
    """
    total_count = sum(pair[0] for pair in list_of_value)
    total_baskets = sum(pair[1] for pair in list_of_value)
    threshold = ceil(total_baskets * 0.3)
    if total_count >= threshold:
        map_reduce_obj.emit([list(key), total_count])


if __name__ == '__main__':
    # `with` closes the basket file even if execute() raises; the original
    # opened it without ever closing the handle.
    with open(sys.argv[1]) as input_data:
        map_reduce_obj.execute(input_data, mapper, reduce)
Example #4
0
import sys
from MapReduce import MapReduce

mr = MapReduce()

def mapper(record):
    """Emit each sequence with its last 10 characters removed as the key.

    record is a (document-id, sequence) pair; the id is discarded and the
    trimmed sequence is emitted with a count of 1.
    """
    _doc_id, sequence = record
    trimmed = sequence[:-10]
    mr.emit_intermediate(trimmed, 1)

# Part 3
def reducer(key, list_of_values):
    """Emit the unique trimmed sequence; the occurrence counts are ignored."""
    mr.emit(key)

# Part 4
# Run the job on the file named on the command line; `with` closes the
# handle automatically once execute() returns.
with open(sys.argv[1]) as f:
    mr.execute(f, mapper, reducer)
from MapperMatrixVector import MapperMatrixVector
from ReducerMatrixVector import ReducerMatrixVector
from MapReduce import MapReduce
from FileHelper import FileHelper

# Create instances for mapper and reducer.
# Note that the vector (dataFiles/b) is stored inside the mapper instance.
the_reducer = ReducerMatrixVector()
the_mapper = MapperMatrixVector('dataFiles/b')

# The file where the matrix is stored.
matrix_files = ['dataFiles/A']

# Run MapReduce.  The trailing arguments are passed by keyword for clarity,
# matching the counting-words script's call
# MapReduce(mapper, reducer, files, silent=..., nThreads=...).
the_map_reducer = MapReduce(the_mapper, the_reducer, matrix_files,
                            silent=0, nThreads=1)
result_dict = the_map_reducer.execute()

# Write output (writeDictionnary is the project helper's own spelling).
out_file_directory = 'outputs/'
out_file_name = 'matrixVectorResults.txt'
FileHelper.writeDictionnary(out_file_directory + out_file_name, result_dict)
Example #6
0
from MapperCountingWords import MapperCountingWords
from ReducerCountingWords import ReducerCountingWords
from MapReduce import MapReduce
from FileHelper import FileHelper

# Mapper and reducer instances for the word-count job.
word_mapper = MapperCountingWords()
word_reducer = ReducerCountingWords()

# Source text file(s); more than one may be listed here.
raw_text_files = ['dataFiles/text']

# Pre-process the input into one word per line.
parsed_file = 'dataFiles/textParsed'
FileHelper.transformTextIntoListOfWords(raw_text_files, parsed_file)

# Run the MapReduce job over the parsed file.
word_count_job = MapReduce(word_mapper, word_reducer, [parsed_file],
                           silent=-1, nThreads=5)
word_counts = word_count_job.execute()

# Write the results (writeDictionnary / coutingWordsResults keep the
# project's original spellings).
output_directory = 'outputs/'
output_file = 'coutingWordsResults.txt'
FileHelper.writeDictionnary(output_directory + output_file, word_counts)
Example #7
0
from MapReduce import MapReduce
import itertools
import sys

map_reduce_obj = MapReduce()


def mapper(record):
    """Emit the pair in both directions so each endpoint keys its partners."""
    left = record[0]
    right = record[1]
    map_reduce_obj.emit_intermediate(left, right)
    map_reduce_obj.emit_intermediate(right, left)


def reducer(key, list_of_values):
    """For every unordered pair of this key's values, emit the sorted pair
    followed by the key itself as a three-element list."""
    for pair in itertools.combinations(list_of_values, 2):
        triple = sorted(pair)
        triple.append(key)
        map_reduce_obj.emit(triple)


if __name__ == '__main__':
    # `with` guarantees the input file is closed even if execute() raises;
    # the original left the handle open.
    with open(sys.argv[1]) as input_data:
        map_reduce_obj.execute(input_data, mapper, reducer)