Example No. 1
def reduce_word_join(input=mru.reducer_stream(), output=sys.stdout):
    """
    (word) (file_name tfidf) --> (word) (file1 file2 tfidf1*tfidf2)

    for each word, if two distinct documents both contain that word,
    a line is emitted containing the product of the tfidf scores of that
    word in both documents.

    This is the first step in computing the pairwise dot product of the tf-idf
    vectors between all documents, where the corresponding elements for every
    pair of documents are multiplied together.
    """

    for in_key, key_stream in input:
        values = []
        for in_value in key_stream:
            values.append(in_value)
        for val1 in values:
            for val2 in values:
                if val1['filename'] != val2['filename']:
                    out_key = {'word': in_key['word']}
                    out_value = {
                        'file1': val1['filename'],
                        'file2': val2['filename'],
                        'product': val1['tfidf'] * val2['tfidf']
                    }
                    mru.reducer_emit(out_key, out_value, output)
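
The double loop is just a filtered cross product over the buffered values. A minimal standalone sketch of the same step, using made-up data and no mru dependency:

# one reduce group: every (filename, tfidf) pair seen for the word "cat"
values = [{'filename': 'a.txt', 'tfidf': 0.5},
          {'filename': 'b.txt', 'tfidf': 0.2}]

for val1 in values:
    for val2 in values:
        if val1['filename'] != val2['filename']:
            # both orderings are emitted, mirroring the reducer above
            print(val1['filename'], val2['filename'],
                  val1['tfidf'] * val2['tfidf'])
# a.txt b.txt 0.1
# b.txt a.txt 0.1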
Example No. 2
def reduce_corpus_size(input=mru.reducer_stream(), output=sys.stdout):
    """
    counts every value that reaches this reducer and emits the total
    as the corpus size
    """
    corpus_size = 0
    for in_key, key_stream in input:
        for in_value in key_stream:
            corpus_size += 1
    out_key = 'corpus size'
    out_value = corpus_size
    mru.reducer_emit(out_key, out_value, output)
Example No. 3
def run_reducer(self, fixture=None, args=None):
    """
    runs the reducer that is being tested on fixture with arguments args
    and returns the resulting output strings in a list.
    """
    # a mutable default ({}) would be shared between calls; build a
    # fresh dict instead
    if args is None:
        args = {}
    if fixture is None:
        fixture = self.default_fixture
    with open(fixture, 'r') as f:
        args['input'] = mru.reducer_stream(self.keys, self.values, f.readline)
        return pipe_through(self.to_test, args)
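
For context, a hypothetical call site inside such a test class might look like this (the fixture path and the precision argument are invented for illustration):

# 'input' is filled in from the fixture by run_reducer; any other
# reducer arguments simply ride along in the args dict
lines = self.run_reducer(fixture='fixtures/cosine_similarity.txt',
                         args={'precision': 4})
assert len(lines) > 0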
Example No. 4
def reduce_word_frequency(input=reducer_stream(KEYS, VALUES),
                          output=sys.stdout):
    """
    (word file_name) (1) --> (word file_name) (n)

    sums up the number of occurrences of each word in each file and emits
    the result for each word/filename combination
    """

    for key, key_stream in input:
        count = 0
        for value in key_stream:
            count += int(value['count'])
        print_result(key, count, output)
Example No. 5
def reduce_cosine_similarity(input=mru.reducer_stream(), output=stdout):
    """
    (file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))

    sums up the products of the tfidf values of words common between every
    pair of documents to produce the cosine similarity of the two documents
    """
    for in_key, key_stream in input:
        sum_for_docs = 0
        for in_value in key_stream:
            sum_for_docs += in_value['product']
        out_key = {'file1': in_key['file1'], 'file2': in_key['file2']}
        out_value = {'cos_similarity': sum_for_docs}
        mru.reducer_emit(out_key, out_value, output)
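
Summing the per-word products yields the dot product of the two documents' tf-idf vectors, which equals their cosine similarity once the vectors have been L2-normalized. A self-contained check with toy numbers:

import math

# toy tf-idf vectors for two documents over a shared vocabulary
doc1 = {'cat': 0.6, 'dog': 0.8}
doc2 = {'cat': 0.8, 'fish': 0.6}

dot = sum(doc1[w] * doc2[w] for w in doc1 if w in doc2)
norm1 = math.sqrt(sum(v * v for v in doc1.values()))
norm2 = math.sqrt(sum(v * v for v in doc2.values()))
print(round(dot / (norm1 * norm2), 2))  # 0.48; both norms are already 1.0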
Example No. 6
def reduce_word_frequency(input=mru.reducer_stream(), output=sys.stdout):
    """
    (word filename) (1) --> (word filename) (n)

    sums up the number of occurrences of each word in each file and emits
    the result for each word/filename combination
    """

    for in_key, key_stream in input:
        word_frequency = 0
        for in_value in key_stream:
            word_frequency += in_value['count']
        out_key = {'word': in_key['word'], 'filename': in_key['filename']}
        out_value = {'word_freq': word_frequency}
        mru.reducer_emit(out_key, out_value, output)
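
The mappers emit a 1 for every word occurrence, so the reduce step is a plain sum. For example:

# one reduce group: every '1' the mappers emitted for ("cat", "a.txt")
counts = [{'count': 1}, {'count': 1}, {'count': 1}]
n = sum(v['count'] for v in counts)
print('cat', 'a.txt', n)  # cat a.txt 3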
Example No. 7
def reduce_cosine_similarity(precision,
                             input=reducer_stream(KEYS, VALUES),
                             output=sys.stdout):
    """
    (file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))

    sums up the products of the tfidf values of words common between every
    pair of documents to produce the cosine similarity of the two documents
    """
    for key, key_stream in input:
        sum_for_docs = 0
        for value in key_stream:
            term = value['term']
            sum_for_docs += float(term)
        print_result(key['file1'], key['file2'], sum_for_docs, precision,
                     output)
Example No. 8
def reduce_corpus_frequency(input=reducer_stream(KEYS, VALUES),
                            output=sys.stdout):
    """
    (word) (file_name n N 1) --> (word file_name) (n N m)

    sums up the number of occurrences of each unique word throughout
    the corpus and emits this sum for each document that the word
    occurs in.
    """
    for key, key_stream in input:
        count = 0
        values = []
        for value in key_stream:
            count += int(value['count'])
            values.append(value)
        print_results(values, key['word'], count, output)
Example No. 9
def reduce_word_count(input=reducer_stream(KEYS, VALUES), output=sys.stdout):
    """
    (file_name) (word n) --> (word file_name) (n N)

    sums up the total number of words in each document and emits
    that sum for each word along with the number of occurrences of that
    word in the given document
    """

    for key, key_stream in input:
        count = 0
        values = []
        for value in key_stream:
            values.append(value)
            count += int(value['frequency'])
        print_results(values, key['filename'], count, output)
Example No. 10
def normalize_reducer(input=mru.reducer_stream(), output=sys.stdout,
                      keys_to_normalize=KEYS_TO_NORMALIZE):
    """
    for each input key, divides every field named in keys_to_normalize
    by the Euclidean norm of that field taken across the whole key
    group, so that each normalized field forms a unit vector over the
    group
    """
    for in_key, key_stream in input:
        normalize_factors = {to_factor: 0.0 for to_factor in keys_to_normalize}
        terms_to_normalize = []
        for in_value in key_stream:
            terms_to_normalize.append(in_value)
            normalize_factors = {k: v + in_value[k] ** 2
                                 for k, v in normalize_factors.items()}
        for term in terms_to_normalize:
            out_key = {'uid': in_key['uid'], 'ngram': term['ngram']}
            out_value = term
            del out_value['ngram']
            for key in keys_to_normalize:
                out_value[key] /= math.sqrt(normalize_factors[key])
            mru.reducer_emit(out_key, out_value, output)
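
The per-group arithmetic is ordinary L2 normalization: each field is divided by the square root of the sum of its squares over the group. A standalone sketch with one invented field name:

import math

# one group's terms, with a single field to normalize ('tf' is invented)
terms = [{'ngram': 'foo', 'tf': 3.0}, {'ngram': 'bar', 'tf': 4.0}]

norm = math.sqrt(sum(t['tf'] ** 2 for t in terms))  # sqrt(9 + 16) = 5.0
for t in terms:
    t['tf'] /= norm
print([t['tf'] for t in terms])  # [0.6, 0.8]; the squares now sum to 1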
Example No. 11
def reduce_word_count(input=mru.reducer_stream(), output=sys.stdout):
    """
    (file_name) (word word_freq) --> (word file_name) (n N)

    sums up the total number of words in each document and emits
    that sum for each word along with the number of occurrences of that
    word in the given document
    """

    for in_key, key_stream in input:
        doc_size = 0
        values = []
        for in_value in key_stream:
            values.append(in_value)
            doc_size += in_value['word_freq']
        for value in values:
            out_key = {'word': value['word'], 'filename': in_key['filename']}
            out_value = {'word_freq': value['word_freq'], 'doc_size': doc_size}
            mru.reducer_emit(out_key, out_value, output)
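
Buffering the values first is what makes this reducer work: doc_size is only known once the whole group has been read, and every buffered value is then re-emitted alongside it. The same pattern in isolation, with made-up data:

# one reduce group: all words from a single document
values = [{'word': 'cat', 'word_freq': 2},
          {'word': 'dog', 'word_freq': 3}]

doc_size = sum(v['word_freq'] for v in values)  # N = 5 words in the document
for v in values:
    print(v['word'], v['word_freq'], doc_size)  # cat 2 5, then dog 3 5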
Example No. 12
def reduce_word_join(precision,
                     input=reducer_stream(KEYS, VALUES),
                     output=sys.stdout):
    """
    (word) (file_name tfidf) --> (word) (file1 file2 tfidf1*tfidf2)

    for each word, if two distinct documents both contain that word,
    a line is emitted containing the product of the tfidf scores of that
    word in both documents.

    This is the first step in computing the pairwise dot product of the tf-idf
    vectors between all documents, where the corresponding elements for every
    pair of documents are multiplied together.
    """

    for key, key_stream in input:
        values = []
        for value in key_stream:
            values.append(value)
        print_results(values, key['word'], precision, output)
Example No. 13
def reduce_corpus_frequency(input=mru.reducer_stream(), output=stdout):
    """
    (word) (filename n N 1) --> (word filename) (n N m)

    sums up the number of occurrences of each unique word throughout
    the corpus and emits this sum for each document that the word
    occurs in.
    """
    for in_key, key_stream in input:
        corpus_frequency = 0
        values = []
        for in_value in key_stream:
            corpus_frequency += in_value['count']
            values.append(in_value)
        for value in values:
            out_key = {'word': in_key['word'], 'filename': value['filename']}
            out_value = {'word_freq': value['word_freq'],
                         'doc_size': value['doc_size'],
                         'corp_freq': corpus_frequency}
            mru.reducer_emit(out_key, out_value, output)
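
This reducer has the same buffer-then-fan-out shape: each mapper contributed a count of 1 per document, so the accumulated sum is m, the number of documents containing the word, and it is attached to every buffered record. With made-up data:

# one reduce group: every document that contains the word "cat"
values = [{'filename': 'a.txt', 'word_freq': 2, 'doc_size': 5, 'count': 1},
          {'filename': 'b.txt', 'word_freq': 1, 'doc_size': 9, 'count': 1}]

m = sum(v['count'] for v in values)  # the word occurs in m = 2 documents
for v in values:
    print(v['filename'], v['word_freq'], v['doc_size'], m)
# a.txt 2 5 2
# b.txt 1 9 2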