예제 #1
0
    def test_combine(self):
        """Check that one ``Dampr.run`` call can evaluate two pipelines.

        Two filters are derived from ``self.items`` (even and odd values) and
        passed together to ``Dampr.run``; each returned result is then
        materialized and compared with the expected list.
        """
        # Presumably self.items holds the integers 10..19 (set up outside this
        # method) -- the expected lists below rely on that.
        even = self.items.filter(lambda x: x % 2 == 0)
        odd = self.items.filter(lambda x: x % 2 == 1)

        # Results are unpacked in the same order the pipelines were passed in.
        even_ve, odd_ve = Dampr.run(even, odd)

        # assertEqual is used instead of the assertEquals alias, which is
        # deprecated and no longer exists on TestCase in Python 3.12+.
        self.assertEqual([10, 12, 14, 16, 18], list(even_ve))
        self.assertEqual([11, 13, 15, 17, 19], list(odd_ve))
예제 #2
0
def main(fname):
    """Compute word statistics for ``fname`` with Dampr and print a report.

    Four pipelines are built on top of one shared stream of words: the total
    number of words, words ranked by frequency, a histogram keyed by word
    length, and the average word length.  All four are evaluated by a single
    ``Dampr.run`` call named "word-stats" and the results are printed to
    stdout.

    Args:
        fname: Input location handed to ``Dampr.text``.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    # Shared root: every whitespace-separated token of every line.
    lines = Dampr.text(fname, 1024**2)
    tokens = lines.flat_map(lambda text: text.split())

    # (word, count) pairs ordered by descending count.
    tallies = tokens.count(lambda token: token)
    ranked = tallies.sort_by(lambda pair: -pair[1])

    # Fold all counts under the constant key 1 to obtain the grand total.
    grand_total = ranked.fold_by(key=lambda _pair: 1,
                                 value=lambda pair: pair[1],
                                 binop=lambda a, b: a + b)

    # Sum the counts per word length, then order by length.
    by_length = ranked.fold_by(lambda pair: len(pair[0]),
                               value=lambda pair: pair[1],
                               binop=lambda a, b: a + b)
    histogram = by_length.sort_by(lambda bucket: bucket[0])

    # Average length = sum(length * count) / total count.  The weighted sum
    # is grouped under the same constant key 1 so it can be joined with
    # grand_total.
    weighted = histogram.map(lambda bucket: bucket[0] * bucket[1])
    weighted_sum = weighted.a_group_by(lambda _value: 1).sum()
    mean_length = weighted_sum.join(grand_total).reduce(
        lambda sums, totals: next(sums)[1] / float(next(totals)[1]))

    total_out, ranked_out, histogram_out, mean_out = Dampr.run(
        grand_total,
        ranked,
        histogram,
        mean_length,
        name="word-stats")

    banner = "*" * 10
    print()
    print(banner)
    print("Word Stats")
    print(banner)
    print("Total Words Found: ", total_out.read(1)[0][1])

    # Each section: heading, underline, result to read, number of rows shown.
    sections = (
        ("\nTop 10 words", "\n************", ranked_out, 10),
        ("\nCharacter histogram", "\n*******************", histogram_out, 20),
    )
    for heading, underline, result, limit in sections:
        print(heading)
        print(underline)
        # Results are read lazily per section, after its heading is printed.
        for left, right in result.read(limit):
            print(left, right)

    print("\nAverage Word Length: ", mean_out.read(1)[0][1])