Example #1
def test_simple_mapper():
    input_node = Input(input=get_numbers())
    mapper_node = Map(simple_mapper)(input_node)

    assert mapper_node.input == input_node
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()
    assert res == get_numbers()
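
The helpers get_numbers and simple_mapper are not shown on this page. A minimal sketch consistent with the assertions (hypothetical definitions; the row shape {'a': i} is inferred from Example #2's expected answer, and a mapper is assumed to be a generator over single records):

def get_numbers():
    # five rows {'a': 1} .. {'a': 5}, inferred from Example #2's answer
    return [{'a': i} for i in range(1, 6)]


def simple_mapper(record):
    # identity mapper: yields each record unchanged,
    # so the graph's output equals its input
    yield record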
Example #2
def test_square_mapper():
    input_node = Input(input=get_numbers())
    mapper_node = Map(square_mapper)(input_node)
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()

    answer = [{'a': i**2} for i in range(1, 6)]
    assert res == answer
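
square_mapper is likewise elided; a sketch matching the expected answer [{'a': i**2} for i in range(1, 6)], under the same assumed row shape:

def square_mapper(record):
    # square the 'a' field of each row
    yield {'a': record['a'] ** 2}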
Example #3
def test_person_mapper():
    input_node = Input(input=get_persons())
    mapper_node = Map(person_mapper)(input_node)
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()
    assert res == [
        {
            "name": "Andrey",
            "id": 1
        },
        {
            "name": "Leonid",
            "id": 2
        },
        {
            "name": "Sergey",
            "id": 1
        },
    ]
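
get_persons and person_mapper are also not shown, and their exact definitions are unknown. One sketch consistent with the asserted output is a projection mapper that keeps only the name and id fields; the extra 'city' field below is purely hypothetical, added to show that projection is the point of the test:

def get_persons():
    # hypothetical input rows; any extra fields are dropped by person_mapper
    return [
        {'name': 'Andrey', 'id': 1, 'city': 'Moscow'},
        {'name': 'Leonid', 'id': 2, 'city': 'Omsk'},
        {'name': 'Sergey', 'id': 1, 'city': 'Moscow'},
    ]


def person_mapper(record):
    # keep only the 'name' and 'id' fields of each row
    yield {'name': record['name'], 'id': record['id']}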
Example #4
def test_empty_mapper():
    input_node = Input(input=[])
    mapper_node = Map(simple_mapper)(input_node)
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()
    assert res == []
Example #5
import re


def split_text(record):
    """
    Split rows with a 'text' field into rows with a 'word' field
    (one for every occurrence of every word in the text).
    """
    new_text = re.sub('[^A-Za-z]+', ' ', record['text'])
    tokens = new_text.split()
    for token in tokens:
        yield {
            'doc_id': record['doc_id'],
            'word': token.lower(),
        }


def word_counter(rows):
    """ Count words. """
    yield {'word': rows[0]['word'], 'number': len(rows)}


if __name__ == "__main__":

    input_node = Input()
    mapper = Map(split_text)(input_node)
    sort = Sort("word")(mapper)
    reduce = Reduce(word_counter, "word")(sort)

    graph = Graph(input_node=input_node, output_node=reduce)
    graph.run(input_file="data/text_corpus.txt",
              output_file=open("word_count.txt", "w"))
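
word_counter receives the group of rows that share one 'word' value, which is why the Sort node precedes the Reduce node. Assuming Reduce calls the reducer once per consecutive group of the sorted stream (an assumption about this library's semantics, not confirmed by the page), its behavior can be reproduced with itertools.groupby:

from itertools import groupby

rows = [
    {'doc_id': 1, 'word': 'cat'},
    {'doc_id': 2, 'word': 'cat'},
    {'doc_id': 1, 'word': 'dog'},
]  # already sorted by 'word', as Sort("word") would guarantee

for _, group in groupby(rows, key=lambda r: r['word']):
    print(next(word_counter(list(group))))
# {'word': 'cat', 'number': 2}
# {'word': 'dog', 'number': 1}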
Example #6
import math


def count_pmi(rows):
    """Yield the document id and its ten highest-PMI words."""
    words = []
    for row in rows:
        pmi = math.log(row['number_in_doc'] * row['words_in_total'] /
                       row['sum_of_docs'] / row['words_in_doc'])

        words.append((row['word'], pmi))

    words.sort(key=lambda x: x[1], reverse=True)
    yield {'doc_id': rows[0]['doc_id'], 'top_words': words[:10]}


if __name__ == "__main__":

    input_node = Input()
    docs_count_reducer = Reduce(docs_count)(input_node)
    split_mapper = Map(split_text, "tokenizer")(docs_count_reducer)
    sort_by_word_doc_id = Sort(by=['word', 'doc_id'])(split_mapper)

    words_in_doc_reducer = Reduce(count_words_in_doc,
                                  key=('word', 'doc_id'))(sort_by_word_doc_id)

    sum_of_docs = Reduce(count_sum_of_docs, key="word")(words_in_doc_reducer)

    sort_by_doc_id = Sort(by="doc_id")(sum_of_docs)

    word_in_one_doc_reducer = Reduce(count_words_in_one_doc,
                                     key="doc_id")(sort_by_doc_id)

    words_reducer = Reduce(count_words)(word_in_one_doc_reducer)
    pmi_reducer = Reduce(count_pmi, "doc_id")(words_reducer)
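
The PMI computed in count_pmi compares a word's relative frequency inside one document with its relative frequency over the whole corpus: log((number_in_doc / words_in_doc) / (sum_of_docs / words_in_total)). A quick sanity check of the formula with made-up counts (all values below are hypothetical):

import math

# hypothetical counts for one (word, doc) pair
number_in_doc = 4       # occurrences of the word in this document
words_in_doc = 200      # total words in this document
sum_of_docs = 50        # occurrences of the word across all documents
words_in_total = 10000  # total words across the corpus

# same expression as in count_pmi above
pmi = math.log(number_in_doc * words_in_total / sum_of_docs / words_in_doc)
print(round(pmi, 3))  # log((4/200) / (50/10000)) = log(4.0) -> 1.386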