Example No. 1
def test_simple_sort():
    input_node = Input(input=get_numbers()[::-1])
    sort_node = Sort(by='a')(input_node)

    graph = Graph(input_node=input_node, output_node=sort_node)
    res = graph.run()
    assert res == get_numbers()
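The get_numbers fixture is not part of the listing. A minimal stand-in consistent with Sort(by='a') (hypothetical, for illustration only) could be:

def get_numbers():
    # Hypothetical fixture: rows keyed by 'a' in ascending order, so the
    # reversed input [::-1] above arrives fully unsorted.
    return [{'a': i} for i in range(10)]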
Example No. 2
def test_persons_sort_name():
    input_node = Input(input=get_advanced_persons())
    sort_node = Sort(by='name')(input_node)
    graph = Graph(input_node=input_node, output_node=sort_node)
    res = graph.run()
    print("***** RESULT *****")
    for value in res:
        print(value)

    print()
    print("******************")

    assert res == [
        {'name': 'Andrey', 'id': 1, 'age': 38},
        {'name': 'Grigoroy', 'id': 4, 'age': 64},
        {'name': 'Leonid', 'id': 2, 'age': 20},
        {'name': 'Maxim', 'id': 5, 'age': 28},
        {'name': 'Misha', 'id': 1, 'age': 5},
        {'name': 'Rishat', 'id': 2, 'age': 17},
        {'name': 'Roma', 'id': 1, 'age': 10},
        {'name': 'Sergey', 'id': 1, 'age': 25},
        {'name': 'Stepan', 'id': 10, 'age': 14},
    ]
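As in the previous test, the get_advanced_persons fixture is not shown. Given the assertion, it plausibly returns the same nine records in some unsorted order, e.g. (hypothetical stand-in):

def get_advanced_persons():
    # Hypothetical fixture: the asserted records, deliberately shuffled
    # so the sort by 'name' is non-trivial.
    return [
        {'name': 'Misha', 'id': 1, 'age': 5},
        {'name': 'Stepan', 'id': 10, 'age': 14},
        {'name': 'Andrey', 'id': 1, 'age': 38},
        # ... the remaining six records from the assertion above ...
    ]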
Example No. 3
import re


def split_text(record):
    """
    Split rows with a 'text' field into rows with a 'word' field
    (one for every occurrence of every word in the text).
    """
    # Keep letters only, collapsing every other run of characters to a space.
    new_text = re.sub('[^A-Za-z]+', ' ', record['text'])
    tokens = new_text.split()
    for token in tokens:
        yield {
            'doc_id': record['doc_id'],
            'word': token.lower(),
        }


def word_counter(rows):
    """ Count words. """
    yield {'word': rows[0]['word'], 'number': len(rows)}


if __name__ == "__main__":

    input_node = Input()
    mapper = Map(split_text)(input_node)
    sort = Sort("word")(mapper)
    reduce = Reduce(word_counter, "word")(sort)

    graph = Graph(input_node=input_node, output_node=reduce)
    # Use a context manager so the output file is closed deterministically.
    with open("word_count.txt", "w") as output_file:
        graph.run(input_file="data/text_corpus.txt",
                  output_file=output_file)
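To make the tokenizer concrete, this is what split_text yields for a single record (it follows directly from the code above):

record = {'doc_id': 1, 'text': 'Hello, hello world!'}
print(list(split_text(record)))
# [{'doc_id': 1, 'word': 'hello'},
#  {'doc_id': 1, 'word': 'hello'},
#  {'doc_id': 1, 'word': 'world'}]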
Example No. 4
import math


def count_pmi(rows):
    """
    Yield a document's top-10 words ranked by PMI. All `rows` share one
    doc_id; the function head is reconstructed from its use below.
    """
    words = []
    for row in rows:
        pmi = math.log(row['number_in_doc'] * row['words_in_total'] /
                       row['sum_of_docs'] / row['words_in_doc'])

        words.append((row['word'], pmi))

    words.sort(key=lambda x: x[1], reverse=True)
    yield {'doc_id': rows[0]['doc_id'], 'top_words': words[:10]}


if __name__ == "__main__":

    input_node = Input()
    docs_count_reducer = Reduce(docs_count)(input_node)
    split_mapper = Map(split_text, "tokenizer")(docs_count_reducer)
    sort_by_word_doc_id = Sort(by=['word', 'doc_id'])(split_mapper)

    words_in_doc_reducer = Reduce(count_words_in_doc,
                                  key=('word', 'doc_id'))(sort_by_word_doc_id)

    sum_of_docs = Reduce(count_sum_of_docs, key="word")(words_in_doc_reducer)

    sort_by_doc_id = Sort(by="doc_id")(sum_of_docs)

    word_in_one_doc_reducer = Reduce(count_words_in_one_doc,
                                     key="doc_id")(sort_by_doc_id)

    words_reducer = Reduce(count_words)(word_in_one_doc_reducer)
    pmi_reducer = Reduce(count_pmi, "doc_id")(words_reducer)

    # Named `pmi_graph` so it does not shadow the count_pmi reducer above.
    pmi_graph = Graph(input_node=input_node, output_node=pmi_reducer)
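Reading the field names, the score computed above is a pointwise-mutual-information style ratio: pmi(w, d) = log((number_in_doc / words_in_doc) / (sum_of_docs / words_in_total)), i.e. how much more frequent the word is inside this document than across the corpus as a whole (an interpretation inferred from the field names, not stated in the source).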
Example No. 5

if __name__ == "__main__":

    # Sub-graph: tokenize the corpus into (doc_id, word) rows.
    split_input_node = Input()
    split_mapper = Map(split_text)(split_input_node)
    split_words = Graph(input_node=split_input_node,
                        output_node=split_mapper,
                        name="split_words")

    # Sub-graph: fold the whole input into a single {'docs_count': N} row.
    fold_input = Input()
    folder = Fold(docs_count, {"docs_count": 0}, "doc_number")(fold_input)
    count_docs = Graph(input_node=fold_input, output_node=folder)

    # Sub-graph: IDF — count the distinct documents containing each word,
    # then join with the total document count.
    count_idf_input = Input(split_words)
    sort_node = Sort(["doc_id", "word"])(count_idf_input)
    reducer = Reduce(unique, ["doc_id", "word"])(sort_node)
    join = Join(count_docs, [], "outer")(reducer)
    sort_by_word = Sort("word")(join)
    count_idf_reducer = Reduce(calc_idf, ["word"])(sort_by_word)
    count_idf = Graph(input_node=count_idf_input,
                      output_node=count_idf_reducer)

    # Sub-graph: term frequency per document, joined with the per-word IDF,
    # yielding the inverted index.
    calc_index_input = Input(split_words)
    sort_doc = Sort("doc_id")(calc_index_input)
    tf_reducer = Reduce(term_frequency_reducer, "doc_id")(sort_doc)
    join_left = Join(count_idf, "word", "left")(tf_reducer)
    invert_reduce = Reduce(invert_index, "word")(join_left)
    calc_index = Graph(input_node=calc_index_input, output_node=invert_reduce)

    dependencies = {