def test_simple_sort():
    input_node = Input(input=get_numbers()[::-1])
    sort_node = Sort(by='a')(input_node)
    graph = Graph(input_node=input_node, output_node=sort_node)
    res = graph.run()
    assert res == get_numbers()
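# A minimal sketch of the fixture this test assumes (hypothetical; the real
# get_numbers() is defined elsewhere in the suite). All the assertion needs is
# a sequence that is already sorted ascending by the key 'a', e.g.:
#
#     def get_numbers():
#         return [{'a': i} for i in range(10)]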
def test_persons_sort_name():
    input_node = Input(input=get_advanced_persons())
    sort_node = Sort(by='name')(input_node)
    graph = Graph(input_node=input_node, output_node=sort_node)
    res = graph.run()
    assert res == [
        {'name': 'Andrey', 'id': 1, 'age': 38},
        {'name': 'Grigoroy', 'id': 4, 'age': 64},
        {'name': 'Leonid', 'id': 2, 'age': 20},
        {'name': 'Maxim', 'id': 5, 'age': 28},
        {'name': 'Misha', 'id': 1, 'age': 5},
        {'name': 'Rishat', 'id': 2, 'age': 17},
        {'name': 'Roma', 'id': 1, 'age': 10},
        {'name': 'Sergey', 'id': 1, 'age': 25},
        {'name': 'Stepan', 'id': 10, 'age': 14},
    ]
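# Likewise hypothetical: get_advanced_persons() only has to return the nine
# person rows above in some unsorted order, e.g.:
#
#     def get_advanced_persons():
#         return [
#             {'name': 'Stepan', 'id': 10, 'age': 14},
#             {'name': 'Andrey', 'id': 1, 'age': 38},
#             # ... the remaining rows from the expected output above
#         ]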
import re


def split_text(record):
    """
    Split rows with a 'text' field into a set of rows with a 'word' field
    (one row for every occurrence of every word in the text).
    """
    new_text = re.sub('[^A-Za-z]+', ' ', record['text'])
    tokens = new_text.split()
    for token in tokens:
        yield {
            'doc_id': record['doc_id'],
            'word': token.lower(),
        }


def word_counter(rows):
    """Count word occurrences: one output row per group of identical words."""
    yield {'word': rows[0]['word'], 'number': len(rows)}


if __name__ == "__main__":
    input_node = Input()
    mapper = Map(split_text)(input_node)
    sort = Sort("word")(mapper)
    reduce = Reduce(word_counter, "word")(sort)
    graph = Graph(input_node=input_node, output_node=reduce)
    with open("word_count.txt", "w") as output_file:
        graph.run(input_file="data/text_corpus.txt", output_file=output_file)
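# Sanity check of split_text in isolation, shown doctest-style (punctuation
# is stripped and tokens are lower-cased before counting):
#
#     >>> list(split_text({'doc_id': 1, 'text': "Hello, hello world!"}))
#     [{'doc_id': 1, 'word': 'hello'},
#      {'doc_id': 1, 'word': 'hello'},
#      {'doc_id': 1, 'word': 'world'}]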
import math


def count_pmi(rows):
    """Yield a document's top ten words ranked by pointwise mutual information."""
    words = []
    for row in rows:
        pmi = math.log(row['number_in_doc'] * row['words_in_total']
                       / row['sum_of_docs'] / row['words_in_doc'])
        words.append((row['word'], pmi))
    words.sort(key=lambda x: x[1], reverse=True)
    yield {'doc_id': rows[0]['doc_id'], 'top_words': words[:10]}


if __name__ == "__main__":
    input_node = Input()
    docs_count_reducer = Reduce(docs_count)(input_node)
    split_mapper = Map(split_text, "tokenizer")(docs_count_reducer)
    sort_by_word_doc_id = Sort(by=['word', 'doc_id'])(split_mapper)
    words_in_doc_reducer = Reduce(count_words_in_doc,
                                  key=('word', 'doc_id'))(sort_by_word_doc_id)
    sum_of_docs = Reduce(count_sum_of_docs, key='word')(words_in_doc_reducer)
    sort_by_doc_id = Sort(by="doc_id")(sum_of_docs)
    word_in_one_doc_reducer = Reduce(count_words_in_one_doc,
                                     key="doc_id")(sort_by_doc_id)
    words_reducer = Reduce(count_words)(word_in_one_doc_reducer)
    pmi_reducer = Reduce(count_pmi, "doc_id")(words_reducer)
    # Named so the graph does not shadow the count_pmi reducer above.
    count_pmi_graph = Graph(input_node=input_node, output_node=pmi_reducer)
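# Assuming the field names mean what they suggest, the expression under the
# log is the word's in-document rate divided by its corpus-wide rate:
#
#     pmi = log( (number_in_doc / words_in_doc)       # rate in this document
#              / (sum_of_docs / words_in_total) )     # rate in the corpus
#
# e.g. number_in_doc=4, words_in_doc=100, sum_of_docs=40, words_in_total=10000
# gives log((4/100) / (40/10000)) = log(10), about 2.303, matching the
# equivalent product form used in count_pmi above.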
if __name__ == "__main__": split_input_node = Input() split_mapper = Map(split_text)(split_input_node) split_words = Graph(input_node=split_input_node, output_node=split_mapper, name="split_words") fold_input = Input() folder = Fold(docs_count, {"docs_count": 0}, "doc_number")(fold_input) count_docs = Graph(input_node=fold_input, output_node=folder) count_idf_input = Input(split_words) sort_node = Sort(["doc_id", "word"])(count_idf_input) reducer = Reduce(unique, ["doc_id", "word"])(sort_node) join = Join(count_docs, [], "outer")(reducer) sort_by_word = Sort("word")(join) count_idf_reducer = Reduce(calc_idf, ["word"])(sort_by_word) count_idf = Graph(input_node=count_idf_input, output_node=count_idf_reducer) calc_index_input = Input(split_words) sort_doc = Sort("doc_id")(calc_index_input) tf_reducer = Reduce(term_frequency_reducer, "doc_id")(sort_doc) join_left = Join(count_idf, "word", "left")(tf_reducer) invert_reduce = Reduce(invert_index, "word")(join_left) calc_index = Graph(input_node=calc_index_input, output_node=invert_reduce) dependencies = {