def test_order_computation():
    """The last graph of a three-graph chain reports the two earlier graphs, first one first."""
    source_a = Input(input=[])
    graph_a = Graph(input_node=source_a, output_node=source_a, name="first")

    source_b = Input(input=graph_a)
    graph_b = Graph(input_node=source_b, output_node=source_b, name="second")

    source_c = Input(input=graph_b)
    graph_c = Graph(input_node=source_c, output_node=source_c, name="third")

    expected_order = [graph_a, graph_b]
    assert graph_c.order == expected_order
def test_input_node_to_graph_in():
    """An Input built from a graph exposes it as ``input_graph`` and keeps ``input`` as None."""
    source = Input(input=get_persons(), name="main_input")
    upstream = Graph(input_node=source, output_node=source, name="main_graph")

    chained = Input(input=upstream)
    assert chained.input_graph == upstream
    assert chained.input is None

    # A graph whose input node reads from another graph lists that graph in its order.
    downstream = Graph(input_node=chained, output_node=chained)
    assert downstream.order == [upstream]
def test_simple_sort():
    """Sorting the reversed numbers fixture by 'a' gives back the original fixture."""
    reversed_numbers = get_numbers()[::-1]
    source = Input(input=reversed_numbers)
    sorted_node = Sort(by='a')(source)

    result = Graph(input_node=source, output_node=sorted_node).run()

    assert result == get_numbers()
def test_outer_without_key():
    """Run an outer join with an empty key list and print the rows (no assertions are made)."""
    persons_source = Input(input=get_advanced_persons())
    persons_graph = Graph(input_node=persons_source, output_node=persons_source)

    cities_source = Input(input=get_advanced_cities())
    joined = Join(persons_graph, [], "outer")(cities_source)

    rows = Graph(input_node=cities_source, output_node=joined).run()

    # Dump the joined rows for manual inspection.
    print("***** RESULT *****")
    for row in rows:
        print(row)
    print()
    print("******************")
def test_input_node_init():
    """A freshly built Input keeps its name and data and has no output yet."""
    node = Input(input=get_persons(), name="main_input")

    assert node.name == "main_input"
    assert node.input == get_persons()
    assert node.output is None
def test_square_mapper():
    """Mapping ``square_mapper`` over the numbers fixture yields the squares of 1..5 under 'a'."""
    source = Input(input=get_numbers())
    squared = Map(square_mapper)(source)

    result = Graph(input_node=source, output_node=squared).run()

    expected = [{'a': 1}, {'a': 4}, {'a': 9}, {'a': 16}, {'a': 25}]
    assert result == expected
def test_simple_mapper():
    """Mapping ``simple_mapper`` leaves the numbers fixture unchanged and wires the node to its input."""
    source = Input(input=get_numbers())
    mapped = Map(simple_mapper)(source)
    assert mapped.input == source

    result = Graph(input_node=source, output_node=mapped).run()

    assert result == get_numbers()
def test_simple_fold():
    """Folding the numbers fixture with ``simple_folder`` produces one row holding the collected list."""
    source = Input(input=get_numbers())
    initial_state = {"a": []}
    folded = Fold(simple_folder, initial_state)(source)

    result = Graph(input_node=source, output_node=folded).run()

    expected = [{'a': [1, -2, 3, -4, 5]}]
    assert result == expected
def test_input_node_to_graph():
    """A single-node graph has that node only, no dependencies, and returns its input when run."""
    node = Input(input=get_persons(), name="main_input")
    single_node_graph = Graph(input_node=node, output_node=node, name="main_graph")

    assert single_node_graph.name == "main_graph"
    assert single_node_graph.nodes == [node]
    assert single_node_graph._dependencies == []
    assert single_node_graph.order == []

    output = single_node_graph.run()
    assert output == get_persons()
def test_city_fold():
    """Folding the cities fixture with ``city_folder`` yields one row whose name joins all city names.

    The expected row keeps the initial ``id`` of -1 and carries every city
    name concatenated in fixture order.
    """
    input_node = Input(input=get_cities())
    state = {"id": -1, "name": ""}
    folder_node = Fold(city_folder, state)(input_node)
    graph = Graph(input_node=input_node, output_node=folder_node)

    res = graph.run()

    # Dump the folded rows for manual inspection.
    print("***** RESULT *****")
    for value in res:
        print(value)
    print()
    print("******************")

    # Build the expected name in one pass instead of repeated string +=.
    answer_name = "".join(city["name"] for city in get_cities())
    assert res == [{"id": -1, "name": answer_name}]
def test_person_mapper():
    """Mapping ``person_mapper`` over the persons fixture yields the expected name/id rows."""
    source = Input(input=get_persons())
    mapped = Map(person_mapper)(source)

    result = Graph(input_node=source, output_node=mapped).run()

    expected = [
        {"id": 1, "name": "Andrey"},
        {"id": 2, "name": "Leonid"},
        {"id": 1, "name": "Sergey"},
    ]
    assert result == expected
def test_persons_sort_name():
    """Sorting the advanced persons fixture by 'name' returns the rows in ascending name order."""
    source = Input(input=get_advanced_persons())
    by_name = Sort(by='name')(source)

    rows = Graph(input_node=source, output_node=by_name).run()

    # Dump the sorted rows for manual inspection.
    print("***** RESULT *****")
    for row in rows:
        print(row)
    print()
    print("******************")

    expected = [
        {'id': 1, 'name': 'Andrey', 'age': 38},
        {'id': 4, 'name': 'Grigoroy', 'age': 64},
        {'id': 2, 'name': 'Leonid', 'age': 20},
        {'id': 5, 'name': 'Maxim', 'age': 28},
        {'id': 1, 'name': 'Misha', 'age': 5},
        {'id': 2, 'name': 'Rishat', 'age': 17},
        {'id': 1, 'name': 'Roma', 'age': 10},
        {'id': 1, 'name': 'Sergey', 'age': 25},
        {'id': 10, 'name': 'Stepan', 'age': 14},
    ]
    assert rows == expected
def test_inner_simple_join():
    """Inner-joining persons and cities on 'id' keeps only rows whose id exists on both sides."""
    persons_source = Input(input=get_advanced_persons())
    persons_graph = Graph(input_node=persons_source, output_node=persons_source)

    cities_source = Input(input=get_advanced_cities())
    joined = Join(persons_graph, 'id', "inner")(cities_source)

    rows = Graph(input_node=cities_source, output_node=joined).run()

    # Dump the joined rows for manual inspection.
    print("***** RESULT *****")
    for row in rows:
        print(row)
    print()
    print("******************")

    expected = [
        {'id': 1, 'name': 'Andrey', 'age': 38, 'city': 'Mocsow'},
        {'id': 1, 'name': 'Sergey', 'age': 25, 'city': 'Mocsow'},
        {'id': 1, 'name': 'Misha', 'age': 5, 'city': 'Mocsow'},
        {'id': 1, 'name': 'Roma', 'age': 10, 'city': 'Mocsow'},
        {'id': 2, 'name': 'Leonid', 'age': 20, 'city': 'SPb'},
        {'id': 2, 'name': 'Rishat', 'age': 17, 'city': 'SPb'},
        {'id': 10, 'name': 'Stepan', 'age': 14, 'city': 'Kaluga'},
    ]
    assert rows == expected
def test_outer_join():
    """Outer-joining persons and cities on 'id' keeps unmatched rows, filling missing fields with None."""
    persons_source = Input(input=get_advanced_persons())
    persons_graph = Graph(input_node=persons_source, output_node=persons_source)

    cities_source = Input(input=get_advanced_cities())
    joined = Join(persons_graph, 'id', "outer")(cities_source)

    rows = Graph(input_node=cities_source, output_node=joined).run()

    # Dump the joined rows for manual inspection.
    print("***** RESULT *****")
    for row in rows:
        print(row)
    print()
    print("******************")

    expected = [
        {'id': 1, 'name': 'Andrey', 'age': 38, 'city': 'Mocsow'},
        {'id': 1, 'name': 'Sergey', 'age': 25, 'city': 'Mocsow'},
        {'id': 1, 'name': 'Misha', 'age': 5, 'city': 'Mocsow'},
        {'id': 1, 'name': 'Roma', 'age': 10, 'city': 'Mocsow'},
        {'id': 2, 'name': 'Leonid', 'age': 20, 'city': 'SPb'},
        {'id': 2, 'name': 'Rishat', 'age': 17, 'city': 'SPb'},
        {'id': 3, 'name': None, 'age': None, 'city': 'Kazan'},
        {'id': 4, 'name': 'Grigoroy', 'age': 64, 'city': None},
        {'id': 5, 'name': 'Maxim', 'age': 28, 'city': None},
        {'id': 7, 'name': None, 'age': None, 'city': 'Novgorod'},
        {'id': 10, 'name': 'Stepan', 'age': 14, 'city': 'Kaluga'},
        {'id': 12, 'name': None, 'age': None, 'city': 'Tula'},
    ]
    assert rows == expected
def test_empty_mapper():
    """Mapping over an empty input produces an empty result."""
    source = Input(input=[])
    mapped = Map(simple_mapper)(source)

    result = Graph(input_node=source, output_node=mapped).run()

    assert result == []
row["tf_idf"] = row["tf"] * \ math.log(row['docs_count'] / row['count_idf']) records = sorted(records, key=itemgetter("tf_idf"), reverse=True) yield { "word": row["word"], "index": [(records[i]["doc_id"], records[i]["tf_idf"]) for i in range(0, min(3, len(records)))] } if __name__ == "__main__": split_input_node = Input() split_mapper = Map(split_text)(split_input_node) split_words = Graph(input_node=split_input_node, output_node=split_mapper, name="split_words") fold_input = Input() folder = Fold(docs_count, {"docs_count": 0}, "doc_number")(fold_input) count_docs = Graph(input_node=fold_input, output_node=folder) count_idf_input = Input(split_words) sort_node = Sort(["doc_id", "word"])(count_idf_input) reducer = Reduce(unique, ["doc_id", "word"])(sort_node) join = Join(count_docs, [], "outer")(reducer) sort_by_word = Sort("word")(join) count_idf_reducer = Reduce(calc_idf, ["word"])(sort_by_word)
def split_text(record):
    """
    Split a row with a 'text' field into one row per word occurrence.

    Every maximal run of ASCII letters in record['text'] is treated as a
    word; any other character acts as a separator. Each yielded row carries
    the source 'doc_id' and the lower-cased word under the 'word' key.
    """
    # Collapse every non-letter run into a single space so split() sees words only.
    letters_only = re.sub(r'[^A-Za-z]+', ' ', record['text'])
    for token in letters_only.split():
        yield {
            'doc_id': record['doc_id'],
            'word': token.lower(),
        }


def word_counter(rows):
    """
    Count words.

    Yields a single row holding the 'word' of the first row and the number
    of rows received. `rows` must be a non-empty, indexable sequence.
    """
    yield {'word': rows[0]['word'], 'number': len(rows)}


if __name__ == "__main__":
    input_node = Input()
    mapper = Map(split_text)(input_node)
    sort_node = Sort("word")(mapper)
    reduce_node = Reduce(word_counter, "word")(sort_node)
    graph = Graph(input_node=input_node, output_node=reduce_node)
    # Open the output through a context manager so the file is flushed and
    # closed even if the run raises.
    with open("word_count.txt", "w") as output_file:
        graph.run(input_file="data/text_corpus.txt", output_file=output_file)
def test_input_node_empty_run():
    """Running an Input built from an empty list yields nothing."""
    node = Input(input=[])
    produced = list(node.run())
    assert produced == []
def test_input_node_run():
    """Running an Input yields exactly the rows it was built with."""
    node = Input(input=get_persons())
    produced = list(node.run())
    assert produced == get_persons()
def test_left_common_cols_join():
    """Left join on 'id' where both sides have a 'name' column: the result carries left_name/right_name."""
    persons_source = Input(input=get_advanced_persons())
    persons_graph = Graph(input_node=persons_source, output_node=persons_source)

    cities_source = Input(input=get_cities())
    joined = Join(persons_graph, 'id', "left")(cities_source)

    rows = Graph(input_node=cities_source, output_node=joined).run()

    # Dump the joined rows for manual inspection.
    print("***** RESULT *****")
    for row in rows:
        print(row)
    print()
    print("******************")

    expected = [
        {'id': 1, 'left_name': 'Andrey', 'right_name': 'Mocsow', 'age': 38},
        {'id': 1, 'left_name': 'Sergey', 'right_name': 'Mocsow', 'age': 25},
        {'id': 1, 'left_name': 'Misha', 'right_name': 'Mocsow', 'age': 5},
        {'id': 1, 'left_name': 'Roma', 'right_name': 'Mocsow', 'age': 10},
        {'id': 2, 'left_name': 'Leonid', 'right_name': 'SPb', 'age': 20},
        {'id': 2, 'left_name': 'Rishat', 'right_name': 'SPb', 'age': 17},
        {'id': 4, 'left_name': 'Grigoroy', 'right_name': None, 'age': 64},
        {'id': 5, 'left_name': 'Maxim', 'right_name': None, 'age': 28},
        {'id': 10, 'left_name': 'Stepan', 'right_name': 'Kaluga', 'age': 14},
    ]
    assert rows == expected