def __init__(self, input_node: WillumpGraphNode, input_name: str, output_name: str,
             input_vocab_dict: Mapping[str, int], input_idf_vector,
             aux_data: List[Tuple[int, WeldType]], ngram_range: Tuple[int, int],
             analyzer: str = "char", cost: float = 0) -> None:
    """
    Set up the node and register its auxiliary inputs.

    Two auxiliary input names are reserved, AUX_DATA_<k> for the vocabulary
    and AUX_DATA_<k+1> for the IDF vector, where k is the length of aux_data
    when the constructor is entered.  Every entry returned by
    _process_aux_data is then appended to aux_data, so the caller's list is
    modified in place.
    """
    # Index of the first auxiliary slot this node claims.
    first_aux_index = len(aux_data)

    self._input_array_string_name = input_name
    self._output_name = output_name
    self._analyzer = analyzer
    self._cost = cost
    self._min_gram, self._max_gram = ngram_range

    # Vocabulary entries ordered by the index the mapping assigns to them.
    vocabulary_list = sorted(input_vocab_dict.keys(), key=input_vocab_dict.__getitem__)
    self._vocab_size = len(vocabulary_list)
    self.output_width = self._vocab_size

    self._vocab_dict_name = "AUX_DATA_{0}".format(first_aux_index)
    self._idf_vector_name = "AUX_DATA_{0}".format(first_aux_index + 1)

    # The node reads the caller's input plus the two auxiliary inputs.
    self._input_nodes = [
        input_node,
        WillumpInputNode(self._vocab_dict_name),
        WillumpInputNode(self._idf_vector_name),
    ]
    self._input_names = [input_name, self._vocab_dict_name, self._idf_vector_name]

    aux_data.extend(self._process_aux_data(vocabulary_list, input_idf_vector))
def test_basic_cv(self):
    """
    Run an ArrayCountVectorizerNode (n-gram range 2 to 5) over three strings
    and compare the first three returned arrays with their expected values.
    """
    print("\ntest_basic_cv")
    input_str = ["catcatcat", "dogdogdog", "elephantcat"]
    aux_data = []

    # Each line of the vocabulary file is mapped to its zero-based line number.
    with open("tests/test_resources/simple_vocabulary.txt") as vocab_file:
        vocab_lines = vocab_file.read().splitlines()
    simple_vocab_dict = dict(zip(vocab_lines, range(len(vocab_lines))))

    source_node: WillumpInputNode = WillumpInputNode("input_str")
    vectorizer_node: ArrayCountVectorizerNode = ArrayCountVectorizerNode(
        source_node,
        "input_str",
        output_name='lowered_output_words',
        input_vocab_dict=simple_vocab_dict,
        aux_data=aux_data,
        ngram_range=(2, 5),
    )
    sink_node: WillumpOutputNode = WillumpOutputNode(vectorizer_node, ["lowered_output_words"])
    graph: WillumpGraph = WillumpGraph(sink_node)

    type_map = {
        "input_str": WeldVec(WeldStr()),
        "lowered_output_words": WeldCSR(WeldLong()),
    }
    weld_output = wexec.execute_from_basics(
        graph, type_map, (input_str,), ["input_str"], ["lowered_output_words"], aux_data)

    # Expected contents of weld_output[0], weld_output[1] and weld_output[2].
    expected_arrays = ([0, 1, 2], [3, 4, 3], [3, 3, 1])
    for position, expected in enumerate(expected_arrays):
        numpy.testing.assert_equal(
            weld_output[position], numpy.array(expected, dtype=numpy.int64))
def test_basic_hash_join(self):
    """
    Join the toy data CSV with the toy metadata CSV on "join_column" through a
    WillumpHashJoinNode and check output columns 1, 3 and 4.
    """
    print("\ntest_basic_hash_join")
    left_table = pd.read_csv("tests/test_resources/toy_data_csv.csv")
    right_table = pd.read_csv("tests/test_resources/toy_metadata_csv.csv")
    aux_data = []

    source_node: WillumpInputNode = WillumpInputNode("input_table")
    left_type = WeldPandas(
        [WeldVec(WeldLong()), WeldVec(WeldLong()), WeldVec(WeldLong())],
        ["join_column", "data1", "data2"])
    join_node: WillumpHashJoinNode = WillumpHashJoinNode(
        input_node=source_node,
        input_name="input_table",
        output_name="output",
        join_col_names=["join_column"],
        right_dataframe=right_table,
        aux_data=aux_data,
        left_input_type=left_type,
    )
    graph: WillumpGraph = WillumpGraph(WillumpOutputNode(join_node, ["output"]))

    # The output carries the three left columns followed by two metadata columns.
    type_map = {
        "input_table": WeldPandas(
            [WeldVec(WeldLong()), WeldVec(WeldLong()), WeldVec(WeldLong())],
            ["join_column", "data1", "data2"]),
        "output": WeldPandas(
            [WeldVec(WeldLong()), WeldVec(WeldLong()), WeldVec(WeldLong()),
             WeldVec(WeldDouble()), WeldVec(WeldDouble())],
            ["join_column", "data1", "data2", "metadata1", "metadata2"]),
    }

    left_columns = (
        left_table["join_column"].values,
        left_table["data1"].values,
        left_table["data2"].values,
    )
    weld_output = wexec.execute_from_basics(
        graph=graph,
        type_map=type_map,
        inputs=(left_columns,),
        input_names=["input_table"],
        output_names=["output"],
        aux_data=aux_data,
    )

    # (output index, expected values, dtype) for every column that is checked.
    expected_columns = (
        (1, [4, 5, 2, 5, 3], numpy.int64),
        (3, [1.2, 2.2, 2.2, 3.2, 1.2], numpy.float64),
        (4, [1.3, 2.3, 2.3, 3.3, 1.3], numpy.float64),
    )
    for column_index, values, dtype in expected_columns:
        numpy.testing.assert_equal(
            weld_output[column_index], numpy.array(values, dtype=dtype))
def __init__(self, input_node: WillumpGraphNode, input_name: str, left_input_type: WeldType,
             output_name: str, join_col_names: List[str], right_dataframe,
             aux_data: List[Tuple[int, WeldType]]) -> None:
    """
    Set up the join node and register the right-hand dataframe as auxiliary data.

    The right dataframe is referenced as AUX_DATA_<k>, where k is the length
    of aux_data on entry.  Every entry returned by _process_aux_data is
    appended to aux_data, so the caller's list is modified in place.
    left_input_type must be a WeldPandas instance.  The node's output type
    concatenates the field types and column names of the left and right
    dataframe types, left first.
    """
    self._output_name = output_name
    self.left_input_name = input_name
    self.join_col_names = join_col_names
    self._right_dataframe = right_dataframe
    self._right_dataframe_name = "AUX_DATA_{0}".format(len(aux_data))

    right_input_node = WillumpInputNode(self._right_dataframe_name)
    self._input_nodes = [input_node, right_input_node]
    self._input_names = [input_name, self._right_dataframe_name]

    assert (isinstance(left_input_type, WeldPandas))
    self.left_df_type = left_input_type

    # self.right_df_type is not assigned in this method; presumably
    # _process_aux_data sets it -- verify against that helper.
    aux_data.extend(self._process_aux_data(right_dataframe))

    left_type, right_type = self.left_df_type, self.right_df_type
    self._output_type = WeldPandas(
        field_types=left_type.field_types + right_type.field_types,
        column_names=left_type.column_names + right_type.column_names)
def test_mixed_string_lower(self):
    """
    Lower-case the "target_col" column of a one-column frame with a
    StringLowerNode and check that the result holds the lowered strings in
    "new_col" followed by the untouched originals in "target_col".
    """
    print("\ntest_mixed_string_lower")
    input_df = pd.DataFrame({"target_col": ["aA,.,.a", "B,,b", "c34234C"]})
    input_str = list(input_df["target_col"].values)

    source_node: WillumpInputNode = WillumpInputNode("input_str")
    # NOTE(review): input_type uses WeldStr() while type_map below uses
    # WeldVec(WeldStr()) for the same input -- confirm this is intentional.
    lower_node: StringLowerNode = StringLowerNode(
        input_node=source_node,
        input_name="input_str",
        input_type=WeldPandas([WeldStr()], ["target_col"]),
        input_col="target_col",
        output_name="lowered_output_words",
        output_col="new_col",
        output_type=WeldPandas([WeldVec(WeldStr()), WeldVec(WeldStr())],
                               ["new_col", "target_col"]),
    )
    graph: WillumpGraph = WillumpGraph(
        WillumpOutputNode(lower_node, ["lowered_output_words"]))

    type_map = {
        "input_str": WeldPandas([WeldVec(WeldStr())], ["target_col"]),
        "lowered_output_words": WeldPandas([WeldVec(WeldStr()), WeldVec(WeldStr())],
                                           ["new_col", "target_col"]),
    }
    weld_output = wexec.execute_from_basics(
        graph, type_map, ((input_str,),), ["input_str"], ["lowered_output_words"], [])

    expected_output = (["aa,.,.a", "b,,b", "c34234c"],
                       ["aA,.,.a", "B,,b", "c34234C"])
    self.assertEqual(weld_output, expected_output)
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
    """
    Process a function definition.

    An input node is registered for every positional argument, then each
    top-level statement of the body is handled in order: assignments go
    through analyze_Assign, return statements through analyze_Return, and
    anything else through _create_py_node.  Every name produced by an
    assignment or a Python node is mapped to that node in _node_dict.
    """
    for argument in node.args.args:
        name = self.get_store_name(argument.arg, node.lineno)
        declared_type = self._type_map[name]
        self._node_dict[name] = WillumpInputNode(name, arg_type=declared_type)
        self.arg_list.append(name)

    for statement in node.body:
        if isinstance(statement, ast.Return):
            self.analyze_Return(statement)
            continue
        if isinstance(statement, ast.Assign):
            produced_names, producer = self.analyze_Assign(statement)
        else:
            # Any other statement is handled by _create_py_node.
            produced_names, producer = self._create_py_node(statement)
        for produced_name in produced_names:
            self._node_dict[produced_name] = producer
def process_weld_block(weld_block_input_set, weld_block_aux_input_set, weld_block_output_set,
                       weld_block_node_list, future_nodes, typing_map, num_workers,
                       eval_cascades, batch) \
        -> List[typing.Union[ast.AST, Tuple[List[str], List[str], List[List[str]]]]]:
    """
    Helper for graph_to_weld: turn one block of Weld nodes into Willump statements.

    Steps, in order:
      1. Remove from weld_block_output_set (in place) every output that no
         later node reads before it is produced again.
      2. Run the CSR, pandas and pandas-series marshalling passes to obtain
         pre- and post-processing nodes.
      3. When num_workers + 1 > 1, split the block with the multithreading
         pass; otherwise treat the whole block as a single entry.
      4. Prepend input nodes and append a multi-output node to every node
         list (the lists are modified in place), then concatenate each
         list's Weld code into one string.

    Returns the pre-processing Python statements, then one
    (weld strings, input names, output name lists) tuple per entry, then the
    post-processing Python statements.  eval_cascades is not used here.
    """

    def is_read_later(name) -> bool:
        # Walk the later nodes in order: a read keeps the output alive, while a
        # node that produces the same name first makes this value unneeded.
        for later_node in future_nodes:
            if any(name == in_name for in_name in later_node.get_in_names()):
                return True
            if any(name == out_name for out_name in later_node.get_output_names()):
                return False
        return False

    def wrap_nodes(node_list, input_names, output_names) -> None:
        # The original code inserted every input node at index 0, so the final
        # prefix is the aux inputs reversed followed by the inputs reversed.
        prefix = [WillumpInputNode(name) for name in input_names]
        prefix.extend(WillumpInputNode(name) for name in weld_block_aux_input_set)
        prefix.reverse()
        node_list[:0] = prefix
        node_list.append(WillumpMultiOutputNode(list(output_names)))

    def render(node_list) -> str:
        # Concatenate the Weld code of all nodes, in list order.
        return "".join(weld_node.get_node_weld() for weld_node in node_list)

    # Do not emit any output that is not needed in later blocks.
    for candidate in weld_block_output_set.copy():
        if not is_read_later(candidate):
            weld_block_output_set.remove(candidate)

    # Do optimization passes over the block.
    csr_pre, csr_post = wg_passes.weld_csr_marshalling_pass(
        weld_block_input_set, weld_block_output_set, typing_map)
    pandas_pre, pandas_post = wg_passes.weld_pandas_marshalling_pass(
        weld_block_input_set, weld_block_output_set, typing_map, batch)
    series_pre, series_post = wg_passes.weld_pandas_series_marshalling_pass(
        weld_block_input_set, weld_block_output_set, typing_map)
    preprocess_nodes = csr_pre + pandas_pre + series_pre
    postprocess_nodes = csr_post + pandas_post + series_post

    # Split Weld blocks into multiple threads.
    num_threads = num_workers + 1  # The main thread also does work.
    if num_threads > 1:
        threaded_statements_list = wg_passes.multithreading_weld_blocks_pass(
            weld_block_node_list, weld_block_input_set, weld_block_output_set, num_threads)
    else:
        threaded_statements_list = [
            (weld_block_node_list, weld_block_input_set, weld_block_output_set)
        ]

    # Append appropriate input and output nodes to each node list.  Entries are
    # either (input_set, thread_list) pairs or (nodes, input_set, output_set)
    # triples.
    for block_entry in threaded_statements_list:
        if len(block_entry) == 2:
            shared_inputs, thread_list = block_entry
            for thread_nodes, thread_outputs in thread_list:
                wrap_nodes(thread_nodes, shared_inputs, thread_outputs)
        else:
            block_nodes, block_inputs, block_outputs = block_entry
            wrap_nodes(block_nodes, block_inputs, block_outputs)

    # Construct the statement list from the nodes.
    weld_string_nodes: List[Tuple[List[str], List[str], List[List[str]]]] = []
    for block_entry in threaded_statements_list:
        if len(block_entry) == 2:
            shared_inputs, thread_list = block_entry
            weld_strings = []
            output_sets = []
            for thread_nodes, thread_outputs in thread_list:
                weld_strings.append(render(thread_nodes))
                output_sets.append(thread_outputs)
            statement = (weld_strings, list(shared_inputs), output_sets)
        else:
            block_nodes, block_inputs, block_outputs = block_entry
            statement = ([render(block_nodes)], list(block_inputs), [list(block_outputs)])
        weld_string_nodes.append(statement)

    preprocess_python = [pre_node.get_python() for pre_node in preprocess_nodes]
    postprocess_python = [post_node.get_python() for post_node in postprocess_nodes]
    return preprocess_python + weld_string_nodes + postprocess_python