def list_tables():
    """
    Lists all tables and views currently held in the database.
    """
    # Assemble the JSON command for the engine.
    cmd = {"name_": "", "type_": "Database.list_tables"}

    # Open a connection and transmit the command.
    sock = comm.send_and_receive_socket(cmd)

    # Check the engine's status response.
    response = comm.recv_string(sock)
    if response != "Success!":
        sock.close()
        raise Exception(response)

    # Decode the JSON payload into a Python list.
    tables = json.loads(comm.recv_string(sock))
    sock.close()

    return tables
def list_data_frames():
    """
    List all data frames currently stored in the project folder and
    held in memory.

    Returns:
        dict: Lists the names of the data frames.
    """
    cmd = dict()
    cmd["type_"] = "list_data_frames"
    cmd["name_"] = ""

    s = comm.send_and_receive_socket(cmd)

    msg = comm.recv_string(s)

    if msg != "Success!":
        # BUGFIX: close the socket before raising so the connection
        # does not leak on failure.
        s.close()
        raise Exception(msg)

    json_str = comm.recv_string(s)

    s.close()

    return json.loads(json_str)
def to_sql(self):
    """
    Extracts the SQL statements underlying the trained model.
    """
    # ------------------------------------------------------
    # Build and send JSON command.
    cmd = dict()
    cmd["type_"] = "RelboostModel.to_sql"
    cmd["name_"] = self.name

    s = comm.send_and_receive_socket(cmd)

    # ------------------------------------------------------
    # Make sure the model exists on the getML engine.
    msg = comm.recv_string(s)

    if msg != "Found!":
        # BUGFIX: close the socket before raising so the connection
        # does not leak on failure.
        s.close()
        raise Exception(msg)

    # ------------------------------------------------------
    # Receive the SQL code from the getML engine.
    sql = comm.recv_string(s)

    # ------------------------------------------------------
    s.close()

    return sql
def n_bytes(self):
    """
    Returns the size of the data stored in the DataFrame in bytes.
    """
    # Assemble and dispatch the JSON command.
    cmd = {"type_": "DataFrame.nbytes", "name_": self.name}
    sock = comm.send_and_receive_socket(cmd)

    # The engine answers "Found!" when the data frame exists.
    response = comm.recv_string(sock)
    if response != "Found!":
        sock.close()
        raise Exception(response)

    # The byte count arrives as a string; convert it on the way out.
    size_str = comm.recv_string(sock)
    sock.close()

    return np.uint64(size_str)
def n_rows(self):
    """
    Returns the number of rows in the data frame.
    """
    # Assemble and dispatch the JSON command.
    cmd = {"type_": "DataFrame.nrows", "name_": self.name}
    sock = comm.send_and_receive_socket(cmd)

    # The engine answers "Found!" when the data frame exists.
    response = comm.recv_string(sock)
    if response != "Found!":
        sock.close()
        raise Exception(response)

    # The row count arrives as a string; convert it on the way out.
    count_str = comm.recv_string(sock)
    sock.close()

    return np.int32(count_str)
def __score(self, yhat, y):
    """
    Returns the score for a set of predictions.

    **yhat**: Predictions.

    **y**: Targets.
    """
    # Assemble the scoring command.
    cmd = {"type_": "MultirelModel.score", "name_": self.name}

    # Connect to the getML engine and verify the model exists.
    sock = comm.send_and_receive_socket(cmd)
    response = comm.recv_string(sock)
    if response != "Found!":
        sock.close()
        raise Exception(response)

    # Transmit predictions and targets.
    comm.send_matrix(sock, yhat)
    comm.send_matrix(sock, y)

    # Ensure success before pulling the scores.
    response = comm.recv_string(sock)
    if response != "Success!":
        sock.close()
        raise Exception(response)

    scores = comm.recv_string(sock)
    sock.close()

    return json.loads(scores)
def __fit(self, peripheral_data_frames, population_data_frame, s):
    """Sends the fit command over an open socket and waits for training."""
    # Assemble the complete fit command.
    cmd = {
        "type_": "RelboostModel.fit",
        "name_": self.name,
        "peripheral_names_": [df.name for df in peripheral_data_frames],
        "population_name_": population_data_frame.name,
    }
    comm.send_string(s, json.dumps(cmd))

    # The engine trains while we block on the reply.
    begin = time.time()

    print("Loaded data. Features are now being trained...")

    msg = comm.recv_string(s)

    end = time.time()

    # Report the outcome; any reply without "Trained" is an error.
    if "Trained" in msg:
        print(msg)
        self.__print_time_taken(begin, end, "Time taken: ")
    else:
        raise Exception(msg)
def execute(query):
    """
    Executes an SQL query on the database.

    Args:
        query (str): The SQL query to be executed.
    """
    # Assemble the JSON command.
    cmd = {"name_": "", "type_": "Database.execute"}

    # Open a connection to the engine.
    sock = comm.send_and_receive_socket(cmd)

    # Transmit the query itself.
    comm.send_string(sock, query)

    # Check the status and always close the connection.
    response = comm.recv_string(sock)
    sock.close()

    if response != "Success!":
        raise Exception(response)
def fit(self, population_table, peripheral_tables):
    """
    Fits the model.

    Args:
        population_table (:class:`pandas.DataFrame` or :class:`~getml.engine.DataFrame`):
            Population table containing the target.

        peripheral_tables (List[:class:`pandas.DataFrame` or :class:`~getml.engine.DataFrame`]):
            Peripheral tables.
            The peripheral tables have to be passed in the exact same
            order as their corresponding placeholders!
    """
    # Assemble the command for the getML engine.
    cmd = {"type_": "MultirelModel.fit", "name_": self.name}

    # Open a connection and confirm the model exists on the engine.
    sock = comm.send_and_receive_socket(cmd)
    response = comm.recv_string(sock)
    if response != "Found!":
        sock.close()
        raise Exception(response)

    # Upload the peripheral tables.
    peripheral_data_frames = self.__load_peripheral_tables(
        peripheral_tables, sock)

    # Upload the population table.
    targets = self.params['population'].thisptr["targets_"]
    population_data_frame = self.__load_population_table(
        population_table, targets, sock)

    # Delegate the actual fitting.
    self.__fit(peripheral_data_frames, population_data_frame, sock)

    sock.close()

    self.__save()

    return self.refresh()
def __close(self, s):
    """Tells the engine to finalize this model's session on socket *s*."""
    # The caller owns the socket; we only send the close command.
    cmd = {"type_": "RelboostModel.close", "name_": self.name}
    comm.send_string(s, json.dumps(cmd))

    response = comm.recv_string(s)
    if response != "Success!":
        raise Exception(response)
def send(self, data_frame, sock=None):
    """Send data to the getml engine.

    If sock is None, it will call a function to create a new socket,
    use it for the data transfer and close it afterwards. If, instead,
    a socket is provided, it just sends all the data but does not
    close it.

    Args:
        data_frame (pandas.DataFrame): Data Frame that you want to be
            appended to the existing data.

        sock (optional): Socket connecting the Python API with the
            getML engine.
    """
    # Validate the data frame before any transfer happens.
    if data_frame is not None:
        self.__check_plausibility(data_frame)

    # Announce the data frame to the engine.
    cmd = {"type_": "DataFrame", "name_": self.name}

    # When a socket is supplied, the command must be sent manually;
    # send_and_receive_socket sends it as part of opening a connection.
    if sock is None:
        s = comm.send_and_receive_socket(cmd)
    else:
        s = sock
        comm.send_string(s, json.dumps(cmd))

    msg = comm.recv_string(s)
    if msg != "Success!":
        raise Exception(msg)

    # Stream the individual columns to the engine.
    self.__send_data(data_frame, s)

    # Tell the engine we are done with this data frame.
    self.__close(s)

    # Only close sockets we opened ourselves.
    if sock is None:
        s.close()

    return self
def from_json(self, json_str, append=False, time_formats=None):
    """
    Fill from JSON

    Fills the data frame with data from a JSON string.

    Args:
        json_str (str): The JSON string containing the data.
        append (bool): If a DataFrame already exists, should json_str
            be appended?
        time_formats (List[str], optional): The formats tried when
            parsing time stamps. Defaults to
            ["%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"].
            Refer to
            https://pocoproject.org/docs/Poco.DateTimeFormatter.html#9946
            for the options.
    """
    # BUGFIX: avoid a mutable default argument; resolve the default
    # inside the function instead.
    if time_formats is None:
        time_formats = ["%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S",
                        "%Y-%m-%d"]

    # -------------------------------------------
    # Send JSON command to getml engine.
    cmd = dict()
    cmd["type_"] = "DataFrame.from_json"
    cmd["name_"] = self.name

    cmd["categoricals_"] = self.categorical_names
    cmd["discretes_"] = self.discrete_names
    cmd["join_keys_"] = self.join_key_names
    cmd["numericals_"] = self.numerical_names
    cmd["targets_"] = self.target_names
    cmd["time_stamps_"] = self.time_stamp_names

    cmd["append_"] = append
    cmd["time_formats_"] = time_formats

    s = comm.send_and_receive_socket(cmd)

    # -------------------------------------------
    # Send the JSON string.
    comm.send_string(s, json_str)

    # -------------------------------------------
    # Make sure everything went well and close connection.
    msg = comm.recv_string(s)

    s.close()

    if msg != "Success!":
        raise Exception(msg)

    # -------------------------------------------
    return self
def get_colnames(name):
    """
    Lists the colnames of a table held in the database.

    Args:
        name (str): The name of the table whose column names should
            be listed.
    """
    # -------------------------------------------
    # Prepare command.
    cmd = dict()
    cmd["name_"] = name
    cmd["type_"] = "Database.get_colnames"

    # -------------------------------------------
    # Send JSON command to engine.
    s = comm.send_and_receive_socket(cmd)

    # -------------------------------------------
    # Make sure that everything went well.
    msg = comm.recv_string(s)

    if msg != "Success!":
        s.close()
        raise Exception(msg)

    # -------------------------------------------
    # Parse result as list.
    arr = json.loads(comm.recv_string(s))

    s.close()

    return arr
def list_projects():
    """
    List all projects on the engine.

    Returns:
        list: Lists the name all of the projects.
    """
    cmd = dict()
    cmd["type_"] = "list_projects"
    cmd["name_"] = ""

    s = comm.send_and_receive_socket(cmd)

    msg = comm.recv_string(s)

    if msg != "Success!":
        # BUGFIX: close the socket before raising so the connection
        # does not leak on failure.
        s.close()
        raise Exception(msg)

    json_str = comm.recv_string(s)

    s.close()

    return json.loads(json_str)["projects"]
def list_models():
    """
    List all models currently held in memory.

    Returns:
        dict: Lists the names of all of the models by type.
    """
    cmd = dict()
    cmd["type_"] = "list_models"
    cmd["name_"] = ""

    s = comm.send_and_receive_socket(cmd)

    msg = comm.recv_string(s)

    if msg != "Success!":
        # BUGFIX: close the socket before raising so the connection
        # does not leak on failure.
        s.close()
        raise Exception(msg)

    json_str = comm.recv_string(s)

    s.close()

    return json.loads(json_str)
def __get_categorical(self, sock=None):
    """
    Transform column to numpy array

    Args:
        sock: Socket connecting the Python API with the getML engine.
    """
    # Assemble the command describing this categorical column.
    cmd = {
        "name_": self.thisptr["df_name_"],
        "type_": "CategoricalColumn.get",
        "col_": self.thisptr,
    }

    # Reuse the provided socket, or open a fresh one. When a socket is
    # supplied, the command must be sent manually.
    if sock is None:
        conn = comm.send_and_receive_socket(cmd)
    else:
        conn = sock
        comm.send_string(conn, json.dumps(cmd))

    # Verify the column exists, then pull the data.
    msg = comm.recv_string(conn)
    if msg != "Found!":
        conn.close()
        raise Exception(msg)

    mat = comm.recv_categorical_matrix(conn)

    # Only close sockets we opened ourselves.
    if sock is None:
        conn.close()

    return mat.ravel()
def __transform(self, peripheral_data_frames, population_data_frame, s,
                score=False, predict=False, table_name=""):
    """Sends the transform command over an open socket and returns the result."""
    # Assemble the command for the getML engine.
    cmd = {
        "type_": "RelboostModel.transform",
        "name_": self.name,
        "score_": score,
        "predict_": predict,
        "peripheral_names_": [df.name for df in peripheral_data_frames],
        "population_name_": population_data_frame.name,
        "table_name_": table_name,
    }
    comm.send_string(s, json.dumps(cmd))

    # Wait for the engine to finish the transformation.
    msg = comm.recv_string(s)
    if msg != "Success!":
        raise Exception(msg)

    # When writing into a database table, no matrix is sent back.
    if table_name == "":
        yhat = comm.recv_matrix(s)
    else:
        yhat = None

    return yhat
def get(self):
    """
    Receives the value of the aggregation over the column.
    """
    # -------------------------------------------
    # Build command string.
    cmd = dict()
    cmd["name_"] = ""
    cmd["type_"] = "Column.aggregate"

    cmd["aggregation_"] = self.thisptr
    cmd["df_name_"] = self.thisptr["col_"]["df_name_"]

    # -------------------------------------------
    # Create connection and send the command.
    # BUGFIX: call through the comm module, consistent with every
    # other function in this file (the bare name is not in scope).
    s = comm.send_and_receive_socket(cmd)

    msg = comm.recv_string(s)

    # -------------------------------------------
    # Make sure everything went well, receive data
    # and close connection.
    if msg != "Success!":
        s.close()
        raise Exception(msg)

    mat = comm.recv_matrix(s)

    # -------------------------------------------
    # Close connection.
    s.close()

    # -------------------------------------------
    # The aggregation result is a single value.
    return mat.ravel()[0]
def get(self):
    """
    Transform column to numpy array
    """
    # Assemble the command describing this boolean column.
    cmd = {
        "name_": self.thisptr["df_name_"],
        "type_": "BooleanColumn.get",
        "col_": self.thisptr,
    }

    # Open a connection and check the column exists.
    conn = comm.send_and_receive_socket(cmd)
    msg = comm.recv_string(conn)
    if msg != "Found!":
        conn.close()
        raise Exception(msg)

    # Retrieve the data and close the connection.
    mat = comm.recv_boolean_matrix(conn)
    conn.close()

    return mat.ravel()
def get_model(name):
    """
    Returns a handle to the model specified by name.

    Args:
        name (str): Name of the model.
    """
    cmd = dict()
    cmd["type_"] = "get_model"
    cmd["name_"] = name

    s = comm.send_and_receive_socket(cmd)

    # The engine replies with the model's type.
    msg = comm.recv_string(s)

    # BUGFIX: the original never closed the socket in any branch;
    # close it here before constructing the handle (refresh() opens
    # its own connection).
    s.close()

    if msg == "MultirelModel":
        return MultirelModel(name=name).refresh()
    elif msg == "RelboostModel":
        return RelboostModel(name=name).refresh()
    else:
        raise Exception(msg)
def send(self, numpy_array, s):
    """
    Sends the object to the engine, data taken from a numpy array.

    Args:
        numpy_array (:class:`numpy.ndarray`): Number of columns should
            match the number of columns of the object itself.
        s: Socket
    """
    # Announce this column to the engine.
    comm.send_string(s, json.dumps(self.thisptr))

    # Transmit the data in the wire format matching the column type.
    col_type = self.thisptr["type_"]
    if col_type == "CategoricalColumn":
        comm.send_categorical_matrix(s, numpy_array)
    elif col_type == "Column":
        comm.send_matrix(s, numpy_array)

    # Confirm the engine accepted the data.
    response = comm.recv_string(s)
    if response != "Success!":
        raise Exception(response)

    # Generate default column names for 2-D input when none are set.
    if len(numpy_array.shape) > 1:
        if not self.colnames:
            self.colnames = [
                "column_" + str(i + 1)
                for i in range(numpy_array.shape[1])
            ]
def refresh(self):
    """
    Refreshes the hyperparameters and placeholders in Python based on
    a model already loaded in the engine.
    """
    # -------------------------------------------
    # Send JSON command to getml engine.
    cmd = dict()
    cmd["type_"] = "RelboostModel.refresh"
    cmd["name_"] = self.name

    s = comm.send_and_receive_socket(cmd)

    # -------------------------------------------
    # On success the engine replies with a JSON object; anything else
    # is an error message.
    msg = comm.recv_string(s)

    # BUGFIX: close the socket before the error check so the
    # connection does not leak when the engine reports a failure
    # (the original raised before closing).
    s.close()

    if msg[0] != '{':
        raise Exception(msg)

    # -------------------------------------------
    # Parse results.
    json_obj = json.loads(msg)

    self.set_params(json_obj["hyperparameters_"])

    self.params = _parse_placeholders(json_obj, self.params)

    return self
def sniff_csv(name, fnames, header=True, num_lines_sniffed=1000,
              quotechar='"', sep=',', skip=0):
    """
    Sniffs a list of CSV files.

    Args:
        name (str): Name of the table in which the data is to be inserted.

        fnames (List[str]): The list of CSV file names to be read.

        header (bool, optional): Whether the CSV file contains a header
            with the column names. Default to True.

        num_lines_sniffed (int, optional): Number of lines read from
            each file to infer the column types. Default: 1000.

        quotechar (str, optional): The character used to wrap strings. Default:`"`

        sep (str, optional): The separator used for separating fields. Default:`,`

        skip (int, optional): Number of lines to skip at the beginning of each
            file (Default: 0). If *header* is True, the lines will be
            skipped before the header.

    Returns:
        str: Appropriate `CREATE TABLE` statement.
    """
    # -------------------------------------------
    # Transform paths to absolute file names.
    fnames_ = [os.path.abspath(_) for _ in fnames]

    # -------------------------------------------
    # Prepare command.
    cmd = dict()
    cmd["name_"] = name
    cmd["type_"] = "Database.sniff_csv"

    cmd["fnames_"] = fnames_
    cmd["header_"] = header
    cmd["num_lines_sniffed_"] = num_lines_sniffed
    cmd["quotechar_"] = quotechar
    cmd["sep_"] = sep
    cmd["skip_"] = skip

    # -------------------------------------------
    # Send JSON command to engine.
    s = comm.send_and_receive_socket(cmd)

    # -------------------------------------------
    # Make sure that everything went well.
    msg = comm.recv_string(s)

    if msg != "Success!":
        s.close()
        raise Exception(msg)

    # -------------------------------------------
    # Receive the CREATE TABLE statement.
    query = comm.recv_string(s)

    s.close()

    return query
def transform(self, population_table, peripheral_tables=None,
              table_name=""):
    """
    Returns the features learned by the model or writes them into a data base.

    Args:
        population_table (:class:`pandas.DataFrame` or :class:`~getml.engine.DataFrame`):
            Population table. Targets will be ignored.

        peripheral_tables (List[:class:`pandas.DataFrame` or :class:`~getml.engine.DataFrame`]):
            Peripheral tables.
            The peripheral tables have to be passed in the exact same
            order as their corresponding placeholders!

        table_name (str): If not an empty string, the resulting features
            will be written into the data base, instead of returning
            them.
    """
    # -----------------------------------------------------
    # Prepare the command for the getML engine.
    cmd = dict()
    cmd["type_"] = "RelboostModel.transform"
    cmd["name_"] = self.name

    # -----------------------------------------------------
    # Send command to engine and make sure that model has been found.
    s = comm.send_and_receive_socket(cmd)

    msg = comm.recv_string(s)

    if msg != "Found!":
        # BUGFIX: close the socket before raising so the connection
        # does not leak on failure.
        s.close()
        raise Exception(msg)

    # ----------------------------------------------------------------------
    # Load peripheral tables, falling back to the ones stored in params.
    peripheral_tables = peripheral_tables or self.params[
        'peripheral_tables']

    peripheral_data_frames = self.__load_peripheral_tables(
        peripheral_tables, s)

    # ----------------------------------------------------------------------
    # Load population table; engine.DataFrame inputs need no targets.
    if type(population_table) == engine.DataFrame:
        targets = []
    else:
        targets = [
            elem for elem in self.params['population'].thisptr["targets_"]
            if elem in population_table.columns
        ]

    population_data_frame = self.__load_population_table(
        population_table, targets, s)

    # ----------------------------------------------------------------------
    # Call the transform function to get the features.
    y_hat = self.__transform(peripheral_data_frames,
                             population_data_frame, s,
                             table_name=table_name)

    self.__close(s)

    s.close()

    return y_hat
def score(self, population_table, peripheral_tables=None):
    """
    Calculates scores for the model.

    Args:
        population_table (:class:`pandas.DataFrame` or :class:`~getml.engine.DataFrame`):
            Population table. Targets will be ignored

        peripheral_tables (List[:class:`pandas.DataFrame` or :class:`~getml.engine.DataFrame`]):
            Peripheral tables.
            The peripheral tables have to be passed in the exact same
            order as their corresponding placeholders!
    """
    # -----------------------------------------------------
    # Prepare the command for the getML engine.
    cmd = dict()
    cmd["type_"] = "RelboostModel.transform"
    cmd["name_"] = self.name

    # -----------------------------------------------------
    # Send command to engine and make sure that model has been found.
    s = comm.send_and_receive_socket(cmd)

    msg = comm.recv_string(s)

    if msg != "Found!":
        # BUGFIX: close the socket before raising so the connection
        # does not leak on failure.
        s.close()
        raise Exception(msg)

    # ----------------------------------------------------------------------
    # Load peripheral tables, falling back to the ones stored in params.
    peripheral_tables = peripheral_tables or self.params[
        'peripheral_tables']

    peripheral_data_frames = self.__load_peripheral_tables(
        peripheral_tables, s)

    # ----------------------------------------------------------------------
    # Load population table; engine.DataFrame inputs need no targets.
    if type(population_table) == engine.DataFrame:
        targets = []
    else:
        targets = [
            elem for elem in self.params['population'].thisptr["targets_"]
            if elem in population_table.columns
        ]

    population_data_frame = self.__load_population_table(
        population_table, targets, s)

    # ----------------------------------------------------------------------
    # Get predictions as numpy array.
    yhat = self.__transform(peripheral_data_frames,
                            population_data_frame, s,
                            predict=True, score=True)

    # ----------------------------------------------------------------------
    # Get targets.
    colname = population_data_frame.target_names[self.params["target_num"]]

    y = population_data_frame.target(colname).get(s).ravel()

    # ----------------------------------------------------------------------
    # Close connection.
    self.__close(s)

    s.close()

    # ----------------------------------------------------------------------
    # Do the actual scoring.
    scores = self.__score(yhat, y)

    # ----------------------------------------------------------------------
    self.__save()

    return scores
def __get(self, sock=None):
    """
    Transform column to numpy array

    Args:
        sock: Socket connecting the Python API with the getML engine.
    """
    # -------------------------------------------
    # Build command string.
    cmd = dict()
    cmd["name_"] = self.thisptr["df_name_"]
    cmd["type_"] = "Column.get"
    cmd["col_"] = self.thisptr

    # -------------------------------------------
    # Establish communication with the getml engine. When a socket is
    # supplied, the command must be sent manually.
    if sock is None:
        s = comm.send_and_receive_socket(cmd)
    else:
        s = sock
        comm.send_string(s, json.dumps(cmd))

    msg = comm.recv_string(s)

    # -------------------------------------------
    # Make sure everything went well, receive data
    # and close connection.
    if msg != "Found!":
        s.close()
        raise Exception(msg)

    mat = comm.recv_matrix(s)

    # -------------------------------------------
    # Only close sockets we opened ourselves.
    if sock is None:
        s.close()

    # -------------------------------------------
    # If this is a time stamp, then transform to pd.Timestamp.
    if self.thisptr["type_"] == "Column":
        if self.thisptr[
                "role_"] == "time_stamp" or "time stamp" in self.thisptr[
                    "unit_"]:
            shape = mat.shape
            mat = [pd.Timestamp(ts_input=ts, unit="D") for ts in mat.ravel()]
            mat = np.asarray(mat)
            # BUGFIX: ndarray.reshape returns a new array; the original
            # discarded its result, leaving the conversion un-reshaped.
            mat = mat.reshape(shape[0], shape[1])

    # -------------------------------------------
    return mat.ravel()
def refresh(self):
    """
    Aligns meta-information of the DataFrame with the engine.

    This method can be used to avoid encoding conflicts. Note that the
    .load() method automatically calls refresh.
    """
    # ----------------------------------------------------------------------
    cmd = dict()
    cmd["type_"] = "DataFrame.refresh"
    cmd["name_"] = self.name

    s = comm.send_and_receive_socket(cmd)

    # On success the engine replies with a JSON object; anything else
    # is an error message.
    msg = comm.recv_string(s)

    s.close()

    if msg[0] != "{":
        raise Exception(msg)

    # ----------------------------------------------------------------------
    encodings = json.loads(msg)

    # ----------------------------------------------------------------------
    # The engine sends an empty string instead of an empty list;
    # normalize every column group with one helper instead of six
    # copy-pasted if/else blocks.
    def _as_list(value):
        return [] if value == '' else value

    # ----------------------------------------------------------------------
    # Re-initialize data frame with the refreshed meta-information.
    self.__init__(
        name=self.name,
        join_keys=_as_list(encodings["join_keys_"]),
        time_stamps=_as_list(encodings["time_stamps_"]),
        categorical=_as_list(encodings["categorical_"]),
        discrete=_as_list(encodings["discrete_"]),
        numerical=_as_list(encodings["numerical_"]),
        targets=_as_list(encodings["targets_"]),
        units=self.units
    )

    # ----------------------------------------------------------------------
    return self