def DeriveChange(data_frame, params=None):
    """Derive the windowed difference of a measure.

    When called with ``params=None`` this returns the parameter
    specification dict instead of transforming data (convention shared by
    the other Derive* transforms in this module).

    :param data_frame: input data containing the target measure
    :type data_frame: pandas.DataFrame
    :param params: transform parameters, or None to request the spec
    :type params: dict or None
    :return: parameter spec dict, or a DataFrame with the derived measure
    :rtype: dict or pandas.DataFrame
    """
    logger.debug(
        "initialized DeriveChange. Use get_params() to see parameter values")
    if params is None:
        params = {}
        params["func_params"] = {
            "window_len": ("length of averaging window", 1, False),
            "angle_change": ("if the change is between angles, we return the signed smaller angle between the two headings", False, False)
        }
        params["measure_rules"] = {
            "target_measure": ("name of the target measure", "measure_name", True),
            "output_name": ("name of returned measure", "output_name", True)
        }
        return params
    logger.debug("transforming data to %s" %
                 (params["measure_rules"]["output_name"]))
    # window_len is optional; fall back to a window of 1
    window_len = params["func_params"].get("window_len", 1)
    target_array = data_frame[params["measure_rules"]["target_measure"]].values
    diffed_data = diff_data(target_array, window_len,
                            params["func_params"]["angle_change"])
    # differencing shortens the series by one sample, so drop the last index
    return pd.DataFrame(data=diffed_data,
                        columns=[params["measure_rules"]["output_name"]],
                        index=data_frame.index[:-1])
def ButterLowpass(data_frame, params=None):
    """Apply a Butterworth lowpass filter to every column of data_frame.

    When called with ``params=None`` the parameter specification dict is
    returned instead of filtered data.

    :param data_frame: input measures, one column per parameter to filter
    :type data_frame: pandas.DataFrame
    :param params: filter parameters, or None to request the spec
    :type params: dict or None
    :return: parameter spec dict, or a DataFrame of filtered columns
    :rtype: dict or pandas.DataFrame
    """
    logger.debug("Calculating ButterLowpass.")
    if params is None:
        params = {
            "order": ("order of the filter", 2, True),
            "nyquist": ("Wn parameter from scipy.signal.butter", 0.05, True),
            "filter_name": ("name to append to all filtered parameters", "_buttered", False)
        }
        return params
    elif "filter_name" not in params:
        # optional parameter: default suffix for the output column names
        params["filter_name"] = "_buttered"
    logger.debug("transforming_data")
    buttered_data = np.zeros(data_frame.shape)
    output_names = []
    for col_ind, df_col in enumerate(data_frame):
        buttered_data[:, col_ind] = butter_data(
            data_frame[df_col].values, params["order"], params["nyquist"])
        output_names.append(df_col + params["filter_name"])
    return pd.DataFrame(data=buttered_data, columns=output_names,
                        index=data_frame.index)
def compare_threshold(data_array, comparison_operator, comparision_val, absolute_compare=False):
    """
    Function for comparing an array to a value with a binary operator

    :param data_array: input data
    :type data_array: numpy array
    :param comparison_operator: string representation of the binary operator
        for comparison (one of ==, !=, >=, <=, >, <)
    :type comparison_operator: str
    :param comparision_val: The value to be compared against
    :type comparision_val: float
    :param absolute_compare: specifying whether to compare raw value or absolute value
    :type absolute_compare: Boolean
    :return: boolean mask that is True where the comparison holds
    :rtype: numpy array
    :raises KeyError: if comparison_operator is not a supported operator
    """
    # %s instead of %d: the threshold may be a float and %d truncated it in the log
    logger.debug("comparing: %s %s" % (comparison_operator, comparision_val))
    if absolute_compare:
        data_array = np.abs(data_array)
    comparisons = {
        "==": np.equal,
        "!=": np.not_equal,
        ">=": np.greater_equal,
        "<=": np.less_equal,
        ">": np.greater,
        "<": np.less
    }
    cur_comp = comparisons[comparison_operator]
    # NaNs are coerced to 0 before comparing - NOTE(review): confirm intended
    match_inds = cur_comp(np.nan_to_num(data_array), comparision_val)
    return match_inds
def select_transform(transform_type, transform_name):
    """Return the transform function registered under the given type/name.

    :param transform_type: transform category ("filter_data",
        "derive_param" or "detect_event")
    :param transform_name: name of the transform within that category
    :return: the transform function
    :raises KeyError: if the type or name is not registered
    """
    logger.debug("Selecting transform function")
    registry = {
        "filter_data": {
            "ButterLowpass": ButterLowpass,
            "WindowAverage": WindowAverage,
            "dummy": dummy_function,
        },
        "derive_param": {
            "DeriveSlope": DeriveSlope,
            "DeriveChange": DeriveChange,
            "DeriveCumsum": DeriveCumsum,
            "DeriveDistance": DeriveDistance,
            "DeriveHeading": DeriveHeading,
            "DeriveWindowSum": DeriveWindowSum,
            "DeriveScaled": DeriveScaled,
            "DeriveInBox": DeriveInBox,
            "DeriveThreshold": DeriveThreshold,
            "DeriveLogicalCombination": DeriveLogicalCombination,
        },
        "detect_event": {
            "DetectThreshold": DetectThreshold,
        },
    }
    return registry[transform_type][transform_name]
def __init__(self, template, dstreams):
    """Build a BStream from a versioned template and a list of dstreams.

    :param template: dstream template; must contain "template_id" and
        "stream_token" keys
    :type template: dict
    :param dstreams: raw dstreams to be aggregated by this bstream
    :type dstreams: list of dicts
    """
    logger.debug("init BStream")
    super().__init__()
    self.dstreams = dstreams
    self["template_id"] = template["template_id"]
    # presumably copies the template's remaining keys into this mapping -
    # NOTE(review): confirm _load_from_dict's contract
    self._load_from_dict(template)
    # set after _load_from_dict so the stored token is always a string
    self["stream_token"] = str(template["stream_token"])
def process_data(self, dstream_list, token):
    """
    Wrapper method for asynchronously processing data.

    Builds a bstream from the raw dstreams, runs the filter / derived-param /
    event pipelines, then posts events and the measure DataFrame to the server.

    :param dstream_list: list of dstreams with raw data
    :type dstream_list: list of dicts
    :param token: stream token
    :type token: string
    """
    logger.debug("process_data_async")
    st = time.time()
    # retrieve most recent versioned dstream template
    template = dstream_list[0]
    # create bstream for dstream list
    bstream = self._list_to_bstream(template, dstream_list)
    # filter bstream data
    bstream.apply_filters()
    # apply derived param transforms
    bstream.apply_dparam_rules()
    # apply event transforms
    bstream.find_events()
    # post events to server
    self._post_parsed_events(bstream)
    self._post_dataframe(bstream["stream_token"], bstream["measures"])
    # replaced a leftover debug print ("whoop WHOOOOP") with a log record
    logger.debug("processed %s records in %s seconds",
                 len(bstream["timestamp"]), time.time() - st)
def __init__(self):
    """
    Initializes an empty DStream with a unique stream token. All the
    other expected keys are initialized as empty data structures of the
    desired type.
    """
    # identity / versioning
    self["stream_name"] = None
    self["user_description"] = None
    self["version"] = 0
    self["stream_token"] = str(uuid.uuid1())  # unique per stream instance
    self["source_key"] = None
    self["template_id"] = str(uuid.uuid1())   # unique per template version
    # rule containers, populated later (e.g. from json via load_from_json)
    self["storage_rules"] = {}
    self["ingest_rules"] = {}
    self["engine_rules"] = {}
    # raw data containers
    self["timestamp"] = None
    self["measures"] = {}
    self["fields"] = {}
    self["user_ids"] = {}
    self["tags"] = {}
    self["foreign_keys"] = []
    # transform pipelines applied during processing
    self["filters"] = []
    self["dparam_rules"] = []
    self["event_rules"] = {}
    self["data_rules"] = {}
    logger.debug("DStream initialize")
def DeriveLogicalCombination(data_frame, params=None):
    """Combine two boolean measures elementwise with AND or OR.

    When called with ``params=None`` this returns the parameter
    specification dict instead of transforming data (convention shared by
    the other Derive* transforms).

    :param data_frame: input data containing both measures
    :type data_frame: pandas.DataFrame
    :param params: transform parameters, or None to request the spec
    :type params: dict or None
    :return: parameter spec dict, or a DataFrame with the combined measure
    :rtype: dict or pandas.DataFrame
    """
    logger.debug("Starting DeriveLogicalCombination")
    if params is None:
        params = {}
        params["func_params"] = {
            "combiner": ("Either AND or OR string specifying which operator to use", "AND", True)
        }
        params["measure_rules"] = {
            "first_measure": ("name of first measure to be combined", "measure_name", True),
            "second_measure": ("name of second measure to be combined", "measure_name", True),
            "output_name": ("name of returned measure", "output_name", True)
        }
        # previously this fell through and crashed trying to use the spec
        # tuples as column names; every other Derive* returns the spec here
        return params
    logger.debug("Combining {} and {} data to {}".format(
        params["measure_rules"]["first_measure"],
        params["measure_rules"]["second_measure"],
        params["measure_rules"]["output_name"]))
    first_array = data_frame[params["measure_rules"]["first_measure"]].values
    second_array = data_frame[params["measure_rules"]["second_measure"]].values
    combined = logical_combine(first_array, second_array,
                               params["func_params"]["combiner"])
    return pd.DataFrame(data=combined,
                        columns=[params["measure_rules"]["output_name"]],
                        index=data_frame.index)
def partition_data(self, list_of_partitions, logical_comparison="AND"):
    """Return rows of the measures DataFrame matching all/any partitions.

    :param list_of_partitions: tuples of partition parameters, each passed
        positionally to ``partition_rows()``
    :type list_of_partitions: list of tuples
    :param logical_comparison: "AND" or "OR" combination of the partitions
    :type logical_comparison: str
    :return: the rows of self["measures"] meeting the combined condition
    :rtype: pandas.DataFrame
    :raises ValueError: if logical_comparison is not "AND" or "OR"
    """
    logger.debug("building partition rows")
    # AND starts from all-True (identity for logical_and); OR from all-False.
    # Validating once here makes the per-partition re-check (previously
    # duplicated, and unreachable, inside the loop) unnecessary.
    if logical_comparison == "AND":
        start_bools = np.ones((self["measures"].shape[0], ), dtype=bool)
        combine = np.logical_and
    elif logical_comparison == "OR":
        start_bools = np.zeros((self["measures"].shape[0], ), dtype=bool)
        combine = np.logical_or
    else:
        raise ValueError("{} is not a supported logical comparison".format(
            logical_comparison))
    for partition in list_of_partitions:
        new_inds = self.partition_rows(partition[0], partition[1],
                                       partition[2])
        start_bools = combine(start_bools, new_inds)
    return self["measures"][start_bools]
def DeriveThreshold(data_frame, params=None):
    """Derive a boolean measure by comparing a measure to a threshold.

    When called with ``params=None`` this returns the parameter
    specification dict instead of transforming data (convention shared by
    the other Derive* transforms).

    :param data_frame: input data containing the target measure
    :type data_frame: pandas.DataFrame
    :param params: transform parameters, or None to request the spec
    :type params: dict or None
    :return: parameter spec dict, or a DataFrame with the boolean measure
    :rtype: dict or pandas.DataFrame
    """
    logger.debug("Starting DeriveThreshold")
    if params is None:
        params = {}
        params["func_params"] = {
            "threshold_value": ("value to compare against", 0, True),
            "comparison_operator": ("one of == != >= <= > <", "==", True),
            "absolute_compare": ("whether to compare against absolute value instead of raw value", False, False)
        }
        params["measure_rules"] = {
            "target_measure": ("name of the target measure", "measure_name", True),
            "output_name": ("name of returned measure", "output_name", True)
        }
        # previously this fell through and crashed using the spec tuples as
        # column names; every other Derive* transform returns the spec here
        return params
    logger.debug("transforming data to %s" %
                 (params["measure_rules"]["output_name"]))
    target_array = data_frame[params["measure_rules"]["target_measure"]].values
    threshold_bool = compare_threshold(
        target_array, params["func_params"]["comparison_operator"],
        params["func_params"]["threshold_value"],
        params["func_params"]["absolute_compare"])
    return pd.DataFrame(data=threshold_bool,
                        columns=[params["measure_rules"]["output_name"]],
                        index=data_frame.index)
def DeriveWindowSum(data_frame, params=None):
    """Derive a sliding-window sum of a measure.

    When called with ``params=None`` this returns the parameter
    specification dict instead of transforming data.

    :param data_frame: input data containing the target measure
    :type data_frame: pandas.DataFrame
    :param params: transform parameters, or None to request the spec
    :type params: dict or None
    :return: parameter spec dict, or a DataFrame with the summed measure
    :rtype: dict or pandas.DataFrame
    """
    logger.debug("Starting DeriveWindowSum.")
    if params is None:
        params = {}
        params["func_params"] = {
            "window_len": ("window size for summing", 2, True)
        }
        params["measure_rules"] = {
            "target_measure": ("name of the target measure", "measure_name", True),
            "output_name": ("name of returned measure", "output_name", True)
        }
        return params
    logger.debug("transforming data to %s" %
                 (params["measure_rules"]["output_name"]))
    # NOTE(review): fallback is 1 although the spec default above is 2 -
    # kept as-is to preserve behavior; confirm which default is intended
    window_len = params["func_params"].get("window_len", 1)
    target_array = data_frame[params["measure_rules"]["target_measure"]].values
    summed_data = window_sum(target_array, window_len)
    return pd.DataFrame(data=summed_data,
                        columns=[params["measure_rules"]["output_name"]],
                        index=data_frame.index)
def DeriveInBox(data_frame, params=None):
    """Derive a boolean measure: whether each position lies inside a box.

    When called with ``params=None`` this returns the parameter
    specification dict instead of transforming data.

    :param data_frame: input data containing the spatial measure
    :type data_frame: pandas.DataFrame
    :param params: transform parameters, or None to request the spec
    :type params: dict or None
    :return: parameter spec dict, or a DataFrame with the boolean measure
    :rtype: dict or pandas.DataFrame
    """
    logger.debug("Starting DeriveInBox.")
    if params is None:
        params = {}
        params["func_params"] = {
            "upper_left_corner": ("location of upper left corner", (0, 1), True),
            "lower_right_corner": ("location of lower right corner", (1, 0), True)
        }
        params["measure_rules"] = {
            "spatial_measure": ("name of geo-spatial measure", "measure_name", True),
            "output_name": ("name of returned measure", "measure_name", True)
        }
        return params
    logger.debug("transforming data to %s" %
                 (params["measure_rules"]["output_name"]))
    # the spatial measure holds per-row coordinate pairs; expand to N x 2
    position_array = pd.DataFrame(
        data_frame[params["measure_rules"]["spatial_measure"]].tolist()).values
    box_bool = in_box(position_array,
                      params["func_params"]["upper_left_corner"],
                      params["func_params"]["lower_right_corner"])
    return pd.DataFrame(data=box_bool,
                        columns=[params["measure_rules"]["output_name"]],
                        index=data_frame.index)
def DeriveSlope(data_frame, params=None):
    """Derive the slope (rise/run) between two measures.

    When called with ``params=None`` this returns the parameter
    specification dict instead of transforming data.

    :param data_frame: input data containing rise and run measures
    :type data_frame: pandas.DataFrame
    :param params: transform parameters, or None to request the spec
    :type params: dict or None
    :return: parameter spec dict, or a DataFrame with the slope measure
    :rtype: dict or pandas.DataFrame
    """
    logger.debug("Starting DeriveSlope.")
    if params is None:
        params = {}
        params["func_params"] = {
            "window_len": ("length of averaging window", 1, False)
        }
        params["measure_rules"] = {
            "rise_measure": ("measure y values (or rise in rise/run calculation of slope)", "measure_name", True),
            "run_measure": ("measure containing x values (or run in rise/run calculation of slope)", "measure_name", True),
            "output_name": ("name of returned measure", "output_name", True)
        }
        return params
    logger.debug("transforming data to %s" %
                 (params["measure_rules"]["output_name"]))
    # window_len is optional; fall back to a window of 1
    window_len = params["func_params"].get("window_len", 1)
    xrun = data_frame[params["measure_rules"]["run_measure"]].values
    yrise = data_frame[params["measure_rules"]["rise_measure"]].values
    # truncate both series to the shorter length so they align elementwise
    smaller_len = np.min([xrun.shape[0], yrise.shape[0]])
    sloped = sloper(yrise[:smaller_len, ], xrun[:smaller_len, ], window_len)
    return pd.DataFrame(data=sloped,
                        columns=[params["measure_rules"]["output_name"]],
                        index=data_frame.index)
def bearing(position_array, window_len, units="deg"):
    """
    Calculates the angle between lat lon points

    :param position_array: input vector of lat lon points
    :type position_array: N x 2 numpy array
    :param window_len: Length of window for averaging
    :type window_len: int
    :param units: output units: "deg" gives degrees wrapped into
        [-180, 180); any other value leaves the bearing in radians
        (the previous doc wrongly listed 'mi'/'km' - those are distance
        units, not angle units)
    :type units: str
    :return: the angle between consecutive latlon points
    :rtype: (N - 1) x 1 numpy array
    """
    logger.debug("finding bearing of vector")
    # consecutive point pairs: (lat1, lon1) -> (lat2, lon2)
    lat1 = position_array[:-1, 0]
    lat2 = position_array[1:, 0]
    lon1 = position_array[:-1, 1]
    lon2 = position_array[1:, 1]
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon1 - lon2
    first_val = np.sin(dlon) * np.cos(lat2)
    second_val = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(
        lat2) * np.cos(dlon)
    cur_bear = np.arctan2(first_val, second_val)
    if units == "deg":
        # wrap degrees into [-180, 180)
        cur_bear = (np.rad2deg(cur_bear) + 180.0) % 360 - 180
    if window_len > 1:
        cur_bear = window_data(cur_bear, window_len)
    return cur_bear
def great_circle(position_array, window_len=1, units="mi"):
    """
    Function to calculate the great circle distance between consecutive
    samples in lat lon vector

    :param position_array: input vector of lat lon points
    :type position_array: N x 2 numpy array
    :param window_len: length of window for averaging output
    :type window_len: int
    :param units: String for output units. Currently 'mi' and 'km' supported
    :type units: str
    :return: distances between consecutive points
    :rtype: (N-1) x 1 numpy array
    :raises ValueError: if units is not 'mi' or 'km'
    """
    logger.debug("calculating great circle distance")
    lat1 = position_array[:-1, 0]
    lat2 = position_array[1:, 0]
    lon1 = position_array[:-1, 1]
    lon2 = position_array[1:, 1]
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    # haversine formula for the central angle between consecutive points
    dlat = lat1 - lat2
    dlon = lon1 - lon2
    inner_val = np.sin(
        dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    central_angle = 2 * np.arcsin(np.sqrt(inner_val))
    # mean earth RADIUS (the original local name "earth_diameter" was wrong)
    if units == "mi":
        earth_radius = 3959
    elif units == "km":
        earth_radius = 6371
    else:
        # previously an unsupported unit crashed with NameError; fail clearly
        raise ValueError("{} is not a supported unit".format(units))
    great_dist = central_angle * earth_radius
    if window_len > 1:
        great_dist = window_data(great_dist, window_len)
    return great_dist
def _aggregate_uids(self): logger.debug("aggregating uids") uids = [s["user_ids"] for s in self.dstreams] self["user_ids"] = { uidkey: [i[uidkey] for i in uids] for uidkey, v in self["user_ids"].items() }
def consume(self):
    """Start the consumer and log every non-None message received.

    Blocks iterating over self.consumer. `tk` appears to be a module-level
    timing tracker keyed by label - NOTE(review): confirm its contract.
    """
    tk['Consumer.consume : self.consumer.start'].start()
    self.consumer.start()  # auto-start
    tk['Consumer.consume : self.consumer.start'].stop()
    for msg in self.consumer:
        if msg is not None:
            # log the raw message value together with its offset
            logger.debug(str(msg.value) + ": {}".format(msg.offset))
def find_events(self):
    """Run every configured event rule and store its result in self["events"]."""
    logger.debug("finding events")
    self["events"] = {}
    rule_keys = ("partition_list", "measure_list", "transform_type",
                 "transform_name", "param_dict", "logical_comparison")
    for name, rule in self["event_rules"].items():
        # pass the rule's fields positionally, in the order apply_transform expects
        self["events"][name] = self.apply_transform(
            *(rule[key] for key in rule_keys))
def aggregate(self):
    """Run the full aggregation pipeline in order and return self for chaining."""
    logger.debug("aggregating everything")
    steps = (self._aggregate_uids, self._aggregate_ts,
             self._aggregate_fields, self._aggregate_tags, self._measure_df)
    for step in steps:
        step()
    return self
def get(self, option, section=None, default=None):
    """Read a configuration value, falling back to default when missing.

    :param option: option name; when section is None it also encodes the
        section and is split via __get_option_name
    :param section: config section, or None to derive it from option
    :param default: value returned when the option cannot be read
    :return: the configuration value, or default
    """
    try:
        if section is None:
            option, section = self.__get_option_name(option)
        value = self._cfg.get(section, option)
    except (NoSectionError, NoOptionError, ConfigParserGeneralError) as err:
        # lazy %-style logging args; the original
        # ("...default value." % err.message) raised TypeError (the format
        # string had no placeholder) and err.message does not exist in py3
        logger.info(
            "Configuration parameter didn't exist, returning the default value: %s",
            err)
        return default
    logger.debug("Read configuration parameter: (section=%s) %s=%s" %
                 (section, option, value))
    return value
def load_from_json(self, json_file):
    """
    The standard method for loading data from an existing json dict.

    Copies every key except the identity keys ("stream_token" and
    "template_id"), which must stay unique to this instance.

    :param json_file: the json dict containing data to be loaded into our DStream
    :type json_file: dict
    """
    protected = ("stream_token", "template_id")
    for key, value in json_file.items():
        if key in protected:
            continue
        self[key] = value
        logger.debug("added key %s" % (key))
def apply_dparam_rules(self):
    """Apply every derived-parameter rule to this stream's measures."""
    logger.debug("deriving parameters")
    self["derived_measures"] = {}
    rule_keys = ("partition_list", "measure_list", "transform_type",
                 "transform_name", "param_dict", "logical_comparison")
    for rule in self["dparam_rules"]:
        # pass the rule's fields positionally, in the order apply_transform expects
        self.apply_transform(*(rule[key] for key in rule_keys))
def apply_filters(self):
    """Apply every configured filter rule to this stream's measures."""
    logger.debug("applying filters")
    self["filter_measures"] = {}
    rule_keys = ("partition_list", "measure_list", "transform_type",
                 "transform_name", "param_dict", "logical_comparison")
    for rule in self["filters"]:
        # pass the rule's fields positionally, in the order apply_transform expects
        self.apply_transform(*(rule[key] for key in rule_keys))
def cumsum(data_array, offset=0):
    """
    Calculate the cumulative sum of a vector

    :param data_array: data to be summed
    :type data_array: numpy array
    :param offset: starting value for sum
    :type offset: float
    :return: the cumulative sum of the data_array
    :rtype: numpy array
    """
    logger.debug("cumsum")
    running_total = np.cumsum(data_array)
    return running_total + offset
def produce(self, dmsg):
    """
    Produce a single message, partition-keyed by the running message count.

    :param dmsg: message payload to produce
    """
    tk['Producer.produce'].start()
    # partition key is the message count, encoded to bytes
    bcount = str(self.count).encode()
    tk['Producer.produce : self.producer.produce'].start()
    self.producer.produce(dmsg, partition_key=bcount)
    tk['Producer.produce : self.producer.produce'].stop()
    logger.debug("Just produced a message")
    self.count += 1
    tk['Producer.produce'].stop()
def _post_events(event_data):
    """
    Sends post request containing event data to API

    :param event_data: event data (individual event)
    :type event_data: dict
    :return: dict with the HTTP status code under 'request_status'
    :rtype: dict
    """
    # server host/port come from the module-level config mapping
    endpoint = 'http://{}:{}/new_event'.format(config['server_host'],
                                               config['server_port'])
    logger.debug(event_data)
    r = requests.post(endpoint, json=event_data)
    return {'request_status': r.status_code}
def _measure_df(self):
    """Aggregate per-dstream measure values into a pandas DataFrame.

    Builds one column per measure name (the list of each dstream's 'val'
    entry, in dstream order), then attaches timestamp, user_id, tags and
    fields columns before converting self["measures"] to a DataFrame.
    """
    logger.debug("aggregating into DataFrame")
    all_measures = [s["measures"] for s in self.dstreams]
    # one value-list per measure name; the template values are discarded
    self["measures"] = {
        m: [i[m]['val'] for i in all_measures]
        for m, v in self["measures"].items()
    }
    # assumes the other _aggregate_* steps already populated these keys -
    # NOTE(review): confirm this runs last in aggregate()
    self["measures"]["timestamp"] = self["timestamp"]
    for user_id, value in self["user_ids"].items():
        self["measures"][user_id] = value
    self["measures"]["tags"] = self["tags"]
    self["measures"]["fields"] = self["fields"]
    self["measures"] = pd.DataFrame.from_dict(self["measures"])
def euclidean_dist(position_array, window_len=1):
    """
    Function to calculate euclidean distance between consecutive samples
    in a positional vector

    :param position_array: input vector of positions
    :type position_array: N x 2 numpy array
    :param window_len: length of window for averaging output
    :type window_len: int
    :return: distances between consecutive points
    :rtype: (N-1) x 1 numpy array
    """
    logger.debug("calculating euclidean distance")
    # length of each displacement vector between consecutive samples
    step_vectors = np.diff(position_array, axis=0)
    distances = np.linalg.norm(step_vectors, axis=1)
    if window_len > 1:
        distances = window_data(distances, window_len)
    return distances
def logical_combine(array1, array2, combiner):
    """
    Function for creating elementwise AND or OR of two vectors

    :param array1: First array for combination
    :type array1: (n,1) numpy array
    :param array2: Second array for combination
    :type array2: (n,1) numpy array
    :param combiner: "AND" or "OR" specifying which method to combine
    :type combiner: str
    :return: Boolean array combining the two inputs
    :rtype: (n,1) boolean numpy array
    :raises KeyError: if combiner is neither "AND" nor "OR"
    """
    # fixed missing space in the log message ("with{combiner}")
    logger.debug(f"Combining with {combiner}")
    combining_func = {"AND": np.logical_and, "OR": np.logical_or}
    return combining_func[combiner](array1, array2)
def _post_dataframe(stream_token, dataframe):
    """
    Sends post request containing the measure DataFrame to the API's
    data_storage endpoint. (Docstring previously copy-pasted from
    _post_events and documented the wrong parameters.)

    :param stream_token: token identifying the stream the data belongs to
    :type stream_token: string
    :param dataframe: aggregated measure data to store
    :type dataframe: pandas.DataFrame
    :return: dict with the HTTP status code under 'request_status'
    :rtype: dict
    """
    # server host/port come from the module-level config mapping
    endpoint = 'http://{}:{}/data_storage'.format(config['server_host'],
                                                  config['server_port'])
    logger.debug(dataframe)
    # payload is a pickled (token, DataFrame) tuple; the receiving server
    # must unpickle it - pickle is only safe between trusted services
    r = requests.post(endpoint, data=pickle.dumps((stream_token, dataframe)))
    return {'request_status': r.status_code}