def annotate_events(self, df):
    """Predict event classes over *df* and return the detected events.

    Runs the feature-engineering and windowing pipeline, classifies each
    window with ``self.model``, stores the annotated frame on ``self.df``,
    and collapses the raw per-window predictions into event spans via
    ``self.find_true_events``.

    :param df: raw input DataFrame; assumed to contain a ``frameIndex``
        column (passed as ``index_col`` below) -- TODO confirm with caller.
    :returns: whatever ``self.find_true_events`` returns for the
        predicted labels.
    """
    window_size = self.window_size
    active_features = self.active_features
    og_cols = df.columns.tolist()

    df_feat, features = head_features.apply_feature_engineering(
        df,
        relevant_features=self.relevant_features,
    )
    df_feat.fillna(0, inplace=True)

    # Second return value (window feature names) is unused here; the
    # model was trained on self.active_features, so we predict on those.
    df_w, _ = head_features.generate_windows(
        df_feat,
        window=window_size,
        relevant_features=features,
    )
    # Cut off the tail end of the data (lots of null values)
    df_w = df_w.loc[0:(len(df_w) - window_size)]

    Y = self.model.predict(df_w[active_features])
    df_w['class'] = Y

    # Copy the original (non-feature) columns back so downstream
    # consumers see the raw data alongside the predictions.
    for c in og_cols:
        if c not in active_features:
            df_w[c] = df[c]
    self.df = df_w

    # NOTE(review): removed a leftover debug `print Y` statement here.
    # Y holds the raw per-window predictions; turn them into events.
    return self.find_true_events(df, Y.tolist(), index_col='frameIndex')
def generate_training_set(
    director,
    feature_generator=lambda df, cols: head_features.apply_feature_engineering(df, cols),
    k=4,
    window_size=10,
    relevant_features=None,
    verbose=True,
):
    """
    Given the directory of data files, cluster the event points and saves
    the results in the merged folder. It appends an extra `class` column
    to represent the labeling results.

    :param director: directory containing the input ``.csv`` files.
    :param feature_generator: callable ``(df, cols) -> (df, feature_names)``
        used to engineer features per file.
    :param k: number of clusters passed to ``cluster_training_signals``.
    :param window_size: window length for ``generate_windows``; the same
        number of tail rows is trimmed from each file.
    :param relevant_features: feature names forwarded to the feature
        generator; defaults to an empty list.
    :param verbose: when True, print per-file progress.
    :returns: ``(training DataFrame, list of active feature names)``.
    """
    # Avoid the shared-mutable-default pitfall: fresh list per call.
    if relevant_features is None:
        relevant_features = []

    training_data = pd.DataFrame()
    active_features = []
    for csv in os.listdir(director):
        # Original test was `csv.find(".csv") != -1`, i.e. substring
        # match anywhere in the name -- preserved deliberately.
        if ".csv" not in csv:
            continue
        fi_path = os.path.join(director, csv)
        df = pd.read_csv(fi_path)
        if verbose:
            print(fi_path)

        # Save to raw so the original data is kept. We are interested in
        # keeping the original data for these rows.
        df["noseX_raw"] = df["noseX"]
        df["noseY_raw"] = df["noseY"]

        # features
        df, active_features = feature_generator(df, relevant_features)
        df.fillna(0, inplace=True)
        df_w, active_features = head_features.generate_windows(
            df, window=window_size, relevant_features=active_features
        )
        # Drop the tail rows whose windows would run past the end of
        # the file (they are full of nulls).
        df_trimmed = df_w.loc[0:(len(df_w) - window_size)]

        # cluster individual files
        Y = cluster_training_signals(df_trimmed, active_features, k)
        df_trimmed["class"] = Y
        training_data = pd.concat([training_data, df_trimmed])

    df_w = training_data
    if verbose:
        print("Now clustering the data")
    # active_columns = get_active_features(df, ignore_columns)

    # cluster all together
    print("Number of data points clustered: %d" % len(df_w))
    print("Features used to cluster:\n")
    for c in active_features:
        print("\t%s" % c)
    return df_w, active_features