def main():
    """Score LDA topic counts for a notes file and print the Pareto front.

    Command-line arguments:
        notefile: CSV file of notes, read with ``pd.read_csv``.
        outfile:  path the full score table is written to as CSV.
        --thr:    threshold frequency, forwarded as ``nAbove``.
        --lb/--ub/--step: range and step of topic counts to explore.

    The score table comes from ``lu.compute_lda_scores``; its non-dominated
    rows with respect to columns 1 and 6 are printed, and the complete
    table (not only the front) is saved to ``outfile``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("notefile", help="raw notes file")
    # The original help text was a copy of the notefile help.
    parser.add_argument("outfile", help="output CSV file for the LDA scores")
    parser.add_argument("--thr", type=float, default=0.74,
                        help="threshold frequency")
    parser.add_argument("--lb", type=int, default=10,
                        help="lower bound on number of topics")
    parser.add_argument("--ub", type=int, default=100,
                        help="upper bound on number of topics")
    parser.add_argument("--step", type=int, default=5,
                        help="step size to explore")
    args = parser.parse_args()

    note_df = pd.read_csv(args.notefile)
    score_df = lu.compute_lda_scores(note_df, 'procNote', args.lb, args.ub,
                                     args.step, nAbove=args.thr)

    # Find the Pareto front, i.e. the points that are not dominated by others.
    front = pareto.eps_sort([list(score_df.itertuples(False))], [1, 6])
    # Index.get_values() was removed in pandas 1.0; the Index itself is a
    # valid ``columns`` argument and works on old and new pandas alike.
    print(pd.DataFrame(front, columns=score_df.columns))

    score_df.to_csv(args.outfile, index=False)
def pareto_front(self, step_name=None, nums=None, records=None):
    """Return the Pareto front of the rewards recorded for one step.

    :param step_name: only records whose ``step_name`` equals this value
        (and whose ``performance`` is not None) are considered.
    :param nums: optional cap on the number of selected records; it is only
        applied when there is more than one reward column and more
        candidates than ``nums``.
    :param records: records to search; defaults to ``self.all_records``.
    :return: ``(rewards, indices)`` as two lists; ``(None, None)`` when no
        record matches; ``([], [])`` when sorting raised an exception.
    """
    candidates = self.all_records if records is None else records

    # Keep matching records and normalise every reward to a list.
    reward_rows = [
        rec.rewards if isinstance(rec.rewards, list) else [rec.rewards]
        for rec in candidates
        if rec.step_name == step_name and rec.performance is not None
    ]
    if not reward_rows:
        return None, None

    try:
        fitness = np.array(reward_rows)
        # There must be more candidates than nums, otherwise the selection
        # routine would loop forever.
        cap_selection = (fitness.shape[1] != 1 and nums is not None
                         and len(reward_rows) > nums)
        if cap_selection:
            _, front, chosen = SortAndSelectPopulation(fitness.T, nums)
        else:
            sorted_rows = pareto.eps_sort(
                fitness, maximize_all=True, attribution=True)
            # The last two columns are stripped as attribution data; the
            # final column is used as the index of the originating row.
            front = np.array(sorted_rows)[:, :-2]
            chosen = np.array(sorted_rows)[:, -1].astype(np.int32)
        return front.tolist(), chosen.tolist()
    except Exception as ex:
        logging.error('No pareto_front_records found, ex=%s', ex)
        return [], []
def best_config(self):
    """Get current config list located in pareto front.

    Before anything has been proposed (``self.total_propose == 0``) one
    configuration is drawn uniformly at random from ``self.config_list``
    and returned with a score of negative infinity. Otherwise the
    non-dominated rows of ``self.sieve_board`` (rows containing NaN are
    dropped first) are returned.

    :return: list of dict. Random case:
        ``{'config_id': int, 'score': float, 'configs': dict}``.
        Pareto case: ``'config_id'`` and ``'configs'`` come from column 1
        of each row, and every column from index 3 onwards is stored under
        its ``self.sieve_columns`` name.
    :raises ValueError: if nothing has been proposed yet and
        ``self.config_list`` is empty.
    """
    if self.total_propose == 0:
        # randrange excludes its upper bound. The previous
        # randint(0, len(...)) could return len(config_list) itself and
        # then fail with IndexError on the lookup below.
        idx = random.randrange(len(self.config_list))
        return [{'config_id': idx,
                 'score': -1 * float('inf'),
                 'configs': self.config_list[idx]}]

    pareto_board = self.sieve_board.copy().dropna()
    nondominated = pareto.eps_sort(
        [list(pareto_board.itertuples(False))],
        objectives=self.pareto_cols,
        epsilons=None,
        maximize=self.max_object_ids)

    pareto_list = []
    for row in nondominated:
        result = {}
        for i, value in enumerate(row):
            if i == 1:
                result['config_id'] = value
                result['configs'] = self.config_list[int(value)]
            elif i >= 3:
                result[self.sieve_columns[i]] = value
        pareto_list.append(result)
    return pareto_list
def pareto_frontier(df: pd.DataFrame, colx: str, coly: str) -> pd.DataFrame:
    """Return the rows of *df* that are non-dominated in *colx* and *coly*.

    Both columns are treated as objectives to maximise. The result keeps
    all columns of *df* and is sorted by *colx*.

    Raises:
        ValueError: if *colx* or *coly* is not a column of *df*.
    """
    column_names = list(df.columns)
    # eps_sort addresses objectives by position, not by label.
    objective_ids = [column_names.index(colx), column_names.index(coly)]
    table = list(df.itertuples(False))
    front = eps_sort([table], objectives=objective_ids,
                     maximize=list(objective_ids))
    front_df = pd.DataFrame(front, columns=df.columns)
    return front_df.sort_values([colx])
def approximate_pareto(y, epsilons=None, margin=0):
    """Approximate the Pareto set of the rows of ``y`` (all maximised).

    Uses pareto.py from https://github.com/matthewjwoodruff/pareto.py
    Returns the same data as prpt.

    :param y: 2-D array; ``y.shape[1]`` is used as the number of objectives.
    :param epsilons: forwarded unchanged to ``pareto.eps_sort``.
    :param margin: when positive, the selection is recomputed as the rows
        that, for every non-dominated solution, reach at least
        ``solution - margin * (solution - column minimum)`` in some column.
    :return: ``(pareto_solutions, pareto_idx, pareto_scores)`` where the
        scores come from ``compute_scores(y, pareto_solutions)``.
    """
    n_objectives = y.shape[1]
    sorted_rows = np.array(
        pareto.eps_sort(y, epsilons=epsilons, maximize_all=True,
                        attribution=True))
    pareto_solutions = sorted_rows[:, :n_objectives]
    # With attribution two extra columns follow the objectives; the second
    # of them is used as the index of the source row in y.
    pareto_idx = sorted_rows[:, n_objectives + 1].astype(int)

    if margin > 0:
        # Per-solution, per-objective slack proportional to the distance
        # from the column-wise minimum of y.
        slacks = (pareto_solutions - np.min(y, axis=0)) * margin
        pareto_idx = range(y.shape[0])
        for solution, slack in zip(pareto_solutions, slacks):
            reaching_rows = np.where(np.any(y >= solution - slack, axis=1))[0]
            pareto_idx = np.intersect1d(pareto_idx, reaching_rows,
                                        assume_unique=True)
        pareto_solutions = y[pareto_idx, :]

    pareto_scores = compute_scores(y, pareto_solutions)
    return pareto_solutions, pareto_idx, pareto_scores
def _min_max(study): """ Multi-objective function to find best trial index with minimum deviation and max correlation :param study: Optuna study :return: """ # Iterate pareto-front trials storing mean correlation and std dev df = [] for trial in study.best_trials: df.append([trial.number, np.mean(trial.values), np.std(trial.values)]) # Sort dataframe ascending by mean correlation df = pd.DataFrame(df).sort_values(by=2, ascending=True) # Sort df with best trial in first row if len(df) > 1 and len(df.iloc[:, 1:3].drop_duplicates()) > 1: # Create second pareto to maximize correlation and minimize stddev # Epsilons define precision, ie dominance over other candidates # Dominance is defined as x percent of stddev of stddev try: nd = pareto.eps_sort([list(df.itertuples(False))], objectives=[1, 2], epsilons=[1e-09, np.std(df[1])*.5], maximize=[1]) except: # Something went wrong, return df nd = df # Sort remaining candidates nd = pd.DataFrame(nd).sort_values(by=2, ascending=True) # Only 1st trial so return it else: nd = df # Return "best" trial index return nd.iloc[0, 0]
def get_pareto_front(self):
    """Propose the Pareto front from the board.

    Rows of ``self.sieve_board`` containing NaN are dropped, then the
    epsilon-nondominated rows are computed over ``self.pareto_cols``
    (maximising the columns in ``self.max_object_ids``).

    :return: mapping from the first column of every non-dominated row to a
        deep copy of its third column; empty when the board has no
        complete rows.
    :rtype: dict
    """
    pareto_dict = {}
    pareto_board = self.sieve_board.copy().dropna()
    if pareto_board.empty:
        return pareto_dict

    nondominated = pareto.eps_sort(
        [list(pareto_board.itertuples(False))],
        objectives=self.pareto_cols,
        epsilons=None,
        maximize=self.max_object_ids)
    # The earlier version also collected the third column into a list that
    # was never read; that dead work has been removed.
    for row in nondominated:
        if len(row) > 2:
            pareto_dict[row[0]] = copy.deepcopy(row[2])
    return pareto_dict
def plot_nondominated_sets(df,
                           mdf,
                           x_axis=None,
                           y_axis=None,
                           z_axis=None,
                           hoverlabel=None,
                           output='plot'):
    """
    Takes in parameters and returns a figure object to be plotted by plotly

    With ``output='table'`` the non-dominated rows over every objective
    listed in ``mdf`` are returned as a dataframe instead of a figure.

    :param df: Input objectives table as a pandas dataframe; in plot mode it
        must contain a ``row_index`` column, which is used to flag the
        non-dominated rows
    :param mdf: Input metadata file as a pandas dataframe with the columns
        ``Col_Name``, ``Epsilon`` (NaN means 1e-9) and ``Max_Min``
        (``'Max'`` marks an objective to maximize)
    :param x_axis: Objective name to be plotted on the X-axis [STRING]
    :param y_axis: Objective name to be plotted on the Y-axis [STRING]
    :param z_axis: Objective name to be plotted on the Z-axis [STRING]
    :param hoverlabel: Objective name to be used to tag all the points [STRING]
    :param output: specifies the output of the function [STRING]
    :return: Plotly Figure object [use plotly.offline.plot() ], or a
        dataframe when ``output == 'table'``
    :raises Exception: in plot mode when fewer than two axes are supplied
    """
    default_epsilon = 1e-9

    if output == 'table':
        objective_cols = []
        eps_vals = []
        max_col_ids = []
        for name, eps, direction in zip(mdf['Col_Name'], mdf['Epsilon'],
                                        mdf['Max_Min']):
            if name not in df:
                # Skip the epsilon together with the objective, so both
                # lists stay positionally aligned for eps_sort.
                continue
            col_id = df.columns.get_loc(name)  # col-name -> col-id
            objective_cols.append(col_id)
            eps_vals.append(default_epsilon if pd.isna(eps) else eps)
            if direction == 'Max':
                max_col_ids.append(col_id)

        nondominated_set_multi = pareto.eps_sort(
            [list(df.itertuples(False))],
            objectives=objective_cols,
            maximize=max_col_ids,
            epsilons=eps_vals)
        nondominated_df_multi = pd.DataFrame(nondominated_set_multi)
        nondominated_df_multi.columns = df.columns
        return nondominated_df_multi

    # Validate before doing any work; previously this was only reached
    # after the sort and merge had already run (or failed).
    if not (x_axis and y_axis):
        raise Exception('You must supply at least two axes')

    plot_list = [x_axis, y_axis, z_axis] if z_axis else [x_axis, y_axis]
    plot_mdf = mdf.loc[mdf['Col_Name'].isin(plot_list)]
    eps_by_name = {
        name: (default_epsilon if pd.isna(eps) else eps)
        for name, eps in zip(plot_mdf['Col_Name'], plot_mdf['Epsilon'])
    }
    max_names = set(plot_mdf.loc[plot_mdf['Max_Min'] == 'Max', 'Col_Name'])

    plotted = [c for c in plot_list if c in df]
    plot_col_ids = [df.columns.get_loc(c) for c in plotted]
    plot_max_cols = [df.columns.get_loc(c) for c in plotted
                     if c in max_names]
    # Epsilons follow the axis order of the objectives, not the row order
    # of the metadata table, which may differ.
    col_eps = [eps_by_name.get(c, default_epsilon) for c in plotted]

    nondominated_set = pareto.eps_sort([list(df.itertuples(False))],
                                       objectives=plot_col_ids,
                                       maximize=plot_max_cols,
                                       epsilons=col_eps)
    nondominated_df = pd.DataFrame(nondominated_set)
    nondominated_df.columns = df.columns

    # A left merge with an indicator marks which rows are on the front.
    full_df = pd.merge(df,
                       nondominated_df['row_index'],
                       how='left',
                       indicator='Optimal',
                       on='row_index')
    # Assign the relabelled column back instead of a chained in-place
    # replace, which does not update the frame under copy-on-write.
    full_df['Optimal'] = full_df['Optimal'].astype(str).replace(
        {'both': 'True', 'left_only': 'False'})

    color_map = {"True": "#23B58A", 'False': "#B5234E"}

    if z_axis:
        fig = px.scatter_3d(full_df,
                            x=x_axis,
                            y=y_axis,
                            z=z_axis,
                            color='Optimal',
                            hover_name=hoverlabel,
                            color_discrete_map=color_map)
        fig.data[0].marker.size = 12
        fig.data[0].marker.line.width = 2
        fig.data[0].marker.line.color = "black"
        fig.update_layout(scene=dict(xaxis_title=x_axis,
                                     yaxis_title=y_axis,
                                     zaxis_title=z_axis),
                          height=545,
                          margin=dict(r=0, b=0, l=0))
        return fig

    # 2-D: connect the non-dominated points, ordered by their y value.
    nd_points = sorted(zip(nondominated_df[x_axis].values,
                           nondominated_df[y_axis].values),
                       key=lambda point: point[1])
    line_x, line_y = map(list, zip(*nd_points))

    fig = px.scatter(full_df,
                     x=x_axis,
                     y=y_axis,
                     color='Optimal',
                     hover_name=hoverlabel,
                     color_discrete_map=color_map)
    fig.add_trace(
        go.Scatter(x=line_x,
                   y=line_y,
                   mode="lines",
                   line=go.scatter.Line(shape='linear', color='#3D9970'),
                   showlegend=False))
    fig.data[0].marker.size = 12
    fig.data[0].marker.line.width = 2
    fig.data[0].marker.line.color = "black"
    fig.update_layout(
        height=545,
        title=go.layout.Title(
            xref="paper",
            x=0,
        ),
        xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(
            text=x_axis, font=dict(size=18, color="#7f7f7f"))),
        yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(
            text=y_axis, font=dict(size=18, color="#7f7f7f"))))
    return fig
# Script fragment: draws a 3x3 grid of scatter plots, one per epsilon
# resolution, with the full point cloud in grey behind the points kept by
# the epsilon sort at that resolution.
# NOTE(review): this snippet is truncated in the source, the final
# ax.scatter(...) call is cut off mid-argument list.

# Space-separated file without a header row, so the two columns are
# addressed by position (0 and 1). The path is hard-coded to one machine.
data = pandas.read_csv("/Users/rshu/Downloads/pareto.py-picture/data.txt", header=None, sep=" ")
sets = {}      # resolution -> DataFrame built from that resolution's archive
archives = {}  # resolution -> raw result of pareto.eps_sort
fig = matplotlib.figure.Figure(figsize=(15, 15))
# Attach a canvas to the figure (presumably the Agg backend, confirm what
# `agg` is bound to in the imports).
agg.FigureCanvasAgg(fig)
counter = 0
resolutions = [1e-9, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.4, 1.0]
for resolution in resolutions:
    # Objectives are columns 0 and 1; the same epsilon is used for both.
    archives[resolution] = pareto.eps_sort([data.itertuples(False)], [0, 1], [resolution] * 2)
    # NOTE(review): `.archive` is read from the eps_sort result, which
    # assumes a pareto.py variant returning an archive object rather than a
    # plain list of rows. Confirm against the installed version.
    sets[resolution] = pandas.DataFrame(data=archives[resolution].archive)
# Disabled debugging output, kept from the original:
# print("debugging")
# print(sets[0.03])
#
# for idx, row in sets[0.03].iterrows():
#     print((idx, row[0], row[1]))
for resolution in resolutions:
    # Subplot positions are 1-based, hence the increment before use.
    counter += 1
    ax = fig.add_subplot(3, 3, counter)
    # Entire data set in light grey, drawn behind everything else.
    ax.scatter(data[0], data[1], lw=0, facecolor=(0.7, 0.7, 0.7), zorder=-1)
    for idx, row in sets[resolution].iterrows():
        ax.scatter(row[0], row[1],
def pr_plot(opensubs18_csv, eurosense_csv=None, out=None):
    """Draw a precision/recall scatter plot on broken axes.

    Rows of ``opensubs18_csv`` are typed ``"stiff-nondom"`` when they are
    non-dominated in (precision, recall), both maximised, and ``"stiff"``
    otherwise. Rows of the optional ``eurosense_csv`` are typed
    ``"eurosense"``. Rows named ``"N"`` are dropped. Every remaining row is
    labelled with its ``name``, and a guide line through the origin is drawn
    for each of the rows named ``"EC"`` and ``"EP"``.

    :param opensubs18_csv: CSV with ``name``, ``precision`` and ``recall``
        columns.
    :param eurosense_csv: optional CSV with the same columns.
    :param out: path to save the figure to; the figure is shown
        interactively when None.
    """
    import matplotlib.pyplot as plt
    from adjustText import adjust_text
    import pareto
    from brokenaxes import brokenaxes
    import matplotlib as mpl

    mpl.rcParams.update({
        "font.family": "serif",
        "font.serif": [],
        "font.sans-serif": []
    })

    opensubs18_df = pd.read_csv(opensubs18_csv)
    nondominated = pareto.eps_sort(
        opensubs18_df[["precision", "recall"]],
        [0, 1],
        maximize_all=True,
        attribution=True,
    )
    # With attribution the last element of each row is the source row
    # number. A set keeps the membership test below O(1) per row.
    nondominated_idxs = {nd[-1] for nd in nondominated}
    opensubs18_df["type"] = pd.Series([
        "stiff-nondom" if idx in nondominated_idxs else "stiff"
        for idx in opensubs18_df.index
    ])

    type_markers = {"stiff": ("o", "k"), "stiff-nondom": ("D", "b")}
    if eurosense_csv is not None:
        type_markers["eurosense"] = ("s", "y")
        eurosense_df = pd.read_csv(eurosense_csv)
        eurosense_df["type"] = "eurosense"
        eurosense_df["name"] = eurosense_df["name"].str.replace(
            "high", "eurosense")
        df = pd.concat([opensubs18_df, eurosense_df], ignore_index=True)
    else:
        df = opensubs18_df
    df = df.drop(df[df.name == "N"].index)
    print(df)

    fig = plt.gcf()
    fig.set_size_inches(645.0 / INCH_PTS, 441.0 / INCH_PTS)
    bax = brokenaxes(
        xlims=((0, 0.01), (0.24, 0.51), (0.69, 0.81), (0.99, 1)),
        ylims=((0, 0.21), (0.39, 0.43), (0.99, 1)),
        hspace=0.05,
        wspace=0.05,
    )

    # ``kind`` rather than ``type`` so the builtin is not shadowed.
    for kind, (marker, colour) in type_markers.items():
        group_df = df[df["type"] == kind]
        bax.scatter(x=group_df["precision"],
                    y=group_df["recall"],
                    marker=marker,
                    c=colour,
                    s=10)

    texts = []
    for _, row in df.iterrows():
        # Choose the sub-axes that should carry the label.
        # NOTE(review): indices 10, 5 and 9 are tied to the xlims/ylims
        # grid above. Confirm them if the limits change.
        if row["precision"] > 0.6:
            axnum = 10
        elif row["recall"] > 0.3:
            axnum = 5
        else:
            axnum = 9
        texts.append(bax.axs[axnum].text(row["precision"],
                                         row["recall"],
                                         row["name"],
                                         ha="center",
                                         va="center"))

    x = numpy.linspace(0, 0.6, 100)
    for eurosense in ("EC", "EP"):
        row = df[df.name == eurosense]
        # .item() requires exactly one row with this name.
        p = row.precision.item()
        r = row.recall.item()
        assert p > r
        # Line through the origin with slope recall / precision.
        bax.plot(x, x * r / p, zorder=-1, color="#ffdddd")

    adjust_text(texts)
    bax.set_xlabel("Precision", labelpad=0)
    bax.set_ylabel("Recall", labelpad=40)

    if out is not None:
        plt.savefig(out, bbox_inches="tight")
    else:
        plt.show()
def pr_plot(opensubs18_csv, eurosense_csv=None, out=None):
    """Draw a precision/recall scatter plot on broken axes.

    Rows of ``opensubs18_csv`` are typed ``"stiff-nondom"`` when they are
    non-dominated in (precision, recall), both maximised, and ``"stiff"``
    otherwise. Rows of the optional ``eurosense_csv`` are typed
    ``"eurosense"``. Every row is labelled with its ``name``.

    :param opensubs18_csv: CSV with ``name``, ``precision`` and ``recall``
        columns.
    :param eurosense_csv: optional CSV with the same columns.
    :param out: path to save the figure to; the figure is shown
        interactively when None.
    """
    import matplotlib.pyplot as plt
    from adjustText import adjust_text
    import pareto
    from brokenaxes import brokenaxes
    import matplotlib as mpl

    mpl.rcParams.update(
        {"font.family": "serif", "font.serif": [], "font.sans-serif": []}
    )

    opensubs18_df = pd.read_csv(opensubs18_csv)
    nondominated = pareto.eps_sort(
        opensubs18_df[["precision", "recall"]],
        [0, 1],
        maximize_all=True,
        attribution=True,
    )
    # With attribution the last element of each row is the source row
    # number. A set keeps the membership test below O(1) per row.
    nondominated_idxs = {nd[-1] for nd in nondominated}
    opensubs18_df["type"] = pd.Series(
        [
            "stiff-nondom" if idx in nondominated_idxs else "stiff"
            for idx in opensubs18_df.index
        ]
    )

    type_markers = {"stiff": ("o", "k"), "stiff-nondom": ("D", "b")}
    if eurosense_csv is not None:
        type_markers["eurosense"] = ("s", "y")
        eurosense_df = pd.read_csv(eurosense_csv)
        eurosense_df["type"] = "eurosense"
        eurosense_df["name"] = eurosense_df["name"].str.replace(
            "high", "eurosense")
        df = pd.concat([opensubs18_df, eurosense_df], ignore_index=True)
    else:
        df = opensubs18_df
    print(df)

    fig = plt.gcf()
    fig.set_size_inches(645.0 / INCH_PTS, 441.0 / INCH_PTS)
    bax = brokenaxes(
        xlims=((0, 0.01), (0.29, 1)),
        ylims=((0, 0.51), (0.99, 1)),
        hspace=0.05,
        wspace=0.05,
    )

    # ``kind`` rather than ``type`` so the builtin is not shadowed.
    for kind, (marker, colour) in type_markers.items():
        group_df = df[df["type"] == kind]
        bax.scatter(
            x=group_df["precision"],
            y=group_df["recall"],
            marker=marker,
            c=colour,
            s=10,
        )

    # All labels are attached to the sub-axes at index 3.
    # NOTE(review): that index is tied to the xlims/ylims grid above.
    label_ax = bax.axs[3]
    texts = [
        label_ax.text(
            row["precision"], row["recall"], row["name"],
            ha="center", va="center",
        )
        for _, row in df.iterrows()
    ]
    adjust_text(texts)

    bax.set_xlabel("Precision")
    bax.set_ylabel("Recall")

    if out is not None:
        plt.savefig(out, bbox_inches="tight")
    else:
        plt.show()