Пример #1
0
def main():
    """Score a range of LDA topic counts and report the non-dominated ones.

    Command-line arguments:
        notefile: CSV file that is loaded with ``pd.read_csv``.
        outfile:  path the full score table is written to as CSV.
        --thr:    value forwarded to ``lu.compute_lda_scores`` as ``nAbove``.
        --lb, --ub, --step: bounds and step size forwarded to
                  ``lu.compute_lda_scores``.

    The non-dominated rows (per ``pareto.eps_sort``) are printed; the
    complete score table is written to ``outfile``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("notefile", help="raw notes file")
    # The original help text was a copy of the one for ``notefile``.
    parser.add_argument("outfile", help="CSV file to write the scores to")
    parser.add_argument("--thr",
                        type=float,
                        default=0.74,
                        help="threshold frequency")
    parser.add_argument("--lb",
                        type=int,
                        default=10,
                        help="lower bound on number of topics")
    parser.add_argument("--ub",
                        type=int,
                        default=100,
                        help="upper bound on number of topics")
    parser.add_argument("--step",
                        type=int,
                        default=5,
                        help="step size to explore")

    args = parser.parse_args()
    aboveThr = args.thr
    progNoteDF = pd.read_csv(args.notefile)
    progScoreDF = lu.compute_lda_scores(progNoteDF,
                                        'procNote',
                                        args.lb,
                                        args.ub,
                                        args.step,
                                        nAbove=aboveThr)
    # Find the pareto front, i.e. the points that are not dominated by others.
    # Columns 1 and 6 of the score table are the objectives.
    # NOTE(review): which scores those positions hold depends on
    # lu.compute_lda_scores -- confirm against that function.
    aggPF = pareto.eps_sort([list(progScoreDF.itertuples(False))], [1, 6])
    # Index.get_values() no longer exists in pandas >= 1.0; the column index
    # itself is accepted by the DataFrame constructor.
    print(pd.DataFrame(aggPF, columns=progScoreDF.columns))
    progScoreDF.to_csv(args.outfile, index=False)
Пример #2
0
 def pareto_front(self, step_name=None, nums=None, records=None):
     """Compute the Pareto front of a set of records.

     :param step_name: when ``records`` is omitted, only stored records
         with this ``step_name`` and a non-None ``performance`` are used.
     :param nums: if given, and the rewards have more than one column and
         there are more than ``nums`` records, selection is delegated to
         ``SortAndSelectPopulation``.
     :param records: records to evaluate; defaults to ``self.all_records``
         filtered as described above.
     :return: ``(None, None)`` when there is nothing to sort, ``([], [])``
         when sorting raised, otherwise ``(rewards, indices)`` as lists.
     """
     if records is None:
         records = [
             rec for rec in self.all_records
             if rec.step_name == step_name and rec.performance is not None
         ]

     # Every reward becomes a row; scalar rewards are wrapped in a list.
     in_pareto = []
     for rec in records:
         if isinstance(rec.rewards, list):
             in_pareto.append(rec.rewards)
         else:
             in_pareto.append([rec.rewards])

     if len(in_pareto) == 0:
         return None, None

     try:
         fitness = np.array(in_pareto)
         multi_objective = fitness.shape[1] != 1
         if multi_objective and nums is not None and len(in_pareto) > nums:
             # There must be more rows than nums, otherwise the selection
             # would loop forever.
             _, res, selected = SortAndSelectPopulation(fitness.T, nums)
         else:
             outs = np.array(
                 pareto.eps_sort(fitness,
                                 maximize_all=True,
                                 attribution=True))
             # The last two columns are dropped from the rewards; the final
             # column is used as the index of each selected row.
             res = outs[:, :-2]
             selected = outs[:, -1].astype(np.int32)
         return res.tolist(), selected.tolist()
     except Exception as ex:
         logging.error('No pareto_front_records found, ex=%s', ex)
         return [], []
Пример #3
0
    def best_config(self):
        """Get current config list located in pareto front.

        Before anything has been proposed (``self.total_propose == 0``) a
        single randomly chosen config is returned with a score of ``-inf``.
        Otherwise the rows of ``self.sieve_board`` without NaN values are
        epsilon-sorted and every non-dominated row is converted to a dict.

        :return:  list of dict
            {'config_id': int,
                'score': float,
                'configs': dict}
            config_id, score, and configs of the current best config.
            For Pareto rows the dict holds ``config_id``/``configs`` (from
            field 1 of the row) plus one entry per field from index 3 on,
            keyed by ``self.sieve_columns``.
        :raises ValueError: if nothing was proposed yet and
            ``self.config_list`` is empty.
        """
        if self.total_propose == 0:
            # randrange() excludes the upper bound. The previous
            # randint(0, len(...)) included it and could index one past the
            # end of config_list.
            idx = random.randrange(len(self.config_list))
            result = {'config_id': idx,
                      'score': -1 * float('inf'),
                      'configs': self.config_list[idx]}
            return [result]

        pareto_board = self.sieve_board.copy()
        pareto_board = pareto_board.dropna()
        nondominated = pareto.eps_sort([list(pareto_board.itertuples(False))],
                                       objectives=self.pareto_cols,
                                       epsilons=None,
                                       maximize=self.max_object_ids)
        pareto_list = []
        for tmp_list in nondominated:
            result = {}
            for i, value in enumerate(tmp_list):
                if i == 1:
                    # Field 1 is the config id; it also indexes config_list.
                    result['config_id'] = value
                    result['configs'] = self.config_list[int(value)]
                elif i >= 3:
                    result[self.sieve_columns[i]] = value
            pareto_list.append(result)
        return pareto_list
Пример #4
0
def pareto_frontier(df: pd.DataFrame, colx: str, coly: str) -> pd.DataFrame:
    """Return the rows of ``df`` that are non-dominated in two columns.

    Both ``colx`` and ``coly`` are treated as objectives to maximise. The
    result keeps all columns of ``df`` and is sorted by ``colx``.

    :raises ValueError: if ``colx`` or ``coly`` is not a column of ``df``.
    """
    column_names = list(df.columns)
    x_pos = column_names.index(colx)
    y_pos = column_names.index(coly)

    records = list(df.itertuples(False))
    front = eps_sort([records],
                     objectives=[x_pos, y_pos],
                     maximize=[x_pos, y_pos])

    front_df = pd.DataFrame(front, columns=df.columns)
    return front_df.sort_values([colx])
Пример #5
0
def approximate_pareto(y, epsilons=None, margin=0):
    """
    Epsilon-nondominated sort of ``y`` with every column maximised.

    Uses pareto.py from https://github.com/matthewjwoodruff/pareto.py
    Returns the same data as prpt.

    :param y: 2-D array; ``y.shape[1]`` columns are taken as objectives.
    :param epsilons: forwarded unchanged to ``pareto.eps_sort``.
    :param margin: when positive, the returned set is replaced by every row
        of ``y`` that, for each sorted solution, reaches that solution minus
        ``margin * (solution - column minimum)`` in at least one column.
    :return: ``(pareto_solutions, pareto_idx, pareto_scores)`` where the
        scores come from ``compute_scores(y, pareto_solutions)``.
    """
    sorted_rows = pareto.eps_sort(y,
                                  epsilons=epsilons,
                                  maximize_all=True,
                                  attribution=True)
    tagalongs = np.array(sorted_rows)
    n_objectives = y.shape[1]

    pareto_solutions = tagalongs[:, :n_objectives]
    # Of the two attribution columns that follow the objectives, the second
    # one is used as the row index into y.
    pareto_idx = tagalongs[:, n_objectives + 1].astype(int)

    if margin > 0:
        column_minimum = np.min(y, axis=0)
        tolerances = (pareto_solutions - column_minimum) * margin
        candidates = range(y.shape[0])
        for solution, tolerance in zip(pareto_solutions, tolerances):
            near = np.where(np.any(y >= solution - tolerance, axis=1))[0]
            candidates = np.intersect1d(candidates, near, assume_unique=True)
        pareto_idx = candidates
        pareto_solutions = y[pareto_idx, :]

    pareto_scores = compute_scores(y, pareto_solutions)
    return pareto_solutions, pareto_idx, pareto_scores
Пример #6
0
def _min_max(study):
    """
    Multi-objective function to find best trial index with minimum deviation and max correlation

    Each trial in ``study.best_trials`` is summarised by the mean and the
    standard deviation of its ``values``. When there is more than one
    distinct (mean, std) pair, a second epsilon-nondominated sort keeps the
    candidates that maximise the mean and minimise the standard deviation.
    The surviving candidate with the smallest standard deviation wins.

    :param study: Optuna study
    :return: ``number`` of the selected trial
    """

    # Iterate pareto-front trials storing trial number, mean and std dev
    df = []
    for trial in study.best_trials:
        df.append([trial.number, np.mean(trial.values), np.std(trial.values)])

    # Sort ascending by standard deviation (column 2); column 1 is the mean
    df = pd.DataFrame(df).sort_values(by=2, ascending=True)

    # Sort df with best trial in first row
    if len(df) > 1 and len(df.iloc[:, 1:3].drop_duplicates()) > 1:

        # Create second pareto to maximize correlation and minimize stddev
        # Epsilons define precision, ie dominance over other candidates:
        # 1e-09 for the mean, half of the std of the mean column for stddev
        try:
            nd = pareto.eps_sort([list(df.itertuples(False))], objectives=[1, 2],
                epsilons=[1e-09, np.std(df[1])*.5], maximize=[1])
        except Exception:
            # Best effort: fall back to all candidates. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            nd = df

        # Sort remaining candidates by standard deviation
        nd = pd.DataFrame(nd).sort_values(by=2, ascending=True)

    # Only one distinct candidate, so return it
    else:
        nd = df

    # Return "best" trial index
    return nd.iloc[0, 0]
Пример #7
0
    def get_pareto_front(self):
        """Propose the Pareto front from the board.

        Rows of ``self.sieve_board`` that contain NaN are dropped, the rest
        is epsilon-sorted on ``self.pareto_cols`` (maximising the columns in
        ``self.max_object_ids``).

        :return: mapping from the first field of every non-dominated row to
            a deep copy of its third field; empty when the board has no
            complete rows.
        :rtype: dict
        """
        pareto_board = self.sieve_board.copy().dropna()
        if pareto_board.empty:
            return {}
        nondominated = pareto.eps_sort(
            [list(pareto_board.itertuples(False))],
            objectives=self.pareto_cols,
            epsilons=None,
            maximize=self.max_object_ids)
        # Rows with fewer than three fields are skipped. The list that used
        # to be filled alongside the dict was never read and has been removed.
        return {
            row[0]: copy.deepcopy(row[2])
            for row in nondominated
            if len(row) > 2
        }
Пример #8
0
def _objective_spec(df, mdf, names, default_epsilon=1e-9):
    """Return aligned (column ids, maximised column ids, epsilons) for names.

    Names that are not columns of ``df`` are skipped in all three lists, so
    the i-th epsilon always belongs to the i-th objective column. A missing
    metadata row or a NaN ``Epsilon`` yields ``default_epsilon``.
    """
    meta = {}
    for _, meta_row in mdf.iterrows():
        meta[meta_row['Col_Name']] = meta_row

    col_ids = []
    max_ids = []
    epsilons = []
    for name in names:
        if name not in df:
            continue
        col_id = df.columns.get_loc(name)
        col_ids.append(col_id)
        meta_row = meta.get(name)
        epsilon = default_epsilon if meta_row is None else meta_row['Epsilon']
        epsilons.append(default_epsilon if pd.isna(epsilon) else epsilon)
        if meta_row is not None and meta_row['Max_Min'] == 'Max':
            max_ids.append(col_id)
    return col_ids, max_ids, epsilons


def plot_nondominated_sets(df,
                           mdf,
                           x_axis=None,
                           y_axis=None,
                           z_axis=None,
                           hoverlabel=None,
                           output='plot'):
    """
    Takes in parameters and returns a figure object to be plotted by plotly

    With ``output='table'`` every objective listed in ``mdf`` is used and the
    non-dominated rows are returned as a dataframe instead of a figure.
    Otherwise only the plotted axes are used as objectives, and ``df`` must
    contain a ``row_index`` column, which is used to mark the non-dominated
    rows.

    :param df: Input objectives table as a pandas dataframe
    :param mdf: Input metadata file as a pandas dataframe with the columns
        ``Col_Name``, ``Epsilon`` and ``Max_Min``
    :param x_axis: Objective name to be plotted on the X-axis [STRING]
    :param y_axis: Objective name to be plotted on the Y-axis [STRING]
    :param z_axis: Objective name to be plotted on the Z-axis [STRING]
    :param hoverlabel: Objective name to be used to tag all the points [STRING]
    :param output: specifies the output of the function [STRING]
    :return: Plotly Figure object [use plotly.offline.plot() ], or a pandas
        dataframe when ``output == 'table'``
    :raises Exception: for plot output when ``x_axis`` or ``y_axis`` is
        missing
    """
    if output == 'table':
        # Columns, maximised columns and epsilons are filtered together so
        # they stay aligned even if a metadata row names a column that is
        # absent from df.
        objective_cols, max_col_ids, eps_vals = _objective_spec(
            df, mdf, list(mdf['Col_Name']))
        nondominated_set_multi = pareto.eps_sort([list(df.itertuples(False))],
                                                 objectives=objective_cols,
                                                 maximize=max_col_ids,
                                                 epsilons=eps_vals)
        nondominated_df_multi = pd.DataFrame(nondominated_set_multi)
        nondominated_df_multi.columns = df.columns
        return nondominated_df_multi

    # Validate up front; previously this error sat behind code that already
    # failed when an axis was missing.
    if not (x_axis and y_axis):
        raise Exception('You must supply at least two axes')

    plot_list = [x_axis, y_axis, z_axis] if z_axis else [x_axis, y_axis]
    # Epsilons are looked up per plotted axis, so they follow the order of
    # plot_list rather than the row order of mdf.
    plot_col_ids, plot_max_cols, col_eps = _objective_spec(df, mdf, plot_list)

    nondominated_set = pareto.eps_sort([list(df.itertuples(False))],
                                       objectives=plot_col_ids,
                                       maximize=plot_max_cols,
                                       epsilons=col_eps)
    nondominated_df = pd.DataFrame(nondominated_set)
    nondominated_df.columns = df.columns
    full_df = pd.merge(df,
                       nondominated_df['row_index'],
                       how='left',
                       indicator='Optimal',
                       on='row_index')
    # The merge indicator is categorical; relabel it with an assignment. A
    # chained ``full_df.Optimal.replace(..., inplace=True)`` does not modify
    # full_df under pandas Copy-on-Write.
    full_df['Optimal'] = full_df['Optimal'].cat.rename_categories(
        {'both': 'True', 'left_only': 'False'})

    color_map = {"True": "#23B58A", 'False': "#B5234E"}

    if z_axis:
        fig = px.scatter_3d(full_df,
                            x=x_axis,
                            y=y_axis,
                            z=z_axis,
                            color='Optimal',
                            hover_name=hoverlabel,
                            color_discrete_map=color_map)
        fig.data[0].marker.size = 12
        fig.data[0].marker.line.width = 2
        fig.data[0].marker.line.color = "black"
        fig.update_layout(scene=dict(xaxis_title=x_axis,
                                     yaxis_title=y_axis,
                                     zaxis_title=z_axis),
                          height=545,
                          margin=dict(r=0, b=0, l=0))
        return fig

    # 2-D plot: connect the non-dominated points, ordered by their y value.
    nd_points = sorted(zip(nondominated_df[x_axis].values,
                           nondominated_df[y_axis].values),
                       key=lambda point: point[1])
    line_x = [point[0] for point in nd_points]
    line_y = [point[1] for point in nd_points]

    fig = px.scatter(full_df,
                     x=x_axis,
                     y=y_axis,
                     color='Optimal',
                     hover_name=hoverlabel,
                     color_discrete_map=color_map)
    fig.add_trace(
        go.Scatter(x=line_x,
                   y=line_y,
                   mode="lines",
                   line=go.scatter.Line(shape='linear',
                                        color='#3D9970'),
                   showlegend=False))
    fig.data[0].marker.size = 12
    fig.data[0].marker.line.width = 2
    fig.data[0].marker.line.color = "black"
    fig.update_layout(
        height=545,
        title=go.layout.Title(
            xref="paper",
            x=0,
        ),
        xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(
            text=x_axis, font=dict(size=18, color="#7f7f7f"))),
        yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(
            text=y_axis, font=dict(size=18, color="#7f7f7f"))))
    return fig
Пример #9
0
# Load the space-separated, headerless sample data.
data = pandas.read_csv(
    "/Users/rshu/Downloads/pareto.py-picture/data.txt",
    sep=" ",
    header=None,
)

# 15x15 figure with a FigureCanvasAgg attached to it.
fig = matplotlib.figure.Figure(figsize=(15, 15))
agg.FigureCanvasAgg(fig)

# Running subplot position, advanced by the plotting loop further down.
counter = 0

# Epsilon resolutions to compare; the same value is used for both objectives.
resolutions = [1e-9, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.4, 1.0]

# Sort result and the dataframe built from it, both keyed by resolution.
archives = {}
sets = {}
for resolution in resolutions:
    epsilons = [resolution, resolution]
    archives[resolution] = pareto.eps_sort([data.itertuples(False)],
                                           [0, 1],
                                           epsilons)
    sets[resolution] = pandas.DataFrame(data=archives[resolution].archive)

# print("debugging")
# print(sets[0.03])
#
# for idx, row in sets[0.03].iterrows():
#      print((idx, row[0], row[1]))

for resolution in resolutions:
    counter += 1
    ax = fig.add_subplot(3, 3, counter)
    ax.scatter(data[0], data[1], lw=0, facecolor=(0.7, 0.7, 0.7), zorder=-1)
    for idx, row in sets[resolution].iterrows():
        ax.scatter(row[0],
                   row[1],
Пример #10
0
def pr_plot(opensubs18_csv, eurosense_csv=None, out=None):
    """Scatter precision against recall on broken axes with point labels.

    Rows of ``opensubs18_csv`` are tagged "stiff-nondom" when they are
    non-dominated in (precision, recall) and "stiff" otherwise. Rows of the
    optional ``eurosense_csv`` are tagged "eurosense". Rows named "N" are
    dropped. A guide line through the origin is drawn for the rows named
    "EC" and "EP", so both must be present exactly once.

    :param opensubs18_csv: CSV with ``precision``, ``recall`` and ``name``
        columns.
    :param eurosense_csv: optional second CSV with the same columns.
    :param out: path to save the figure to; the figure is shown when None.
    """
    import matplotlib.pyplot as plt
    import pareto
    from brokenaxes import brokenaxes
    import matplotlib as mpl

    serif_fonts = {
        "font.family": "serif",
        "font.serif": [],
        "font.sans-serif": [],
    }
    mpl.rcParams.update(serif_fonts)

    opensubs18_df = pd.read_csv(opensubs18_csv)

    front = pareto.eps_sort(
        opensubs18_df[["precision", "recall"]],
        [0, 1],
        maximize_all=True,
        attribution=True,
    )
    # With attribution on, the last field of each entry is used as row id.
    front_rows = [entry[-1] for entry in front]

    kinds = []
    for row_id in opensubs18_df.index:
        kinds.append("stiff-nondom" if row_id in front_rows else "stiff")
    opensubs18_df["type"] = pd.Series(kinds)

    # type -> (marker, colour)
    type_markers = {"stiff": ("o", "k"), "stiff-nondom": ("D", "b")}

    if eurosense_csv is None:
        df = opensubs18_df
    else:
        type_markers["eurosense"] = ("s", "y")
        eurosense_df = pd.read_csv(eurosense_csv)
        eurosense_df["type"] = "eurosense"
        renamed = eurosense_df["name"].str.replace("high", "eurosense")
        eurosense_df["name"] = renamed
        df = pd.concat([opensubs18_df, eurosense_df], ignore_index=True)

    unwanted = df[df.name == "N"].index
    df = df.drop(unwanted)

    print(df)

    fig = plt.gcf()
    fig.set_size_inches(645.0 / INCH_PTS, 441.0 / INCH_PTS)

    x_ranges = ((0, 0.01), (0.24, 0.51), (0.69, 0.81), (0.99, 1))
    y_ranges = ((0, 0.21), (0.39, 0.43), (0.99, 1))
    bax = brokenaxes(xlims=x_ranges, ylims=y_ranges, hspace=0.05, wspace=0.05)

    for kind, (marker, colour) in type_markers.items():
        group_df = df[df.type == kind]
        bax.scatter(x=group_df["precision"],
                    y=group_df["recall"],
                    marker=marker,
                    c=colour,
                    s=10)

    def panel_for(row):
        # Index into bax.axs of the sub-axes that receives the label.
        # NOTE(review): 10 / 5 / 9 presumably match the panels of the grid
        # defined above -- confirm if the axis limits change.
        if row["precision"] > 0.6:
            return 10
        if row["recall"] > 0.3:
            return 5
        return 9

    texts = []
    for _, row in df.iterrows():
        label = bax.axs[panel_for(row)].text(row["precision"],
                                             row["recall"],
                                             row["name"],
                                             ha="center",
                                             va="center")
        texts.append(label)

    x = numpy.linspace(0, 0.6, 100)
    for eurosense in ("EC", "EP"):
        row = df[df.name == eurosense]
        p = row.precision.item()
        r = row.recall.item()
        assert p > r
        # Line through the origin with slope recall / precision.
        bax.plot(x, x * r / p, zorder=-1, color="#ffdddd")
    from adjustText import adjust_text

    adjust_text(texts)
    bax.set_xlabel("Precision", labelpad=0)
    bax.set_ylabel("Recall", labelpad=40)
    if out is None:
        plt.show()
    else:
        plt.savefig(out, bbox_inches="tight")
Пример #11
0
def pr_plot(opensubs18_csv, eurosense_csv=None, out=None):
    """Scatter precision against recall on broken axes with point labels.

    Rows of ``opensubs18_csv`` are tagged "stiff-nondom" when they are
    non-dominated in (precision, recall) and "stiff" otherwise. Rows of the
    optional ``eurosense_csv`` are tagged "eurosense". All labels are
    placed on ``bax.axs[3]`` and then spread out with ``adjust_text``.

    :param opensubs18_csv: CSV with ``precision``, ``recall`` and ``name``
        columns.
    :param eurosense_csv: optional second CSV with the same columns.
    :param out: path to save the figure to; the figure is shown when None.
    """
    import matplotlib.pyplot as plt
    from adjustText import adjust_text
    import pareto
    from brokenaxes import brokenaxes
    import matplotlib as mpl

    mpl.rcParams.update({
        "font.family": "serif",
        "font.serif": [],
        "font.sans-serif": [],
    })

    opensubs18_df = pd.read_csv(opensubs18_csv)

    objectives = opensubs18_df[["precision", "recall"]]
    front = pareto.eps_sort(
        objectives, [0, 1], maximize_all=True, attribution=True
    )
    # With attribution on, the last field of each entry is used as row id.
    front_rows = [entry[-1] for entry in front]

    def kind_of(row_id):
        if row_id in front_rows:
            return "stiff-nondom"
        return "stiff"

    opensubs18_df["type"] = pd.Series(
        [kind_of(row_id) for row_id in opensubs18_df.index]
    )

    # type -> (marker, colour)
    type_markers = {"stiff": ("o", "k"), "stiff-nondom": ("D", "b")}

    if eurosense_csv is None:
        df = opensubs18_df
    else:
        type_markers["eurosense"] = ("s", "y")
        eurosense_df = pd.read_csv(eurosense_csv)
        eurosense_df["type"] = "eurosense"
        renamed = eurosense_df["name"].str.replace("high", "eurosense")
        eurosense_df["name"] = renamed
        df = pd.concat([opensubs18_df, eurosense_df], ignore_index=True)

    print(df)

    fig = plt.gcf()
    fig.set_size_inches(645.0 / INCH_PTS, 441.0 / INCH_PTS)

    x_ranges = ((0, 0.01), (0.29, 1))
    y_ranges = ((0, 0.51), (0.99, 1))
    bax = brokenaxes(xlims=x_ranges, ylims=y_ranges, hspace=0.05, wspace=0.05)

    for kind, (marker, colour) in type_markers.items():
        group_df = df[df.type == kind]
        bax.scatter(
            x=group_df["precision"],
            y=group_df["recall"],
            marker=marker,
            c=colour,
            s=10,
        )

    texts = [
        bax.axs[3].text(
            row["precision"], row["recall"], row["name"], ha="center", va="center"
        )
        for _, row in df.iterrows()
    ]
    adjust_text(texts)

    bax.set_xlabel("Precision")
    bax.set_ylabel("Recall")
    if out is None:
        plt.show()
    else:
        plt.savefig(out, bbox_inches="tight")