Exemplo n.º 1
0
    def scatterplot(cls, df):
        """Save faceted scatterplots of ``Mean`` with 95% confidence bars.

        Rows whose ``index`` column is ``'Overall'`` or ``'No ROI'`` are
        dropped, the remaining rows are sorted by ``Mean`` inside every
        (``config.table_cols``, ``config.table_rows``) group, and one PNG per
        requested ordering is written to ``Figures/Scatterplots``.

        Parameters
        ----------
        cls :
            Unused in the body (presumably the class, if this is a
            classmethod -- the decorator is not visible here).
        df : pd.DataFrame
            Must contain the columns ``index``, ``Mean``, ``Conf_Int_95`` and
            the columns named by ``config.table_cols`` / ``config.table_rows``.
        """
        Utils.check_and_make_dir("Figures/Scatterplots")

        # Remove No ROI and Overall rows
        df = df[(df['index'] != 'Overall') & (df['index'] != 'No ROI')]

        # Group by parameters and sort each group by its mean value
        df = df.groupby([config.table_cols, config.table_rows]).apply(
            lambda x: x.sort_values(['Mean']))
        df = df.reset_index(drop=True)  # Reset index to remove grouping

        scatterplots = ['roi_ordered', 'stat_ordered']
        if config.table_row_order == 'roi':
            # The list holds 'stat_ordered'; removing the non-existent 'stat'
            # raised ValueError.
            scatterplots.remove('stat_ordered')
        elif config.table_row_order == 'statorder':
            scatterplots.remove('roi_ordered')

        for plot_name in scatterplots:
            if config.verbose:
                print(f"Saving {plot_name} scatterplot!")

            if plot_name == 'roi_ordered':
                # Order rows based on first facet
                roi_ord = pd.Categorical(df['index'],
                                         categories=df['index'].unique())
            else:
                # Order each facet individually.
                # NOTE(review): the grouping columns are hard-coded here while
                # the facets use config.table_rows/config.table_cols --
                # confirm they are meant to be the same columns.
                roi_ord = pd.Categorical(
                    df.groupby(['MB', 'SENSE']).cumcount())

            figure_table = (
                pltn.ggplot(df, pltn.aes(x="Mean", y=roi_ord)) +
                pltn.geom_point(na_rm=True, size=1) + pltn.geom_errorbarh(
                    pltn.aes(xmin="Mean-Conf_Int_95", xmax="Mean+Conf_Int_95"),
                    na_rm=True,
                    height=None) + pltn.xlim(0, None) +
                pltn.scale_y_discrete(labels=[]) +
                pltn.ylab(config.table_y_label) +
                pltn.xlab(config.table_x_label) +
                pltn.facet_grid('{rows}~{cols}'.format(rows=config.table_rows,
                                                       cols=config.table_cols),
                                drop=True,
                                labeller="label_both") +
                pltn.theme_538()  # Set theme
                + pltn.theme(
                    panel_grid_major_y=pltn.themes.element_line(alpha=0),
                    panel_grid_major_x=pltn.themes.element_line(alpha=1),
                    panel_background=pltn.element_rect(fill="gray", alpha=0.1),
                    dpi=config.plot_dpi))

            figure_table.save(
                f"Figures/Scatterplots/{plot_name}_scatterplot.png",
                height=config.plot_scale,
                width=config.plot_scale * 3,
                verbose=False,
                limitsize=False)
Exemplo n.º 2
0
def setup_heatmap0(df: pd.DataFrame, format_string, axis_text):
    """Assemble a tile heatmap with the cell values printed on top.

    ``df`` supplies the ``row``, ``col``, ``scale`` (fill) and ``value``
    (label) columns. ``format_string`` is forwarded to ``geom_text``. Axis
    text is rendered bold when ``axis_text`` is truthy and hidden otherwise.
    """
    # https://stackoverflow.com/a/62161556/819272
    # Plotnine does not support changing the position of any axis.
    if axis_text:
        axis_text_element = p9.element_text(face='bold')
    else:
        axis_text_element = p9.element_blank()

    layers = [
        p9.coord_equal(),
        p9.geom_tile(p9.aes(fill='scale')),
        p9.geom_text(p9.aes(label='value'),
                     format_string=format_string,
                     size=7),
        p9.scale_y_discrete(drop=False),
        p9.scale_x_discrete(drop=False),
        p9.scale_fill_gradientn(colors=['#63BE7B', '#FFEB84', '#F8696B'],
                                na_value='#CCCCCC',
                                guide=False),
        p9.theme(axis_text=axis_text_element,
                 axis_ticks=p9.element_blank(),
                 axis_title=p9.element_blank(),
                 panel_grid=p9.element_blank()),
    ]

    heatmap = p9.ggplot(df, p9.aes(y='row', x='col'))
    for layer in layers:
        heatmap = heatmap + layer
    return heatmap
Exemplo n.º 3
0
def scatter_plot(df,
                 x,
                 y,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 base_size=10,
                 figure_size=(6, 3),
                 **kwargs):
    '''
    Aggregate the data in df and draw it as a scatter plot.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe (copied, never modified)
    x : str
      quoted expression for the x axis; the special value '.index' plots
      the dataframe index
    y : str
      quoted expression for the y axis
    group : str
      quoted expression used as group (ie color)
    facet_x : str
      quoted expression used as facet
    facet_y : str
      quoted expression used as facet (only used together with facet_x)
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size
    **kwargs:
      additional kwargs passed to geom_point

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # work on a private copy of the input
    data = df.copy()

    # split every expression into its (eventual) display name and the
    # expression itself
    names = {}
    groups = {}
    for label, expr in (('x', x), ('group', group), ('facet_x', facet_x),
                        ('facet_y', facet_y)):
        names[label], groups[label] = unname(expr)
    y_name, y_expr = unname(y)
    names['y'] = y_name
    variables = {'y': y_expr}

    # special case: plot against the dataframe index
    if x == '.index':
        groups['x'] = '.index'
        index_name = data.index.name
        names['x'] = '' if index_name is None else index_name

    # aggregate, then keep the known columns in a fixed order
    gdata = agg_data(data, variables, groups, None, fill_groups=True)
    column_order = ['x', 'y', 'group', 'facet_x', 'facet_y']
    gdata = gdata[[c for c in column_order if c in gdata.columns]]

    has_group = group is not None

    # add group_x column
    if has_group:
        group_as_str = gdata['group'].astype('str')
        gdata['group_x'] = group_as_str + '_' + gdata['x'].astype(str)

    g = EZPlot(gdata)

    # points: one colour per group, or a single colour without groups
    if has_group:
        g += p9.geom_point(
            p9.aes(x="x", y="y", group="factor(group)", color="factor(group)"),
            **kwargs)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))
    else:
        g += p9.geom_point(p9.aes(x="x", y="y"),
                           colour=ez_colors(1)[0],
                           **kwargs)

    # facets (facet_y alone is ignored)
    if facet_x is not None:
        if facet_y is None:
            g += p9.facet_wrap('~facet_x')
        else:
            g += p9.facet_grid('facet_y~facet_x')

    # axis scales: datetime, discrete or continuous depending on the column
    scale_choices = (
        ('x', p9.scale_x_datetime, p9.scale_x_discrete,
         p9.scale_x_continuous),
        ('y', p9.scale_y_datetime, p9.scale_y_discrete,
         p9.scale_y_continuous),
    )
    for axis, datetime_scale, discrete_scale, continuous_scale in scale_choices:
        if g.column_is_timestamp(axis):
            g += datetime_scale()
        elif g.column_is_categorical(axis):
            g += discrete_scale()
        else:
            g += continuous_scale(labels=ez_labels)

    # axis labels
    g += p9.xlab(names['x']) + p9.ylab(names['y'])

    # theme
    legend_title = p9.element_text(text=names['group'], size=base_size)
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=legend_title)

    return g
Exemplo n.º 4
0
Arquivo: gap.py Projeto: tommens/gap
def cli():
    """Command-line entry point of GAP (Git Activity Predictor).

    Steps, in order:

    1. Parse the command-line options (repository paths, reference date,
       number of observations, probabilities, activity limit, identity
       mapping, branches and the output format).
    2. Collect, per identity, the author dates of the commits found in each
       repository (all refs when no branch is given).
    3. For every identity, turn the dates up to the reference date into
       durations with ``dates_to_duration`` and, when there are at least
       ``--obs`` of them, take the ``--probs`` quantiles of a
       ``SurvfuncRight`` fitted on these durations. Only identities whose
       last activity falls within the last ``--limit`` days are kept.
    4. Print the result as text, csv or json, or export it as a plot.

    The function returns nothing. It calls ``sys.exit()`` (without a status
    argument) when a repository cannot be opened or when no author qualifies.
    """
    parser = argparse.ArgumentParser(
        description='GAP - Git Activity Predictor')
    parser.add_argument('paths',
                        metavar='PATH',
                        type=str,
                        nargs='*',
                        default=['.'],
                        help='Paths to one or more git repositories')
    parser.add_argument(
        '--date',
        type=lambda d: dateutil.parser.parse(d).date(),
        required=False,
        default=datetime.date.today(),
        help='Date used for predictions (default to current date)')
    parser.add_argument('--obs',
                        type=int,
                        required=False,
                        default=20,
                        help='Number of observations to consider')
    parser.add_argument('--probs',
                        metavar='PROB',
                        type=float,
                        nargs='*',
                        required=False,
                        default=[0.5, 0.6, 0.7, 0.8, 0.9],
                        help='Probabilities to output, strictly in [0,1].')
    parser.add_argument(
        '--limit',
        type=int,
        required=False,
        default=30,
        help=
        'Limit contributors to the one that were active at least once during the last x days (default 30)'
    )
    parser.add_argument(
        '--mapping',
        type=str,
        nargs='?',
        help=
        'Mapping file to merge identities. This file must be a csv file where each line contains two values: the name to be merged, and the corresponding identity. Use "IGNORE" as identity to ignore specific names.'
    )
    parser.add_argument('--branches',
                        metavar='BRANCH',
                        type=str,
                        nargs='*',
                        default=list(),
                        help='Git branches to analyse (default to all).')
    parser.add_argument(
        '--as-dates',
        dest='as_dates',
        action='store_true',
        help=
        'Express predictions using dates instead of time differences in days')

    # Output formats are mutually exclusive
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--text',
                       action='store_true',
                       help='Print results as text.')
    group.add_argument('--csv',
                       action='store_true',
                       help='Print results as csv.')
    group.add_argument('--json',
                       action='store_true',
                       help='Print results as json.')
    # --plot without a value yields True (const); with a value, the filepath
    group.add_argument(
        '--plot',
        nargs='?',
        const=True,
        help='Export results to a plot. Filepath can be optionaly specified.')

    args = parser.parse_args()

    # Default plot location
    if args.plot is True:
        args.plot = str(args.date) + '.pdf'

    # Default to text if not other option is provided
    if not args.csv and not args.json and not args.plot:
        args.text = True

    # Identity mapping
    if args.mapping:
        # Headerless csv: first column is the name, second the identity
        d = pandas.read_csv(args.mapping, names=['source', 'target'])
        mapping = {r.source: r.target for r in d.itertuples()}
    else:
        mapping = {}

    raw_data = dict()  # author -> dates of activity

    # Get data from git
    for path in args.paths:
        try:
            repo = git.Repo(path)
        except Exception as e:  # Must be refined
            print('Unable to access repository {} ({}:{})'.format(
                path, e.__class__.__name__, e))
            sys.exit()

        # Default branches
        if len(args.branches) == 0:
            commits = repo.iter_commits('--all')
        else:
            # NOTE(review): all branch names are joined into one
            # space-separated revision string -- confirm this is accepted
            # when several branches are given.
            commits = repo.iter_commits(' '.join(args.branches))

        for commit in commits:
            try:
                author = commit.author.name
                identity = mapping.get(author, author)
                # Skip names mapped to "ignore" (case-insensitive), but keep
                # an author whose own name is literally "ignore"
                if author.lower() != 'ignore' and identity.lower() == 'ignore':
                    continue

                date = datetime.date.fromtimestamp(commit.authored_date)
                raw_data.setdefault(identity, []).append(date)
            except Exception as e:
                # A commit that cannot be read is reported and skipped
                print('Unable to read commit ({}: {}): {}'.format(
                    e.__class__.__name__, e, commit))

    # Compute durations and apply model
    data = []  # (author, past activities, predicted durations)

    for author, commits in raw_data.items():
        # Only activity on or before the reference date is considered
        commits = sorted([e for e in commits if e <= args.date])
        durations = dates_to_duration(commits, window_size=args.obs)

        if len(durations) >= args.obs:
            # Currently implemented with no censor
            surv = SurvfuncRight(durations, [1] * len(durations))
            predictions = [surv.quantile(p) for p in args.probs]
            last_day = commits[-1]

            # Keep the author only if last active within args.limit days
            if last_day >= args.date - datetime.timedelta(args.limit):
                data.append((
                    author,
                    commits,
                    predictions,
                ))

    # Prepare dataframe
    df = pandas.DataFrame(index=set([a for a, c, p in data]),
                          columns=['last'] + args.probs)
    if len(df) == 0:
        print(
            'No author has {} observations and was active at least once during the last {} days'
            .format(args.obs, args.limit))
        sys.exit()

    df.index.name = 'author'

    if not args.plot:
        # Tabular output: one row per author, one column per probability
        for author, commits, predictions in data:
            last = commits[-1]
            if args.as_dates:
                df.at[author, 'last'] = last
            else:
                # Day offset of the last activity relative to args.date
                df.at[author, 'last'] = (last - args.date).days

            for prob, p in zip(args.probs, predictions):
                if args.as_dates:
                    df.at[author,
                          prob] = last + datetime.timedelta(days=int(p))
                else:
                    df.at[author,
                          prob] = (last + datetime.timedelta(days=int(p)) -
                                   args.date).days

        # Most recent 'last' first, then ascending predictions
        df = df.sort_values(['last'] + args.probs,
                            ascending=[False] + [True] * len(args.probs))
        df = df.astype(str)

        if args.text:
            pandas.set_option('expand_frame_repr', False)
            pandas.set_option('display.max_columns', 999)
            print(df)
        elif args.csv:
            print(df.to_csv())
        elif args.json:
            print(df.to_json(orient='index'))
    else:
        # Because of plotnine's way of initializing matplotlib
        import warnings
        warnings.filterwarnings("ignore")

        # Number of days shown on each side of the reference date
        VIEW_LIMIT = 28

        activities = [
        ]  # List of (author, day) where day is a delta w.r.t. given date
        forecasts = [
        ]  # List of (author, from_day, to_day, p) where probability p
        # applies between from_day and to_day (delta w.r.t. given date)

        for author, commits, predictions in data:
            last = (commits[-1] - args.date).days
            for e in commits:
                activities.append((author, (e - args.date).days))

            # Each forecast segment starts where the previous distinct
            # prediction ended; a prediction that does not exceed the
            # previous one reuses the start of the previous segment.
            previous = previous_previous = 0
            for d, p in zip(predictions, args.probs):
                if d > previous:
                    forecasts.append((author, last + previous, last + d, p))
                    previous_previous = previous
                    previous = d
                else:
                    forecasts.append(
                        (author, last + previous_previous, last + d, p))

        activities = pandas.DataFrame(columns=['author', 'day'],
                                      data=activities)
        forecasts = pandas.DataFrame(columns=['author', 'fromd', 'tod', 'p'],
                                     data=forecasts)

        # Orange segments: past activity days. Blue segments: forecasts, with
        # alpha mapped to the probability; for identical segments only the
        # highest probability is kept (sort by p, keep last duplicate).
        # Authors are ordered on the y axis by their activity days.
        plot = (p9.ggplot(p9.aes(y='author')) + p9.geom_segment(
            p9.aes('day - 0.5', 'author', xend='day + 0.5', yend='author'),
            data=activities,
            size=4,
            color='orange',
        ) + p9.geom_segment(
            p9.aes('fromd + 0.5',
                   'author',
                   xend='tod + 0.5',
                   yend='author',
                   alpha='factor(p)'),
            data=forecasts.sort_values('p').drop_duplicates(
                ['author', 'fromd', 'tod'], keep='last'),
            size=4,
            color='steelblue',
        ) + p9.geom_vline(
            xintercept=0,
            color='r', alpha=0.5, linetype='dashed') + p9.scale_x_continuous(
                name='  <<  past days {:^20} future days  >>'.format(
                    str(args.date)),
                breaks=range(-VIEW_LIMIT // 7 * 7,
                             (VIEW_LIMIT // 7 * 7) + 1, 7),
                minor_breaks=6) + p9.scale_y_discrete(
                    name='',
                    limits=activities.sort_values(
                        'day', ascending=False)['author'].unique()) +
                p9.scale_alpha_discrete(range=(0.2, 1), name=' ') +
                p9.coord_cartesian(xlim=(-VIEW_LIMIT, VIEW_LIMIT)) +
                p9.theme_matplotlib() + p9.theme(
                    figure_size=(6, 4 * activities['author'].nunique() / 15)))

        fig = plot.draw()
        fig.savefig(args.plot, bbox_inches='tight')
        print('Plot exported to {}'.format(args.plot))
Exemplo n.º 5
0
def hist_plot(df,
              x,
              y=None,
              group=None,
              facet_x=None,
              facet_y=None,
              w='1',
              bins=21,
              bin_width=None,
              position='stack',
              normalize=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):

    '''
    Plot a 1-d or 2-d histogram

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis. If this is specified the histogram will be 2-d.
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    w : str
      quoted expression representing histogram weights (default is 1)
    bins : int or tuple
      number of bins to be used
    bin_width : float or tuple
      bin width to be used
    position : str
      if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
      normalize histogram counts
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object

    Raises
    ------
    NotImplementedError
      if position is not one of `overlay`, `stack`, `dodge`
    ValueError
      if neither or both of bins / bin_width are given, or if y and group
      are requested together

    '''

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_with should be defined")
        raise ValueError("Either bins or bin_with should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one between bins or bin_with should be defined")
        raise ValueError("Only one between  bins or bin_with should be defined")

    if (y is not None) and (group is not None):
        log.error("y and group cannot be requested at the same time")
        raise ValueError("y and group cannot be requested at the same time")

    # make bins / bin_width (x, y) pairs; in 1-d only the first entry is used
    if y is None:
        bins = (bins, bins)
        bin_width = (bin_width, bin_width)
    else:
        if not isinstance(bins, (tuple, list)):
            bins = (bins, bins)
        if not isinstance(bin_width, (tuple, list)):
            bin_width = (bin_width, bin_width)

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'y', 'group', 'facet_x', 'facet_y'],
                          [x, y, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables on the evaluated columns
    new_groups = {c: c for c in tmp_df.columns
                  if c in ['x', 'y', 'group', 'facet_x', 'facet_y']}
    non_xy_groups = [g for g in new_groups if g not in ['x', 'y']]
    new_variables = {'w': 'w'}

    # bin data (if necessary); object columns are left as they are and
    # get a unit bin width
    bin_width_x = 1
    bin_width_y = 1
    if tmp_df['x'].dtypes != np.dtype('O'):
        tmp_df['x'], _, bin_width_x = bin_data(tmp_df['x'], bins[0], bin_width[0])
    if (y is not None) and (tmp_df['y'].dtypes != np.dtype('O')):
        tmp_df['y'], _, bin_width_y = bin_data(tmp_df['y'], bins[1], bin_width[1])

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, 'sum', fill_groups=True)
    gdata.fillna(0, inplace=True)
    gdata = gdata[[c for c in ['x', 'y', 'w', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # normalize
    if normalize:
        bin_area = bin_width_x * bin_width_y
        if len(non_xy_groups) == 0:
            gdata['w'] = gdata['w'] / (gdata['w'].sum() * bin_area)
        else:
            # transform always returns a result indexed like gdata, whereas
            # groupby.apply may prepend the group keys to the index (pandas
            # 2.x), which breaks the assignment below.
            gdata['w'] = gdata.groupby(non_xy_groups)['w'].transform(
                lambda s: s / (s.sum() * bin_area))

    # start plotting
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if group is not None:
        if sort_groups:
            if g.column_is_categorical('x'):
                g.sort_group('x', 'w', ascending=False)
            g.sort_group('group', 'w')
            g.sort_group('facet_x', 'w', ascending=False)
            g.sort_group('facet_y', 'w', ascending=False)
            colors = np.flip(ez_colors(g.n_groups('group')))
        else:
            colors = ez_colors(g.n_groups('group'))

    is_1d = y is None

    # set geometry: bars for 1-d histograms, tiles for 2-d histograms
    if is_1d:
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             stat='identity',
                             colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             colour=None,
                             stat='identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=colors)
    else:
        g += p9.geom_tile(p9.aes(x="x", y="y", fill='w'),
                          stat='identity',
                          colour=None)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale (in 1-d the y axis always shows the counts)
    if (not is_1d) and g.column_is_categorical('y'):
        g += p9.scale_y_discrete()
    else:
        g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab('Counts' if is_1d else names['y'])

    # set theme; the legend describes the groups (1-d) or the counts (2-d)
    legend_text = names['group'] if is_1d else 'Counts'
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=legend_text, size=base_size))

    if is_1d and sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
def plot_pointgraph(
        plot_df,
        x_axis_label,
        left_arrow_label,
        right_arrow_label,
        left_arrow_start=-0.5,
        left_arrow_height=38.5,
        right_arrow_start=0.5,
        right_arrow_height=1.5,
        arrow_length=2,
        left_arrow_label_x=-1.5,
        left_arrow_label_y=-1.5,
        right_arrow_label_x=-1.5,
        right_arrow_label_y=-1.5,
        limits=(-3, 3),
):
    """
    Draw a horizontal error-bar graph with one row per lemma.

    Each row spans ``lower_odds`` to ``upper_odds``; rows are ordered by
    ascending ``odds_ratio``. A dashed vertical line marks x = 0, and two
    labelled arrows (one pointing left, one pointing right) are drawn as
    annotations.

    Args:
        plot_df - data frame with the columns lemma, odds_ratio, lower_odds
            and upper_odds
        x_axis_label - label of the x axis
        left_arrow_label - text shown for the left arrow
        right_arrow_label - text shown for the right arrow
        left_arrow_start - x position where the left arrow starts
        left_arrow_height - y position of the left arrow
        right_arrow_start - x position where the right arrow starts
        right_arrow_height - y position of the right arrow
        arrow_length - length of both arrows along the x axis
        left_arrow_label_x - x position of the left arrow's text
        left_arrow_label_y - y position of the left arrow's text
        right_arrow_label_x - x position of the right arrow's text
        right_arrow_label_y - y position of the right arrow's text
        limits - (min, max) limits of the x axis

    Returns:
        the assembled plotnine ggplot object
    """

    def arrow_segment(x_from, x_to, height):
        # Thin black horizontal arrow drawn at the given height.
        return p9.annotate(
            "segment",
            x=x_from,
            xend=x_to,
            y=height,
            yend=height,
            colour="black",
            size=0.5,
            alpha=1,
            arrow=p9.arrow(length=0.1),
        )

    def arrow_caption(caption, x_pos, y_pos):
        # Slightly transparent text annotation.
        return p9.annotate(
            "text",
            label=caption,
            x=x_pos,
            y=y_pos,
            size=12,
            alpha=0.7,
        )

    categorical_df = plot_df.assign(
        lemma=lambda frame: pd.Categorical(frame.lemma.tolist()))
    graph = p9.ggplot(
        categorical_df,
        p9.aes(
            y="lemma",
            xmin="lower_odds",
            x="odds_ratio",
            xmax="upper_odds",
            yend="lemma",
        ),
    )

    lemma_order = plot_df.sort_values("odds_ratio",
                                      ascending=True).lemma.tolist()

    layers = [
        p9.geom_errorbarh(color="#253494"),
        p9.scale_y_discrete(limits=lemma_order),
        p9.scale_x_continuous(limits=limits),
        p9.geom_vline(p9.aes(xintercept=0), linetype="--", color="grey"),
        arrow_segment(left_arrow_start, left_arrow_start - arrow_length,
                      left_arrow_height),
        arrow_caption(left_arrow_label, left_arrow_label_x,
                      left_arrow_label_y),
        arrow_segment(right_arrow_start, right_arrow_start + arrow_length,
                      right_arrow_height),
        arrow_caption(right_arrow_label, right_arrow_label_x,
                      right_arrow_label_y),
        p9.theme_seaborn(context="paper",
                         style="ticks",
                         font_scale=1,
                         font="Arial"),
        p9.theme(
            figure_size=(11, 8.5),
            panel_grid_minor=p9.element_blank(),
            text=p9.element_text(size=12),
        ),
        p9.labs(y=None, x=x_axis_label),
    ]
    for layer in layers:
        graph = graph + layer

    return graph
                odds_ratio=lambda x: x.odds_ratio.apply(lambda x: np.log2(x)),
                lower_odds=lambda x: x.lower_odds.apply(lambda x: np.log2(x)),
                upper_odds=lambda x: x.upper_odds.apply(lambda x: np.log2(x)),
            ))
plot_df.head()

g = (p9.ggplot(
    plot_df.assign(lemma=lambda x: pd.Categorical(x.lemma.tolist())),
    p9.aes(
        y="lemma",
        xmin="lower_odds",
        x="odds_ratio",
        xmax="upper_odds",
        yend="lemma",
    ),
) + p9.geom_errorbarh(color="#253494") + p9.scale_y_discrete(limits=(
    plot_df.sort_values("odds_ratio", ascending=True).lemma.tolist())) +
     p9.scale_x_continuous(limits=(-3, 3)) +
     p9.geom_vline(p9.aes(xintercept=0), linetype="--", color="grey") +
     p9.annotate(
         "segment",
         x=0.5,
         xend=2.5,
         y=1.5,
         yend=1.5,
         colour="black",
         size=0.5,
         alpha=1,
         arrow=p9.arrow(length=0.1),
     ) + p9.annotate(
         "text", label="bioRxiv Enriched", x=1.5, y=2.5, size=18, alpha=0.7) +
     p9.annotate(
Exemplo n.º 8
0
import pandas as pd

# Take the `subset` rows with the highest odds ratio plus the `subset` rows
# with the lowest odds ratio (the two very last rows are excluded), then put
# the odds ratio and its bounds on a log2 scale.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
plot_df = (pd.concat([
    full_plot_df.sort_values("odds_ratio", ascending=False).head(subset),
    full_plot_df.sort_values("odds_ratio",
                             ascending=False).iloc[:-2].tail(subset),
]).replace("rna", "RNA").assign(
    odds_ratio=lambda x: np.log2(x.odds_ratio),
    lower_odds=lambda x: np.log2(x.lower_odds),
    upper_odds=lambda x: np.log2(x.upper_odds),
))
plot_df.head()

g = (p9.ggplot(
    plot_df, p9.aes(y="lemma", x="lower_odds", xend="upper_odds",
                    yend="lemma")) +
     p9.geom_segment(color="#253494", size=6, alpha=0.7) +
     p9.scale_y_discrete(limits=(
         plot_df.sort_values("odds_ratio", ascending=True).lemma.tolist())) +
     p9.scale_x_continuous(limits=(-3, 3)) +
     p9.geom_vline(p9.aes(xintercept=0), linetype="--", color="grey") +
     p9.annotate(
         "segment",
         x=0.5,
         xend=2.5,
         y=1.5,
         yend=1.5,
         colour="black",
         size=0.5,
         alpha=1,
         arrow=p9.arrow(length=0.1),
     ) + p9.annotate(
         "text", label="bioRxiv Enriched", x=1.5, y=2.5, size=14, alpha=0.7) +
     p9.annotate(