Example #1
def scale_x_datetime_auto(times: pd.Series, figsize=(12,10)):
    """
    Automatically set breaks and format based on duration of series
    """
    width = figsize[0]

    dt = (times.iloc[-1] - times.iloc[0]).total_seconds()
    mins = dt / 60
    hours = dt / 3600
    days = dt / 3600 / 24

    if days > 10:
        fmt = "%Y-%m-%d"
        breaks = "%1.0f days" % max(np.round(days / (width / 2.0)), 1.0)
        return plotnine.scale_x_datetime(date_labels=fmt, date_breaks=breaks)
    elif days > 1.0:
        fmt = "%Y-%m-%d %H:%M"
        breaks = "%1.0f hours" % max(np.round(hours / (width / 2.0)), 1.0)
        return plotnine.scale_x_datetime(date_labels=fmt, date_breaks=breaks)
    elif hours > 1:
        fmt = "%H:%M"
        breaks = "%1.0f minutes" % max(np.round(mins / (width / 1.5)), 1.0)
        return plotnine.scale_x_datetime(date_labels=fmt, date_breaks=breaks)
    else:
        fmt = "%H:%M:%S"
        breaks = "%1.0f minutes" % max(np.round(mins / (width / 1.5)), 1.0)
        return plotnine.scale_x_datetime(date_labels=fmt, date_breaks=breaks)
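
A minimal usage sketch for the function above (not part of the original example); it assumes the same imports the snippet relies on:

import numpy as np
import pandas as pd
import plotnine

# ~2 days of hourly timestamps: this lands in the "days > 1.0" branch,
# so labels are "%Y-%m-%d %H:%M" with breaks every few hours.
times = pd.Series(pd.date_range("2021-01-01", periods=48, freq="h"))
df = pd.DataFrame({"t": times, "y": np.random.randn(len(times)).cumsum()})

gg = (plotnine.ggplot(df, plotnine.aes("t", "y"))
      + plotnine.geom_line()
      + scale_x_datetime_auto(times))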
Example #2
def event_counts_date(request_disc=None):
    '''
    Plot the aggregated admission timeline for a given institution.
    The request should be given as a dictionary mapping column names to filter values.
    '''
    request = np.ones(df.shape[0], dtype=bool)
    for key in request_disc.keys():
        if key == "institution":
            request = request & (df[key].str.contains(request_disc[key]))
        else:
            request = request & (df[key] == request_disc[key])
    df_selected = df[request].copy()  # copy so the added columns below don't trigger SettingWithCopyWarning
    df_selected["date_md"] = df_selected["admission_date"].apply(
        lambda dt: dt.replace(year=1980))
    df_selected["year"] = df_selected["admission_date"].apply(
        lambda dt: dt.year)

    samp = df[request].iloc[0]
    title = ""
    for key in request_disc.keys():
        title += samp[key]
        title += " "
    gg = p9.ggplot(df_selected)
    gg += p9.aes(x="date_md", y="admission_status")
    gg += p9.scale_x_datetime(date_breaks='10 days',
                              date_labels="%m-%d",
                              limits=np.array([
                                  np.min(df_selected["date_md"]),
                                  pd.to_datetime("1980-4-20")
                              ]))
    gg += p9.geom_count()
    gg += p9.ggtitle(title)
    return gg
Example #3
def plot_frequency(n = 200):
    """
    Draws the histogram of the distribution of n tweets by date.
    
    Parameters
    ----------
    n: int
        An integer specifying how many tweets should be analysed.
    
    Returns
    -------
    It saves the histogram as a .png file in the static folder.

    """
        
    from plotnine import ggplot, aes, geom_histogram, scale_x_datetime, labs, theme_minimal, ggsave
    from Mod_1_API import gather_tweets
    from mizani.breaks import date_breaks
    from mizani.formatters import date_format
    import pandas
    
     
    df = pandas.DataFrame(gather_tweets(n))
       
    plot1 = (ggplot(df, aes(x = 'Date', fill = 'Author')) +
           geom_histogram() +
           scale_x_datetime(breaks=date_breaks('1 week')) +
           labs(x = "Time in weeks", y = "Number of tweets by source") +
           theme_minimal()
           )
    ggsave(plot = plot1, filename = "test.png", path = "static/")
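
A self-contained sketch of the same weekly-histogram pattern; gather_tweets() is project-specific, so a synthetic DataFrame stands in for it:

import pandas
from plotnine import ggplot, aes, geom_histogram, scale_x_datetime, labs, theme_minimal
from mizani.breaks import date_breaks

# Synthetic stand-in for gather_tweets(n): one row per tweet.
df = pandas.DataFrame({
    "Date": pandas.date_range("2021-01-01", periods=200, freq="6h"),
    "Author": ["A", "B"] * 100,
})
plot1 = (ggplot(df, aes(x="Date", fill="Author")) +
         geom_histogram() +
         scale_x_datetime(breaks=date_breaks("1 week")) +
         labs(x="Time in weeks", y="Number of tweets by source") +
         theme_minimal())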
Example #4
def worldbank_plot(
        df: pd.DataFrame,
        title: str,
        dates_are_yearly: bool,
        figure_size=(12, 6),
        add_points=False,
        **plot_kwargs,
) -> p9.ggplot:
    """
    Carefully written to support all worldbank plots, this method is the one place where the app needs themes, colour maps
    and various plot related settings. For sparse datasets it used geom_point() in addition to geom_line() in case the data
    is so sparse that lines cannot be drawn. Returns a ggplot instance or raises an exception if the dataframe is empty.
    """
    if df is None:
        print(f"No usable data/plot for {title}")
        raise Http404(f"No data for {title}")

    pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0
    assert pct_na >= 0.0 and pct_na <= 100.0

    plot = (p9.ggplot(df, p9.aes("date", "metric", **plot_kwargs)) +
            p9.geom_path(size=1.2) +
            p9.scale_y_continuous(labels=label_shorten))
    if dates_are_yearly:
        plot += p9.scale_x_datetime(labels=date_format(
            "%Y"))  # yearly data? if so only print the year on the x-axis
    # if pct_na is too high, geom_path() may be unable to draw a line (each value is surrounded by nan preventing a path)
    # so we use geom_point() to highlight the sparse nature of the data
    if pct_na >= 30.0 or add_points or df["metric"].count() <= 3:
        plot += p9.geom_point(size=3.0)
    return user_theme(plot, y_axis_label="Value", figure_size=figure_size)
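
The sparse-data fallback is the interesting part; a hedged, self-contained sketch of just that logic (user_theme() and label_shorten are app-specific helpers and are omitted here):

import numpy as np
import pandas as pd
import plotnine as p9

df = pd.DataFrame({
    "date": pd.date_range("2000", periods=20, freq="YS"),
    "metric": [1.0 * i if i % 4 == 0 else np.nan for i in range(20)],
})
pct_na = df["metric"].isnull().mean() * 100.0

plot = p9.ggplot(df, p9.aes("date", "metric")) + p9.geom_path(size=1.2)
if pct_na >= 30.0 or df["metric"].count() <= 3:
    plot += p9.geom_point(size=3.0)  # isolated values get a visible marker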
Example #5
def plot_trend(sample_period="M", ld: LazyDictionary = None) -> str:
    """
    Given a dataframe of a single stock from company_prices() this plots the highest price
    in each month over the time period of the dataframe.
    """
    assert "stock_df" in ld

    def inner_date_fmt(dates_to_format):
        results = []
        for d in dates_to_format:
            d -= timedelta(
                weeks=4
            )  # resample("M") labels bins with the month-end date, so shift back into the intended month
            results.append(d.strftime("%Y-%m"))
        return results

    stock_df = ld["stock_df"]
    dataframe = stock_df.filter(items=["last_price"])
    dataframe.index = pd.to_datetime(dataframe.index, format="%Y-%m-%d")
    dataframe = dataframe.resample(sample_period).max()
    plot = (
        p9.ggplot(
            dataframe,
            p9.aes(x="dataframe.index",
                   y=dataframe.columns[0],
                   fill=dataframe.columns[0]),
        ) + p9.geom_bar(stat="identity", alpha=0.7) + p9.scale_x_datetime(
            labels=inner_date_fmt
        )  # don't print the day (always the 1st of the month due to resampling)
    )
    return user_theme(plot,
                      y_axis_label="$ AUD",
                      asxtrade_want_fill_continuous=True)
Example #6
def plot_predict(forecast):
    p = (ggplot(data=forecast, mapping=aes(x='ds', y='y')) +
         geom_point(colour='blue', alpha=0.3, na_rm=True) +
         geom_line(colour='blue', na_rm=True) + geom_line(
             data=forecast, mapping=aes(x='ds', y='yhat'), colour='red') +
         geom_ribbon(data=forecast,
                     mapping=aes(ymin='yhat_lower', ymax='yhat_upper'),
                     fill='blue',
                     alpha=0.1) +
         scale_x_datetime(breaks='1 days', date_labels='%y-%m-%d %H:%M') +
         xlab('Time') + ylab('Pressure') + theme_bw() +
         theme(axis_text_x=element_text(
             angle=45, hjust=1, face='bold', color='black'),
               axis_text_y=element_text(face='bold', colour='black')))

    ggplot.save(p,
                filename='predict_pressure_chart.png',
                path=os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                  'png'),
                width=8,
                height=6,
                units='in',
                dpi=326,
                verbose=False)
    return p
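
Note that current plotnine releases take interval strings through the date_breaks argument rather than breaks; a hedged equivalent of the x scale above (the same pattern appears in the later examples that pass breaks='1 days'):

from plotnine import scale_x_datetime

x_scale = scale_x_datetime(date_breaks='1 days', date_labels='%y-%m-%d %H:%M')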
Example #7
def duration_TL(Data):
    print('======= Creating duration_TL =======')
    x = Data.Duration[pd.isna(Data.Duration)]

    # treat the column as unusable if (almost) all Duration values are missing
    if (len(x) + 10) >= len(Data):
        print("WARNING: All values for Duration are NA's")
    
    else:
        #Filter Symptomes and correct Durations
        Symptomes = Data[(Data.Group == "sy") & (Data.Duration < 180)].copy()

        #Setting data with missing times
        Symptomes['Date'] = pd.to_datetime(Symptomes['Date'])
        
        if len(Symptomes) == 0:
            print('No duration for TL_2')
        else: 
            sdate = min(Symptomes["Date"])   # start date
            edate = max(Symptomes["Date"])   # end date
            delta = edate - sdate       # as timedelta
#            from datetime import timedelta
            day = []
            for i in range(delta.days + 1):
                d= sdate + timedelta(days=i)
                day.append(d)
                
            DF = pd.DataFrame(day)
            DF.columns = ['Date']
            data_with_missing_times = pd.merge(DF, Symptomes, on='Date', how='outer')
            data_with_missing_times.Date = pd.to_datetime(data_with_missing_times.Date)
            if delta.days > 1825:
                datebreaks = '18 months'
            elif delta.days > 1095:
                datebreaks = '12 months'
            else:
                datebreaks = '6 months'

                
            plot = (p9.ggplot(data=data_with_missing_times, mapping=p9.aes(x='Date', 
                                                                           y='Duration'))
            + p9.geom_smooth(color = 'red', size = 5, method="loess", se=False)
            + p9.theme_classic()
            + p9.theme(axis_text = p9.element_text(size=33), 
                       axis_title = p9.element_text(size = 33,face = 'bold'))
            + p9.scale_x_datetime(date_labels = '%Y-%m', date_breaks = datebreaks)
            + p9.labs(x='',y=''))    

            if (len(data_with_missing_times) > 0):

                plot.save(filename='TL_2.jpeg',
                          path="pdf/iteration/",
                          width=25, height=5,
                          dpi=320)
                

            else: 
                print('Plot not created; no data found.')
        return(print('=================================duration_TL DONE ============================='))
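
The manual day-accumulation loop above (repeated in the next examples as well) can be expressed with pd.date_range; a minimal equivalent sketch with placeholder bounds:

import pandas as pd

sdate, edate = pd.Timestamp("2020-01-01"), pd.Timestamp("2020-03-31")  # placeholder bounds
DF = pd.DataFrame({"Date": pd.date_range(sdate, edate, freq="D")})  # one row per calendar day
# then, as above, outer-merge onto the observations so missing days become NaN rows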
Example #8
def frequency_TL(Data):
    print('======= Creating frequency_TL =======')
    #Filtering
    Data['date_4'] = Data['date'].dt.date
    tl4 = Data.groupby("date_4", sort = False, as_index = False).count()
    tl4 = tl4.iloc[:, 0:2]
    tl4 = tl4.rename(columns = {"Unnamed: 0": "n"})    
    
    sdate = min(tl4["date_4"])  # start date
    edate = max(tl4["date_4"])   # end date
    delta = edate - sdate       # as timedelta
    
    from datetime import timedelta    
    day = []
    for i in range(delta.days + 1):
        d= sdate + timedelta(days=i)
        day.append(d)
        
    DF = pd.DataFrame(day)
    DF.columns = ['date_4']
    data_with_missing_times = pd.merge(DF, tl4, on='date_4', how='outer')
    if delta.days > 1825:
        datebreaks = '18 months'
    elif delta.days > 1095:
        datebreaks = '12 months'
    else:
        datebreaks = '6 months'
    #Creating and saving TL_4
    
    plot =(p9.ggplot(data=data_with_missing_times,
                     mapping=p9.aes(x='date_4',y='n'))
        + p9.geom_col(fill = 'red')
        + p9.theme_classic()
        + p9.theme(axis_text = p9.element_text(size=40),
                   axis_title = p9.element_text(size = 40,face = 'bold'))
        + p9.scale_x_datetime(date_labels = '%Y-%m', date_breaks = datebreaks)
        + p9.labs(x='',y='')
        )
        
    if (len(data_with_missing_times) > 0):
        plot.save(filename='TL_4.jpeg',
                  path="pdf/iteration/",
                  width=25, height=5,
                  dpi=320)
    else: 
        print('Plot not created; no data found.')
    return(print('=================================frequency_TL DONE ============================='))
Example #9
 def create_plot(self, columns):
     for col in columns:
         if col not in self.data.columns:
             raise ValueError('No column "%s" in the data' % col)
     try:
         from plotnine import ggplot, theme_bw, aes, geom_line, expand_limits, scale_x_datetime, ylab, facet_wrap, theme
         from mizani.formatters import date_format
     except ImportError:
         msg = """Package 'plotnine' is required for the plot functionnality.
         Try installing it with 'pip install plotnine'.
         """
         raise RatatouilleDependencyError(msg)
     data = self.data.copy()
     if len(columns) > 0:
         data = data[['timestamp'] + columns]
     else:
         if 'hostname' in data:
             data.drop('hostname', axis=1, inplace=True)
     data['time_diff'] = data['timestamp'][1:].reset_index(
         drop=True) - data['timestamp'][:-1].reset_index(drop=True)
     time_step = data['time_diff'].median()
     breakpoints = list(data[data['time_diff'] > time_step * 10].timestamp)
     breakpoints = [
         data['timestamp'].min(), *breakpoints, data['timestamp'].max()
     ]
     data = data.drop('time_diff', axis=1).melt('timestamp')
     import pandas
     if len(columns) > 0:
         data['variable'] = pandas.Categorical(data['variable'],
                                               categories=columns)
     plot = ggplot() + theme_bw()
     for min_t, max_t in zip(breakpoints[:-1], breakpoints[1:]):
         tmp = data[(data['timestamp'] > min_t)
                    & (data['timestamp'] < max_t)]
         plot += geom_line(tmp,
                           aes(x='timestamp', y='value', color='variable'),
                           show_legend=False)
     plot += facet_wrap(['variable'], scales='free')
     timedelta = self.data.timestamp.max() - self.data.timestamp.min()
     if timedelta.days > 2:
         plot += scale_x_datetime(labels=date_format('%Y/%m/%d'))
     else:
         plot += scale_x_datetime(labels=date_format('%H:%M'))
     plot += ylab('Value')
     return plot
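
A self-contained sketch of the gap-splitting idea used above: a timestamp whose gap to the previous sample is much larger than the median step starts a new segment, and each segment gets its own geom_line so no line is drawn across the gaps:

import pandas as pd
from plotnine import ggplot, aes, geom_line, theme_bw

ts = pd.concat([
    pd.Series(pd.date_range("2021-01-01", periods=50, freq="min")),
    pd.Series(pd.date_range("2021-01-02", periods=50, freq="min")),
], ignore_index=True)
data = pd.DataFrame({"timestamp": ts, "value": range(100)})

step = data["timestamp"].diff().median()
gap_starts = data.loc[data["timestamp"].diff() > step * 10, "timestamp"]
# half-open intervals; nudge the last bound so the final point is included
bounds = [data["timestamp"].min(), *gap_starts,
          data["timestamp"].max() + pd.Timedelta("1ns")]

plot = ggplot() + theme_bw()
for lo, hi in zip(bounds[:-1], bounds[1:]):
    seg = data[(data["timestamp"] >= lo) & (data["timestamp"] < hi)]
    plot += geom_line(seg, aes("timestamp", "value"))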
Example #10
def intensity_TL(Data):
    print('======= Creating intensity_TL =======')    
    x = Data.Intensity[pd.isna(Data.Intensity)]
    if len(x) == len(Data):
        print("WARNING: All values for Intensity are NA's")

    else:
        #Filter Symptomes
        Symptomes = Data[(Data.Group == "sy")]
        tl3 = Symptomes.groupby("Date", as_index=False, sort=False)['Intensity'].mean()
        #tl3['Day'] = range(1,(len(tl3)+1))
        #tl3 = tl3.rename(columns = {'Intensity': "Intensity_mean"})
        tl3['Date'] = pd.to_datetime(tl3['Date'])
        #Setting data with missing times
        sdate = min(tl3["Date"])   # start date
        edate = max(tl3["Date"])   # end date
        delta = edate - sdate       # as timedelta
        
#        from datetime import timedelta
        day = []
        for i in range(delta.days + 1):
            d= sdate + timedelta(days=i)
            day.append(d)
            
        DF = pd.DataFrame(day)
        DF.columns = ['Date']
        data_with_missing_times = pd.merge(DF, tl3, on='Date', how='outer')
        if delta.days > 1825:
            datebreaks = '18 months'
        elif delta.days > 1095:
            datebreaks = '12 months'
        else:
            datebreaks = '6 months'
        
        plot =(p9.ggplot(data=data_with_missing_times,
                         mapping=p9.aes(x='Date',y='Intensity'))
            + p9.geom_point(color = 'red', size = 5)
            + p9.theme_classic()
            + p9.theme(axis_text = p9.element_text(size=40),
                       axis_title = p9.element_text(size = 40,face = 'bold'))
            + p9.scale_x_datetime(date_labels = '%Y-%m', date_breaks = datebreaks)
            + p9.labs(x='',y='')
            )    
    
        #Creating and saving TL_3 (inside the else branch: plot only exists when data was found)
        if len(data_with_missing_times) > 5:
            plot.save(filename='TL_3.jpeg',
                      path="pdf/iteration/",
                      width=25, height=5,
                      dpi=320)
        else:
            print('Plot not created; no data found.')
    return(print('=================================intensity_TL DONE ============================='))
Example #11
def plot_chart(df, category='temperature', time_interval=None):
    if category not in ['temperature', 'pressure']:
        raise TypeError(
            'category: {} not in temperature or pressure'.format(category))
    elif category == 'temperature':
        key_words = 'TEMP_Value'
    else:
        key_words = 'PSIG_Value'

    var_list = [
        variable for variable in df.columns.values if key_words in variable
    ]
    df_var = df[['Timestamp'] + var_list]
    df_var = df_var.dropna(axis=0)
    df_var = df_var.melt(id_vars=['Timestamp'],
                         var_name='ITEM',
                         value_name='Value')
    df_var['Timestamp'] = pd.to_datetime(df_var['Timestamp'])

    if time_interval is None:
        time_interval = [min(df_var['Timestamp']), max(df_var['Timestamp'])]

    p = (
        ggplot(data=df_var, mapping=aes(x='Timestamp', y='Value')) +
        geom_point(alpha=0.2, mapping=aes(colour='factor(ITEM)'), na_rm=True) +
        geom_line(mapping=aes(colour='factor(ITEM)'), na_rm=True) +
        scale_x_datetime(limits=pd.to_datetime(time_interval),
                         breaks='1 days',
                         date_labels='%y-%m-%d %H:%M') + theme_bw() +
        theme(axis_text_x=element_text(
            angle=45, hjust=0.5, face='bold', color='black'),
              axis_text_y=element_text(face='bold', colour='black'),
              legend_title=element_text(face='bold', colour='black'),
              legend_position='right',
              legend_direction="vertical"))
    ggplot.save(p,
                filename=category + '_chart' + '.png',
                path=os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                  'png'),
                width=8,
                height=6,
                units='in',
                dpi=326,
                verbose=False)
    return p
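
A compact sketch of the wide-to-long reshape that drives the colour mapping above: melt() produces one row per (Timestamp, ITEM) pair, ready for aes(colour='factor(ITEM)'):

import pandas as pd

wide = pd.DataFrame({
    "Timestamp": pd.date_range("2021-01-01", periods=4, freq="h"),
    "A_TEMP_Value": [20.0, 21.0, 21.5, 22.0],
    "B_TEMP_Value": [19.0, 19.5, 20.0, 20.5],
})
long = wide.melt(id_vars=["Timestamp"], var_name="ITEM", value_name="Value")
# long has columns Timestamp / ITEM / Value, with two rows per timestamp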
Example #12
def plot_arima(df):
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    p = (
        ggplot(data=df, mapping=aes(x='Timestamp', y=df.columns.values[1])) +
        geom_point(colour='blue', alpha=0.3, na_rm=True) +
        geom_line(colour='blue', na_rm=True) +
        geom_point(mapping=aes(x='Timestamp', y=df.columns.values[2]),
                   colour='red',
                   alpha=0.3,
                   na_rm=True) +
        geom_line(mapping=aes(x='Timestamp', y=df.columns.values[2]),
                  colour='red',
                  na_rm=True) +
        geom_vline(xintercept=max(df[['Timestamp', df.columns.values[1]
                                      ]].dropna(axis=0)['Timestamp']),
                   color='green',
                   linetype='dashed') +
        # geom_line(mapping=aes(x='Timestamp', y='Lower'), colour='green', na_rm=True, alpha=0.3) +
        # geom_line(mapping=aes(x='Timestamp', y='Upper'), colour='green', na_rm=True, alpha=0.3) +
        geom_ribbon(data=df,
                    mapping=aes(ymin='Lower', ymax='Upper'),
                    fill='red',
                    alpha=0.1) +
        scale_x_datetime(breaks='1 days', date_labels='%y-%m-%d %H:%M') +
        xlab('Time') + ylab(df.columns.values[1]) + theme_bw() +
        theme(axis_text_x=element_text(
            angle=45, hjust=1, face='bold', color='black'),
              axis_text_y=element_text(face='bold', colour='black')))

    ggplot.save(p,
                filename=df.columns.values[1] + '_predict.png',
                path=os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                  'png'),
                width=8,
                height=6,
                units='in',
                dpi=326,
                verbose=False)
    return p
Example #13
def general(Data):
    logging.info('======= Creating general =======')
    print('======= Creating general =======')
    x = Data.Intensity[pd.isna(Data.Intensity)]
    if len(x) == len(Data):
        print("WARNING: All values for Intensity are NA's")
    
    else:
        Data['Minutesss'] = Data['date']
        Data['Minutesss'] = pd.to_datetime(Data['Minutesss'], errors='coerce')
        Data.date= pd.to_datetime(Data.date, errors = 'coerce')
        Data['Minutesss'] = Data['Minutesss'].dt.hour*60 + Data['Minutesss'].dt.minute
        #Data.Intensity = Data.Intensity.astype(str)
        #Data.Intensity = Data.Intensity.astype(float)
        #Data.Intensity.fillna('0', inplace=True)
        plot =(p9.ggplot(data=Data,
                             mapping=p9.aes(x='date',y='Minutesss',
                                            colour = 'Intensity'))
                        + p9.geom_point(size = 2)
                        #+ p9.geom_smooth(method="loess", se=False, color = 'tomato', size = 5)
                        + p9.theme_classic()
                        + p9.scale_colour_gradient(low = "white", high = "red", aesthetics = "colour")
                        + p9.theme(axis_text = p9.element_text(size=18),
                                   axis_title = p9.element_text(size = 18,face = 'bold'),
                                   legend_position = 'none')
                        + p9.scale_x_datetime(date_labels = '%b %y', date_breaks = '6 months')
                        + p9.labs(x='',y='', colour = 'Intensity: ')
                        )
        #Creating and saving TL_1 (inside the else branch: plot only exists when data was found)
        if len(Data) > 0:
            plot.save(filename='TL_1.jpeg',
                      path="pdf/iteration/",
                      width=25, height=5,
                      dpi=320)
        else:
            print('Plot not created; no data found.')
    return(print('=================================general DONE ============================='))
Example #14
    def show_community_prediction(
        self,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        num_samples: int = 1000,
        bins: int = 50,
    ):
        """
        Plot samples from the community prediction on this question

        :param percent_kept: percentage of the sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both', 'lower', or 'upper'
        :param num_samples: number of samples from the community
        :param bins: The number of bins in the histogram, the more bins, the more 'fine grained' the graph. Fewer bins results in more aggregation
        :return: ggplot graphics object
        """
        community_samples = pd.Series([
            self.sample_normalized_community() for _ in range(0, num_samples)
        ])

        (_xmin,
         _xmax) = self.get_central_quantiles(community_samples,
                                             percent_kept=percent_kept,
                                             side_cut_from=side_cut_from)
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])

        df = pd.DataFrame(
            data={"samples": self.denormalize_samples(community_samples)})
        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )
        return (ggplot(df, aes("samples")) +
                geom_histogram(fill="#b3cde3", bins=bins) +
                scale_x_datetime(limits=(_xmin, _xmax)) +
                labs(x="Prediction", y="Counts", title=title_name) +
                ergo_theme +
                theme(axis_text_x=element_text(rotation=45, hjust=1)))
Example #15
def plot_portfolio_vs_benchmark(cumulative_returns, benchmark_cum_returns):
    benchmark_cum_returns = benchmark_cum_returns.rename(
        columns={"benchmark": "returns"})
    benchmark_cum_returns['key'] = "benchmark"
    cumulative_returns['key'] = "portfolio"
    cumulative_returns["returns"] = cumulative_returns["returns"]
    df = cumulative_returns.append(benchmark_cum_returns)
    df.index.name = 'date'
    df.reset_index(level=0, inplace=True)
    df['returns'] = df['returns'] * 100
    warnings.filterwarnings('ignore')
    df.to_csv(data_path + portfolio_name + 'returns.csv', header=True)
    r = (ggplot(df) + aes(x='date', y='returns', color='key', group='key') +
         geom_line() + scale_x_datetime(breaks=date_breaks('1 years'),
                                        labels=date_format('%Y')) +
         theme(axis_text_x=element_text(rotation=90, hjust=1)) +
         labs(title=portfolio_name + 'portfolio vs. benchmark', y='Returns %'))
    r.save(filename=portfolio_name + 'returns.png',
           format="png",
           path=results_path,
           width=6.4,
           height=4.8,
           dpi=125)
    warnings.filterwarnings('default')
Example #16
def plot_drawdowns(cumulative_returns, benchmark_cum_returns):
    """Any time the cumulative returns dips below the current cumulative
    maximum returns, it's a drawdown. Drawdowns are measured as a percentage of
    that maximum cumulative return, in effect, measured from peak equity."""
    benchmark_drawdown = get_drawdown(benchmark_cum_returns)
    benchmark_drawdown = benchmark_drawdown.to_frame()
    benchmark_drawdown = benchmark_drawdown.rename(
        columns={"benchmark": "drawdown"})
    benchmark_drawdown['key'] = "benchmark"
    benchmark_drawdown.index.name = 'date'
    benchmark_drawdown.reset_index(level=0, inplace=True)
    portfolio_drawdown = get_drawdown(cumulative_returns)
    portfolio_drawdown = portfolio_drawdown.to_frame()
    portfolio_drawdown['key'] = "portfolio"
    portfolio_drawdown = portfolio_drawdown.rename(
        columns={"returns": "drawdown"})
    portfolio_drawdown.index.name = 'date'
    portfolio_drawdown.reset_index(level=0, inplace=True)
    mask = benchmark_drawdown.date.isin(portfolio_drawdown.date)
    benchmark_drawdown = benchmark_drawdown[mask]
    df = portfolio_drawdown.append(benchmark_drawdown)
    df.to_csv(data_path + portfolio_name + 'drawdowns.csv', header=True)
    warnings.filterwarnings('ignore')
    d = (ggplot(df) + aes(x='date', y='drawdown', color='key', group='key') +
         geom_line() + scale_x_datetime(breaks=date_breaks('1 years'),
                                        labels=date_format('%Y')) +
         theme(axis_text_x=element_text(rotation=90, hjust=1)) +
         labs(title=portfolio_name + 'portfolio vs. benchmark',
              y='Drawdown % (change peak to trough)'))
    d.save(filename=portfolio_name + 'drawdowns.png',
           format="png",
           path=results_path,
           width=6.4,
           height=4.8,
           dpi=125)
    warnings.filterwarnings('default')
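
Both functions above rely on DataFrame.append(), which was removed in pandas 2.0; the same stacking can be written with pd.concat. A minimal sketch with placeholder frames:

import pandas as pd

portfolio = pd.DataFrame({"date": pd.date_range("2020", periods=3, freq="YS"),
                          "drawdown": [0.0, -5.0, -2.0], "key": "portfolio"})
benchmark = pd.DataFrame({"date": pd.date_range("2020", periods=3, freq="YS"),
                          "drawdown": [0.0, -3.0, -1.0], "key": "benchmark"})
df = pd.concat([portfolio, benchmark], ignore_index=True)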
Example #17
def make_plots(leak_df, time_df, site_df, sim_n, spin_up, output_directory):
    """
    This function makes a set of standard plots to output at end of simulation.
    """
    # Temporarily mute warnings
    warnings.filterwarnings('ignore')
    pn.theme_set(pn.theme_linedraw())

    # Chop off spin-up year (only for plots, still exists in raw output)
    time_df_adj = time_df.iloc[spin_up:, ]

    # Timeseries plots
    plot_time_1 = (
        pn.ggplot(time_df_adj, pn.aes('datetime', 'daily_emissions_kg')) +
        pn.geom_line(size=2) +
        pn.ggtitle('Daily emissions from all sites (kg)') + pn.ylab('') +
        pn.xlab('') + pn.scale_x_datetime(labels=date_format('%Y')) + pn.theme(
            panel_border=pn.element_rect(colour="black", fill=None, size=2),
            panel_grid_minor_x=pn.element_blank(),
            panel_grid_major_x=pn.element_blank(),
            panel_grid_minor_y=pn.element_line(
                colour='black', linewidth=0.5, alpha=0.3),
            panel_grid_major_y=pn.element_line(
                colour='black', linewidth=1, alpha=0.5)))

    plot_time_1.save(output_directory + '/plot_time_emissions_' + sim_n +
                     '.png',
                     width=10,
                     height=3,
                     dpi=300)

    plot_time_2 = (pn.ggplot(time_df_adj, pn.aes('datetime', 'active_leaks')) +
                   pn.geom_line(size=2) +
                   pn.ggtitle('Number of active leaks at all sites') +
                   pn.ylab('') + pn.xlab('') +
                   pn.scale_x_datetime(labels=date_format('%Y')) +
                   pn.theme(panel_border=pn.element_rect(
                       colour="black", fill=None, size=2),
                            panel_grid_minor_x=pn.element_blank(),
                            panel_grid_major_x=pn.element_blank(),
                            panel_grid_minor_y=pn.element_line(
                                colour='black', linewidth=0.5, alpha=0.3),
                            panel_grid_major_y=pn.element_line(
                                colour='black', linewidth=1, alpha=0.5)))

    plot_time_2.save(output_directory + '/plot_time_active_' + sim_n + '.png',
                     width=10,
                     height=3,
                     dpi=300)

    # Site-level plots
    plot_site_1 = (
        pn.ggplot(site_df, pn.aes('cum_frac_sites', 'cum_frac_emissions')) +
        pn.geom_line(size=2) + pn.theme(
            panel_border=pn.element_rect(colour="black", fill=None, size=2),
            panel_grid_minor_x=pn.element_blank(),
            panel_grid_major_x=pn.element_blank(),
            panel_grid_minor_y=pn.element_line(
                colour='black', linewidth=0.5, alpha=0.3),
            panel_grid_major_y=pn.element_line(
                colour='black', linewidth=1, alpha=0.5)) +
        pn.xlab('Cumulative fraction of sites') +
        pn.ylab('Cumulative fraction of emissions') +
        pn.ggtitle('Empirical cumulative distribution of site-level emissions')
    )

    plot_site_1.save(output_directory + '/site_cum_dist_' + sim_n + '.png',
                     width=5,
                     height=4,
                     dpi=300)

    # Leak plots
    plot_leak_1 = (pn.ggplot(leak_df, pn.aes('days_active')) +
                   pn.geom_histogram(colour='gray') +
                   pn.theme(panel_border=pn.element_rect(
                       colour="black", fill=None, size=2),
                            panel_grid_minor_x=pn.element_blank(),
                            panel_grid_major_x=pn.element_blank(),
                            panel_grid_minor_y=pn.element_line(
                                colour='black', linewidth=0.5, alpha=0.3),
                            panel_grid_major_y=pn.element_line(
                                colour='black', linewidth=1, alpha=0.5)) +
                   pn.ggtitle('Distribution of leak duration') +
                   pn.xlab('Number of days the leak was active') +
                   pn.ylab('Count'))
    plot_leak_1.save(output_directory + '/leak_active_hist' + sim_n + '.png',
                     width=5,
                     height=4,
                     dpi=300)

    plot_leak_2 = (pn.ggplot(
        leak_df, pn.aes('cum_frac_leaks', 'cum_frac_rate', colour='status')) +
                   pn.geom_line(size=2) +
                   pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                   pn.theme(panel_border=pn.element_rect(
                       colour="black", fill=None, size=2),
                            panel_grid_minor_x=pn.element_blank(),
                            panel_grid_major_x=pn.element_blank(),
                            panel_grid_minor_y=pn.element_line(
                                colour='black', linewidth=0.5, alpha=0.3),
                            panel_grid_major_y=pn.element_line(
                                colour='black', linewidth=1, alpha=0.5)) +
                   pn.xlab('Cumulative fraction of leak sources') +
                   pn.ylab('Cumulative leak rate fraction') +
                   pn.ggtitle('Fractional cumulative distribution'))

    plot_leak_2.save(output_directory + '/leak_cum_dist1_' + sim_n + '.png',
                     width=4,
                     height=4,
                     dpi=300)

    plot_leak_3 = (pn.ggplot(
        leak_df, pn.aes('cum_frac_leaks', 'cum_rate', colour='status')) +
                   pn.geom_line(size=2) +
                   pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                   pn.theme(panel_border=pn.element_rect(
                       colour="black", fill=None, size=2),
                            panel_grid_minor_x=pn.element_blank(),
                            panel_grid_major_x=pn.element_blank(),
                            panel_grid_minor_y=pn.element_line(
                                colour='black', linewidth=0.5, alpha=0.3),
                            panel_grid_major_y=pn.element_line(
                                colour='black', linewidth=1, alpha=0.5)) +
                   pn.scale_y_continuous(trans='log10') +
                   pn.xlab('Cumulative fraction of leak sources') +
                   pn.ylab('Cumulative emissions (kg/day)') +
                   pn.ggtitle('Absolute cumulative distribution'))

    plot_leak_3.save(output_directory + '/leak_cum_dist2_' + sim_n + '.png',
                     width=4,
                     height=4,
                     dpi=300)

    return
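
The theme block is repeated verbatim for every plot in this function; a hedged refactor is to build it once and add it to each plot:

import plotnine as pn

grid_theme = pn.theme(
    panel_border=pn.element_rect(colour="black", fill=None, size=2),
    panel_grid_minor_x=pn.element_blank(),
    panel_grid_major_x=pn.element_blank(),
    panel_grid_minor_y=pn.element_line(colour='black', linewidth=0.5, alpha=0.3),
    panel_grid_major_y=pn.element_line(colour='black', linewidth=1, alpha=0.5),
)
# each plot then becomes: pn.ggplot(...) + pn.geom_line(...) + grid_theme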
Example #18
    def batch_plots(self):

        # First, put together active leak data and output for live plotting functionality
        # (no AL plot here currently)
        dfs = self.active_leak_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean',
                                                    'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_active_leaks.csv', index=True)

        # Now repeat for emissions (which will actually be used for batch plotting)
        dfs = self.emission_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

            # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean',
                                                    'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True)

        # Make plots from list of dataframes - one entry per dataframe
        pn.theme_set(pn.theme_linedraw())
        plot1 = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program') +
                 pn.geom_ribbon(df_p1, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2) +
                 pn.geom_line(df_p1, pn.aes('datetime', 'mean', colour='program'), size=1) +
                 pn.ylab('Daily emissions (kg/site)') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 pn.scale_y_continuous(trans='log10') +
                 pn.ggtitle('To reduce uncertainty, use more simulations.') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot1.save(self.output_directory + 'program_comparison.png', width=7, height=3, dpi=900)

        # Build relative mitigation plots
        dfs_p2 = dfs.copy()

        for i in dfs_p2[1:]:
            i['mean_dif'] = 0
            i['std_dif'] = 0
            i['mean_ratio'] = 0
            i['std_ratio'] = 0
            for j in range(len(i)):
                ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean']
                ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std']
                alt_mean = i.loc[i.index[j], 'mean']
                alt_std = i.loc[i.index[j], 'std']

                i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean
                i.loc[i.index[j], 'std_dif'] = math.sqrt(
                    math.pow(alt_std, 2) + math.pow(ref_std, 2))
                i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean
                i.loc[i.index[j], 'std_ratio'] = math.sqrt(
                    math.pow((alt_std / alt_mean), 2) + math.pow((ref_std / ref_mean), 2))

        # Build plotting dataframe
        df_p2 = self.dates_trunc.copy().to_frame()
        df_p2['program'] = dfs_p2[1]['program']
        df_p2['mean_dif'] = dfs_p2[1]['mean_dif']
        df_p2['std_dif'] = dfs_p2[1]['std_dif']
        df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio']
        df_p2['std_ratio'] = dfs_p2[1]['std_ratio']

        df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif']
        df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif']
        df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (dfs_p2[1]
                                                        ['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'])
        df_p2['high_ratio'] = dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio']

        pd.options.mode.chained_assignment = None
        for i in dfs_p2[2:]:
            i['low_dif'] = i['mean_dif'] - 2 * i['std_dif']
            i['high_dif'] = i['mean_dif'] + 2 * i['std_dif']
            i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] + 2 * i['std_ratio'])
            i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio']
            short_df = i[['program', 'mean_dif', 'std_dif', 'low_dif',
                          'high_dif', 'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio']]
            short_df['datetime'] = np.array(self.dates_trunc)
            df_p2 = df_p2.append(short_df, ignore_index=True)

        # Make plot 2
        plot2 = (pn.ggplot(None) + pn.aes('datetime', 'mean_dif', group='program') +
                 pn.geom_ribbon(
                     df_p2, pn.aes(ymin='low_dif', ymax='high_dif', fill='program'), alpha=0.2) +
                 pn.geom_line(df_p2, pn.aes('datetime', 'mean_dif', colour='program'), size=1) +
                 pn.ylab('Daily emissions difference (kg/site)') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 pn.ggtitle('Daily differences may be uncertain for small sample sizes') +
                 #        pn.scale_y_continuous(trans='log10') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot2.save(self.output_directory + 'relative_mitigation.png', width=7, height=3, dpi=900)

        # Make plot 3
        plot3 = (pn.ggplot(None) + pn.aes('datetime', 'mean_ratio', group='program') +
                 pn.geom_ribbon(df_p2, pn.aes(
                     ymin='low_ratio', ymax='high_ratio', fill='program'), alpha=0.2) +
                 pn.geom_hline(yintercept=1, size=0.5, colour='blue') +
                 pn.geom_line(df_p2, pn.aes('datetime', 'mean_ratio', colour='program'), size=1) +
                 pn.ylab('Emissions ratio') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 pn.ggtitle(
                     'Blue line represents equivalence. \nIf uncertainty is high, use more '
                     'simulations and/or sites. \nLook also at the ratio of mean daily emissions '
                     'over the entire timeseries.') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot3.save(self.output_directory + 'relative_mitigation2.png', width=7, height=3, dpi=900)

        # ---------------------------------------
        # ------ Figure to compare costs  ------
        dfs = self.cost_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean',
                                                    'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'rolling_cost_estimates.csv', index=True)

        # Make plots from list of dataframes - one entry per dataframe
        pn.theme_set(pn.theme_linedraw())
        plot1 = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program') +
                 pn.geom_ribbon(df_p1, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2) +
                 pn.geom_line(df_p1, pn.aes('datetime', 'mean', colour='program'), size=1) +
                 pn.ylab('Estimated cost per facility') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 # pn.scale_y_continuous(trans='log10') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot1.save(self.output_directory + 'cost_estimate_temporal.png', width=7, height=3, dpi=900)

        ########################################
        # Cost breakdown by program and method
        method_lists = []
        for i in range(len(self.directories)):
            df = pd.read_csv(
                self.output_directory + self.directories[i] + "/timeseries_output_0.csv")
            df = df.filter(regex='cost$', axis=1)
            df = df.drop(columns=["total_daily_cost"])
            method_lists.append(list(df))

        costs = [[] for i in range(len(self.all_data))]
        for i in range(len(self.all_data)):
            for j in range(len(self.all_data[i])):
                simcosts = []
                for k in range(len(method_lists[i])):
                    timesteps = len(self.all_data[i][j][method_lists[i][k]])
                    simcosts.append(
                        (sum(self.all_data[i][j][method_lists[i][k]])/timesteps/self.n_sites)*365)
                costs[i].append(simcosts)

        rows_list = []
        for i in range(len(costs)):
            df_temp = pd.DataFrame(costs[i])
            for j in range(len(df_temp.columns)):
                row = {  # plain dict; avoids shadowing the built-in name `dict`
                    'Program': self.directories[i],
                    'Mean Cost': round(df_temp.iloc[:, j].mean()),
                    'St. Dev.': df_temp.iloc[:, j].std(),
                    'Method': method_lists[i][j].replace('_cost', ''),
                }
                rows_list.append(row)
        df = pd.DataFrame(rows_list)

        # Output Emissions df for other uses
        df.to_csv(self.output_directory + 'cost_comparison.csv', index=True)

        plot = (
            pn.ggplot(
                df, pn.aes(
                    x='Program', y='Mean Cost', fill='Method', label='Mean Cost')) +
            pn.geom_bar(stat="identity") + pn.ylab('Cost per Site per Year') + pn.xlab('Program') +
            pn.scale_fill_hue(h=0.15, l=0.25, s=0.9) +
            pn.geom_text(size=15, position=pn.position_stack(vjust=0.5)) +
            pn.theme(
                panel_border=pn.element_rect(colour="black", fill=None, size=2),
                panel_grid_minor_x=pn.element_blank(),
                panel_grid_major_x=pn.element_blank(),
                panel_grid_minor_y=pn.element_line(
                    colour='black', linewidth=0.5, alpha=0.3),
                panel_grid_major_y=pn.element_line(
                    colour='black', linewidth=1, alpha=0.5)))
        plot.save(self.output_directory + 'cost_comparison.png', width=7, height=3, dpi=900)

        return
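
A self-contained sketch of the per-timestep summary used throughout batch_plots(): each column is one simulation run, and the mean plus a 95% band are computed across runs for every timestep:

import numpy as np
import pandas as pd

runs = pd.DataFrame(np.random.lognormal(size=(100, 20)))  # 100 timesteps x 20 runs
summary = pd.DataFrame({
    "mean": runs.mean(axis=1),
    "low": runs.quantile(0.025, axis=1),
    "high": runs.quantile(0.975, axis=1),
})
# summary feeds geom_ribbon(ymin='low', ymax='high') + geom_line(y='mean')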
Example #19
def marginal_plot(df,
                  x,
                  y,
                  group = None,
                  facet_x = None,
                  facet_y = None,
                  aggfun = 'sum',
                  bins=21,
                  use_quantiles = False,
                  label_pos='auto',
                  label_function=ez_labels,
                  sort_groups=True,
                  base_size=10,
                  figure_size=(6, 3)):

    '''
    Bin the data in a df and plot it using lines.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (eg sum, mean, median ...)
    bins : int or tuple
      number of bins to be used
    use_quantiles : bool
      bin data using quantiles
    label_pos : str
      Use count label on each point. Choose between None, 'auto' or 'force'
    label_function : callable
      labelling function
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object
    '''

    if label_pos not in [None, 'auto', 'force']:
        log.error("label_pos not recognized")
        raise NotImplementedError("label_pos not recognized")
    elif label_pos == 'auto':
        if bins<=21 and group is None:
            show_labels=True
        else:
            show_labels=False
    else:
        show_labels = True if label_pos=='force' else False

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'], [x,  group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c:c for c in tmp_df.columns if c in ['x', 'group', 'facet_x', 'facet_y']}
    new_variables = {'y': 'y'}

    # bin data
    if use_quantiles:
        quantile_groups = [c for c in tmp_df.columns if c in ['group', 'facet_x', 'facet_y']]
        if len(quantile_groups)>0:
            tmp_df['x'] = tmp_df.groupby(quantile_groups)['x'].apply(lambda x: qbin_data(x, bins))
        else:
            tmp_df['x'] = qbin_data(tmp_df['x'], bins)
    else:
        tmp_df['x'], _, _ = bin_data(tmp_df['x'], bins, None)

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, aggfun, fill_groups=False)

    # reorder columns
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y'] if c in gdata.columns]]

    # init plot obj
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"), group=1, colour=colors[0])
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y"), group=1, colour=colors[0])
    else:
        g += p9.geom_line(p9.aes(x="x", y="y", group="factor(group)", colour="factor(group)"))
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        g += p9.scale_color_manual(values=colors)

    # set labels
    if show_labels:
        groups_to_count = [c for c in tmp_df.columns if c in ['x', 'group', 'facet_x', 'facet_y']]
        tmp_df['counts'] = 1
        top_labels = tmp_df \
            .groupby(groups_to_count)['counts'] \
            .sum()\
            .reset_index()
        top_labels['label'] = label_function(top_labels['counts'])
        
        # make sure labels and data can be joined
        for c in ['group', 'facet_x', 'facet_y']:
            if c in tmp_df.columns:
                try:
                    top_labels[c] = pd.Categorical(top_labels[c].astype(str),
                                                   categories=g.data[c].cat.categories,
                                                   ordered=g.data[c].cat.ordered)
                except Exception:
                    pass
        g.data = pd.merge(g.data, top_labels, on=groups_to_count, how='left')
        g.data['label_pos'] = g.data['y'] + \
                    np.sign(g.data['y'])*g.data['y'].abs().max()*0.02

        g += p9.geom_text(p9.aes(x='x', y='label_pos', label='label'),
                          color="#000000",
                          size=base_size * 0.7,
                          ha='center',
                          va='bottom')
    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')
        
    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))
    return g
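
A hypothetical call, assuming marginal_plot and its helpers (EZPlot, agg_data, unname, theme_ez, ...) are importable from the surrounding package:

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.random.randn(1000), "y": np.random.randn(1000)})
g = marginal_plot(df, x="x", y="y", aggfun="mean", bins=15, label_pos="auto")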
Example #20
    def show_prediction(
        self,
        samples,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
        bins: int = 50,
    ):
        """Plot prediction on the true question scale from samples or a submission object. Optionally compare prediction against a sample from the distribution of community predictions

        :param samples: samples from a distribution answering the prediction question (true scale) or a prediction object
        :param percent_kept: percentage of the sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both', 'lower', or 'upper'
        :param show_community: boolean indicating whether comparison to community predictions should be made
        :param num_samples: number of samples from the community
        :param bins: The number of bins in the histogram, the more bins, the more 'fine grained' the graph. Fewer bins results in more aggregation
        :return: ggplot graphics object
        """

        if isinstance(samples, SubmissionMixtureParams):
            prediction = samples
            prediction_normed_samples = pd.Series([
                logistic.sample_mixture(prediction)
                for _ in range(0, num_samples)
            ])
        else:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            if not isinstance(samples, (pd.Series, np.ndarray)):
                raise ValueError(
                    "Samples should be a list, numpy array or pandas series")
            num_samples = samples.shape[0]
            prediction_normed_samples = self.normalize_samples(samples)

        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )

        if show_community:
            df = pd.DataFrame(
                data={
                    "community": [  # type: ignore
                        self.sample_normalized_community()
                        for _ in range(0, num_samples)
                    ],
                    "prediction":
                    prediction_normed_samples,  # type: ignore
                })
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)
            _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
            df["prediction"] = self.denormalize_samples(df["prediction"])
            df["community"] = self.denormalize_samples(df["community"])

            df = pd.melt(df, var_name="sources",
                         value_name="samples")  # type: ignore
            return (ggplot(df, aes("samples", fill="sources")) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_histogram(position="identity", alpha=0.9) +
                    scale_x_datetime(limits=(_xmin, _xmax)) +
                    facet_wrap("sources", ncol=1) + labs(
                        x="Prediction",
                        y="Counts",
                        title=title_name,
                    ) + guides(fill=False) + ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
        else:
            (_xmin, _xmax) = self.get_central_quantiles(
                prediction_normed_samples,
                percent_kept=percent_kept,
                side_cut_from=side_cut_from,
            )
            _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
            df = pd.DataFrame(data={
                "prediction":
                self.denormalize_samples(prediction_normed_samples)
            })
            return (ggplot(df, aes("prediction")) +
                    geom_histogram(fill="#b3cde3", bins=bins)
                    # + coord_cartesian(xlim = (_xmin,_xmax))
                    + scale_x_datetime(limits=(_xmin, _xmax)) +
                    labs(x="Prediction", y="Counts", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
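A hedged usage sketch: `question` below is hypothetical (any instance of the unshown class exposing show_prediction, e.g. an ergo question object), and the lognormal samples are invented for illustration.

import numpy as np

# hypothetical: `question` is an instance exposing show_prediction (not defined above)
samples = np.random.lognormal(mean=3.0, sigma=0.5, size=2000)
gg = question.show_prediction(
    samples,
    percent_kept=0.9,
    side_cut_from="upper",
    show_community=True,
    bins=40,
)
gg.draw()  # render the returned ggplot object (or display `gg` in a notebook)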
Example #21
def area_plot(df,
              x,
              y,
              group=None,
              facet_x=None,
              facet_y=None,
              aggfun='sum',
              fill=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Aggregates data in df and plots as a stacked area chart.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (i.e. color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (e.g. sum, mean, median, ...)
    fill : bool
      plot shares for each group instead of absolute values
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZPlot object

    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True)
    gdata['y'] = gdata['y'].fillna(0)
    gdata = gdata[[
        c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    if fill:
        groups_to_normalize = [
            c for c in ['x', 'facet_x', 'facet_y'] if c in gdata.columns
        ]
        total_values = gdata \
            .groupby(groups_to_normalize)['y'] \
            .sum() \
            .reset_index() \
            .rename(columns = {'y':'tot_y'})
        gdata = pd.merge(gdata, total_values, on=groups_to_normalize)
        gdata['y'] = gdata['y'] / (gdata['tot_y'] + EPSILON)
        gdata.drop('tot_y', axis=1, inplace=True)
        ylabeller = percent_labels
    else:
        ylabeller = ez_labels

    # get plot object
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_area(p9.aes(x="x", y="y"),
                          colour=None,
                          fill=ez_colors(1)[0],
                          na_rm=True)
    else:
        g += p9.geom_area(p9.aes(x="x",
                                 y="y",
                                 group="factor(group)",
                                 fill="factor(group)"),
                          colour=None,
                          na_rm=True)
        g += p9.scale_fill_manual(values=colors)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ylabeller,
                               expand=[0, 0, 0.1 * (not fill) + 0.03, 0])

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True),
                       color=p9.guide_legend(reverse=True))

    return g
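For illustration, a minimal call to the area_plot defined above; the dataframe and column names are invented:

import numpy as np
import pandas as pd

# toy monthly data with two groups (invented for illustration)
toy = pd.DataFrame({
    'month': pd.date_range('2020-01-01', periods=12, freq='MS').repeat(2),
    'segment': ['A', 'B'] * 12,
    'revenue': np.random.rand(24) * 100,
})

# absolute stacked areas; fill=True would plot each group's share instead
g = area_plot(toy, x='month', y='revenue', group='segment')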
Example #22
                                     group="group",
                                     colour="group"),
                              na_rm=True)

            g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))
            g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
Example #23
def scatter_plot(df,
                 x,
                 y,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 base_size=10,
                 figure_size=(6, 3),
                 **kwargs):
    '''
    Aggregates data in df and plots as a scatter plot.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (i.e. color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size
    **kwargs:
      additional kwargs passed to geom_point

    Returns
    -------
    g : EZPlot
      EZPlot object

    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=True)
    gdata = gdata[[
        c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    # add group_x column
    if group is not None:
        gdata['group_x'] = gdata['group'].astype(str) + '_' + gdata['x'].astype(str)

    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_point(p9.aes(x="x", y="y"),
                           colour=ez_colors(1)[0],
                           **kwargs)
    else:
        g += p9.geom_point(
            p9.aes(x="x", y="y", group="factor(group)", color="factor(group)"),
            **kwargs)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    if g.column_is_timestamp('y'):
        g += p9.scale_y_datetime()
    elif g.column_is_categorical('y'):
        g += p9.scale_y_discrete()
    else:
        g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
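A quick sketch of calling scatter_plot (columns invented); extra keyword arguments such as alpha and size pass straight through to geom_point:

import numpy as np
import pandas as pd

# toy data with a categorical group column (invented for illustration)
toy = pd.DataFrame({
    'height': np.random.normal(170, 10, 200),
    'weight': np.random.normal(70, 12, 200),
    'sex': np.random.choice(['M', 'F'], 200),
})

g = scatter_plot(toy, x='height', y='weight', group='sex', alpha=0.5, size=2)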
Example #24
def line_plot(df,
              x,
              y,
              group=None,
              facet_x=None,
              facet_y=None,
              aggfun='sum',
              err=None,
              show_points=False,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Aggregates data in df and plots multiple columns as a line chart.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str or list of str
      quoted expression(s) to be plotted on the y axis
    group : str
      quoted expression to be used as group (i.e. color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (e.g. sum, mean, median, ...)
    err : str
      quoted expression to be used as error shaded area
    show_points : bool
      show/hide markers
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZPlot object

    '''

    if group is not None and isinstance(y, list) and len(y) > 1:
        log.error(
            "groups can be specified only when a single y column is present")
        raise ValueError(
            "groups can be specified only when a single y column is present")

    if err is not None and isinstance(y, list) and len(y) > 1:
        log.error(
            "err can be specified only when a single y column is present")
        raise ValueError(
            "err can be specified only when a single y column is present")

    if isinstance(y, list) and len(y) == 1:
        y = y[0]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    if isinstance(y, list):

        ys = []
        for i, var in enumerate(y):
            ys.append('y_{}'.format(i))
            names['y_{}'.format(i)], variables['y_{}'.format(i)] = unname(var)

        # aggregate data
        tmp_gdata = agg_data(dataframe,
                             variables,
                             groups,
                             aggfun,
                             fill_groups=True)
        groups_present = [
            c for c in ['x', 'facet_x', 'facet_y'] if c in tmp_gdata.columns
        ]
        gdata = pd.melt(tmp_gdata,
                        groups_present,
                        var_name='group',
                        value_name='y')
        gdata['group'] = gdata['group'].replace({var: names[var] for var in ys})

        # update values for plotting
        names['y'] = 'Value'
        names['group'] = 'Variable'
        group = 'Variable'

    else:

        names['y'], variables['y'] = unname(y)
        if err is not None:
            names['err'], variables['err'] = unname(err)

        # aggregate data
        gdata = agg_data(dataframe,
                         variables,
                         groups,
                         aggfun,
                         fill_groups=True)

    # reorder columns
    gdata = gdata[[
        c for c in ['x', 'y', 'err', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]
    if err is not None:
        gdata['ymax'] = gdata['y'] + gdata['err']
        gdata['ymin'] = gdata['y'] - gdata['err']

    # init plot obj
    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"),
                          group=1,
                          colour=ez_colors(1)[0])
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y"),
                               group=1,
                               colour=ez_colors(1)[0])
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x", ymax="ymax", ymin="ymin"),
                                group=1,
                                fill=ez_colors(1)[0],
                                alpha=0.2)
    else:
        g += p9.geom_line(
            p9.aes(x="x", y="y", group="factor(group)",
                   colour="factor(group)"))
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x",
                                       ymax="ymax",
                                       ymin="ymin",
                                       fill="factor(group)"),
                                alpha=0.2)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))
        g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
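A usage sketch for line_plot (data invented). Passing a list of y columns triggers the melt branch above, so the legend is labelled 'Variable' and group/err must be omitted:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'day': pd.date_range('2021-01-01', periods=30, freq='D'),
    'visits': np.random.poisson(100, 30),
    'signups': np.random.poisson(10, 30),
})

# two y columns are melted into a single 'Variable' group internally
g = line_plot(toy, x='day', y=['visits', 'signups'], show_points=True)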
Example #25
    def _scale_x(self, xmin: float = None, xmax: float = None):
        return scale_x_datetime(limits=(xmin, xmax))
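For context, a helper like _scale_x typically lives on a plotting class for time-indexed data. A minimal sketch, assuming a wrapper class and column names that are not part of the original fragment:

from typing import Optional

import pandas as pd
import plotnine as p9
from plotnine import scale_x_datetime


class TimeSeriesPlot:
    """Hypothetical wrapper that clips a datetime x axis to [xmin, xmax]."""

    def __init__(self, df: pd.DataFrame):
        self.df = df  # assumed to hold 'date' and 'value' columns

    def _scale_x(self, xmin: Optional[float] = None, xmax: Optional[float] = None):
        # None endpoints fall back to the data's own limits
        return scale_x_datetime(limits=(xmin, xmax))

    def plot(self, xmin=None, xmax=None):
        return (p9.ggplot(self.df, p9.aes(x='date', y='value'))
                + p9.geom_line()
                + self._scale_x(xmin, xmax))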