示例#1
0
def make_hist(df, title, num_bins=8, code_percentile=.99):
    """
    Draw a seaborn distribution plot of ``df``.

    ``title``, ``num_bins`` and ``code_percentile`` are still processed
    (title-cased, capped at the number of distinct non-null values, used
    for a quantile lookup), but none of those results is passed on to the
    plot call below.
    """
    title = title.title()
    observed = df.dropna().tolist()
    top_code_val = df.quantile(code_percentile)
    # Never keep more bins than there are distinct observed values.
    num_bins = min(len(set(observed)), num_bins)

    # NOTE(review): an earlier matplotlib implementation used the values
    # computed above (top-coded bin edges, axis label, title, PNG export);
    # the current seaborn call uses none of them.
    sns.displot(df)
示例#2
0
def plot_density_plots(metric):
    """Save a stacked distribution plot of the distances for ``metric``.

    Reads ``../generated_data/data_{metric}.csv``, drops the
    ``Unnamed: 0`` column and rows with missing values, renames ``value``
    to ``distance`` and plots it coloured by ``variable``. The figure is
    written below ``DOTENV_KEY2VAL["GEN_FIGURES_DIR"]``.
    """
    frame = pd.read_csv(f"../generated_data/data_{metric}.csv")
    frame = frame.drop(columns="Unnamed: 0")
    frame = frame.dropna()
    frame = frame.rename(columns={"value": "distance"})

    sns.displot(data=frame, x="distance", hue="variable",
                multiple="stack", height=6, aspect=0.7)
    # Leave head-room above the axes for the wrapped title.
    plt.subplots_adjust(top=0.85)

    heading = (f"Distribution of the {metric} distances "
               f"among diagnostic categories")
    plt.title("\n".join(wrap(heading, 50)))

    target = DOTENV_KEY2VAL["GEN_FIGURES_DIR"] + metric + "_histogram.png"
    plt.savefig(target)
def plot_median_score_distribution(df, title, path, file_name):
    """Plot stacked histograms of ``median_scores`` per dose and save them.

    Args:
        df: Data containing ``median_scores`` and ``dose`` columns.
        title: Text used as the figure-level title.
        path: Output directory; created (including parents) when missing.
        file_name: Name of the image file written inside ``path``.
    """
    # makedirs(exist_ok=True) also creates missing parent directories and
    # does not fail if the directory appears between a check and creation.
    os.makedirs(path, exist_ok=True)

    dis_plt = sns.displot(df, x="median_scores", hue="dose", kind="hist",
                          multiple="stack", palette='viridis', height=6.5, aspect=1.7)
    dis_plt.fig.suptitle(title)
    # Make room above the axes for the suptitle.
    dis_plt.fig.subplots_adjust(top=.92)
    plt.savefig(os.path.join(path, file_name))
    plt.show()
def plot_p_value_dist(df, path, file_name):
    """Plot the p-value frequency distribution per dose and save the figure.

    Args:
        df: Data containing ``p_values`` and ``dose`` columns.
        path: Output directory; created (including parents) when missing.
        file_name: Name of the image file written inside ``path``.
    """
    # makedirs(exist_ok=True) also creates missing parent directories and
    # does not fail if the directory appears between a check and creation.
    os.makedirs(path, exist_ok=True)

    # One facet per dose, three facets per row.
    dis_plt = sns.displot(df, x="p_values", col="dose", col_wrap=3, binwidth=0.03)
    dis_plt.fig.suptitle("P-values distribution across all doses(1-6)", size=16)
    # Make room above the facets for the suptitle.
    dis_plt.fig.subplots_adjust(top=.92)
    plt.savefig(os.path.join(path, file_name))
    plt.show()
示例#5
0
def general_plots():
    """Produce the overview figures for the scRNA-Seq analysis.

    Writes, in order: the highest-expressed-genes plot before filtering,
    the highly-variable-genes plot after QC filtering, a UMAP coloured by
    ``phase``, and a distribution plot of the flattened ``adata.X`` values.
    Scanpy figures are saved under ``figures/`` and then renamed.
    """
    adata, phases = read_counts_and_phases("Tpms", False, "protein_coding")

    # QC plot before any filtering.
    sc.pl.highest_expr_genes(adata, n_top=20, show=False, save=True)
    shutil.move("figures/highest_expr_genes.pdf",
                "figures/highest_expr_genes_AllCells.pdf")

    # QC after filtering: log-normalise, keep the "blob".
    adata, phasesfilt = qc_filtering(adata, True, False)
    sc.pp.highly_variable_genes(
        adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata, show=False, save=True)
    shutil.move("figures/filter_genes_dispersion.pdf",
                "figures/filter_genes_dispersionAllCells.pdf")

    # UMAP: do the cell-cycle phases cluster together when all genes are
    # used? Neighbourhood graph first, then the embedding, then the plot.
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adata)
    plt.rcParams.update({'figure.figsize': (10, 10)})
    sc.pl.umap(adata, color=["phase"], show=False, save=True)
    shutil.move("figures/umap.pdf", "figures/umapAllCellsSeqCenterPhase.pdf")

    # Overall distribution of RNA abundances (TPM) across all rows of X.
    abundances = np.concatenate(adata.X)
    sbn.displot(abundances, color="tab:orange")
    plt.xlabel("TPM")
    plt.ylabel("Density")
    plt.savefig("figures/rna_abundance_density.pdf")
    plt.close()
 def _plot_metric_nodes_distribution(self,
                                     leaf_metrics: pd.DataFrame,
                                     dist_type: str = 'kde') -> None:
     """Plot and save the distribution of the metric column per node rule.

     The plot uses ``self.metric_col`` on the x axis, one hue per value of
     the node-rules column, and normalises each hue group independently.
     The PNG name is built from ``self.metric_col`` and
     ``self.dataset_name``; the figure is shown afterwards.
     """
     plot_options = dict(
         data=leaf_metrics,
         x=self.metric_col,
         hue=self.__NODE_RULES__COL,
         kind=dist_type,
         common_norm=False,
     )
     grid = sns.displot(**plot_options)
     target = f"bias_distribution_{self.metric_col}-{self.dataset_name}.png"
     grid.savefig(target, dpi=600)
     plt.show()
示例#7
0
    def distToCentroid(self, labels, distances):
        """
        Compute and plot the distance of each point to the centroid of its cluster.

        input

            labels: iterable with the cluster label of each point
            distances: iterable with, for each point, its distances to the
                cluster centroids (indexable, at least three entries)

        output

            distribution plot of the smallest centroid distance of each
            point, coloured by cluster; the table is kept in
            ``self.clustersPCA``
        """
        self.clustersPCA = pd.DataFrame(
            [list(pair) for pair in zip(labels, distances)],
            columns=['cluster', 'distance'])
        self.clustersPCA['distanceToCluster'] = self.clustersPCA['distance'].apply(lambda x: min(x))
        self.clustersPCA['distToCluster1'] = self.clustersPCA['distance'].apply(lambda x: x[0])
        self.clustersPCA['distToCluster2'] = self.clustersPCA['distance'].apply(lambda x: x[1])
        self.clustersPCA['distToCluster3'] = self.clustersPCA['distance'].apply(lambda x: x[2])
        # Shift labels from 0-based to 1-based. Assign the result back: an
        # in-place replace on an attribute-accessed column works on a
        # temporary object under pandas Copy-on-Write and would leave the
        # frame unchanged.
        self.clustersPCA['cluster'] = self.clustersPCA['cluster'].replace({0: 1, 1: 2, 2: 3})
        sns.displot(data=self.clustersPCA, x='distanceToCluster', hue='cluster', kde=True)
        plt.show()
示例#8
0
def plot_island_distribution(data: PlotData, island_size_: int, tmp_dir: Path):
    """Write KDE plots of island occurrences for one island size.

    The distributions are first dumped to ``out_{island_size_}.csv`` in
    ``tmp_dir`` and read back. Two figures are then saved in
    ``data.out_dir``: one over ``ln_avg_occurr`` (file prefixed with
    ``normalized_``) and one over ``avg_occurr``, both coloured by
    ``edge_length``.
    """
    csv_path = Path(tmp_dir, f"out_{island_size_}.csv")
    with time_func(f"Populating the CSV at {csv_path}"):
        populate_csv(csv_path, data.distributions, [island_size_])
    with time_func("Reading the CSV"):
        frame = pd.read_csv(csv_path)

    base_title = f"island_size_{island_size_}"
    # (column to plot, title prefix) -- the normalised variant comes first.
    variants = (("ln_avg_occurr", "normalized_"), ("avg_occurr", ""))
    for column, prefix in variants:
        with time_func("Displaying the dataset:"):
            sns.displot(
                frame,
                x=column,
                hue="edge_length",
                kind="kde",
                palette=sns.color_palette("Paired", data.lambdas))
        title = prefix + base_title
        plt.title(title)
        plt.savefig(str(Path(data.out_dir, f"{title}.png")))
示例#9
0
def map_total_calc_cov(target_moment_df,
                       col_name,
                       title,
                       file_path: str = None):
    """Plot the distribution of ``col_name`` and optionally save it.

    Args:
        target_moment_df: Data frame holding the column to plot.
        col_name: Name of the column drawn on the x axis.
        title: Title placed on the plot.
        file_path: Where to save the figure. When ``None`` (the default)
            nothing is deleted or written; the previous implementation
            raised ``TypeError`` from ``path.exists(None)`` in that case.
    """
    # Remove a previously saved file with the same name, if any.
    if file_path is not None and path.exists(file_path):
        os.remove(file_path)

    sns_plt = sns.displot(data=target_moment_df, x=col_name)
    plt.title(title)
    if file_path is not None:
        sns_plt.savefig(file_path)
示例#10
0
def analysis(request):
    """Render the dashboard with regression coefficients for the housing data.

    Saves two bivariate KDE images (price vs. rooms, price vs. house age),
    fits a linear regression of ``Price`` on the remaining columns (after
    dropping ``Address``) with a 70/30 train/test split, and passes the
    rounded coefficients and an error figure to ``dash.html``.
    """
    data = pd.read_csv(
        r"C:/Users/Ajay's/Desktop/Prediction/HousePricePrediction/ml/static/js/USA_Housing.csv"
    )
    sns.displot(data=data,
                x="Price",
                y="Avg. Area Number of Rooms",
                kind="kde",
                rug=True)
    plt.savefig(
        r"C:/Users/Ajay's/Desktop/Prediction/HousePricePrediction/ml/static/img/price_and_room.png"
    )
    # pyplot keeps every figure alive until it is closed; without this each
    # request would leak a figure in the long-running server process.
    plt.close()

    sns.displot(data=data, x="Price", y="Avg. Area House Age", kind="kde")
    plt.savefig(
        r"C:/Users/Ajay's/Desktop/Prediction/HousePricePrediction/ml/static/img/price_and_house.png"
    )
    plt.close()

    data = data.drop(['Address'], axis=1)
    X = data.drop('Price', axis=1)
    Y = data['Price']
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.30)
    model = LinearRegression()
    model.fit(X_train, Y_train)
    coeff_df = pd.DataFrame(model.coef_, X.columns, columns=['Cofficient'])

    # Template variables v1..v5, in this fixed feature order.
    features = [
        'Avg. Area Income',
        'Avg. Area House Age',
        'Avg. Area Number of Rooms',
        'Avg. Area Number of Bedrooms',
        'Area Population',
    ]
    context = {
        f"v{position}": round(coeff_df.at[feature, 'Cofficient'], 3)
        for position, feature in enumerate(features, start=1)
    }

    prediction = model.predict(X_test)
    # NOTE(review): this is the square root of the mean *absolute* error,
    # not RMSE -- confirm which metric the dashboard is meant to show.
    context["error"] = round(
        np.sqrt(metrics.mean_absolute_error(Y_test, prediction)), 3)
    return render(request, 'dash.html', context)
示例#11
0
def plot_congestion_dist(columns, dataframe, path, prefix, save, show):
    """Plot (and optionally save/show) the distribution of each attribute.

    Args:
        columns: Names of the ``dataframe`` columns to plot, one figure each.
        dataframe: Data passed to seaborn.
        path: Prefix of the output location; concatenated as-is.
        prefix: File-name prefix placed before ``_congestion_dist_``.
        save: When true, write ``<path><prefix>_congestion_dist_<atr>.pdf``.
        show: When true, display the figure; otherwise close it.
    """
    # Human-readable x-axis labels. Looked up by equality: the former
    # ``atr is '<literal>'`` chain compared object identity, which only
    # matches when both strings happen to be the same interned object.
    axis_labels = {
        'TempExMax': 'Maximum temporal extend [min]',
        'SpatExMax': 'Maximum spatial extend [m]',
        'TempDist': 'Minimum temporal distance to incident [min]',
        'SpatDist': 'Minimum spatial distance to incident [m]',
        'temporalGlobalLoc': 'Relative temporal location',
        'spatialGlobalLoc': 'Relative spatial location',
        'temporalInternalLoc': 'Internal relative temporal location',
        'spatialInternalLoc': 'Internal relative spatial location',
        # Raw string: keeps the literal backslash without relying on an
        # invalid escape sequence.
        'Coverage': r'Ratio of jammed cells in covering rectangle [\%]',
        'TimeLossCar': 'Time loss per Cars [s]',
        'TimeLossHGV': 'Time loss per HGVs [s]',
    }
    for atr in columns:
        # NOTE(review): sns.displot is figure-level and creates its own
        # figure, so the figure, title and y-label set here do not end up
        # on the distribution plot -- confirm the intended layout.
        plt.figure(figsize=set_size(418))
        plt.style.use('seaborn')
        plt.rcParams.update(tex_fonts)
        plt.title('Distribution of ' + atr)
        plt.ylabel('Count')
        sns.displot(x=atr, data=dataframe, palette='Spectral')
        # Unknown attributes fall back to their own name.
        plt.xlabel(axis_labels.get(atr, atr))
        if save:
            plt.savefig(path + prefix + '_congestion_dist_' + atr + '.pdf')
            if not show:
                plt.close()
        if show:
            plt.show()
        else:
            plt.close()
示例#12
0
def hist_pair2(df, stat, col2, cum=False, title=None):
    """Plot histograms (or ECDFs) of population-pair statistics.

    Columns are expected to be named like::

        dd12_{n}_{k}_pop{kj}
        ddRank12_{n}_{k}_pop{kj}

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table whose column names start with ``stat`` followed by
        ``_``-separated fields.
    stat : str
        Statistic to plot; columns whose first ``_`` field equals it are used.
    col2 : str
        Name given to the melted value column and used as the x variable.
    cum : bool, optional
        Plot empirical CDFs instead of histograms. The default is False.
    title : str, optional
        Accepted for backward compatibility; not used by the plot.
        The default is None.

    Returns
    -------
    None.
        The figure is saved as ``{stat}.histpair.pdf`` or
        ``{stat}.cumhistpair.pdf``.

    """
    stat_cols = [col for col in df.columns if stat == col.split("_")[0]]
    df_stat = pd.melt(df.filter(regex=f"{stat}"),
                      value_vars=stat_cols,
                      var_name=stat,
                      value_name=col2)
    # Split the original column names into their fields: the last field is
    # the population pair, the third one the sub-population.
    pop = df_stat[stat].str.split("_", n=-1, expand=True)
    df_stat["pops"] = pop[pop.columns[-1]]
    df_stat["subpop"] = pop[2]
    df_stat[stat] = pop[0]

    # Both variants need the figure-level displot: the axes-level histplot
    # accepts neither ``col`` nor ``kind`` and returns an Axes, which has
    # no ``savefig``.
    kind = "ecdf" if cum else "hist"
    name = "cum" if cum else ""
    g = sns.displot(data=df_stat,
                    x=col2,
                    hue="subpop",
                    col="pops",
                    kind=kind)
    g.savefig(f"{stat}.{name}histpair.pdf", bbox_inches='tight')
示例#13
0
def kolm_test(file, pref, source, target, meta_file):
    """
    Performs KS test for optimization process

    Args:
        file (str): Path to the file with TopoCMap output table
        pref (str): Prefix to all statistical files (e.g. histograms etc.)
        source (str): Source cell type
        target (str): Target cell type
        meta_file (str): Path to the file with drugs metadata

    Returns:
        statistics (:obj:`list` of :obj:`tuple` of :obj:`float`): Output of ks_2samp function for all 10 iterations
        mean_1 (float): Mean of 'Golden Standard' molecules distribution
        mean_2 (float): Mean of means of all molecules distribution

    """
    dist = []
    cids = []
    cmap_db = pd.read_csv(file)
    drug_meta = pd.read_csv(meta_file)
    cids_cur = pd.unique([float(cid) for cid in stand_chems(source, target)])

    # Perturbation ids of the drugs whose PubChem CID is a reference CID.
    pert_cur = []
    for row_label, pubchem_cid in drug_meta['pubchem_cid'].items():
        for reference_cid in cids_cur:
            try:
                if int(pubchem_cid) == int(reference_cid):
                    pert_cur.append(drug_meta['pert_id'].loc[row_label])
            except ValueError:
                # Values that cannot be converted to int are skipped.
                continue

    # Cosine distances of the reference perturbations in the output table.
    for row_label, pert_id in cmap_db['pert_id'].items():
        for wanted in pert_cur:
            if pert_id == wanted:
                cids.append(pert_id)
                dist.append(cmap_db['cosine_dist'].loc[row_label])

    mean_1 = np.mean(dist)
    # The random baseline is drawn from everything that is not a reference.
    cmap_db = cmap_db[~cmap_db["pert_id"].isin(pert_cur)]
    print(len(cmap_db))

    statistics = []
    means = []
    for iteration in range(10):
        dist_rand = np.random.choice(list(cmap_db['cosine_dist']), len(dist))
        means.append(np.mean(dist_rand))
        statistics.append(tuple(stats.ks_2samp(dist, dist_rand)))
        sns_plot = sns.displot([dist, dist_rand], kde=True)
        sns_plot.savefig(pref + str(iteration) + ".png")
    mean_2 = np.mean(means)
    return statistics, mean_1, mean_2
示例#14
0
def plot_2d_kde(x, y, hue, data):
    """
    Plot a bivariate kernel density estimate

    Parameters
    ----------
    x : array or str
        x-axis variable (or its column name in ``data``)
    y : array or str
        y-axis variable (or its column name in ``data``)
    hue : array or str
        Variable to map to colors in order to visually distinguish separate bivariate densities
    data : DataFrame
        Data set passed to seaborn as ``data``

    Returns
    -------
    None
    """
    # displot is figure-level and always creates its own figure, so a
    # preceding plt.figure(figsize=(16, 16)) only produced an extra blank
    # window. The 16x16 inch size is requested through height/aspect.
    sns.displot(x=x, y=y, hue=hue, kind='kde', data=data, height=16, aspect=1)

    plt.show()
示例#15
0
def plot_cont(df, plt_typ):
    """Plot the numeric columns of ``df`` two per figure, side by side.

    Args:
        df: Data frame; only its numeric columns are plotted.
        plt_typ: ``'boxplot'`` for box plots, ``'displot'`` (or the alias
            ``'distplot'``) for histograms. Any other value prints a hint
            and returns without plotting.

    Columns are taken in pairs; with an odd number of numeric columns the
    last one is left out, as before.
    """
    if plt_typ not in ('boxplot', 'displot', 'distplot'):
        print('Pass either displot/boxplot')
        return

    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

    # Stop before an unpaired trailing column so no empty figure is shown.
    for i in range(0, len(numeric_columns) - 1, 2):
        # Histograms use the axes-level histplot: the figure-level displot
        # opens its own figure and cannot draw into a subplot. The former
        # 'displot' branch also drew a boxplot in the left panel.
        draw = sns.boxplot if plt_typ == 'boxplot' else sns.histplot

        plt.figure(figsize=(10, 4))
        for position, column in zip((121, 122), numeric_columns[i:i + 2]):
            plt.subplot(position)
            draw(df[column])

        plt.tight_layout()
        plt.show()
示例#16
0
 def GCdistribut(self, _indf, out, X='query_length', Dup=None, log=False, title=''):
     """Save a histogram with KDE of column ``X``; do nothing for empty input.

     Args:
         _indf: Input data frame; it is copied, not modified.
         out: Path of the image file to write.
         X: Column to plot; its values are cast to float.
         Dup: Optional column names; when given, only these columns plus
             ``X`` are kept and duplicate rows are dropped first.
         log: Passed to seaborn as ``log_scale``.
         title: Optional plot title.
     """
     if _indf.empty:
         return

     indef = _indf.copy()
     if Dup:
         # list(...) so any sequence of column names is accepted; the
         # default is None rather than a shared mutable list.
         indef = indef[list(Dup) + [X]].drop_duplicates(keep='first')
     indef[X] = indef[X].astype(float)
     dp = sns.displot(data=indef, x=X, kde=True, log_scale=log)
     dp.set_xticklabels(rotation=270)
     if title:
         plt.title(title)
     plt.tight_layout()
     plt.savefig(out)
     plt.close()
示例#17
0
def create_fig(data, draw=False):
    """Show a histogram of edges per node unless ``draw`` is ``False``.

    ``data`` is turned into a two-column frame (``Node``, ``Edges``); the
    plot uses one more bin than the largest ``Edges`` value and overlays
    a KDE. Nothing is returned.
    """
    # Identity check on purpose: only the literal False disables drawing.
    if draw is False:
        return

    frame = pd.DataFrame(data)
    frame.columns = ['Node', 'Edges']
    edge_counts = frame['Edges']

    sns.displot(edge_counts, bins=max(edge_counts) + 1, kde=True)

    plt.title('Amount of edges per node')
    plt.show()
def DerCore_Ratio(enh):
    """Count ``arch`` entries per enhancer/core group and plot their ECDF.

    Only rows with ``core_remodeling == 1`` are used. The counts of the
    groups with ``core == 0`` are summarised with ``describe()`` (the
    return value) and drawn as an ECDF saved to ``{RE}cdf_n_der.pdf``.
    """
    count_col = "num derived regions per enh"

    remodeled = enh.loc[enh["core_remodeling"] == 1]
    cdratio = remodeled.groupby(['enh_id', 'core'])["arch"].count().reset_index()
    cdratio.columns = ["enh_id", "core", count_col]

    # presumably core == 0 marks the derived regions -- verify upstream
    non_core = cdratio["core"] == 0
    data = cdratio.loc[non_core].describe()

    sns.set("talk")
    sns.displot(cdratio.loc[non_core, count_col], kind="ecdf")
    plt.savefig(f"{RE}cdf_n_der.pdf", bbox_inches="tight")

    return data
示例#19
0
def show_sentence_length():
    """Show sentence-length plots for the training and validation sets.

    For ``train_data`` and then ``valid_data``: adds a ``sentence_length``
    column holding ``len()`` of each ``sentence`` and shows a count plot
    (x ticks hidden) followed by a distribution plot (y ticks hidden).
    """
    for dataset in (train_data, valid_data):
        dataset['sentence_length'] = [
            len(sentence) for sentence in dataset['sentence']
        ]
        lengths = dataset["sentence_length"]

        # Count plot of the sentence lengths.
        sns.countplot(lengths)
        plt.xticks([])
        plt.show()

        # Distribution plot of the sentence lengths.
        sns.displot(lengths)
        plt.yticks([])
        plt.show()
 def _plot_univariate(y_test, y_sampled):
     """Return a KDE plot (with rug) comparing original and sampled values.

     Both inputs are stacked into one ``value`` column; a ``type`` column
     labels entries from ``y_test`` as ``original`` and entries from
     ``y_sampled`` as ``sampled`` and is used as hue.
     """
     groups = [y_test, y_sampled]
     # Group number (1 or 2) repeated once per element of that group.
     group_ids = [[number] * len(group)
                  for number, group in enumerate(groups, start=1)]
     table = np.array([np.concatenate(groups), np.concatenate(group_ids)]).T
     df = pd.DataFrame(table, columns=['value', 'type'])
     df['type'] = df['type'].map({1: 'original', 2: 'sampled'})
     return sns.displot(df, x='value', rug=True, kind='kde',
                        color='black', hue='type')
示例#21
0
 def get_charging_time(self):
     """Plot the PDF of the EV charging time and save it as a PNG.

     Each positive entry of ``self.ev_charging_time`` contributes
     ``int(entry)`` samples of its hour of day; positions map to hours
     1..24 and wrap around after 24.
     """
     sns.set_theme(style="darkgrid")
     # Turn the curve vector into a vector of hours.
     nem_charging_time = []
     for position, item in enumerate(self.ev_charging_time):
         if item > 0:
             hour = position % 24 + 1
             nem_charging_time.extend([hour] * abs(int(item)))
     df = pd.DataFrame({"value": nem_charging_time})
     sns.displot(data=df, x="value", kind="kde")
     plt.gcf().subplots_adjust(bottom=0.15)
     plt.xlabel('Time [h]')
     plt.xlim(0, 24)
     plt.ylabel('PDF')
     plt.savefig("plot_charging_time.png", dpi=199)
     plt.show()
示例#22
0
def plot_distance_from_median_pl(distance_files, patient_types):
    """Summarise and plot the distances from the median PL per patient type.

    For every file/patient-type pair and every homology dimension index
    the function prints a header, computes descriptive statistics and the
    Shapiro-Wilk p-value of the corresponding column, and saves a
    histogram with KDE. All statistics are finally written to a LaTeX
    table.

    Args:
        distance_files: File names, relative to
            ``DOTENV_KEY2VAL["GEN_DATA_DIR"]``, of the distance CSVs.
        patient_types: Labels paired with ``distance_files`` by position.
    """
    print("-------- Distance from median PL --------")
    stat_rows = []
    for distances, patient_type in zip(distance_files, patient_types):
        distances = pd.read_csv(DOTENV_KEY2VAL["GEN_DATA_DIR"] + distances)
        distances = distances.set_index("Unnamed: 0")
        for i in range(len(HOMOLOGY_DIMENSIONS)):
            distance_data = distances.iloc[:, i]
            print(f"-------- {patient_type} H_{i} --------")
            # Run the normality test once and reuse the result.
            shapiro_result = shapiro(distance_data)
            distance_stats_dict = {
                "Mean": np.mean(distance_data),
                "Median": np.median(distance_data),
                "Standard deviation": np.std(distance_data),
                "Q3": np.quantile(distance_data, 0.75),
                "Max": np.max(distance_data),
                "Skewness": skew(distance_data),
                "Shapiro-Wilk test": shapiro_result.pvalue,
            }
            print(f"Shapiro-Wilk: {shapiro_result}")
            distance_stats_df_entry = pd.DataFrame.from_dict(
                distance_stats_dict, orient="index"
            )
            distance_stats_df_entry.columns = [f"{patient_type} $H_{i}$"]
            stat_rows.append(distance_stats_df_entry.T)

            ax = sns.displot(distance_data, kde=True)
            ax.set(xlim=(0, 12))
            plt.savefig(
                DOTENV_KEY2VAL["GEN_FIGURES_DIR"]
                + "/median_pls/"
                + f"median_pl_{patient_type}_H_{i}_displot.png",
                bbox_inches="tight",
            )
            # Free the figure; one is created per file and dimension.
            plt.close()

    # DataFrame.append was removed in pandas 2.0; collect the rows and
    # concatenate once instead.
    distance_stats_df = pd.concat(stat_rows) if stat_rows else pd.DataFrame()

    test_results = pd.DataFrame(
        distance_stats_df["Shapiro-Wilk test"]
    ).applymap(format_tex)
    stats = distance_stats_df[
        ["Mean", "Median", "Standard deviation", "Q3", "Max", "Skewness"]
    ].applymap(format_tex_numbers)
    distance_stats_df = stats.join(test_results)
    print(test_results)
    distance_stats_df.to_latex(
        DOTENV_KEY2VAL["GEN_DATA_DIR"]
        + "distance_from_median_pl_statistics.tex",
        float_format="{:0.2f}".format,
        escape=False,
    )
示例#23
0
def Exploration_Taille_Masques():
    """Streamlit page showing the size distribution of the encoded masks.

    Loads the training table from ``cheminSources + dftrain`` and, for the
    rows with a non-zero ``nb_pixels``, renders an overall density
    histogram, one histogram per ``Label`` and a box plot per ``Label``.
    """
    source_path = cheminSources + dftrain

    st.title("Taille des masques de forme\n")
    st.write('\n')

    st.subheader("Répartition des surfaces (en pixels) des masques encodés :")
    df_train = load_data(source_path)
    # Only rows that actually contain a mask are plotted.
    df_form = df_train[df_train['nb_pixels'] != 0]
    sns.histplot(df_form.nb_pixels, bins=20, kde=True, stat="density")
    st.pyplot()

    st.subheader(
        "Répartition des surfaces (en pixels) des masques encodés suivant leur label : "
    )
    sns.set_context(font_scale=2)
    sns.displot(data=df_form, bins=20, x='nb_pixels', col="Label",
                stat="density")
    st.pyplot()

    st.subheader("Boxplot associés :")
    sns.boxplot(y='nb_pixels', x='Label', data=df_form, palette="colorblind")

    st.pyplot()
示例#24
0
def ETC2Run():
    """Run the Explore-Then-Commit experiments and show the result plots.

    First sweeps ``delta`` (24 values, step 0.04) with ``n=1000`` and
    plots the averaged upper bound and actual regret over 500 runs. Then
    sweeps ``m`` (24 values, step 15) with ``n=2000, delta=0.1`` and plots
    the mean, the standard deviation and a KDE of the actual regret per
    ``m``.
    """
    num_runs = 500

    delta_vals = [step * 0.04 for step in range(1, 25)]
    ub_regrets, actual_regrets = [], []
    for delta in delta_vals:
        ub_total = actual_total = 0
        for _ in range(num_runs):
            ub_regret, actual_regret = ExploreThenCommit2(n=1000, delta=delta)
            ub_total += ub_regret
            actual_total += actual_regret
        ub_regrets.append(ub_total / num_runs)
        actual_regrets.append(actual_total / num_runs)

    plt.plot(delta_vals, ub_regrets, label="Upper Bound")
    plt.plot(delta_vals, actual_regrets, label="Actual Regret")
    plt.legend()
    plt.show()

    m_vals = [step * 15 for step in range(1, 25)]
    # One row per m, one column per run.
    actual_regrets = np.zeros((len(m_vals), num_runs))
    for row, m in enumerate(m_vals):
        for col in range(num_runs):
            _, actual_regret = ExploreThenCommit2(n=2000, delta=0.1, m=m)
            actual_regrets[row, col] = actual_regret

    for curve in (actual_regrets.mean(axis=1), actual_regrets.std(axis=1)):
        plt.plot(m_vals, curve)
        plt.show()

    # Label every flattened sample with the m value of its row.
    hue = np.repeat(np.array(m_vals), num_runs)
    sns.displot(x=actual_regrets.flatten(), hue=hue, kind="kde")
    plt.show()
    def plot_input_length(df, split_folder):
        """
        Plots the input length of the decisions in the given dataframe
        :param df:              the dataframe containing the decision texts;
                                needs the columns num_tokens_spacy and
                                num_tokens_bert
        :param split_folder:    where to save the plots and csv files
                                (joined with ``/``, e.g. a pathlib.Path)
        :return:                None; writes one csv and three png files
        """
        token_columns = ['num_tokens_spacy', 'num_tokens_bert']

        # summary statistics of the input length, rounded to whole tokens
        input_length_distribution = df[token_columns].describe().round(0).astype(int)
        input_length_distribution.to_csv(split_folder / 'input_length_distribution.csv', index_label='measure')

        # bin outliers together at the cutoff point; clip() returns a new
        # frame, which avoids assigning into a slice of df (chained
        # assignment / SettingWithCopyWarning)
        cutoff = 4000
        cut_df = df[token_columns].clip(upper=cutoff)

        hist_df = pd.concat([cut_df.num_tokens_spacy, cut_df.num_tokens_bert], keys=['spacy', 'bert']).to_frame()
        hist_df = hist_df.reset_index(level=0)
        hist_df = hist_df.rename(columns={'level_0': 'tokenizer', 0: 'Number of tokens'})

        plot = sns.displot(hist_df, x="Number of tokens", hue="tokenizer",
                           bins=100, kde=True, fill=True, height=5, aspect=2.5, legend=False)
        plot.set(xticks=list(range(0, 4500, 500)))
        plt.ylabel('Number of court cases')
        # NOTE(review): legend labels are given by hand in the opposite
        # order of the concat keys -- confirm they match the drawn artists.
        plt.legend(["BERT", "SpaCy"], loc='upper right', title='Tokenizer', fontsize=16, title_fontsize=18)
        plot.savefig(split_folder / 'input_length_distribution-histogram.png', bbox_inches="tight")
        plt.clf()

        plot = sns.displot(hist_df, x="Number of tokens", hue="tokenizer", kind="ecdf", legend=False)
        plt.ylabel('Number of court cases')
        plt.legend(["BERT", "SPaCy"], loc='lower right', title='Tokenizer')
        plot.savefig(split_folder / 'input_length_distribution-cumulative.png', bbox_inches="tight")
        plt.clf()

        plot = sns.displot(cut_df, x="num_tokens_spacy", y="num_tokens_bert")
        plot.savefig(split_folder / 'input_length_distribution-bivariate.png', bbox_inches="tight")
        plt.clf()
def distributions(
    df: pd.DataFrame,
    dist_class: str,
    column: str,
    show: bool,
    save_location: str,
) -> None:
    """
        Plot the KDE of one column, split by class, and save the figure

    Args:
        df (pd.DataFrame): Data
        dist_class (str): Column of df whose values define the classes (hue)
        column (str): Column of df plotted on the x axis
        show (bool): Flag to show plot after saving
        save_location (str): Path to where the plot should be saved
    """
    values = df[column]
    classes = df[dist_class]
    # The density is only evaluated between 1.0 and 8.0.
    sns.displot(x=values, hue=classes, kind="kde", clip=(1.0, 8.0))
    plt.savefig(save_location)
    if not show:
        return
    plt.show()
示例#27
0
文件: plot_main.py 项目: RyogaLi/PPS
 def make_perc_coverred_dist(self):
     """
     From the summary table, plot the percentage of each ORF covered by
     sequencing data for ORFs that were found but not fully aligned
     (``aligned_perc < 1`` and ``found == "y"``).
     The histogram is saved as ``nfully_human91_perc_dist.png`` in
     ``self._dir``.
     :return: None
     """
     print(self._all_summary.columns)
     print(self._all_mut.columns)
     not_fully_covered = self._all_summary.loc[
         (self._all_summary["aligned_perc"] < 1)
         & (self._all_summary["found"] == "y")]
     print(self._all_summary["aligned_perc"])
     print(not_fully_covered.shape)
     fig, ax = plt.subplots(figsize=(10, 12))
     # Axes-level histplot draws into ``ax``. The figure-level displot
     # ignores ``ax=`` (with a warning) and opens its own figure, which
     # left the 10x12 figure created above empty.
     sns.histplot(not_fully_covered.aligned_perc * 100,
                  bins=40,
                  ax=ax,
                  color="#084c61",
                  edgecolor="#084c61")
     ax.set_title("Human 9.1 ORFs")
     ax.set_xlabel("Percent of ORF len aligned")
     fig.tight_layout()
     fig.savefig(os.path.join(self._dir, "nfully_human91_perc_dist.png"))
示例#28
0
def displot(data, key, aim, **kwargs):
    """Draw the ECDF of ``aim`` per algorithm.

    Args:
        data: Data set with an ``algorithm`` column and a column ``aim``.
        key: Unused; kept so existing callers keep working.
        aim: Name of the column plotted on the x axis.
        **kwargs: Forwarded to ``sns.displot``.

    Returns:
        ``(fig, grid)``: the matplotlib figure that holds the plot and the
        seaborn grid returned by ``sns.displot``.
    """
    sns.set_style('white')
    grid = sns.displot(data=data, x=aim, kind="ecdf", hue="algorithm",
                       hue_order=['DRPA', 'FP', 'WMMSE', 'maximum', 'random'],
                       height=3, aspect=1.5, facet_kws=dict(legend_out=False),
                       **kwargs)
    # Return the figure seaborn actually drew on. A separate plt.figure()
    # used to be created and returned here, but displot never uses it, so
    # callers received a blank figure.
    fig = grid.fig
    grid.legend.set_title('')
    # 7 is matplotlib's location code for 'center right'; the private
    # attribute is set directly, as before.
    grid.legend._loc = 7
    plt.xlabel(f'Average {aim} (bps/Hz)')
    plt.grid(axis="y")
    return fig, grid
示例#29
0
def oneVarDistribution(xName, data, title, catName=""):
    """Show the distribution of one column, optionally split by category.

    Numeric columns are drawn with ``sns.displot``, all others with
    ``sns.countplot``. When ``catName`` is not the empty string it is
    used as hue. ``preparation()`` is called before plotting.
    """
    column_is_numeric = is_numeric_dtype(data.loc[:, xName])
    plotter = sns.displot if column_is_numeric else sns.countplot

    preparation()

    if catName == "":
        plotter(data=data, x=xName)
    else:
        plotter(data=data, x=xName, hue=catName)
    plt.title(title)
    plt.show()
示例#30
0
def plot_correlation_distribution(countries,
                                  delta=timedelta(days=60),
                                  weekly=True):
    """Plot stacked histograms of the ``D`` correlations per country.

    Args:
        countries (): Container of country codes; ``'CZ'``, ``'PL'``,
            ``'SE'`` and ``'IT'`` are recognised.
        delta (): Forwarded to ``prediction_data_correlation``.
        weekly (): Forwarded to ``prediction_data_correlation``.

    Raises:
        ValueError: If none of the recognised country codes is requested.
    """
    # regions
    regions = []
    if 'CZ' in countries: regions.append(CZ_regions)
    if 'PL' in countries: regions.append(PL_regions)
    if 'SE' in countries: regions.append(SE_regions)
    if 'IT' in countries: regions.append(IT_regions)
    # compute correlations
    frames = []
    for country in regions:
        code = country[0][:2]
        components = 'ID' if code in {'PL', 'SE'} else 'IRD'
        corrs = prediction_data_correlation(country,
                                            components,
                                            delta=delta,
                                            weekly=weekly)
        corrs['Country'] = code
        frames.append(corrs)
    if not frames:
        raise ValueError("no supported country in 'countries'")
    # Concatenate once instead of growing a frame inside the loop.
    x = pd.concat(frames)
    # plot
    fig, ax = plt.subplots(figsize=(8, 6))
    # Axes-level histplot draws into ``ax``. The figure-level displot
    # ignores ``ax=`` and opens its own figure, so the x-limits below were
    # applied to an empty axes.
    sns.histplot(x,
                 x="D",
                 hue="Country",
                 element="step",
                 multiple="stack",
                 bins=20,
                 ax=ax)
    ax.set_xlim([-1, 1])
示例#31
0
def plot_prob_dist(data, output_path, cluster_type, n_cluster):
    """
    KDE plot of cluster probabilities.
    Args:
        data: Neuron activation dataframe with cluster labels
        output_path: Output path to save to
        cluster_type: Cluster algorithm used for file naming
        n_cluster: Number of unique clusters in clustering algorithm

    Returns:
        None; the figure is saved as
        ``{cluster_type}_{n_cluster}_prob_dist.png`` in ``output_path``.
    """
    # No plt.figure() beforehand: displot is figure-level and creates its
    # own figure, so an explicit one only leaked a blank figure per call.
    sns.displot(data=data,
                x='label prob',
                hue='label',
                multiple='stack',
                palette='dark',
                kind='kde',
                aspect=2)
    plt.xlabel('Cluster Probability', fontsize=14)
    plt.ylabel('Density', fontsize=14)
    plt.savefig(join(output_path, f'{cluster_type}_{n_cluster}_prob_dist.png'),
                bbox_inches='tight')
示例#32
0
def confoundplot(tseries, gs_ts, gs_dist=None, name=None, normalize=True,
                 units=None, tr=None, hide_x=True, color='b', nskip=0,
                 cutoff=None, ylims=None):
    """
    Plot a time series with its mean and optional threshold lines.

    Args:
        tseries: Sequence of numeric samples, one per frame.
        gs_ts: Subplot spec in which the time-series axes are nested.
        gs_dist: Optional subplot spec for a distribution panel drawn
            next to the time series.
        name: Y-axis label; no label is set when ``None``.
        normalize: When true, the series is divided by ``tr``.
        units: Unit string appended to the label and annotations.
        tr: Repetition time used to convert frame indices into seconds
            for the x tick labels. ``None`` means "label by frame number".
        hide_x: Hide x tick labels and the bottom spine.
        color: Matplotlib colour of the series and of the mean marker.
        nskip: Unused by this implementation; kept for compatibility.
        cutoff: Optional iterable of threshold values, each drawn as a
            dotted horizontal line. The argument is not modified.
        ylims: Optional ``(low, high)`` pair; each non-``None`` entry can
            only widen the automatically computed limits.

    Returns:
        ``(ax_ts, gs)`` or, when ``gs_dist`` is given,
        ``([ax_ts, ax_dist], gs)``.
    """
    # Define TR and number of frames
    notr = tr is None
    if notr:
        tr = 1.
    ntsteps = len(tseries)

    # Normalize time series. Out-of-place division so that integer input
    # is promoted to float instead of failing on an in-place cast.
    tseries = np.array(tseries)
    if normalize:
        tseries = tseries / tr

    # Define nested GridSpec
    gs = mgs.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs_ts,
                                     width_ratios=[1, 100], wspace=0.0)

    ax_ts = plt.subplot(gs[1])
    ax_ts.grid(False)
    ax_ts.plot(tseries, color=color)
    ax_ts.set_xlim((0, ntsteps - 1))

    # Set roughly 10 frame markers in X axis, plus the last frame. The
    # step is clamped to 1 so that series shorter than 10 frames do not
    # produce a zero slice step.
    interval = max(ntsteps // 10, 1)
    xticks = list(range(0, ntsteps, interval))
    if not xticks or xticks[-1] != ntsteps - 1:
        xticks.append(ntsteps - 1)
    ax_ts.set_xticks(xticks)

    if not hide_x:
        if notr:
            ax_ts.set_xlabel('time (frame #)')
        else:
            ax_ts.set_xlabel('time (s)')
            labels = tr * np.array(xticks)
            ax_ts.set_xticklabels(['%.02f' % t for t in labels.tolist()])
    else:
        ax_ts.set_xticklabels([])

    # The "/s" suffix only applies when the series was actually divided
    # by a user-supplied TR.
    no_scale = notr or not normalize
    if name is not None:
        var_label = name
        if units is not None:
            var_label += (' [{}]' if no_scale else ' [{}/s]').format(units)
        ax_ts.set_ylabel(var_label)

    for side in ["top", "right"]:
        ax_ts.spines[side].set_color('none')
        ax_ts.spines[side].set_visible(False)

    if not hide_x:
        ax_ts.spines["bottom"].set_position(('outward', 20))
        ax_ts.xaxis.set_ticks_position('bottom')
    else:
        ax_ts.spines["bottom"].set_color('none')
        ax_ts.spines["bottom"].set_visible(False)

    ax_ts.spines["left"].set_position(('outward', 30))
    ax_ts.yaxis.set_ticks_position('left')

    # Calculate Y limits from the non-NaN samples.
    # NOTE(review): the 0.95 / 1.1 factors only add margin when the
    # extrema are positive; a negative minimum ends up outside the
    # limits. Confirm whether range-based padding is wanted.
    finite = tseries[~np.isnan(tseries)]
    def_ylims = [0.95 * finite.min(), 1.1 * finite.max()]
    if ylims is not None:
        if ylims[0] is not None:
            def_ylims[0] = min([def_ylims[0], ylims[0]])
        if ylims[1] is not None:
            def_ylims[1] = max([def_ylims[1], ylims[1]])

    ax_ts.set_ylim(def_ylims)
    yticks = sorted(def_ylims)
    ax_ts.set_yticks(yticks)
    ax_ts.set_yticklabels(['%.02f' % y for y in yticks])
    yrange = def_ylims[1] - def_ylims[0]

    # Plot average. The mean goes first, followed by the user thresholds;
    # a new list is built so the caller's ``cutoff`` is left untouched.
    thresholds = [finite.mean()]
    if cutoff is not None:
        thresholds.extend(cutoff)

    for i, thr in enumerate(thresholds):
        ax_ts.plot((0, ntsteps - 1), [thr] * 2,
                   linewidth=.75,
                   linestyle='-' if i == 0 else ':',
                   color=color if i == 0 else 'k')

        if i == 0:
            mean_label = r'$\mu$=%.3f%s' % (thr, units if units is not None else '')
            ax_ts.annotate(
                mean_label, xy=(ntsteps - 1, thr), xytext=(11, 0),
                textcoords='offset points', va='center', color='w', size=10,
                bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),
                arrowprops=dict(
                    arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,
                    fc=color, ec='none', relpos=(0.01, 0.5)))
        else:
            # Push this label away from previously drawn lines that are
            # closer than the full y-range: [downward, upward] offsets.
            y_off = [0.0, 0.0]
            for pth in thresholds[:i]:
                inc = abs(thr - pth)
                if inc < yrange:
                    factor = (- (inc / yrange) + 1) ** 2
                    if (thr - pth) < 0.0:
                        y_off[0] -= factor * 20
                    else:
                        y_off[1] += factor * 20

            offset = y_off[0] if abs(y_off[0]) > y_off[1] else y_off[1]

            a_label = '%.2f%s' % (thr, units if units is not None else '')
            ax_ts.annotate(
                a_label, xy=(ntsteps - 1, thr), xytext=(11, offset),
                textcoords='offset points', va='center',
                color='w', size=10,
                bbox=dict(boxstyle='round', fc='dimgray', ec='none', color='none', lw=0),
                arrowprops=dict(
                    arrowstyle='wedge,tail_width=.9', lw=0, patchA=None, patchB=None,
                    fc='dimgray', ec='none', relpos=(.1, .5)))

    if gs_dist is not None:
        ax_dist = plt.subplot(gs_dist)
        # ``sns.displot`` is figure-level: it ignores ``ax`` and has no
        # ``vertical`` option. The axes-level ``histplot`` with the data
        # on ``y`` draws the sideways distribution into ``ax_dist``.
        sns.histplot(y=finite, kde=True, stat='density', ax=ax_dist)
        ax_dist.set_xlabel('Timesteps')
        ax_dist.set_ylim(ax_ts.get_ylim())
        ax_dist.set_yticklabels([])

        return [ax_ts, ax_dist], gs
    else:
        return ax_ts, gs
示例#33
0
def confoundplot(tseries, gs_ts, gs_dist=None, name=None, normalize=True,
                 units=None, tr=None, hide_x=True, color='b', nskip=4):
    """
    Plot a time series with a dotted line at its mean.

    Args:
        tseries: Sequence of numeric samples, one per frame.
        gs_ts: Subplot spec in which the time-series axes are nested.
        gs_dist: Optional subplot spec for a distribution panel drawn
            next to the time series.
        name: Y-axis label; no label is set when ``None``.
        normalize: When true, the series is divided by ``tr``.
        units: Unit string appended to the y-axis label.
        tr: Repetition time used to convert frame indices into seconds
            for the x tick labels. ``None`` means "label by frame number".
        hide_x: Hide x tick labels and the bottom spine.
        color: Matplotlib colour of the series and of the mean line.
        nskip: Number of leading frames left out when computing the
            y-axis limits.

    Returns:
        ``(ax_ts, gs)`` or, when ``gs_dist`` is given,
        ``([ax_ts, ax_dist], gs)``.
    """
    # Define TR and number of frames
    notr = tr is None
    if notr:
        tr = 1.
    ntsteps = len(tseries)

    # Normalize time series. Out-of-place division so that integer input
    # is promoted to float instead of failing on an in-place cast.
    tseries = np.array(tseries)
    if normalize:
        tseries = tseries / tr

    # Define nested GridSpec
    gs = mgs.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs_ts,
                                     width_ratios=[1, 100], wspace=0.0)

    ax_ts = plt.subplot(gs[1])
    ax_ts.grid(False)
    ax_ts.plot(tseries, color=color)
    ax_ts.set_xlim((0, ntsteps - 1))

    # Set roughly 10 frame markers in X axis, plus the last frame. The
    # step is clamped to 1 so that series shorter than 10 frames do not
    # produce a zero slice step.
    interval = max(ntsteps // 10, 1)
    xticks = list(range(0, ntsteps, interval))
    if not xticks or xticks[-1] != ntsteps - 1:
        xticks.append(ntsteps - 1)
    ax_ts.set_xticks(xticks)

    if not hide_x:
        if notr:
            ax_ts.set_xlabel('time (frame #)')
        else:
            ax_ts.set_xlabel('time (s)')
            labels = tr * np.array(xticks)
            ax_ts.set_xticklabels(['%.02f' % t for t in labels.tolist()])
    else:
        ax_ts.set_xticklabels([])

    # The "/s" suffix only applies when the series was actually divided
    # by a user-supplied TR.
    no_scale = notr or not normalize
    if name is not None:
        var_label = name
        if units is not None:
            var_label += (' [{}]' if no_scale else ' [{}/s]').format(units)
        ax_ts.set_ylabel(var_label)

    for side in ["top", "right"]:
        ax_ts.spines[side].set_color('none')
        ax_ts.spines[side].set_visible(False)

    if not hide_x:
        ax_ts.spines["bottom"].set_position(('outward', 20))
        ax_ts.xaxis.set_ticks_position('bottom')
    else:
        ax_ts.spines["bottom"].set_color('none')
        ax_ts.spines["bottom"].set_visible(False)

    ax_ts.spines["left"].set_position(('outward', 30))
    ax_ts.yaxis.set_ticks_position('left')

    # Plot average. NaN-aware so that a few missing samples neither hide
    # the mean line nor make ``set_ylim`` reject NaN limits.
    ax_ts.plot((0, ntsteps), [np.nanmean(tseries)] * 2, color=color,
               linestyle=':')

    shown = tseries[nskip:]
    ax_ts.set_ylim(np.nanmin(shown), np.nanmax(shown))

    if gs_dist is not None:
        ax_dist = plt.subplot(gs_dist)
        # ``sns.displot`` is figure-level: it ignores ``ax`` and has no
        # ``vertical`` option. The axes-level ``histplot`` with the data
        # on ``y`` draws the sideways distribution into ``ax_dist``.
        sns.histplot(y=tseries[~np.isnan(tseries)], kde=True,
                     stat='density', ax=ax_dist)
        ax_dist.set_xlabel('Timesteps')
        ax_dist.set_ylim(ax_ts.get_ylim())
        ax_dist.set_yticklabels([])

        return [ax_ts, ax_dist], gs
    else:
        return ax_ts, gs
示例#34
0
def confoundplot(tseries, gs_ts, gs_dist=None, name=None,
                 units=None, tr=None, hide_x=True, color='b', nskip=0,
                 cutoff=None, ylims=None):
    """
    Plot a compact time series annotated with summary statistics.

    The axes carry no y ticks; instead the name, the max / mean / standard
    deviation, the 95th percentile and any ``cutoff`` values are written
    as annotations.

    Args:
        tseries: Sequence of numeric samples, one per frame.
        gs_ts: Subplot spec in which the time-series axes are nested.
        gs_dist: Optional subplot spec for a distribution panel drawn
            next to the time series.
        name: Title annotation; omitted when ``None``.
        units: Unit string appended to the name and to the statistics.
        tr: Repetition time used to convert frame indices into seconds
            for the x tick labels. ``None`` means "label by frame number".
        hide_x: Hide x tick labels and the bottom spine.
        color: Matplotlib colour of the series and of the annotations.
        nskip: Unused by this implementation; kept for compatibility.
        cutoff: Optional iterable of threshold values, each drawn as a
            thin horizontal line with its value annotated.
        ylims: Optional ``(low, high)`` pair; each non-``None`` entry can
            only widen the automatically computed limits.

    Returns:
        ``(ax_ts, gs)`` or, when ``gs_dist`` is given,
        ``([ax_ts, ax_dist], gs)``.
    """
    # Define TR and number of frames
    notr = tr is None
    if notr:
        tr = 1.
    ntsteps = len(tseries)
    tseries = np.array(tseries)

    # Define nested GridSpec
    gs = mgs.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs_ts,
                                     width_ratios=[1, 100], wspace=0.0)

    ax_ts = plt.subplot(gs[1])
    ax_ts.grid(False)

    # Set frame markers in X axis. Since ntsteps // 5 >= ntsteps // 10,
    # the step is effectively max(ntsteps // 5, 1), i.e. about 5 markers.
    # NOTE(review): the original comment promised 10 markers -- confirm
    # which spacing is intended.
    interval = max((ntsteps // 10, ntsteps // 5, 1))
    xticks = list(range(0, ntsteps)[::interval])
    ax_ts.set_xticks(xticks)

    if not hide_x:
        if notr:
            ax_ts.set_xlabel('time (frame #)')
        else:
            ax_ts.set_xlabel('time (s)')
            labels = tr * np.array(xticks)
            ax_ts.set_xticklabels(['%.02f' % t for t in labels.tolist()])
    else:
        ax_ts.set_xticklabels([])

    if name is not None:
        if units is not None:
            name += ' [%s]' % units

        ax_ts.annotate(
            name, xy=(0.0, 0.7), xytext=(0, 0), xycoords='axes fraction',
            textcoords='offset points', va='center', ha='left',
            color=color, size=8,
            bbox={'boxstyle': 'round', 'fc': 'w', 'ec': 'none',
                  'color': 'none', 'lw': 0, 'alpha': 0.8})

    for side in ["top", "right"]:
        ax_ts.spines[side].set_color('none')
        ax_ts.spines[side].set_visible(False)

    if not hide_x:
        ax_ts.spines["bottom"].set_position(('outward', 20))
        ax_ts.xaxis.set_ticks_position('bottom')
    else:
        ax_ts.spines["bottom"].set_color('none')
        ax_ts.spines["bottom"].set_visible(False)

    # The left spine and the y ticks are hidden; values are conveyed by
    # the annotations below.
    ax_ts.spines["left"].set_color('none')
    ax_ts.spines["left"].set_visible(False)

    ax_ts.set_yticks([])
    ax_ts.set_yticklabels([])

    nonnan = tseries[~np.isnan(tseries)]
    if nonnan.size > 0:
        # Calculate Y limits: data range padded by 10% on each side
        valrange = (nonnan.max() - nonnan.min())
        def_ylims = [nonnan.min() - 0.1 * valrange, nonnan.max() + 0.1 * valrange]
        if ylims is not None:
            if ylims[0] is not None:
                def_ylims[0] = min([def_ylims[0], ylims[0]])
            if ylims[1] is not None:
                def_ylims[1] = max([def_ylims[1], ylims[1]])

        # Add space for plot title and mean/SD annotation
        def_ylims[0] -= 0.1 * (def_ylims[1] - def_ylims[0])

        ax_ts.set_ylim(def_ylims)

        # Annotate stats
        maxv = nonnan.max()
        mean = nonnan.mean()
        stdv = nonnan.std()
        p95 = np.percentile(nonnan, 95.0)
    else:
        # No valid samples: report zeros and keep matplotlib's limits
        maxv = 0
        mean = 0
        stdv = 0
        p95 = 0

    stats_label = (r'max: {max:.3f}{units} $\bullet$ mean: {mean:.3f}{units} '
                   r'$\bullet$ $\sigma$: {sigma:.3f}').format(
        max=maxv, mean=mean, units=units or '', sigma=stdv)
    ax_ts.annotate(
        stats_label, xy=(0.98, 0.7), xycoords='axes fraction',
        xytext=(0, 0), textcoords='offset points',
        va='center', ha='right', color=color, size=4,
        bbox={'boxstyle': 'round', 'fc': 'w', 'ec': 'none', 'color': 'none',
              'lw': 0, 'alpha': 0.8}
    )

    # Annotate percentile 95
    ax_ts.plot((0, ntsteps - 1), [p95] * 2, linewidth=.1, color='lightgray')
    ax_ts.annotate(
        '%.2f' % p95, xy=(0, p95), xytext=(-1, 0),
        textcoords='offset points', va='center', ha='right',
        color='lightgray', size=3)

    if cutoff is None:
        cutoff = []

    for thr in cutoff:
        ax_ts.plot((0, ntsteps - 1), [thr] * 2,
                   linewidth=.2, color='dimgray')

        ax_ts.annotate(
            '%.2f' % thr, xy=(0, thr), xytext=(-1, 0),
            textcoords='offset points', va='center', ha='right',
            color='dimgray', size=3)

    # The series is drawn last, after the reference lines.
    ax_ts.plot(tseries, color=color, linewidth=.8)
    ax_ts.set_xlim((0, ntsteps - 1))

    if gs_dist is not None:
        ax_dist = plt.subplot(gs_dist)
        # ``sns.displot`` is figure-level: it ignores ``ax`` and has no
        # ``vertical`` option. The axes-level ``histplot`` with the data
        # on ``y`` draws the sideways distribution into ``ax_dist``.
        sns.histplot(y=nonnan, kde=True, stat='density', ax=ax_dist)
        ax_dist.set_xlabel('Timesteps')
        ax_dist.set_ylim(ax_ts.get_ylim())
        ax_dist.set_yticklabels([])

        return [ax_ts, ax_dist], gs
    return ax_ts, gs